diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..d47ac04a3b4c8922810df77cab6f0394dbe9bdb7 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +asset/banner.png filter=lfs diff=lfs merge=lfs -text +docs/resources/web-ui.jpg filter=lfs diff=lfs merge=lfs -text +docs/resources/dpo_data.png filter=lfs diff=lfs merge=lfs -text diff --git a/asset/banner.png b/asset/banner.png new file mode 100644 index 0000000000000000000000000000000000000000..cdc0d7e59391a45b8dad44778a6382ea9e063fe4 --- /dev/null +++ b/asset/banner.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aed7b0ac0bbb353df62f86b80e26eeab10fc69a0a49de161c544b51ab4ea9bea +size 380549 diff --git a/docs/resources/dpo_data.png b/docs/resources/dpo_data.png new file mode 100644 index 0000000000000000000000000000000000000000..9fc359fbfcfb744418a97806318c24daf22e5566 --- /dev/null +++ b/docs/resources/dpo_data.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d87ca58f3ac43a79836ba40f5d9cb788b8fabc26d717fb4e8eb1f1400f6598d +size 354747 diff --git a/docs/resources/web-ui.jpg b/docs/resources/web-ui.jpg new file mode 100644 index 0000000000000000000000000000000000000000..1a8a42f1d97d9cda57ccaa283f34ba6ede308027 --- /dev/null +++ b/docs/resources/web-ui.jpg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e83bb4b4ecda9386286b99c6e83551f1dd1fdcdaf7be2efa3117208e3806000 +size 181923 diff --git "a/docs/source/Instruction/\350\257\204\346\265\213.md" "b/docs/source/Instruction/\350\257\204\346\265\213.md" new file mode 100644 index 0000000000000000000000000000000000000000..147eae9894662e74ed8568e483a4ff07ff09f925 --- /dev/null +++ "b/docs/source/Instruction/\350\257\204\346\265\213.md" @@ -0,0 +1,269 @@ +# 评测 + +SWIFT支持了eval(评测)能力,用于对原始模型和训练后的模型给出标准化的评测指标。 + +## 能力介绍 + +SWIFT的eval能力使用了魔搭社区[评测框架EvalScope](https://github.com/modelscope/eval-scope),并进行了高级封装以支持各类模型的评测需求。 + +> 注意:EvalScope支持许多其他的复杂能力,例如[模型的性能评测](https://evalscope.readthedocs.io/zh-cn/latest/user_guides/stress_test/quick_start.html),请直接使用EvalScope框架。 + +目前我们支持了**标准评测集**的评测流程,以及**用户自定义**评测集的评测流程。其中**标准评测集**由三个评测后端提供支持: + +下面展示所支持的数据集名称,若需了解数据集的详细信息,请参考[所有支持的数据集](https://evalscope.readthedocs.io/zh-cn/latest/get_started/supported_dataset.html) + +1. Native(默认): + + 主要支持纯文本评测,同时**支持**评测结果可视化 + ```text + 'arc', 'bbh', 'ceval', 'cmmlu', 'competition_math', + 'general_qa', 'gpqa', 'gsm8k', 'hellaswag', 'humaneval', + 'ifeval', 'iquiz', 'mmlu', 'mmlu_pro', + 'race', 'trivia_qa', 'truthful_qa' + ``` + +2. OpenCompass: + + 主要支持纯文本评测,暂**不支持**评测结果可视化 + ```text + 'obqa', 'cmb', 'AX_b', 'siqa', 'nq', 'mbpp', 'winogrande', 'mmlu', 'BoolQ', 'cluewsc', 'ocnli', 'lambada', + 'CMRC', 'ceval', 'csl', 'cmnli', 'bbh', 'ReCoRD', 'math', 'humaneval', 'eprstmt', 'WSC', 'storycloze', + 'MultiRC', 'RTE', 'chid', 'gsm8k', 'AX_g', 'bustm', 'afqmc', 'piqa', 'lcsts', 'strategyqa', 'Xsum', 'agieval', + 'ocnli_fc', 'C3', 'tnews', 'race', 'triviaqa', 'CB', 'WiC', 'hellaswag', 'summedits', 'GaokaoBench', + 'ARC_e', 'COPA', 'ARC_c', 'DRCD' + ``` + +3. VLMEvalKit: + + 主要支持多模态评测,暂**不支持**评测结果可视化 + ```text + 'COCO_VAL', 'MME', 'HallusionBench', 'POPE', 'MMBench_DEV_EN', 'MMBench_TEST_EN', 'MMBench_DEV_CN', 'MMBench_TEST_CN', + 'MMBench', 'MMBench_CN', 'MMBench_DEV_EN_V11', 'MMBench_TEST_EN_V11', 'MMBench_DEV_CN_V11', + 'MMBench_TEST_CN_V11', 'MMBench_V11', 'MMBench_CN_V11', 'SEEDBench_IMG', 'SEEDBench2', + 'SEEDBench2_Plus', 'ScienceQA_VAL', 'ScienceQA_TEST', 'MMT-Bench_ALL_MI', 'MMT-Bench_ALL', + 'MMT-Bench_VAL_MI', 'MMT-Bench_VAL', 'AesBench_VAL', 'AesBench_TEST', 'CCBench', 'AI2D_TEST', 'MMStar', + 'RealWorldQA', 'MLLMGuard_DS', 'BLINK', 'OCRVQA_TEST', 'OCRVQA_TESTCORE', 'TextVQA_VAL', 'DocVQA_VAL', + 'DocVQA_TEST', 'InfoVQA_VAL', 'InfoVQA_TEST', 'ChartQA_TEST', 'MathVision', 'MathVision_MINI', + 'MMMU_DEV_VAL', 'MMMU_TEST', 'OCRBench', 'MathVista_MINI', 'LLaVABench', 'MMVet', 'MTVQA_TEST', + 'MMLongBench_DOC', 'VCR_EN_EASY_500', 'VCR_EN_EASY_100', 'VCR_EN_EASY_ALL', 'VCR_EN_HARD_500', + 'VCR_EN_HARD_100', 'VCR_EN_HARD_ALL', 'VCR_ZH_EASY_500', 'VCR_ZH_EASY_100', 'VCR_ZH_EASY_ALL', + 'VCR_ZH_HARD_500', 'VCR_ZH_HARD_100', 'VCR_ZH_HARD_ALL', 'MMDU', 'MMBench-Video', 'Video-MME' + ``` + +## 环境准备 + +```shell +pip install ms-swift[eval] -U +``` + +或从源代码安装: + +```shell +git clone https://github.com/modelscope/ms-swift.git +cd ms-swift +pip install -e '.[eval]' +``` + +## 评测 + +支持纯文本评测、多模态评测、url评测、自定义数据集评测四种方式 + +**基本示例** + +```shell +CUDA_VISIBLE_DEVICES=0 \ +swift eval \ + --model Qwen/Qwen2.5-0.5B-Instruct \ + --eval_backend Native \ + --infer_backend pt \ + --eval_limit 10 \ + --eval_dataset gsm8k +``` +其中: +- model: 可指定本地模型路径或者modelscope上的模型ID +- eval_backend: 可选 Native, OpenCompass, VLMEvalKit,默认为 Native +- infer_backend: 可选 pt, vllm, lmdeploy,默认为 pt +- eval_limit: 每个评测集的采样数,默认为None,表示使用全部数据,可用于快速验证 +- eval_dataset: 评测数据集,可设置多个数据集,用空格分割 + +具体评测的参数列表可以参考[这里](命令行参数.md#评测参数)。 + +## 训练中评测 + +SWIFT支持在训练过程中使用EvalScope对当前的模型进行评测,以便及时了解模型的训练效果。 + +**基本示例** + +```shell +CUDA_VISIBLE_DEVICES=0 \ +swift sft \ + --model "Qwen/Qwen2.5-0.5B-Instruct" \ + --train_type "lora" \ + --dataset "AI-ModelScope/alpaca-gpt4-data-zh#100" \ + --torch_dtype "bfloat16" \ + --num_train_epochs "1" \ + --per_device_train_batch_size "1" \ + --learning_rate "1e-4" \ + --lora_rank "8" \ + --lora_alpha "32" \ + --target_modules "all-linear" \ + --gradient_accumulation_steps "16" \ + --save_steps "50" \ + --save_total_limit "5" \ + --logging_steps "5" \ + --max_length "2048" \ + --eval_strategy "steps" \ + --eval_steps "5" \ + --per_device_eval_batch_size "5" \ + --eval_use_evalscope \ + --eval_datasets "gsm8k" \ + --eval_datasets_args '{"gsm8k": {"few_shot_num": 0}}' \ + --eval_limit "10" +``` + +注意启动命令为`sft`,其中eval相关的参数有: +- eval_strategy: 评估策略。默认为None,跟随`save_strategy`的策略 +- eval_steps: 默认为None,如果存在评估数据集,则跟随`save_steps`的策略 +- eval_use_evalscope: 是否使用evalscope进行评测,需要设置该参数来开启评测 +- eval_datasets: 评测数据集,可设置多个数据集,用空格分割 +- eval_datasets_args: 评测数据集参数,json格式,可设置多个数据集的参数 +- eval_limit: 评测数据集采样数 +- eval_generation_config: 评测时模型推理配置,json格式,默认为`{'max_tokens': 512}` + + +更多评测的样例可以参考[examples](https://github.com/modelscope/ms-swift/tree/main/examples/eval) + +## 自定义评测集 + +本框架支持选择题和问答题,两种预定义的数据集格式,使用流程如下: + +*注意:使用自定义评测时,eval_backend参数必须为Native* + +### 选择题格式(MCQ) +适合用户是选择题的场景,评测指标为准确率(accuracy)。 + +**数据准备** + +准备选择题格式的csv文件,该目录结构如下: + +```text +mcq/ +├── example_dev.csv # (可选)文件名组成为`{subset_name}_dev.csv`,用于fewshot评测 +└── example_val.csv # 文件名组成为`{subset_name}_val.csv`,用于实际评测的数据 +``` + +其中csv文件需要为下面的格式: + +```text +id,question,A,B,C,D,answer +1,通常来说,组成动物蛋白质的氨基酸有____,4种,22种,20种,19种,C +2,血液内存在的下列物质中,不属于代谢终产物的是____。,尿素,尿酸,丙酮酸,二氧化碳,C +``` +其中: +- `id`是序号(可选) +- `question`是问题 +- `A`, `B`, `C`, `D`等是可选项,最大支持10个选项 +- `answer`是正确选项 + +**启动评测** + +运行下面的命令: + +```bash +CUDA_VISIBLE_DEVICES=0 \ +swift eval \ + --model Qwen/Qwen2.5-0.5B-Instruct \ + --eval_backend Native \ + --infer_backend pt \ + --eval_dataset general_mcq \ + --dataset_args '{"general_mcq": {"local_path": "/path/to/mcq", "subset_list": ["example"]}}' +``` +其中: +- `eval_dataset` 需要设置为 `general_mcq` +- `dataset_args` 需要设置 + - `local_path` 自定义数据集文件夹路径 + - `subset_list` 评测数据集名称,上述 `*_dev.csv` 中的 `*` + +**运行结果** + +```text ++---------------------+-------------+-----------------+----------+-------+---------+---------+ +| Model | Dataset | Metric | Subset | Num | Score | Cat.0 | ++=====================+=============+=================+==========+=======+=========+=========+ +| Qwen2-0.5B-Instruct | general_mcq | AverageAccuracy | example | 12 | 0.5833 | default | ++---------------------+-------------+-----------------+----------+-------+---------+---------+ +``` + +## 问答题格式(QA) +适合用户是问答题的场景,评测指标是`ROUGE`和`BLEU`。 + +**数据准备** + +准备一个问答题格式的jsonline文件,该目录包含了一个文件: + +```text +qa/ +└── example.jsonl +``` + +该jsonline文件需要为下面的格式: + +```json +{"query": "中国的首都是哪里?", "response": "中国的首都是北京"} +{"query": "世界上最高的山是哪座山?", "response": "是珠穆朗玛峰"} +{"query": "为什么北极见不到企鹅?", "response": "因为企鹅大多生活在南极"} +``` + +**启动评测** + +运行下面的命令: + +```bash +CUDA_VISIBLE_DEVICES=0 \ +swift eval \ + --model Qwen/Qwen2.5-0.5B-Instruct \ + --eval_backend Native \ + --infer_backend pt \ + --eval_dataset general_qa \ + --dataset_args '{"general_qa": {"local_path": "/path/to/qa", "subset_list": ["example"]}}' +``` + +其中: +- `eval_dataset` 需要设置为 `general_qa` +- `dataset_args` 是一个json字符串,需要设置: + - `local_path` 自定义数据集文件夹路径 + - `subset_list` 评测数据集名称,上述 `*.jsonl` 中的 `*` + +**运行结果** + +```text ++---------------------+-------------+-----------------+----------+-------+---------+---------+ +| Model | Dataset | Metric | Subset | Num | Score | Cat.0 | ++=====================+=============+=================+==========+=======+=========+=========+ +| Qwen2-0.5B-Instruct | general_qa | bleu-1 | default | 12 | 0.2324 | default | ++---------------------+-------------+-----------------+----------+-------+---------+---------+ +| Qwen2-0.5B-Instruct | general_qa | bleu-2 | default | 12 | 0.1451 | default | ++---------------------+-------------+-----------------+----------+-------+---------+---------+ +| Qwen2-0.5B-Instruct | general_qa | bleu-3 | default | 12 | 0.0625 | default | ++---------------------+-------------+-----------------+----------+-------+---------+---------+ +| Qwen2-0.5B-Instruct | general_qa | bleu-4 | default | 12 | 0.0556 | default | ++---------------------+-------------+-----------------+----------+-------+---------+---------+ +| Qwen2-0.5B-Instruct | general_qa | rouge-1-f | default | 12 | 0.3441 | default | ++---------------------+-------------+-----------------+----------+-------+---------+---------+ +| Qwen2-0.5B-Instruct | general_qa | rouge-1-p | default | 12 | 0.2393 | default | ++---------------------+-------------+-----------------+----------+-------+---------+---------+ +| Qwen2-0.5B-Instruct | general_qa | rouge-1-r | default | 12 | 0.8889 | default | ++---------------------+-------------+-----------------+----------+-------+---------+---------+ +| Qwen2-0.5B-Instruct | general_qa | rouge-2-f | default | 12 | 0.2062 | default | ++---------------------+-------------+-----------------+----------+-------+---------+---------+ +| Qwen2-0.5B-Instruct | general_qa | rouge-2-p | default | 12 | 0.1453 | default | ++---------------------+-------------+-----------------+----------+-------+---------+---------+ +| Qwen2-0.5B-Instruct | general_qa | rouge-2-r | default | 12 | 0.6167 | default | ++---------------------+-------------+-----------------+----------+-------+---------+---------+ +| Qwen2-0.5B-Instruct | general_qa | rouge-l-f | default | 12 | 0.333 | default | ++---------------------+-------------+-----------------+----------+-------+---------+---------+ +| Qwen2-0.5B-Instruct | general_qa | rouge-l-p | default | 12 | 0.2324 | default | ++---------------------+-------------+-----------------+----------+-------+---------+---------+ +| Qwen2-0.5B-Instruct | general_qa | rouge-l-r | default | 12 | 0.8889 | default | ++---------------------+-------------+-----------------+----------+-------+---------+---------+ +``` diff --git a/docs/source_en/BestPractices/Rapidly-Training-VL-model.md b/docs/source_en/BestPractices/Rapidly-Training-VL-model.md new file mode 100644 index 0000000000000000000000000000000000000000..ec4bf6cfd104f04ca4e9d63a6ec0285d9c576b2e --- /dev/null +++ b/docs/source_en/BestPractices/Rapidly-Training-VL-model.md @@ -0,0 +1,228 @@ +# Best Practices for Rapidly Training Vision-Language (VL) Models + +This document provides best practices for quickly training vision-language (VL) models from scratch. + +Model Links +- [Qwen2.5-VL-7B-Instruct](https://www.modelscope.cn/models/Qwen/Qwen2.5-VL-7B-Instruct) +- [Qwen3-8B](https://www.modelscope.cn/models/Qwen/Qwen3-8B) + +Trained Model Link +- [Simple-VL-8B](https://www.modelscope.cn/models/swift/Simple-VL-8B/summary) + + +The training workflow builds upon the Qwen2.5-VL-7B-Instruct model architecture by replacing its internal large language model (LLM) component with the weights from Qwen3-8B , thereby enhancing the model's visual understanding capabilities. The process involves the following steps: + +1. Modify the original model’s configuration file config.json to align with Qwen3-8B. +2. Initialize and load new model weights, saving them as a new model. +3. Fine-tune the new model in two stages: + 1. Stage 1 : Train only the vision-to-language alignment module (aligner), freezing the ViT and LLM components. + 2. Stage 2 : Unfreeze all modules and perform joint fine-tuning to improve overall performance. + + +## Model Modification + +### Config File (config.json) Update +Due to structural differences between Qwen2.5-7B-Instruct and Qwen3-8B (e.g., number of layers, hidden dimensions), create a new config.json based on the Qwen2.5-VL-7B-Instruct config and update the following parameters to match Qwen3-8B: + + +``` +Modified Parameters +1. hidden_size 3584->4096 +2. intermediate_size: 18944->12288 +3. num_attention_heads: 28->32 +4. num_key_value_heads: 4->8 +5. num_hidden_layers: 28->32 +6. vocab_size:152064->151936 +7. max_window_layers:28->36 + +Newly Added Parameter +1. head_dim: 128 +``` + +### Model Weight Initialization and Replacement +Use the following Python script to initialize, replace, and save the model weights: +```python +import torch +from modelscope import Qwen2_5_VLForConditionalGeneration, AutoModelForCausalLM, AutoConfig +from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import Qwen2_5_VLPatchMerger, Qwen2_5_VLModel +from accelerate import Accelerator + +# Load original VL model and Qwen3-8B model +qwen2_5_vl_7b_model = Qwen2_5_VLForConditionalGeneration.from_pretrained( + "Qwen/Qwen2.5-VL-7B-Instruct", + device_map="cuda", + torch_dtype=torch.bfloat16 +) +device = qwen2_5_vl_7b_model.device + +qwen3_8b_model = AutoModelForCausalLM.from_pretrained( + "Qwen/Qwen3-8B", + device_map=device, + torch_dtype=torch.bfloat16 +) + +# Load configurations +old_config = AutoConfig.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct") +new_config = AutoConfig.from_pretrained("/path/to/new_config_dir") # Path to new config directory + +# Replace merger (aligner) layer +new_merger = Qwen2_5_VLPatchMerger( + dim=new_visual_config.out_hidden_size, + context_dim=new_visual_config.hidden_size, + spatial_merge_size=new_visual_config.spatial_merge_size, +).to(device).to(torch.bfloat16) +qwen2_5_vl_7b_model.visual.merger = new_merger + +# Replace LLM part of the VL model +new_llm_model = Qwen2_5_VLModel(new_config).to(device).to(torch.bfloat16) + +for name, param in qwen3_8b_model.model.named_parameters(): + if name in new_llm_model.state_dict(): + new_llm_model.state_dict()[name].copy_(param) + +qwen2_5_vl_7b_model.model = new_llm_model +qwen2_5_vl_7b_model.lm_head = qwen3_8b_model.lm_head + +# Save modified model +accelerator = Accelerator() +accelerator.save_model( + model=qwen2_5_vl_7b_model, + save_directory="/path/to/save/Qwen3-VL-Model", + max_shard_size="4GB", + safe_serialization=True +) +``` + + +## Training +To simplify the process, we skip pre-training and proceed directly to supervised fine-tuning (SFT). The training is divided into two stages: + +### Stage 1: Train Aligner Layer +Train only the vision-to-language alignment module while freezing the ViT and LLM parts: +```bash +NNODES=$WORLD_SIZE \ +NODE_RANK=$RANK \ +NPROC_PER_NODE=8 \ +MAX_PIXELS=1003520 \ +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ +swift sft \ + --model /path/to/new_vl_model \ + --model_type qwen2_5_vl \ + --train_type full \ + --dataset xxx \ + --torch_dtype bfloat16 \ + --attn_impl flash_attn \ + --freeze_vit true \ + --freeze_llm true \ + --freeze_aligner false \ + --num_train_epochs 3 \ + --per_device_train_batch_size 2 \ + --learning_rate 5e-6 \ + --gradient_accumulation_steps 8 \ + --eval_steps -1 \ + --save_steps 1000 \ + --save_total_limit 10 \ + --logging_steps 5 \ + --max_length 8192 \ + --output_dir output \ + --warmup_ratio 0.05 \ + --dataloader_num_workers 4 \ + --dataset_num_proc 8 \ + --deepspeed zero2 +``` + +### Stage 2: Full Model Training + +Unfreeze all modules and jointly train to enhance the model's visual understanding: + +```bash +NNODES=$WORLD_SIZE \ +NODE_RANK=$RANK \ +NPROC_PER_NODE=8 \ +MAX_PIXELS=1003520 \ +CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \ +swift sft \ + --model /path/to/stage1_checkpoint \ + --model_type qwen2_5_vl \ + --train_type full \ + --dataset xxx \ + --torch_dtype bfloat16 \ + --attn_impl flash_attn \ + --freeze_vit false \ + --freeze_llm false \ + --freeze_aligner false \ + --num_train_epochs 3 \ + --per_device_train_batch_size 2 \ + --learning_rate 5e-6 \ + --gradient_accumulation_steps 8 \ + --eval_steps -1 \ + --save_steps 1000 \ + --save_total_limit 10 \ + --logging_steps 5 \ + --max_length 8192 \ + --output_dir output \ + --warmup_ratio 0.05 \ + --dataloader_num_workers 4 \ + --dataset_num_proc 8 \ + --deepspeed zero2 +``` + +## Inference / Deployment / Evaluation + +### Inference +Perform inference using `swift infer`: +```bash +swift infer \ + --model /path/to/stage2_checkpoint +``` + +### Deoloyment +Accelerate model serving with vLLM: +```bash +CUDA_VISIBLE_DEVICES=0 \ +MAX_PIXELS=1003520 \ +VIDEO_MAX_PIXELS=50176 \ +FPS_MAX_FRAMES=12 \ +swift deploy \ + --model /path/to/stage2_checkpoint \ + --infer_backend vllm \ + --gpu_memory_utilization 0.9 \ + --max_model_len 8192 \ + --max_new_tokens 2048 \ + --limit_mm_per_prompt '{"image": 5, "video": 2}' \ + --served_model_name Qwen3-VL +``` + +### Evaluation +Evaluate the trained VL model using [EvalScope](https://github.com/modelscope/evalscope/). + +Example Evaluation Using MMMU Benchmark +```python +from evalscope import TaskConfig, run_task + +task_cfg_dict = TaskConfig( + work_dir='outputs', + eval_backend='VLMEvalKit', + eval_config={ + 'data': ['MMMU_DEV_VAL'], + 'mode': 'all', + 'model': [ + { + 'api_base': 'http://localhost:8000/v1/chat/completions', + 'key': 'EMPTY', + 'name': 'CustomAPIModel', + 'temperature': 0.6, + 'type': 'Qwen3-VL', + 'img_size': -1, + 'video_llm': False, + 'max_tokens': 512, + } + ], + 'reuse': False, + 'nproc': 64, + 'judge': 'exact_matching' + }, +) + +run_task(task_cfg=task_cfg_dict) +``` diff --git a/docs/source_en/Customization/Custom-model.md b/docs/source_en/Customization/Custom-model.md new file mode 100644 index 0000000000000000000000000000000000000000..65d6911636dd08e7f8b852281dc8cdb3b2cefef8 --- /dev/null +++ b/docs/source_en/Customization/Custom-model.md @@ -0,0 +1,35 @@ +# Custom Model + +The models built into ms-swift can be used directly by specifying either `model_id` or `model_path`: `--model `. ms-swift determines the `model_type` based on the suffix of `model_id/model_path` and the `config.json` file. Each `model_type` has a unique model structure, template, and loading method. Of course, you can also manually override these by passing `--model_type` and `--template`. You can check the supported `model_type` and templates in the [Supported Models and Datasets](../Instruction/Supported-models-and-datasets.md). + +## Model Registration + +Custom models are typically implemented using model registration. You can refer to the [built-in model](https://github.com/modelscope/ms-swift/blob/main/swift/llm/model/model/qwen.py), the [built-in dialogue template](https://github.com/modelscope/ms-swift/blob/main/swift/llm/template/template/qwen.py), or the example code in the [examples](https://github.com/modelscope/swift/blob/main/examples/custom). You can specify the `--custom_register_path xxx.py` to parse the externally registered content, which is convenient for users installing via pip instead of git clone. + +The `register_model` function registers a model in the `MODEL_MAPPING`. You can complete the model registration by calling the function `register_model(model_meta)`, where `model_meta` will store the model's metadata. The parameter list for ModelMeta is as follows: + +- model_type: Required. The model type, which is also the unique ID. +- model_groups: Required. Lists the ModelScope/HuggingFace model IDs and local paths. Running the [run_model_info.py](https://github.com/modelscope/ms-swift/blob/main/scripts/utils/run_model_info.py) file will automatically generate the [supported models documentation](https://swift.readthedocs.io/en/latest/Instruction/Supported-models-and-datasets.html) and automatically match the model_type based on the `--model` suffix. +- template: Required. The default template type when `--template` is not specified. +- get_function: Required. The loading function for the model and tokenizer/processor (for multi-modal models). LLM is typically set to `get_model_tokenizer_with_flash_attn`. +- model_arch: The model architecture. Defaults to None. Multi-modal model training requires setting this parameter to determine the prefix for llm/vit/aligner. +- architectures: The architectures item in config.json, used to automatically match the model with its model_type. Defaults to `[]`. +- additional_saved_files: Files that need to be additionally saved during full parameter training and merge-lora. Defaults to `[]`. +- torch_dtype: The default dtype when `torch_dtype` is not passed during model loading. Defaults to None, read from config.json. +- is_multimodal: Indicates whether the model is multi-modal. Defaults to False. +- ignore_patterns: File patterns to be ignored when downloading from the hub. Defaults to `[]`. + +The `register_template` function registers a dialogue template in `TEMPLATE_MAPPING`. To complete the registration of the dialogue template, simply call the function `register_template(template_meta)`, where `template_meta` will store the metadata of the template. The parameter list for TemplateMeta is as follows: + +- template_type: Required. The type of dialogue template, which also serves as a unique ID. +- prefix: Required. The prefix of the dialogue template, usually encompassing parts like system, bos_token, and is generated independently of multi-turn dialogue loops. For example, the prefix for qwen is `[]`. +- prompt: Required. Represents the dialogue portion before `{{RESPONSE}}`. We use `{{QUERY}}` as a placeholder for the user's inquiry part. For example, the prompt for qwen is `['<|im_start|>user\n{{QUERY}}<|im_end|>\n<|im_start|>assistant\n']`. +- chat_sep: Required. The separator for each turn in multi-turn dialogues. If set to None, the template does not support multi-turn dialogue. For example, the chat_sep for qwen is `['<|im_end|>\n']`. +- suffix: Defaults to `[['eos_token_id']]`. The suffix part of the dialogue template, generated independently of multi-turn dialogue loops, usually the eos_token. For example, the suffix for qwen is `['<|im_end|>']`. +- template_cls: Defaults to `Template`. Customization is generally required when defining templates for multimodal models, particularly in customizing the `_encode`, `_post_encode`, and `_data_collator` functions. +- system_prefix: Defaults to None. The prefix for dialogue templates with a system. We use`{{SYSTEM}}`as a placeholder for the system. For example, the system_prefix for qwen is`['<|im_start|>system\n{{SYSTEM}}<|im_end|>\n']`. + - Note: If the system is empty and `prefix` can be replaced by `system_prefix`, you can write `prefix` as a prefix including the system without setting `system_prefix`. + - If the prefix does not include `{{SYSTEM}}` and system_prefix is not set, the template does not support the system. +- default_system: Defaults to None. The default system used when `--system` is not provided. For example, the default_system for qwen is `'You are a helpful assistant.'`. +- stop_words: Defaults to`[]`. Additional stop words besides eos_token and`suffix[-1]`. For example, the stop_words for qwen is`['<|endoftext|>']` + - Note: During inference, the output response will be filtered by eos_token and `suffix[-1]`, but additional stop_words will be retained. diff --git a/docs/transformers/.github/ISSUE_TEMPLATE/bug-report.yml b/docs/transformers/.github/ISSUE_TEMPLATE/bug-report.yml new file mode 100644 index 0000000000000000000000000000000000000000..19e9d106dbd1d976b416c4ed0f4de402ad0a6033 --- /dev/null +++ b/docs/transformers/.github/ISSUE_TEMPLATE/bug-report.yml @@ -0,0 +1,134 @@ +name: "\U0001F41B Bug Report" +description: Submit a bug report to help us improve transformers +labels: [ "bug" ] +body: + - type: markdown + attributes: + value: | + Thanks for taking the time to fill out this bug report! 🤗 + + Before you submit your bug report: + + - If it is your first time submitting, be sure to check our [bug report guidelines](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md#did-you-find-a-bug) + - Try our [docs bot](https://huggingface.co/spaces/huggingchat/hf-docs-chat) -- it might be able to help you with your issue + + - type: textarea + id: system-info + attributes: + label: System Info + description: Please share your system info with us. You can run the command `transformers-cli env` and copy-paste its output below. + placeholder: transformers version, platform, python version, ... + validations: + required: true + + - type: textarea + id: who-can-help + attributes: + label: Who can help? + description: | + Your issue will be replied to more quickly if you can figure out the right person to tag with @ + If you know how to use git blame, that is the easiest way, otherwise, here is a rough guide of **who to tag**. + + All issues are read by one of the core maintainers, so if you don't know who to tag, just leave this blank and + a core maintainer will ping the right person. + + Please tag fewer than 3 people. + + Models: + + - text models: @ArthurZucker + - vision models: @amyeroberts, @qubvel + - speech models: @eustlb + - graph models: @clefourrier + + Library: + + - flax: @gante and @Rocketknight1 + - generate: @zucchini-nlp (visual-language models) or @gante (all others) + - pipelines: @Rocketknight1 + - tensorflow: @gante and @Rocketknight1 + - tokenizers: @ArthurZucker and @itazap + - trainer: @zach-huggingface @SunMarc + + Integrations: + + - deepspeed: HF Trainer/Accelerate: @SunMarc @zach-huggingface + - ray/raytune: @richardliaw, @amogkam + - Big Model Inference: @SunMarc + - quantization (bitsandbytes, autogpt): @SunMarc @MekkCyber + + Devices/Backends: + + - AMD ROCm: @ivarflakstad + - Intel XPU: @IlyasMoutawwakil + - Ascend NPU: @ivarflakstad + + Documentation: @stevhliu + + Model hub: + + - for issues with a model, report at https://discuss.huggingface.co/ and tag the model's creator. + + HF projects: + + - accelerate: [different repo](https://github.com/huggingface/accelerate) + - datasets: [different repo](https://github.com/huggingface/datasets) + - diffusers: [different repo](https://github.com/huggingface/diffusers) + - rust tokenizers: [different repo](https://github.com/huggingface/tokenizers) + + Maintained examples (not research project or legacy): + + - Flax: @Rocketknight1 + - PyTorch: See Models above and tag the person corresponding to the modality of the example. + - TensorFlow: @Rocketknight1 + + Research projects are not maintained and should be taken as is. + + placeholder: "@Username ..." + + - type: checkboxes + id: information-scripts-examples + attributes: + label: Information + description: 'The problem arises when using:' + options: + - label: "The official example scripts" + - label: "My own modified scripts" + + - type: checkboxes + id: information-tasks + attributes: + label: Tasks + description: "The tasks I am working on are:" + options: + - label: "An officially supported task in the `examples` folder (such as GLUE/SQuAD, ...)" + - label: "My own task or dataset (give details below)" + + - type: textarea + id: reproduction + validations: + required: true + attributes: + label: Reproduction + description: | + Please provide a code sample that reproduces the problem you ran into. It can be a Colab link or just a code snippet. + Please include relevant config information with your code, for example your Trainers, TRL, Peft, and DeepSpeed configs. + If you have code snippets, error messages, stack traces please provide them here as well. + Important! Use code tags to correctly format your code. See https://help.github.com/en/github/writing-on-github/creating-and-highlighting-code-blocks#syntax-highlighting + Do not use screenshots, as they are hard to read and (more importantly) don't allow others to copy-and-paste your code. + + placeholder: | + Steps to reproduce the behavior: + + 1. + 2. + 3. + + + - type: textarea + id: expected-behavior + validations: + required: true + attributes: + label: Expected behavior + description: "A clear and concise description of what you would expect to happen." diff --git a/docs/transformers/.github/ISSUE_TEMPLATE/config.yml b/docs/transformers/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 0000000000000000000000000000000000000000..1b9386d11221fc6be29a88ee0b77e768a4d5eb9d --- /dev/null +++ b/docs/transformers/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1,12 @@ +blank_issues_enabled: true +version: 2.1 +contact_links: + - name: Model checkpoints on the Hugging Face Hub + url: https://huggingface.co/models + about: Open a Pull request / Discussion related to a specific model checkpoint directly on the Hugging Face Hub + - name: Website Related + url: https://github.com/huggingface/hub-docs/issues + about: Feature requests and bug reports related to the website + - name: Forum + url: https://discuss.huggingface.co/ + about: General usage questions and community discussions diff --git a/docs/transformers/.github/ISSUE_TEMPLATE/migration.yml b/docs/transformers/.github/ISSUE_TEMPLATE/migration.yml new file mode 100644 index 0000000000000000000000000000000000000000..778413141b1f3b42f71c123a4d83f61bc01663e1 --- /dev/null +++ b/docs/transformers/.github/ISSUE_TEMPLATE/migration.yml @@ -0,0 +1,72 @@ +name: "\U0001F4DA Migration from pytorch-pretrained-bert or pytorch-transformers" +description: Report a problem when migrating from pytorch-pretrained-bert or pytorch-transformers to transformers +labels: [ "migration" ] +body: + - type: textarea + id: system-info + attributes: + label: System Info + description: Please share your system info with us. You can run the command `transformers-cli env` and copy-paste its output below. + render: shell + placeholder: transformers version, platform, python version, ... + validations: + required: true + + - type: checkboxes + id: information-scripts-examples + attributes: + label: Information + description: 'The problem arises when using:' + options: + - label: "The official example scripts" + - label: "My own modified scripts" + + - type: checkboxes + id: information-tasks + attributes: + label: Tasks + description: "The tasks I am working on are:" + options: + - label: "An officially supported task in the `examples` folder (such as GLUE/SQuAD, ...)" + - label: "My own task or dataset (give details below)" + + - type: textarea + id: reproduction + validations: + required: true + attributes: + label: Reproduction + description: | + Please provide a code sample that reproduces the problem you ran into. It can be a Colab link or just a code snippet. + If you have code snippets, error messages, stack traces please provide them here as well. + Important! Use code tags to correctly format your code. See https://help.github.com/en/github/writing-on-github/creating-and-highlighting-code-blocks#syntax-highlighting + Do not use screenshots, as they are hard to read and (more importantly) don't allow others to copy-and-paste your code. + + placeholder: | + Steps to reproduce the behavior: + + 1. + 2. + 3. + + + - type: textarea + id: expected-behavior + validations: + required: true + attributes: + label: Expected behavior + description: "A clear and concise description of what you would expect to happen." + render: shell + + - type: checkboxes + id: checklist + attributes: + label: Checklist + options: + - label: "I have read the migration guide in the readme. + ([pytorch-transformers](https://github.com/huggingface/transformers#migrating-from-pytorch-transformers-to-transformers); + [pytorch-pretrained-bert](https://github.com/huggingface/transformers#migrating-from-pytorch-pretrained-bert-to-transformers))" + required: true + - label: "I checked if a related official extension example runs on my machine." + required: true diff --git a/docs/transformers/.github/conda/build.sh b/docs/transformers/.github/conda/build.sh new file mode 100644 index 0000000000000000000000000000000000000000..a40f1097a8631648d5c2cecb1be4dd144cc632c2 --- /dev/null +++ b/docs/transformers/.github/conda/build.sh @@ -0,0 +1 @@ +$PYTHON setup.py install # Python command to install the script. diff --git a/docs/transformers/.github/conda/meta.yaml b/docs/transformers/.github/conda/meta.yaml new file mode 100644 index 0000000000000000000000000000000000000000..89dc353b127774528faa0c979e28215be12de6f0 --- /dev/null +++ b/docs/transformers/.github/conda/meta.yaml @@ -0,0 +1,56 @@ +{% set name = "transformers" %} + +package: + name: "{{ name|lower }}" + version: "{{ TRANSFORMERS_VERSION }}" + +source: + path: ../../ + +build: + noarch: python + +requirements: + host: + - python + - pip + - numpy >=1.17 + - dataclasses + - huggingface_hub + - packaging + - filelock + - requests + - tqdm >=4.27 + - sacremoses + - regex !=2019.12.17 + - protobuf + - tokenizers >=0.11.1,!=0.11.3,<0.13 + - pyyaml >=5.1 + - safetensors + - fsspec + run: + - python + - numpy >=1.17 + - dataclasses + - huggingface_hub + - packaging + - filelock + - requests + - tqdm >=4.27 + - sacremoses + - regex !=2019.12.17 + - protobuf + - tokenizers >=0.11.1,!=0.11.3,<0.13 + - pyyaml >=5.1 + - safetensors + - fsspec + +test: + imports: + - transformers + +about: + home: https://huggingface.co + license: Apache License 2.0 + license_file: LICENSE + summary: "🤗Transformers: State-of-the-art Natural Language Processing for Pytorch and TensorFlow 2.0." diff --git a/docs/transformers/.github/workflows/assign-reviewers.yml b/docs/transformers/.github/workflows/assign-reviewers.yml new file mode 100644 index 0000000000000000000000000000000000000000..46bcb52a169f5fb7368395ee626b3c6dbdd6ec25 --- /dev/null +++ b/docs/transformers/.github/workflows/assign-reviewers.yml @@ -0,0 +1,26 @@ +name: Assign PR Reviewers +on: + pull_request_target: + branches: + - main + types: [ready_for_review] + +jobs: + assign_reviewers: + permissions: + pull-requests: write + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.13' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install PyGithub + - name: Run assignment script + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + run: python .github/scripts/assign_reviewers.py \ No newline at end of file diff --git a/docs/transformers/.github/workflows/build-docker-images.yml b/docs/transformers/.github/workflows/build-docker-images.yml new file mode 100644 index 0000000000000000000000000000000000000000..a51b1f9f1545a410a9039720b7b7188496bfe86c --- /dev/null +++ b/docs/transformers/.github/workflows/build-docker-images.yml @@ -0,0 +1,393 @@ +name: Build docker images (scheduled) + +on: + push: + branches: + - build_ci_docker_image* + repository_dispatch: + workflow_call: + inputs: + image_postfix: + required: true + type: string + schedule: + - cron: "17 0 * * *" + +concurrency: + group: docker-images-builds + cancel-in-progress: false + +jobs: + latest-docker: + name: "Latest PyTorch + TensorFlow [dev]" + runs-on: + group: aws-general-8-plus + steps: + - + name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - + name: Check out code + uses: actions/checkout@v4 + - + name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} + - + name: Build and push + uses: docker/build-push-action@v5 + with: + context: ./docker/transformers-all-latest-gpu + build-args: | + REF=main + push: true + tags: huggingface/transformers-all-latest-gpu${{ inputs.image_postfix }} + # Push CI images still need to be re-built daily + - + name: Build and push (for Push CI) in a daily basis + # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`. + # The later case is useful for manual image building for debugging purpose. Use another tag in this case! + if: inputs.image_postfix != '-push-ci' + uses: docker/build-push-action@v5 + with: + context: ./docker/transformers-all-latest-gpu + build-args: | + REF=main + push: true + tags: huggingface/transformers-all-latest-gpu-push-ci + + - name: Post to Slack + if: always() + uses: huggingface/hf-workflows/.github/actions/post-slack@main + with: + slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }} + title: 🤗 Results of the transformers-all-latest-gpu-push-ci docker build + status: ${{ job.status }} + slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} + + latest-torch-deepspeed-docker: + name: "Latest PyTorch + DeepSpeed" + runs-on: + group: aws-g4dn-2xlarge-cache + steps: + - + name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - + name: Check out code + uses: actions/checkout@v4 + - + name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} + - + name: Build and push + uses: docker/build-push-action@v5 + with: + context: ./docker/transformers-pytorch-deepspeed-latest-gpu + build-args: | + REF=main + push: true + tags: huggingface/transformers-pytorch-deepspeed-latest-gpu${{ inputs.image_postfix }} + + - name: Post to Slack + if: always() + uses: huggingface/hf-workflows/.github/actions/post-slack@main + with: + slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER}} + title: 🤗 Results of the transformers-pytorch-deepspeed-latest-gpu docker build + status: ${{ job.status }} + slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} + + # Can't build 2 images in a single job `latest-torch-deepspeed-docker` (for `nvcr.io/nvidia`) + latest-torch-deepspeed-docker-for-push-ci-daily-build: + name: "Latest PyTorch + DeepSpeed (Push CI - Daily Build)" + runs-on: + group: aws-general-8-plus + steps: + - + name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - + name: Check out code + uses: actions/checkout@v4 + - + name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} + # Push CI images still need to be re-built daily + - + name: Build and push (for Push CI) in a daily basis + # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`. + # The later case is useful for manual image building for debugging purpose. Use another tag in this case! + if: inputs.image_postfix != '-push-ci' + uses: docker/build-push-action@v5 + with: + context: ./docker/transformers-pytorch-deepspeed-latest-gpu + build-args: | + REF=main + push: true + tags: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci + + - name: Post to Slack + if: always() + uses: huggingface/hf-workflows/.github/actions/post-slack@main + with: + slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }} + title: 🤗 Results of the transformers-pytorch-deepspeed-latest-gpu-push-ci docker build + status: ${{ job.status }} + slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} + + doc-builder: + name: "Doc builder" + # Push CI doesn't need this image + if: inputs.image_postfix != '-push-ci' + runs-on: + group: aws-general-8-plus + steps: + - + name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - + name: Check out code + uses: actions/checkout@v4 + - + name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} + - + name: Build and push + uses: docker/build-push-action@v5 + with: + context: ./docker/transformers-doc-builder + push: true + tags: huggingface/transformers-doc-builder + + - name: Post to Slack + if: always() + uses: huggingface/hf-workflows/.github/actions/post-slack@main + with: + slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }} + title: 🤗 Results of the huggingface/transformers-doc-builder docker build + status: ${{ job.status }} + slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} + + latest-pytorch: + name: "Latest PyTorch [dev]" + # Push CI doesn't need this image + if: inputs.image_postfix != '-push-ci' + runs-on: + group: aws-general-8-plus + steps: + - + name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - + name: Check out code + uses: actions/checkout@v4 + - + name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} + - + name: Build and push + uses: docker/build-push-action@v5 + with: + context: ./docker/transformers-pytorch-gpu + build-args: | + REF=main + push: true + tags: huggingface/transformers-pytorch-gpu + + - name: Post to Slack + if: always() + uses: huggingface/hf-workflows/.github/actions/post-slack@main + with: + slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }} + title: 🤗 Results of the huggingface/transformers-pytorch-gpudocker build + status: ${{ job.status }} + slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} + + latest-pytorch-amd: + name: "Latest PyTorch (AMD) [dev]" + runs-on: + group: aws-general-8-plus + steps: + - + name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - + name: Check out code + uses: actions/checkout@v4 + - + name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} + - + name: Build and push + uses: docker/build-push-action@v5 + with: + context: ./docker/transformers-pytorch-amd-gpu + build-args: | + REF=main + push: true + tags: huggingface/transformers-pytorch-amd-gpu${{ inputs.image_postfix }} + # Push CI images still need to be re-built daily + - + name: Build and push (for Push CI) in a daily basis + # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`. + # The later case is useful for manual image building for debugging purpose. Use another tag in this case! + if: inputs.image_postfix != '-push-ci' + uses: docker/build-push-action@v5 + with: + context: ./docker/transformers-pytorch-amd-gpu + build-args: | + REF=main + push: true + tags: huggingface/transformers-pytorch-amd-gpu-push-ci + + - name: Post to Slack + if: always() + uses: huggingface/hf-workflows/.github/actions/post-slack@main + with: + slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }} + title: 🤗 Results of the huggingface/transformers-pytorch-amd-gpu-push-ci build + status: ${{ job.status }} + slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} + + latest-tensorflow: + name: "Latest TensorFlow [dev]" + # Push CI doesn't need this image + if: inputs.image_postfix != '-push-ci' + runs-on: + group: aws-general-8-plus + steps: + - + name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - + name: Check out code + uses: actions/checkout@v4 + - + name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} + - + name: Build and push + uses: docker/build-push-action@v5 + with: + context: ./docker/transformers-tensorflow-gpu + build-args: | + REF=main + push: true + tags: huggingface/transformers-tensorflow-gpu + + - name: Post to Slack + if: always() + uses: huggingface/hf-workflows/.github/actions/post-slack@main + with: + slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }} + title: 🤗 Results of the huggingface/transformers-tensorflow-gpu build + status: ${{ job.status }} + slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} + + latest-pytorch-deepspeed-amd: + name: "PyTorch + DeepSpeed (AMD) [dev]" + runs-on: + group: aws-general-8-plus + steps: + - + name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - + name: Check out code + uses: actions/checkout@v4 + - + name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} + - + name: Build and push + uses: docker/build-push-action@v5 + with: + context: ./docker/transformers-pytorch-deepspeed-amd-gpu + build-args: | + REF=main + push: true + tags: huggingface/transformers-pytorch-deepspeed-amd-gpu${{ inputs.image_postfix }} + # Push CI images still need to be re-built daily + - + name: Build and push (for Push CI) in a daily basis + # This condition allows `schedule` events, or `push` events that trigger this workflow NOT via `workflow_call`. + # The later case is useful for manual image building for debugging purpose. Use another tag in this case! + if: inputs.image_postfix != '-push-ci' + uses: docker/build-push-action@v5 + with: + context: ./docker/transformers-pytorch-deepspeed-amd-gpu + build-args: | + REF=main + push: true + tags: huggingface/transformers-pytorch-deepspeed-amd-gpu-push-ci + + - name: Post to Slack + if: always() + uses: huggingface/hf-workflows/.github/actions/post-slack@main + with: + slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }} + title: 🤗 Results of the transformers-pytorch-deepspeed-amd-gpu build + status: ${{ job.status }} + slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} + + latest-quantization-torch-docker: + name: "Latest Pytorch + Quantization [dev]" + # Push CI doesn't need this image + if: inputs.image_postfix != '-push-ci' + runs-on: + group: aws-general-8-plus + steps: + - + name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - + name: Check out code + uses: actions/checkout@v4 + - + name: Login to DockerHub + uses: docker/login-action@v3 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} + - + name: Build and push + uses: docker/build-push-action@v5 + with: + context: ./docker/transformers-quantization-latest-gpu + build-args: | + REF=main + push: true + tags: huggingface/transformers-quantization-latest-gpu${{ inputs.image_postfix }} + + - name: Post to Slack + if: always() + uses: huggingface/hf-workflows/.github/actions/post-slack@main + with: + slack_channel: ${{ secrets.CI_SLACK_CHANNEL_DOCKER }} + title: 🤗 Results of the transformers-quantization-latest-gpu build + status: ${{ job.status }} + slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} diff --git a/docs/transformers/.github/workflows/build-nightly-ci-docker-images.yml b/docs/transformers/.github/workflows/build-nightly-ci-docker-images.yml new file mode 100644 index 0000000000000000000000000000000000000000..dfab083503c2ca2961b245d9be00c6d7afff3af9 --- /dev/null +++ b/docs/transformers/.github/workflows/build-nightly-ci-docker-images.yml @@ -0,0 +1,67 @@ +name: Build docker images (Nightly CI) + +on: + workflow_call: + push: + branches: + - build_nightly_ci_docker_image* + +concurrency: + group: docker-images-builds + cancel-in-progress: false + +jobs: + latest-with-torch-nightly-docker: + name: "Nightly PyTorch + Stable TensorFlow" + runs-on: + group: aws-general-8-plus + steps: + - + name: Set up Docker Buildx + uses: docker/setup-buildx-action@v2 + - + name: Check out code + uses: actions/checkout@v4 + - + name: Login to DockerHub + uses: docker/login-action@v2 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} + - + name: Build and push + uses: docker/build-push-action@v3 + with: + context: ./docker/transformers-all-latest-gpu + build-args: | + REF=main + PYTORCH=pre + push: true + tags: huggingface/transformers-all-latest-torch-nightly-gpu + + nightly-torch-deepspeed-docker: + name: "Nightly PyTorch + DeepSpeed" + runs-on: + group: aws-g4dn-2xlarge-cache + steps: + - + name: Set up Docker Buildx + uses: docker/setup-buildx-action@v2 + - + name: Check out code + uses: actions/checkout@v4 + - + name: Login to DockerHub + uses: docker/login-action@v2 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_PASSWORD }} + - + name: Build and push + uses: docker/build-push-action@v3 + with: + context: ./docker/transformers-pytorch-deepspeed-nightly-gpu + build-args: | + REF=main + push: true + tags: huggingface/transformers-pytorch-deepspeed-nightly-gpu diff --git a/docs/transformers/.github/workflows/change_pr_to_draft.yml b/docs/transformers/.github/workflows/change_pr_to_draft.yml new file mode 100644 index 0000000000000000000000000000000000000000..c8132d2f49ea5b045abb84d083bb789f8d42e1c6 --- /dev/null +++ b/docs/transformers/.github/workflows/change_pr_to_draft.yml @@ -0,0 +1,25 @@ +name: Change PR to draft + +on: + pull_request_target: + types: [opened, reopened] + +jobs: + convert_pr_to_draft: + runs-on: ubuntu-22.04 + name: Convert PR to draft + permissions: + pull-requests: write + contents: write + if: github.event.pull_request.draft == false + steps: + - name: Convert PR to draft + shell: bash + env: + PR_NUMBER: ${{ github.event.number }} + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + REPO: ${{ github.repository }} + run: | + echo $PR_NUMBER + gh pr ready $PR_NUMBER --repo $REPO --undo + gh pr comment $PR_NUMBER --repo $REPO --body "Hi 👋, thank you for opening this pull request! The pull request is converted to draft by default. The CI will be paused while the PR is in draft mode. When it is ready for review, please click the \`Ready for review\` button (at the bottom of the PR page). This will assign reviewers and trigger CI." diff --git a/docs/transformers/.github/workflows/check_failed_model_tests.yml b/docs/transformers/.github/workflows/check_failed_model_tests.yml new file mode 100644 index 0000000000000000000000000000000000000000..5963523fd76c9d5476d034ffe698c87491dc71e1 --- /dev/null +++ b/docs/transformers/.github/workflows/check_failed_model_tests.yml @@ -0,0 +1,128 @@ +name: Process failed tests + +on: + workflow_call: + inputs: + docker: + required: true + type: string + start_sha: + required: true + type: string + + +env: + HF_HOME: /mnt/cache + TRANSFORMERS_IS_CI: yes + OMP_NUM_THREADS: 8 + MKL_NUM_THREADS: 8 + RUN_SLOW: yes + # For gated repositories, we still need to agree to share information on the Hub repo. page in order to get access. + # This token is created under the bot `hf-transformers-bot`. + HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }} + SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }} + TF_FORCE_GPU_ALLOW_GROWTH: true + CUDA_VISIBLE_DEVICES: 0,1 + + +jobs: + run_models_gpu: + name: " " + runs-on: + group: aws-g4dn-2xlarge-cache + container: + image: ${{ inputs.docker }} + options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + steps: + - uses: actions/download-artifact@v4 + with: + name: ci_results_run_models_gpu + path: /transformers/ci_results_run_models_gpu + + - name: Update clone + working-directory: /transformers + run: git fetch && git checkout ${{ github.sha }} + + - name: Get target commit + working-directory: /transformers/utils + run: | + echo "END_SHA=$(TOKEN=${{ secrets.ACCESS_REPO_INFO_TOKEN }} python3 -c 'import os; from get_previous_daily_ci import get_last_daily_ci_run_commit; commit=get_last_daily_ci_run_commit(token=os.environ["TOKEN"]); print(commit)')" >> $GITHUB_ENV + + - name: Checkout to `start_sha` + working-directory: /transformers + run: git fetch && git checkout ${{ inputs.start_sha }} + + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + + - name: NVIDIA-SMI + run: | + nvidia-smi + + - name: Environment + working-directory: /transformers + run: | + python3 utils/print_env.py + + - name: Show installed libraries and their versions + working-directory: /transformers + run: pip freeze + + - name: Check failed tests + working-directory: /transformers + run: python3 utils/check_bad_commit.py --start_commit ${{ inputs.start_sha }} --end_commit ${{ env.END_SHA }} --file ci_results_run_models_gpu/new_model_failures.json --output_file new_model_failures_with_bad_commit.json + + - name: Show results + working-directory: /transformers + run: | + ls -l new_model_failures_with_bad_commit.json + cat new_model_failures_with_bad_commit.json + + - name: Checkout back + working-directory: /transformers + run: | + git checkout ${{ inputs.start_sha }} + + - name: Process report + shell: bash + working-directory: /transformers + env: + TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN: ${{ secrets.TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN }} + run: | + python3 utils/process_bad_commit_report.py + + - name: Process report + shell: bash + working-directory: /transformers + env: + TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN: ${{ secrets.TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN }} + run: | + { + echo 'REPORT_TEXT<> "$GITHUB_ENV" + + - name: Send processed report + if: ${{ !endsWith(env.REPORT_TEXT, '{}') }} + uses: slackapi/slack-github-action@6c661ce58804a1a20f6dc5fbee7f0381b469e001 + with: + # Slack channel id, channel name, or user id to post message. + # See also: https://api.slack.com/methods/chat.postMessage#channels + channel-id: '#transformers-ci-feedback-tests' + # For posting a rich message using Block Kit + payload: | + { + "blocks": [ + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": "${{ env.REPORT_TEXT }}" + } + } + ] + } + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} diff --git a/docs/transformers/.github/workflows/check_tiny_models.yml b/docs/transformers/.github/workflows/check_tiny_models.yml new file mode 100644 index 0000000000000000000000000000000000000000..a2b4846051a05460d62a86cdc45d59b1fa323487 --- /dev/null +++ b/docs/transformers/.github/workflows/check_tiny_models.yml @@ -0,0 +1,82 @@ +name: Check Tiny Models + +on: + push: + branches: + - check_tiny_models* + repository_dispatch: + schedule: + - cron: "0 2 * * *" + +env: + TOKEN: ${{ secrets.TRANSFORMERS_HUB_BOT_HF_TOKEN }} + +jobs: + check_tiny_models: + name: Check tiny models + runs-on: ubuntu-22.04 + steps: + - name: Checkout transformers + uses: actions/checkout@v4 + with: + fetch-depth: 2 + + - uses: actions/checkout@v4 + - name: Set up Python 3.8 + uses: actions/setup-python@v5 + with: + # Semantic version range syntax or exact version of a Python version + python-version: '3.8' + # Optional - x64 or x86 architecture, defaults to x64 + architecture: 'x64' + + - name: Install + run: | + sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev espeak-ng cmake + pip install --upgrade pip + python -m pip install -U .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm,video,tf-cpu] + pip install tensorflow_probability + python -m pip install -U 'natten<0.15.0' + + - name: Create all tiny models (locally) + run: | + python utils/create_dummy_models.py tiny_local_models --all --num_workers 2 + + - name: Local tiny model reports artifacts + if: ${{ always() }} + uses: actions/upload-artifact@v4 + with: + name: tiny_local_model_creation_reports + path: tiny_local_models/reports + + # GitHub-hosted runners have 2-core CPUs + - name: Run pipeline tests against all new (local) tiny models + run: | + OMP_NUM_THREADS=1 TRANSFORMERS_TINY_MODEL_PATH=tiny_local_models python -m pytest --max-worker-restart=0 -n 2 --dist=loadfile -s -rA --make-reports=tests_pipelines tests/models -m is_pipeline_test -k "test_pipeline_" | tee tests_output.txt + + - name: Test suite reports artifacts + if: ${{ always() }} + uses: actions/upload-artifact@v4 + with: + name: tiny_local_model_creation_reports + path: reports/tests_pipelines + + - name: Create + Upload tiny models for new model architecture(s) + run: | + python utils/update_tiny_models.py --num_workers 2 + + - name: Full report + run: cat tiny_models/reports/tiny_model_creation_report.json + + - name: Failure report + run: cat tiny_models/reports/simple_failed_report.txt + + - name: Summary report + run: cat tiny_models/reports/tiny_model_summary.json + + - name: New tiny model creation reports artifacts + if: ${{ always() }} + uses: actions/upload-artifact@v4 + with: + name: tiny_model_creation_reports + path: tiny_models/reports diff --git a/docs/transformers/.github/workflows/doctest_job.yml b/docs/transformers/.github/workflows/doctest_job.yml new file mode 100644 index 0000000000000000000000000000000000000000..eb62b797b8eb5575a33a6c0452563d809ce7fce7 --- /dev/null +++ b/docs/transformers/.github/workflows/doctest_job.yml @@ -0,0 +1,83 @@ +name: Doctest job + +on: + workflow_call: + inputs: + job_splits: + required: true + type: string + split_keys: + required: true + type: string + +env: + HF_HOME: /mnt/cache + TRANSFORMERS_IS_CI: yes + RUN_SLOW: yes + OMP_NUM_THREADS: 16 + MKL_NUM_THREADS: 16 + SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }} + TF_FORCE_GPU_ALLOW_GROWTH: true + +jobs: + run_doctests: + name: " " + strategy: + max-parallel: 8 # 8 jobs at a time + fail-fast: false + matrix: + split_keys: ${{ fromJson(inputs.split_keys) }} + runs-on: + group: aws-g4dn-2xlarge-cache + container: + image: huggingface/transformers-all-latest-gpu + options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + steps: + - name: Update clone + working-directory: /transformers + run: git fetch && git checkout ${{ github.sha }} + + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .[flax] + + - name: GPU visibility + working-directory: /transformers + run: | + python3 utils/print_env.py + + - name: Show installed libraries and their versions + run: pip freeze + + - name: Get doctest files + working-directory: /transformers + run: | + echo "${{ toJson(fromJson(inputs.job_splits)[matrix.split_keys]) }}" > doc_tests.txt + cat doc_tests.txt + + - name: Set `split_keys` + shell: bash + run: | + echo "${{ matrix.split_keys }}" + split_keys=${{ matrix.split_keys }} + split_keys=${split_keys//'/'/'_'} + echo "split_keys" + echo "split_keys=$split_keys" >> $GITHUB_ENV + + - name: Run doctests + working-directory: /transformers + run: | + cat doc_tests.txt + python3 -m pytest -v --make-reports doc_tests_gpu_${{ env.split_keys }} --doctest-modules $(cat doc_tests.txt) -sv --doctest-continue-on-failure --doctest-glob="*.md" + + - name: Failure short reports + if: ${{ failure() }} + continue-on-error: true + run: cat /transformers/reports/doc_tests_gpu_${{ env.split_keys }}/failures_short.txt + + - name: "Test suite reports artifacts: doc_tests_gpu_test_reports_${{ env.split_keys }}" + if: ${{ always() }} + uses: actions/upload-artifact@v4 + with: + name: doc_tests_gpu_test_reports_${{ env.split_keys }} + path: /transformers/reports/doc_tests_gpu_${{ env.split_keys }} diff --git a/docs/transformers/.github/workflows/model_jobs.yml b/docs/transformers/.github/workflows/model_jobs.yml new file mode 100644 index 0000000000000000000000000000000000000000..95584176d6ce72e4c64f41731f8c570d8cca7a6d --- /dev/null +++ b/docs/transformers/.github/workflows/model_jobs.yml @@ -0,0 +1,142 @@ +name: model jobs + +on: + workflow_call: + inputs: + folder_slices: + required: true + type: string + machine_type: + required: true + type: string + slice_id: + required: true + type: number + runner: + required: true + type: string + docker: + required: true + type: string + report_name_prefix: + required: false + default: run_models_gpu + type: string + +env: + HF_HOME: /mnt/cache + TRANSFORMERS_IS_CI: yes + OMP_NUM_THREADS: 8 + MKL_NUM_THREADS: 8 + RUN_SLOW: yes + # For gated repositories, we still need to agree to share information on the Hub repo. page in order to get access. + # This token is created under the bot `hf-transformers-bot`. + HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }} + SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }} + TF_FORCE_GPU_ALLOW_GROWTH: true + CUDA_VISIBLE_DEVICES: 0,1 + +jobs: + run_models_gpu: + name: " " + strategy: + max-parallel: 8 + fail-fast: false + matrix: + folders: ${{ fromJson(inputs.folder_slices)[inputs.slice_id] }} + runs-on: + group: '${{ inputs.machine_type }}' + container: + image: ${{ inputs.docker }} + options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + steps: + - name: Echo input and matrix info + shell: bash + run: | + echo "${{ inputs.folder_slices }}" + echo "${{ matrix.folders }}" + echo "${{ toJson(fromJson(inputs.folder_slices)[inputs.slice_id]) }}" + + - name: Echo folder ${{ matrix.folders }} + shell: bash + # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to + # set the artifact folder names (because the character `/` is not allowed). + run: | + echo "${{ matrix.folders }}" + matrix_folders=${{ matrix.folders }} + matrix_folders=${matrix_folders/'models/'/'models_'} + echo "$matrix_folders" + echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV + + - name: Update clone + working-directory: /transformers + run: git fetch && git checkout ${{ github.sha }} + + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + + - name: Update / Install some packages (for Past CI) + if: ${{ contains(inputs.docker, '-past-') }} + working-directory: /transformers + run: | + python3 -m pip install -U datasets + + - name: Update / Install some packages (for Past CI) + if: ${{ contains(inputs.docker, '-past-') && contains(inputs.docker, '-pytorch-') }} + working-directory: /transformers + run: | + python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate + + - name: NVIDIA-SMI + run: | + nvidia-smi + + - name: Environment + working-directory: /transformers + run: | + python3 utils/print_env.py + + - name: Show installed libraries and their versions + working-directory: /transformers + run: pip freeze + + - name: Set `machine_type` for report and artifact names + working-directory: /transformers + shell: bash + run: | + echo "${{ inputs.machine_type }}" + + if [ "${{ inputs.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then + machine_type=single-gpu + elif [ "${{ inputs.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then + machine_type=multi-gpu + else + machine_type=${{ inputs.machine_type }} + fi + + echo "$machine_type" + echo "machine_type=$machine_type" >> $GITHUB_ENV + + - name: Run all tests on GPU + working-directory: /transformers + run: python3 -m pytest -rsfE -v --make-reports=${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }} + + - name: Failure short reports + if: ${{ failure() }} + continue-on-error: true + run: cat /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports/failures_short.txt + + - name: Run test + shell: bash + run: | + mkdir -p /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports + echo "hello" > /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports/hello.txt + echo "${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports" + + - name: "Test suite reports artifacts: ${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports" + if: ${{ always() }} + uses: actions/upload-artifact@v4 + with: + name: ${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ env.matrix_folders }}_test_reports + path: /transformers/reports/${{ env.machine_type }}_${{ inputs.report_name_prefix }}_${{ matrix.folders }}_test_reports diff --git a/docs/transformers/.github/workflows/model_jobs_amd.yml b/docs/transformers/.github/workflows/model_jobs_amd.yml new file mode 100644 index 0000000000000000000000000000000000000000..c90181ec6f1b6df86a90f9215ff13e64e7a2f781 --- /dev/null +++ b/docs/transformers/.github/workflows/model_jobs_amd.yml @@ -0,0 +1,128 @@ +name: model jobs + +on: + workflow_call: + inputs: + folder_slices: + required: true + type: string + machine_type: + required: true + type: string + slice_id: + required: true + type: number + runner: + required: true + type: string + docker: + required: true + type: string + +env: + HF_HOME: /mnt/cache + TRANSFORMERS_IS_CI: yes + OMP_NUM_THREADS: 8 + MKL_NUM_THREADS: 8 + RUN_SLOW: yes + # For gated repositories, we still need to agree to share information on the Hub repo. page in order to get access. + # This token is created under the bot `hf-transformers-bot`. + HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }} + SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }} + TF_FORCE_GPU_ALLOW_GROWTH: true + CUDA_VISIBLE_DEVICES: 0,1 + +jobs: + run_models_gpu: + name: " " + strategy: + max-parallel: 1 # For now, not to parallelize. Can change later if it works well. + fail-fast: false + matrix: + folders: ${{ fromJson(inputs.folder_slices)[inputs.slice_id] }} + runs-on: ['${{ inputs.machine_type }}', self-hosted, amd-gpu, '${{ inputs.runner }}'] + container: + image: ${{ inputs.docker }} + options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + steps: + - name: Echo input and matrix info + shell: bash + run: | + echo "${{ inputs.folder_slices }}" + echo "${{ matrix.folders }}" + echo "${{ toJson(fromJson(inputs.folder_slices)[inputs.slice_id]) }}" + + - name: Echo folder ${{ matrix.folders }} + shell: bash + # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to + # set the artifact folder names (because the character `/` is not allowed). + run: | + echo "${{ matrix.folders }}" + matrix_folders=${{ matrix.folders }} + matrix_folders=${matrix_folders/'models/'/'models_'} + echo "$matrix_folders" + echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV + + - name: Update clone + working-directory: /transformers + run: git fetch && git checkout ${{ github.sha }} + + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + + - name: Update / Install some packages (for Past CI) + if: ${{ contains(inputs.docker, '-past-') }} + working-directory: /transformers + run: | + python3 -m pip install -U datasets + + - name: Update / Install some packages (for Past CI) + if: ${{ contains(inputs.docker, '-past-') && contains(inputs.docker, '-pytorch-') }} + working-directory: /transformers + run: | + python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate + + - name: ROCM-SMI + run: | + rocm-smi + + - name: ROCM-INFO + run: | + rocminfo | grep "Agent" -A 14 + + - name: Show ROCR environment + run: | + echo "ROCR: $ROCR_VISIBLE_DEVICES" + + - name: Environment + working-directory: /transformers + run: | + python3 utils/print_env.py + + - name: Show installed libraries and their versions + working-directory: /transformers + run: pip freeze + + - name: Run all tests on GPU + working-directory: /transformers + run: python3 -m pytest -rsfE -v --make-reports=${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }} -m "not not_device_test" + + - name: Failure short reports + if: ${{ failure() }} + continue-on-error: true + run: cat /transformers/reports/${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/failures_short.txt + + - name: Run test + shell: bash + run: | + mkdir -p /transformers/reports/${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports + echo "hello" > /transformers/reports/${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/hello.txt + echo "${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports" + + - name: "Test suite reports artifacts: ${{ inputs.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports" + if: ${{ always() }} + uses: actions/upload-artifact@v4 + with: + name: ${{ inputs.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports + path: /transformers/reports/${{ inputs.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports diff --git a/docs/transformers/.github/workflows/new_model_pr_merged_notification.yml b/docs/transformers/.github/workflows/new_model_pr_merged_notification.yml new file mode 100644 index 0000000000000000000000000000000000000000..6282528c0b74cb07c021e7d3420c1221f2b40b38 --- /dev/null +++ b/docs/transformers/.github/workflows/new_model_pr_merged_notification.yml @@ -0,0 +1,68 @@ +# Used to notify core maintainers about new model PR being merged +name: New model PR merged notification + +on: + push: + branches: + - main + paths: + - 'src/transformers/models/*/modeling_*' + +jobs: + notify_new_model: + name: Notify new model + runs-on: ubuntu-22.04 + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + - name: Check new model + shell: bash + run: | + python -m pip install gitpython + python -c 'from utils.pr_slow_ci_models import get_new_model; new_model = get_new_model(diff_with_last_commit=True); print(new_model)' | tee output.txt + echo "NEW_MODEL=$(tail -n 1 output.txt)" >> $GITHUB_ENV + echo "COMMIT_SHA=$(git log -1 --format=%H)" >> $GITHUB_ENV + + - name: print commit sha + if: ${{ env.NEW_MODEL != ''}} + shell: bash + run: | + echo "$COMMIT_SHA" + + - name: print new model + if: ${{ env.NEW_MODEL != ''}} + shell: bash + run: | + echo "$NEW_MODEL" + + - name: Notify + if: ${{ env.NEW_MODEL != ''}} + uses: slackapi/slack-github-action@6c661ce58804a1a20f6dc5fbee7f0381b469e001 + with: + # Slack channel id, channel name, or user id to post message. + # See also: https://api.slack.com/methods/chat.postMessage#channels + channel-id: transformers-new-model-notification + # For posting a rich message using Block Kit + payload: | + { + "blocks": [ + { + "type": "header", + "text": { + "type": "plain_text", + "text": "New model!", + "emoji": true + } + }, + { + "type": "section", + "text": { + "type": "mrkdwn", + "text": " GH_ArthurZucker, GH_lysandrejik, GH_ydshieh" + } + } + ] + } + env: + SLACK_BOT_TOKEN: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} \ No newline at end of file diff --git a/docs/transformers/.github/workflows/push-important-models.yml b/docs/transformers/.github/workflows/push-important-models.yml new file mode 100644 index 0000000000000000000000000000000000000000..099ded8018e93bb6c9388a1062148cae9ad6a817 --- /dev/null +++ b/docs/transformers/.github/workflows/push-important-models.yml @@ -0,0 +1,135 @@ +name: Slow tests on important models (on Push - A10) + +on: + push: + branches: [ main ] + +env: + OUTPUT_SLACK_CHANNEL_ID: "C06L2SGMEEA" + HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }} + HF_HOME: /mnt/cache + TRANSFORMERS_IS_CI: yes + OMP_NUM_THREADS: 8 + MKL_NUM_THREADS: 8 + RUN_SLOW: yes # For gated repositories, we still need to agree to share information on the Hub repo. page in order to get access. # This token is created under the bot `hf-transformers-bot`. + SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }} + TF_FORCE_GPU_ALLOW_GROWTH: true + +jobs: + get_modified_models: + name: "Get all modified files" + runs-on: ubuntu-latest + outputs: + matrix: ${{ steps.set-matrix.outputs.matrix }} + steps: + - name: Check out code + uses: actions/checkout@v4 + + - name: Get changed files + id: changed-files + uses: tj-actions/changed-files@1c8e6069583811afb28f97afeaf8e7da80c6be5c + with: + files: src/transformers/models/** + + - name: Run step if only the files listed above change + if: steps.changed-files.outputs.any_changed == 'true' + id: set-matrix + env: + ALL_CHANGED_FILES: ${{ steps.changed-files.outputs.all_changed_files }} + run: | + model_arrays=() + for file in $ALL_CHANGED_FILES; do + model_path="${file#*models/}" + model_path="models/${model_path%%/*}" + if grep -qFx "$model_path" utils/important_models.txt; then + # Append the file to the matrix string + model_arrays+=("$model_path") + fi + done + matrix_string=$(printf '"%s", ' "${model_arrays[@]}" | sed 's/, $//') + echo "matrix=[$matrix_string]" >> $GITHUB_OUTPUT + test_modified_files: + needs: get_modified_models + name: Slow & FA2 tests + runs-on: + group: aws-g5-4xlarge-cache + container: + image: huggingface/transformers-all-latest-gpu + options: --gpus all --privileged --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + if: ${{ needs.get_modified_models.outputs.matrix != '[]' && needs.get_modified_models.outputs.matrix != '' && fromJson(needs.get_modified_models.outputs.matrix)[0] != null }} + strategy: + fail-fast: false + matrix: + model-name: ${{ fromJson(needs.get_modified_models.outputs.matrix) }} + + steps: + - name: Check out code + uses: actions/checkout@v4 + + - name: Install locally transformers & other libs + run: | + apt install sudo + sudo -H pip install --upgrade pip + sudo -H pip uninstall -y transformers + sudo -H pip install -U -e ".[testing]" + MAX_JOBS=4 pip install flash-attn --no-build-isolation + pip install bitsandbytes + + - name: NVIDIA-SMI + run: | + nvidia-smi + + - name: Show installed libraries and their versions + run: pip freeze + + - name: Run FA2 tests + id: run_fa2_tests + run: + pytest -rsfE -m "flash_attn_test" --make-reports=${{ matrix.model-name }}_fa2_tests/ tests/${{ matrix.model-name }}/test_modeling_* + + - name: "Test suite reports artifacts: ${{ matrix.model-name }}_fa2_tests" + if: ${{ always() }} + uses: actions/upload-artifact@v4 + with: + name: ${{ matrix.model-name }}_fa2_tests + path: /transformers/reports/${{ matrix.model-name }}_fa2_tests + + - name: Post to Slack + if: always() + uses: huggingface/hf-workflows/.github/actions/post-slack@main + with: + slack_channel: ${{ env.OUTPUT_SLACK_CHANNEL_ID }} + title: 🤗 Results of the FA2 tests - ${{ matrix.model-name }} + status: ${{ steps.run_fa2_tests.conclusion}} + slack_token: ${{ secrets.CI_SLACK_BOT_TOKEN }} + + - name: Run integration tests + id: run_integration_tests + if: always() + run: + pytest -rsfE -k "IntegrationTest" --make-reports=tests_integration_${{ matrix.model-name }} tests/${{ matrix.model-name }}/test_modeling_* + + - name: "Test suite reports artifacts: tests_integration_${{ matrix.model-name }}" + if: ${{ always() }} + uses: actions/upload-artifact@v4 + with: + name: tests_integration_${{ matrix.model-name }} + path: /transformers/reports/tests_integration_${{ matrix.model-name }} + + - name: Post to Slack + if: always() + uses: huggingface/hf-workflows/.github/actions/post-slack@main + with: + slack_channel: ${{ env.OUTPUT_SLACK_CHANNEL_ID }} + title: 🤗 Results of the Integration tests - ${{ matrix.model-name }} + status: ${{ steps.run_integration_tests.conclusion}} + slack_token: ${{ secrets.CI_SLACK_BOT_TOKEN }} + + - name: Tailscale # In order to be able to SSH when a test fails + if: ${{ runner.debug == '1'}} + uses: huggingface/tailscale-action@v1 + with: + authkey: ${{ secrets.TAILSCALE_SSH_AUTHKEY }} + slackChannel: ${{ secrets.SLACK_CIFEEDBACK_CHANNEL }} + slackToken: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} + waitForSSH: true diff --git a/docs/transformers/.github/workflows/release-conda.yml b/docs/transformers/.github/workflows/release-conda.yml new file mode 100644 index 0000000000000000000000000000000000000000..c0e28d7a510d7fae787b7b007ac4235360a5d666 --- /dev/null +++ b/docs/transformers/.github/workflows/release-conda.yml @@ -0,0 +1,47 @@ +name: Release - Conda + +on: + push: + tags: + - v* + branches: + - conda_* + +env: + ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }} + +jobs: + build_and_package: + runs-on: ubuntu-22.04 + defaults: + run: + shell: bash -l {0} + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + + - name: Install miniconda + uses: conda-incubator/setup-miniconda@v2 + with: + auto-update-conda: true + auto-activate-base: false + python-version: 3.8 + activate-environment: "build-transformers" + channels: huggingface + + - name: Setup conda env + run: | + conda install -c defaults anaconda-client conda-build + + - name: Extract version + run: echo "TRANSFORMERS_VERSION=`python setup.py --version`" >> $GITHUB_ENV + + - name: Build conda packages + run: | + conda info + conda list + conda-build .github/conda + + - name: Upload to Anaconda + run: anaconda upload `conda-build .github/conda --output` --force diff --git a/docs/transformers/.github/workflows/self-comment-ci.yml b/docs/transformers/.github/workflows/self-comment-ci.yml new file mode 100644 index 0000000000000000000000000000000000000000..6567d9fad69c3300a0bde3abd39db728ff26d285 --- /dev/null +++ b/docs/transformers/.github/workflows/self-comment-ci.yml @@ -0,0 +1,416 @@ +name: PR comment GitHub CI + +on: + issue_comment: + types: + - created + branches-ignore: + - main +concurrency: + group: ${{ github.workflow }}-${{ github.event.issue.number }}-${{ startsWith(github.event.comment.body, 'run-slow') || startsWith(github.event.comment.body, 'run slow') || startsWith(github.event.comment.body, 'run_slow') }} + cancel-in-progress: true +permissions: read-all + +env: + HF_HOME: /mnt/cache + TRANSFORMERS_IS_CI: yes + OMP_NUM_THREADS: 8 + MKL_NUM_THREADS: 8 + RUN_SLOW: yes + # For gated repositories, we still need to agree to share information on the Hub repo. page in order to get access. + # This token is created under the bot `hf-transformers-bot`. + HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }} + SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }} + TF_FORCE_GPU_ALLOW_GROWTH: true + CUDA_VISIBLE_DEVICES: 0,1 + +jobs: + get-pr-number: + runs-on: ubuntu-22.04 + name: Get PR number + # For security: only allow team members to run + if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "qubvel", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "muellerzr", "eustlb", "MekkCyber"]'), github.actor) && (startsWith(github.event.comment.body, 'run-slow') || startsWith(github.event.comment.body, 'run slow') || startsWith(github.event.comment.body, 'run_slow')) }} + outputs: + PR_NUMBER: ${{ steps.set_pr_number.outputs.PR_NUMBER }} + steps: + - name: Get PR number + shell: bash + run: | + if [[ "${{ github.event.issue.number }}" != "" && "${{ github.event.issue.pull_request }}" != "" ]]; then + echo "PR_NUMBER=${{ github.event.issue.number }}" >> $GITHUB_ENV + else + echo "PR_NUMBER=" >> $GITHUB_ENV + fi + + - name: Check PR number + shell: bash + run: | + echo "${{ env.PR_NUMBER }}" + + - name: Set PR number + id: set_pr_number + run: echo "PR_NUMBER=${{ env.PR_NUMBER }}" >> "$GITHUB_OUTPUT" + + get-sha: + runs-on: ubuntu-22.04 + needs: get-pr-number + if: ${{ needs.get-pr-number.outputs.PR_NUMBER != ''}} + outputs: + PR_HEAD_SHA: ${{ steps.get_sha.outputs.PR_HEAD_SHA }} + PR_MERGE_SHA: ${{ steps.get_sha.outputs.PR_MERGE_SHA }} + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: "0" + ref: "refs/pull/${{needs.get-pr-number.outputs.PR_NUMBER}}/merge" + + - name: Get SHA (and verify timestamps against the issue comment date) + id: get_sha + env: + PR_NUMBER: ${{ needs.get-pr-number.outputs.PR_NUMBER }} + COMMENT_DATE: ${{ github.event.comment.created_at }} + run: | + git fetch origin refs/pull/$PR_NUMBER/head:refs/remotes/pull/$PR_NUMBER/head + git checkout refs/remotes/pull/$PR_NUMBER/head + echo "PR_HEAD_SHA: $(git log -1 --format=%H)" + echo "PR_HEAD_SHA=$(git log -1 --format=%H)" >> "$GITHUB_OUTPUT" + git fetch origin refs/pull/$PR_NUMBER/merge:refs/remotes/pull/$PR_NUMBER/merge + git checkout refs/remotes/pull/$PR_NUMBER/merge + echo "PR_MERGE_SHA: $(git log -1 --format=%H)" + echo "PR_MERGE_SHA=$(git log -1 --format=%H)" >> "$GITHUB_OUTPUT" + PR_MERGE_COMMIT_TIMESTAMP=$(git log -1 --date=unix --format=%cd) + echo "PR_MERGE_COMMIT_TIMESTAMP: $PR_MERGE_COMMIT_TIMESTAMP" + COMMENT_TIMESTAMP=$(date -d "${COMMENT_DATE}" +"%s") + echo "COMMENT_DATE: $COMMENT_DATE" + echo "COMMENT_TIMESTAMP: $COMMENT_TIMESTAMP" + if [ $COMMENT_TIMESTAMP -le $PR_MERGE_COMMIT_TIMESTAMP ]; then + echo "Last commit on the pull request is newer than the issue comment triggering this run! Abort!"; + exit -1; + fi + + # use a python script to handle this complex logic + # case 1: `run-slow` (auto. infer with limited number of models, but in particular, new model) + # case 2: `run-slow model_1, model_2` + get-tests: + runs-on: ubuntu-22.04 + needs: [get-pr-number, get-sha] + if: ${{ needs.get-pr-number.outputs.PR_NUMBER != ''}} + outputs: + models: ${{ steps.models_to_run.outputs.models }} + quantizations: ${{ steps.models_to_run.outputs.quantizations }} + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: "0" + ref: "refs/pull/${{needs.get-pr-number.outputs.PR_NUMBER}}/merge" + + - name: Verify merge commit SHA + env: + VERIFIED_PR_MERGE_SHA: ${{ needs.get-sha.outputs.PR_MERGE_SHA }} + run: | + PR_MERGE_SHA=$(git log -1 --format=%H) + if [ $PR_MERGE_SHA != $VERIFIED_PR_MERGE_SHA ]; then + echo "The merged commit SHA is not the same as the verified one! Security issue detected, abort the workflow!"; + exit -1; + fi + + - name: Get models to test + env: + PR_COMMENT: ${{ github.event.comment.body }} + run: | + python -m pip install GitPython + python utils/pr_slow_ci_models.py --message "$PR_COMMENT" | tee output.txt + echo "models=$(tail -n 1 output.txt)" >> $GITHUB_ENV + python utils/pr_slow_ci_models.py --message "$PR_COMMENT" --quantization | tee output2.txt + echo "quantizations=$(tail -n 1 output2.txt)" >> $GITHUB_ENV + + - name: Show models to test + id: models_to_run + run: | + echo "${{ env.models }}" + echo "models=${{ env.models }}" >> $GITHUB_ENV + echo "models=${{ env.models }}" >> $GITHUB_OUTPUT + echo "${{ env.quantizations }}" + echo "quantizations=${{ env.quantizations }}" >> $GITHUB_OUTPUT + + reply_to_comment: + name: Reply to the comment + if: ${{ needs.get-tests.outputs.models != '[]' || needs.get-tests.outputs.quantizations != '[]' }} + needs: [get-pr-number, get-tests] + permissions: + pull-requests: write + runs-on: ubuntu-22.04 + steps: + - name: Reply to the comment + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + MODELS: ${{ needs.get-tests.outputs.models }} + BODY: "This comment contains run-slow, running the specified jobs:\n\nmodels: ${{ needs.get-tests.outputs.models }}\nquantizations: ${{ needs.get-tests.outputs.quantizations }}" + run: | + gh api \ + --method POST \ + -H "Accept: application/vnd.github+json" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + repos/${{ github.repository }}/issues/${{ needs.get-pr-number.outputs.PR_NUMBER }}/comments \ + -f "body=This comment contains run-slow, running the specified jobs: ${{ env.BODY }} ..." + + create_run: + name: Create run + if: ${{ needs.get-tests.outputs.models != '[]' || needs.get-tests.outputs.quantizations != '[]' }} + needs: [get-sha, get-tests, reply_to_comment] + permissions: + statuses: write + runs-on: ubuntu-22.04 + steps: + - name: Create Run + id: create_run + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + # Create a commit status (pending) for a run of this workflow. The status has to be updated later in `update_run_status`. + # See https://docs.github.com/en/rest/commits/statuses?apiVersion=2022-11-28#create-a-commit-status + GITHUB_RUN_URL: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }} + run: | + gh api \ + --method POST \ + -H "Accept: application/vnd.github+json" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + repos/${{ github.repository }}/statuses/${{ needs.get-sha.outputs.PR_HEAD_SHA }} \ + -f "target_url=$GITHUB_RUN_URL" -f "state=pending" -f "description=Slow CI job" -f "context=pytest/custom-tests" + + run_models_gpu: + name: Run all tests for the model + if: ${{ needs.get-tests.outputs.models != '[]' }} + needs: [get-pr-number, get-sha, get-tests, create_run] + strategy: + fail-fast: false + matrix: + folders: ${{ fromJson(needs.get-tests.outputs.models) }} + machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache] + runs-on: + group: '${{ matrix.machine_type }}' + container: + image: huggingface/transformers-all-latest-gpu + options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + steps: + - name: Echo input and matrix info + shell: bash + run: | + echo "${{ matrix.folders }}" + + - name: Echo folder ${{ matrix.folders }} + shell: bash + # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to + # set the artifact folder names (because the character `/` is not allowed). + run: | + echo "${{ matrix.folders }}" + matrix_folders=${{ matrix.folders }} + matrix_folders=${matrix_folders/'models/'/'models_'} + echo "$matrix_folders" + echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV + + - name: Checkout to PR merge commit + working-directory: /transformers + run: | + git fetch origin refs/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge:refs/remotes/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge + git checkout refs/remotes/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge + git log -1 --format=%H + + - name: Verify merge commit SHA + env: + VERIFIED_PR_MERGE_SHA: ${{ needs.get-sha.outputs.PR_MERGE_SHA }} + working-directory: /transformers + run: | + PR_MERGE_SHA=$(git log -1 --format=%H) + if [ $PR_MERGE_SHA != $VERIFIED_PR_MERGE_SHA ]; then + echo "The merged commit SHA is not the same as the verified one! Security issue detected, abort the workflow!"; + exit -1; + fi + + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + + - name: NVIDIA-SMI + run: | + nvidia-smi + + - name: Set `machine_type` for report and artifact names + working-directory: /transformers + shell: bash + run: | + echo "${{ matrix.machine_type }}" + if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then + machine_type=single-gpu + elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then + machine_type=multi-gpu + else + machine_type=${{ matrix.machine_type }} + fi + echo "$machine_type" + echo "machine_type=$machine_type" >> $GITHUB_ENV + + - name: Environment + working-directory: /transformers + run: | + python3 utils/print_env.py + + - name: Show installed libraries and their versions + working-directory: /transformers + run: pip freeze + + - name: Run all tests on GPU + working-directory: /transformers + run: | + export CUDA_VISIBLE_DEVICES="$(python3 utils/set_cuda_devices_for_ci.py --test_folder ${{ matrix.folders }})" + echo $CUDA_VISIBLE_DEVICES + python3 -m pytest -v -rsfE --make-reports=${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }} + + - name: Failure short reports + if: ${{ failure() }} + continue-on-error: true + run: cat /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/failures_short.txt + + - name: Make sure report directory exists + shell: bash + run: | + mkdir -p /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports + echo "hello" > /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/hello.txt + echo "${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports" + + - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports" + if: ${{ always() }} + uses: actions/upload-artifact@v4 + with: + name: ${{ env.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports + path: /transformers/reports/${{ env.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports + + run_quantization_torch_gpu: + name: Run all tests for a quantization + if: ${{ needs.get-tests.outputs.quantizations != '[]' }} + needs: [get-pr-number, get-sha, get-tests, create_run] + strategy: + fail-fast: false + matrix: + folders: ${{ fromJson(needs.get-tests.outputs.quantizations) }} + machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache] + runs-on: + group: '${{ matrix.machine_type }}' + container: + image: huggingface/transformers-quantization-latest-gpu + options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + steps: + - name: Echo folder ${{ matrix.folders }} + shell: bash + run: | + echo "${{ matrix.folders }}" + matrix_folders=${{ matrix.folders }} + matrix_folders=${matrix_folders/'quantization/'/'quantization_'} + echo "$matrix_folders" + echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV + + - name: Checkout to PR merge commit + working-directory: /transformers + run: | + git fetch origin refs/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge:refs/remotes/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge + git checkout refs/remotes/pull/${{ needs.get-pr-number.outputs.PR_NUMBER }}/merge + git log -1 --format=%H + + - name: Verify merge commit SHA + env: + VERIFIED_PR_MERGE_SHA: ${{ needs.get-sha.outputs.PR_MERGE_SHA }} + working-directory: /transformers + run: | + PR_MERGE_SHA=$(git log -1 --format=%H) + if [ $PR_MERGE_SHA != $VERIFIED_PR_MERGE_SHA ]; then + echo "The merged commit SHA is not the same as the verified one! Security issue detected, abort the workflow!"; + exit -1; + fi + + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + - name: NVIDIA-SMI + run: | + nvidia-smi + + - name: Set `machine_type` for report and artifact names + working-directory: /transformers + shell: bash + run: | + echo "${{ matrix.machine_type }}" + if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then + machine_type=single-gpu + elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then + machine_type=multi-gpu + else + machine_type=${{ matrix.machine_type }} + fi + echo "$machine_type" + echo "machine_type=$machine_type" >> $GITHUB_ENV + + - name: Environment + working-directory: /transformers + run: | + python3 utils/print_env.py + + - name: Show installed libraries and their versions + working-directory: /transformers + run: pip freeze + + - name: Run quantization tests on GPU + working-directory: /transformers + run: | + python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_quantization_torch_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }} + + - name: Failure short reports + if: ${{ failure() }} + continue-on-error: true + run: cat /transformers/reports/${{ env.machine_type }}_run_quantization_torch_gpu_${{ matrix.folders }}_test_reports/failures_short.txt + + - name: Make sure report directory exists + shell: bash + run: | + mkdir -p /transformers/reports/${{ env.machine_type }}_run_quantization_gpu_${{ matrix.folders }}_test_reports + echo "hello" > /transformers/reports/${{ env.machine_type }}_run_quantization_gpu_${{ matrix.folders }}_test_reports/hello.txt + echo "${{ env.machine_type }}_run_quantization_gpu_${{ matrix.folders }}_test_reports" + + - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_quantization_torch_gpu_${{ env.matrix_folders }}_test_reports" + if: ${{ always() }} + uses: actions/upload-artifact@v4 + with: + name: ${{ env.machine_type }}_run_quantization_torch_gpu_${{ env.matrix_folders }}_test_reports + path: /transformers/reports/${{ env.machine_type }}_run_quantization_torch_gpu_${{ matrix.folders }}_test_reports + + update_run_status: + name: Update Check Run Status + needs: [get-sha, create_run, run_models_gpu, run_quantization_torch_gpu] + permissions: + statuses: write + if: ${{ always() && needs.create_run.result == 'success' }} + runs-on: ubuntu-22.04 + env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GITHUB_RUN_URL: https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }} + STATUS_OK: ${{ contains(fromJSON('["skipped", "success"]'), needs.run_models_gpu.result) && contains(fromJSON('["skipped", "success"]'), needs.run_quantization_torch_gpu.result) }} + steps: + - name: Get `run_models_gpu` job status + run: | + echo "${{ needs.run_models_gpu.result }}" + echo "${{ needs.run_quantization_torch_gpu.result }}" + echo $STATUS_OK + if [ "$STATUS_OK" = "true" ]; then + echo "STATUS=success" >> $GITHUB_ENV + else + echo "STATUS=failure" >> $GITHUB_ENV + fi + + - name: Update PR commit statuses + run: | + echo "${{ needs.run_models_gpu.result }}" + echo "${{ env.STATUS }}" + gh api \ + --method POST \ + -H "Accept: application/vnd.github+json" \ + -H "X-GitHub-Api-Version: 2022-11-28" \ + repos/${{ github.repository }}/statuses/${{ needs.get-sha.outputs.PR_HEAD_SHA }} \ + -f "target_url=$GITHUB_RUN_URL" -f "state=${{ env.STATUS }}" -f "description=Slow CI job" -f "context=pytest/custom-tests" diff --git a/docs/transformers/.github/workflows/self-nightly-past-ci-caller.yml b/docs/transformers/.github/workflows/self-nightly-past-ci-caller.yml new file mode 100644 index 0000000000000000000000000000000000000000..46d811d4a43394a86f256669b69d19cf5d109fab --- /dev/null +++ b/docs/transformers/.github/workflows/self-nightly-past-ci-caller.yml @@ -0,0 +1,99 @@ +name: Self-hosted runner (nightly-past-ci-caller) + +on: + schedule: + - cron: "17 2,14 * * *" + push: + branches: + - run_past_ci* + +jobs: + get_number: + name: Get number + runs-on: ubuntu-22.04 + outputs: + run_number: ${{ steps.get_number.outputs.run_number }} + steps: + - name: Get number + id: get_number + run: | + echo "${{ github.run_number }}" + echo "$(python3 -c 'print(int(${{ github.run_number }}) % 10)')" + echo "run_number=$(python3 -c 'print(int(${{ github.run_number }}) % 10)')" >> $GITHUB_OUTPUT + + run_past_ci_tensorflow_2-11: + name: TensorFlow 2.11 + needs: get_number + if: needs.get_number.outputs.run_number == 3 && (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')) + uses: ./.github/workflows/self-past-caller.yml + with: + framework: tensorflow + version: "2.11" + sha: ${{ github.sha }} + secrets: inherit + + run_past_ci_tensorflow_2-10: + name: TensorFlow 2.10 + needs: get_number + if: needs.get_number.outputs.run_number == 4 && (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')) + uses: ./.github/workflows/self-past-caller.yml + with: + framework: tensorflow + version: "2.10" + sha: ${{ github.sha }} + secrets: inherit + + run_past_ci_tensorflow_2-9: + name: TensorFlow 2.9 + needs: get_number + if: needs.get_number.outputs.run_number == 5 && (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')) + uses: ./.github/workflows/self-past-caller.yml + with: + framework: tensorflow + version: "2.9" + sha: ${{ github.sha }} + secrets: inherit + + run_past_ci_tensorflow_2-8: + name: TensorFlow 2.8 + needs: get_number + if: needs.get_number.outputs.run_number == 6 && (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')) + uses: ./.github/workflows/self-past-caller.yml + with: + framework: tensorflow + version: "2.8" + sha: ${{ github.sha }} + secrets: inherit + + run_past_ci_tensorflow_2-7: + name: TensorFlow 2.7 + needs: get_number + if: needs.get_number.outputs.run_number == 7 && (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')) + uses: ./.github/workflows/self-past-caller.yml + with: + framework: tensorflow + version: "2.7" + sha: ${{ github.sha }} + secrets: inherit + + run_past_ci_tensorflow_2-6: + name: TensorFlow 2.6 + needs: get_number + if: needs.get_number.outputs.run_number == 8 && (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')) + uses: ./.github/workflows/self-past-caller.yml + with: + framework: tensorflow + version: "2.6" + sha: ${{ github.sha }} + secrets: inherit + + run_past_ci_tensorflow_2-5: + name: TensorFlow 2.5 + needs: get_number + if: needs.get_number.outputs.run_number == 9 && (cancelled() != true) && ((github.event_name == 'push') && startsWith(github.ref_name, 'run_past_ci')) + uses: ./.github/workflows/self-past-caller.yml + with: + framework: tensorflow + version: "2.5" + sha: ${{ github.sha }} + secrets: inherit diff --git a/docs/transformers/.github/workflows/self-past-caller.yml b/docs/transformers/.github/workflows/self-past-caller.yml new file mode 100644 index 0000000000000000000000000000000000000000..1929a01c34d94726d1e820924933cec1412a02d6 --- /dev/null +++ b/docs/transformers/.github/workflows/self-past-caller.yml @@ -0,0 +1,40 @@ +name: Self-hosted runner (past-ci) + + +on: + workflow_call: + inputs: + framework: + required: true + type: string + version: + required: true + type: string + # Use this to control the commit to test against + sha: + default: 'main' + required: false + type: string + +jobs: + model-ci: + name: Model CI + uses: ./.github/workflows/self-scheduled.yml + with: + job: run_models_gpu + slack_report_channel: "#transformers-ci-past-future" + runner: past-ci + docker: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu + ci_event: Past CI - ${{ inputs.framework }}-${{ inputs.version }} + secrets: inherit + + deepspeed-ci: + name: DeepSpeed CI + uses: ./.github/workflows/self-scheduled.yml + with: + job: run_torch_cuda_extensions_gpu + slack_report_channel: "#transformers-ci-past-future" + runner: past-ci + docker: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu + ci_event: Past CI - ${{ inputs.framework }}-${{ inputs.version }} + secrets: inherit diff --git a/docs/transformers/.github/workflows/self-push-amd-mi210-caller.yml b/docs/transformers/.github/workflows/self-push-amd-mi210-caller.yml new file mode 100644 index 0000000000000000000000000000000000000000..45b325f7b357bf838f2f876bb7f4748ed6204ee2 --- /dev/null +++ b/docs/transformers/.github/workflows/self-push-amd-mi210-caller.yml @@ -0,0 +1,25 @@ +name: Self-hosted runner (AMD mi210 CI caller) + +on: + #workflow_run: + # workflows: ["Self-hosted runner (push-caller)"] + # branches: ["main"] + # types: [completed] + push: + branches: + - run_amd_push_ci_caller* + paths: + - "src/**" + - "tests/**" + - ".github/**" + - "templates/**" + - "utils/**" + +jobs: + run_amd_ci: + name: AMD mi210 + if: (cancelled() != true) && ((github.event_name == 'workflow_run') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_amd_push_ci_caller'))) + uses: ./.github/workflows/self-push-amd.yml + with: + gpu_flavor: mi210 + secrets: inherit diff --git a/docs/transformers/.github/workflows/self-push-amd-mi300-caller.yml b/docs/transformers/.github/workflows/self-push-amd-mi300-caller.yml new file mode 100644 index 0000000000000000000000000000000000000000..797916125a24fbaf75ae74e87eddbd46a7507154 --- /dev/null +++ b/docs/transformers/.github/workflows/self-push-amd-mi300-caller.yml @@ -0,0 +1,25 @@ +name: Self-hosted runner (AMD mi300 CI caller) + +on: + #workflow_run: + # workflows: ["Self-hosted runner (push-caller)"] + # branches: ["main"] + # types: [completed] + push: + branches: + - run_amd_push_ci_caller* + paths: + - "src/**" + - "tests/**" + - ".github/**" + - "templates/**" + - "utils/**" + +jobs: + run_amd_ci: + name: AMD mi300 + if: (cancelled() != true) && ((github.event_name == 'workflow_run') || ((github.event_name == 'push') && (startsWith(github.ref_name, 'run_amd_push_ci_caller') || startsWith(github.ref_name, 'mi300-ci')))) + uses: ./.github/workflows/self-push-amd.yml + with: + gpu_flavor: mi300 + secrets: inherit diff --git a/docs/transformers/.github/workflows/self-push-amd.yml b/docs/transformers/.github/workflows/self-push-amd.yml new file mode 100644 index 0000000000000000000000000000000000000000..621061988949d3f57aa47b2a3740603c86800733 --- /dev/null +++ b/docs/transformers/.github/workflows/self-push-amd.yml @@ -0,0 +1,334 @@ +name: Self-hosted runner AMD GPU (push) + +on: + workflow_call: + inputs: + gpu_flavor: + required: true + type: string + +env: + HF_HOME: /mnt/cache + TRANSFORMERS_IS_CI: yes + OMP_NUM_THREADS: 8 + MKL_NUM_THREADS: 8 + PYTEST_TIMEOUT: 60 + TF_FORCE_GPU_ALLOW_GROWTH: true + HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }} + +jobs: + check_runner_status: + name: Check Runner Status + runs-on: ubuntu-22.04 + steps: + - name: Checkout transformers + uses: actions/checkout@v4 + with: + fetch-depth: 2 + + - name: Check Runner Status + run: python utils/check_self_hosted_runner.py --target_runners amd-mi210-single-gpu-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }} + + check_runners: + name: Check Runners + needs: check_runner_status + strategy: + matrix: + machine_type: [single-gpu, multi-gpu] + runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] + container: + image: huggingface/transformers-pytorch-amd-gpu-push-ci # <--- We test only for PyTorch for now + options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + steps: + - name: ROCM-SMI + run: | + rocm-smi + - name: ROCM-INFO + run: | + rocminfo | grep "Agent" -A 14 + - name: Show ROCR environment + run: | + echo "ROCR: $ROCR_VISIBLE_DEVICES" + + setup_gpu: + name: Setup + needs: check_runners + strategy: + matrix: + machine_type: [single-gpu, multi-gpu] + runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] + container: + image: huggingface/transformers-pytorch-amd-gpu-push-ci # <--- We test only for PyTorch for now + options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + outputs: + matrix: ${{ steps.set-matrix.outputs.matrix }} + test_map: ${{ steps.set-matrix.outputs.test_map }} + env: + # `CI_BRANCH_PUSH`: The branch name from the push event + # `CI_BRANCH_WORKFLOW_RUN`: The name of the branch on which this workflow is triggered by `workflow_run` event + # `CI_SHA_PUSH`: The commit SHA from the push event + # `CI_SHA_WORKFLOW_RUN`: The commit SHA that triggers this workflow by `workflow_run` event + CI_BRANCH_PUSH: ${{ github.event.ref }} + CI_BRANCH_WORKFLOW_RUN: ${{ github.event.workflow_run.head_branch }} + CI_SHA_PUSH: ${{ github.event.head_commit.id }} + CI_SHA_WORKFLOW_RUN: ${{ github.event.workflow_run.head_sha }} + steps: + # Necessary to get the correct branch name and commit SHA for `workflow_run` event + # We also take into account the `push` event (we might want to test some changes in a branch) + - name: Prepare custom environment variables + shell: bash + # `CI_BRANCH`: The non-empty branch name from the above two (one and only one of them is empty) + # `CI_SHA`: The non-empty commit SHA from the above two (one and only one of them is empty) + run: | + CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''} + echo $CI_BRANCH_PUSH + echo $CI_BRANCH_WORKFLOW_RUN + echo $CI_SHA_PUSH + echo $CI_SHA_WORKFLOW_RUN + [[ ! -z "$CI_BRANCH_PUSH" ]] && echo "CI_BRANCH=$CI_BRANCH_PUSH" >> $GITHUB_ENV || echo "CI_BRANCH=$CI_BRANCH_WORKFLOW_RUN" >> $GITHUB_ENV + [[ ! -z "$CI_SHA_PUSH" ]] && echo "CI_SHA=$CI_SHA_PUSH" >> $GITHUB_ENV || echo "CI_SHA=$CI_SHA_WORKFLOW_RUN" >> $GITHUB_ENV + + - name: print environment variables + run: | + echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}" + echo "env.CI_SHA = ${{ env.CI_SHA }}" + + - name: Update clone using environment variables + working-directory: /transformers + run: | + echo "original branch = $(git branch --show-current)" + git fetch && git checkout ${{ env.CI_BRANCH }} + echo "updated branch = $(git branch --show-current)" + git checkout ${{ env.CI_SHA }} + echo "log = $(git log -n 1)" + + - name: Cleanup + working-directory: /transformers + run: | + rm -rf tests/__pycache__ + rm -rf tests/models/__pycache__ + rm -rf reports + + - name: Show installed libraries and their versions + working-directory: /transformers + run: pip freeze + + - name: Fetch the tests to run + working-directory: /transformers + # TODO: add `git-python` in the docker images + run: | + pip install --upgrade git-python + python3 utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt + + - name: Report fetched tests + uses: actions/upload-artifact@v4 + with: + name: test_fetched + path: /transformers/test_preparation.txt + + - id: set-matrix + name: Organize tests into models + working-directory: /transformers + # The `keys` is used as GitHub actions matrix for jobs, i.e. `models/bert`, `tokenization`, `pipeline`, etc. + # The `test_map` is used to get the actual identified test files under each key. + # If no test to run (so no `test_map.json` file), create a dummy map (empty matrix will fail) + run: | + if [ -f test_map.json ]; then + keys=$(python3 -c 'import json; fp = open("test_map.json"); test_map = json.load(fp); fp.close(); d = list(test_map.keys()); print(d)') + test_map=$(python3 -c 'import json; fp = open("test_map.json"); test_map = json.load(fp); fp.close(); print(test_map)') + else + keys=$(python3 -c 'keys = ["dummy"]; print(keys)') + test_map=$(python3 -c 'test_map = {"dummy": []}; print(test_map)') + fi + echo $keys + echo $test_map + echo "matrix=$keys" >> $GITHUB_OUTPUT + echo "test_map=$test_map" >> $GITHUB_OUTPUT + + run_models_gpu: + name: Model tests + needs: setup_gpu + # `dummy` means there is no test to run + if: contains(fromJson(needs.setup_gpu.outputs.matrix), 'dummy') != true + strategy: + fail-fast: false + matrix: + folders: ${{ fromJson(needs.setup_gpu.outputs.matrix) }} + machine_type: [single-gpu, multi-gpu] + runs-on: [self-hosted, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}'] + container: + image: huggingface/transformers-pytorch-amd-gpu-push-ci # <--- We test only for PyTorch for now + options: --device /dev/kfd --device /dev/dri --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + env: + # For the meaning of these environment variables, see the job `Setup` + CI_BRANCH_PUSH: ${{ github.event.ref }} + CI_BRANCH_WORKFLOW_RUN: ${{ github.event.workflow_run.head_branch }} + CI_SHA_PUSH: ${{ github.event.head_commit.id }} + CI_SHA_WORKFLOW_RUN: ${{ github.event.workflow_run.head_sha }} + steps: + # Necessary to get the correct branch name and commit SHA for `workflow_run` event + # We also take into account the `push` event (we might want to test some changes in a branch) + - name: Prepare custom environment variables + shell: bash + # For the meaning of these environment variables, see the job `Setup` + run: | + CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''} + echo $CI_BRANCH_PUSH + echo $CI_BRANCH_WORKFLOW_RUN + echo $CI_SHA_PUSH + echo $CI_SHA_WORKFLOW_RUN + [[ ! -z "$CI_BRANCH_PUSH" ]] && echo "CI_BRANCH=$CI_BRANCH_PUSH" >> $GITHUB_ENV || echo "CI_BRANCH=$CI_BRANCH_WORKFLOW_RUN" >> $GITHUB_ENV + [[ ! -z "$CI_SHA_PUSH" ]] && echo "CI_SHA=$CI_SHA_PUSH" >> $GITHUB_ENV || echo "CI_SHA=$CI_SHA_WORKFLOW_RUN" >> $GITHUB_ENV + + - name: print environment variables + run: | + echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}" + echo "env.CI_SHA = ${{ env.CI_SHA }}" + + - name: Update clone using environment variables + working-directory: /transformers + run: | + echo "original branch = $(git branch --show-current)" + git fetch && git checkout ${{ env.CI_BRANCH }} + echo "updated branch = $(git branch --show-current)" + git checkout ${{ env.CI_SHA }} + echo "log = $(git log -n 1)" + + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + + - name: Echo folder ${{ matrix.folders }} + shell: bash + # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to + # set the artifact folder names (because the character `/` is not allowed). + run: | + echo "${{ matrix.folders }}" + echo "${{ fromJson(needs.setup_gpu.outputs.test_map)[matrix.folders] }}" + matrix_folders=${{ matrix.folders }} + matrix_folders=${matrix_folders/'models/'/'models_'} + echo "$matrix_folders" + echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV + + - name: ROCM-SMI + run: | + rocm-smi + - name: ROCM-INFO + run: | + rocminfo | grep "Agent" -A 14 + - name: Show ROCR environment + run: | + echo "ROCR: $ROCR_VISIBLE_DEVICES" + + - name: Environment + working-directory: /transformers + run: | + python3 utils/print_env.py + + - name: Show installed libraries and their versions + working-directory: /transformers + run: pip freeze + + - name: Run all non-slow selected tests on GPU + working-directory: /transformers + run: | + python3 -m pytest -n 2 --dist=loadfile -v --make-reports=${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports ${{ fromJson(needs.setup_gpu.outputs.test_map)[matrix.folders] }} -m "not not_device_test" + + - name: Failure short reports + if: ${{ failure() }} + continue-on-error: true + run: cat /transformers/reports/${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports/failures_short.txt + + - name: "Test suite reports artifacts: ${{ matrix.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports" + if: ${{ always() }} + uses: actions/upload-artifact@v4 + with: + name: ${{ matrix.machine_type }}_run_models_gpu_${{ env.matrix_folders }}_test_reports + path: /transformers/reports/${{ matrix.machine_type }}_run_models_gpu_${{ matrix.folders }}_test_reports + + send_results: + name: Send results to webhook + runs-on: ubuntu-22.04 + if: always() + needs: [ + check_runner_status, + check_runners, + setup_gpu, + run_models_gpu, +# run_tests_torch_cuda_extensions_single_gpu, +# run_tests_torch_cuda_extensions_multi_gpu + ] + env: + # For the meaning of these environment variables, see the job `Setup` + CI_BRANCH_PUSH: ${{ github.event.ref }} + CI_BRANCH_WORKFLOW_RUN: ${{ github.event.workflow_run.head_branch }} + CI_SHA_PUSH: ${{ github.event.head_commit.id }} + CI_SHA_WORKFLOW_RUN: ${{ github.event.workflow_run.head_sha }} + steps: + - name: Preliminary job status + shell: bash + # For the meaning of these environment variables, see the job `Setup` + run: | + echo "Runner availability: ${{ needs.check_runner_status.result }}" + echo "Setup status: ${{ needs.setup_gpu.result }}" + echo "Runner status: ${{ needs.check_runners.result }}" + + # Necessary to get the correct branch name and commit SHA for `workflow_run` event + # We also take into account the `push` event (we might want to test some changes in a branch) + - name: Prepare custom environment variables + shell: bash + # For the meaning of these environment variables, see the job `Setup` + run: | + CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''} + echo $CI_BRANCH_PUSH + echo $CI_BRANCH_WORKFLOW_RUN + echo $CI_SHA_PUSH + echo $CI_SHA_WORKFLOW_RUN + [[ ! -z "$CI_BRANCH_PUSH" ]] && echo "CI_BRANCH=$CI_BRANCH_PUSH" >> $GITHUB_ENV || echo "CI_BRANCH=$CI_BRANCH_WORKFLOW_RUN" >> $GITHUB_ENV + [[ ! -z "$CI_SHA_PUSH" ]] && echo "CI_SHA=$CI_SHA_PUSH" >> $GITHUB_ENV || echo "CI_SHA=$CI_SHA_WORKFLOW_RUN" >> $GITHUB_ENV + + - name: print environment variables + run: | + echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}" + echo "env.CI_SHA = ${{ env.CI_SHA }}" + + - uses: actions/checkout@v4 + # To avoid failure when multiple commits are merged into `main` in a short period of time. + # Checking out to an old commit beyond the fetch depth will get an error `fatal: reference is not a tree: ... + # (Only required for `workflow_run` event, where we get the latest HEAD on `main` instead of the event commit) + with: + fetch-depth: 20 + + - name: Update clone using environment variables + run: | + echo "original branch = $(git branch --show-current)" + git fetch && git checkout ${{ env.CI_BRANCH }} + echo "updated branch = $(git branch --show-current)" + git checkout ${{ env.CI_SHA }} + echo "log = $(git log -n 1)" + + - uses: actions/download-artifact@v4 + - name: Send message to Slack + env: + CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }} + CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }} + CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }} + CI_SLACK_CHANNEL_ID_AMD: ${{ secrets.CI_SLACK_CHANNEL_ID_AMD }} + CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }} + CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_AMD }} + ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }} + CI_EVENT: Push CI (AMD) - ${{ inputs.gpu_flavor }} + CI_TITLE_PUSH: ${{ github.event.head_commit.message }} + CI_TITLE_WORKFLOW_RUN: ${{ github.event.workflow_run.head_commit.message }} + CI_SHA: ${{ env.CI_SHA }} + RUNNER_STATUS: ${{ needs.check_runner_status.result }} + RUNNER_ENV_STATUS: ${{ needs.check_runners.result }} + SETUP_STATUS: ${{ needs.setup_gpu.result }} + + # We pass `needs.setup_gpu.outputs.matrix` as the argument. A processing in `notification_service.py` to change + # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`. + run: | + pip install huggingface_hub + pip install slack_sdk + pip show slack_sdk + python utils/notification_service.py "${{ needs.setup_gpu.outputs.matrix }}" diff --git a/docs/transformers/.github/workflows/self-push.yml b/docs/transformers/.github/workflows/self-push.yml new file mode 100644 index 0000000000000000000000000000000000000000..3b3be41e3e9b561b0b038ae4ac74c74d9ca5d4ce --- /dev/null +++ b/docs/transformers/.github/workflows/self-push.yml @@ -0,0 +1,652 @@ +name: Self-hosted runner (push) + +on: + workflow_run: + workflows: ["Self-hosted runner (push-caller)"] + branches: ["main"] + types: [completed] + push: + branches: + - ci_* + - ci-* + paths: + - "src/**" + - "tests/**" + - ".github/**" + - "templates/**" + - "utils/**" + repository_dispatch: + +env: + HF_HOME: /mnt/cache + TRANSFORMERS_IS_CI: yes + OMP_NUM_THREADS: 8 + MKL_NUM_THREADS: 8 + PYTEST_TIMEOUT: 60 + TF_FORCE_GPU_ALLOW_GROWTH: true + CUDA_VISIBLE_DEVICES: 0,1 + +jobs: + setup: + name: Setup + strategy: + matrix: + machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache] + runs-on: + group: '${{ matrix.machine_type }}' + container: + image: huggingface/transformers-all-latest-gpu-push-ci + options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + outputs: + matrix: ${{ steps.set-matrix.outputs.matrix }} + test_map: ${{ steps.set-matrix.outputs.test_map }} + env: + # `CI_BRANCH_PUSH`: The branch name from the push event + # `CI_BRANCH_WORKFLOW_RUN`: The name of the branch on which this workflow is triggered by `workflow_run` event + # `CI_SHA_PUSH`: The commit SHA from the push event + # `CI_SHA_WORKFLOW_RUN`: The commit SHA that triggers this workflow by `workflow_run` event + CI_BRANCH_PUSH: ${{ github.event.ref }} + CI_BRANCH_WORKFLOW_RUN: ${{ github.event.workflow_run.head_branch }} + CI_SHA_PUSH: ${{ github.event.head_commit.id }} + CI_SHA_WORKFLOW_RUN: ${{ github.event.workflow_run.head_sha }} + steps: + # Necessary to get the correct branch name and commit SHA for `workflow_run` event + # We also take into account the `push` event (we might want to test some changes in a branch) + - name: Prepare custom environment variables + shell: bash + # `CI_BRANCH`: The non-empty branch name from the above two (one and only one of them is empty) + # `CI_SHA`: The non-empty commit SHA from the above two (one and only one of them is empty) + run: | + CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''} + echo $CI_BRANCH_PUSH + echo $CI_BRANCH_WORKFLOW_RUN + echo $CI_SHA_PUSH + echo $CI_SHA_WORKFLOW_RUN + [[ ! -z "$CI_BRANCH_PUSH" ]] && echo "CI_BRANCH=$CI_BRANCH_PUSH" >> $GITHUB_ENV || echo "CI_BRANCH=$CI_BRANCH_WORKFLOW_RUN" >> $GITHUB_ENV + [[ ! -z "$CI_SHA_PUSH" ]] && echo "CI_SHA=$CI_SHA_PUSH" >> $GITHUB_ENV || echo "CI_SHA=$CI_SHA_WORKFLOW_RUN" >> $GITHUB_ENV + + - name: print environment variables + run: | + echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}" + echo "env.CI_SHA = ${{ env.CI_SHA }}" + + - name: Update clone using environment variables + working-directory: /transformers + run: | + echo "original branch = $(git branch --show-current)" + git fetch && git checkout ${{ env.CI_BRANCH }} + echo "updated branch = $(git branch --show-current)" + git checkout ${{ env.CI_SHA }} + echo "log = $(git log -n 1)" + + - name: Cleanup + working-directory: /transformers + run: | + rm -rf tests/__pycache__ + rm -rf tests/models/__pycache__ + rm -rf reports + + - name: Show installed libraries and their versions + working-directory: /transformers + run: pip freeze + + - name: Fetch the tests to run + working-directory: /transformers + # TODO: add `git-python` in the docker images + run: | + pip install --upgrade git-python + python3 utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt + + - name: Report fetched tests + uses: actions/upload-artifact@v4 + with: + name: test_fetched + path: /transformers/test_preparation.txt + + - id: set-matrix + name: Organize tests into models + working-directory: /transformers + # The `keys` is used as GitHub actions matrix for jobs, i.e. `models/bert`, `tokenization`, `pipeline`, etc. + # The `test_map` is used to get the actual identified test files under each key. + # If no test to run (so no `test_map.json` file), create a dummy map (empty matrix will fail) + run: | + if [ -f test_map.json ]; then + keys=$(python3 -c 'import json; fp = open("test_map.json"); test_map = json.load(fp); fp.close(); d = list(test_map.keys()); print(d)') + test_map=$(python3 -c 'import json; fp = open("test_map.json"); test_map = json.load(fp); fp.close(); print(test_map)') + else + keys=$(python3 -c 'keys = ["dummy"]; print(keys)') + test_map=$(python3 -c 'test_map = {"dummy": []}; print(test_map)') + fi + echo $keys + echo $test_map + echo "matrix=$keys" >> $GITHUB_OUTPUT + echo "test_map=$test_map" >> $GITHUB_OUTPUT + + run_tests_single_gpu: + name: Model tests + needs: setup + # `dummy` means there is no test to run + if: contains(fromJson(needs.setup.outputs.matrix), 'dummy') != true + strategy: + fail-fast: false + matrix: + folders: ${{ fromJson(needs.setup.outputs.matrix) }} + machine_type: [aws-g4dn-2xlarge-cache] + runs-on: + group: '${{ matrix.machine_type }}' + container: + image: huggingface/transformers-all-latest-gpu-push-ci + options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + env: + # For the meaning of these environment variables, see the job `Setup` + CI_BRANCH_PUSH: ${{ github.event.ref }} + CI_BRANCH_WORKFLOW_RUN: ${{ github.event.workflow_run.head_branch }} + CI_SHA_PUSH: ${{ github.event.head_commit.id }} + CI_SHA_WORKFLOW_RUN: ${{ github.event.workflow_run.head_sha }} + steps: + # Necessary to get the correct branch name and commit SHA for `workflow_run` event + # We also take into account the `push` event (we might want to test some changes in a branch) + - name: Prepare custom environment variables + shell: bash + # For the meaning of these environment variables, see the job `Setup` + run: | + CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''} + echo $CI_BRANCH_PUSH + echo $CI_BRANCH_WORKFLOW_RUN + echo $CI_SHA_PUSH + echo $CI_SHA_WORKFLOW_RUN + [[ ! -z "$CI_BRANCH_PUSH" ]] && echo "CI_BRANCH=$CI_BRANCH_PUSH" >> $GITHUB_ENV || echo "CI_BRANCH=$CI_BRANCH_WORKFLOW_RUN" >> $GITHUB_ENV + [[ ! -z "$CI_SHA_PUSH" ]] && echo "CI_SHA=$CI_SHA_PUSH" >> $GITHUB_ENV || echo "CI_SHA=$CI_SHA_WORKFLOW_RUN" >> $GITHUB_ENV + + - name: print environment variables + run: | + echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}" + echo "env.CI_SHA = ${{ env.CI_SHA }}" + + - name: Set `machine_type` for report and artifact names + working-directory: /transformers + shell: bash + run: | + echo "${{ matrix.machine_type }}" + + if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then + machine_type=single-gpu + elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then + machine_type=multi-gpu + else + machine_type=${{ matrix.machine_type }} + fi + + echo "$machine_type" + echo "machine_type=$machine_type" >> $GITHUB_ENV + + - name: Update clone using environment variables + working-directory: /transformers + run: | + echo "original branch = $(git branch --show-current)" + git fetch && git checkout ${{ env.CI_BRANCH }} + echo "updated branch = $(git branch --show-current)" + git checkout ${{ env.CI_SHA }} + echo "log = $(git log -n 1)" + + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + + - name: Echo folder ${{ matrix.folders }} + shell: bash + # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to + # set the artifact folder names (because the character `/` is not allowed). + run: | + echo "${{ matrix.folders }}" + echo "${{ fromJson(needs.setup.outputs.test_map)[matrix.folders] }}" + matrix_folders=${{ matrix.folders }} + matrix_folders=${matrix_folders/'models/'/'models_'} + echo "$matrix_folders" + echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV + + - name: NVIDIA-SMI + run: | + nvidia-smi + + - name: Environment + working-directory: /transformers + run: | + python3 utils/print_env.py + + - name: Show installed libraries and their versions + working-directory: /transformers + run: pip freeze + + - name: Run all non-slow selected tests on GPU + working-directory: /transformers + run: | + python3 -m pytest -n 2 --dist=loadfile -v --make-reports=${{ env.machine_type }}_tests_gpu_${{ matrix.folders }} ${{ fromJson(needs.setup.outputs.test_map)[matrix.folders] }} + + - name: Failure short reports + if: ${{ failure() }} + continue-on-error: true + run: cat /transformers/reports/${{ env.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt + + - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports" + if: ${{ always() }} + uses: actions/upload-artifact@v4 + with: + name: ${{ env.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports + path: /transformers/reports/${{ env.machine_type }}_tests_gpu_${{ matrix.folders }} + + run_tests_multi_gpu: + name: Model tests + needs: setup + # `dummy` means there is no test to run + if: contains(fromJson(needs.setup.outputs.matrix), 'dummy') != true + strategy: + fail-fast: false + matrix: + folders: ${{ fromJson(needs.setup.outputs.matrix) }} + machine_type: [aws-g4dn-12xlarge-cache] + runs-on: + group: '${{ matrix.machine_type }}' + container: + image: huggingface/transformers-all-latest-gpu-push-ci + options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + env: + # For the meaning of these environment variables, see the job `Setup` + CI_BRANCH_PUSH: ${{ github.event.ref }} + CI_BRANCH_WORKFLOW_RUN: ${{ github.event.workflow_run.head_branch }} + CI_SHA_PUSH: ${{ github.event.head_commit.id }} + CI_SHA_WORKFLOW_RUN: ${{ github.event.workflow_run.head_sha }} + steps: + # Necessary to get the correct branch name and commit SHA for `workflow_run` event + # We also take into account the `push` event (we might want to test some changes in a branch) + - name: Prepare custom environment variables + shell: bash + # For the meaning of these environment variables, see the job `Setup` + run: | + CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''} + echo $CI_BRANCH_PUSH + echo $CI_BRANCH_WORKFLOW_RUN + echo $CI_SHA_PUSH + echo $CI_SHA_WORKFLOW_RUN + [[ ! -z "$CI_BRANCH_PUSH" ]] && echo "CI_BRANCH=$CI_BRANCH_PUSH" >> $GITHUB_ENV || echo "CI_BRANCH=$CI_BRANCH_WORKFLOW_RUN" >> $GITHUB_ENV + [[ ! -z "$CI_SHA_PUSH" ]] && echo "CI_SHA=$CI_SHA_PUSH" >> $GITHUB_ENV || echo "CI_SHA=$CI_SHA_WORKFLOW_RUN" >> $GITHUB_ENV + + - name: print environment variables + run: | + echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}" + echo "env.CI_SHA = ${{ env.CI_SHA }}" + + - name: Set `machine_type` for report and artifact names + working-directory: /transformers + shell: bash + run: | + echo "${{ matrix.machine_type }}" + + if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then + machine_type=single-gpu + elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then + machine_type=multi-gpu + else + machine_type=${{ matrix.machine_type }} + fi + + echo "$machine_type" + echo "machine_type=$machine_type" >> $GITHUB_ENV + + - name: Update clone using environment variables + working-directory: /transformers + run: | + echo "original branch = $(git branch --show-current)" + git fetch && git checkout ${{ env.CI_BRANCH }} + echo "updated branch = $(git branch --show-current)" + git checkout ${{ env.CI_SHA }} + echo "log = $(git log -n 1)" + + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + + - name: Echo folder ${{ matrix.folders }} + shell: bash + # For folders like `models/bert`, set an env. var. (`matrix_folders`) to `models_bert`, which will be used to + # set the artifact folder names (because the character `/` is not allowed). + run: | + echo "${{ matrix.folders }}" + echo "${{ fromJson(needs.setup.outputs.test_map)[matrix.folders] }}" + matrix_folders=${{ matrix.folders }} + matrix_folders=${matrix_folders/'models/'/'models_'} + echo "$matrix_folders" + echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV + + - name: NVIDIA-SMI + run: | + nvidia-smi + + - name: Environment + working-directory: /transformers + run: | + python3 utils/print_env.py + + - name: Show installed libraries and their versions + working-directory: /transformers + run: pip freeze + + - name: Run all non-slow selected tests on GPU + env: + MKL_SERVICE_FORCE_INTEL: 1 + working-directory: /transformers + run: | + python3 -m pytest -n 2 --dist=loadfile -v --make-reports=${{ env.machine_type }}_tests_gpu_${{ matrix.folders }} ${{ fromJson(needs.setup.outputs.test_map)[matrix.folders] }} + + - name: Failure short reports + if: ${{ failure() }} + continue-on-error: true + run: cat /transformers/reports/${{ env.machine_type }}_tests_gpu_${{ matrix.folders }}/failures_short.txt + + - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports" + if: ${{ always() }} + uses: actions/upload-artifact@v4 + with: + name: ${{ env.machine_type }}_run_all_tests_gpu_${{ env.matrix_folders }}_test_reports + path: /transformers/reports/${{ env.machine_type }}_tests_gpu_${{ matrix.folders }} + + run_tests_torch_cuda_extensions_single_gpu: + name: Torch CUDA extension tests + needs: setup + if: contains(fromJson(needs.setup.outputs.matrix), 'deepspeed') || contains(fromJson(needs.setup.outputs.matrix), 'extended') + strategy: + fail-fast: false + matrix: + machine_type: [aws-g4dn-2xlarge-cache] + runs-on: + group: '${{ matrix.machine_type }}' + container: + image: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci + options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + env: + # For the meaning of these environment variables, see the job `Setup` + CI_BRANCH_PUSH: ${{ github.event.ref }} + CI_BRANCH_WORKFLOW_RUN: ${{ github.event.workflow_run.head_branch }} + CI_SHA_PUSH: ${{ github.event.head_commit.id }} + CI_SHA_WORKFLOW_RUN: ${{ github.event.workflow_run.head_sha }} + steps: + # Necessary to get the correct branch name and commit SHA for `workflow_run` event + # We also take into account the `push` event (we might want to test some changes in a branch) + - name: Prepare custom environment variables + shell: bash + # For the meaning of these environment variables, see the job `Setup` + run: | + CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''} + echo $CI_BRANCH_PUSH + echo $CI_BRANCH_WORKFLOW_RUN + echo $CI_SHA_PUSH + echo $CI_SHA_WORKFLOW_RUN + [[ ! -z "$CI_BRANCH_PUSH" ]] && echo "CI_BRANCH=$CI_BRANCH_PUSH" >> $GITHUB_ENV || echo "CI_BRANCH=$CI_BRANCH_WORKFLOW_RUN" >> $GITHUB_ENV + [[ ! -z "$CI_SHA_PUSH" ]] && echo "CI_SHA=$CI_SHA_PUSH" >> $GITHUB_ENV || echo "CI_SHA=$CI_SHA_WORKFLOW_RUN" >> $GITHUB_ENV + + - name: print environment variables + run: | + echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}" + echo "env.CI_SHA = ${{ env.CI_SHA }}" + + - name: Set `machine_type` for report and artifact names + working-directory: /workspace/transformers + shell: bash + run: | + echo "${{ matrix.machine_type }}" + + if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then + machine_type=single-gpu + elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then + machine_type=multi-gpu + else + machine_type=${{ matrix.machine_type }} + fi + + echo "$machine_type" + echo "machine_type=$machine_type" >> $GITHUB_ENV + + - name: Update clone using environment variables + working-directory: /workspace/transformers + run: | + echo "original branch = $(git branch --show-current)" + git fetch && git checkout ${{ env.CI_BRANCH }} + echo "updated branch = $(git branch --show-current)" + git checkout ${{ env.CI_SHA }} + echo "log = $(git log -n 1)" + + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /workspace/transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + + - name: Remove cached torch extensions + run: rm -rf /github/home/.cache/torch_extensions/ + + # To avoid unknown test failures + - name: Pre build DeepSpeed *again* + working-directory: /workspace + run: | + python3 -m pip uninstall -y deepspeed + DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check + + - name: NVIDIA-SMI + run: | + nvidia-smi + + - name: Environment + working-directory: /workspace/transformers + run: | + python utils/print_env.py + + - name: Show installed libraries and their versions + working-directory: /workspace/transformers + run: pip freeze + + - name: Run all non-slow selected tests on GPU + working-directory: /workspace/transformers + # TODO: Here we pass all tests in the 2 folders for simplicity. It's better to pass only the identified tests. + run: | + python -m pytest -n 1 --dist=loadfile -v --make-reports=${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports tests/deepspeed tests/extended + + - name: Failure short reports + if: ${{ failure() }} + continue-on-error: true + run: cat /workspace/transformers/reports/${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports/failures_short.txt + + - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports" + if: ${{ always() }} + uses: actions/upload-artifact@v4 + with: + name: ${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports + path: /workspace/transformers/reports/${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports + + run_tests_torch_cuda_extensions_multi_gpu: + name: Torch CUDA extension tests + needs: setup + if: contains(fromJson(needs.setup.outputs.matrix), 'deepspeed') || contains(fromJson(needs.setup.outputs.matrix), 'extended') + strategy: + fail-fast: false + matrix: + machine_type: [aws-g4dn-12xlarge-cache] + runs-on: + group: '${{ matrix.machine_type }}' + container: + image: huggingface/transformers-pytorch-deepspeed-latest-gpu-push-ci + options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + env: + # For the meaning of these environment variables, see the job `Setup` + CI_BRANCH_PUSH: ${{ github.event.ref }} + CI_BRANCH_WORKFLOW_RUN: ${{ github.event.workflow_run.head_branch }} + CI_SHA_PUSH: ${{ github.event.head_commit.id }} + CI_SHA_WORKFLOW_RUN: ${{ github.event.workflow_run.head_sha }} + steps: + # Necessary to get the correct branch name and commit SHA for `workflow_run` event + # We also take into account the `push` event (we might want to test some changes in a branch) + - name: Prepare custom environment variables + shell: bash + # For the meaning of these environment variables, see the job `Setup` + run: | + CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''} + echo $CI_BRANCH_PUSH + echo $CI_BRANCH_WORKFLOW_RUN + echo $CI_SHA_PUSH + echo $CI_SHA_WORKFLOW_RUN + [[ ! -z "$CI_BRANCH_PUSH" ]] && echo "CI_BRANCH=$CI_BRANCH_PUSH" >> $GITHUB_ENV || echo "CI_BRANCH=$CI_BRANCH_WORKFLOW_RUN" >> $GITHUB_ENV + [[ ! -z "$CI_SHA_PUSH" ]] && echo "CI_SHA=$CI_SHA_PUSH" >> $GITHUB_ENV || echo "CI_SHA=$CI_SHA_WORKFLOW_RUN" >> $GITHUB_ENV + + - name: print environment variables + run: | + echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}" + echo "env.CI_SHA = ${{ env.CI_SHA }}" + + - name: Set `machine_type` for report and artifact names + working-directory: /workspace/transformers + shell: bash + run: | + echo "${{ matrix.machine_type }}" + + if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then + machine_type=single-gpu + elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then + machine_type=multi-gpu + else + machine_type=${{ matrix.machine_type }} + fi + + echo "$machine_type" + echo "machine_type=$machine_type" >> $GITHUB_ENV + + - name: Update clone using environment variables + working-directory: /workspace/transformers + run: | + echo "original branch = $(git branch --show-current)" + git fetch && git checkout ${{ env.CI_BRANCH }} + echo "updated branch = $(git branch --show-current)" + git checkout ${{ env.CI_SHA }} + echo "log = $(git log -n 1)" + + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /workspace/transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + + - name: Remove cached torch extensions + run: rm -rf /github/home/.cache/torch_extensions/ + + # To avoid unknown test failures + - name: Pre build DeepSpeed *again* + working-directory: /workspace + run: | + python3 -m pip uninstall -y deepspeed + DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check + + - name: NVIDIA-SMI + run: | + nvidia-smi + + - name: Environment + working-directory: /workspace/transformers + run: | + python utils/print_env.py + + - name: Show installed libraries and their versions + working-directory: /workspace/transformers + run: pip freeze + + - name: Run all non-slow selected tests on GPU + working-directory: /workspace/transformers + # TODO: Here we pass all tests in the 2 folders for simplicity. It's better to pass only the identified tests. + run: | + python -m pytest -n 1 --dist=loadfile -v --make-reports=${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports tests/deepspeed tests/extended + + - name: Failure short reports + if: ${{ failure() }} + continue-on-error: true + run: cat /workspace/transformers/reports/${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports/failures_short.txt + + - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports" + if: ${{ always() }} + uses: actions/upload-artifact@v4 + with: + name: ${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports + path: /workspace/transformers/reports/${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports + + send_results: + name: Send results to webhook + runs-on: ubuntu-22.04 + if: always() + needs: [ + setup, + run_tests_single_gpu, + run_tests_multi_gpu, + run_tests_torch_cuda_extensions_single_gpu, + run_tests_torch_cuda_extensions_multi_gpu + ] + env: + # For the meaning of these environment variables, see the job `Setup` + CI_BRANCH_PUSH: ${{ github.event.ref }} + CI_BRANCH_WORKFLOW_RUN: ${{ github.event.workflow_run.head_branch }} + CI_SHA_PUSH: ${{ github.event.head_commit.id }} + CI_SHA_WORKFLOW_RUN: ${{ github.event.workflow_run.head_sha }} + steps: + - name: Preliminary job status + shell: bash + # For the meaning of these environment variables, see the job `Setup` + run: | + echo "Setup status: ${{ needs.setup.result }}" + + # Necessary to get the correct branch name and commit SHA for `workflow_run` event + # We also take into account the `push` event (we might want to test some changes in a branch) + - name: Prepare custom environment variables + shell: bash + # For the meaning of these environment variables, see the job `Setup` + run: | + CI_BRANCH_PUSH=${CI_BRANCH_PUSH/'refs/heads/'/''} + echo $CI_BRANCH_PUSH + echo $CI_BRANCH_WORKFLOW_RUN + echo $CI_SHA_PUSH + echo $CI_SHA_WORKFLOW_RUN + [[ ! -z "$CI_BRANCH_PUSH" ]] && echo "CI_BRANCH=$CI_BRANCH_PUSH" >> $GITHUB_ENV || echo "CI_BRANCH=$CI_BRANCH_WORKFLOW_RUN" >> $GITHUB_ENV + [[ ! -z "$CI_SHA_PUSH" ]] && echo "CI_SHA=$CI_SHA_PUSH" >> $GITHUB_ENV || echo "CI_SHA=$CI_SHA_WORKFLOW_RUN" >> $GITHUB_ENV + + - name: print environment variables + run: | + echo "env.CI_BRANCH = ${{ env.CI_BRANCH }}" + echo "env.CI_SHA = ${{ env.CI_SHA }}" + + - uses: actions/checkout@v4 + # To avoid failure when multiple commits are merged into `main` in a short period of time. + # Checking out to an old commit beyond the fetch depth will get an error `fatal: reference is not a tree: ... + # (Only required for `workflow_run` event, where we get the latest HEAD on `main` instead of the event commit) + with: + fetch-depth: 20 + + - name: Update clone using environment variables + run: | + echo "original branch = $(git branch --show-current)" + git fetch && git checkout ${{ env.CI_BRANCH }} + echo "updated branch = $(git branch --show-current)" + git checkout ${{ env.CI_SHA }} + echo "log = $(git log -n 1)" + + - uses: actions/download-artifact@v4 + - name: Send message to Slack + env: + CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }} + CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }} + CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }} + CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }} + CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }} + ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }} + CI_EVENT: push + CI_TITLE_PUSH: ${{ github.event.head_commit.message }} + CI_TITLE_WORKFLOW_RUN: ${{ github.event.workflow_run.head_commit.message }} + CI_SHA: ${{ env.CI_SHA }} + SETUP_STATUS: ${{ needs.setup.result }} + + # We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change + # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`. + run: | + pip install huggingface_hub + pip install slack_sdk + pip show slack_sdk + python utils/notification_service.py "${{ needs.setup.outputs.matrix }}" diff --git a/docs/transformers/.github/workflows/self-scheduled-amd-caller.yml b/docs/transformers/.github/workflows/self-scheduled-amd-caller.yml new file mode 100644 index 0000000000000000000000000000000000000000..dc5c7b7e905bd88424452ab95da23af096bffa26 --- /dev/null +++ b/docs/transformers/.github/workflows/self-scheduled-amd-caller.yml @@ -0,0 +1,14 @@ +name: Self-hosted runner (AMD scheduled CI caller) + +on: + schedule: + - cron: "17 2 * * *" + +jobs: + run_scheduled_amd_ci: + name: Trigger Scheduled AMD CI + runs-on: ubuntu-22.04 + if: ${{ always() }} + steps: + - name: Trigger scheduled AMD CI via workflow_run + run: echo "Trigger scheduled AMD CI via workflow_run" diff --git a/docs/transformers/.github/workflows/self-scheduled-amd-mi210-caller.yml b/docs/transformers/.github/workflows/self-scheduled-amd-mi210-caller.yml new file mode 100644 index 0000000000000000000000000000000000000000..6109faca00932e9a2fdb11c1098aba1d7f7a8a5b --- /dev/null +++ b/docs/transformers/.github/workflows/self-scheduled-amd-mi210-caller.yml @@ -0,0 +1,55 @@ +name: Self-hosted runner (AMD mi210 scheduled CI caller) + +on: + workflow_run: + workflows: ["Self-hosted runner (AMD scheduled CI caller)"] + branches: ["main"] + types: [completed] + push: + branches: + - run_amd_scheduled_ci_caller* + +jobs: + model-ci: + name: Model CI + uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled.yaml@main + with: + job: run_models_gpu + slack_report_channel: "#transformers-ci-daily-amd" + runner: mi210 + docker: huggingface/transformers-pytorch-amd-gpu + ci_event: Scheduled CI (AMD) - mi210 + secrets: inherit + + torch-pipeline: + name: Torch pipeline CI + uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled.yaml@main + with: + job: run_pipelines_torch_gpu + slack_report_channel: "#transformers-ci-daily-amd" + runner: mi210 + docker: huggingface/transformers-pytorch-amd-gpu + ci_event: Scheduled CI (AMD) - mi210 + secrets: inherit + + example-ci: + name: Example CI + uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled.yaml@main + with: + job: run_examples_gpu + slack_report_channel: "#transformers-ci-daily-amd" + runner: mi210 + docker: huggingface/transformers-pytorch-amd-gpu + ci_event: Scheduled CI (AMD) - mi210 + secrets: inherit + + deepspeed-ci: + name: DeepSpeed CI + uses: huggingface/hf-workflows/.github/workflows/transformers_amd_ci_scheduled.yaml@main + with: + job: run_torch_cuda_extensions_gpu + slack_report_channel: "#transformers-ci-daily-amd" + runner: mi210 + docker: huggingface/transformers-pytorch-deepspeed-amd-gpu + ci_event: Scheduled CI (AMD) - mi210 + secrets: inherit diff --git a/docs/transformers/.github/workflows/self-scheduled.yml b/docs/transformers/.github/workflows/self-scheduled.yml new file mode 100644 index 0000000000000000000000000000000000000000..7fce6d60800944d836a3be26158af0d24e2124ac --- /dev/null +++ b/docs/transformers/.github/workflows/self-scheduled.yml @@ -0,0 +1,598 @@ +name: Self-hosted runner (scheduled) + +# Note that each job's dependencies go into a corresponding docker file. +# +# For example for `run_torch_cuda_extensions_gpu` the docker image is +# `huggingface/transformers-pytorch-deepspeed-latest-gpu`, which can be found at +# `docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile` + +on: + workflow_call: + inputs: + job: + required: true + type: string + slack_report_channel: + required: true + type: string + runner: + required: true + type: string + docker: + required: true + type: string + ci_event: + required: true + type: string + working-directory-prefix: + default: '' + required: false + type: string + +env: + HF_HOME: /mnt/cache + TRANSFORMERS_IS_CI: yes + OMP_NUM_THREADS: 8 + MKL_NUM_THREADS: 8 + RUN_SLOW: yes + # For gated repositories, we still need to agree to share information on the Hub repo. page in order to get access. + # This token is created under the bot `hf-transformers-bot`. + HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }} + SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }} + TF_FORCE_GPU_ALLOW_GROWTH: true + CUDA_VISIBLE_DEVICES: 0,1 + NUM_SLICES: 2 + +jobs: + setup: + if: contains(fromJSON('["run_models_gpu", "run_trainer_and_fsdp_gpu", "run_quantization_torch_gpu"]'), inputs.job) + name: Setup + strategy: + matrix: + machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache] + runs-on: + group: '${{ matrix.machine_type }}' + container: + image: huggingface/transformers-all-latest-gpu + options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + outputs: + folder_slices: ${{ steps.set-matrix.outputs.folder_slices }} + slice_ids: ${{ steps.set-matrix.outputs.slice_ids }} + quantization_matrix: ${{ steps.set-matrix-quantization.outputs.quantization_matrix }} + steps: + - name: Update clone + working-directory: /transformers + run: | + git fetch && git checkout ${{ github.sha }} + + - name: Cleanup + working-directory: /transformers + run: | + rm -rf tests/__pycache__ + rm -rf tests/models/__pycache__ + rm -rf reports + + - name: Show installed libraries and their versions + working-directory: /transformers + run: pip freeze + + - id: set-matrix + if: contains(fromJSON('["run_models_gpu", "run_trainer_and_fsdp_gpu"]'), inputs.job) + name: Identify models to test + working-directory: /transformers/tests + run: | + if [ "${{ inputs.job }}" = "run_models_gpu" ]; then + echo "folder_slices=$(python3 ../utils/split_model_tests.py --num_splits ${{ env.NUM_SLICES }})" >> $GITHUB_OUTPUT + echo "slice_ids=$(python3 -c 'd = list(range(${{ env.NUM_SLICES }})); print(d)')" >> $GITHUB_OUTPUT + elif [ "${{ inputs.job }}" = "run_trainer_and_fsdp_gpu" ]; then + echo "folder_slices=[['trainer'], ['fsdp']]" >> $GITHUB_OUTPUT + echo "slice_ids=[0, 1]" >> $GITHUB_OUTPUT + fi + + - id: set-matrix-quantization + if: ${{ inputs.job == 'run_quantization_torch_gpu' }} + name: Identify quantization method to test + working-directory: /transformers/tests + run: | + echo "quantization_matrix=$(python3 -c 'import os; tests = os.getcwd(); quantization_tests = os.listdir(os.path.join(tests, "quantization")); d = sorted(list(filter(os.path.isdir, [f"quantization/{x}" for x in quantization_tests]))) ; print(d)')" >> $GITHUB_OUTPUT + + - name: NVIDIA-SMI + run: | + nvidia-smi + + run_models_gpu: + if: ${{ inputs.job == 'run_models_gpu' }} + name: " " + needs: setup + strategy: + fail-fast: false + matrix: + machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache] + slice_id: ${{ fromJSON(needs.setup.outputs.slice_ids) }} + uses: ./.github/workflows/model_jobs.yml + with: + folder_slices: ${{ needs.setup.outputs.folder_slices }} + machine_type: ${{ matrix.machine_type }} + slice_id: ${{ matrix.slice_id }} + runner: ${{ inputs.runner }} + docker: ${{ inputs.docker }} + secrets: inherit + + run_trainer_and_fsdp_gpu: + if: ${{ inputs.job == 'run_trainer_and_fsdp_gpu' }} + name: " " + needs: setup + strategy: + fail-fast: false + matrix: + machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache] + slice_id: [0, 1] + uses: ./.github/workflows/model_jobs.yml + with: + folder_slices: ${{ needs.setup.outputs.folder_slices }} + machine_type: ${{ matrix.machine_type }} + slice_id: ${{ matrix.slice_id }} + runner: ${{ inputs.runner }} + docker: ${{ inputs.docker }} + report_name_prefix: run_trainer_and_fsdp_gpu + secrets: inherit + + run_pipelines_torch_gpu: + if: ${{ inputs.job == 'run_pipelines_torch_gpu' }} + name: PyTorch pipelines + strategy: + fail-fast: false + matrix: + machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache] + runs-on: + group: '${{ matrix.machine_type }}' + container: + image: huggingface/transformers-pytorch-gpu + options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + steps: + - name: Update clone + working-directory: /transformers + run: git fetch && git checkout ${{ github.sha }} + + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + + - name: NVIDIA-SMI + run: | + nvidia-smi + + - name: Environment + working-directory: /transformers + run: | + python3 utils/print_env.py + + - name: Show installed libraries and their versions + working-directory: /transformers + run: pip freeze + + - name: Set `machine_type` for report and artifact names + working-directory: /transformers + shell: bash + run: | + echo "${{ matrix.machine_type }}" + + if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then + machine_type=single-gpu + elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then + machine_type=multi-gpu + else + machine_type=${{ matrix.machine_type }} + fi + + echo "$machine_type" + echo "machine_type=$machine_type" >> $GITHUB_ENV + + - name: Run all pipeline tests on GPU + working-directory: /transformers + run: | + python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports tests/pipelines + + - name: Failure short reports + if: ${{ failure() }} + continue-on-error: true + run: cat /transformers/reports/${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports/failures_short.txt + + - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports" + if: ${{ always() }} + uses: actions/upload-artifact@v4 + with: + name: ${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports + path: /transformers/reports/${{ env.machine_type }}_run_pipelines_torch_gpu_test_reports + + run_pipelines_tf_gpu: + if: ${{ inputs.job == 'run_pipelines_tf_gpu' }} + name: TensorFlow pipelines + strategy: + fail-fast: false + matrix: + machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache] + runs-on: + group: '${{ matrix.machine_type }}' + container: + image: huggingface/transformers-tensorflow-gpu + options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + steps: + - name: Update clone + working-directory: /transformers + run: | + git fetch && git checkout ${{ github.sha }} + + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + + - name: NVIDIA-SMI + run: | + nvidia-smi + + - name: Environment + working-directory: /transformers + run: | + python3 utils/print_env.py + + - name: Show installed libraries and their versions + working-directory: /transformers + run: pip freeze + + - name: Set `machine_type` for report and artifact names + working-directory: /transformers + shell: bash + run: | + echo "${{ matrix.machine_type }}" + + if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then + machine_type=single-gpu + elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then + machine_type=multi-gpu + else + machine_type=${{ matrix.machine_type }} + fi + + echo "$machine_type" + echo "machine_type=$machine_type" >> $GITHUB_ENV + + - name: Run all pipeline tests on GPU + working-directory: /transformers + run: | + python3 -m pytest -n 1 -v --dist=loadfile --make-reports=${{ env.machine_type }}_run_pipelines_tf_gpu_test_reports tests/pipelines + + - name: Failure short reports + if: ${{ always() }} + run: | + cat /transformers/reports/${{ env.machine_type }}_run_pipelines_tf_gpu_test_reports/failures_short.txt + + - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_pipelines_tf_gpu_test_reports" + if: ${{ always() }} + uses: actions/upload-artifact@v4 + with: + name: ${{ env.machine_type }}_run_pipelines_tf_gpu_test_reports + path: /transformers/reports/${{ env.machine_type }}_run_pipelines_tf_gpu_test_reports + + run_examples_gpu: + if: ${{ inputs.job == 'run_examples_gpu' }} + name: Examples directory + strategy: + fail-fast: false + matrix: + machine_type: [aws-g4dn-2xlarge-cache] + runs-on: + group: '${{ matrix.machine_type }}' + container: + image: huggingface/transformers-all-latest-gpu + options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + steps: + - name: Update clone + working-directory: /transformers + run: git fetch && git checkout ${{ github.sha }} + + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + + - name: NVIDIA-SMI + run: | + nvidia-smi + + - name: Environment + working-directory: /transformers + run: | + python3 utils/print_env.py + + - name: Show installed libraries and their versions + working-directory: /transformers + run: pip freeze + + - name: Set `machine_type` for report and artifact names + working-directory: /transformers + shell: bash + run: | + echo "${{ matrix.machine_type }}" + + if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then + machine_type=single-gpu + elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then + machine_type=multi-gpu + else + machine_type=${{ matrix.machine_type }} + fi + + echo "$machine_type" + echo "machine_type=$machine_type" >> $GITHUB_ENV + + - name: Run examples tests on GPU + working-directory: /transformers + run: | + pip install -r examples/pytorch/_tests_requirements.txt + python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_examples_gpu_test_reports examples/pytorch + + - name: Failure short reports + if: ${{ failure() }} + continue-on-error: true + run: cat /transformers/reports/${{ env.machine_type }}_run_examples_gpu_test_reports/failures_short.txt + + - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_examples_gpu_test_reports" + if: ${{ always() }} + uses: actions/upload-artifact@v4 + with: + name: ${{ env.machine_type }}_run_examples_gpu_test_reports + path: /transformers/reports/${{ env.machine_type }}_run_examples_gpu_test_reports + + run_torch_cuda_extensions_gpu: + if: ${{ inputs.job == 'run_torch_cuda_extensions_gpu' }} + name: Torch CUDA extension tests + strategy: + fail-fast: false + matrix: + machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache] + runs-on: + group: '${{ matrix.machine_type }}' + container: + image: ${{ inputs.docker }} + options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + steps: + - name: Update clone + working-directory: ${{ inputs.working-directory-prefix }}/transformers + run: git fetch && git checkout ${{ github.sha }} + + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: ${{ inputs.working-directory-prefix }}/transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + + - name: Update / Install some packages (for Past CI) + if: ${{ contains(inputs.docker, '-past-') && contains(inputs.docker, '-pytorch-') }} + working-directory: ${{ inputs.working-directory-prefix }}/transformers + run: | + python3 -m pip install -U datasets + python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate + + - name: Remove cached torch extensions + run: rm -rf /github/home/.cache/torch_extensions/ + + # To avoid unknown test failures + - name: Pre build DeepSpeed *again* (for daily CI) + if: ${{ contains(inputs.ci_event, 'Daily CI') }} + working-directory: ${{ inputs.working-directory-prefix }}/ + run: | + python3 -m pip uninstall -y deepspeed + DS_DISABLE_NINJA=1 DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check + + # To avoid unknown test failures + - name: Pre build DeepSpeed *again* (for nightly & Past CI) + if: ${{ contains(inputs.ci_event, 'Nightly CI') || contains(inputs.ci_event, 'Past CI') }} + working-directory: ${{ inputs.working-directory-prefix }}/ + run: | + python3 -m pip uninstall -y deepspeed + rm -rf DeepSpeed + git clone https://github.com/deepspeedai/DeepSpeed && cd DeepSpeed && rm -rf build + DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check + + - name: NVIDIA-SMI + run: | + nvidia-smi + + - name: Environment + working-directory: ${{ inputs.working-directory-prefix }}/transformers + run: | + python3 utils/print_env.py + + - name: Show installed libraries and their versions + working-directory: ${{ inputs.working-directory-prefix }}/transformers + run: pip freeze + + - name: Set `machine_type` for report and artifact names + working-directory: ${{ inputs.working-directory-prefix }}/transformers + shell: bash + run: | + echo "${{ matrix.machine_type }}" + + if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then + machine_type=single-gpu + elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then + machine_type=multi-gpu + else + machine_type=${{ matrix.machine_type }} + fi + + echo "$machine_type" + echo "machine_type=$machine_type" >> $GITHUB_ENV + + - name: Run all tests on GPU + working-directory: ${{ inputs.working-directory-prefix }}/transformers + run: | + python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports tests/deepspeed tests/extended + + - name: Failure short reports + if: ${{ failure() }} + continue-on-error: true + run: cat ${{ inputs.working-directory-prefix }}/transformers/reports/${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports/failures_short.txt + + - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports" + if: ${{ always() }} + uses: actions/upload-artifact@v4 + with: + name: ${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports + path: ${{ inputs.working-directory-prefix }}/transformers/reports/${{ env.machine_type }}_run_torch_cuda_extensions_gpu_test_reports + + run_quantization_torch_gpu: + if: ${{ inputs.job == 'run_quantization_torch_gpu' }} + name: " " + needs: setup + strategy: + max-parallel: 4 + fail-fast: false + matrix: + folders: ${{ fromJson(needs.setup.outputs.quantization_matrix) }} + machine_type: [aws-g4dn-2xlarge-cache, aws-g4dn-12xlarge-cache] + runs-on: + group: '${{ matrix.machine_type }}' + container: + image: huggingface/transformers-quantization-latest-gpu + options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + steps: + - name: Echo folder ${{ matrix.folders }} + shell: bash + run: | + echo "${{ matrix.folders }}" + matrix_folders=${{ matrix.folders }} + matrix_folders=${matrix_folders/'quantization/'/'quantization_'} + echo "$matrix_folders" + echo "matrix_folders=$matrix_folders" >> $GITHUB_ENV + + - name: Update clone + working-directory: /transformers + run: git fetch && git checkout ${{ github.sha }} + + - name: Reinstall transformers in edit mode (remove the one installed during docker image build) + working-directory: /transformers + run: python3 -m pip uninstall -y transformers && python3 -m pip install -e . + + - name: NVIDIA-SMI + run: | + nvidia-smi + + - name: Environment + working-directory: /transformers + run: | + python3 utils/print_env.py + + - name: Show installed libraries and their versions + working-directory: /transformers + run: pip freeze + + - name: Set `machine_type` for report and artifact names + working-directory: /transformers + shell: bash + run: | + echo "${{ matrix.machine_type }}" + + if [ "${{ matrix.machine_type }}" = "aws-g4dn-2xlarge-cache" ]; then + machine_type=single-gpu + elif [ "${{ matrix.machine_type }}" = "aws-g4dn-12xlarge-cache" ]; then + machine_type=multi-gpu + else + machine_type=${{ matrix.machine_type }} + fi + + echo "$machine_type" + echo "machine_type=$machine_type" >> $GITHUB_ENV + + - name: Run quantization tests on GPU + working-directory: /transformers + run: | + python3 -m pytest -v --make-reports=${{ env.machine_type }}_run_quantization_torch_gpu_${{ matrix.folders }}_test_reports tests/${{ matrix.folders }} + + - name: Failure short reports + if: ${{ failure() }} + continue-on-error: true + run: cat /transformers/reports/${{ env.machine_type }}_run_quantization_torch_gpu_${{ matrix.folders }}_test_reports/failures_short.txt + + - name: "Test suite reports artifacts: ${{ env.machine_type }}_run_quantization_torch_gpu_${{ env.matrix_folders }}_test_reports" + if: ${{ always() }} + uses: actions/upload-artifact@v4 + with: + name: ${{ env.machine_type }}_run_quantization_torch_gpu_${{ env.matrix_folders }}_test_reports + path: /transformers/reports/${{ env.machine_type }}_run_quantization_torch_gpu_${{ matrix.folders }}_test_reports + + run_extract_warnings: + # Let's only do this for the job `run_models_gpu` to simplify the (already complex) logic. + if: ${{ always() && inputs.job == 'run_models_gpu' }} + name: Extract warnings in CI artifacts + runs-on: ubuntu-22.04 + needs: [setup, run_models_gpu] + steps: + - name: Checkout transformers + uses: actions/checkout@v4 + with: + fetch-depth: 2 + + - name: Install transformers + run: pip install transformers + + - name: Show installed libraries and their versions + run: pip freeze + + - name: Create output directory + run: mkdir warnings_in_ci + + - uses: actions/download-artifact@v4 + with: + path: warnings_in_ci + + - name: Show artifacts + run: echo "$(python3 -c 'import os; d = os.listdir(); print(d)')" + working-directory: warnings_in_ci + + - name: Extract warnings in CI artifacts + run: | + python3 utils/extract_warnings.py --workflow_run_id ${{ github.run_id }} --output_dir warnings_in_ci --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }} --from_gh + echo "$(python3 -c 'import os; import json; fp = open("warnings_in_ci/selected_warnings.json"); d = json.load(fp); d = "\n".join(d) ;print(d)')" + + - name: Upload artifact + if: ${{ always() }} + uses: actions/upload-artifact@v4 + with: + name: warnings_in_ci + path: warnings_in_ci/selected_warnings.json + + send_results: + name: Slack Report + needs: [ + setup, + run_models_gpu, + run_trainer_and_fsdp_gpu, + run_pipelines_torch_gpu, + run_pipelines_tf_gpu, + run_examples_gpu, + run_torch_cuda_extensions_gpu, + run_quantization_torch_gpu, + run_extract_warnings + ] + if: ${{ always() }} + uses: ./.github/workflows/slack-report.yml + with: + job: ${{ inputs.job }} + # This would be `skipped` if `setup` is skipped. + setup_status: ${{ needs.setup.result }} + slack_report_channel: ${{ inputs.slack_report_channel }} + # This would be an empty string if `setup` is skipped. + folder_slices: ${{ needs.setup.outputs.folder_slices }} + quantization_matrix: ${{ needs.setup.outputs.quantization_matrix }} + ci_event: ${{ inputs.ci_event }} + + secrets: inherit + + check_new_model_failures: + if: ${{ always() && inputs.ci_event == 'Daily CI' && inputs.job == 'run_models_gpu' && needs.send_results.result == 'success' }} + name: Check new model failures + needs: send_results + uses: ./.github/workflows/check_failed_model_tests.yml + with: + docker: ${{ inputs.docker }} + start_sha: ${{ github.sha }} + secrets: inherit diff --git a/docs/transformers/.github/workflows/ssh-runner.yml b/docs/transformers/.github/workflows/ssh-runner.yml new file mode 100644 index 0000000000000000000000000000000000000000..e648883f191e0da240c5e72fc09496564b0a2891 --- /dev/null +++ b/docs/transformers/.github/workflows/ssh-runner.yml @@ -0,0 +1,113 @@ +name: SSH into our runners + +on: + workflow_dispatch: + inputs: + runner_type: + description: 'Type of runner to test (a10 or t4)' + required: true + docker_image: + description: 'Name of the Docker image' + required: true + num_gpus: + description: 'Type of the number of gpus to use (`single` or `multi`)' + required: true + +env: + HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }} + HF_HOME: /mnt/cache + TRANSFORMERS_IS_CI: yes + OMP_NUM_THREADS: 8 + MKL_NUM_THREADS: 8 + RUN_SLOW: yes # For gated repositories, we still need to agree to share information on the Hub repo. page in order to get access. # This token is created under the bot `hf-transformers-bot`. + SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }} + TF_FORCE_GPU_ALLOW_GROWTH: true + CUDA_VISIBLE_DEVICES: 0,1 + +jobs: + get_runner: + name: "Get runner to use" + runs-on: ubuntu-22.04 + outputs: + RUNNER: ${{ steps.set_runner.outputs.RUNNER }} + steps: + - name: Get runner to use + shell: bash + run: | + if [[ "${{ github.event.inputs.num_gpus }}" == "single" && "${{ github.event.inputs.runner_type }}" == "t4" ]]; then + echo "RUNNER=aws-g4dn-2xlarge-cache" >> $GITHUB_ENV + elif [[ "${{ github.event.inputs.num_gpus }}" == "multi" && "${{ github.event.inputs.runner_type }}" == "t4" ]]; then + echo "RUNNER=aws-g4dn-12xlarge-cache" >> $GITHUB_ENV + elif [[ "${{ github.event.inputs.num_gpus }}" == "single" && "${{ github.event.inputs.runner_type }}" == "a10" ]]; then + echo "RUNNER=aws-g5-4xlarge-cache" >> $GITHUB_ENV + elif [[ "${{ github.event.inputs.num_gpus }}" == "multi" && "${{ github.event.inputs.runner_type }}" == "a10" ]]; then + echo "RUNNER=aws-g5-12xlarge-cache" >> $GITHUB_ENV + else + echo "RUNNER=" >> $GITHUB_ENV + fi + + - name: Set runner to use + id: set_runner + run: | + echo ${{ env.RUNNER }} + echo "RUNNER=${{ env.RUNNER }}" >> $GITHUB_OUTPUT + + ssh_runner: + name: "SSH" + needs: get_runner + runs-on: + group: ${{ needs.get_runner.outputs.RUNNER }} + container: + image: ${{ github.event.inputs.docker_image }} + options: --gpus all --privileged --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ + + steps: + - name: Update clone + working-directory: /transformers + run: | + git fetch && git checkout ${{ github.sha }} + + - name: Cleanup + working-directory: /transformers + run: | + rm -rf tests/__pycache__ + rm -rf tests/models/__pycache__ + rm -rf reports + + - name: Show installed libraries and their versions + working-directory: /transformers + run: pip freeze + + - name: NVIDIA-SMI + run: | + nvidia-smi + + - name: Store Slack infos + #because the SSH can be enabled dynamically if the workflow failed, so we need to store slack infos to be able to retrieve them during the waitforssh step + shell: bash + run: | + echo "${{ github.actor }}" + github_actor=${{ github.actor }} + github_actor=${github_actor/'-'/'_'} + echo "$github_actor" + echo "github_actor=$github_actor" >> $GITHUB_ENV + + - name: Store Slack infos + #because the SSH can be enabled dynamically if the workflow failed, so we need to store slack infos to be able to retrieve them during the waitforssh step + shell: bash + run: | + echo "${{ env.github_actor }}" + if [ "${{ secrets[format('{0}_{1}', env.github_actor, 'SLACK_ID')] }}" != "" ]; then + echo "SLACKCHANNEL=${{ secrets[format('{0}_{1}', env.github_actor, 'SLACK_ID')] }}" >> $GITHUB_ENV + else + echo "SLACKCHANNEL=${{ secrets.SLACK_CIFEEDBACK_CHANNEL }}" >> $GITHUB_ENV + fi + + - name: Tailscale # In order to be able to SSH when a test fails + uses: huggingface/tailscale-action@main + with: + authkey: ${{ secrets.TAILSCALE_SSH_AUTHKEY }} + slackChannel: ${{ env.SLACKCHANNEL }} + slackToken: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }} + waitForSSH: true + sshTimeout: 15m diff --git a/docs/transformers/.github/workflows/stale.yml b/docs/transformers/.github/workflows/stale.yml new file mode 100644 index 0000000000000000000000000000000000000000..65eaf755ab3a69f9d978f8ab6f7e14584918262c --- /dev/null +++ b/docs/transformers/.github/workflows/stale.yml @@ -0,0 +1,29 @@ +name: Stale Bot + +on: + schedule: + - cron: "0 8 * * *" + +jobs: + close_stale_issues: + name: Close Stale Issues + if: github.repository == 'huggingface/transformers' + runs-on: ubuntu-22.04 + permissions: + issues: write + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + steps: + - uses: actions/checkout@v4 + + - name: Setup Python + uses: actions/setup-python@v5 + with: + python-version: 3.8 + + - name: Install requirements + run: | + pip install PyGithub + - name: Close stale issues + run: | + python scripts/stale.py diff --git a/docs/transformers/.github/workflows/trufflehog.yml b/docs/transformers/.github/workflows/trufflehog.yml new file mode 100644 index 0000000000000000000000000000000000000000..7f6646e114d82c20dfba79f0d625a02e1ee3522d --- /dev/null +++ b/docs/transformers/.github/workflows/trufflehog.yml @@ -0,0 +1,20 @@ +on: + push: + +name: Secret Leaks + +permissions: + contents: read + +jobs: + trufflehog: + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v4 + with: + fetch-depth: 0 + - name: Secret Scanning + uses: trufflesecurity/trufflehog@main + with: + extra_args: --results=verified,unknown diff --git a/docs/transformers/.github/workflows/update_metdata.yml b/docs/transformers/.github/workflows/update_metdata.yml new file mode 100644 index 0000000000000000000000000000000000000000..d55b6e336c096709620cecfdc92f19c185038979 --- /dev/null +++ b/docs/transformers/.github/workflows/update_metdata.yml @@ -0,0 +1,27 @@ +name: Update Transformers metadata + +on: + push: + branches: + - main + - update_transformers_metadata* + +jobs: + build_and_package: + runs-on: ubuntu-22.04 + defaults: + run: + shell: bash -l {0} + + steps: + - uses: actions/checkout@v4 + + - name: Setup environment + run: | + pip install --upgrade pip + pip install datasets pandas + pip install .[torch,tf,flax] + + - name: Update metadata + run: | + python utils/update_metadata.py --token ${{ secrets.LYSANDRE_HF_TOKEN }} --commit_sha ${{ github.sha }} diff --git a/docs/transformers/.github/workflows/upload_pr_documentation.yml b/docs/transformers/.github/workflows/upload_pr_documentation.yml new file mode 100644 index 0000000000000000000000000000000000000000..64befc595c421e1167765404e551b080211cb192 --- /dev/null +++ b/docs/transformers/.github/workflows/upload_pr_documentation.yml @@ -0,0 +1,16 @@ +name: Upload PR Documentation + +on: + workflow_run: + workflows: ["Build PR Documentation"] + types: + - completed + +jobs: + build: + uses: huggingface/doc-builder/.github/workflows/upload_pr_documentation.yml@main + with: + package_name: transformers + secrets: + hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }} + comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }} \ No newline at end of file diff --git a/docs/transformers/README.md b/docs/transformers/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e3a42595ddeb2c6534a20369c6d319b0b953331f --- /dev/null +++ b/docs/transformers/README.md @@ -0,0 +1,322 @@ + + +

+ + + + Hugging Face Transformers Library + +
+
+

+ +

+ Checkpoints on Hub + Build + GitHub + Documentation + GitHub release + Contributor Covenant + DOI +

+ +

+

+ English | + 简体中文 | + 繁體中文 | + 한국어 | + Español | + 日本語 | + हिन्दी | + Русский | + Рortuguês | + తెలుగు | + Français | + Deutsch | + Tiếng Việt | + العربية | + اردو | +

+

+ +

+

State-of-the-art pretrained models for inference and training

+

+ +

+ +

+ +Transformers is a library of pretrained text, computer vision, audio, video, and multimodal models for inference and training. Use Transformers to fine-tune models on your data, build inference applications, and for generative AI use cases across multiple modalities. + +There are over 500K+ Transformers [model checkpoints](https://huggingface.co/models?library=transformers&sort=trending) on the [Hugging Face Hub](https://huggingface.com/models) you can use. + +Explore the [Hub](https://huggingface.com/) today to find a model and use Transformers to help you get started right away. + +## Installation + +Transformers works with Python 3.9+ [PyTorch](https://pytorch.org/get-started/locally/) 2.1+, [TensorFlow](https://www.tensorflow.org/install/pip) 2.6+, and [Flax](https://flax.readthedocs.io/en/latest/) 0.4.1+. + +Create and activate a virtual environment with [venv](https://docs.python.org/3/library/venv.html) or [uv](https://docs.astral.sh/uv/), a fast Rust-based Python package and project manager. + +```py +# venv +python -m venv .my-env +source .my-env/bin/activate + +# uv +uv venv .my-env +source .my-env/bin/activate +``` + +Install Transformers in your virtual environment. + +```py +# pip +pip install transformers + +# uv +uv pip install transformers +``` + +Install Transformers from source if you want the latest changes in the library or are interested in contributing. However, the *latest* version may not be stable. Feel free to open an [issue](https://github.com/huggingface/transformers/issues) if you encounter an error. + +```shell +git clone https://github.com/huggingface/transformers.git +cd transformers +pip install . +``` + +## Quickstart + +Get started with Transformers right away with the [Pipeline](https://huggingface.co/docs/transformers/pipeline_tutorial) API. The `Pipeline` is a high-level inference class that supports text, audio, vision, and multimodal tasks. It handles preprocessing the input and returns the appropriate output. + +Instantiate a pipeline and specify model to use for text generation. The model is downloaded and cached so you can easily reuse it again. Finally, pass some text to prompt the model. + +```py +from transformers import pipeline + +pipeline = pipeline(task="text-generation", model="Qwen/Qwen2.5-1.5B") +pipeline("the secret to baking a really good cake is ") +[{'generated_text': 'the secret to baking a really good cake is 1) to use the right ingredients and 2) to follow the recipe exactly. the recipe for the cake is as follows: 1 cup of sugar, 1 cup of flour, 1 cup of milk, 1 cup of butter, 1 cup of eggs, 1 cup of chocolate chips. if you want to make 2 cakes, how much sugar do you need? To make 2 cakes, you will need 2 cups of sugar.'}] +``` + +To chat with a model, the usage pattern is the same. The only difference is you need to construct a chat history (the input to `Pipeline`) between you and the system. + +> [!TIP] +> You can also chat with a model directly from the command line. +> ```shell +> transformers-cli chat --model_name_or_path Qwen/Qwen2.5-0.5B-Instruct +> ``` + +```py +import torch +from transformers import pipeline + +chat = [ + {"role": "system", "content": "You are a sassy, wise-cracking robot as imagined by Hollywood circa 1986."}, + {"role": "user", "content": "Hey, can you tell me any fun things to do in New York?"} +] + +pipeline = pipeline(task="text-generation", model="meta-llama/Meta-Llama-3-8B-Instruct", torch_dtype=torch.bfloat16, device_map="auto") +response = pipeline(chat, max_new_tokens=512) +print(response[0]["generated_text"][-1]["content"]) +``` + +Expand the examples below to see how `Pipeline` works for different modalities and tasks. + +
+Automatic speech recognition + +```py +from transformers import pipeline + +pipeline = pipeline(task="automatic-speech-recognition", model="openai/whisper-large-v3") +pipeline("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac") +{'text': ' I have a dream that one day this nation will rise up and live out the true meaning of its creed.'} +``` + +
+ +
+Image classification + +

+ +

+ +```py +from transformers import pipeline + +pipeline = pipeline(task="image-classification", model="facebook/dinov2-small-imagenet1k-1-layer") +pipeline("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png") +[{'label': 'macaw', 'score': 0.997848391532898}, + {'label': 'sulphur-crested cockatoo, Kakatoe galerita, Cacatua galerita', + 'score': 0.0016551691805943847}, + {'label': 'lorikeet', 'score': 0.00018523589824326336}, + {'label': 'African grey, African gray, Psittacus erithacus', + 'score': 7.85409429227002e-05}, + {'label': 'quail', 'score': 5.502637941390276e-05}] +``` + +
+ +
+Visual question answering + + +

+ +

+ +```py +from transformers import pipeline + +pipeline = pipeline(task="visual-question-answering", model="Salesforce/blip-vqa-base") +pipeline( + image="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-few-shot.jpg", + question="What is in the image?", +) +[{'answer': 'statue of liberty'}] +``` + +
+ +## Why should I use Transformers? + +1. Easy-to-use state-of-the-art models: + - High performance on natural language understanding & generation, computer vision, audio, video, and multimodal tasks. + - Low barrier to entry for researchers, engineers, and developers. + - Few user-facing abstractions with just three classes to learn. + - A unified API for using all our pretrained models. + +1. Lower compute costs, smaller carbon footprint: + - Share trained models instead of training from scratch. + - Reduce compute time and production costs. + - Dozens of model architectures with 1M+ pretrained checkpoints across all modalities. + +1. Choose the right framework for every part of a models lifetime: + - Train state-of-the-art models in 3 lines of code. + - Move a single model between PyTorch/JAX/TF2.0 frameworks at will. + - Pick the right framework for training, evaluation, and production. + +1. Easily customize a model or an example to your needs: + - We provide examples for each architecture to reproduce the results published by its original authors. + - Model internals are exposed as consistently as possible. + - Model files can be used independently of the library for quick experiments. + + + Hugging Face Enterprise Hub +
+ +## Why shouldn't I use Transformers? + +- This library is not a modular toolbox of building blocks for neural nets. The code in the model files is not refactored with additional abstractions on purpose, so that researchers can quickly iterate on each of the models without diving into additional abstractions/files. +- The training API is optimized to work with PyTorch models provided by Transformers. For generic machine learning loops, you should use another library like [Accelerate](https://huggingface.co/docs/accelerate). +- The [example scripts]((https://github.com/huggingface/transformers/tree/main/examples)) are only *examples*. They may not necessarily work out-of-the-box on your specific use case and you'll need to adapt the code for it to work. + +## 100 projects using Transformers + +Transformers is more than a toolkit to use pretrained models, it's a community of projects built around it and the +Hugging Face Hub. We want Transformers to enable developers, researchers, students, professors, engineers, and anyone +else to build their dream projects. + +In order to celebrate Transformers 100,000 stars, we wanted to put the spotlight on the +community with the [awesome-transformers](./awesome-transformers.md) page which lists 100 +incredible projects built with Transformers. + +If you own or use a project that you believe should be part of the list, please open a PR to add it! + +## Example models + +You can test most of our models directly on their [Hub model pages](https://huggingface.co/models). + +Expand each modality below to see a few example models for various use cases. + +
+Audio + +- Audio classification with [Whisper](https://huggingface.co/openai/whisper-large-v3-turbo) +- Automatic speech recognition with [Moonshine](https://huggingface.co/UsefulSensors/moonshine) +- Keyword spotting with [Wav2Vec2](https://huggingface.co/superb/wav2vec2-base-superb-ks) +- Speech to speech generation with [Moshi](https://huggingface.co/kyutai/moshiko-pytorch-bf16) +- Text to audio with [MusicGen](https://huggingface.co/facebook/musicgen-large) +- Text to speech with [Bark](https://huggingface.co/suno/bark) + +
+ +
+Computer vision + +- Automatic mask generation with [SAM](https://huggingface.co/facebook/sam-vit-base) +- Depth estimation with [DepthPro](https://huggingface.co/apple/DepthPro-hf) +- Image classification with [DINO v2](https://huggingface.co/facebook/dinov2-base) +- Keypoint detection with [SuperGlue](https://huggingface.co/magic-leap-community/superglue_outdoor) +- Keypoint matching with [SuperGlue](https://huggingface.co/magic-leap-community/superglue) +- Object detection with [RT-DETRv2](https://huggingface.co/PekingU/rtdetr_v2_r50vd) +- Pose Estimation with [VitPose](https://huggingface.co/usyd-community/vitpose-base-simple) +- Universal segmentation with [OneFormer](https://huggingface.co/shi-labs/oneformer_ade20k_swin_large) +- Video classification with [VideoMAE](https://huggingface.co/MCG-NJU/videomae-large) + +
+ +
+Multimodal + +- Audio or text to text with [Qwen2-Audio](https://huggingface.co/Qwen/Qwen2-Audio-7B) +- Document question answering with [LayoutLMv3](https://huggingface.co/microsoft/layoutlmv3-base) +- Image or text to text with [Qwen-VL](https://huggingface.co/Qwen/Qwen2.5-VL-3B-Instruct) +- Image captioning [BLIP-2](https://huggingface.co/Salesforce/blip2-opt-2.7b) +- OCR-based document understanding with [GOT-OCR2](https://huggingface.co/stepfun-ai/GOT-OCR-2.0-hf) +- Table question answering with [TAPAS](https://huggingface.co/google/tapas-base) +- Unified multimodal understanding and generation with [Emu3](https://huggingface.co/BAAI/Emu3-Gen) +- Vision to text with [Llava-OneVision](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf) +- Visual question answering with [Llava](https://huggingface.co/llava-hf/llava-1.5-7b-hf) +- Visual referring expression segmentation with [Kosmos-2](https://huggingface.co/microsoft/kosmos-2-patch14-224) + +
+ +
+NLP + +- Masked word completion with [ModernBERT](https://huggingface.co/answerdotai/ModernBERT-base) +- Named entity recognition with [Gemma](https://huggingface.co/google/gemma-2-2b) +- Question answering with [Mixtral](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1) +- Summarization with [BART](https://huggingface.co/facebook/bart-large-cnn) +- Translation with [T5](https://huggingface.co/google-t5/t5-base) +- Text generation with [Llama](https://huggingface.co/meta-llama/Llama-3.2-1B) +- Text classification with [Qwen](https://huggingface.co/Qwen/Qwen2.5-0.5B) + +
+ +## Citation + +We now have a [paper](https://www.aclweb.org/anthology/2020.emnlp-demos.6/) you can cite for the 🤗 Transformers library: +```bibtex +@inproceedings{wolf-etal-2020-transformers, + title = "Transformers: State-of-the-Art Natural Language Processing", + author = "Thomas Wolf and Lysandre Debut and Victor Sanh and Julien Chaumond and Clement Delangue and Anthony Moi and Pierric Cistac and Tim Rault and Rémi Louf and Morgan Funtowicz and Joe Davison and Sam Shleifer and Patrick von Platen and Clara Ma and Yacine Jernite and Julien Plu and Canwen Xu and Teven Le Scao and Sylvain Gugger and Mariama Drame and Quentin Lhoest and Alexander M. Rush", + booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations", + month = oct, + year = "2020", + address = "Online", + publisher = "Association for Computational Linguistics", + url = "https://www.aclweb.org/anthology/2020.emnlp-demos.6", + pages = "38--45" +} +``` diff --git a/docs/transformers/benchmark/README.md b/docs/transformers/benchmark/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3935f02b389d841367d16f11bbf7c6775fb09a14 --- /dev/null +++ b/docs/transformers/benchmark/README.md @@ -0,0 +1,49 @@ +# Benchmarks + +You might want to add new benchmarks. + +You will need to define a python function named `run_benchmark` in your python file and the file must be located in this `benchmark/` directory. + +The expected function signature is the following: + +```py +def run_benchmark(logger: Logger, branch: str, commit_id: str, commit_msg: str, num_tokens_to_generate=100): +``` + +## Writing metrics to the database + +`MetricsRecorder` is thread-safe, in the sense of the python [`Thread`](https://docs.python.org/3/library/threading.html#threading.Thread). This means you can start a background thread to do the readings on the device measurements while not blocking the main thread to execute the model measurements. + +cf [`llama.py`](./llama.py) to see an example of this in practice. + +```py +from benchmarks_entrypoint import MetricsRecorder +import psycopg2 + +def run_benchmark(logger: Logger, branch: str, commit_id: str, commit_msg: str, num_tokens_to_generate=100): + metrics_recorder = MetricsRecorder(psycopg2.connect("dbname=metrics"), logger, branch, commit_id, commit_msg) + benchmark_id = metrics_recorder.initialise_benchmark({"gpu_name": gpu_name, "model_id": model_id}) + # To collect device measurements + metrics_recorder.collect_device_measurements( + benchmark_id, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes + ) + # To collect your model measurements + metrics_recorder.collect_model_measurements( + benchmark_id, + { + "model_load_time": model_load_time, + "first_eager_forward_pass_time_secs": first_eager_fwd_pass_time, + "second_eager_forward_pass_time_secs": second_eager_fwd_pass_time, + "first_eager_generate_time_secs": first_eager_generate_time, + "second_eager_generate_time_secs": second_eager_generate_time, + "time_to_first_token_secs": time_to_first_token, + "time_to_second_token_secs": time_to_second_token, + "time_to_third_token_secs": time_to_third_token, + "time_to_next_token_mean_secs": mean_time_to_next_token, + "first_compile_generate_time_secs": first_compile_generate_time, + "second_compile_generate_time_secs": second_compile_generate_time, + "third_compile_generate_time_secs": third_compile_generate_time, + "fourth_compile_generate_time_secs": fourth_compile_generate_time, + }, + ) +``` diff --git a/docs/transformers/benchmark/__init__.py b/docs/transformers/benchmark/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/benchmark/benchmark.py b/docs/transformers/benchmark/benchmark.py new file mode 100644 index 0000000000000000000000000000000000000000..304bbd4441cf660d3e4eb60e86b1ef36ccf979ec --- /dev/null +++ b/docs/transformers/benchmark/benchmark.py @@ -0,0 +1,326 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +Run benchmark using the `optimum-benchmark` library with some customization in `transformers`. + +Assume we are under `transformers` root directory: (make sure the commits are valid commits) +```bash +python benchmark/benchmark.py --config-dir benchmark/config --config-name generation --commit=9b9c7f03da625b13643e99205c691fe046461724 --metrics=decode.latency.mean,per_token.latency.mean,per_token.throughput.value backend.model=google/gemma-2b benchmark.input_shapes.sequence_length=5,7 benchmark.input_shapes.batch_size=1,2 --multirun +``` +""" + +import argparse +import glob +import json +import os.path +import re +import tempfile +from contextlib import contextmanager +from pathlib import Path + +from git import Repo + +from huggingface_hub import HfApi + +from optimum_benchmark import Benchmark +from optimum_benchmark_wrapper import main + + +PATH_TO_REPO = Path(__file__).parent.parent.resolve() + + +@contextmanager +def checkout_commit(repo: Repo, commit_id: str): + """ + Context manager that checks out a given commit when entered, but gets back to the reference it was at on exit. + Args: + repo (`git.Repo`): A git repository (for instance the Transformers repo). + commit_id (`str`): The commit reference to checkout inside the context manager. + """ + current_head = repo.head.commit if repo.head.is_detached else repo.head.ref + + try: + repo.git.checkout(commit_id) + yield + + finally: + repo.git.checkout(current_head) + + +def summarize(run_dir, metrics, expand_metrics=False): + """Produce a summary for each optimum-benchmark launched job's output directory found in `run_dir`. + + Each summary's format is as follows (for `expand_metrics=False`): + ``` + { + "model": "google/gemma-2b", + "commit": "3cd6ed22e4d49219f300f5055e71e3929aba20d7", + "config": "benchmark.input_shapes.batch_size=1,benchmark.input_shapes.sequence_length=5", + "metrics": { + "decode.latency.mean": 1.624666809082031, + "per_token.latency.mean": 0.012843788806628804, + "per_token.throughput.value": 77.85864553330948 + } + } + ``` + """ + reports = glob.glob(os.path.join(run_dir, "**/benchmark_report.json"), recursive=True) + report_dirs = [str(Path(report).parent) for report in reports] + + summaries = [] + for report_dir in report_dirs: + commit = re.search(r"/commit=([^/]+)", report_dir).groups()[0] + + if not os.path.isfile(os.path.join(report_dir, "benchmark.json")): + continue + benchmark = Benchmark.from_json(os.path.join(report_dir, "benchmark.json")) + report = benchmark.report + + model = benchmark.config.backend["model"] + + # Ths looks like `benchmark.input_shapes.batch_size=1,benchmark.input_shapes.sequence_length=5`. + # (we rely on the usage of hydra's `${hydra.job.override_dirname}`.) + benchmark_name = re.sub(f"backend.model={model},*", "", report_dir) + benchmark_name = str(Path(benchmark_name).parts[-1]) + if benchmark_name.startswith("commit="): + benchmark_name = benchmark.config.name + + metrics_values = {} + # post-processing of report: show a few selected/important metric + for metric in metrics: + keys = metric.split(".") + value = report.to_dict() + current = metrics_values + for key in keys: + # Avoid KeyError when a user's specified metric has typo. + # TODO: Give warnings. + if key not in value: + continue + value = value[key] + + if expand_metrics: + if isinstance(value, dict): + if key not in current: + current[key] = {} + current = current[key] + else: + current[key] = value + + if not expand_metrics: + metrics_values[metric] = value + + # show some config information + print(f"model: {model}") + print(f"commit: {commit}") + print(f"config: {benchmark_name}") + if len(metrics_values) > 0: + print("metrics:") + if expand_metrics: + print(metrics_values) + else: + for metric, value in metrics_values.items(): + print(f" - {metric}: {value}") + print("-" * 80) + + summary = { + "model": model, + "commit": commit, + "config": benchmark_name, + "metrics": metrics_values, + } + summaries.append(summary) + + with open(os.path.join(report_dir, "summary.json"), "w") as fp: + json.dump(summary, fp, indent=4) + + return summaries + + +def combine_summaries(summaries): + """Combine a list of summary obtained from the function `summarize`. + + The combined summary's format is as follows: + ``` + "google/gemma-2b": { + "benchmark.input_shapes.batch_size=1,benchmark.input_shapes.sequence_length=5": { + "3cd6ed22e4d49219f300f5055e71e3929aba20d7": { + "metrics": {"decode.latency.mean": 1.624666809082031} + }, + "c97ee28b117c0abe8e08891f402065e4df6d72aa": { + "metrics": {"decode.latency.mean": 1.6278163452148438} + } + }, + "benchmark.input_shapes.batch_size=2,benchmark.input_shapes.sequence_length=5": { + "3cd6ed22e4d49219f300f5055e71e3929aba20d7": { + "metrics": {"decode.latency.mean": 1.6947791748046876} + }, + "c97ee28b117c0abe8e08891f402065e4df6d72aa": { + "metrics": { + "decode.latency.mean": 1.6980519409179688} + } + } + } + ``` + """ + combined = {} + for summary in summaries: + model = summary["model"] + config = summary["config"] + commit = summary["commit"] + + if model not in combined: + combined[model] = {} + + if config not in combined[model]: + combined[model][config] = {} + + if commit not in combined[model][config]: + combined[model][config][commit] = {"metrics": summary["metrics"]} + + with open(os.path.join(exp_run_dir, "summary.json"), "w") as fp: + json.dump(combined, fp, indent=4) + + print(json.dumps(combined, indent=4)) + + return combined + + +if __name__ == "__main__": + + def list_str(values): + return values.split(",") + + parser = argparse.ArgumentParser() + + parser.add_argument("--config-dir", type=str, required=True, help="The path to the config directory.") + parser.add_argument("--config-name", type=str, required=True, help="The config name.") + + # arguments specific to this wrapper for our own customization + parser.add_argument("--ensure_empty", type=bool, default=True, help="If to create a temporary directory.") + parser.add_argument( + "--commit", + type=list_str, + default="", + help="Comma-separated list of branch names and/or commit sha values on which the benchmark will run. If `diff` is specified, it will run on both the current head and the `main` branch.", + ) + parser.add_argument("--metrics", type=str, help="The metrics to be included in the summary.") + + parser.add_argument("--repo_id", type=str, default=None, help="The repository to which the file will be uploaded.") + parser.add_argument("--path_in_repo", type=str, default=None, help="Relative filepath in the repo.") + parser.add_argument("--token", type=str, default=None, help="A valid user access token (string).") + + args, optimum_benchmark_args = parser.parse_known_args() + + repo = Repo(PATH_TO_REPO) + + metrics = [ + "prefill.latency.mean", + "prefill.throughput.value", + "decode.latency.mean", + "decode.throughput.value", + "per_token.latency.mean", + "per_token.throughput.value", + ] + if args.metrics is not None: + metrics = args.metrics.split(",") + + # Get `backend.model` in a hacky way: We want to control the experiment flow manually. + models = [""] + for idx, arg in enumerate(optimum_benchmark_args): + if arg.startswith("backend.model="): + models = arg[len("backend.model=") :] + models = models.split(",") + break + optimum_benchmark_args = [arg for arg in optimum_benchmark_args if not arg.startswith("backend.model=")] + + # Get the commit(s) + current_head = str(repo.head.commit) if repo.head.is_detached else str(repo.head.ref) + commits = [x for x in args.commit if x != ""] + if len(commits) == 0: + commits = [current_head] + elif len(commits) == 1 and commits[0] == "diff": + # compare to `main` + commits = ["main", current_head] + + # Get the specified run directory + run_dir_arg_idx, run_dir = -1, None + sweep_dir_arg_idx, sweep_dir = -1, None + for idx, arg in enumerate(optimum_benchmark_args): + if arg.startswith("hydra.run.dir="): + run_dir = arg[len("hydra.run.dir=") :] + run_dir_arg_idx = idx + elif arg.startswith("hydra.sweep.dir="): + sweep_dir = arg[len("hydra.sweep.dir=") :] + sweep_dir_arg_idx = idx + exp_run_dir, arg_dix, arg_name = ( + (sweep_dir, sweep_dir_arg_idx, "hydra.sweep.dir") + if "--multirun" in optimum_benchmark_args + else (run_dir, run_dir_arg_idx, "hydra.run.dir") + ) + + # TODO: not hardcoded + if exp_run_dir is None and args.ensure_empty: + exp_run_dir = "_benchmark" + + if args.ensure_empty: + os.makedirs(exp_run_dir, exist_ok=True) + exp_run_dir = tempfile.mkdtemp(dir=exp_run_dir) + + run_summaries = [] + for commit in commits: + with checkout_commit(repo, commit): + commit = str(repo.head.commit) + + commit_run_dir = exp_run_dir + if exp_run_dir is not None: + commit_run_dir = os.path.join(exp_run_dir, rf"commit\={commit}") + + print(f"Run benchmark on commit: {commit}") + + for model in models: + model_arg = [f"backend.model={model}"] if model != "" else [] + dir_args = [] + if commit_run_dir is not None: + if arg_dix > -1: + optimum_benchmark_args[arg_dix] = f"{arg_name}={commit_run_dir}" + else: + dir_args = [ + f"hydra.sweep.dir={commit_run_dir}", + f"hydra.run.dir={commit_run_dir}/" + "${hydra.job.override_dirname}", + ] + main(args.config_dir, args.config_name, model_arg + dir_args + optimum_benchmark_args) + + if commit_run_dir is not None: + # Need to remove the `\` character + summaries = summarize(commit_run_dir.replace("\\", ""), metrics) + run_summaries.extend(summaries) + + # aggregate the information across the commits + if exp_run_dir is not None: + with open(os.path.join(exp_run_dir, "summaries.json"), "w") as fp: + json.dump(run_summaries, fp, indent=4) + + combined_summary = combine_summaries(run_summaries) + + if args.repo_id is not None and args.path_in_repo is not None: + # Upload to Hub + api = HfApi() + api.upload_folder( + folder_path=exp_run_dir, + path_in_repo=args.path_in_repo, + repo_id=args.repo_id, + repo_type="dataset", + token=args.token, + ) diff --git a/docs/transformers/benchmark/benchmarks_entrypoint.py b/docs/transformers/benchmark/benchmarks_entrypoint.py new file mode 100644 index 0000000000000000000000000000000000000000..6c036fdd6939ae25a299ed2eac777ec58a838ecf --- /dev/null +++ b/docs/transformers/benchmark/benchmarks_entrypoint.py @@ -0,0 +1,143 @@ +import argparse +import importlib.util +import logging +import os +from typing import Dict +import sys + +from psycopg2.extras import Json +from psycopg2.extensions import register_adapter + + +register_adapter(dict, Json) + + +class ImportModuleException(Exception): + pass + + +class MetricsRecorder: + def __init__(self, connection, logger: logging.Logger, branch: str, commit_id: str, commit_msg: str): + self.conn = connection + self.conn.autocommit = True + self.logger = logger + self.branch = branch + self.commit_id = commit_id + self.commit_msg = commit_msg + + def initialise_benchmark(self, metadata: Dict[str, str]) -> int: + """ + Creates a new benchmark, returns the benchmark id + """ + # gpu_name: str, model_id: str + with self.conn.cursor() as cur: + cur.execute( + "INSERT INTO benchmarks (branch, commit_id, commit_message, metadata) VALUES (%s, %s, %s, %s) RETURNING benchmark_id", + (self.branch, self.commit_id, self.commit_msg, metadata), + ) + benchmark_id = cur.fetchone()[0] + logger.debug(f"initialised benchmark #{benchmark_id}") + return benchmark_id + + def collect_device_measurements(self, benchmark_id: int, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes): + """ + Collect device metrics, such as CPU & GPU usage. These are "static", as in you cannot pass arbitrary arguments to the function. + """ + with self.conn.cursor() as cur: + cur.execute( + "INSERT INTO device_measurements (benchmark_id, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes) VALUES (%s, %s, %s, %s, %s)", + (benchmark_id, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes), + ) + self.logger.debug( + f"inserted device measurements for benchmark #{benchmark_id} [CPU util: {cpu_util}, mem MBs: {mem_megabytes}, GPU util: {gpu_util}, GPU mem MBs: {gpu_mem_megabytes}]" + ) + + def collect_model_measurements(self, benchmark_id: int, measurements: Dict[str, float]): + with self.conn.cursor() as cur: + cur.execute( + """ + INSERT INTO model_measurements ( + benchmark_id, + measurements + ) VALUES (%s, %s) + """, + ( + benchmark_id, + measurements, + ), + ) + self.logger.debug(f"inserted model measurements for benchmark #{benchmark_id}: {measurements}") + + def close(self): + self.conn.close() + + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + +handler = logging.StreamHandler(sys.stdout) +handler.setLevel(logging.INFO) +formatter = logging.Formatter("[%(levelname)s - %(asctime)s] %(message)s") +handler.setFormatter(formatter) +logger.addHandler(handler) + + +def parse_arguments(): + """ + Parse command line arguments for the benchmarking CLI. + """ + parser = argparse.ArgumentParser(description="CLI for benchmarking the huggingface/transformers.") + + parser.add_argument( + "branch", + type=str, + help="The branch name on which the benchmarking is performed.", + ) + + parser.add_argument( + "commit_id", + type=str, + help="The commit hash on which the benchmarking is performed.", + ) + + parser.add_argument( + "commit_msg", + type=str, + help="The commit message associated with the commit, truncated to 70 characters.", + ) + + args = parser.parse_args() + + return args.branch, args.commit_id, args.commit_msg + + +def import_from_path(module_name, file_path): + try: + spec = importlib.util.spec_from_file_location(module_name, file_path) + module = importlib.util.module_from_spec(spec) + sys.modules[module_name] = module + spec.loader.exec_module(module) + return module + except Exception as e: + raise ImportModuleException(f"failed to load python module: {e}") + + +if __name__ == "__main__": + benchmarks_folder_path = os.path.dirname(os.path.realpath(__file__)) + + branch, commit_id, commit_msg = parse_arguments() + + for entry in os.scandir(benchmarks_folder_path): + try: + if not entry.name.endswith(".py"): + continue + if entry.path == __file__: + continue + logger.debug(f"loading: {entry.name}") + module = import_from_path(entry.name.split(".")[0], entry.path) + logger.info(f"running benchmarks in: {entry.name}") + module.run_benchmark(logger, branch, commit_id, commit_msg) + except ImportModuleException as e: + logger.error(e) + except Exception as e: + logger.error(f"error running benchmarks for {entry.name}: {e}") diff --git a/docs/transformers/benchmark/config/generation.yaml b/docs/transformers/benchmark/config/generation.yaml new file mode 100644 index 0000000000000000000000000000000000000000..44a3f9ea49015491c7add94a8790d16c20dc76fc --- /dev/null +++ b/docs/transformers/benchmark/config/generation.yaml @@ -0,0 +1,57 @@ +defaults: + - benchmark # inheriting benchmark schema + - scenario: inference + - launcher: process + - backend: pytorch + - _self_ # for hydra 1.1 compatibility + +name: pytorch_generate + +launcher: + start_method: spawn + device_isolation: true + device_isolation_action: warn + +backend: + device: cuda + device_ids: 0 + no_weights: true + model: meta-llama/Llama-2-7b-hf + cache_implementation: static + torch_compile: true + torch_dtype: float16 + torch_compile_config: + backend: inductor + mode: reduce-overhead + fullgraph: true + +scenario: + input_shapes: + batch_size: 1 + sequence_length: 7 + generate_kwargs: + max_new_tokens: 128 + min_new_tokens: 128 + do_sample: false + memory: true + latency: true + iterations: 2 + duration: 0 + + +# hydra/cli specific settings +hydra: + run: + # where to store run results + dir: runs/${name} + job: + # change working directory to the run directory + chdir: true + env_set: + # set environment variable OVERRIDE_BENCHMARKS to 1 + # to not skip benchmarks that have been run before + OVERRIDE_BENCHMARKS: 1 + LOG_LEVEL: WARN + sweep: + dir: multirun + subdir: ${hydra.job.override_dirname} \ No newline at end of file diff --git a/docs/transformers/benchmark/default.yml b/docs/transformers/benchmark/default.yml new file mode 100644 index 0000000000000000000000000000000000000000..f3f02cab34d1bd2451eda4744ccf4f4eb7d0e6cc --- /dev/null +++ b/docs/transformers/benchmark/default.yml @@ -0,0 +1,10 @@ +apiVersion: 1 + +providers: + - name: 'Transformers Benchmarks' + orgId: 1 + type: file + updateIntervalSeconds: 10 + allowUiUpdates: true + options: + path: /etc/grafana/dashboards diff --git a/docs/transformers/benchmark/grafana_dashboard.json b/docs/transformers/benchmark/grafana_dashboard.json new file mode 100644 index 0000000000000000000000000000000000000000..caaec78a522303eaadc66171d405e203e7475694 --- /dev/null +++ b/docs/transformers/benchmark/grafana_dashboard.json @@ -0,0 +1,2375 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 1, + "links": [ + { + "asDropdown": false, + "icon": "external link", + "includeVars": false, + "keepTime": false, + "tags": [], + "targetBlank": false, + "title": "Go to data", + "tooltip": "Go to data", + "type": "link", + "url": "http://transformers-benchmarks.hf.co/d/fdz33iyzln9c0a/transformers-benchmarks?orgId=1&from=${StartTime}&to=${EndTime}" + } + ], + "liveNow": true, + "panels": [ + { + "datasource": { + "default": true, + "type": "grafana-postgresql-datasource", + "uid": "be28nkzirtb0gd" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "left", + "cellOptions": { + "type": "auto" + }, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "gpu_name" + }, + "properties": [ + { + "id": "custom.width", + "value": 202 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "left" + }, + "properties": [ + { + "id": "custom.width", + "value": 407 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "commit_message" + }, + "properties": [ + { + "id": "custom.width", + "value": 524 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "commit_id" + }, + "properties": [ + { + "id": "custom.width", + "value": 353 + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "model_id" + }, + "properties": [ + { + "id": "custom.width", + "value": 216 + } + ] + } + ] + }, + "gridPos": { + "h": 6, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 5, + "options": { + "cellHeight": "sm", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "showHeader": true, + "sortBy": [] + }, + "pluginVersion": "11.2.2", + "targets": [ + { + "datasource": { + "default": true, + "type": "grafana-postgresql-datasource", + "uid": "be28nkzirtb0gd" + }, + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT commit_id, commit_message, metadata->>'gpu_name' as gpu_name, metadata->>'model_id' as model_id, created_at AS date FROM benchmarks WHERE branch = '${branch}' AND metadata->>'gpu_name' = '${gpu_name}' ORDER BY benchmark_id DESC LIMIT ${last_n_commits};", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [ + { + "name": "commit_id", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "gpu_name", + "type": "functionParameter" + } + ], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50, + "whereJsonTree": { + "children1": [ + { + "id": "baaa8aaa-89ab-4cde-b012-31922f96de3f", + "properties": { + "field": "commit_id", + "fieldSrc": "field", + "operator": "equal", + "value": [ + "${commit}" + ], + "valueError": [ + null + ], + "valueSrc": [ + "value" + ], + "valueType": [ + "text" + ] + }, + "type": "rule" + } + ], + "id": "bab88a98-0123-4456-b89a-b1922f7d4f11", + "type": "group" + }, + "whereString": "commit_id = '${commit}'" + }, + "table": "benchmarks" + } + ], + "transparent": true, + "type": "table" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 6 + }, + "id": 13, + "panels": [], + "title": "Eager Forward Pass", + "type": "row" + }, + { + "datasource": { + "default": true, + "type": "grafana-postgresql-datasource", + "uid": "be28nkzirtb0gd" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-YlBl" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "fillOpacity": 80, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineWidth": 0, + "scaleDistribution": { + "type": "linear" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 0, + "y": 7 + }, + "id": 7, + "options": { + "barRadius": 0.05, + "barWidth": 0.8, + "fullHighlight": false, + "groupWidth": 0.7, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "orientation": "auto", + "showValue": "auto", + "stacking": "none", + "tooltip": { + "mode": "single", + "sort": "none" + }, + "xTickLabelRotation": 0, + "xTickLabelSpacing": 0 + }, + "pluginVersion": "11.2.2", + "targets": [ + { + "datasource": { + "default": true, + "type": "grafana-postgresql-datasource", + "uid": "be28nkzirtb0gd" + }, + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT CAST(m.measurements->'first_eager_forward_pass_time_secs' AS double precision) AS first_eager_forward_pass_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + } + } + ], + "title": "First eager forward pass", + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": {}, + "sort": [ + { + "field": "time" + } + ] + } + } + ], + "transparent": true, + "type": "barchart" + }, + { + "datasource": { + "default": true, + "type": "grafana-postgresql-datasource", + "uid": "be28nkzirtb0gd" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-YlBl" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "fillOpacity": 80, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineWidth": 0, + "scaleDistribution": { + "type": "linear" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 12, + "y": 7 + }, + "id": 9, + "options": { + "barRadius": 0.05, + "barWidth": 0.8, + "fullHighlight": false, + "groupWidth": 0.7, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "orientation": "auto", + "showValue": "auto", + "stacking": "none", + "tooltip": { + "mode": "single", + "sort": "none" + }, + "xTickLabelRotation": 0, + "xTickLabelSpacing": 0 + }, + "targets": [ + { + "datasource": { + "default": true, + "type": "grafana-postgresql-datasource", + "uid": "be28nkzirtb0gd" + }, + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT CAST(m.measurements->'second_eager_forward_pass_time_secs' AS double precision) AS second_eager_forward_pass_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + } + } + ], + "title": "Second eager forward pass", + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": {}, + "sort": [ + { + "field": "time" + } + ] + } + } + ], + "transparent": true, + "type": "barchart" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 18 + }, + "id": 16, + "panels": [], + "title": "Time to next token", + "type": "row" + }, + { + "datasource": { + "default": true, + "type": "grafana-postgresql-datasource", + "uid": "be28nkzirtb0gd" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-YlBl" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "fillOpacity": 80, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineWidth": 0, + "scaleDistribution": { + "type": "linear" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 0, + "y": 19 + }, + "id": 17, + "options": { + "barRadius": 0.05, + "barWidth": 0.8, + "fullHighlight": false, + "groupWidth": 0.7, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "orientation": "auto", + "showValue": "always", + "stacking": "none", + "tooltip": { + "mode": "single", + "sort": "none" + }, + "xTickLabelRotation": 0, + "xTickLabelSpacing": 0 + }, + "targets": [ + { + "datasource": { + "default": true, + "type": "grafana-postgresql-datasource", + "uid": "be28nkzirtb0gd" + }, + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT CAST(m.measurements->'time_to_first_token_secs' AS double precision) AS time_to_first_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + } + } + ], + "title": "Time to first token", + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": {}, + "sort": [ + { + "field": "time" + } + ] + } + } + ], + "transparent": true, + "type": "barchart" + }, + { + "datasource": { + "default": true, + "type": "grafana-postgresql-datasource", + "uid": "be28nkzirtb0gd" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-YlBl" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "fillOpacity": 80, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineWidth": 0, + "scaleDistribution": { + "type": "linear" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 12, + "y": 19 + }, + "id": 18, + "options": { + "barRadius": 0.05, + "barWidth": 0.8, + "fullHighlight": false, + "groupWidth": 0.7, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "orientation": "auto", + "showValue": "always", + "stacking": "none", + "tooltip": { + "mode": "single", + "sort": "none" + }, + "xTickLabelRotation": 0, + "xTickLabelSpacing": 0 + }, + "targets": [ + { + "datasource": { + "default": true, + "type": "grafana-postgresql-datasource", + "uid": "be28nkzirtb0gd" + }, + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT CAST(m.measurements->'time_to_second_token_secs' AS double precision) AS time_to_second_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + } + } + ], + "title": "Time to second token", + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": {}, + "sort": [ + { + "field": "time" + } + ] + } + } + ], + "transparent": true, + "type": "barchart" + }, + { + "datasource": { + "default": true, + "type": "grafana-postgresql-datasource", + "uid": "be28nkzirtb0gd" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-YlBl" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "fillOpacity": 80, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineWidth": 0, + "scaleDistribution": { + "type": "linear" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 0, + "y": 30 + }, + "id": 19, + "options": { + "barRadius": 0.05, + "barWidth": 0.8, + "fullHighlight": false, + "groupWidth": 0.7, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "orientation": "auto", + "showValue": "always", + "stacking": "none", + "tooltip": { + "mode": "single", + "sort": "none" + }, + "xTickLabelRotation": 0, + "xTickLabelSpacing": 0 + }, + "targets": [ + { + "datasource": { + "default": true, + "type": "grafana-postgresql-datasource", + "uid": "be28nkzirtb0gd" + }, + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT CAST(m.measurements->'time_to_third_token_secs' AS double precision) AS time_to_third_token_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + } + } + ], + "title": "Time to third token", + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": {}, + "sort": [ + { + "field": "time" + } + ] + } + } + ], + "transparent": true, + "type": "barchart" + }, + { + "datasource": { + "default": true, + "type": "grafana-postgresql-datasource", + "uid": "be28nkzirtb0gd" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-YlBl" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "fillOpacity": 80, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineWidth": 0, + "scaleDistribution": { + "type": "linear" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 12, + "y": 30 + }, + "id": 20, + "options": { + "barRadius": 0.05, + "barWidth": 0.8, + "fullHighlight": false, + "groupWidth": 0.7, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "orientation": "auto", + "showValue": "always", + "stacking": "none", + "tooltip": { + "mode": "single", + "sort": "none" + }, + "xTickLabelRotation": 0, + "xTickLabelSpacing": 0 + }, + "targets": [ + { + "datasource": { + "default": true, + "type": "grafana-postgresql-datasource", + "uid": "be28nkzirtb0gd" + }, + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT CAST(m.measurements->'time_to_next_token_mean_secs' AS double precision) AS time_to_next_token_mean_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + } + } + ], + "title": "Time to subsequent next tokens mean", + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": {}, + "sort": [ + { + "field": "time" + } + ] + } + } + ], + "transparent": true, + "type": "barchart" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 41 + }, + "id": 14, + "panels": [], + "title": "Compiled Generate", + "type": "row" + }, + { + "datasource": { + "default": true, + "type": "grafana-postgresql-datasource", + "uid": "be28nkzirtb0gd" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-YlBl" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "fillOpacity": 80, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineWidth": 0, + "scaleDistribution": { + "type": "linear" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 0, + "y": 42 + }, + "id": 8, + "options": { + "barRadius": 0.05, + "barWidth": 0.8, + "fullHighlight": false, + "groupWidth": 0.7, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "orientation": "auto", + "showValue": "always", + "stacking": "none", + "tooltip": { + "mode": "single", + "sort": "none" + }, + "xTickLabelRotation": 0, + "xTickLabelSpacing": 0 + }, + "targets": [ + { + "datasource": { + "default": true, + "type": "grafana-postgresql-datasource", + "uid": "be28nkzirtb0gd" + }, + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT CAST(m.measurements->'first_compile_generate_time_secs' AS double precision) AS first_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + } + } + ], + "title": "First compile generate", + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": {}, + "sort": [ + { + "field": "time" + } + ] + } + } + ], + "transparent": true, + "type": "barchart" + }, + { + "datasource": { + "default": true, + "type": "grafana-postgresql-datasource", + "uid": "be28nkzirtb0gd" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-YlBl" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "fillOpacity": 80, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineWidth": 0, + "scaleDistribution": { + "type": "linear" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 12, + "y": 42 + }, + "id": 10, + "options": { + "barRadius": 0.05, + "barWidth": 0.8, + "fullHighlight": false, + "groupWidth": 0.7, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "orientation": "auto", + "showValue": "auto", + "stacking": "none", + "tooltip": { + "mode": "single", + "sort": "none" + }, + "xTickLabelRotation": 0, + "xTickLabelSpacing": 0 + }, + "targets": [ + { + "datasource": { + "default": true, + "type": "grafana-postgresql-datasource", + "uid": "be28nkzirtb0gd" + }, + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT CAST(m.measurements->'second_compile_generate_time_secs' AS double precision) AS second_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + } + } + ], + "title": "Second compile generate", + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": {}, + "sort": [ + { + "field": "time" + } + ] + } + } + ], + "transparent": true, + "type": "barchart" + }, + { + "datasource": { + "default": true, + "type": "grafana-postgresql-datasource", + "uid": "be28nkzirtb0gd" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-YlBl" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "fillOpacity": 80, + "gradientMode": "scheme", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineWidth": 0, + "scaleDistribution": { + "type": "linear" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 0, + "y": 53 + }, + "id": 11, + "options": { + "barRadius": 0.05, + "barWidth": 0.8, + "fullHighlight": false, + "groupWidth": 0.7, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "orientation": "auto", + "showValue": "auto", + "stacking": "none", + "tooltip": { + "mode": "single", + "sort": "none" + }, + "xTickLabelRotation": 0, + "xTickLabelSpacing": 0 + }, + "targets": [ + { + "datasource": { + "default": true, + "type": "grafana-postgresql-datasource", + "uid": "be28nkzirtb0gd" + }, + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT CAST(m.measurements->'third_compile_generate_time_secs' AS double precision) AS third_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + } + } + ], + "title": "Third compile generate", + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": {}, + "sort": [ + { + "field": "time" + } + ] + } + } + ], + "transparent": true, + "type": "barchart" + }, + { + "datasource": { + "default": true, + "type": "grafana-postgresql-datasource", + "uid": "be28nkzirtb0gd" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-YlBl" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "fillOpacity": 80, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineWidth": 0, + "scaleDistribution": { + "type": "linear" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "s" + }, + "overrides": [] + }, + "gridPos": { + "h": 11, + "w": 12, + "x": 12, + "y": 53 + }, + "id": 12, + "options": { + "barRadius": 0.05, + "barWidth": 0.8, + "fullHighlight": false, + "groupWidth": 0.7, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "orientation": "auto", + "showValue": "auto", + "stacking": "none", + "tooltip": { + "mode": "single", + "sort": "none" + }, + "xTickLabelRotation": 0, + "xTickLabelSpacing": 0 + }, + "targets": [ + { + "datasource": { + "default": true, + "type": "grafana-postgresql-datasource", + "uid": "be28nkzirtb0gd" + }, + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT CAST(m.measurements->'fourth_compile_generate_time_secs' AS double precision) AS fourth_compile_generate_time_secs, left(b.commit_id, 7), m.time FROM benchmarks as b JOIN model_measurements AS m ON b.benchmark_id = m.benchmark_id WHERE b.branch = '${branch}' AND b.metadata->>'gpu_name' = '${gpu_name}' ORDER BY b.benchmark_id DESC LIMIT ${last_n_commits};", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50 + } + } + ], + "title": "Fourth compile generate", + "transformations": [ + { + "id": "sortBy", + "options": { + "fields": {}, + "sort": [ + { + "field": "time" + } + ] + } + } + ], + "transparent": true, + "type": "barchart" + }, + { + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 64 + }, + "id": 15, + "panels": [ + { + "datasource": {}, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": 60000, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 65 + }, + "id": 1, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "default": true, + "type": "grafana-postgresql-datasource", + "uid": "be28nkzirtb0gd" + }, + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT\n d.cpu_util,\n d.time\nFROM\n benchmarks AS b\n JOIN device_measurements AS d ON b.benchmark_id = d.benchmark_id\nWHERE\n branch = '${branch}';", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [ + { + "name": "cpu_util", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "mem_megabytes", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "gpu_util", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "gpu_mem_megabytes", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "\"time\"", + "type": "functionParameter" + } + ], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50, + "whereJsonTree": { + "children1": [ + { + "id": "baa888b8-89ab-4cde-b012-31922f8671e9", + "properties": { + "field": "commit_id", + "fieldSrc": "field", + "operator": "equal", + "value": [ + "${commit}" + ], + "valueError": [ + null + ], + "valueSrc": [ + "value" + ], + "valueType": [ + "text" + ] + }, + "type": "rule" + } + ], + "id": "bab88a98-0123-4456-b89a-b1922f7d4f11", + "type": "group" + }, + "whereString": "commit_id = '${commit}'" + }, + "table": "measurements" + } + ], + "title": "CPU Utilization", + "transparent": true, + "type": "timeseries" + }, + { + "datasource": {}, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": 60000, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "percent" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 65 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "default": true, + "type": "grafana-postgresql-datasource", + "uid": "be28nkzirtb0gd" + }, + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT\n b.commit_id,\n d.gpu_util,\n d.time\nFROM\n benchmarks AS b\n JOIN device_measurements AS d ON b.benchmark_id = d.benchmark_id\nWHERE\n branch = '${branch}';", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [ + { + "name": "cpu_util", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "mem_megabytes", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "gpu_util", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "gpu_mem_megabytes", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "\"time\"", + "type": "functionParameter" + } + ], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50, + "whereJsonTree": { + "children1": [ + { + "id": "baa888b8-89ab-4cde-b012-31922f8671e9", + "properties": { + "field": "commit_id", + "fieldSrc": "field", + "operator": "equal", + "value": [ + "${commit}" + ], + "valueError": [ + null + ], + "valueSrc": [ + "value" + ], + "valueType": [ + "text" + ] + }, + "type": "rule" + } + ], + "id": "bab88a98-0123-4456-b89a-b1922f7d4f11", + "type": "group" + }, + "whereString": "commit_id = '${commit}'" + }, + "table": "measurements" + } + ], + "title": "GPU Utilization", + "transparent": true, + "type": "timeseries" + }, + { + "datasource": {}, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": 60000, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decmbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 0, + "y": 74 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "default": true, + "type": "grafana-postgresql-datasource", + "uid": "be28nkzirtb0gd" + }, + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT d.mem_megabytes, d.time FROM benchmarks AS b JOIN device_measurements AS d ON b.benchmark_id = d.benchmark_id WHERE branch = '${branch}';", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [ + { + "name": "cpu_util", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "mem_megabytes", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "gpu_util", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "gpu_mem_megabytes", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "\"time\"", + "type": "functionParameter" + } + ], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50, + "whereJsonTree": { + "children1": [ + { + "id": "baa888b8-89ab-4cde-b012-31922f8671e9", + "properties": { + "field": "commit_id", + "fieldSrc": "field", + "operator": "equal", + "value": [ + "${commit}" + ], + "valueError": [ + null + ], + "valueSrc": [ + "value" + ], + "valueType": [ + "text" + ] + }, + "type": "rule" + } + ], + "id": "bab88a98-0123-4456-b89a-b1922f7d4f11", + "type": "group" + }, + "whereString": "commit_id = '${commit}'" + }, + "table": "measurements" + } + ], + "title": "Memory usage", + "transparent": true, + "type": "timeseries" + }, + { + "datasource": {}, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "barWidthFactor": 0.6, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": 60000, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green" + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "decmbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 12, + "x": 12, + "y": 74 + }, + "id": 3, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "default": true, + "type": "grafana-postgresql-datasource", + "uid": "be28nkzirtb0gd" + }, + "editorMode": "code", + "format": "table", + "rawQuery": true, + "rawSql": "SELECT\n d.gpu_mem_megabytes,\n d.time\nFROM\n benchmarks AS b\n JOIN device_measurements AS d ON b.benchmark_id = d.benchmark_id\nWHERE\n branch = '${branch}';", + "refId": "A", + "sql": { + "columns": [ + { + "parameters": [ + { + "name": "cpu_util", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "mem_megabytes", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "gpu_util", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "gpu_mem_megabytes", + "type": "functionParameter" + } + ], + "type": "function" + }, + { + "parameters": [ + { + "name": "\"time\"", + "type": "functionParameter" + } + ], + "type": "function" + } + ], + "groupBy": [ + { + "property": { + "type": "string" + }, + "type": "groupBy" + } + ], + "limit": 50, + "whereJsonTree": { + "children1": [ + { + "id": "baa888b8-89ab-4cde-b012-31922f8671e9", + "properties": { + "field": "commit_id", + "fieldSrc": "field", + "operator": "equal", + "value": [ + "${commit}" + ], + "valueError": [ + null + ], + "valueSrc": [ + "value" + ], + "valueType": [ + "text" + ] + }, + "type": "rule" + } + ], + "id": "bab88a98-0123-4456-b89a-b1922f7d4f11", + "type": "group" + }, + "whereString": "commit_id = '${commit}'" + }, + "table": "measurements" + } + ], + "title": "GPU memory usage", + "transparent": true, + "type": "timeseries" + } + ], + "title": "Usage metrics", + "type": "row" + } + ], + "schemaVersion": 39, + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "main", + "value": "main" + }, + "datasource": { + "default": true, + "type": "grafana-postgresql-datasource", + "uid": "be28nkzirtb0gd" + }, + "definition": "SELECT DISTINCT branch FROM benchmarks;", + "description": "", + "hide": 0, + "includeAll": false, + "label": "branch", + "multi": false, + "name": "branch", + "options": [], + "query": "SELECT DISTINCT branch FROM benchmarks;", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": { + "selected": false, + "text": "1729701492845", + "value": "1729701492845" + }, + "datasource": { + "default": true, + "type": "grafana-postgresql-datasource", + "uid": "be28nkzirtb0gd" + }, + "definition": "SELECT created_at - INTERVAL '5 secs' FROM benchmarks WHERE branch = '${branch}' ORDER BY benchmark_id ASC LIMIT 1;", + "description": "", + "hide": 2, + "includeAll": false, + "multi": false, + "name": "StartTime", + "options": [], + "query": "SELECT created_at - INTERVAL '5 secs' FROM benchmarks WHERE branch = '${branch}' ORDER BY benchmark_id ASC LIMIT 1;", + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": { + "selected": false, + "text": "1730393397577", + "value": "1730393397577" + }, + "datasource": { + "default": true, + "type": "grafana-postgresql-datasource", + "uid": "be28nkzirtb0gd" + }, + "definition": "SELECT time + INTERVAL '5 secs' FROM benchmarks AS b JOIN device_measurements AS d ON b.benchmark_id = d.benchmark_id WHERE branch = '${branch}' ORDER BY b.benchmark_id DESC, d.measurement_id DESC LIMIT 1;", + "description": "", + "hide": 2, + "includeAll": false, + "multi": false, + "name": "EndTime", + "options": [], + "query": "SELECT time + INTERVAL '5 secs' FROM benchmarks AS b JOIN device_measurements AS d ON b.benchmark_id = d.benchmark_id WHERE branch = '${branch}' ORDER BY b.benchmark_id DESC, d.measurement_id DESC LIMIT 1;", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": { + "selected": false, + "text": "NVIDIA A10G", + "value": "NVIDIA A10G" + }, + "datasource": { + "type": "grafana-postgresql-datasource", + "uid": "be28nkzirtb0gd" + }, + "definition": "SELECT DISTINCT metadata->>'gpu_name' FROM benchmarks;", + "description": "", + "hide": 0, + "includeAll": false, + "label": "GPU", + "multi": false, + "name": "gpu_name", + "options": [], + "query": "SELECT DISTINCT metadata->>'gpu_name' FROM benchmarks;", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": { + "selected": true, + "text": "10", + "value": "10" + }, + "description": "The number of commits to display, going from most recent to the nth commit.", + "hide": 0, + "label": "Last # of commits", + "name": "last_n_commits", + "options": [ + { + "selected": true, + "text": "10", + "value": "10" + } + ], + "query": "10", + "skipUrlSync": false, + "type": "textbox" + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "hidden": false + }, + "timezone": "browser", + "title": "Transformers benchmarks", + "uid": "fdz33iyzln9c0a", + "version": 10, + "weekStart": "" +} diff --git a/docs/transformers/benchmark/grafana_datasource.yaml b/docs/transformers/benchmark/grafana_datasource.yaml new file mode 100644 index 0000000000000000000000000000000000000000..25f36254104ab5047fd36ddf00ebcde7289849a7 --- /dev/null +++ b/docs/transformers/benchmark/grafana_datasource.yaml @@ -0,0 +1,17 @@ +apiVersion: 1 +datasources: + - name: grafana-postgresql-datasource + uid: be28nkzirtb0gd + type: postgres + url: $GRAFANA_POSTGRES_DATASOURCE_URL + user: $GRAFANA_POSTGRES_DATASOURCE_USER + secureJsonData: + password: $GRAFANA_POSTGRES_DATASOURCE_PWD + jsonData: + database: metrics + maxOpenConns: 100 + maxIdleConns: 100 + maxIdleConnsAuto: true + connMaxLifetime: 14400 + postgresVersion: 1000 + timescaledb: false diff --git a/docs/transformers/benchmark/init_db.sql b/docs/transformers/benchmark/init_db.sql new file mode 100644 index 0000000000000000000000000000000000000000..a7864c4af183b6ac26a8ac5dc623282e94b44c91 --- /dev/null +++ b/docs/transformers/benchmark/init_db.sql @@ -0,0 +1,33 @@ +CREATE TABLE IF NOT EXISTS benchmarks ( + benchmark_id SERIAL PRIMARY KEY, + branch VARCHAR(255), + commit_id VARCHAR(72), + commit_message VARCHAR(70), + metadata jsonb, + created_at timestamp without time zone NOT NULL DEFAULT (current_timestamp AT TIME ZONE 'UTC') +); + +CREATE INDEX IF NOT EXISTS benchmarks_benchmark_id_idx ON benchmarks (benchmark_id); + +CREATE INDEX IF NOT EXISTS benchmarks_branch_idx ON benchmarks (branch); + +CREATE TABLE IF NOT EXISTS device_measurements ( + measurement_id SERIAL PRIMARY KEY, + benchmark_id int REFERENCES benchmarks (benchmark_id), + cpu_util double precision, + mem_megabytes double precision, + gpu_util double precision, + gpu_mem_megabytes double precision, + time timestamp without time zone NOT NULL DEFAULT (current_timestamp AT TIME ZONE 'UTC') +); + +CREATE INDEX IF NOT EXISTS device_measurements_branch_idx ON device_measurements (benchmark_id); + +CREATE TABLE IF NOT EXISTS model_measurements ( + measurement_id SERIAL PRIMARY KEY, + benchmark_id int REFERENCES benchmarks (benchmark_id), + measurements jsonb, + time timestamp without time zone NOT NULL DEFAULT (current_timestamp AT TIME ZONE 'UTC') +); + +CREATE INDEX IF NOT EXISTS model_measurements_branch_idx ON model_measurements (benchmark_id); diff --git a/docs/transformers/benchmark/llama.py b/docs/transformers/benchmark/llama.py new file mode 100644 index 0000000000000000000000000000000000000000..1857dee3d66b1ac0497e50e45dfe2f5fdead76aa --- /dev/null +++ b/docs/transformers/benchmark/llama.py @@ -0,0 +1,342 @@ +from logging import Logger +import os +from threading import Event, Thread +from time import perf_counter, sleep +from typing import Optional +from benchmarks_entrypoint import MetricsRecorder +import gpustat +import psutil +import psycopg2 +import torch + +from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, StaticCache + + +os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" + +os.environ["TOKENIZERS_PARALLELISM"] = "1" +torch.set_float32_matmul_precision("high") + + +def collect_metrics(benchmark_id, continue_metric_collection, metrics_recorder): + p = psutil.Process(os.getpid()) + while not continue_metric_collection.is_set(): + with p.oneshot(): + cpu_util = p.cpu_percent() + mem_megabytes = p.memory_info().rss / (1024 * 1024) + gpu_stats = gpustat.GPUStatCollection.new_query() + gpu_util = gpu_stats[0]["utilization.gpu"] + gpu_mem_megabytes = gpu_stats[0]["memory.used"] + metrics_recorder.collect_device_measurements( + benchmark_id, cpu_util, mem_megabytes, gpu_util, gpu_mem_megabytes + ) + sleep(0.01) + + +def run_benchmark(logger: Logger, branch: str, commit_id: str, commit_msg: str, num_tokens_to_generate=100): + continue_metric_collection = Event() + metrics_thread = None + model_id = "meta-llama/Llama-2-7b-hf" + metrics_recorder = MetricsRecorder(psycopg2.connect("dbname=metrics"), logger, branch, commit_id, commit_msg) + try: + gpu_stats = gpustat.GPUStatCollection.new_query() + gpu_name = gpu_stats[0]["name"] + benchmark_id = metrics_recorder.initialise_benchmark({"gpu_name": gpu_name, "model_id": model_id}) + logger.info(f"running benchmark #{benchmark_id} on {gpu_name} for {model_id}") + metrics_thread = Thread( + target=collect_metrics, + args=[benchmark_id, continue_metric_collection, metrics_recorder], + ) + metrics_thread.start() + logger.info("started background thread to fetch device metrics") + + os.environ["TOKENIZERS_PARALLELISM"] = "false" # silence warnings when compiling + + device = "cuda" + + logger.info("downloading weights") + # This is to avoid counting download in model load time measurement + model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16) + gen_config = GenerationConfig(do_sample=False, top_p=1, temperature=1) + logger.info("loading model") + start = perf_counter() + model = AutoModelForCausalLM.from_pretrained( + model_id, torch_dtype=torch.float16, generation_config=gen_config + ).eval() + model.to(device) + torch.cuda.synchronize() + end = perf_counter() + model_load_time = end - start + logger.info(f"loaded model in: {model_load_time}s") + + tokenizer = AutoTokenizer.from_pretrained(model_id) + + prompt = "Why dogs are so cute?" + inputs = tokenizer(prompt, return_tensors="pt").to(device) + + # Specify the max length (including both the prompt and the response) + # When calling `generate` with `cache_implementation="static" later, this is also used to create a `StaticCache` object + # with sequence length = `max_length`. The longer the more you will re-use it + seq_length = inputs["input_ids"].shape[1] + model.generation_config.max_length = seq_length + num_tokens_to_generate + batch_size = inputs["input_ids"].shape[0] + + # Copied from the gpt-fast repo + def multinomial_sample_one_no_sync(probs_sort): # Does multinomial sampling without a cuda synchronization + q = torch.empty_like(probs_sort).exponential_(1) + return torch.argmax(probs_sort / q, dim=-1, keepdim=True).to(dtype=torch.int) + + def logits_to_probs(logits, temperature: float = 1.0, top_k: Optional[int] = None): + logits = logits / max(temperature, 1e-5) + + if top_k is not None: + v, _ = torch.topk(logits, min(top_k, logits.size(-1))) + pivot = v.select(-1, -1).unsqueeze(-1) + logits = torch.where(logits < pivot, -float("Inf"), logits) + probs = torch.nn.functional.softmax(logits, dim=-1) + return probs + + def sample(logits, temperature: float = 1.0, top_k: Optional[int] = None): + probs = logits_to_probs(logits[:, -1], temperature, top_k) + idx_next = multinomial_sample_one_no_sync(probs) + return idx_next, probs + + def decode_one_token(model, cur_token, cache_position, past_key_values): + logits = model( + cur_token, + cache_position=cache_position, + past_key_values=past_key_values, + return_dict=False, + use_cache=True, + )[0] + new_token = sample(logits, temperature=0.6, top_k=5)[0] + return new_token + + ######### + # Eager # + ######### + with torch.no_grad(): + past_key_values = StaticCache( + model.config, + max_batch_size=batch_size, + device=device, + dtype=torch.float16, + max_cache_len=seq_length + num_tokens_to_generate, + ) + cache_position = torch.arange(seq_length, device=device) + start = perf_counter() + model( + **inputs, + cache_position=cache_position, + past_key_values=past_key_values, + return_dict=False, + use_cache=True, + ) + end = perf_counter() + first_eager_fwd_pass_time = end - start + logger.info(f"completed first eager fwd pass in: {first_eager_fwd_pass_time}s") + start = perf_counter() + output = model.generate(**inputs, do_sample=False) + end = perf_counter() + first_eager_generate_time = end - start + logger.info(f"completed first eager generation in: {first_eager_generate_time}s") + logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}") + + past_key_values = StaticCache( + model.config, + max_batch_size=batch_size, + device=device, + dtype=torch.float16, + max_cache_len=seq_length + num_tokens_to_generate, + ) + cache_position = torch.arange(seq_length, device=device) + start = perf_counter() + model( + **inputs, + cache_position=cache_position, + past_key_values=past_key_values, + return_dict=False, + use_cache=True, + ) + end = perf_counter() + second_eager_fwd_pass_time = end - start + logger.info(f"completed second eager fwd pass in: {second_eager_fwd_pass_time}s") + start = perf_counter() + model.generate(**inputs, do_sample=False) + end = perf_counter() + second_eager_generate_time = end - start + logger.info(f"completed second eager generation in: {second_eager_generate_time}s") + logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}") + + torch.compiler.reset() + + ################ + # Forward pass # + ################ + + # `torch.compile(model, ...)` is not recommended as you compile callbacks + # and full generate. We recommend compiling only the forward for now. + # "reduce-overhead" will use cudagraphs. + generated_ids = torch.zeros( + (batch_size, num_tokens_to_generate + seq_length), dtype=torch.int, device=device + ) + + generated_ids[:, :seq_length] = inputs["input_ids"] + decode_one_token = torch.compile(decode_one_token, mode="reduce-overhead", fullgraph=True) + # model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True) + # TODO use decode_one_token(model, input_id.clone(), cache_position) for verification + past_key_values = StaticCache( + model.config, + max_batch_size=batch_size, + device=device, + dtype=torch.float16, + max_cache_len=seq_length + num_tokens_to_generate + 10, + ) + cache_position = torch.arange(seq_length, device=device) + all_generated_tokens = [] + ### First compile, prefill + start = perf_counter() + next_token = decode_one_token( + model, inputs["input_ids"], cache_position=cache_position, past_key_values=past_key_values + ) + torch.cuda.synchronize() + end = perf_counter() + time_to_first_token = end - start + logger.info(f"completed first compile generation in: {time_to_first_token}s") + cache_position += 1 + all_generated_tokens += next_token.tolist() + + cache_position = torch.tensor([seq_length], device=device) + ### First compile, decoding + start = perf_counter() + next_token = decode_one_token( + model, next_token.clone(), cache_position=cache_position, past_key_values=past_key_values + ) + torch.cuda.synchronize() + end = perf_counter() + time_to_second_token = end - start + logger.info(f"completed second compile generation in: {time_to_second_token}s") + cache_position += 1 + all_generated_tokens += next_token.tolist() + + ### Second compile, decoding + start = perf_counter() + next_token = decode_one_token( + model, next_token.clone(), cache_position=cache_position, past_key_values=past_key_values + ) + torch.cuda.synchronize() + end = perf_counter() + time_to_third_token = end - start + logger.info(f"completed third compile forward in: {time_to_third_token}s") + cache_position += 1 + all_generated_tokens += next_token.tolist() + + ### Using cuda graphs decoding + + start = perf_counter() + for _ in range(1, num_tokens_to_generate): + all_generated_tokens += next_token.tolist() + next_token = decode_one_token( + model, next_token.clone(), cache_position=cache_position, past_key_values=past_key_values + ) + cache_position += 1 + torch.cuda.synchronize() + end = perf_counter() + mean_time_to_next_token = (end - start) / num_tokens_to_generate + logger.info(f"completed next compile generation in: {mean_time_to_next_token}s") + logger.info(f"generated: {tokenizer.batch_decode(all_generated_tokens)}") + + #################### + # Generate compile # + #################### + torch.compiler.reset() + # we will not compile full generate as it' s to intensive, tho we measure full forward! + + past_key_values = StaticCache( + model.config, + max_batch_size=batch_size, + device=device, + dtype=torch.float16, + max_cache_len=seq_length + 128, + ) + + # 1st call + start = perf_counter() + output = model.generate(**inputs, past_key_values=past_key_values) + torch.cuda.synchronize() + end = perf_counter() + first_compile_generate_time = end - start + logger.info(f"completed first compile generation in: {first_compile_generate_time}s") + logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}") + + past_key_values = StaticCache( + model.config, + max_batch_size=batch_size, + device=device, + dtype=torch.float16, + max_cache_len=seq_length + 128, + ) + # 2nd call + start = perf_counter() + output = model.generate(**inputs, past_key_values=past_key_values) + torch.cuda.synchronize() + end = perf_counter() + second_compile_generate_time = end - start + logger.info(f"completed second compile generation in: {second_compile_generate_time}s") + logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}") + + past_key_values = StaticCache( + model.config, + max_batch_size=batch_size, + device=device, + dtype=torch.float16, + max_cache_len=seq_length + 128, + ) + + # 3nd call + start = perf_counter() + output = model.generate(**inputs, past_key_values=past_key_values) + end = perf_counter() + third_compile_generate_time = end - start + logger.info(f"completed third compile generation in: {third_compile_generate_time}s") + logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}") + + past_key_values = StaticCache( + model.config, + max_batch_size=batch_size, + device=device, + dtype=torch.float16, + max_cache_len=seq_length + 128, + ) + # 4th call + start = perf_counter() + output = model.generate(**inputs, past_key_values=past_key_values) + end = perf_counter() + fourth_compile_generate_time = end - start + logger.info(f"completed fourth compile generation in: {fourth_compile_generate_time}s") + logger.info(f"generated: {tokenizer.batch_decode(output.cpu().tolist())}") + + metrics_recorder.collect_model_measurements( + benchmark_id, + { + "model_load_time": model_load_time, + "first_eager_forward_pass_time_secs": first_eager_fwd_pass_time, + "second_eager_forward_pass_time_secs": second_eager_fwd_pass_time, + "first_eager_generate_time_secs": first_eager_generate_time, + "second_eager_generate_time_secs": second_eager_generate_time, + "time_to_first_token_secs": time_to_first_token, + "time_to_second_token_secs": time_to_second_token, + "time_to_third_token_secs": time_to_third_token, + "time_to_next_token_mean_secs": mean_time_to_next_token, + "first_compile_generate_time_secs": first_compile_generate_time, + "second_compile_generate_time_secs": second_compile_generate_time, + "third_compile_generate_time_secs": third_compile_generate_time, + "fourth_compile_generate_time_secs": fourth_compile_generate_time, + }, + ) + except Exception as e: + logger.error(f"Caught exception: {e}") + continue_metric_collection.set() + if metrics_thread is not None: + metrics_thread.join() + metrics_recorder.close() diff --git a/docs/transformers/benchmark/optimum_benchmark_wrapper.py b/docs/transformers/benchmark/optimum_benchmark_wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..c43e9a73e3160de938f0181e1d606fe742d5a5ef --- /dev/null +++ b/docs/transformers/benchmark/optimum_benchmark_wrapper.py @@ -0,0 +1,16 @@ +import argparse +import subprocess + + +def main(config_dir, config_name, args): + subprocess.run(["optimum-benchmark", "--config-dir", f"{config_dir}", "--config-name", f"{config_name}"] + ["hydra/job_logging=disabled", "hydra/hydra_logging=disabled"] + args) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument("--config-dir", type=str, required=True, help="The path to the config directory.") + parser.add_argument("--config-name", type=str, required=True, help="The config name.") + args, unknown = parser.parse_known_args() + + main(args.config_dir, args.config_name, unknown) diff --git a/docs/transformers/benchmark/requirements.txt b/docs/transformers/benchmark/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..50e9dfaddfa4cab8bea5b51561198d47d14c3b35 --- /dev/null +++ b/docs/transformers/benchmark/requirements.txt @@ -0,0 +1,5 @@ +gpustat==1.1.1 +psutil==6.0.0 +psycopg2==2.9.9 +torch>=2.4.0 +hf_transfer \ No newline at end of file diff --git a/docs/transformers/build/lib/transformers/__init__.py b/docs/transformers/build/lib/transformers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e75357c39f9c296fa99249ce38e4633a86157707 --- /dev/null +++ b/docs/transformers/build/lib/transformers/__init__.py @@ -0,0 +1,1032 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# When adding a new object to this init, remember to add it twice: once inside the `_import_structure` dictionary and +# once inside the `if TYPE_CHECKING` branch. The `TYPE_CHECKING` should have import statements as usual, but they are +# only there for type checking. The `_import_structure` is a dictionary submodule to list of object names, and is used +# to defer the actual importing for when the objects are requested. This way `import transformers` provides the names +# in the namespace without actually importing anything (and especially none of the backends). + +__version__ = "4.52.0.dev0" + +from pathlib import Path +from typing import TYPE_CHECKING + +# Check the dependencies satisfy the minimal versions required. +from . import dependency_versions_check +from .utils import ( + OptionalDependencyNotAvailable, + _LazyModule, + is_bitsandbytes_available, + is_essentia_available, + is_flax_available, + is_g2p_en_available, + is_keras_nlp_available, + is_librosa_available, + is_pretty_midi_available, + is_scipy_available, + is_sentencepiece_available, + is_speech_available, + is_tensorflow_text_available, + is_tf_available, + is_timm_available, + is_tokenizers_available, + is_torch_available, + is_torchaudio_available, + is_torchvision_available, + is_vision_available, + logging, +) +from .utils.import_utils import define_import_structure + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +# Base objects, independent of any specific backend +_import_structure = { + "audio_utils": [], + "commands": [], + "configuration_utils": ["PretrainedConfig"], + "convert_graph_to_onnx": [], + "convert_slow_tokenizers_checkpoints_to_fast": [], + "convert_tf_hub_seq_to_seq_bert_to_pytorch": [], + "data": [ + "DataProcessor", + "InputExample", + "InputFeatures", + "SingleSentenceClassificationProcessor", + "SquadExample", + "SquadFeatures", + "SquadV1Processor", + "SquadV2Processor", + "glue_compute_metrics", + "glue_convert_examples_to_features", + "glue_output_modes", + "glue_processors", + "glue_tasks_num_labels", + "squad_convert_examples_to_features", + "xnli_compute_metrics", + "xnli_output_modes", + "xnli_processors", + "xnli_tasks_num_labels", + ], + "data.data_collator": [ + "DataCollator", + "DataCollatorForLanguageModeling", + "DataCollatorForMultipleChoice", + "DataCollatorForPermutationLanguageModeling", + "DataCollatorForSeq2Seq", + "DataCollatorForSOP", + "DataCollatorForTokenClassification", + "DataCollatorForWholeWordMask", + "DataCollatorWithFlattening", + "DataCollatorWithPadding", + "DefaultDataCollator", + "default_data_collator", + ], + "data.metrics": [], + "data.processors": [], + "debug_utils": [], + "dependency_versions_check": [], + "dependency_versions_table": [], + "dynamic_module_utils": [], + "feature_extraction_sequence_utils": ["SequenceFeatureExtractor"], + "feature_extraction_utils": ["BatchFeature", "FeatureExtractionMixin"], + "file_utils": [], + "generation": [ + "AsyncTextIteratorStreamer", + "CompileConfig", + "GenerationConfig", + "TextIteratorStreamer", + "TextStreamer", + "WatermarkingConfig", + ], + "hf_argparser": ["HfArgumentParser"], + "hyperparameter_search": [], + "image_transforms": [], + "integrations": [ + "is_clearml_available", + "is_comet_available", + "is_dvclive_available", + "is_neptune_available", + "is_optuna_available", + "is_ray_available", + "is_ray_tune_available", + "is_sigopt_available", + "is_swanlab_available", + "is_tensorboard_available", + "is_wandb_available", + ], + "loss": [], + "modelcard": ["ModelCard"], + # Losses + "modeling_tf_pytorch_utils": [ + "convert_tf_weight_name_to_pt_weight_name", + "load_pytorch_checkpoint_in_tf2_model", + "load_pytorch_model_in_tf2_model", + "load_pytorch_weights_in_tf2_model", + "load_tf2_checkpoint_in_pytorch_model", + "load_tf2_model_in_pytorch_model", + "load_tf2_weights_in_pytorch_model", + ], + # Models + "onnx": [], + "pipelines": [ + "AudioClassificationPipeline", + "AutomaticSpeechRecognitionPipeline", + "CsvPipelineDataFormat", + "DepthEstimationPipeline", + "DocumentQuestionAnsweringPipeline", + "FeatureExtractionPipeline", + "FillMaskPipeline", + "ImageClassificationPipeline", + "ImageFeatureExtractionPipeline", + "ImageSegmentationPipeline", + "ImageTextToTextPipeline", + "ImageToImagePipeline", + "ImageToTextPipeline", + "JsonPipelineDataFormat", + "MaskGenerationPipeline", + "NerPipeline", + "ObjectDetectionPipeline", + "PipedPipelineDataFormat", + "Pipeline", + "PipelineDataFormat", + "QuestionAnsweringPipeline", + "SummarizationPipeline", + "TableQuestionAnsweringPipeline", + "Text2TextGenerationPipeline", + "TextClassificationPipeline", + "TextGenerationPipeline", + "TextToAudioPipeline", + "TokenClassificationPipeline", + "TranslationPipeline", + "VideoClassificationPipeline", + "VisualQuestionAnsweringPipeline", + "ZeroShotAudioClassificationPipeline", + "ZeroShotClassificationPipeline", + "ZeroShotImageClassificationPipeline", + "ZeroShotObjectDetectionPipeline", + "pipeline", + ], + "processing_utils": ["ProcessorMixin"], + "quantizers": [], + "testing_utils": [], + "tokenization_utils": ["PreTrainedTokenizer"], + "tokenization_utils_base": [ + "AddedToken", + "BatchEncoding", + "CharSpan", + "PreTrainedTokenizerBase", + "SpecialTokensMixin", + "TokenSpan", + ], + "trainer_callback": [ + "DefaultFlowCallback", + "EarlyStoppingCallback", + "PrinterCallback", + "ProgressCallback", + "TrainerCallback", + "TrainerControl", + "TrainerState", + ], + "trainer_utils": [ + "EvalPrediction", + "IntervalStrategy", + "SchedulerType", + "enable_full_determinism", + "set_seed", + ], + "training_args": ["TrainingArguments"], + "training_args_seq2seq": ["Seq2SeqTrainingArguments"], + "training_args_tf": ["TFTrainingArguments"], + "utils": [ + "CONFIG_NAME", + "MODEL_CARD_NAME", + "PYTORCH_PRETRAINED_BERT_CACHE", + "PYTORCH_TRANSFORMERS_CACHE", + "SPIECE_UNDERLINE", + "TF2_WEIGHTS_NAME", + "TF_WEIGHTS_NAME", + "TRANSFORMERS_CACHE", + "WEIGHTS_NAME", + "TensorType", + "add_end_docstrings", + "add_start_docstrings", + "is_apex_available", + "is_av_available", + "is_bitsandbytes_available", + "is_datasets_available", + "is_faiss_available", + "is_flax_available", + "is_keras_nlp_available", + "is_phonemizer_available", + "is_psutil_available", + "is_py3nvml_available", + "is_pyctcdecode_available", + "is_sacremoses_available", + "is_safetensors_available", + "is_scipy_available", + "is_sentencepiece_available", + "is_sklearn_available", + "is_speech_available", + "is_tensorflow_text_available", + "is_tf_available", + "is_timm_available", + "is_tokenizers_available", + "is_torch_available", + "is_torch_hpu_available", + "is_torch_mlu_available", + "is_torch_musa_available", + "is_torch_neuroncore_available", + "is_torch_npu_available", + "is_torchvision_available", + "is_torch_xla_available", + "is_torch_xpu_available", + "is_vision_available", + "logging", + ], + "utils.quantization_config": [ + "AqlmConfig", + "AutoRoundConfig", + "AwqConfig", + "BitNetConfig", + "BitsAndBytesConfig", + "CompressedTensorsConfig", + "EetqConfig", + "FbgemmFp8Config", + "FineGrainedFP8Config", + "GPTQConfig", + "HiggsConfig", + "HqqConfig", + "QuantoConfig", + "QuarkConfig", + "SpQRConfig", + "TorchAoConfig", + "VptqConfig", + ], +} + +# tokenizers-backed objects +try: + if not is_tokenizers_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from .utils import dummy_tokenizers_objects + + _import_structure["utils.dummy_tokenizers_objects"] = [ + name for name in dir(dummy_tokenizers_objects) if not name.startswith("_") + ] +else: + # Fast tokenizers structure + _import_structure["tokenization_utils_fast"] = ["PreTrainedTokenizerFast"] + + +try: + if not (is_sentencepiece_available() and is_tokenizers_available()): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from .utils import dummy_sentencepiece_and_tokenizers_objects + + _import_structure["utils.dummy_sentencepiece_and_tokenizers_objects"] = [ + name for name in dir(dummy_sentencepiece_and_tokenizers_objects) if not name.startswith("_") + ] +else: + _import_structure["convert_slow_tokenizer"] = [ + "SLOW_TO_FAST_CONVERTERS", + "convert_slow_tokenizer", + ] + +# Vision-specific objects +try: + if not is_vision_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from .utils import dummy_vision_objects + + _import_structure["utils.dummy_vision_objects"] = [ + name for name in dir(dummy_vision_objects) if not name.startswith("_") + ] +else: + _import_structure["image_processing_base"] = ["ImageProcessingMixin"] + _import_structure["image_processing_utils"] = ["BaseImageProcessor"] + _import_structure["image_utils"] = ["ImageFeatureExtractionMixin"] + +try: + if not is_torchvision_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from .utils import dummy_torchvision_objects + + _import_structure["utils.dummy_torchvision_objects"] = [ + name for name in dir(dummy_torchvision_objects) if not name.startswith("_") + ] +else: + _import_structure["image_processing_utils_fast"] = ["BaseImageProcessorFast"] + +# PyTorch-backed objects +try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from .utils import dummy_pt_objects + + _import_structure["utils.dummy_pt_objects"] = [name for name in dir(dummy_pt_objects) if not name.startswith("_")] +else: + _import_structure["model_debugging_utils"] = [ + "model_addition_debugger_context", + ] + _import_structure["activations"] = [] + _import_structure["cache_utils"] = [ + "Cache", + "CacheConfig", + "DynamicCache", + "EncoderDecoderCache", + "HQQQuantizedCache", + "HybridCache", + "MambaCache", + "OffloadedCache", + "OffloadedStaticCache", + "QuantizedCache", + "QuantizedCacheConfig", + "QuantoQuantizedCache", + "SinkCache", + "SlidingWindowCache", + "StaticCache", + ] + _import_structure["data.datasets"] = [ + "GlueDataset", + "GlueDataTrainingArguments", + "LineByLineTextDataset", + "LineByLineWithRefDataset", + "LineByLineWithSOPTextDataset", + "SquadDataset", + "SquadDataTrainingArguments", + "TextDataset", + "TextDatasetForNextSentencePrediction", + ] + _import_structure["generation"].extend( + [ + "AlternatingCodebooksLogitsProcessor", + "BayesianDetectorConfig", + "BayesianDetectorModel", + "BeamScorer", + "BeamSearchScorer", + "ClassifierFreeGuidanceLogitsProcessor", + "ConstrainedBeamSearchScorer", + "Constraint", + "ConstraintListState", + "DisjunctiveConstraint", + "EncoderNoRepeatNGramLogitsProcessor", + "EncoderRepetitionPenaltyLogitsProcessor", + "EosTokenCriteria", + "EpsilonLogitsWarper", + "EtaLogitsWarper", + "ExponentialDecayLengthPenalty", + "ForcedBOSTokenLogitsProcessor", + "ForcedEOSTokenLogitsProcessor", + "GenerationMixin", + "HammingDiversityLogitsProcessor", + "InfNanRemoveLogitsProcessor", + "LogitNormalization", + "LogitsProcessor", + "LogitsProcessorList", + "MaxLengthCriteria", + "MaxTimeCriteria", + "MinLengthLogitsProcessor", + "MinNewTokensLengthLogitsProcessor", + "MinPLogitsWarper", + "NoBadWordsLogitsProcessor", + "NoRepeatNGramLogitsProcessor", + "PhrasalConstraint", + "PrefixConstrainedLogitsProcessor", + "RepetitionPenaltyLogitsProcessor", + "SequenceBiasLogitsProcessor", + "StoppingCriteria", + "StoppingCriteriaList", + "StopStringCriteria", + "SuppressTokensAtBeginLogitsProcessor", + "SuppressTokensLogitsProcessor", + "SynthIDTextWatermarkDetector", + "SynthIDTextWatermarkingConfig", + "SynthIDTextWatermarkLogitsProcessor", + "TemperatureLogitsWarper", + "TopKLogitsWarper", + "TopPLogitsWarper", + "TypicalLogitsWarper", + "UnbatchedClassifierFreeGuidanceLogitsProcessor", + "WatermarkDetector", + "WatermarkLogitsProcessor", + "WhisperTimeStampLogitsProcessor", + ] + ) + + # PyTorch domain libraries integration + _import_structure["integrations.executorch"] = [ + "TorchExportableModuleWithStaticCache", + "convert_and_export_with_cache", + ] + + _import_structure["modeling_flash_attention_utils"] = [] + _import_structure["modeling_layers"] = ["GradientCheckpointingLayer"] + _import_structure["modeling_outputs"] = [] + _import_structure["modeling_rope_utils"] = ["ROPE_INIT_FUNCTIONS", "dynamic_rope_update"] + _import_structure["modeling_utils"] = ["PreTrainedModel", "AttentionInterface"] + _import_structure["optimization"] = [ + "Adafactor", + "get_constant_schedule", + "get_constant_schedule_with_warmup", + "get_cosine_schedule_with_warmup", + "get_cosine_with_hard_restarts_schedule_with_warmup", + "get_inverse_sqrt_schedule", + "get_linear_schedule_with_warmup", + "get_polynomial_decay_schedule_with_warmup", + "get_scheduler", + "get_wsd_schedule", + ] + _import_structure["pytorch_utils"] = [ + "Conv1D", + "apply_chunking_to_forward", + "prune_layer", + ] + _import_structure["sagemaker"] = [] + _import_structure["time_series_utils"] = [] + _import_structure["trainer"] = ["Trainer"] + _import_structure["trainer_pt_utils"] = ["torch_distributed_zero_first"] + _import_structure["trainer_seq2seq"] = ["Seq2SeqTrainer"] + +# TensorFlow-backed objects +try: + if not is_tf_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from .utils import dummy_tf_objects + + _import_structure["utils.dummy_tf_objects"] = [name for name in dir(dummy_tf_objects) if not name.startswith("_")] +else: + _import_structure["activations_tf"] = [] + _import_structure["generation"].extend( + [ + "TFForcedBOSTokenLogitsProcessor", + "TFForcedEOSTokenLogitsProcessor", + "TFForceTokensLogitsProcessor", + "TFGenerationMixin", + "TFLogitsProcessor", + "TFLogitsProcessorList", + "TFLogitsWarper", + "TFMinLengthLogitsProcessor", + "TFNoBadWordsLogitsProcessor", + "TFNoRepeatNGramLogitsProcessor", + "TFRepetitionPenaltyLogitsProcessor", + "TFSuppressTokensAtBeginLogitsProcessor", + "TFSuppressTokensLogitsProcessor", + "TFTemperatureLogitsWarper", + "TFTopKLogitsWarper", + "TFTopPLogitsWarper", + ] + ) + _import_structure["keras_callbacks"] = ["KerasMetricCallback", "PushToHubCallback"] + _import_structure["modeling_tf_outputs"] = [] + _import_structure["modeling_tf_utils"] = [ + "TFPreTrainedModel", + "TFSequenceSummary", + "TFSharedEmbeddings", + "shape_list", + ] + _import_structure["optimization_tf"] = [ + "AdamWeightDecay", + "GradientAccumulator", + "WarmUp", + "create_optimizer", + ] + _import_structure["tf_utils"] = [] + + +# FLAX-backed objects +try: + if not is_flax_available(): + raise OptionalDependencyNotAvailable() +except OptionalDependencyNotAvailable: + from .utils import dummy_flax_objects + + _import_structure["utils.dummy_flax_objects"] = [ + name for name in dir(dummy_flax_objects) if not name.startswith("_") + ] +else: + _import_structure["generation"].extend( + [ + "FlaxForcedBOSTokenLogitsProcessor", + "FlaxForcedEOSTokenLogitsProcessor", + "FlaxForceTokensLogitsProcessor", + "FlaxGenerationMixin", + "FlaxLogitsProcessor", + "FlaxLogitsProcessorList", + "FlaxLogitsWarper", + "FlaxMinLengthLogitsProcessor", + "FlaxTemperatureLogitsWarper", + "FlaxSuppressTokensAtBeginLogitsProcessor", + "FlaxSuppressTokensLogitsProcessor", + "FlaxTopKLogitsWarper", + "FlaxTopPLogitsWarper", + "FlaxWhisperTimeStampLogitsProcessor", + ] + ) + _import_structure["modeling_flax_outputs"] = [] + _import_structure["modeling_flax_utils"] = ["FlaxPreTrainedModel"] + +# Direct imports for type-checking +if TYPE_CHECKING: + # All modeling imports + from .configuration_utils import PretrainedConfig + + # Data + from .data import ( + DataProcessor, + InputExample, + InputFeatures, + SingleSentenceClassificationProcessor, + SquadExample, + SquadFeatures, + SquadV1Processor, + SquadV2Processor, + glue_compute_metrics, + glue_convert_examples_to_features, + glue_output_modes, + glue_processors, + glue_tasks_num_labels, + squad_convert_examples_to_features, + xnli_compute_metrics, + xnli_output_modes, + xnli_processors, + xnli_tasks_num_labels, + ) + from .data.data_collator import ( + DataCollator, + DataCollatorForLanguageModeling, + DataCollatorForMultipleChoice, + DataCollatorForPermutationLanguageModeling, + DataCollatorForSeq2Seq, + DataCollatorForSOP, + DataCollatorForTokenClassification, + DataCollatorForWholeWordMask, + DataCollatorWithFlattening, + DataCollatorWithPadding, + DefaultDataCollator, + default_data_collator, + ) + from .feature_extraction_sequence_utils import SequenceFeatureExtractor + + # Feature Extractor + from .feature_extraction_utils import BatchFeature, FeatureExtractionMixin + + # Generation + from .generation import ( + AsyncTextIteratorStreamer, + CompileConfig, + GenerationConfig, + TextIteratorStreamer, + TextStreamer, + WatermarkingConfig, + ) + from .hf_argparser import HfArgumentParser + + # Integrations + from .integrations import ( + is_clearml_available, + is_comet_available, + is_dvclive_available, + is_neptune_available, + is_optuna_available, + is_ray_available, + is_ray_tune_available, + is_sigopt_available, + is_swanlab_available, + is_tensorboard_available, + is_wandb_available, + ) + + # Model Cards + from .modelcard import ModelCard + + # TF 2.0 <=> PyTorch conversion utilities + from .modeling_tf_pytorch_utils import ( + convert_tf_weight_name_to_pt_weight_name, + load_pytorch_checkpoint_in_tf2_model, + load_pytorch_model_in_tf2_model, + load_pytorch_weights_in_tf2_model, + load_tf2_checkpoint_in_pytorch_model, + load_tf2_model_in_pytorch_model, + load_tf2_weights_in_pytorch_model, + ) + from .models import * + + # Pipelines + from .pipelines import ( + AudioClassificationPipeline, + AutomaticSpeechRecognitionPipeline, + CsvPipelineDataFormat, + DepthEstimationPipeline, + DocumentQuestionAnsweringPipeline, + FeatureExtractionPipeline, + FillMaskPipeline, + ImageClassificationPipeline, + ImageFeatureExtractionPipeline, + ImageSegmentationPipeline, + ImageTextToTextPipeline, + ImageToImagePipeline, + ImageToTextPipeline, + JsonPipelineDataFormat, + MaskGenerationPipeline, + NerPipeline, + ObjectDetectionPipeline, + PipedPipelineDataFormat, + Pipeline, + PipelineDataFormat, + QuestionAnsweringPipeline, + SummarizationPipeline, + TableQuestionAnsweringPipeline, + Text2TextGenerationPipeline, + TextClassificationPipeline, + TextGenerationPipeline, + TextToAudioPipeline, + TokenClassificationPipeline, + TranslationPipeline, + VideoClassificationPipeline, + VisualQuestionAnsweringPipeline, + ZeroShotAudioClassificationPipeline, + ZeroShotClassificationPipeline, + ZeroShotImageClassificationPipeline, + ZeroShotObjectDetectionPipeline, + pipeline, + ) + from .processing_utils import ProcessorMixin + + # Tokenization + from .tokenization_utils import PreTrainedTokenizer + from .tokenization_utils_base import ( + AddedToken, + BatchEncoding, + CharSpan, + PreTrainedTokenizerBase, + SpecialTokensMixin, + TokenSpan, + ) + + # Trainer + from .trainer_callback import ( + DefaultFlowCallback, + EarlyStoppingCallback, + PrinterCallback, + ProgressCallback, + TrainerCallback, + TrainerControl, + TrainerState, + ) + from .trainer_utils import ( + EvalPrediction, + IntervalStrategy, + SchedulerType, + enable_full_determinism, + set_seed, + ) + from .training_args import TrainingArguments + from .training_args_seq2seq import Seq2SeqTrainingArguments + from .training_args_tf import TFTrainingArguments + + # Files and general utilities + from .utils import ( + CONFIG_NAME, + MODEL_CARD_NAME, + PYTORCH_PRETRAINED_BERT_CACHE, + PYTORCH_TRANSFORMERS_CACHE, + SPIECE_UNDERLINE, + TF2_WEIGHTS_NAME, + TF_WEIGHTS_NAME, + TRANSFORMERS_CACHE, + WEIGHTS_NAME, + TensorType, + add_end_docstrings, + add_start_docstrings, + is_apex_available, + is_av_available, + is_bitsandbytes_available, + is_datasets_available, + is_faiss_available, + is_flax_available, + is_keras_nlp_available, + is_phonemizer_available, + is_psutil_available, + is_py3nvml_available, + is_pyctcdecode_available, + is_sacremoses_available, + is_safetensors_available, + is_scipy_available, + is_sentencepiece_available, + is_sklearn_available, + is_speech_available, + is_tensorflow_text_available, + is_tf_available, + is_timm_available, + is_tokenizers_available, + is_torch_available, + is_torch_hpu_available, + is_torch_mlu_available, + is_torch_musa_available, + is_torch_neuroncore_available, + is_torch_npu_available, + is_torch_xla_available, + is_torch_xpu_available, + is_torchvision_available, + is_vision_available, + logging, + ) + + # bitsandbytes config + from .utils.quantization_config import ( + AqlmConfig, + AutoRoundConfig, + AwqConfig, + BitNetConfig, + BitsAndBytesConfig, + CompressedTensorsConfig, + EetqConfig, + FbgemmFp8Config, + FineGrainedFP8Config, + GPTQConfig, + HiggsConfig, + HqqConfig, + QuantoConfig, + QuarkConfig, + SpQRConfig, + TorchAoConfig, + VptqConfig, + ) + + try: + if not is_tokenizers_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from .utils.dummy_tokenizers_objects import * + else: + from .tokenization_utils_fast import PreTrainedTokenizerFast + + try: + if not (is_sentencepiece_available() and is_tokenizers_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from .utils.dummies_sentencepiece_and_tokenizers_objects import * + else: + from .convert_slow_tokenizer import ( + SLOW_TO_FAST_CONVERTERS, + convert_slow_tokenizer, + ) + + try: + if not is_vision_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from .utils.dummy_vision_objects import * + else: + from .image_processing_base import ImageProcessingMixin + from .image_processing_utils import BaseImageProcessor + from .image_utils import ImageFeatureExtractionMixin + + try: + if not is_torchvision_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from .utils.dummy_torchvision_objects import * + else: + from .image_processing_utils_fast import BaseImageProcessorFast + + try: + if not (is_torchvision_available() and is_timm_available()): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from .utils.dummy_timm_and_torchvision_objects import * + else: + from .models.timm_wrapper import TimmWrapperImageProcessor + + # Modeling + try: + if not is_torch_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + from .utils.dummy_pt_objects import * + else: + # Debugging + from .cache_utils import ( + Cache, + CacheConfig, + DynamicCache, + EncoderDecoderCache, + HQQQuantizedCache, + HybridCache, + MambaCache, + OffloadedCache, + OffloadedStaticCache, + QuantizedCache, + QuantizedCacheConfig, + QuantoQuantizedCache, + SinkCache, + SlidingWindowCache, + StaticCache, + ) + from .data.datasets import ( + GlueDataset, + GlueDataTrainingArguments, + LineByLineTextDataset, + LineByLineWithRefDataset, + LineByLineWithSOPTextDataset, + SquadDataset, + SquadDataTrainingArguments, + TextDataset, + TextDatasetForNextSentencePrediction, + ) + from .generation import ( + AlternatingCodebooksLogitsProcessor, + BayesianDetectorConfig, + BayesianDetectorModel, + BeamScorer, + BeamSearchScorer, + ClassifierFreeGuidanceLogitsProcessor, + ConstrainedBeamSearchScorer, + Constraint, + ConstraintListState, + DisjunctiveConstraint, + EncoderNoRepeatNGramLogitsProcessor, + EncoderRepetitionPenaltyLogitsProcessor, + EosTokenCriteria, + EpsilonLogitsWarper, + EtaLogitsWarper, + ExponentialDecayLengthPenalty, + ForcedBOSTokenLogitsProcessor, + ForcedEOSTokenLogitsProcessor, + GenerationMixin, + HammingDiversityLogitsProcessor, + InfNanRemoveLogitsProcessor, + LogitNormalization, + LogitsProcessor, + LogitsProcessorList, + MaxLengthCriteria, + MaxTimeCriteria, + MinLengthLogitsProcessor, + MinNewTokensLengthLogitsProcessor, + MinPLogitsWarper, + NoBadWordsLogitsProcessor, + NoRepeatNGramLogitsProcessor, + PhrasalConstraint, + PrefixConstrainedLogitsProcessor, + RepetitionPenaltyLogitsProcessor, + SequenceBiasLogitsProcessor, + StoppingCriteria, + StoppingCriteriaList, + StopStringCriteria, + SuppressTokensAtBeginLogitsProcessor, + SuppressTokensLogitsProcessor, + SynthIDTextWatermarkDetector, + SynthIDTextWatermarkingConfig, + SynthIDTextWatermarkLogitsProcessor, + TemperatureLogitsWarper, + TopKLogitsWarper, + TopPLogitsWarper, + TypicalLogitsWarper, + UnbatchedClassifierFreeGuidanceLogitsProcessor, + WatermarkDetector, + WatermarkLogitsProcessor, + WhisperTimeStampLogitsProcessor, + ) + from .integrations.executorch import ( + TorchExportableModuleWithStaticCache, + convert_and_export_with_cache, + ) + from .model_debugging_utils import ( + model_addition_debugger_context, + ) + from .modeling_layers import GradientCheckpointingLayer + from .modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update + from .modeling_utils import AttentionInterface, PreTrainedModel + + # Optimization + from .optimization import ( + Adafactor, + get_constant_schedule, + get_constant_schedule_with_warmup, + get_cosine_schedule_with_warmup, + get_cosine_with_hard_restarts_schedule_with_warmup, + get_inverse_sqrt_schedule, + get_linear_schedule_with_warmup, + get_polynomial_decay_schedule_with_warmup, + get_scheduler, + get_wsd_schedule, + ) + from .pytorch_utils import Conv1D, apply_chunking_to_forward, prune_layer + + # Trainer + from .trainer import Trainer + from .trainer_pt_utils import torch_distributed_zero_first + from .trainer_seq2seq import Seq2SeqTrainer + + # TensorFlow + try: + if not is_tf_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + # Import the same objects as dummies to get them in the namespace. + # They will raise an import error if the user tries to instantiate / use them. + from .utils.dummy_tf_objects import * + else: + from .generation import ( + TFForcedBOSTokenLogitsProcessor, + TFForcedEOSTokenLogitsProcessor, + TFForceTokensLogitsProcessor, + TFGenerationMixin, + TFLogitsProcessor, + TFLogitsProcessorList, + TFLogitsWarper, + TFMinLengthLogitsProcessor, + TFNoBadWordsLogitsProcessor, + TFNoRepeatNGramLogitsProcessor, + TFRepetitionPenaltyLogitsProcessor, + TFSuppressTokensAtBeginLogitsProcessor, + TFSuppressTokensLogitsProcessor, + TFTemperatureLogitsWarper, + TFTopKLogitsWarper, + TFTopPLogitsWarper, + ) + from .keras_callbacks import KerasMetricCallback, PushToHubCallback + from .modeling_tf_utils import ( + TFPreTrainedModel, + TFSequenceSummary, + TFSharedEmbeddings, + shape_list, + ) + + # Optimization + from .optimization_tf import ( + AdamWeightDecay, + GradientAccumulator, + WarmUp, + create_optimizer, + ) + + try: + if not is_flax_available(): + raise OptionalDependencyNotAvailable() + except OptionalDependencyNotAvailable: + # Import the same objects as dummies to get them in the namespace. + # They will raise an import error if the user tries to instantiate / use them. + from .utils.dummy_flax_objects import * + else: + from .generation import ( + FlaxForcedBOSTokenLogitsProcessor, + FlaxForcedEOSTokenLogitsProcessor, + FlaxForceTokensLogitsProcessor, + FlaxGenerationMixin, + FlaxLogitsProcessor, + FlaxLogitsProcessorList, + FlaxLogitsWarper, + FlaxMinLengthLogitsProcessor, + FlaxSuppressTokensAtBeginLogitsProcessor, + FlaxSuppressTokensLogitsProcessor, + FlaxTemperatureLogitsWarper, + FlaxTopKLogitsWarper, + FlaxTopPLogitsWarper, + FlaxWhisperTimeStampLogitsProcessor, + ) + from .modeling_flax_utils import FlaxPreTrainedModel + +else: + import sys + + _import_structure = {k: set(v) for k, v in _import_structure.items()} + + import_structure = define_import_structure(Path(__file__).parent / "models", prefix="models") + import_structure[frozenset({})].update(_import_structure) + + sys.modules[__name__] = _LazyModule( + __name__, + globals()["__file__"], + import_structure, + module_spec=__spec__, + extra_objects={"__version__": __version__}, + ) + + +if not is_tf_available() and not is_torch_available() and not is_flax_available(): + logger.warning_advice( + "None of PyTorch, TensorFlow >= 2.0, or Flax have been found. " + "Models won't be available and only tokenizers, configuration " + "and file/data utilities can be used." + ) diff --git a/docs/transformers/build/lib/transformers/activations.py b/docs/transformers/build/lib/transformers/activations.py new file mode 100644 index 0000000000000000000000000000000000000000..2dab2fb32cd09839ea725cadd9bde1b3a48c8521 --- /dev/null +++ b/docs/transformers/build/lib/transformers/activations.py @@ -0,0 +1,228 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from collections import OrderedDict + +import torch +from torch import Tensor, nn + +from .utils import logging + + +logger = logging.get_logger(__name__) + + +class PytorchGELUTanh(nn.Module): + """ + A fast C implementation of the tanh approximation of the GeLU activation function. See + https://arxiv.org/abs/1606.08415. + + This implementation is equivalent to NewGELU and FastGELU but much faster. However, it is not an exact numerical + match due to rounding errors. + """ + + def forward(self, input: Tensor) -> Tensor: + return nn.functional.gelu(input, approximate="tanh") + + +class NewGELUActivation(nn.Module): + """ + Implementation of the GELU activation function currently in Google BERT repo (identical to OpenAI GPT). Also see + the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415 + """ + + def forward(self, input: Tensor) -> Tensor: + return 0.5 * input * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * torch.pow(input, 3.0)))) + + +class GELUActivation(nn.Module): + """ + Original Implementation of the GELU activation function in Google BERT repo when initially created. For + information: OpenAI GPT's GELU is slightly different (and gives slightly different results): 0.5 * x * (1 + + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) This is now written in C in nn.functional + Also see the Gaussian Error Linear Units paper: https://arxiv.org/abs/1606.08415 + """ + + def __init__(self, use_gelu_python: bool = False): + super().__init__() + if use_gelu_python: + self.act = self._gelu_python + else: + self.act = nn.functional.gelu + + def _gelu_python(self, input: Tensor) -> Tensor: + return input * 0.5 * (1.0 + torch.erf(input / math.sqrt(2.0))) + + def forward(self, input: Tensor) -> Tensor: + return self.act(input) + + +class FastGELUActivation(nn.Module): + """ + Applies GELU approximation that is slower than QuickGELU but more accurate. See: https://github.com/hendrycks/GELUs + """ + + def forward(self, input: Tensor) -> Tensor: + return 0.5 * input * (1.0 + torch.tanh(input * 0.7978845608 * (1.0 + 0.044715 * input * input))) + + +class QuickGELUActivation(nn.Module): + """ + Applies GELU approximation that is fast but somewhat inaccurate. See: https://github.com/hendrycks/GELUs + """ + + def forward(self, input: Tensor) -> Tensor: + return input * torch.sigmoid(1.702 * input) + + +class ClippedGELUActivation(nn.Module): + """ + Clip the range of possible GeLU outputs between [min, max]. This is especially useful for quantization purpose, as + it allows mapping negatives values in the GeLU spectrum. For more information on this trick, please refer to + https://arxiv.org/abs/2004.09602. + + Gaussian Error Linear Unit. Original Implementation of the gelu activation function in Google Bert repo when + initially created. + + For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): 0.5 * x * (1 + + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))). See https://arxiv.org/abs/1606.08415 + """ + + def __init__(self, min: float, max: float): + if min > max: + raise ValueError(f"min should be < max (got min: {min}, max: {max})") + + super().__init__() + self.min = min + self.max = max + + def forward(self, x: Tensor) -> Tensor: + return torch.clip(gelu(x), self.min, self.max) + + +class AccurateGELUActivation(nn.Module): + """ + Applies GELU approximation that is faster than default and more accurate than QuickGELU. See: + https://github.com/hendrycks/GELUs + + Implemented along with MEGA (Moving Average Equipped Gated Attention) + """ + + def __init__(self): + super().__init__() + self.precomputed_constant = math.sqrt(2 / math.pi) + + def forward(self, input: Tensor) -> Tensor: + return 0.5 * input * (1 + torch.tanh(self.precomputed_constant * (input + 0.044715 * torch.pow(input, 3)))) + + +class MishActivation(nn.Module): + """ + See Mish: A Self-Regularized Non-Monotonic Activation Function (Misra., https://arxiv.org/abs/1908.08681). Also + visit the official repository for the paper: https://github.com/digantamisra98/Mish + """ + + def __init__(self): + super().__init__() + self.act = nn.functional.mish + + def _mish_python(self, input: Tensor) -> Tensor: + return input * torch.tanh(nn.functional.softplus(input)) + + def forward(self, input: Tensor) -> Tensor: + return self.act(input) + + +class LinearActivation(nn.Module): + """ + Applies the linear activation function, i.e. forwarding input directly to output. + """ + + def forward(self, input: Tensor) -> Tensor: + return input + + +class LaplaceActivation(nn.Module): + """ + Applies elementwise activation based on Laplace function, introduced in MEGA as an attention activation. See + https://arxiv.org/abs/2209.10655 + + Inspired by squared relu, but with bounded range and gradient for better stability + """ + + def forward(self, input, mu=0.707107, sigma=0.282095): + input = (input - mu).div(sigma * math.sqrt(2.0)) + return 0.5 * (1.0 + torch.erf(input)) + + +class ReLUSquaredActivation(nn.Module): + """ + Applies the relu^2 activation introduced in https://arxiv.org/abs/2109.08668v2 + """ + + def forward(self, input): + relu_applied = nn.functional.relu(input) + squared = torch.square(relu_applied) + return squared + + +class ClassInstantier(OrderedDict): + def __getitem__(self, key): + content = super().__getitem__(key) + cls, kwargs = content if isinstance(content, tuple) else (content, {}) + return cls(**kwargs) + + +ACT2CLS = { + "gelu": GELUActivation, + "gelu_10": (ClippedGELUActivation, {"min": -10, "max": 10}), + "gelu_fast": FastGELUActivation, + "gelu_new": NewGELUActivation, + "gelu_python": (GELUActivation, {"use_gelu_python": True}), + "gelu_pytorch_tanh": PytorchGELUTanh, + "gelu_accurate": AccurateGELUActivation, + "laplace": LaplaceActivation, + "leaky_relu": nn.LeakyReLU, + "linear": LinearActivation, + "mish": MishActivation, + "quick_gelu": QuickGELUActivation, + "relu": nn.ReLU, + "relu2": ReLUSquaredActivation, + "relu6": nn.ReLU6, + "sigmoid": nn.Sigmoid, + "silu": nn.SiLU, + "swish": nn.SiLU, + "tanh": nn.Tanh, + "prelu": nn.PReLU, +} +ACT2FN = ClassInstantier(ACT2CLS) + + +def get_activation(activation_string): + if activation_string in ACT2FN: + return ACT2FN[activation_string] + else: + raise KeyError(f"function {activation_string} not found in ACT2FN mapping {list(ACT2FN.keys())}") + + +# For backwards compatibility with: from activations import gelu_python +gelu_python = get_activation("gelu_python") +gelu_new = get_activation("gelu_new") +gelu = get_activation("gelu") +gelu_fast = get_activation("gelu_fast") +quick_gelu = get_activation("quick_gelu") +silu = get_activation("silu") +mish = get_activation("mish") +linear_act = get_activation("linear") diff --git a/docs/transformers/build/lib/transformers/activations_tf.py b/docs/transformers/build/lib/transformers/activations_tf.py new file mode 100644 index 0000000000000000000000000000000000000000..d12b73ea45176f3a4bc42cdabe8b73078a3b90f2 --- /dev/null +++ b/docs/transformers/build/lib/transformers/activations_tf.py @@ -0,0 +1,147 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math + +import tensorflow as tf +from packaging.version import parse + + +try: + import tf_keras as keras +except (ModuleNotFoundError, ImportError): + import keras + + if parse(keras.__version__).major > 2: + raise ValueError( + "Your currently installed version of Keras is Keras 3, but this is not yet supported in " + "Transformers. Please install the backwards-compatible tf-keras package with " + "`pip install tf-keras`." + ) + + +def _gelu(x): + """ + Gaussian Error Linear Unit. Original Implementation of the gelu activation function in Google Bert repo when + initially created. For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): + 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) Also see + https://arxiv.org/abs/1606.08415 + """ + x = tf.convert_to_tensor(x) + cdf = 0.5 * (1.0 + tf.math.erf(x / tf.cast(tf.sqrt(2.0), x.dtype))) + + return x * cdf + + +def _gelu_new(x): + """ + Gaussian Error Linear Unit. This is a smoother version of the GELU. Original paper: https://arxiv.org/abs/1606.0841 + + Args: + x: float Tensor to perform activation + + Returns: + `x` with the GELU activation applied. + """ + x = tf.convert_to_tensor(x) + pi = tf.cast(math.pi, x.dtype) + coeff = tf.cast(0.044715, x.dtype) + cdf = 0.5 * (1.0 + tf.tanh(tf.sqrt(2.0 / pi) * (x + coeff * tf.pow(x, 3)))) + + return x * cdf + + +def mish(x): + x = tf.convert_to_tensor(x) + + return x * tf.tanh(tf.math.softplus(x)) + + +def gelu_fast(x): + x = tf.convert_to_tensor(x) + coeff1 = tf.cast(0.044715, x.dtype) + coeff2 = tf.cast(0.7978845608, x.dtype) + + return 0.5 * x * (1.0 + tf.tanh(x * coeff2 * (1.0 + coeff1 * x * x))) + + +def quick_gelu(x): + x = tf.convert_to_tensor(x) + coeff = tf.cast(1.702, x.dtype) + return x * tf.math.sigmoid(coeff * x) + + +def gelu_10(x): + """ + Clip the range of possible GeLU outputs between [-10, 10]. This is especially useful for quantization purpose, as + it allows mapping 2 negatives values in the GeLU spectrum. For more information on this trick, please refer to + https://arxiv.org/abs/2004.09602 + + Gaussian Error Linear Unit. Original Implementation of the gelu activation function in Google Bert repo when + initially created. For information: OpenAI GPT's gelu is slightly different (and gives slightly different results): + 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) Also see + https://arxiv.org/abs/1606.08415 :param x: :return: + """ + return tf.clip_by_value(_gelu(x), -10, 10) + + +def glu(x, axis=-1): + """ + Gated Linear Unit. Implementation as defined in the original paper (see https://arxiv.org/abs/1612.08083), where + the input `x` is split in two halves across a dimension (`axis`), A and B, returning A * sigmoid(B). + + Args: + `x`: float Tensor to perform activation + `axis`: dimension across which `x` be split in half + + Returns: + `x` with the GLU activation applied (with its size halved across the dimension `axis`). + """ + a, b = tf.split(x, 2, axis=axis) + return a * tf.math.sigmoid(b) + + +if parse(tf.version.VERSION) >= parse("2.4"): + + def approximate_gelu_wrap(x): + return keras.activations.gelu(x, approximate=True) + + gelu = keras.activations.gelu + gelu_new = approximate_gelu_wrap +else: + gelu = _gelu + gelu_new = _gelu_new + + +ACT2FN = { + "gelu": gelu, + "gelu_10": gelu_10, + "gelu_fast": gelu_fast, + "gelu_new": gelu_new, + "glu": glu, + "mish": mish, + "quick_gelu": quick_gelu, + "relu": keras.activations.relu, + "sigmoid": keras.activations.sigmoid, + "silu": keras.activations.swish, + "swish": keras.activations.swish, + "tanh": keras.activations.tanh, +} + + +def get_tf_activation(activation_string): + if activation_string in ACT2FN: + return ACT2FN[activation_string] + else: + raise KeyError(f"function {activation_string} not found in ACT2FN mapping {list(ACT2FN.keys())}") diff --git a/docs/transformers/build/lib/transformers/audio_utils.py b/docs/transformers/build/lib/transformers/audio_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..8420a84e089e03be9a80fb63c237e34203ea28a0 --- /dev/null +++ b/docs/transformers/build/lib/transformers/audio_utils.py @@ -0,0 +1,1193 @@ +# Copyright 2023 The HuggingFace Inc. team and the librosa & torchaudio authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Audio processing functions to extract features from audio waveforms. This code is pure numpy to support all frameworks +and remove unnecessary dependencies. +""" + +import os +import warnings +from io import BytesIO +from typing import List, Optional, Tuple, Union + +import numpy as np +import requests + +from .utils import is_librosa_available, requires_backends + + +if is_librosa_available(): + import librosa + + +def load_audio(audio: Union[str, np.ndarray], sampling_rate=16000, timeout=None) -> np.ndarray: + """ + Loads `audio` to an np.ndarray object. + + Args: + audio (`str` or `np.ndarray`): + The audio to be laoded to the numpy array format. + sampling_rate (`int`, *optional*, defaults to 16000): + The samlping rate to be used when loading the audio. It should be same as the + sampling rate the model you will be using further was trained with. + timeout (`float`, *optional*): + The timeout value in seconds for the URL request. + + Returns: + `np.ndarray`: A numpy artay representing the audio. + """ + requires_backends(load_audio, ["librosa"]) + + if isinstance(audio, str): + # Load audio from URL (e.g https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/translate_to_chinese.wav) + if audio.startswith("http://") or audio.startswith("https://"): + audio = librosa.load(BytesIO(requests.get(audio, timeout=timeout).content), sr=sampling_rate)[0] + elif os.path.isfile(audio): + audio = librosa.load(audio, sr=sampling_rate)[0] + elif isinstance(audio, np.ndarray): + audio = audio + else: + raise TypeError( + "Incorrect format used for `audio`. Should be an url linking to an audio, a local path, or numpy array." + ) + return audio + + +AudioInput = Union[ + np.ndarray, "torch.Tensor", List[np.ndarray], Tuple[np.ndarray], List["torch.Tensor"], Tuple["torch.Tensor"] # noqa: F821 +] + + +def hertz_to_mel(freq: Union[float, np.ndarray], mel_scale: str = "htk") -> Union[float, np.ndarray]: + """ + Convert frequency from hertz to mels. + + Args: + freq (`float` or `np.ndarray`): + The frequency, or multiple frequencies, in hertz (Hz). + mel_scale (`str`, *optional*, defaults to `"htk"`): + The mel frequency scale to use, `"htk"`, `"kaldi"` or `"slaney"`. + + Returns: + `float` or `np.ndarray`: The frequencies on the mel scale. + """ + + if mel_scale not in ["slaney", "htk", "kaldi"]: + raise ValueError('mel_scale should be one of "htk", "slaney" or "kaldi".') + + if mel_scale == "htk": + return 2595.0 * np.log10(1.0 + (freq / 700.0)) + elif mel_scale == "kaldi": + return 1127.0 * np.log(1.0 + (freq / 700.0)) + + min_log_hertz = 1000.0 + min_log_mel = 15.0 + logstep = 27.0 / np.log(6.4) + mels = 3.0 * freq / 200.0 + + if isinstance(freq, np.ndarray): + log_region = freq >= min_log_hertz + mels[log_region] = min_log_mel + np.log(freq[log_region] / min_log_hertz) * logstep + elif freq >= min_log_hertz: + mels = min_log_mel + np.log(freq / min_log_hertz) * logstep + + return mels + + +def mel_to_hertz(mels: Union[float, np.ndarray], mel_scale: str = "htk") -> Union[float, np.ndarray]: + """ + Convert frequency from mels to hertz. + + Args: + mels (`float` or `np.ndarray`): + The frequency, or multiple frequencies, in mels. + mel_scale (`str`, *optional*, `"htk"`): + The mel frequency scale to use, `"htk"`, `"kaldi"` or `"slaney"`. + + Returns: + `float` or `np.ndarray`: The frequencies in hertz. + """ + + if mel_scale not in ["slaney", "htk", "kaldi"]: + raise ValueError('mel_scale should be one of "htk", "slaney" or "kaldi".') + + if mel_scale == "htk": + return 700.0 * (np.power(10, mels / 2595.0) - 1.0) + elif mel_scale == "kaldi": + return 700.0 * (np.exp(mels / 1127.0) - 1.0) + + min_log_hertz = 1000.0 + min_log_mel = 15.0 + logstep = np.log(6.4) / 27.0 + freq = 200.0 * mels / 3.0 + + if isinstance(mels, np.ndarray): + log_region = mels >= min_log_mel + freq[log_region] = min_log_hertz * np.exp(logstep * (mels[log_region] - min_log_mel)) + elif mels >= min_log_mel: + freq = min_log_hertz * np.exp(logstep * (mels - min_log_mel)) + + return freq + + +def hertz_to_octave( + freq: Union[float, np.ndarray], tuning: Optional[float] = 0.0, bins_per_octave: Optional[int] = 12 +): + """ + Convert frequency from hertz to fractional octave numbers. + Adapted from *librosa*. + + Args: + freq (`float` or `np.ndarray`): + The frequency, or multiple frequencies, in hertz (Hz). + tuning (`float`, defaults to `0.`): + Tuning deviation from the Stuttgart pitch (A440) in (fractional) bins per octave. + bins_per_octave (`int`, defaults to `12`): + Number of bins per octave. + + Returns: + `float` or `np.ndarray`: The frequencies on the octave scale. + """ + stuttgart_pitch = 440.0 * 2.0 ** (tuning / bins_per_octave) + octave = np.log2(freq / (float(stuttgart_pitch) / 16)) + return octave + + +def _create_triangular_filter_bank(fft_freqs: np.ndarray, filter_freqs: np.ndarray) -> np.ndarray: + """ + Creates a triangular filter bank. + + Adapted from *torchaudio* and *librosa*. + + Args: + fft_freqs (`np.ndarray` of shape `(num_frequency_bins,)`): + Discrete frequencies of the FFT bins in Hz. + filter_freqs (`np.ndarray` of shape `(num_mel_filters,)`): + Center frequencies of the triangular filters to create, in Hz. + + Returns: + `np.ndarray` of shape `(num_frequency_bins, num_mel_filters)` + """ + filter_diff = np.diff(filter_freqs) + slopes = np.expand_dims(filter_freqs, 0) - np.expand_dims(fft_freqs, 1) + down_slopes = -slopes[:, :-2] / filter_diff[:-1] + up_slopes = slopes[:, 2:] / filter_diff[1:] + return np.maximum(np.zeros(1), np.minimum(down_slopes, up_slopes)) + + +def chroma_filter_bank( + num_frequency_bins: int, + num_chroma: int, + sampling_rate: int, + tuning: float = 0.0, + power: Optional[float] = 2.0, + weighting_parameters: Optional[tuple[float, float]] = (5.0, 2.0), + start_at_c_chroma: Optional[bool] = True, +): + """ + Creates a chroma filter bank, i.e a linear transformation to project spectrogram bins onto chroma bins. + + Adapted from *librosa*. + + Args: + num_frequency_bins (`int`): + Number of frequencies used to compute the spectrogram (should be the same as in `stft`). + num_chroma (`int`): + Number of chroma bins (i.e pitch classes). + sampling_rate (`float`): + Sample rate of the audio waveform. + tuning (`float`): + Tuning deviation from A440 in fractions of a chroma bin. + power (`float`, *optional*, defaults to 2.0): + If 12.0, normalizes each column with their L2 norm. If 1.0, normalizes each column with their L1 norm. + weighting_parameters (`Tuple[float, float]`, *optional*, defaults to `(5., 2.)`): + If specified, apply a Gaussian weighting parameterized by the first element of the tuple being the center and + the second element being the Gaussian half-width. + start_at_c_chroma (`float`, *optional*, defaults to `True`): + If True, the filter bank will start at the 'C' pitch class. Otherwise, it will start at 'A'. + Returns: + `np.ndarray` of shape `(num_frequency_bins, num_chroma)` + """ + # Get the FFT bins, not counting the DC component + frequencies = np.linspace(0, sampling_rate, num_frequency_bins, endpoint=False)[1:] + + freq_bins = num_chroma * hertz_to_octave(frequencies, tuning=tuning, bins_per_octave=num_chroma) + + # make up a value for the 0 Hz bin = 1.5 octaves below bin 1 + # (so chroma is 50% rotated from bin 1, and bin width is broad) + freq_bins = np.concatenate(([freq_bins[0] - 1.5 * num_chroma], freq_bins)) + + bins_width = np.concatenate((np.maximum(freq_bins[1:] - freq_bins[:-1], 1.0), [1])) + + chroma_filters = np.subtract.outer(freq_bins, np.arange(0, num_chroma, dtype="d")).T + + num_chroma2 = np.round(float(num_chroma) / 2) + + # Project into range -num_chroma/2 .. num_chroma/2 + # add on fixed offset of 10*num_chroma to ensure all values passed to + # rem are positive + chroma_filters = np.remainder(chroma_filters + num_chroma2 + 10 * num_chroma, num_chroma) - num_chroma2 + + # Gaussian bumps - 2*D to make them narrower + chroma_filters = np.exp(-0.5 * (2 * chroma_filters / np.tile(bins_width, (num_chroma, 1))) ** 2) + + # normalize each column + if power is not None: + chroma_filters = chroma_filters / np.sum(chroma_filters**power, axis=0, keepdims=True) ** (1.0 / power) + + # Maybe apply scaling for fft bins + if weighting_parameters is not None: + center, half_width = weighting_parameters + chroma_filters *= np.tile( + np.exp(-0.5 * (((freq_bins / num_chroma - center) / half_width) ** 2)), + (num_chroma, 1), + ) + + if start_at_c_chroma: + chroma_filters = np.roll(chroma_filters, -3 * (num_chroma // 12), axis=0) + + # remove aliasing columns, copy to ensure row-contiguity + return np.ascontiguousarray(chroma_filters[:, : int(1 + num_frequency_bins / 2)]) + + +def mel_filter_bank( + num_frequency_bins: int, + num_mel_filters: int, + min_frequency: float, + max_frequency: float, + sampling_rate: int, + norm: Optional[str] = None, + mel_scale: str = "htk", + triangularize_in_mel_space: bool = False, +) -> np.ndarray: + """ + Creates a frequency bin conversion matrix used to obtain a mel spectrogram. This is called a *mel filter bank*, and + various implementation exist, which differ in the number of filters, the shape of the filters, the way the filters + are spaced, the bandwidth of the filters, and the manner in which the spectrum is warped. The goal of these + features is to approximate the non-linear human perception of the variation in pitch with respect to the frequency. + + Different banks of mel filters were introduced in the literature. The following variations are supported: + + - MFCC FB-20: introduced in 1980 by Davis and Mermelstein, it assumes a sampling frequency of 10 kHz and a speech + bandwidth of `[0, 4600]` Hz. + - MFCC FB-24 HTK: from the Cambridge HMM Toolkit (HTK) (1995) uses a filter bank of 24 filters for a speech + bandwidth of `[0, 8000]` Hz. This assumes sampling rate ≥ 16 kHz. + - MFCC FB-40: from the Auditory Toolbox for MATLAB written by Slaney in 1998, assumes a sampling rate of 16 kHz and + speech bandwidth of `[133, 6854]` Hz. This version also includes area normalization. + - HFCC-E FB-29 (Human Factor Cepstral Coefficients) of Skowronski and Harris (2004), assumes a sampling rate of + 12.5 kHz and speech bandwidth of `[0, 6250]` Hz. + + This code is adapted from *torchaudio* and *librosa*. Note that the default parameters of torchaudio's + `melscale_fbanks` implement the `"htk"` filters while librosa uses the `"slaney"` implementation. + + Args: + num_frequency_bins (`int`): + Number of frequency bins (should be the same as `n_fft // 2 + 1` where `n_fft` is the size of the Fourier Transform used to compute the spectrogram). + num_mel_filters (`int`): + Number of mel filters to generate. + min_frequency (`float`): + Lowest frequency of interest in Hz. + max_frequency (`float`): + Highest frequency of interest in Hz. This should not exceed `sampling_rate / 2`. + sampling_rate (`int`): + Sample rate of the audio waveform. + norm (`str`, *optional*): + If `"slaney"`, divide the triangular mel weights by the width of the mel band (area normalization). + mel_scale (`str`, *optional*, defaults to `"htk"`): + The mel frequency scale to use, `"htk"`, `"kaldi"` or `"slaney"`. + triangularize_in_mel_space (`bool`, *optional*, defaults to `False`): + If this option is enabled, the triangular filter is applied in mel space rather than frequency space. This + should be set to `true` in order to get the same results as `torchaudio` when computing mel filters. + + Returns: + `np.ndarray` of shape (`num_frequency_bins`, `num_mel_filters`): Triangular filter bank matrix. This is a + projection matrix to go from a spectrogram to a mel spectrogram. + """ + if norm is not None and norm != "slaney": + raise ValueError('norm must be one of None or "slaney"') + + if num_frequency_bins < 2: + raise ValueError(f"Require num_frequency_bins: {num_frequency_bins} >= 2") + + if min_frequency > max_frequency: + raise ValueError(f"Require min_frequency: {min_frequency} <= max_frequency: {max_frequency}") + + # center points of the triangular mel filters + mel_min = hertz_to_mel(min_frequency, mel_scale=mel_scale) + mel_max = hertz_to_mel(max_frequency, mel_scale=mel_scale) + mel_freqs = np.linspace(mel_min, mel_max, num_mel_filters + 2) + filter_freqs = mel_to_hertz(mel_freqs, mel_scale=mel_scale) + + if triangularize_in_mel_space: + # frequencies of FFT bins in Hz, but filters triangularized in mel space + fft_bin_width = sampling_rate / ((num_frequency_bins - 1) * 2) + fft_freqs = hertz_to_mel(fft_bin_width * np.arange(num_frequency_bins), mel_scale=mel_scale) + filter_freqs = mel_freqs + else: + # frequencies of FFT bins in Hz + fft_freqs = np.linspace(0, sampling_rate // 2, num_frequency_bins) + + mel_filters = _create_triangular_filter_bank(fft_freqs, filter_freqs) + + if norm is not None and norm == "slaney": + # Slaney-style mel is scaled to be approx constant energy per channel + enorm = 2.0 / (filter_freqs[2 : num_mel_filters + 2] - filter_freqs[:num_mel_filters]) + mel_filters *= np.expand_dims(enorm, 0) + + if (mel_filters.max(axis=0) == 0.0).any(): + warnings.warn( + "At least one mel filter has all zero values. " + f"The value for `num_mel_filters` ({num_mel_filters}) may be set too high. " + f"Or, the value for `num_frequency_bins` ({num_frequency_bins}) may be set too low." + ) + + return mel_filters + + +def optimal_fft_length(window_length: int) -> int: + """ + Finds the best FFT input size for a given `window_length`. This function takes a given window length and, if not + already a power of two, rounds it up to the next power or two. + + The FFT algorithm works fastest when the length of the input is a power of two, which may be larger than the size + of the window or analysis frame. For example, if the window is 400 samples, using an FFT input size of 512 samples + is more optimal than an FFT size of 400 samples. Using a larger FFT size does not affect the detected frequencies, + it simply gives a higher frequency resolution (i.e. the frequency bins are smaller). + """ + return 2 ** int(np.ceil(np.log2(window_length))) + + +def window_function( + window_length: int, + name: str = "hann", + periodic: bool = True, + frame_length: Optional[int] = None, + center: bool = True, +) -> np.ndarray: + """ + Returns an array containing the specified window. This window is intended to be used with `stft`. + + The following window types are supported: + + - `"boxcar"`: a rectangular window + - `"hamming"`: the Hamming window + - `"hann"`: the Hann window + - `"povey"`: the Povey window + + Args: + window_length (`int`): + The length of the window in samples. + name (`str`, *optional*, defaults to `"hann"`): + The name of the window function. + periodic (`bool`, *optional*, defaults to `True`): + Whether the window is periodic or symmetric. + frame_length (`int`, *optional*): + The length of the analysis frames in samples. Provide a value for `frame_length` if the window is smaller + than the frame length, so that it will be zero-padded. + center (`bool`, *optional*, defaults to `True`): + Whether to center the window inside the FFT buffer. Only used when `frame_length` is provided. + + Returns: + `np.ndarray` of shape `(window_length,)` or `(frame_length,)` containing the window. + """ + length = window_length + 1 if periodic else window_length + + if name == "boxcar": + window = np.ones(length) + elif name in ["hamming", "hamming_window"]: + window = np.hamming(length) + elif name in ["hann", "hann_window"]: + window = np.hanning(length) + elif name in ["povey"]: + window = np.power(np.hanning(length), 0.85) + else: + raise ValueError(f"Unknown window function '{name}'") + + if periodic: + window = window[:-1] + + if frame_length is None: + return window + + if window_length > frame_length: + raise ValueError( + f"Length of the window ({window_length}) may not be larger than frame_length ({frame_length})" + ) + + padded_window = np.zeros(frame_length) + offset = (frame_length - window_length) // 2 if center else 0 + padded_window[offset : offset + window_length] = window + return padded_window + + +# TODO This method does not support batching yet as we are mainly focused on inference. +def spectrogram( + waveform: np.ndarray, + window: np.ndarray, + frame_length: int, + hop_length: int, + fft_length: Optional[int] = None, + power: Optional[float] = 1.0, + center: bool = True, + pad_mode: str = "reflect", + onesided: bool = True, + dither: float = 0.0, + preemphasis: Optional[float] = None, + mel_filters: Optional[np.ndarray] = None, + mel_floor: float = 1e-10, + log_mel: Optional[str] = None, + reference: float = 1.0, + min_value: float = 1e-10, + db_range: Optional[float] = None, + remove_dc_offset: Optional[bool] = None, + dtype: np.dtype = np.float32, +) -> np.ndarray: + """ + Calculates a spectrogram over one waveform using the Short-Time Fourier Transform. + + This function can create the following kinds of spectrograms: + + - amplitude spectrogram (`power = 1.0`) + - power spectrogram (`power = 2.0`) + - complex-valued spectrogram (`power = None`) + - log spectrogram (use `log_mel` argument) + - mel spectrogram (provide `mel_filters`) + - log-mel spectrogram (provide `mel_filters` and `log_mel`) + + How this works: + + 1. The input waveform is split into frames of size `frame_length` that are partially overlapping by `frame_length + - hop_length` samples. + 2. Each frame is multiplied by the window and placed into a buffer of size `fft_length`. + 3. The DFT is taken of each windowed frame. + 4. The results are stacked into a spectrogram. + + We make a distinction between the following "blocks" of sample data, each of which may have a different lengths: + + - The analysis frame. This is the size of the time slices that the input waveform is split into. + - The window. Each analysis frame is multiplied by the window to avoid spectral leakage. + - The FFT input buffer. The length of this determines how many frequency bins are in the spectrogram. + + In this implementation, the window is assumed to be zero-padded to have the same size as the analysis frame. A + padded window can be obtained from `window_function()`. The FFT input buffer may be larger than the analysis frame, + typically the next power of two. + + Note: This function is not optimized for speed yet. It should be mostly compatible with `librosa.stft` and + `torchaudio.functional.transforms.Spectrogram`, although it is more flexible due to the different ways spectrograms + can be constructed. + + Args: + waveform (`np.ndarray` of shape `(length,)`): + The input waveform. This must be a single real-valued, mono waveform. + window (`np.ndarray` of shape `(frame_length,)`): + The windowing function to apply, including zero-padding if necessary. The actual window length may be + shorter than `frame_length`, but we're assuming the array has already been zero-padded. + frame_length (`int`): + The length of the analysis frames in samples. With librosa this is always equal to `fft_length` but we also + allow smaller sizes. + hop_length (`int`): + The stride between successive analysis frames in samples. + fft_length (`int`, *optional*): + The size of the FFT buffer in samples. This determines how many frequency bins the spectrogram will have. + For optimal speed, this should be a power of two. If `None`, uses `frame_length`. + power (`float`, *optional*, defaults to 1.0): + If 1.0, returns the amplitude spectrogram. If 2.0, returns the power spectrogram. If `None`, returns + complex numbers. + center (`bool`, *optional*, defaults to `True`): + Whether to pad the waveform so that frame `t` is centered around time `t * hop_length`. If `False`, frame + `t` will start at time `t * hop_length`. + pad_mode (`str`, *optional*, defaults to `"reflect"`): + Padding mode used when `center` is `True`. Possible values are: `"constant"` (pad with zeros), `"edge"` + (pad with edge values), `"reflect"` (pads with mirrored values). + onesided (`bool`, *optional*, defaults to `True`): + If True, only computes the positive frequencies and returns a spectrogram containing `fft_length // 2 + 1` + frequency bins. If False, also computes the negative frequencies and returns `fft_length` frequency bins. + dither (`float`, *optional*, defaults to 0.0): + Adds dithering. In other words, adds a small Gaussian noise to each frame. + E.g. use 4.0 to add dithering with a normal distribution centered + around 0.0 with standard deviation 4.0, 0.0 means no dithering. + Dithering has similar effect as `mel_floor`. It reduces the high log_mel_fbank + values for signals with hard-zero sections, when VAD cutoff is present in the signal. + preemphasis (`float`, *optional*) + Coefficient for a low-pass filter that applies pre-emphasis before the DFT. + mel_filters (`np.ndarray` of shape `(num_freq_bins, num_mel_filters)`, *optional*): + The mel filter bank. If supplied, applies a this filter bank to create a mel spectrogram. + mel_floor (`float`, *optional*, defaults to 1e-10): + Minimum value of mel frequency banks. + log_mel (`str`, *optional*): + How to convert the spectrogram to log scale. Possible options are: `None` (don't convert), `"log"` (take + the natural logarithm) `"log10"` (take the base-10 logarithm), `"dB"` (convert to decibels). Can only be + used when `power` is not `None`. + reference (`float`, *optional*, defaults to 1.0): + Sets the input spectrogram value that corresponds to 0 dB. For example, use `np.max(spectrogram)` to set + the loudest part to 0 dB. Must be greater than zero. + min_value (`float`, *optional*, defaults to `1e-10`): + The spectrogram will be clipped to this minimum value before conversion to decibels, to avoid taking + `log(0)`. For a power spectrogram, the default of `1e-10` corresponds to a minimum of -100 dB. For an + amplitude spectrogram, the value `1e-5` corresponds to -100 dB. Must be greater than zero. + db_range (`float`, *optional*): + Sets the maximum dynamic range in decibels. For example, if `db_range = 80`, the difference between the + peak value and the smallest value will never be more than 80 dB. Must be greater than zero. + remove_dc_offset (`bool`, *optional*): + Subtract mean from waveform on each frame, applied before pre-emphasis. This should be set to `true` in + order to get the same results as `torchaudio.compliance.kaldi.fbank` when computing mel filters. + dtype (`np.dtype`, *optional*, defaults to `np.float32`): + Data type of the spectrogram tensor. If `power` is None, this argument is ignored and the dtype will be + `np.complex64`. + + Returns: + `nd.array` containing a spectrogram of shape `(num_frequency_bins, length)` for a regular spectrogram or shape + `(num_mel_filters, length)` for a mel spectrogram. + """ + window_length = len(window) + + if fft_length is None: + fft_length = frame_length + + if frame_length > fft_length: + raise ValueError(f"frame_length ({frame_length}) may not be larger than fft_length ({fft_length})") + + if window_length != frame_length: + raise ValueError(f"Length of the window ({window_length}) must equal frame_length ({frame_length})") + + if hop_length <= 0: + raise ValueError("hop_length must be greater than zero") + + if waveform.ndim != 1: + raise ValueError(f"Input waveform must have only one dimension, shape is {waveform.shape}") + + if np.iscomplexobj(waveform): + raise ValueError("Complex-valued input waveforms are not currently supported") + + if power is None and mel_filters is not None: + raise ValueError( + "You have provided `mel_filters` but `power` is `None`. Mel spectrogram computation is not yet supported for complex-valued spectrogram." + "Specify `power` to fix this issue." + ) + + # center pad the waveform + if center: + padding = [(int(frame_length // 2), int(frame_length // 2))] + waveform = np.pad(waveform, padding, mode=pad_mode) + + # promote to float64, since np.fft uses float64 internally + waveform = waveform.astype(np.float64) + window = window.astype(np.float64) + + # split waveform into frames of frame_length size + num_frames = int(1 + np.floor((waveform.size - frame_length) / hop_length)) + + num_frequency_bins = (fft_length // 2) + 1 if onesided else fft_length + spectrogram = np.empty((num_frames, num_frequency_bins), dtype=np.complex64) + + # rfft is faster than fft + fft_func = np.fft.rfft if onesided else np.fft.fft + buffer = np.zeros(fft_length) + + timestep = 0 + for frame_idx in range(num_frames): + buffer[:frame_length] = waveform[timestep : timestep + frame_length] + + if dither != 0.0: + buffer[:frame_length] += dither * np.random.randn(frame_length) + + if remove_dc_offset: + buffer[:frame_length] = buffer[:frame_length] - buffer[:frame_length].mean() + + if preemphasis is not None: + buffer[1:frame_length] -= preemphasis * buffer[: frame_length - 1] + buffer[0] *= 1 - preemphasis + + buffer[:frame_length] *= window + + spectrogram[frame_idx] = fft_func(buffer) + timestep += hop_length + + # note: ** is much faster than np.power + if power is not None: + spectrogram = np.abs(spectrogram, dtype=np.float64) ** power + + spectrogram = spectrogram.T + + if mel_filters is not None: + spectrogram = np.maximum(mel_floor, np.dot(mel_filters.T, spectrogram)) + + if power is not None and log_mel is not None: + if log_mel == "log": + spectrogram = np.log(spectrogram) + elif log_mel == "log10": + spectrogram = np.log10(spectrogram) + elif log_mel == "dB": + if power == 1.0: + spectrogram = amplitude_to_db(spectrogram, reference, min_value, db_range) + elif power == 2.0: + spectrogram = power_to_db(spectrogram, reference, min_value, db_range) + else: + raise ValueError(f"Cannot use log_mel option '{log_mel}' with power {power}") + else: + raise ValueError(f"Unknown log_mel option: {log_mel}") + + spectrogram = np.asarray(spectrogram, dtype) + + return spectrogram + + +def spectrogram_batch( + waveform_list: list[np.ndarray], + window: np.ndarray, + frame_length: int, + hop_length: int, + fft_length: Optional[int] = None, + power: Optional[float] = 1.0, + center: bool = True, + pad_mode: str = "reflect", + onesided: bool = True, + dither: float = 0.0, + preemphasis: Optional[float] = None, + mel_filters: Optional[np.ndarray] = None, + mel_floor: float = 1e-10, + log_mel: Optional[str] = None, + reference: float = 1.0, + min_value: float = 1e-10, + db_range: Optional[float] = None, + remove_dc_offset: Optional[bool] = None, + dtype: np.dtype = np.float32, +) -> list[np.ndarray]: + """ + Calculates spectrograms for a list of waveforms using the Short-Time Fourier Transform, optimized for batch processing. + This function extends the capabilities of the `spectrogram` function to handle multiple waveforms efficiently by leveraging broadcasting. + + It supports generating various types of spectrograms: + + - amplitude spectrogram (`power = 1.0`) + - power spectrogram (`power = 2.0`) + - complex-valued spectrogram (`power = None`) + - log spectrogram (use `log_mel` argument) + - mel spectrogram (provide `mel_filters`) + - log-mel spectrogram (provide `mel_filters` and `log_mel`) + + How this works: + + 1. The input waveform is split into frames of size `frame_length` that are partially overlapping by `frame_length + - hop_length` samples. + 2. Each frame is multiplied by the window and placed into a buffer of size `fft_length`. + 3. The DFT is taken of each windowed frame. + 4. The results are stacked into a spectrogram. + + We make a distinction between the following "blocks" of sample data, each of which may have a different lengths: + + - The analysis frame. This is the size of the time slices that the input waveform is split into. + - The window. Each analysis frame is multiplied by the window to avoid spectral leakage. + - The FFT input buffer. The length of this determines how many frequency bins are in the spectrogram. + + In this implementation, the window is assumed to be zero-padded to have the same size as the analysis frame. A + padded window can be obtained from `window_function()`. The FFT input buffer may be larger than the analysis frame, + typically the next power of two. + + Note: This function is designed for efficient batch processing of multiple waveforms but retains compatibility with individual waveform processing methods like `librosa.stft`. + + Args: + waveform_list (`List[np.ndarray]` with arrays of shape `(length,)`): + The list of input waveforms, each a single-channel (mono) signal. + window (`np.ndarray` of shape `(frame_length,)`): + The windowing function to apply, including zero-padding if necessary. + frame_length (`int`): + The length of each frame for analysis. + hop_length (`int`): + The step size between successive frames. + fft_length (`int`, *optional*): + The size of the FFT buffer, defining frequency bin resolution. + power (`float`, *optional*, defaults to 1.0): + Determines the type of spectrogram: 1.0 for amplitude, 2.0 for power, None for complex. + center (`bool`, *optional*, defaults to `True`): + Whether to center-pad the waveform frames. + pad_mode (`str`, *optional*, defaults to `"reflect"`): + The padding strategy when `center` is `True`. + onesided (`bool`, *optional*, defaults to `True`): + If True, returns a one-sided spectrogram for real input signals. + dither (`float`, *optional*, defaults to 0.0): + Adds dithering. In other words, adds a small Gaussian noise to each frame. + E.g. use 4.0 to add dithering with a normal distribution centered + around 0.0 with standard deviation 4.0, 0.0 means no dithering. + preemphasis (`float`, *optional*): + Applies a pre-emphasis filter to each frame. + mel_filters (`np.ndarray`, *optional*): + Mel filter bank for converting to mel spectrogram. + mel_floor (`float`, *optional*, defaults to 1e-10): + Floor value for mel spectrogram to avoid log(0). + log_mel (`str`, *optional*): + Specifies log scaling strategy; options are None, "log", "log10", "dB". + reference (`float`, *optional*, defaults to 1.0): + Reference value for dB conversion in log_mel. + min_value (`float`, *optional*, defaults to 1e-10): + Minimum floor value for log scale conversions. + db_range (`float`, *optional*): + Dynamic range for dB scale spectrograms. + remove_dc_offset (`bool`, *optional*): + Whether to remove the DC offset from each frame. + dtype (`np.dtype`, *optional*, defaults to `np.float32`): + Data type of the output spectrogram. + + Returns: + List[`np.ndarray`]: A list of spectrogram arrays, one for each input waveform. + """ + window_length = len(window) + + if fft_length is None: + fft_length = frame_length + + if frame_length > fft_length: + raise ValueError(f"frame_length ({frame_length}) may not be larger than fft_length ({fft_length})") + + if window_length != frame_length: + raise ValueError(f"Length of the window ({window_length}) must equal frame_length ({frame_length})") + + if hop_length <= 0: + raise ValueError("hop_length must be greater than zero") + + # Check the dimensions of the waveform , and if waveform is complex + for waveform in waveform_list: + if waveform.ndim != 1: + raise ValueError(f"Input waveform must have only one dimension, shape is {waveform.shape}") + if np.iscomplexobj(waveform): + raise ValueError("Complex-valued input waveforms are not currently supported") + # Center pad the waveform + if center: + padding = [(int(frame_length // 2), int(frame_length // 2))] + waveform_list = [ + np.pad( + waveform, + padding, + mode=pad_mode, + ) + for waveform in waveform_list + ] + original_waveform_lengths = [ + len(waveform) for waveform in waveform_list + ] # these lengths will be used to remove padding later + + # Batch pad the waveform + max_length = max(original_waveform_lengths) + padded_waveform_batch = np.array( + [ + np.pad(waveform, (0, max_length - len(waveform)), mode="constant", constant_values=0) + for waveform in waveform_list + ], + dtype=dtype, + ) + + # Promote to float64, since np.fft uses float64 internally + padded_waveform_batch = padded_waveform_batch.astype(np.float64) + window = window.astype(np.float64) + + # Split waveform into frames of frame_length size + num_frames = int(1 + np.floor((padded_waveform_batch.shape[1] - frame_length) / hop_length)) + # these lengths will be used to remove padding later + true_num_frames = [int(1 + np.floor((length - frame_length) / hop_length)) for length in original_waveform_lengths] + num_batches = padded_waveform_batch.shape[0] + + num_frequency_bins = (fft_length // 2) + 1 if onesided else fft_length + spectrogram = np.empty((num_batches, num_frames, num_frequency_bins), dtype=np.complex64) + + # rfft is faster than fft + fft_func = np.fft.rfft if onesided else np.fft.fft + buffer = np.zeros((num_batches, fft_length)) + + for frame_idx in range(num_frames): + timestep = frame_idx * hop_length + buffer[:, :frame_length] = padded_waveform_batch[:, timestep : timestep + frame_length] + + if dither != 0.0: + buffer[:, :frame_length] += dither * np.random.randn(*buffer[:, :frame_length].shape) + + if remove_dc_offset: + buffer[:, :frame_length] -= buffer[:, :frame_length].mean(axis=1, keepdims=True) + + if preemphasis is not None: + buffer[:, 1:frame_length] -= preemphasis * buffer[:, : frame_length - 1] + buffer[:, 0] *= 1 - preemphasis + + buffer[:, :frame_length] *= window + + spectrogram[:, frame_idx] = fft_func(buffer) + + # Note: ** is much faster than np.power + if power is not None: + spectrogram = np.abs(spectrogram, dtype=np.float64) ** power + + # Apply mel filters if provided + if mel_filters is not None: + result = np.tensordot(spectrogram, mel_filters.T, axes=([2], [1])) + spectrogram = np.maximum(mel_floor, result) + + # Convert to log scale if specified + if power is not None and log_mel is not None: + if log_mel == "log": + spectrogram = np.log(spectrogram) + elif log_mel == "log10": + spectrogram = np.log10(spectrogram) + elif log_mel == "dB": + if power == 1.0: + spectrogram = amplitude_to_db_batch(spectrogram, reference, min_value, db_range) + elif power == 2.0: + spectrogram = power_to_db_batch(spectrogram, reference, min_value, db_range) + else: + raise ValueError(f"Cannot use log_mel option '{log_mel}' with power {power}") + else: + raise ValueError(f"Unknown log_mel option: {log_mel}") + + spectrogram = np.asarray(spectrogram, dtype) + + spectrogram_list = [spectrogram[i, : true_num_frames[i], :].T for i in range(len(true_num_frames))] + + return spectrogram_list + + +def power_to_db( + spectrogram: np.ndarray, + reference: float = 1.0, + min_value: float = 1e-10, + db_range: Optional[float] = None, +) -> np.ndarray: + """ + Converts a power spectrogram to the decibel scale. This computes `10 * log10(spectrogram / reference)`, using basic + logarithm properties for numerical stability. + + The motivation behind applying the log function on the (mel) spectrogram is that humans do not hear loudness on a + linear scale. Generally to double the perceived volume of a sound we need to put 8 times as much energy into it. + This means that large variations in energy may not sound all that different if the sound is loud to begin with. + This compression operation makes the (mel) spectrogram features match more closely what humans actually hear. + + Based on the implementation of `librosa.power_to_db`. + + Args: + spectrogram (`np.ndarray`): + The input power (mel) spectrogram. Note that a power spectrogram has the amplitudes squared! + reference (`float`, *optional*, defaults to 1.0): + Sets the input spectrogram value that corresponds to 0 dB. For example, use `np.max(spectrogram)` to set + the loudest part to 0 dB. Must be greater than zero. + min_value (`float`, *optional*, defaults to `1e-10`): + The spectrogram will be clipped to this minimum value before conversion to decibels, to avoid taking + `log(0)`. The default of `1e-10` corresponds to a minimum of -100 dB. Must be greater than zero. + db_range (`float`, *optional*): + Sets the maximum dynamic range in decibels. For example, if `db_range = 80`, the difference between the + peak value and the smallest value will never be more than 80 dB. Must be greater than zero. + + Returns: + `np.ndarray`: the spectrogram in decibels + """ + if reference <= 0.0: + raise ValueError("reference must be greater than zero") + if min_value <= 0.0: + raise ValueError("min_value must be greater than zero") + + reference = max(min_value, reference) + + spectrogram = np.clip(spectrogram, a_min=min_value, a_max=None) + spectrogram = 10.0 * (np.log10(spectrogram) - np.log10(reference)) + + if db_range is not None: + if db_range <= 0.0: + raise ValueError("db_range must be greater than zero") + spectrogram = np.clip(spectrogram, a_min=spectrogram.max() - db_range, a_max=None) + + return spectrogram + + +def power_to_db_batch( + spectrogram: np.ndarray, + reference: float = 1.0, + min_value: float = 1e-10, + db_range: Optional[float] = None, +) -> np.ndarray: + """ + Converts a batch of power spectrograms to the decibel scale. This computes `10 * log10(spectrogram / reference)`, + using basic logarithm properties for numerical stability. + + This function supports batch processing, where each item in the batch is an individual power (mel) spectrogram. + + Args: + spectrogram (`np.ndarray`): + The input batch of power (mel) spectrograms. Expected shape is (batch_size, *spectrogram_shape). + Note that a power spectrogram has the amplitudes squared! + reference (`float`, *optional*, defaults to 1.0): + Sets the input spectrogram value that corresponds to 0 dB. For example, use `np.max(spectrogram)` to set + the loudest part to 0 dB. Must be greater than zero. + min_value (`float`, *optional*, defaults to `1e-10`): + The spectrogram will be clipped to this minimum value before conversion to decibels, to avoid taking + `log(0)`. The default of `1e-10` corresponds to a minimum of -100 dB. Must be greater than zero. + db_range (`float`, *optional*): + Sets the maximum dynamic range in decibels. For example, if `db_range = 80`, the difference between the + peak value and the smallest value will never be more than 80 dB. Must be greater than zero. + + Returns: + `np.ndarray`: the batch of spectrograms in decibels + """ + if reference <= 0.0: + raise ValueError("reference must be greater than zero") + if min_value <= 0.0: + raise ValueError("min_value must be greater than zero") + + reference = max(min_value, reference) + + spectrogram = np.clip(spectrogram, a_min=min_value, a_max=None) + spectrogram = 10.0 * (np.log10(spectrogram) - np.log10(reference)) + + if db_range is not None: + if db_range <= 0.0: + raise ValueError("db_range must be greater than zero") + # Apply db_range clipping per batch item + max_values = spectrogram.max(axis=(1, 2), keepdims=True) + spectrogram = np.clip(spectrogram, a_min=max_values - db_range, a_max=None) + + return spectrogram + + +def amplitude_to_db( + spectrogram: np.ndarray, + reference: float = 1.0, + min_value: float = 1e-5, + db_range: Optional[float] = None, +) -> np.ndarray: + """ + Converts an amplitude spectrogram to the decibel scale. This computes `20 * log10(spectrogram / reference)`, using + basic logarithm properties for numerical stability. + + The motivation behind applying the log function on the (mel) spectrogram is that humans do not hear loudness on a + linear scale. Generally to double the perceived volume of a sound we need to put 8 times as much energy into it. + This means that large variations in energy may not sound all that different if the sound is loud to begin with. + This compression operation makes the (mel) spectrogram features match more closely what humans actually hear. + + Args: + spectrogram (`np.ndarray`): + The input amplitude (mel) spectrogram. + reference (`float`, *optional*, defaults to 1.0): + Sets the input spectrogram value that corresponds to 0 dB. For example, use `np.max(spectrogram)` to set + the loudest part to 0 dB. Must be greater than zero. + min_value (`float`, *optional*, defaults to `1e-5`): + The spectrogram will be clipped to this minimum value before conversion to decibels, to avoid taking + `log(0)`. The default of `1e-5` corresponds to a minimum of -100 dB. Must be greater than zero. + db_range (`float`, *optional*): + Sets the maximum dynamic range in decibels. For example, if `db_range = 80`, the difference between the + peak value and the smallest value will never be more than 80 dB. Must be greater than zero. + + Returns: + `np.ndarray`: the spectrogram in decibels + """ + if reference <= 0.0: + raise ValueError("reference must be greater than zero") + if min_value <= 0.0: + raise ValueError("min_value must be greater than zero") + + reference = max(min_value, reference) + + spectrogram = np.clip(spectrogram, a_min=min_value, a_max=None) + spectrogram = 20.0 * (np.log10(spectrogram) - np.log10(reference)) + + if db_range is not None: + if db_range <= 0.0: + raise ValueError("db_range must be greater than zero") + spectrogram = np.clip(spectrogram, a_min=spectrogram.max() - db_range, a_max=None) + + return spectrogram + + +def amplitude_to_db_batch( + spectrogram: np.ndarray, reference: float = 1.0, min_value: float = 1e-5, db_range: Optional[float] = None +) -> np.ndarray: + """ + Converts a batch of amplitude spectrograms to the decibel scale. This computes `20 * log10(spectrogram / reference)`, + using basic logarithm properties for numerical stability. + + The function supports batch processing, where each item in the batch is an individual amplitude (mel) spectrogram. + + Args: + spectrogram (`np.ndarray`): + The input batch of amplitude (mel) spectrograms. Expected shape is (batch_size, *spectrogram_shape). + reference (`float`, *optional*, defaults to 1.0): + Sets the input spectrogram value that corresponds to 0 dB. For example, use `np.max(spectrogram)` to set + the loudest part to 0 dB. Must be greater than zero. + min_value (`float`, *optional*, defaults to `1e-5`): + The spectrogram will be clipped to this minimum value before conversion to decibels, to avoid taking + `log(0)`. The default of `1e-5` corresponds to a minimum of -100 dB. Must be greater than zero. + db_range (`float`, *optional*): + Sets the maximum dynamic range in decibels. For example, if `db_range = 80`, the difference between the + peak value and the smallest value will never be more than 80 dB. Must be greater than zero. + + Returns: + `np.ndarray`: the batch of spectrograms in decibels + """ + if reference <= 0.0: + raise ValueError("reference must be greater than zero") + if min_value <= 0.0: + raise ValueError("min_value must be greater than zero") + + reference = max(min_value, reference) + + spectrogram = np.clip(spectrogram, a_min=min_value, a_max=None) + spectrogram = 20.0 * (np.log10(spectrogram) - np.log10(reference)) + + if db_range is not None: + if db_range <= 0.0: + raise ValueError("db_range must be greater than zero") + # Apply db_range clipping per batch item + max_values = spectrogram.max(axis=(1, 2), keepdims=True) + spectrogram = np.clip(spectrogram, a_min=max_values - db_range, a_max=None) + + return spectrogram + + +### deprecated functions below this line ### + + +def get_mel_filter_banks( + nb_frequency_bins: int, + nb_mel_filters: int, + frequency_min: float, + frequency_max: float, + sample_rate: int, + norm: Optional[str] = None, + mel_scale: str = "htk", +) -> np.array: + warnings.warn( + "The function `get_mel_filter_banks` is deprecated and will be removed in version 4.31.0 of Transformers", + FutureWarning, + ) + return mel_filter_bank( + num_frequency_bins=nb_frequency_bins, + num_mel_filters=nb_mel_filters, + min_frequency=frequency_min, + max_frequency=frequency_max, + sampling_rate=sample_rate, + norm=norm, + mel_scale=mel_scale, + ) + + +def fram_wave(waveform: np.array, hop_length: int = 160, fft_window_size: int = 400, center: bool = True): + """ + In order to compute the short time fourier transform, the waveform needs to be split in overlapping windowed + segments called `frames`. + + The window length (window_length) defines how much of the signal is contained in each frame, while the hop length + defines the step between the beginning of each new frame. + + + Args: + waveform (`np.array` of shape `(sample_length,)`): + The raw waveform which will be split into smaller chunks. + hop_length (`int`, *optional*, defaults to 160): + Step between each window of the waveform. + fft_window_size (`int`, *optional*, defaults to 400): + Defines the size of the window. + center (`bool`, defaults to `True`): + Whether or not to center each frame around the middle of the frame. Centering is done by reflecting the + waveform on the left and on the right. + + Return: + framed_waveform (`np.array` of shape `(waveform.shape // hop_length , fft_window_size)`): + The framed waveforms that can be fed to `np.fft`. + """ + warnings.warn( + "The function `fram_wave` is deprecated and will be removed in version 4.31.0 of Transformers", + FutureWarning, + ) + frames = [] + for i in range(0, waveform.shape[0] + 1, hop_length): + if center: + half_window = (fft_window_size - 1) // 2 + 1 + start = i - half_window if i > half_window else 0 + end = i + half_window if i < waveform.shape[0] - half_window else waveform.shape[0] + frame = waveform[start:end] + if start == 0: + padd_width = (-i + half_window, 0) + frame = np.pad(frame, pad_width=padd_width, mode="reflect") + + elif end == waveform.shape[0]: + padd_width = (0, (i - waveform.shape[0] + half_window)) + frame = np.pad(frame, pad_width=padd_width, mode="reflect") + + else: + frame = waveform[i : i + fft_window_size] + frame_width = frame.shape[0] + if frame_width < waveform.shape[0]: + frame = np.lib.pad( + frame, pad_width=(0, fft_window_size - frame_width), mode="constant", constant_values=0 + ) + frames.append(frame) + + frames = np.stack(frames, 0) + return frames + + +def stft(frames: np.array, windowing_function: np.array, fft_window_size: Optional[int] = None): + """ + Calculates the complex Short-Time Fourier Transform (STFT) of the given framed signal. Should give the same results + as `torch.stft`. + + Args: + frames (`np.array` of dimension `(num_frames, fft_window_size)`): + A framed audio signal obtained using `audio_utils.fram_wav`. + windowing_function (`np.array` of dimension `(nb_frequency_bins, nb_mel_filters)`: + A array representing the function that will be used to reduces the amplitude of the discontinuities at the + boundaries of each frame when computing the STFT. Each frame will be multiplied by the windowing_function. + For more information on the discontinuities, called *Spectral leakage*, refer to [this + tutorial]https://download.ni.com/evaluation/pxi/Understanding%20FFTs%20and%20Windowing.pdf + fft_window_size (`int`, *optional*): + Size of the window om which the Fourier transform is applied. This controls the frequency resolution of the + spectrogram. 400 means that the fourrier transform is computed on windows of 400 samples. The number of + frequency bins (`nb_frequency_bins`) used to divide the window into equal strips is equal to + `(1+fft_window_size)//2`. An increase of the fft_window_size slows the calculus time proportionnally. + + Example: + + ```python + >>> from transformers.audio_utils import stft, fram_wave + >>> import numpy as np + + >>> audio = np.random.rand(50) + >>> fft_window_size = 10 + >>> hop_length = 2 + >>> framed_audio = fram_wave(audio, hop_length, fft_window_size) + >>> spectrogram = stft(framed_audio, np.hanning(fft_window_size + 1)) + ``` + + Returns: + spectrogram (`np.ndarray`): + A spectrogram of shape `(num_frames, nb_frequency_bins)` obtained using the STFT algorithm + """ + warnings.warn( + "The function `stft` is deprecated and will be removed in version 4.31.0 of Transformers", + FutureWarning, + ) + frame_size = frames.shape[1] + + if fft_window_size is None: + fft_window_size = frame_size + + if fft_window_size < frame_size: + raise ValueError("FFT size must greater or equal the frame size") + # number of FFT bins to store + nb_frequency_bins = (fft_window_size >> 1) + 1 + + spectrogram = np.empty((len(frames), nb_frequency_bins), dtype=np.complex64) + fft_signal = np.zeros(fft_window_size) + + for f, frame in enumerate(frames): + if windowing_function is not None: + np.multiply(frame, windowing_function, out=fft_signal[:frame_size]) + else: + fft_signal[:frame_size] = frame + spectrogram[f] = np.fft.fft(fft_signal, axis=0)[:nb_frequency_bins] + return spectrogram.T diff --git a/docs/transformers/build/lib/transformers/cache_utils.py b/docs/transformers/build/lib/transformers/cache_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..700aedba840288b03e80c10a83716a8243e362bc --- /dev/null +++ b/docs/transformers/build/lib/transformers/cache_utils.py @@ -0,0 +1,2485 @@ +import copy +import importlib.metadata +import json +import os +from dataclasses import dataclass +from typing import Any, Dict, Iterable, List, Optional, Tuple, Union + +import torch +from packaging import version + +from transformers.pytorch_utils import is_torch_greater_or_equal_than_2_6 + +from .configuration_utils import PretrainedConfig +from .utils import is_hqq_available, is_optimum_quanto_available, is_torch_greater_or_equal, logging + + +if is_hqq_available(): + from hqq.core.quantize import Quantizer as HQQQuantizer + +logger = logging.get_logger(__name__) + + +class Cache: + """ + Base, abstract class for all caches. The actual data structure is specific to each subclass. + """ + + is_compileable = False + + def __init__(self): + super().__init__() + + def update( + self, + key_states: torch.Tensor, + value_states: torch.Tensor, + layer_idx: int, + cache_kwargs: Optional[Dict[str, Any]] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`. + + Parameters: + key_states (`torch.Tensor`): + The new key states to cache. + value_states (`torch.Tensor`): + The new value states to cache. + layer_idx (`int`): + The index of the layer to cache the states for. + cache_kwargs (`Dict[str, Any]`, `optional`): + Additional arguments for the cache subclass. These are specific to each subclass and allow new types of + cache to be created. + + Return: + A tuple containing the updated key and value states. + """ + raise NotImplementedError("Make sure to implement `update` in a subclass.") + + def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: + """Returns the sequence length of the cached states. A layer index can be optionally passed.""" + # TODO: deprecate this function in favor of `cache_position` + raise NotImplementedError("Make sure to implement `get_seq_length` in a subclass.") + + def get_max_cache_shape(self) -> Optional[int]: + """Returns the maximum sequence length (i.e. max capacity) of the cache object""" + raise NotImplementedError("Make sure to implement `get_max_cache_shape` in a subclass.") + + def get_usable_length(self, new_seq_length: int, layer_idx: Optional[int] = 0) -> int: + """Given the sequence length of the new inputs, returns the usable length of the cache.""" + # Cache without size limit -> all cache is usable + # Cache with size limit -> if the length cache plus the length of the new inputs is larger the maximum cache + # length, we will need to evict part of the cache (and thus not all cache is usable) + max_length = self.get_max_cache_shape() + previous_seq_length = self.get_seq_length(layer_idx) + if max_length is not None and previous_seq_length + new_seq_length > max_length: + return max_length - new_seq_length + return previous_seq_length + + def reorder_cache(self, beam_idx: torch.LongTensor): + """Reorders the cache for beam search, given the selected beam indices.""" + for layer_idx in range(len(self.key_cache)): + if self.key_cache[layer_idx].numel(): + device = self.key_cache[layer_idx].device + self.key_cache[layer_idx] = self.key_cache[layer_idx].index_select(0, beam_idx.to(device)) + if self.value_cache[layer_idx].numel(): + device = self.value_cache[layer_idx].device + self.value_cache[layer_idx] = self.value_cache[layer_idx].index_select(0, beam_idx.to(device)) + + @property + def seen_tokens(self): + logger.warning_once( + "The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` " + "model input instead." + ) + if hasattr(self, "_seen_tokens"): + return self._seen_tokens + else: + return None + + +@dataclass +class CacheConfig: + """ + Base class for cache configs + """ + + cache_implementation: None + + @classmethod + def from_dict(cls, config_dict, **kwargs): + """ + Constructs a CacheConfig instance from a dictionary of parameters. + Args: + config_dict (Dict[str, Any]): Dictionary containing configuration parameters. + **kwargs: Additional keyword arguments to override dictionary values. + + Returns: + CacheConfig: Instance of CacheConfig constructed from the dictionary. + """ + config = cls(**config_dict) + to_remove = [] + for key, value in kwargs.items(): + if hasattr(config, key): + setattr(config, key, value) + to_remove.append(key) + for key in to_remove: + kwargs.pop(key, None) + return config + + # Copied from transformers.utils.quantization_config.QuantizationConfigMixin.to_json_file + def to_json_file(self, json_file_path: Union[str, os.PathLike]): + """ + Save this instance to a JSON file. + + Args: + json_file_path (`str` or `os.PathLike`): + Path to the JSON file in which this configuration instance's parameters will be saved. + use_diff (`bool`, *optional*, defaults to `True`): + If set to `True`, only the difference between the config instance and the default + `QuantizationConfig()` is serialized to JSON file. + """ + with open(json_file_path, "w", encoding="utf-8") as writer: + config_dict = self.to_dict() + json_string = json.dumps(config_dict, indent=2, sort_keys=True) + "\n" + + writer.write(json_string) + + # Copied from transformers.utils.quantization_config.QuantizationConfigMixin.to_dict + def to_dict(self) -> Dict[str, Any]: + """ + Serializes this instance to a Python dictionary. Returns: + `Dict[str, Any]`: Dictionary of all the attributes that make up this configuration instance. + """ + return copy.deepcopy(self.__dict__) + + # Copied from transformers.utils.quantization_config.QuantizationConfigMixin.__iter__ + def __iter__(self): + """allows `dict(obj)` for situations where obj may be a dict or QuantizationConfigMixin""" + for attr, value in copy.deepcopy(self.__dict__).items(): + yield attr, value + + # Copied from transformers.utils.quantization_config.QuantizationConfigMixin.__repr__ + def __repr__(self): + return f"{self.__class__.__name__} {self.to_json_string()}" + + def to_json_string(self): + """ + Serializes this instance to a JSON formatted string. + Returns: + str: JSON formatted string representing the configuration instance. + """ + return json.dumps(self.__dict__, indent=2) + "\n" + + # Copied from transformers.utils.quantization_config.QuantizationConfigMixin.update + def update(self, **kwargs): + """ + Updates attributes of this class instance with attributes from `kwargs` if they match existing attributes, + returning all the unused kwargs. + + Args: + kwargs (`Dict[str, Any]`): + Dictionary of attributes to tentatively update this class. + + Returns: + `Dict[str, Any]`: Dictionary containing all the key-value pairs that were not used to update the instance. + """ + to_remove = [] + for key, value in kwargs.items(): + if hasattr(self, key): + setattr(self, key, value) + to_remove.append(key) + + # Remove all the attributes that were updated, without modifying the input dict + unused_kwargs = {key: value for key, value in kwargs.items() if key not in to_remove} + return unused_kwargs + + +@dataclass +class QuantizedCacheConfig(CacheConfig): + """ + Configuration class for quantized cache settings. + + Attributes: + backend (`str`, *optional*, defaults to `"quanto"`): + Backend to use when performing quantization, Can be one of [`quanto`, `HQQ`] + nbits (`Optional[int]`, *optional*, defaults to 4): + Number of bits, can be 2 or 4 for the `quanto` backend and one of [1, 2, 3, 4, 8] for the `HQQ` backend. Defaults to 2. + axis_key (`int`, *optional*, defaults to 0): + Axis over which to perform grouping for the key tensors. Can be [0, -1] for `quanto` backend and [0, 1] for `HQQ` backend. + axis_value (`int`, *optional*, defaults to 0): + Axis over which to perform grouping for the value tensors. Can be [0, -1] for `quanto` backend and [0, 1] for `HQQ` backend. + q_group_size (`Optional[int]`, *optional*, defaults to 64): + Size of the quantization group, should be a divisor of the model's hidden dimension. + Defaults to 64. + residual_length (`Optional[int]`, *optional*, defaults to 128): + Length of the residual cache which will always be stored in original precision. + Defaults to 128. + compute_dtype (`torch.dtype`, *optional*, defaults to `torch.float16`): + The default dtype used for computations in the model. Keys and Values will be cast to this dtype after dequantization. + device (`str`, *optional*, defaults to `"cpu"`): + Device on which to perform computations, should be same as the model's device. + """ + + def __init__( + self, + backend: str = "quanto", + nbits: Optional[int] = 4, + axis_key: Optional[int] = 0, + axis_value: Optional[int] = 0, + q_group_size: Optional[int] = 64, + residual_length: Optional[int] = 128, + compute_dtype: Optional[torch.dtype] = torch.float16, + device: Optional[str] = "cpu", + ): + self.backend = backend + self.nbits = nbits + self.axis_key = axis_key + self.axis_value = axis_value + self.q_group_size = q_group_size + self.residual_length = residual_length + self.compute_dtype = compute_dtype + self.device = device + + def validate(self): + """Validates if the arguments passed are correct""" + + incorrect_arg_msg = ( + "Some of the keys in `cache_config` are defined incorrectly. `{key}` should be {correct_value}` " + "but found {found_value}" + ) + # Check that the values are reasonable in general (nbits, axis) + # Later in QuantizedCache init we check if they are supported for that particular backend + if self.nbits not in [1, 2, 3, 4, 8]: + raise ValueError( + incorrect_arg_msg.format( + key="nbits", + correct_value="2 or 4 or 8", + found_value=self.nbits, + ), + ) + if self.q_group_size <= 0: + raise ValueError( + incorrect_arg_msg.format( + key="q_group_size", + correct_value="a positive integer", + found_value=self.q_group_size, + ), + ) + if self.residual_length < 0: + raise ValueError( + incorrect_arg_msg.format( + key="residual_length", + correct_value="a positive integer", + found_value=self.residual_length, + ), + ) + + if self.axis_key not in [0, 1, -1]: + raise ValueError( + incorrect_arg_msg.format( + key="axis_key", + correct_value="`1` or `0`, `-1`", + found_value=self.axis_key, + ), + ) + + if self.axis_value not in [0, 1, -1]: + raise ValueError( + incorrect_arg_msg.format( + key="axis_value", + correct_value="`1` or `0` or `-1`", + found_value=self.axis_value, + ), + ) + + +@dataclass +class StaticCacheConfig(CacheConfig): + """ + Configuration class for static cache settings. + """ + + cache_implementation = "static" + + def __init__(self, batch_size: int, max_cache_len: int, device="cpu"): + self.batch_size = batch_size + self.max_cache_len = max_cache_len + self.device = device + + def validate(self): + """Validates if the arguments passed are correct""" + + incorrect_arg_msg = ( + "Some of the keys in `cache_config` are defined incorrectly. `{key}` should be {correct_value}` " + "but found {found_value}" + ) + + if self.batch_size <= 0: + raise ValueError( + incorrect_arg_msg.format( + key="batch_size", + correct_value="> 0", + found_value=self.batch_size, + ), + ) + + if self.max_cache_len <= 0: + raise ValueError( + incorrect_arg_msg.format( + key="max_cache_len", + correct_value="> 0", + found_value=self.max_cache_len, + ), + ) + + +class DynamicCache(Cache): + """ + A cache that grows dynamically as more tokens are generated. This is the default for generative models. + + It stores the Key and Value states as a list of tensors, one for each layer. The expected shape for each tensor is + `[batch_size, num_heads, seq_len, head_dim]`. + + Example: + + ```python + >>> from transformers import AutoTokenizer, AutoModelForCausalLM, DynamicCache + + >>> model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B-Instruct") + >>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct") + + >>> inputs = tokenizer(text="My name is Qwen2", return_tensors="pt") + + >>> # Prepare a cache class and pass it to model's forward + >>> past_key_values = DynamicCache() + >>> outputs = model(**inputs, past_key_values=past_key_values, use_cache=True) + >>> outputs.past_key_values # access cache filled with key/values from generation + DynamicCache() + ``` + """ + + def __init__(self, _distributed_cache_data: Iterable = None) -> None: + super().__init__() + self._seen_tokens = 0 # Used in `generate` to keep tally of how many tokens the cache has seen + self.key_cache: List[torch.Tensor] = [] + self.value_cache: List[torch.Tensor] = [] + + # `_distributed_cache_data` was originally added for compatibility with `torch.distributed` (DDP). See #36121 + # and #36373 for more information. In a nutshell, it is `map(gather_map, zip(*caches))`, i.e. each item in the + # iterable contains the key and value states for a layer gathered across replicas by torch.distributed + # (shape=[global batch size, num_heads, seq_len, head_dim]). + # WARNING: `_distributed_cache_data` must be the first argument in `__init__`, otherwise we'll break + # compatibility. The name of the argument doesn't matter. + if _distributed_cache_data is not None: + for key_states, value_states in _distributed_cache_data: + self.key_cache.append(key_states) + self.value_cache.append(value_states) + + def __getitem__(self, layer_idx: int) -> List[Tuple[torch.Tensor]]: + """ + Support for backwards-compatible `past_key_value` indexing, e.g. `past_key_value[0][0].shape[2]` to get the + sequence length. + """ + if layer_idx < len(self): + return (self.key_cache[layer_idx], self.value_cache[layer_idx]) + else: + raise KeyError(f"Cache only has {len(self)} layers, attempted to access layer with index {layer_idx}") + + def __iter__(self): + """ + Support for backwards-compatible `past_key_value` iteration, e.g. `for x in past_key_value:` to iterate over + keys and values + """ + for layer_idx in range(len(self)): + yield (self.key_cache[layer_idx], self.value_cache[layer_idx]) + + def __len__(self): + """ + Support for backwards-compatible `past_key_value` length, e.g. `len(past_key_value)`. This value corresponds + to the number of layers in the model. + """ + return len(self.key_cache) + + def update( + self, + key_states: torch.Tensor, + value_states: torch.Tensor, + layer_idx: int, + cache_kwargs: Optional[Dict[str, Any]] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`. + + Parameters: + key_states (`torch.Tensor`): + The new key states to cache. + value_states (`torch.Tensor`): + The new value states to cache. + layer_idx (`int`): + The index of the layer to cache the states for. + cache_kwargs (`Dict[str, Any]`, `optional`): + Additional arguments for the cache subclass. No additional arguments are used in `DynamicCache`. + + Return: + A tuple containing the updated key and value states. + """ + # Update the number of seen tokens + if layer_idx == 0: + self._seen_tokens += key_states.shape[-2] + + # Update the cache + if key_states is not None: + if len(self.key_cache) <= layer_idx: + # There may be skipped layers, fill them with empty lists + for _ in range(len(self.key_cache), layer_idx): + self.key_cache.append(torch.tensor([])) + self.value_cache.append(torch.tensor([])) + self.key_cache.append(key_states) + self.value_cache.append(value_states) + elif ( + not self.key_cache[layer_idx].numel() # prefers not t.numel() to len(t) == 0 to export the model + ): # fills previously skipped layers; checking for tensor causes errors + self.key_cache[layer_idx] = key_states + self.value_cache[layer_idx] = value_states + else: + self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=-2) + self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_states], dim=-2) + + return self.key_cache[layer_idx], self.value_cache[layer_idx] + + def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: + """Returns the sequence length of the cached states. A layer index can be optionally passed.""" + # TODO: deprecate this function in favor of `cache_position` + is_empty_layer = ( + len(self.key_cache) == 0 # no cache in any layer + or len(self.key_cache) <= layer_idx # skipped `layer_idx` and hasn't run a layer with cache after it + or not self.key_cache[layer_idx].numel() # the layer has no cache + ) + layer_seq_length = self.key_cache[layer_idx].shape[-2] if not is_empty_layer else 0 + return layer_seq_length + + def get_max_cache_shape(self) -> Optional[int]: + """Returns the maximum sequence length of the cache object. DynamicCache does not have a maximum length.""" + return None + + def to_legacy_cache(self) -> Tuple[Tuple[torch.Tensor], Tuple[torch.Tensor]]: + """Converts the `DynamicCache` instance into the its equivalent in the legacy cache format. Used for + backward compatibility.""" + legacy_cache = () + for layer_idx in range(len(self)): + legacy_cache += ((self.key_cache[layer_idx], self.value_cache[layer_idx]),) + return legacy_cache + + @classmethod + def from_legacy_cache(cls, past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None) -> "DynamicCache": + """Converts a cache in the legacy cache format into an equivalent `DynamicCache`. Used for + backward compatibility.""" + cache = cls() + if past_key_values is not None: + for layer_idx in range(len(past_key_values)): + key_states, value_states = past_key_values[layer_idx] + cache.update(key_states, value_states, layer_idx) + return cache + + def crop(self, max_length: int): + """Crop the past key values up to a new `max_length` in terms of tokens. `max_length` can also be + negative to remove `max_length` tokens. This is used in assisted decoding and contrastive search.""" + # In case it is negative + if max_length < 0: + max_length = self.get_seq_length() - abs(max_length) + + if self.get_seq_length() <= max_length: + return + + self._seen_tokens = max_length + for idx in range(len(self.key_cache)): + if self.key_cache[idx].numel(): + self.key_cache[idx] = self.key_cache[idx][..., :max_length, :] + self.value_cache[idx] = self.value_cache[idx][..., :max_length, :] + + def batch_split(self, full_batch_size: int, split_size: int) -> List["DynamicCache"]: + """Split the current instance into a list of `DynamicCache` by the batch size. This will be used by + `_split_model_inputs()` in `generation.utils`""" + out = [] + for i in range(0, full_batch_size, split_size): + current_split = DynamicCache() + current_split._seen_tokens = self._seen_tokens + current_split.key_cache = [tensor[i : i + split_size] for tensor in self.key_cache] + current_split.value_cache = [tensor[i : i + split_size] for tensor in self.value_cache] + out.append(current_split) + return out + + @classmethod + def from_batch_splits(cls, splits: List["DynamicCache"]) -> "DynamicCache": + """This is the opposite of the above `batch_split()` method. This will be used by `stack_model_outputs` in + `generation.utils`""" + cache = cls() + for idx in range(len(splits[0])): + key_cache = [current.key_cache[idx] for current in splits if current.key_cache[idx].numel()] + value_cache = [current.value_cache[idx] for current in splits if current.value_cache[idx].numel()] + if key_cache != []: + layer_keys = torch.cat(key_cache, dim=0) + layer_values = torch.cat(value_cache, dim=0) + cache.update(layer_keys, layer_values, idx) + return cache + + def batch_repeat_interleave(self, repeats: int): + """Repeat the cache `repeats` times in the batch dimension. Used in contrastive search.""" + for layer_idx in range(len(self)): + self.key_cache[layer_idx] = self.key_cache[layer_idx].repeat_interleave(repeats, dim=0) + self.value_cache[layer_idx] = self.value_cache[layer_idx].repeat_interleave(repeats, dim=0) + + def batch_select_indices(self, indices: torch.Tensor): + """Only keep the `indices` in the batch dimension of the cache. Used in contrastive search.""" + for layer_idx in range(len(self)): + self.key_cache[layer_idx] = self.key_cache[layer_idx][indices, ...] + self.value_cache[layer_idx] = self.value_cache[layer_idx][indices, ...] + + +# Utilities for `DynamicCache` <> torch.export support +def _flatten_dynamic_cache( + dynamic_cache: DynamicCache, +): + """Flattens DynamicCache into flat list of tensors for `torch.export.export` to consume""" + if not isinstance(dynamic_cache, DynamicCache): + raise RuntimeError("This pytree flattening function should only be applied to DynamicCache") + + if not is_torch_greater_or_equal_than_2_6: + logger.warning_once( + "DynamicCache + torch.export is tested on torch 2.6.0+ and may not work on earlier versions." + ) + + # NOTE it seems _seen_tokens is deprecated, so probably doesn't need tracking + dictionary = { + "key_cache": getattr(dynamic_cache, "key_cache"), + "value_cache": getattr(dynamic_cache, "value_cache"), + } + return torch.utils._pytree._dict_flatten(dictionary) + + +def _flatten_with_keys_dynamic_cache(dynamic_cache: DynamicCache): + dictionary = { + "key_cache": getattr(dynamic_cache, "key_cache"), + "value_cache": getattr(dynamic_cache, "value_cache"), + } + return torch.utils._pytree._dict_flatten_with_keys(dictionary) + + +def _unflatten_dynamic_cache( + values, + context: torch.utils._pytree.Context, +): + dictionary = torch.utils._pytree._dict_unflatten(values, context) + cache = DynamicCache() + for k, v in dictionary.items(): + setattr(cache, k, v) + return cache + + +def _flatten_dynamic_cache_for_fx(cache, spec): + dictionary = { + "key_cache": getattr(cache, "key_cache"), + "value_cache": getattr(cache, "value_cache"), + } + return torch.utils._pytree.tree_flatten(dictionary)[0] + + +if is_torch_greater_or_equal("2.3"): + torch.utils._pytree.register_pytree_node( + DynamicCache, + _flatten_dynamic_cache, + _unflatten_dynamic_cache, + serialized_type_name=f"{DynamicCache.__module__}.{DynamicCache.__name__}", + flatten_with_keys_fn=_flatten_with_keys_dynamic_cache, + ) + # TODO (tmanlaibaatar) This won't be needed in torch 2.7. + torch.fx._pytree.register_pytree_flatten_spec(DynamicCache, _flatten_dynamic_cache_for_fx) + + +class OffloadedCache(DynamicCache): + """ + A drop-in replacement for DynamicCache that conserves accelerator(GPU, XPU) memory at the expense of more CPU memory. + Useful for generating from models with very long context. + + In addition to the default accelerator stream, where all forward() computations happen, + this class uses another stream, the prefetch stream, which it creates itself. + Since scheduling of operations on separate streams happens independently, this class uses + the prefetch stream to asynchronously prefetch the KV cache of layer k+1 when layer k is executing. + The movement of the layer k-1 cache to the CPU is handled by the default stream as a simple way to + ensure the eviction is scheduled after all computations on that cache are finished. + """ + + def __init__(self) -> None: + if not ( + torch.cuda.is_available() + or (is_torch_greater_or_equal("2.7", accept_dev=True) and torch.xpu.is_available()) + ): + raise RuntimeError( + "OffloadedCache can only be used with a GPU" + + (" or XPU" if is_torch_greater_or_equal("2.7", accept_dev=True) else "") + ) + + super().__init__() + self.original_device = [] + self.prefetch_stream = None + self.prefetch_stream = ( + torch.Stream() if is_torch_greater_or_equal("2.7", accept_dev=True) else torch.cuda.Stream() + ) + self.beam_idx = None # used to delay beam search operations + + def prefetch_layer(self, layer_idx: int): + "Starts prefetching the next layer cache" + if layer_idx < len(self): + with ( + self.prefetch_stream + if is_torch_greater_or_equal("2.7", accept_dev=True) + else torch.cuda.stream(self.prefetch_stream) + ): + # Prefetch next layer tensors to GPU + device = self.original_device[layer_idx] + self.key_cache[layer_idx] = self.key_cache[layer_idx].to(device, non_blocking=True) + self.value_cache[layer_idx] = self.value_cache[layer_idx].to(device, non_blocking=True) + + def evict_previous_layer(self, layer_idx: int): + "Moves the previous layer cache to the CPU" + if len(self) > 2: + # We do it on the default stream so it occurs after all earlier computations on these tensors are done + prev_layer_idx = (layer_idx - 1) % len(self) + self.key_cache[prev_layer_idx] = self.key_cache[prev_layer_idx].to("cpu", non_blocking=True) + self.value_cache[prev_layer_idx] = self.value_cache[prev_layer_idx].to("cpu", non_blocking=True) + + def __getitem__(self, layer_idx: int) -> List[Tuple[torch.Tensor]]: + "Gets the cache for this layer to the device. Prefetches the next and evicts the previous layer." + if layer_idx < len(self): + # Evict the previous layer if necessary + if is_torch_greater_or_equal("2.7", accept_dev=True): + torch.accelerator.current_stream().synchronize() + else: + torch.cuda.current_stream().synchronize() + self.evict_previous_layer(layer_idx) + # Load current layer cache to its original device if not already there + original_device = self.original_device[layer_idx] + self.prefetch_stream.synchronize() + key_tensor = self.key_cache[layer_idx] + value_tensor = self.value_cache[layer_idx] + # Now deal with beam search ops which were delayed + if self.beam_idx is not None: + self.beam_idx = self.beam_idx.to(original_device) + key_tensor = key_tensor.index_select(0, self.beam_idx) + value_tensor = value_tensor.index_select(0, self.beam_idx) + # Prefetch the next layer + self.prefetch_layer((layer_idx + 1) % len(self)) + return (key_tensor, value_tensor) + else: + raise KeyError(f"Cache only has {len(self)} layers, attempted to access layer with index {layer_idx}") + + def reorder_cache(self, beam_idx: torch.LongTensor): + """Saves the beam indices and reorders the cache when the tensor is back to its device.""" + # We delay this operation until the tensors are back to their original + # device because performing torch.index_select on the CPU is very slow + del self.beam_idx + self.beam_idx = beam_idx.clone() + + def update( + self, + key_states: torch.Tensor, + value_states: torch.Tensor, + layer_idx: int, + cache_kwargs: Optional[Dict[str, Any]] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`. + Parameters: + key_states (`torch.Tensor`): + The new key states to cache. + value_states (`torch.Tensor`): + The new value states to cache. + layer_idx (`int`): + The index of the layer to cache the states for. + cache_kwargs (`Dict[str, Any]`, `optional`): + Additional arguments for the cache subclass. No additional arguments are used in `OffloadedCache`. + Return: + A tuple containing the updated key and value states. + """ + # Update the number of seen tokens + if layer_idx == 0: + self._seen_tokens += key_states.shape[-2] + + # Update the cache + if len(self.key_cache) < layer_idx: + raise ValueError("OffloadedCache does not support model usage where layers are skipped. Use DynamicCache.") + elif len(self.key_cache) == layer_idx: + self.key_cache.append(key_states) + self.value_cache.append(value_states) + self.original_device.append(key_states.device) + self.evict_previous_layer(layer_idx) + else: + key_tensor, value_tensor = self[layer_idx] + self.key_cache[layer_idx] = torch.cat([key_tensor, key_states], dim=-2) + self.value_cache[layer_idx] = torch.cat([value_tensor, value_states], dim=-2) + + return self.key_cache[layer_idx], self.value_cache[layer_idx] + + # According to https://docs.python.org/3/library/exceptions.html#NotImplementedError + # if a method is not supposed to be supported in a subclass we should set it to None + from_legacy_cache = None + + to_legacy_cache = None + + +class QuantizedCache(DynamicCache): + """ + A quantizer cache similar to what is described in the [KIVI: A Tuning-Free Asymmetric 2bit Quantization for KV Cache paper](https://arxiv.org/abs/2402.02750). + It allows the model to generate longer sequence length without allocating too much memory for Key and Value cache by applying quantization. + + The cache has two types of storage, one for original precision and one for the quantized cache. A `residual length` is set as a maximum capacity for the + original precision cache. When the length goes beyond maximum capacity, the original precision cache is discarded and moved into the quantized cache. The + quantization is done per-channel with a set `q_group_size` for both Keys and Values, in contrast to what was described in the paper. + + It stores Keys and Values a list of quantized tensors (tuples in case we need to store metadata), one for each layer. Additionally, it stores the Key and + Value in original precision states as a list of tensors, one for each layer. The size of each tensor + is `[batch_size, num_heads, seq_len - residual_length, head_dim]` + """ + + def __init__(self, cache_config: QuantizedCacheConfig) -> None: + super().__init__() + self._quantized_key_cache: List[torch.Tensor] = [] + self._quantized_value_cache: List[torch.Tensor] = [] + + self.nbits = cache_config.nbits + self.residual_length = cache_config.residual_length + self.q_group_size = cache_config.q_group_size + self.axis_key = cache_config.axis_key + self.axis_value = cache_config.axis_value + self.compute_dtype = cache_config.compute_dtype + self.device = cache_config.device + + super().__init__() + + def update( + self, + key_states: torch.Tensor, + value_states: torch.Tensor, + layer_idx: int, + cache_kwargs: Optional[Dict[str, Any]] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + # Update the number of seen tokens + if layer_idx == 0: + self._seen_tokens += key_states.shape[-2] + + if len(self.key_cache) < layer_idx: + raise ValueError("QuantizedCache does not support model usage where layers are skipped. Use DynamicCache.") + elif len(self.key_cache) == layer_idx: + self._quantized_key_cache.append(self._quantize(key_states.contiguous(), axis=self.axis_key)) + self._quantized_value_cache.append(self._quantize(value_states.contiguous(), axis=self.axis_value)) + self.key_cache.append(torch.zeros(0, dtype=key_states.dtype, device=key_states.device)) + self.value_cache.append(torch.zeros(0, dtype=key_states.dtype, device=key_states.device)) + keys_to_return, values_to_return = key_states, value_states + else: + dequant_key = self._dequantize(self._quantized_key_cache[layer_idx]) + dequant_value = self._dequantize(self._quantized_value_cache[layer_idx]) + keys_to_return = [dequant_key, self.key_cache[layer_idx], key_states] + values_to_return = [dequant_value, self.value_cache[layer_idx], value_states] + + keys_to_return = torch.cat(keys_to_return, dim=-2) + values_to_return = torch.cat(values_to_return, dim=-2) + if ( + self.key_cache[layer_idx].dim() == 4 + and self.key_cache[layer_idx].shape[-2] + 1 >= self.residual_length + ): + self._quantized_key_cache[layer_idx] = self._quantize(keys_to_return.contiguous(), axis=self.axis_key) + self._quantized_value_cache[layer_idx] = self._quantize( + values_to_return.contiguous(), axis=self.axis_value + ) + self.key_cache[layer_idx] = torch.zeros(0, dtype=key_states.dtype, device=key_states.device) + self.value_cache[layer_idx] = torch.zeros(0, dtype=key_states.dtype, device=key_states.device) + else: + self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=-2) + self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_states], dim=-2) + + return keys_to_return, values_to_return + + def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: + """Returns the sequence length of the cached states. A layer index can be optionally passed.""" + if len(self.key_cache) <= layer_idx: + return 0 + # since we cannot get the seq_length of each layer directly and rely on `_seen_tokens` which is + # updated every "layer_idx" == 0, this is a hack to get the actual seq_length for the given layer_idx + # this part of code otherwise fails when used to verify attn_weight shape in some models + return self._seen_tokens if layer_idx == 0 else self._seen_tokens - 1 + + def _quantize(self, tensor, axis): + """Quantizes a key/value using a defined quantization method.""" + raise NotImplementedError("Make sure to implement `_quantize` in a subclass.") + + def _dequantize(self, q_tensor): + """Dequantizes back the tensor that was quantized by `self._quantize()`""" + raise NotImplementedError("Make sure to implement `_dequantize` in a subclass.") + + +class QuantoQuantizedCache(QuantizedCache): + """ + Quantized Cache class that uses `quanto` as a backend to perform quantization. Current implementation supports `int2` and `int4` dtypes only. + + Parameters: + cache_config (`QuantizedCacheConfig`): + A configuration containing all the arguments to be used by the quantizer, including axis, qtype and group size. + + Example: + + ```python + >>> # Run pip install quanto first if you don't have it yet + >>> from transformers import AutoTokenizer, AutoModelForCausalLM, QuantoQuantizedCache, QuantizedCacheConfig + + >>> model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B-Instruct") + >>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct") + + >>> inputs = tokenizer(text="My name is Qwen2", return_tensors="pt") + + >>> # Prepare a cache class and pass it to model's forward + >>> cache_config = QuantizedCacheConfig(nbits=4) + >>> past_key_values = QuantoQuantizedCache(cache_config=cache_config) + >>> outputs = model(**inputs, past_key_values=past_key_values, use_cache=True) + >>> outputs.past_key_values # access cache filled with key/values from generation + QuantoQuantizedCache() + ``` + """ + + def __init__(self, cache_config: CacheConfig) -> None: + super().__init__(cache_config) + + if is_optimum_quanto_available(): + optimum_quanto_version = version.parse(importlib.metadata.version("optimum-quanto")) + if optimum_quanto_version <= version.parse("0.2.5"): + raise ImportError( + f"You need optimum-quanto package version to be greater or equal than 0.2.5 to use `QuantoQuantizedCache`. Detected version {optimum_quanto_version}." + ) + from optimum.quanto import MaxOptimizer, qint2, qint4 + + if self.nbits not in [2, 4]: + raise ValueError(f"`nbits` for `quanto` backend has to be one of [`2`, `4`] but got {self.nbits}") + + if self.axis_key not in [0, -1]: + raise ValueError(f"`axis_key` for `quanto` backend has to be one of [`0`, `-1`] but got {self.axis_key}") + + if self.axis_value not in [0, -1]: + raise ValueError( + f"`axis_value` for `quanto` backend has to be one of [`0`, `-1`] but got {self.axis_value}" + ) + + self.qtype = qint4 if self.nbits == 4 else qint2 + self.optimizer = MaxOptimizer() # hardcode as it's the only one for per-channel quantization + + def _quantize(self, tensor, axis): + # We have two different API since in optimum-quanto, we don't use AffineQuantizer anymore + if is_optimum_quanto_available(): + from optimum.quanto import quantize_weight + + scale, zeropoint = self.optimizer(tensor, self.qtype, axis, self.q_group_size) + qtensor = quantize_weight(tensor, self.qtype, axis, scale, zeropoint, self.q_group_size) + return qtensor + + def _dequantize(self, qtensor): + return qtensor.dequantize() + + +class HQQQuantizedCache(QuantizedCache): + """ + Quantized Cache class that uses `HQQ` as a backend to perform quantization. Current implementation supports `int2`, `int4`, `int8` dtypes. + + Parameters: + cache_config (`QuantizedCacheConfig`): + A configuration containing all the arguments to be used by the quantizer, including axis, qtype and group size. + + Example: + + ```python + >>> # Run pip install hqq first if you don't have it yet + >>> from transformers import AutoTokenizer, AutoModelForCausalLM, HQQQuantizedCache, QuantizedCacheConfig + + >>> model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B-Instruct") + >>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct") + + >>> inputs = tokenizer(text="My name is Qwen2", return_tensors="pt") + + >>> # Prepare a cache class and pass it to model's forward + >>> cache_config = QuantizedCacheConfig(nbits=4, axis_key=1, axis_value=1) + >>> past_key_values = HQQQuantizedCache(cache_config=cache_config) + >>> outputs = model(**inputs, past_key_values=past_key_values, use_cache=True) + >>> outputs.past_key_values # access cache filled with key/values from generation + HQQQuantizedCache() + ``` + """ + + def __init__(self, cache_config: CacheConfig) -> None: + super().__init__(cache_config) + if self.nbits not in [1, 2, 3, 4, 8]: + raise ValueError( + f"`nbits` for `HQQ` backend has to be one of [`1`, `2`, `3`, `4`, `8`] but got {self.nbits}" + ) + + if self.axis_key not in [0, 1]: + raise ValueError(f"`axis_key` for `HQQ` backend has to be one of [`0`, `1`] but got {self.axis_key}") + + if self.axis_value not in [0, 1]: + raise ValueError(f"`axis_value` for `HQQ` backend has to be one of [`0`, `1`] but got {self.axis_value}") + + self.quantizer = HQQQuantizer + + def _quantize(self, tensor, axis): + qtensor, meta = self.quantizer.quantize( + tensor, + axis=axis, + device=self.device, + compute_dtype=self.compute_dtype, + nbits=self.nbits, + group_size=self.q_group_size, + ) + meta["compute_dtype"] = self.compute_dtype + self.quantizer.cuda(qtensor, meta=meta, device=self.device) # Move to device and cast to dtype + meta["scale"] = meta["scale"].to(qtensor.device) + meta["zero"] = meta["zero"].to(qtensor.device) + return qtensor, meta + + def _dequantize(self, qtensor): + quant_tensor, meta = qtensor + tensor = self.quantizer.dequantize(quant_tensor, meta) + return tensor + + +class SinkCache(Cache): + """ + A cache that as described in the [Attention Sinks paper](https://arxiv.org/abs/2309.17453). It allows the model to + generate beyond the length of its context window, without losing fluency in the conversation. As it discards past + tokens, the model will lose the ability to generate tokens that depend on the context that was discarded. + + It stores the Key and Value states as a list of tensors, one for each layer. The expected shape for each tensor is + `[batch_size, num_heads, seq_len, head_dim]`. + + Parameters: + window_length (`int`): + The length of the context window. + num_sink_tokens (`int`): + The number of sink tokens. See the original paper for more information. + + Example: + + ```python + >>> from transformers import AutoTokenizer, AutoModelForCausalLM, SinkCache + + >>> model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B-Instruct") + >>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct") + + >>> inputs = tokenizer(text="My name is Qwen2", return_tensors="pt") + + >>> # Prepare a cache class and pass it to model's forward + >>> past_key_values = SinkCache(window_length=256, num_sink_tokens=4) + >>> outputs = model(**inputs, past_key_values=past_key_values, use_cache=True) + >>> outputs.past_key_values # access cache filled with key/values from generation + SinkCache() + ``` + """ + + is_sliding = True + + def __init__(self, window_length: int, num_sink_tokens: int) -> None: + super().__init__() + self.key_cache: List[torch.Tensor] = [] + self.value_cache: List[torch.Tensor] = [] + self.window_length = window_length + self.num_sink_tokens = num_sink_tokens + self.cos_sin_rerotation_cache = {} + self._cos_cache = None + self._sin_cache = None + self._seen_tokens = 0 # Used in `generate` to keep tally of how many tokens the cache has seen + + @staticmethod + def _rotate_half(x): + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + def _apply_key_rotary_pos_emb( + self, key_states: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor + ) -> torch.Tensor: + rotated_key_states = (key_states * cos) + (self._rotate_half(key_states) * sin) + return rotated_key_states + + def _get_rerotation_cos_sin( + self, key_states: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor]: + if key_states.shape[-2] not in self.cos_sin_rerotation_cache: + # Upcast to float32 temporarily for better accuracy + cos = cos.to(torch.float32) + sin = sin.to(torch.float32) + + # Compute the cos and sin required for back- and forward-rotating to one position earlier in the sequence + original_cos = cos[self.num_sink_tokens + key_states.shape[-2] :] + shifted_cos = cos[self.num_sink_tokens : -key_states.shape[-2]] + original_sin = sin[self.num_sink_tokens + key_states.shape[-2] :] + shifted_sin = sin[self.num_sink_tokens : -key_states.shape[-2]] + rerotation_cos = original_cos * shifted_cos + original_sin * shifted_sin + rerotation_sin = -original_sin * shifted_cos + original_cos * shifted_sin + + self.cos_sin_rerotation_cache[key_states.shape[-2]] = ( + rerotation_cos.to(key_states.dtype).unsqueeze(0), + rerotation_sin.to(key_states.dtype).unsqueeze(0), + ) + return self.cos_sin_rerotation_cache[key_states.shape[-2]] + + def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: + """Returns the sequence length of the cached states. A layer index can be optionally passed.""" + # TODO: deprecate this function in favor of `cache_position` + # Workaround to make 'key_states.shape[-2] + past_key_value.get_seq_length(self.layer_idx)' <= window_length + if len(self.key_cache) <= layer_idx: + return 0 + return self.key_cache[layer_idx].shape[-2] + + def get_max_cache_shape(self) -> Optional[int]: + """Returns the maximum sequence length of the cache object, in case of SinkCache it is the window length.""" + return self.window_length + + def update( + self, + key_states: torch.Tensor, + value_states: torch.Tensor, + layer_idx: int, + cache_kwargs: Optional[Dict[str, Any]] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`. + + Parameters: + key_states (`torch.Tensor`): + The new key states to cache. + value_states (`torch.Tensor`): + The new value states to cache. + layer_idx (`int`): + The index of the layer to cache the states for. + cache_kwargs (`Dict[str, Any]`, `optional`): + Additional arguments for the cache subclass. The following arguments can be used in `SinkCache`: `sin`, + `cos` and `partial_rotation_size`. These arguments are used with models using RoPE, to recompute the + rotation as the tokens are shifted. + + Return: + A tuple containing the updated key and value states. + """ + # Optional kwargs for `SinkCache` -- needed on models using RoPE. `partial_rotation_size` is used on models + # with partially rotated position embeddings, like Phi or Persimmon. + if cache_kwargs is None: + cache_kwargs = {} + sin = cache_kwargs.get("sin") + cos = cache_kwargs.get("cos") + partial_rotation_size = cache_kwargs.get("partial_rotation_size") + using_rope = cos is not None and sin is not None + + # Update the number of seen tokens + if layer_idx == 0: + self._seen_tokens += key_states.shape[-2] + + # Update the sin/cos cache, which holds sin/cos values for all possible positions + if using_rope and layer_idx == 0: + # BC: some models still pass `sin`/`cos` with 2 dims. In those models, they are the full sin/cos. Remove + # after all RoPE models have a llama-like cache utilization. + if cos.dim() == 2: + self._cos_cache = cos + self._sin_cache = sin + else: + if self._cos_cache is None: + self._cos_cache = cos[0, ...] + self._sin_cache = sin[0, ...] + elif self._cos_cache.shape[0] < self.window_length: + self._cos_cache = torch.cat([self._cos_cache, cos[0, ...]], dim=0) + self._sin_cache = torch.cat([self._sin_cache, sin[0, ...]], dim=0) + + # [bsz, num_heads, seq_len, head_dim] + if len(self.key_cache) <= layer_idx: + # Empty cache + self.key_cache.append(key_states) + self.value_cache.append(value_states) + + elif key_states.shape[-2] + self.get_seq_length(layer_idx) < self.window_length: + # Growing cache + self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=-2) + self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_states], dim=-2) + + else: + # Shifting cache + keys_to_keep = self.key_cache[layer_idx][ + :, :, -self.window_length + self.num_sink_tokens + key_states.shape[-2] : + ] + + # On RoPE models, we need to recompute the Key rotation as the tokens are shifted + if using_rope: + rerotation_cos, rerotation_sin = self._get_rerotation_cos_sin( + key_states, self._cos_cache[: self.window_length], self._sin_cache[: self.window_length] + ) + if partial_rotation_size is not None: + keys_to_keep, keys_pass = ( + keys_to_keep[..., :partial_rotation_size], + keys_to_keep[..., partial_rotation_size:], + ) + keys_to_keep = self._apply_key_rotary_pos_emb(keys_to_keep, rerotation_cos, rerotation_sin) + if partial_rotation_size is not None: + keys_to_keep = torch.cat((keys_to_keep, keys_pass), dim=-1) + + # Concatenate sink tokens, shifted & rotated tokens (if needed), and new tokens + sink_keys = self.key_cache[layer_idx][:, :, : self.num_sink_tokens] + self.key_cache[layer_idx] = torch.cat([sink_keys, keys_to_keep, key_states], dim=-2) + + sink_values = self.value_cache[layer_idx][:, :, : self.num_sink_tokens] + values_to_keep = self.value_cache[layer_idx][ + :, :, -self.window_length + self.num_sink_tokens + value_states.shape[-2] : + ] + self.value_cache[layer_idx] = torch.cat([sink_values, values_to_keep, value_states], dim=-2) + + return self.key_cache[layer_idx], self.value_cache[layer_idx] + + +class StaticCache(Cache): + """ + Static Cache class to be used with `torch.compile(model)` and `torch.export()`. + + Parameters: + config (`PretrainedConfig`): + The configuration file defining the shape-related attributes required to initialize the static cache. + max_batch_size (`int`): + The maximum batch size with which the model will be used. Note that a new instance must be instantiated if a + smaller batch size is used. If you are manually setting the batch size, make sure to take into account the + number of beams if you are running beam search + max_cache_len (`int`, *optional*): + The maximum sequence length with which the model will be used. + device (`torch.device` or `str`, *optional*): + The device on which the cache should be initialized. If you're using more than 1 computation device, you + should pass the `layer_device_map` argument instead. + dtype (`torch.dtype`, *optional*, defaults to `torch.float32`): + The default `dtype` to use when initializing the layer. + layer_device_map (`Optional[Dict[int, Union[str, torch.device, int]]]]`, *optional*): + Mapping between the layers and its device. This is required when you are manually initializing the cache + and the model is split between different gpus. You can know which layers mapped to which device by + checking the associated device_map: `model.hf_device_map`. + + + Example: + + ```python + >>> from transformers import AutoTokenizer, AutoModelForCausalLM, StaticCache + + >>> model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf") + >>> tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf") + + >>> inputs = tokenizer(text="My name is Llama", return_tensors="pt") + + >>> # Prepare a cache class and pass it to model's forward + >>> # Leave empty space for 10 new tokens, which can be used when calling forward iteratively 10 times to generate + >>> max_generated_length = inputs.input_ids.shape[1] + 10 + >>> past_key_values = StaticCache(config=model.config, max_batch_size=1, max_cache_len=max_generated_length, device=model.device, dtype=model.dtype) + >>> outputs = model(**inputs, past_key_values=past_key_values, use_cache=True) + >>> outputs.past_key_values # access cache filled with key/values from generation + StaticCache() + ``` + """ + + is_compileable = True + + def __init__( + self, + config: PretrainedConfig, + max_batch_size: int, + max_cache_len: Optional[int] = None, + device: Union[torch.device, str, None] = None, + dtype: torch.dtype = torch.float32, + layer_device_map: Optional[Dict[int, Union[str, torch.device, int]]] = None, + ) -> None: + super().__init__() + self.max_batch_size = max_batch_size + self.max_cache_len = config.max_position_embeddings if max_cache_len is None else max_cache_len + + # Some model define a custom `head_dim` != config.hidden_size // config.num_attention_heads + self.head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + + self._dtype = dtype + self.num_key_value_heads = ( + config.num_attention_heads + if getattr(config, "num_key_value_heads", None) is None + else config.num_key_value_heads + ) + + self.key_cache: List[torch.Tensor] = [] + self.value_cache: List[torch.Tensor] = [] + # Note: There will be significant perf decrease if switching to use 5D tensors instead. + cache_shape = (self.max_batch_size, self.num_key_value_heads, self.max_cache_len, self.head_dim) + device = torch.device(device) if device is not None else None + for idx in range(config.num_hidden_layers): + if layer_device_map is not None: + layer_device = layer_device_map[idx] + else: + layer_device = device + new_layer_key_cache = torch.zeros(cache_shape, dtype=self._dtype, device=layer_device) + new_layer_value_cache = torch.zeros(cache_shape, dtype=self._dtype, device=layer_device) + # Note: `mark_static_address` is used to tag the cache as a fixed data pointer, + # preventing compiled graph breaks when updating the cache. + torch._dynamo.mark_static_address(new_layer_key_cache) + torch._dynamo.mark_static_address(new_layer_value_cache) + self.key_cache.append(new_layer_key_cache) + self.value_cache.append(new_layer_value_cache) + + def update( + self, + key_states: torch.Tensor, + value_states: torch.Tensor, + layer_idx: int, + cache_kwargs: Optional[Dict[str, Any]] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`. + It is VERY important to index using a tensor, otherwise you introduce a copy to the device. + + Parameters: + key_states (`torch.Tensor`): + The new key states to cache. + value_states (`torch.Tensor`): + The new value states to cache. + layer_idx (`int`): + The index of the layer to cache the states for. + cache_kwargs (`Dict[str, Any]`, `optional`): + Additional arguments for the cache subclass. The `StaticCache` needs the `cache_position` input + to know how where to write in the cache. + + Return: + A tuple containing the updated key and value states. + """ + if cache_kwargs is None: + cache_kwargs = {} + cache_position = cache_kwargs.get("cache_position") + k_out = self.key_cache[layer_idx] + v_out = self.value_cache[layer_idx] + key_states = key_states.to(k_out.dtype) + value_states = value_states.to(v_out.dtype) + + if cache_position is None: + k_out.copy_(key_states) + v_out.copy_(value_states) + else: + # Note: here we use `tensor.index_copy_(dim, index, tensor)` that is equivalent to + # `tensor[:, :, index] = tensor`, but the first one is compile-friendly and it does explicitly an in-place + # operation, that avoids copies and uses less memory. + try: + k_out.index_copy_(2, cache_position, key_states) + v_out.index_copy_(2, cache_position, value_states) + except NotImplementedError: + # The operator 'aten::index_copy.out' is not currently implemented for the MPS device. + k_out[:, :, cache_position] = key_states + v_out[:, :, cache_position] = value_states + + return k_out, v_out + + def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: + """Returns the sequence length of the cached states that were seen by the model.""" + # Occupied cache == any slot in the 3rd dim (sequence length) holds a non-zero value. To save on compute, let's + # limit the check to the first batch member and head dimension. + # TODO: deprecate this function in favor of `cache_position` + return (self.key_cache[layer_idx][0, 0].any(dim=-1)).sum() + + def get_max_cache_shape(self) -> Optional[int]: + return self.max_cache_len + + def reset(self): + """Resets the cache values while preserving the objects""" + for layer_idx in range(len(self.key_cache)): + # In-place ops prevent breaking the static address + self.key_cache[layer_idx].zero_() + self.value_cache[layer_idx].zero_() + + +class SlidingWindowCache(StaticCache): + """ + Sliding Window Cache class to be used with `torch.compile` for models like Mistral that support sliding window attention. + Every time when we try to update the cache, we compute the `indices` based on `cache_position >= self.config.sliding_window - 1`, + if true(which means the cache can not hold all the old key value states and new states together because of the sliding window constraint), + we need to do a cycle shift based on `indices` to replace the oldest states by the new key value states passed in. + + The `to_shift` is only true once we are above sliding_window. Thus with `sliding_window==64`: + + indices = (slicing + to_shift[-1].int()-1) % self.config.sliding_window + tensor([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, + 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, + 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, + 55, 56, 57, 58, 59, 60, 61, 62, 63, 0]) + + We overwrite the cache using these, then we always write at cache_position (clamped to `sliding_window`) + + Parameters: + config (`PretrainedConfig`): + The configuration file defining the shape-related attributes required to initialize the static cache. + max_batch_size (`int`): + The maximum batch size with which the model will be used. Note that a new instance must be instantiated if a + smaller batch size is used. + max_cache_len (`int`, *optional*): + The maximum sequence length with which the model will be used. + device (`torch.device` or `str`, *optional*): + The device on which the cache should be initialized. If you're using more than 1 computation device, you + should pass the `layer_device_map` argument instead. + dtype (`torch.dtype`, *optional*, defaults to `torch.float32`): + The default `dtype` to use when initializing the layer. + layer_device_map (`Optional[Dict[int, Union[str, torch.device, int]]]]`, *optional*): + Mapping between the layers and its device. This is required when you are manually initializing the cache + and the model is split between different gpus. You can know which layers mapped to which device by + checking the associated device_map: `model.hf_device_map`. + + Example: + + ```python + >>> from transformers import AutoTokenizer, AutoModelForCausalLM, SlidingWindowCache + + >>> model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3") + >>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.3") + + >>> inputs = tokenizer(text="My name is Mistral", return_tensors="pt") + + >>> # Prepare a cache class and pass it to model's forward + >>> # Leave empty space for 10 new tokens, which can be used when calling forward iteratively 10 times to generate + >>> max_generated_length = inputs.input_ids.shape[1] + 10 + >>> past_key_values = SlidingWindowCache(config=model.config, max_batch_size=1, max_cache_len=max_generated_length, device=model.device, dtype=model.dtype) + >>> outputs = model(**inputs, past_key_values=past_key_values, use_cache=True) + >>> outputs.past_key_values # access cache filled with key/values from generation + SlidingWindowCache() + ``` + """ + + is_sliding = True + is_compileable = True + + def __init__( + self, + config: PretrainedConfig, + max_batch_size: int, + max_cache_len: Optional[int] = None, + device: Union[torch.device, str, None] = None, + dtype: torch.dtype = torch.float32, + layer_device_map: Optional[Dict[int, Union[str, torch.device, int]]] = None, + ) -> None: + if not hasattr(config, "sliding_window") or config.sliding_window is None: + raise ValueError( + "Setting `cache_implementation` to 'sliding_window' requires the model config supporting " + "sliding window attention, please check if there is a `sliding_window` field in the model " + "config and it's not set to None." + ) + max_cache_len = min(config.sliding_window, max_cache_len) + super().__init__( + config=config, + max_batch_size=max_batch_size, + max_cache_len=max_cache_len, + device=device, + dtype=dtype, + layer_device_map=layer_device_map, + ) + + def update( + self, + key_states: torch.Tensor, + value_states: torch.Tensor, + layer_idx: int, + cache_kwargs: Optional[Dict[str, Any]] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + if cache_kwargs is None: + cache_kwargs = {} + cache_position = cache_kwargs.get("cache_position") + k_out = self.key_cache[layer_idx] + v_out = self.value_cache[layer_idx] + key_states = key_states.to(k_out.dtype) + value_states = value_states.to(v_out.dtype) + + # assume this only happens in prefill phase when prompt length > sliding_window_size (= max_cache_len) + if cache_position.shape[0] > self.max_cache_len: + k_out = key_states[:, :, -self.max_cache_len :, :] + v_out = value_states[:, :, -self.max_cache_len :, :] + # Assumption: caches are all zeros at this point, `+=` is equivalent to `=` but compile-friendly + self.key_cache[layer_idx] += k_out + self.value_cache[layer_idx] += v_out + # we should return the whole states instead of k_out, v_out to take the whole prompt + # into consideration when building kv cache instead of just throwing away tokens outside of the window + return key_states, value_states + + slicing = torch.ones(self.max_cache_len, dtype=torch.long, device=value_states.device).cumsum(0) + cache_position = cache_position.clamp(0, self.max_cache_len - 1) + to_shift = cache_position >= self.max_cache_len - 1 + indices = (slicing + to_shift[-1].int() - 1) % self.max_cache_len + + k_out = k_out[:, :, indices] + v_out = v_out[:, :, indices] + + try: + k_out.index_copy_(2, cache_position, key_states) + v_out.index_copy_(2, cache_position, value_states) + except NotImplementedError: + # The operator 'aten::index_copy.out' is not currently implemented for the MPS device. + k_out[:, :, cache_position] = key_states + v_out[:, :, cache_position] = value_states + + # `_.zero()` followed by `+=` is equivalent `=`, but compile-friendly (without graph breaks due to assignment) + self.key_cache[layer_idx].zero_() + self.value_cache[layer_idx].zero_() + + self.key_cache[layer_idx] += k_out + self.value_cache[layer_idx] += v_out + + return k_out, v_out + + def get_max_cache_shape(self) -> Optional[int]: + return self.max_cache_len + + def reset(self): + for layer_idx in range(len(self.key_cache)): + # In-place ops prevent breaking the static address + self.key_cache[layer_idx].zero_() + self.value_cache[layer_idx].zero_() + + +class EncoderDecoderCache(Cache): + """ + Base, abstract class for all encoder-decoder caches. Can be used to hold combinations of self-attention and + cross-attention caches. + + Example: + + ```python + >>> from transformers import AutoProcessor, AutoModelForCausalLM, DynamicCache, EncoderDecoderCache + + >>> model = AutoModelForCausalLM.from_pretrained("openai/whisper-small") + >>> processor = AutoProcessor.from_pretrained("openai/whisper-small") + + >>> inputs = processor(audio=YOUR-AUDIO, return_tensors="pt") + + >>> # Prepare cache classes for encoder and decoder and pass it to model's forward + >>> self_attention_cache = DynamicCache() + >>> cross_attention_cache = DynamicCache() + >>> past_key_values = EncoderDecoderCache(self_attention_cache, cross_attention_cache) + >>> outputs = model(**inputs, past_key_values=past_key_values, use_cache=True) + >>> outputs.past_key_values # access cache filled with key/values from generation + EncoderDecoderCache() + ``` + + """ + + def __init__(self, self_attention_cache: Cache, cross_attention_cache: Cache): + super().__init__() + self.self_attention_cache = self_attention_cache + self.cross_attention_cache = cross_attention_cache + self.is_compileable = getattr(self.self_attention_cache, "is_compileable", False) + + self.is_updated = {} + for layer_idx in range(len(cross_attention_cache.key_cache)): + self.is_updated[layer_idx] = bool(cross_attention_cache.get_seq_length(layer_idx) > 0) + + def __getitem__(self, layer_idx: int) -> List[Tuple[torch.Tensor]]: + """ + Support for backwards-compatible `past_key_value` indexing, e.g. `past_key_value[0][0].shape[2]` to get the + sequence length. + """ + if layer_idx < len(self): + return ( + self.self_attention_cache.key_cache[layer_idx], + self.self_attention_cache.value_cache[layer_idx], + self.cross_attention_cache.key_cache[layer_idx], + self.cross_attention_cache.value_cache[layer_idx], + ) + else: + raise KeyError(f"Cache only has {len(self)} layers, attempted to access layer with index {layer_idx}") + + def __len__(self): + """ + Support for backwards-compatible `past_key_value` length, e.g. `len(past_key_value)`. This value corresponds + to the number of layers in the model. + """ + return len(self.self_attention_cache) + + def to_legacy_cache(self) -> Tuple[Tuple[torch.Tensor], Tuple[torch.Tensor]]: + """Converts the `EncoderDecoderCache` instance into its equivalent in the legacy cache format.""" + legacy_cache = () + if len(self.cross_attention_cache) > 0: + for self_attn, cross_attn in zip( + self.self_attention_cache.to_legacy_cache(), self.cross_attention_cache.to_legacy_cache() + ): + legacy_cache += (self_attn + cross_attn,) + else: + legacy_cache = self.self_attention_cache.to_legacy_cache() + return legacy_cache + + @classmethod + def from_legacy_cache( + cls, past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + ) -> "EncoderDecoderCache": + """Converts a cache in the legacy cache format into an equivalent `EncoderDecoderCache`.""" + cache = cls( + self_attention_cache=DynamicCache(), + cross_attention_cache=DynamicCache(), + ) + if past_key_values is not None: + for layer_idx in range(len(past_key_values)): + key_states, value_states = past_key_values[layer_idx][:2] + cache.self_attention_cache.update(key_states, value_states, layer_idx) + if len(past_key_values[layer_idx]) > 2: + key_states, value_states = past_key_values[layer_idx][2:] + cache.cross_attention_cache.update(key_states, value_states, layer_idx) + cache.is_updated[layer_idx] = True + return cache + + def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: + """Returns the sequence length of the cached states. A layer index can be optionally passed.""" + # check if empty list because in case of static cache it will be a tensors and we can't check `if not torch.Tensor` + return self.self_attention_cache.get_seq_length(layer_idx) + + def reset(self): + if hasattr(self.self_attention_cache, "reset"): + self.self_attention_cache.reset() + if hasattr(self.cross_attention_cache, "reset"): + self.cross_attention_cache.reset() + elif not hasattr(self.self_attention_cache, "reset") and not hasattr(self.cross_attention_cache, "reset"): + raise ValueError( + "Neither self nor cross-attention cache have valid `.reset()` methods. `.reset()` should " + "only be called on compatible cache classes, such as `StaticCache` or `SlidingWindowCache`. " + f"Got {self.self_attention_cache.__str__()} for the self attention cache and " + f"{self.cross_attention_cache.__str__()} for the cross attention cache." + ) + for layer_idx in self.is_updated: + self.is_updated[layer_idx] = False + + def reorder_cache(self, beam_idx: torch.LongTensor): + """Reorders the cache for beam search, given the selected beam indices.""" + self.self_attention_cache.reorder_cache(beam_idx) + self.cross_attention_cache.reorder_cache(beam_idx) + + def check_dynamic_cache(self, method: str): + if not ( + isinstance(self.self_attention_cache, DynamicCache) + and isinstance(self.cross_attention_cache, DynamicCache) + ): + raise ValueError( + f"`{method}` is only defined for dynamic cache, got {self.self_attention_cache.__str__()} for the self " + f"attention cache and {self.cross_attention_cache.__str__()} for the cross attention cache." + ) + + # TODO(gante, sanchit-gandhi): move following functionality into `.generate` + def crop(self, maximum_length: int): + """Crop the past key values up to a new `maximum_length` in terms of tokens. `maximum_length` can also be + negative to remove `maximum_length` tokens. This is used in assisted decoding and contrastive search.""" + self.check_dynamic_cache(self.crop.__name__) + self.self_attention_cache.crop(maximum_length) + + def batch_split(self, full_batch_size: int, split_size: int) -> "List[EncoderDecoderCache]": + """Split the current instance into a list of `DynamicCache` by the batch size. This will be used by + `_split_model_inputs()` in `generation.utils`""" + self.check_dynamic_cache(self.batch_split.__name__) + self_attention_cache = self.self_attention_cache.batch_split(full_batch_size, split_size) + cross_attention_cache = self.cross_attention_cache.batch_split(full_batch_size, split_size) + + out = [] + for self_attn, cross_attn in zip(self_attention_cache, cross_attention_cache): + out.append(EncoderDecoderCache(self_attn, cross_attn)) + return out + + @classmethod + def from_batch_splits(cls, splits: List["EncoderDecoderCache"]) -> "EncoderDecoderCache": + """This is the opposite of the above `batch_split()` method. This will be used by `stack_model_outputs` in + `generation.utils`""" + self_attention_cache = DynamicCache() + cross_attention_cache = DynamicCache() + for idx in range(len(splits[0])): + layer_keys = torch.cat([current.self_attention_cache.key_cache[idx] for current in splits], dim=0) + layer_values = torch.cat([current.self_attention_cache.value_cache[idx] for current in splits], dim=0) + self_attention_cache.update(layer_keys, layer_values, idx) + + layer_keys = torch.cat([current.cross_attention_cache.key_cache[idx] for current in splits], dim=0) + layer_values = torch.cat([current.cross_attention_cache.value_cache[idx] for current in splits], dim=0) + cross_attention_cache.update(layer_keys, layer_values, idx) + return cls(self_attention_cache, cross_attention_cache) + + def batch_repeat_interleave(self, repeats: int): + """Repeat the cache `repeats` times in the batch dimension. Used in contrastive search.""" + self.check_dynamic_cache(self.batch_repeat_interleave.__name__) + self.self_attention_cache.batch_repeat_interleave(repeats) + self.cross_attention_cache.batch_repeat_interleave(repeats) + + def batch_select_indices(self, indices: torch.Tensor): + """Only keep the `indices` in the batch dimension of the cache. Used in contrastive search.""" + self.check_dynamic_cache(self.batch_select_indices.__name__) + self.self_attention_cache.batch_select_indices(indices) + self.cross_attention_cache.batch_select_indices(indices) + + +class HybridCache(Cache): + """ + Hybrid Cache class to be used with `torch.compile` for models that alternate between a local sliding window + attention and global attention in every other layer (originally implemented for Gemma2). + Under the hood, Hybrid Cache leverages ["SlidingWindowCache"] for sliding window attention and ["StaticCache"] + for global attention.For more information, see the documentation of each subcomponent cache class. + + Parameters: + config (`PretrainedConfig): + The configuration file defining the shape-related attributes required to initialize the static cache. + max_batch_size (`int`): + The maximum batch size with which the model will be used. Note that a new instance must be instantiated if a + smaller batch size is used. + max_cache_len (`int`, *optional*): + The maximum sequence length with which the model will be used. + device (`torch.device` or `str`, *optional*): + The device on which the cache should be initialized. If you're using more than 1 computation device, you + should pass the `layer_device_map` argument instead. + dtype (torch.dtype, *optional*, defaults to `torch.float32`): + The default `dtype` to use when initializing the layer. + layer_device_map (`Optional[Dict[int, Union[str, torch.device, int]]]]`, *optional*): + Mapping between the layers and its device. This is required when you are manually initializing the cache + and the model is split between different gpus. You can know which layers mapped to which device by + checking the associated device_map: `model.hf_device_map`. + + Example: + + ```python + >>> from transformers import AutoTokenizer, AutoModelForCausalLM, HybridCache + + >>> model = AutoModelForCausalLM.from_pretrained("google/gemma-2-2b") + >>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b") + + >>> inputs = tokenizer(text="My name is Gemma", return_tensors="pt") + + >>> # Prepare a cache class and pass it to model's forward + >>> # Leave empty space for 10 new tokens, which can be used when calling forward iteratively 10 times to generate + >>> max_generated_length = inputs.input_ids.shape[1] + 10 + >>> past_key_values = HybridCache(config=model.config, max_batch_size=1, max_cache_len=max_generated_length, device=model.device, dtype=model.dtype) + >>> outputs = model(**inputs, past_key_values=past_key_values, use_cache=True) + >>> outputs.past_key_values # access cache filled with key/values from generation + HybridCache() + ``` + """ + + is_compileable = True + + def __init__( + self, + config: PretrainedConfig, + max_batch_size: int, + max_cache_len: Optional[int] = None, + device: Union[torch.device, str, None] = None, + dtype: torch.dtype = torch.float32, + layer_device_map: Optional[Dict[int, Union[str, torch.device, int]]] = None, + ) -> None: + super().__init__() + if not hasattr(config, "sliding_window") or config.sliding_window is None: + raise ValueError( + "Setting `cache_implementation` to 'sliding_window' requires the model config supporting " + "sliding window attention, please check if there is a `sliding_window` field in the model " + "config and it's not set to None." + ) + self.max_cache_len = max_cache_len + self.max_batch_size = max_batch_size + # Some model define a custom `head_dim` != config.hidden_size // config.num_attention_heads + self.head_dim = ( + config.head_dim if hasattr(config, "head_dim") else config.hidden_size // config.num_attention_heads + ) + + self._dtype = dtype + self.num_key_value_heads = ( + config.num_attention_heads if config.num_key_value_heads is None else config.num_key_value_heads + ) + + layer_switch = config.sliding_window_pattern if hasattr(config, "sliding_window_pattern") else 2 # 2 is for BC + self.is_sliding = torch.tensor( + [bool((i + 1) % layer_switch) for i in range(config.num_hidden_layers)], dtype=torch.bool + ) + self.key_cache: List[torch.Tensor] = [] + self.value_cache: List[torch.Tensor] = [] + global_cache_shape = (self.max_batch_size, self.num_key_value_heads, max_cache_len, self.head_dim) + sliding_cache_shape = ( + self.max_batch_size, + self.num_key_value_heads, + min(config.sliding_window, max_cache_len), + self.head_dim, + ) + device = torch.device(device) if device is not None else None + for i in range(config.num_hidden_layers): + if layer_device_map is not None: + layer_device = layer_device_map[i] + else: + layer_device = device + # Note: `mark_static_address` is used to tag the cache as an fixed data pointer, preventing cuda graph + # breaks when updating the cache. + cache_shape = global_cache_shape if not self.is_sliding[i] else sliding_cache_shape + new_layer_key_cache = torch.zeros(cache_shape, dtype=self._dtype, device=layer_device) + new_layer_value_cache = torch.zeros(cache_shape, dtype=self._dtype, device=layer_device) + torch._dynamo.mark_static_address(new_layer_key_cache) + torch._dynamo.mark_static_address(new_layer_value_cache) + self.key_cache.append(new_layer_key_cache) + self.value_cache.append(new_layer_value_cache) + + def _sliding_update(self, cache_position, layer_idx, key_states, value_states, k_out, v_out, max_cache_len): + if cache_position.shape[0] > max_cache_len: + k_out = key_states[:, :, -max_cache_len:, :] + v_out = value_states[:, :, -max_cache_len:, :] + # Assumption: caches are all zeros at this point, `+=` is equivalent to `=` but compile-friendly + self.key_cache[layer_idx] += k_out + self.value_cache[layer_idx] += v_out + # we should return the whole states instead of k_out, v_out to take the whole prompt + # into consideration when building kv cache instead of just throwing away tokens outside of the window + return key_states, value_states + + slicing = torch.ones(max_cache_len, dtype=torch.long, device=value_states.device).cumsum(0) + cache_position = cache_position.clamp(0, max_cache_len - 1) + to_shift = cache_position >= max_cache_len - 1 + indices = (slicing + to_shift[-1].int() - 1) % max_cache_len + k_out = k_out[:, :, indices] + v_out = v_out[:, :, indices] + + k_out[:, :, cache_position] = key_states + v_out[:, :, cache_position] = value_states + # `_.zero()` followed by `+=` is equivalent `=`, but compile-friendly (without graph breaks due to assignment) + self.key_cache[layer_idx].zero_() + self.value_cache[layer_idx].zero_() + + self.key_cache[layer_idx] += k_out + self.value_cache[layer_idx] += v_out + return k_out, v_out + + def _static_update(self, cache_position, layer_idx, key_states, value_states, k_out, v_out, max_cache_len): + k_out[:, :, cache_position] = key_states + v_out[:, :, cache_position] = value_states + + self.key_cache[layer_idx] = k_out + self.value_cache[layer_idx] = v_out + return k_out, v_out + + def update( + self, + key_states: torch.Tensor, + value_states: torch.Tensor, + layer_idx: int, + cache_kwargs: Optional[Dict[str, Any]] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + if cache_kwargs is None: + cache_kwargs = {} + cache_position = cache_kwargs.get("cache_position") + sliding_window = cache_kwargs.get("sliding_window") + + # These two `if` blocks are only reached in multigpu and if `layer_device_map` is not passed. They are used + # when the cache is initialized in the forward pass (e.g. Gemma2) + if self.key_cache[layer_idx].device != key_states.device: + self.key_cache[layer_idx] = self.key_cache[layer_idx].to(key_states.device) + if self.value_cache[layer_idx].device != value_states.device: + self.value_cache[layer_idx] = self.value_cache[layer_idx].to(value_states.device) + + k_out = self.key_cache[layer_idx] + v_out = self.value_cache[layer_idx] + key_states = key_states.to(k_out.dtype) + value_states = value_states.to(v_out.dtype) + + if sliding_window: + update_fn = self._sliding_update + else: + update_fn = self._static_update + + return update_fn( + cache_position, + layer_idx, + key_states, + value_states, + k_out, + v_out, + k_out.shape[2], + ) + + def get_max_cache_shape(self) -> Optional[int]: + return self.max_cache_len + + def get_seq_length(self, layer_idx: Optional[int] = 0): + # Occupied cache == any slot in the 3rd dim (sequence length) holds a non-zero value. To save on compute, let's + # limit the check to the first batch member and head dimension. + # TODO: deprecate this function in favor of `cache_position` + if layer_idx != 0: + raise ValueError( + "`get_seq_length` on `HybridCache` may get inconsistent results depending on the layer index. " + "Using the `layer_idx` argument is not supported." + ) + return (self.key_cache[layer_idx][0, 0].any(dim=-1)).sum() + + def reset(self): + """Resets the cache values while preserving the objects""" + for layer_idx in range(len(self.key_cache)): + # In-place ops prevent breaking the static address + self.key_cache[layer_idx].zero_() + self.value_cache[layer_idx].zero_() + + +class HybridChunkedCache(Cache): + """ + Hybrid Cache class to be used with `torch.compile` for models that alternate between a local sliding window + attention and global attention in every other layer, with support for chunked attention (originally implemented + for Llama4). + Under the hood, Hybrid Cache leverages ["SlidingWindowCache"] for sliding window attention and ["StaticCache"] + for global attention. For more information, see the documentation of each subcomponent cache class. + + Parameters: + config (`PretrainedConfig): + The configuration file defining the shape-related attributes required to initialize the static cache. + max_batch_size (`int`): + The maximum batch size with which the model will be used. Note that a new instance must be instantiated if a + smaller batch size is used. + max_cache_len (`int`, *optional*): + The maximum sequence length with which the model will be used. + device (`torch.device` or `str`, *optional*): + The device on which the cache should be initialized. If you're using more than 1 computation device, you + should pass the `layer_device_map` argument instead. + dtype (torch.dtype, *optional*, defaults to `torch.bfloat16`): + The default `dtype` to use when initializing the layer. + layer_device_map (`Optional[Dict[int, Union[str, torch.device, int]]]]`, *optional*): + Mapping between the layers and its device. This is required when you are manually initializing the cache + and the model is split between different gpus. You can know which layers mapped to which device by + checking the associated device_map: `model.hf_device_map`. + + Example: + + ```python + >>> from transformers import AutoTokenizer, AutoModelForCausalLM, HybridCache + + >>> model = AutoModelForCausalLM.from_pretrained("google/gemma-2-2b") + >>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b") + + >>> inputs = tokenizer(text="My name is Gemma", return_tensors="pt") + + >>> # Prepare a cache class and pass it to model's forward + >>> # Leave empty space for 10 new tokens, which can be used when calling forward iteratively 10 times to generate + >>> max_generated_length = inputs.input_ids.shape[1] + 10 + >>> past_key_values = HybridCache(config=model.config, max_batch_size=1, max_cache_len=max_generated_length, device=model.device, dtype=model.dtype) + >>> outputs = model(**inputs, past_key_values=past_key_values, use_cache=True) + >>> outputs.past_key_values # access cache filled with key/values from generation + HybridCache() + ``` + """ + + is_compileable = True + + def __init__( + self, + config: PretrainedConfig, + max_batch_size: int, + max_cache_len: Optional[int] = None, + device: Union[torch.device, str, None] = None, + dtype: torch.dtype = torch.bfloat16, + layer_device_map: Optional[Dict[int, Union[str, torch.device, int]]] = None, + ) -> None: + super().__init__() + if not hasattr(config, "sliding_window") or config.sliding_window is None: + self.sliding_window = getattr(config.get_text_config(), "attention_chunk_size", 8192) + else: + self.sliding_window = config.sliding_window + self.max_cache_len = max_cache_len + self.max_batch_size = max_batch_size + self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) + self._dtype = dtype + + if hasattr(config.get_text_config(), "no_rope_layers"): + self.is_sliding = config.no_rope_layers + else: + layer_switch = getattr(config, "sliding_window_pattern", 2) + self.is_sliding = [bool((i + 1) % layer_switch) for i in range(config.num_hidden_layers)] + + self.key_cache: List[torch.Tensor] = [] + self.value_cache: List[torch.Tensor] = [] + self.cumulative_length = [0 for _ in range(config.num_hidden_layers)] + + def initialise_cache_layer(self, layer_idx, key_states): + if len(self.key_cache) > layer_idx: + return + + num_key_value_heads = key_states.shape[1] + device = key_states.device + global_cache_shape = (self.max_batch_size, num_key_value_heads, self.max_cache_len, self.head_dim) + sliding_cache_shape = ( + self.max_batch_size, + num_key_value_heads, + self.sliding_window, + self.head_dim, + ) + # Note: `mark_static_address` is used to tag the cache as an fixed data pointer, preventing cuda graph + # breaks when updating the cache. + cache_shape = sliding_cache_shape if self.is_sliding[layer_idx] else global_cache_shape + new_layer_key_cache = torch.zeros(cache_shape, dtype=self._dtype, device=device) + new_layer_value_cache = torch.zeros(cache_shape, dtype=self._dtype, device=device) + torch._dynamo.mark_static_address(new_layer_key_cache) + torch._dynamo.mark_static_address(new_layer_value_cache) + self.key_cache.append(new_layer_key_cache) + self.value_cache.append(new_layer_value_cache) + + def _sliding_update(self, cache_position, layer_idx, key_states, value_states, k_out, v_out, max_cache_len): + cumulative_length = self.cumulative_length[layer_idx] + # Update it now that we saved the value above + self.cumulative_length[layer_idx] += key_states.shape[-2] + is_full = cumulative_length >= max_cache_len + if is_full: + full_key_states = torch.cat((k_out[:, :, 1:, :], key_states), dim=-2) + full_value_states = torch.cat((v_out[:, :, 1:, :], value_states), dim=-2) + # Fast decoding path -> here as the effective size is still sliding window, it is extremely important + # to return `self.key_cache[layer_idx]` and `self.value_cache[layer_idx]`, as they have the fixed adress + # in memory (the values are the same as the full states, but not the address!!) + if key_states.shape[-2] == 1: + self.key_cache[layer_idx].copy_(full_key_states) + self.value_cache[layer_idx].copy_(full_value_states) + return self.key_cache[layer_idx], self.value_cache[layer_idx] + elif not is_full and cumulative_length + key_states.shape[2] > max_cache_len: + # Fast prefill path, no need to cat() in this case (which creates a copy even if cating from 0 dim) + if cumulative_length == 0: + full_key_states = key_states + full_value_states = value_states + else: + full_key_states = torch.cat((k_out[:, :, :cumulative_length, :], key_states), dim=-2) + full_value_states = torch.cat((v_out[:, :, :cumulative_length, :], value_states), dim=-2) + else: + self.key_cache[layer_idx].index_copy_(2, cache_position, key_states) + self.value_cache[layer_idx].index_copy_(2, cache_position, value_states) + return self.key_cache[layer_idx], self.value_cache[layer_idx] + + self.key_cache[layer_idx].copy_(full_key_states[:, :, -max_cache_len:, :]) + self.value_cache[layer_idx].copy_(full_value_states[:, :, -max_cache_len:, :]) + # we should return the whole states instead of k_out, v_out to take the whole prompt + # into consideration when building kv cache instead of just throwing away tokens outside of the window + return full_key_states, full_value_states + + def _static_update(self, cache_position, layer_idx, key_states, value_states, k_out, v_out, max_cache_len): + k_out[:, :, cache_position] = key_states + v_out[:, :, cache_position] = value_states + + self.key_cache[layer_idx] = k_out + self.value_cache[layer_idx] = v_out + return k_out, v_out + + def update( + self, + key_states: torch.Tensor, + value_states: torch.Tensor, + layer_idx: int, + cache_kwargs: Optional[Dict[str, Any]] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + if cache_kwargs is None: + cache_kwargs = {} + cache_position = cache_kwargs.get("cache_position") + self.initialise_cache_layer(layer_idx, key_states) + + k_out = self.key_cache[layer_idx] + v_out = self.value_cache[layer_idx] + key_states = key_states.to(k_out.dtype) + value_states = value_states.to(v_out.dtype) + + if self.is_sliding[layer_idx]: + update_fn = self._sliding_update + else: + update_fn = self._static_update + + return update_fn( + cache_position, + layer_idx, + key_states, + value_states, + k_out, + v_out, + k_out.shape[2], + ) + + def get_max_cache_shape(self) -> Optional[int]: + return self.max_cache_len + + def get_seq_length(self, layer_idx: Optional[int] = 0): + # Occupied cache == any slot in the 3rd dim (sequence length) holds a non-zero value. To save on compute, let's + # limit the check to the first batch member and head dimension. + # TODO: deprecate this function in favor of `cache_position` + if layer_idx != 0: + raise ValueError( + "`get_seq_length` on `HybridCache` may get inconsistent results depending on the layer index. " + "Using the `layer_idx` argument is not supported." + ) + if len(self.key_cache) == 0: + return 0 + return (self.key_cache[layer_idx][0, 0].any(dim=-1)).sum() + + def reset(self): + """Resets the cache values while preserving the objects""" + for layer_idx in range(len(self.key_cache)): + # In-place ops prevent breaking the static address + self.key_cache[layer_idx].zero_() + self.value_cache[layer_idx].zero_() + self.cumulative_length = [0 for _ in range(len(self.cumulative_length))] + + +class OffloadedHybridCache(HybridChunkedCache): + def __init__( + self, + config: PretrainedConfig, + max_batch_size: int, + max_cache_len: Optional[int] = None, + device: Union[torch.device, str, None] = None, + dtype: torch.dtype = torch.bfloat16, + offload_device: Union[str, torch.device] = torch.device("cpu"), + layer_device_map: Optional[Dict[int, Union[str, torch.device, int]]] = None, + ): + super().__init__(config, max_batch_size, max_cache_len, device, dtype, layer_device_map) + self.offload_device = torch.device(offload_device) + # Create new CUDA stream for parallel prefetching. + self._prefetch_stream = torch.cuda.Stream() if torch._C._get_accelerator().type == "cuda" else None + # Those will be dynamically created as the other layers (for TP) + self.device_key_cache = None + self.device_value_cache = None + # This gives the index of which on-device full layer to use (we need 2 to avoid race conditions when prefetching) + self.active_device_layer = 0 + + def initialise_cache_layer(self, layer_idx, key_states): + """Overriden to use the correct device if offloaded layer (and pin memory).""" + if len(self.key_cache) > layer_idx: + return + + num_key_value_heads = key_states.shape[1] + device = key_states.device if self.is_sliding[layer_idx] else self.offload_device + pin_memory = not self.is_sliding[layer_idx] + global_cache_shape = (self.max_batch_size, num_key_value_heads, self.max_cache_len, self.head_dim) + sliding_cache_shape = ( + self.max_batch_size, + num_key_value_heads, + self.sliding_window, + self.head_dim, + ) + # Note: `mark_static_address` is used to tag the cache as an fixed data pointer, preventing cuda graph + # breaks when updating the cache. + cache_shape = sliding_cache_shape if self.is_sliding[layer_idx] else global_cache_shape + new_layer_key_cache = torch.zeros(cache_shape, dtype=self._dtype, device=device, pin_memory=pin_memory) + new_layer_value_cache = torch.zeros(cache_shape, dtype=self._dtype, device=device, pin_memory=pin_memory) + torch._dynamo.mark_static_address(new_layer_key_cache) + torch._dynamo.mark_static_address(new_layer_value_cache) + self.key_cache.append(new_layer_key_cache) + self.value_cache.append(new_layer_value_cache) + + # Make sure to initialize the on-device layer if it does not already exist + if self.device_key_cache is None and not self.is_sliding[layer_idx]: + self.device_key_cache = [] + self.device_value_cache = [] + # We need 2 layers to avoid race conditions when prefetching the next one + for _ in range(2): + device_layer_key_cache = torch.zeros(cache_shape, dtype=self._dtype, device=key_states.device) + device_layer_value_cache = torch.zeros(cache_shape, dtype=self._dtype, device=key_states.device) + torch._dynamo.mark_static_address(new_layer_key_cache) + torch._dynamo.mark_static_address(new_layer_value_cache) + self.device_key_cache.append(device_layer_key_cache) + self.device_value_cache.append(device_layer_value_cache) + + def _static_update(self, cache_position, layer_idx, key_states, value_states, k_out, v_out, max_cache_len): + # Wait for prefetch stream if needed + if self._prefetch_stream is not None: + torch.cuda.default_stream(key_states.device).wait_stream(self._prefetch_stream) + + # Get correct on-device layer + k_out = self.device_key_cache[self.active_device_layer] + v_out = self.device_value_cache[self.active_device_layer] + + # Let's prefetch the next layer as soon as possible + self._prefetch_next_layer(layer_idx) + + # Copy to on-device layer + k_out[:, :, cache_position] = key_states + v_out[:, :, cache_position] = value_states + + # Copy to offloaded device + self.key_cache[layer_idx][:, :, cache_position] = key_states.to(self.offload_device) + self.value_cache[layer_idx][:, :, cache_position] = value_states.to(self.offload_device) + + return k_out, v_out + + def _prefetch_next_layer(self, layer_idx: int) -> None: + """Based on current layer_idx, prefetch next full layer to the device.""" + + # Switch the active layer + self.active_device_layer = 0 if self.active_device_layer == 1 else 1 + + # Find the next non-sliding layer + try: + next_layer = layer_idx + 1 + self.is_sliding[layer_idx + 1 :].index(False) + # In this case, we are at the last layer, and we go back to prefect the first one + except ValueError: + next_layer = self.is_sliding.index(False) + + # Alternate between two on-device caches. + if self._prefetch_stream is not None: + with torch.cuda.stream(self._prefetch_stream): + self._prefetch_layer_in_context(next_layer) + else: + self._prefetch_layer_in_context(next_layer) + + def _prefetch_layer_in_context(self, layer_idx: int) -> None: + """Performs the actual copy of the layer to device cache.""" + if len(self.key_cache) >= layer_idx: + self.device_key_cache[self.active_device_layer].copy_(self.key_cache[layer_idx], non_blocking=True) + self.device_value_cache[self.active_device_layer].copy_(self.value_cache[layer_idx], non_blocking=True) + # The layer was not yet initialized + else: + self.device_key_cache[self.active_device_layer].fill_(0.0) + self.device_value_cache[self.active_device_layer].fill_(0.0) + + +class MambaCache: + """ + Cache for mamba model which does not have attention mechanism and key value states. + + Arguments: + config (`PretrainedConfig): + The configuration file defining the shape-related attributes required to initialize the static cache. + max_batch_size (`int`): + The maximum batch size with which the model will be used. Note that a new instance must be instantiated if a smaller batch size is used. + dtype (`torch.dtype`, *optional*, defaults to `torch.float16`): + The default `dtype` to use when initializing the layer. + device (`torch.device` or `str`, *optional*): + The device on which the cache should be initialized. Should be the same as the layer. + + Example: + + ```python + >>> from transformers import AutoTokenizer, MambaForCausalLM, MambaCache + + >>> model = MambaForCausalLM.from_pretrained("state-spaces/mamba-130m-hf") + >>> tokenizer = AutoTokenizer.from_pretrained("state-spaces/mamba-130m-hf") + + >>> inputs = tokenizer(text="My name is Mamba", return_tensors="pt") + + >>> # Prepare a cache class and pass it to model's forward + >>> past_key_values = MambaCache(config=model.config, max_batch_size=1, device=model.device, dtype=model.dtype) + >>> outputs = model(**inputs, past_key_values=past_key_values, use_cache=True) + >>> outputs.past_key_values + MambaCache() + ``` + """ + + is_compileable = True + + # TODO (joao): add layer_device_map arg and update code in `generate` accordingly + def __init__( + self, + config: PretrainedConfig, + max_batch_size: int, + dtype: torch.dtype = torch.float16, + device: Union[torch.device, str, None] = None, + ): + self.max_batch_size = max_batch_size + self._dtype = dtype + self.intermediate_size = config.intermediate_size + self.ssm_state_size = config.state_size + self.conv_kernel_size = config.conv_kernel + + self.conv_states: List[torch.Tensor] = [] + self.ssm_states: List[torch.Tensor] = [] + device = torch.device(device) if device is not None else None + for _ in range(config.num_hidden_layers): + conv_state: torch.Tensor = torch.zeros( + self.max_batch_size, + self.intermediate_size, + self.conv_kernel_size, + device=device, + dtype=self._dtype, + ) + ssm_state: torch.Tensor = torch.zeros( + self.max_batch_size, + self.intermediate_size, + self.ssm_state_size, + device=device, + dtype=self._dtype, + ) + + torch._dynamo.mark_static_address(conv_state) + torch._dynamo.mark_static_address(ssm_state) + self.conv_states.append(conv_state) + self.ssm_states.append(ssm_state) + + def update_conv_state( + self, layer_idx: int, new_conv_state: torch.Tensor, cache_position: torch.LongTensor + ) -> torch.Tensor: + # This `if` blocks is only reached in multigpu and if `layer_device_map` is not passed. It is used + # when the cache is initialized in the forward pass (e.g. Mamba) + if self.conv_states[layer_idx].device != new_conv_state.device: + self.conv_states[layer_idx] = self.conv_states[layer_idx].to(new_conv_state.device) + + conv_state = self.conv_states[layer_idx] + cache_position = cache_position.clamp(0, self.conv_kernel_size - 1) + + conv_state = conv_state.roll(shifts=-1, dims=-1) + conv_state[:, :, cache_position] = new_conv_state.to(device=conv_state.device, dtype=conv_state.dtype) + self.conv_states[layer_idx].zero_() + self.conv_states[layer_idx] += conv_state + return self.conv_states[layer_idx] + + def update_ssm_state(self, layer_idx: int, new_ssm_state: torch.Tensor): + self.ssm_states[layer_idx] = new_ssm_state.to(self.ssm_states[layer_idx].device) + return self.ssm_states[layer_idx] + + def reset(self): + for layer_idx in range(len(self.conv_states)): + # In-place ops prevent breaking the static address + self.conv_states[layer_idx].zero_() + self.ssm_states[layer_idx].zero_() + + +class OffloadedStaticCache(StaticCache): + """ + Static cache class to be used with `torch.compile(model)` that offloads to the CPU or + another device. + + Args: + config (`PretrainedConfig): + The configuration file defining the shape-related attributes required to initialize + the static cache. + max_batch_size (`int`): + The maximum batch size with which the model will be used. + max_cache_len (`int`): + The maximum sequence length with which the model will be used. + device (`Union[str, torch.device]`): + The device on which the cache should be initialized. If you're using more than 1 computation device, you + should pass the `layer_device_map` argument instead. + dtype (`torch.dtype`, *optional*): + The default `dtype` to use when initializing the cache. + offload_device (`Union[str, torch.device]`, *optional*, defaults to `cpu`): + The device to offload to. Defaults to CPU. + layer_device_map (`Dict[int, Union[str, torch.device, int]]`, *optional*): + Mapping between the layers and its device. This is required when you are manually initializing the cache + and the model is splitted between differents gpus. You can know which layers mapped to which device by + checking the associated device_map: `model.hf_device_map`. + + Example: + + ```python + >>> from transformers import AutoTokenizer, AutoModelForCausalLM, OffloadedStaticCache + + >>> model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2") + >>> tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2") + + >>> inputs = tokenizer(text="My name is GPT2", return_tensors="pt") + + >>> # Prepare a cache class and pass it to model's forward + >>> # Leave empty space for 10 new tokens, which can be used when calling forward iteratively 10 times to generate + >>> max_generated_length = inputs.input_ids.shape[1] + 10 + >>> past_key_values = OffloadedStaticCache(config=model.config, max_batch_size=1, max_cache_len=max_generated_length, device=model.device, dtype=model.dtype) + >>> outputs = model(**inputs, past_key_values=past_key_values, use_cache=True) + >>> past_kv_length = outputs.past_key_values # access cache filled with key/values from generation + ``` + """ + + is_compileable = True + + def __init__( + self, + config: PretrainedConfig, + max_batch_size: int, + max_cache_len: Optional[int], + device: Union[str, torch.device], + dtype: Optional[torch.dtype] = None, + offload_device: Union[str, torch.device] = torch.device("cpu"), + layer_device_map: Optional[Dict[int, Union[str, torch.device, int]]] = None, + ) -> None: + super(Cache, self).__init__() + self.max_batch_size = max_batch_size + self.max_cache_len = config.max_position_embeddings if max_cache_len is None else max_cache_len + self.device = torch.device(device) if layer_device_map is None else torch.device(layer_device_map[0]) + self.offload_device = torch.device(offload_device) + self._dtype = dtype if dtype is not None else torch.float32 + + # Some model define a custom `head_dim` != config.hidden_size // config.num_attention_heads + head_dim = config.head_dim if hasattr(config, "head_dim") else config.hidden_size // config.num_attention_heads + + num_key_value_heads = ( + config.num_attention_heads + if getattr(config, "num_key_value_heads", None) is None + else config.num_key_value_heads + ) + + cache_shape = (max_batch_size, num_key_value_heads, self.max_cache_len, head_dim) + + # Create offloaded CPU tensors. + self.key_cache: List[torch.Tensor] = [] + self.value_cache: List[torch.Tensor] = [] + + for i in range(config.num_hidden_layers): + # First layer is always on-device. + device = self.device if i == 0 else self.offload_device + + key_cache, value_cache = self._create_key_value_cache_tensors(cache_shape, device) + + self.key_cache.append(key_cache) + self.value_cache.append(value_cache) + + # Create device tensors. + self._device_key_cache: List[torch.Tensor] = [] + self._device_value_cache: List[torch.Tensor] = [] + + for i in range(2): + key_cache, value_cache = self._create_key_value_cache_tensors(cache_shape, self.device) + + self._device_key_cache.append(key_cache) + self._device_value_cache.append(value_cache) + + # For backwards compatibility. + # TODO(gante): Remove this. + self._seen_tokens = 0 + + # Create new CUDA stream for parallel prefetching. + self._prefetch_stream = torch.cuda.Stream() if self.device.type == "cuda" else None + + def update( + self, + key_states: torch.Tensor, + value_states: torch.Tensor, + layer_idx: int, + cache_kwargs: Optional[Dict[str, Any]] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`. + It is VERY important to index using a tensor, otherwise you introduce a copy to the device. + + Parameters: + key_states (`torch.Tensor`): + The new key states to cache. + value_states (`torch.Tensor`): + The new value states to cache. + layer_idx (`int`): + The index of the layer to cache the states for. + cache_kwargs (`Dict[str, Any]`, *optional*): + Additional arguments for the cache subclass. The `OffloadedStaticCache` needs the + `cache_position` input to know how where to write in the cache. + + Return: + A tuple containing the updated key and value states. + """ + + if layer_idx == 0: + # Update seen tokens. + # TODO(gante): Remove this. + self._seen_tokens += key_states.shape[-2] + + # Always there. + k_out = self.key_cache[0] + v_out = self.value_cache[0] + else: + # Wait for prefetch stream. + if self._prefetch_stream is not None: + torch.cuda.default_stream(self.device).wait_stream(self._prefetch_stream) + + k_out = self._device_key_cache[layer_idx & 1] + v_out = self._device_value_cache[layer_idx & 1] + + self._prefetch_layer(layer_idx + 1) + + cache_position = cache_kwargs.get("cache_position") if cache_kwargs is not None else None + if cache_position is None: + k_out.copy_(key_states) + v_out.copy_(value_states) + + # Copy the values to the offloaded device as well. + if layer_idx == 0: + self.key_cache[layer_idx].copy_(key_states.to(self.offload_device)) + self.value_cache[layer_idx].copy_(value_states.to(self.offload_device)) + else: + # Note: here we use `tensor.index_copy_(dim, index, tensor)` that is equivalent to + # `tensor[:, :, index] = tensor`, but the first one is compile-friendly and it does + # explicitly an in-place operation, that avoids copies and uses less memory. + try: + k_out.index_copy_(2, cache_position, key_states) + v_out.index_copy_(2, cache_position, value_states) + except NotImplementedError: + # The operator 'aten::index_copy.out' is not currently implemented for the MPS + # device. + k_out[:, :, cache_position] = key_states + v_out[:, :, cache_position] = value_states + + # Copy the values to the offloaded device as well. + if layer_idx != 0: + cache_position = cache_position.to(self.offload_device) + key_states = key_states.to(self.offload_device) + value_states = value_states.to(self.offload_device) + + try: + self.key_cache[layer_idx].index_copy_(2, cache_position, key_states) + self.value_cache[layer_idx].index_copy_(2, cache_position, value_states) + except NotImplementedError: + # The operator 'aten::index_copy.out' is not currently implemented for the MPS + # device. + self.key_cache[layer_idx][:, :, cache_position] = key_states + self.value_cache[layer_idx][:, :, cache_position] = value_states + + return k_out, v_out + + def get_seq_length(self, layer_idx: Optional[int] = 0) -> int: + """Returns the sequence length of the cached states that were seen by the model.""" + + # TODO(gante): Remove this. + return self._seen_tokens + + def get_max_cache_shape(self) -> Optional[int]: + """Returns the maximum sequence length of the cached states.""" + + return self.max_cache_len + + def reset(self) -> None: + """Resets the cache values while preserving the objects.""" + + # For backwards compatibility. + # TODO(gante): Remove this. + self._seen_tokens = 0 + + # Zero out cache. + for layer_idx in range(len(self.key_cache)): + # In-place ops prevent breaking the static address. + self.key_cache[layer_idx].zero_() + self.value_cache[layer_idx].zero_() + + @property + def seen_tokens(self) -> int: + # For backwards compatibility. + # TODO(gante): Remove this. + return self._seen_tokens + + def _create_key_value_cache_tensors( + self, shape: Tuple[int, ...], device: torch.device + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Creates K/V cache tensors on a device. Pins memory for CPU tensors. Marks them as static + addresses for non-CPU tensors. + + Args: + shape (`Tuple[int, ...]`): Shape. + device (`torch.device`): Device. + + Returns: + Key and value cache tensors as a tuple. + """ + + is_cpu_device = device == torch.device("cpu") + + key_cache = torch.zeros(shape, dtype=self._dtype, device=device, pin_memory=is_cpu_device) + value_cache = torch.zeros(shape, dtype=self._dtype, device=device, pin_memory=is_cpu_device) + + # Note: `mark_static_address` is used to tag the cache as a fixed data pointer, + # preventing compiled graph breaks when updating the cache. + torch._dynamo.mark_static_address(key_cache) + torch._dynamo.mark_static_address(value_cache) + + return key_cache, value_cache + + def _prefetch_layer(self, layer_idx: int) -> None: + """Prefetch a layer to the device. Needs to be called in order of layer indices.""" + + # Don't fetch layers that do not exist. + if layer_idx >= len(self.key_cache): + return + + # Alternate between two on-device caches. + if self._prefetch_stream is not None: + with torch.cuda.stream(self._prefetch_stream): + self._prefetch_layer_in_context(layer_idx) + else: + self._prefetch_layer_in_context(layer_idx) + + def _prefetch_layer_in_context(self, layer_idx: int) -> None: + """Performs the actual copy of the layer to device cache.""" + + self._device_key_cache[layer_idx & 1].copy_(self.key_cache[layer_idx], non_blocking=True) + self._device_value_cache[layer_idx & 1].copy_(self.value_cache[layer_idx], non_blocking=True) diff --git a/docs/transformers/build/lib/transformers/commands/__init__.py b/docs/transformers/build/lib/transformers/commands/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..aa5d95a85b538171ec9cf4fa16e892df1efdef6b --- /dev/null +++ b/docs/transformers/build/lib/transformers/commands/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from abc import ABC, abstractmethod +from argparse import ArgumentParser + + +class BaseTransformersCLICommand(ABC): + @staticmethod + @abstractmethod + def register_subcommand(parser: ArgumentParser): + raise NotImplementedError() + + @abstractmethod + def run(self): + raise NotImplementedError() diff --git a/docs/transformers/build/lib/transformers/commands/add_fast_image_processor.py b/docs/transformers/build/lib/transformers/commands/add_fast_image_processor.py new file mode 100644 index 0000000000000000000000000000000000000000..dff3cb2d7bc6061aacc6bf9d82f81d3abe0e7b54 --- /dev/null +++ b/docs/transformers/build/lib/transformers/commands/add_fast_image_processor.py @@ -0,0 +1,534 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import re +from argparse import ArgumentParser, Namespace +from datetime import date +from pathlib import Path + +from ..utils import logging +from . import BaseTransformersCLICommand + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +CURRENT_YEAR = date.today().year +TRANSFORMERS_PATH = Path(__file__).parent.parent +REPO_PATH = TRANSFORMERS_PATH.parent.parent + + +def add_fast_image_processor_to_model_init( + fast_image_processing_module_file: str, fast_image_processor_name, model_name: str +): + """ + Add the fast image processor to the __init__.py file of the model. + """ + with open(TRANSFORMERS_PATH / "models" / model_name / "__init__.py", "r", encoding="utf-8") as f: + content = f.read() + + fast_image_processing_module_file = fast_image_processing_module_file.split(os.sep)[-1].replace(".py", "") + + if "import *" in content: + # we have an init file in the updated format + # get the indented block after if TYPE_CHECKING: and before else:, append the new import, sort the imports and write the updated content + # Step 1: Find the block + block_regex = re.compile( + r"if TYPE_CHECKING:\n(?P.*?)(?=\s*else:)", + re.DOTALL, + ) + match = block_regex.search(content) + + if not match: + raise ValueError("Couldn't find the 'if TYPE_CHECKING' block.") + + block_content = match.group("if_block") # The captured import block + + # Step 2: Parse existing entries + entries = block_content.split("\n") + indent = " " * (len(entries[0]) - len(entries[0].lstrip())) + new_entry = f"{indent}from .{fast_image_processing_module_file} import *" + if new_entry not in entries: + entries.append(new_entry) + entries.sort() + updated_block = "\n".join(entry for entry in entries) + + # Replace the original block in the content + updated_content = content[: match.start("if_block")] + updated_block + content[match.end("if_block") :] + else: + # we have an init file in the old format + + # add "is_torchvision_available" import to from ...utils import ( + # Regex to match import statements from transformers.utils + pattern = r""" + from\s+\.\.\.utils\s+import\s+ + (?: # Non-capturing group for either: + ([\w, ]+) # 1. Single-line imports (e.g., 'a, b') + | # OR + \((.*?)\) # 2. Multi-line imports (e.g., '(a, ... b)') + ) + """ + regex = re.compile(pattern, re.VERBOSE | re.DOTALL) + + def replacement_function(match): + # Extract existing imports + imports = (match.group(1) or match.group(2)).split(",") + imports = imports[:-1] if imports[-1] == "\n" else imports + imports = [imp.strip() for imp in imports] + + # Add the new import if not already present + if "is_torchvision_available" not in imports: + imports.append("is_torchvision_available") + imports.sort() + + # Convert to multi-line import in all cases + updated_imports = "(\n " + ",\n ".join(imports) + ",\n)" + + return f"from ...utils import {updated_imports}" + + # Replace all matches in the file content + updated_content = regex.sub(replacement_function, content) + + vision_import_structure_block = f' _import_structure["{fast_image_processing_module_file[:-5]}"] = ["{fast_image_processor_name[:-4]}"]\n' + + added_import_structure_block = ( + "try:\n if not is_torchvision_available():\n" + " raise OptionalDependencyNotAvailable()\n" + "except OptionalDependencyNotAvailable:\n" + " pass\n" + "else:\n" + f' _import_structure["{fast_image_processing_module_file}"] = ["{fast_image_processor_name}"]\n' + ) + + if vision_import_structure_block not in updated_content: + raise ValueError("Couldn't find the 'vision _import_structure block' block.") + + if added_import_structure_block not in updated_content: + updated_content = updated_content.replace( + vision_import_structure_block, vision_import_structure_block + "\n" + added_import_structure_block + ) + + vision_import_statement_block = ( + f" from .{fast_image_processing_module_file[:-5]} import {fast_image_processor_name[:-4]}\n" + ) + + added_import_statement_block = ( + " try:\n if not is_torchvision_available():\n" + " raise OptionalDependencyNotAvailable()\n" + " except OptionalDependencyNotAvailable:\n" + " pass\n" + " else:\n" + f" from .{fast_image_processing_module_file} import {fast_image_processor_name}\n" + ) + + if vision_import_statement_block not in updated_content: + raise ValueError("Couldn't find the 'vision _import_structure block' block.") + + if added_import_statement_block not in updated_content: + updated_content = updated_content.replace( + vision_import_statement_block, vision_import_statement_block + "\n" + added_import_statement_block + ) + + # write the updated content + with open(TRANSFORMERS_PATH / "models" / model_name / "__init__.py", "w", encoding="utf-8") as f: + f.write(updated_content) + + +def add_fast_image_processor_to_auto(image_processor_name: str, fast_image_processor_name: str): + """ + Add the fast image processor to the auto module. + """ + with open(TRANSFORMERS_PATH / "models" / "auto" / "image_processing_auto.py", "r", encoding="utf-8") as f: + content = f.read() + + # get all lines containing the image processor name + updated_content = content.replace( + f'("{image_processor_name}",)', f'("{image_processor_name}", "{fast_image_processor_name}")' + ) + + # write the updated content + with open(TRANSFORMERS_PATH / "models" / "auto" / "image_processing_auto.py", "w", encoding="utf-8") as f: + f.write(updated_content) + + +def add_fast_image_processor_to_doc(fast_image_processor_name: str, model_name: str): + """ + Add the fast image processor to the model's doc file. + """ + doc_source = REPO_PATH / "docs" / "source" + # find the doc files + doc_files = list(doc_source.glob(f"*/model_doc/{model_name}.md")) + if not doc_files: + # try again with "-" + doc_files = list(doc_source.glob(f"*/model_doc/{model_name.replace('_', '-')}.md")) + if not doc_files: + raise ValueError(f"No doc files found for {model_name}") + + base_doc_string = ( + f"## {fast_image_processor_name[:-4]}\n\n[[autodoc]] {fast_image_processor_name[:-4]}\n - preprocess" + ) + fast_doc_string = f"## {fast_image_processor_name}\n\n[[autodoc]] {fast_image_processor_name}\n - preprocess" + + for doc_file in doc_files: + with open(doc_file, "r", encoding="utf-8") as f: + content = f.read() + + if fast_doc_string not in content: + # add the fast image processor to the doc + updated_content = content.replace( + base_doc_string, + base_doc_string + "\n\n" + fast_doc_string, + ) + + # write the updated content + with open(doc_file, "w", encoding="utf-8") as f: + f.write(updated_content) + + +def add_fast_image_processor_to_tests(fast_image_processor_name: str, model_name: str): + """ + Add the fast image processor to the image processing tests. + """ + tests_path = REPO_PATH / "tests" / "models" / model_name + test_file = tests_path / f"test_image_processing_{model_name}.py" + if not os.path.exists(test_file): + logger.warning(f"No test file found for {model_name}. Skipping.") + return + + with open(test_file, "r", encoding="utf-8") as f: + content = f.read() + + # add is_torchvision_available import to the imports + # Regex to match import statements from transformers.utils + pattern = r""" + from\s+transformers\.utils\s+import\s+ + (?: # Non-capturing group for either: + ([\w, ]+) # 1. Single-line imports (e.g., 'a, b') + | # OR + \((.*?)\) # 2. Multi-line imports (e.g., '(a, ... b)') + ) + """ + regex = re.compile(pattern, re.VERBOSE | re.DOTALL) + + def replacement_function(match): + # Extract existing imports + existing_imports = (match.group(1) or match.group(2)).split(",") + existing_imports = existing_imports[:-1] if existing_imports[-1] == "\n" else existing_imports + existing_imports = [imp.strip() for imp in existing_imports] + + # Add the new import if not already present + if "is_torchvision_available" not in existing_imports: + existing_imports.append("is_torchvision_available") + existing_imports.sort() + + # Rebuild the import statement + if match.group(1): # Single-line import + updated_imports = ", ".join(existing_imports) + else: # Multi-line import + updated_imports = "(\n " + ",\n ".join(existing_imports) + ",\n)" + + return f"from transformers.utils import {updated_imports}" + + # Replace all matches in the file content + updated_content = regex.sub(replacement_function, content) + + # add the fast image processor to the imports + base_import_string = f" from transformers import {fast_image_processor_name[:-4]}" + fast_import_string = ( + f" if is_torchvision_available():\n from transformers import {fast_image_processor_name}" + ) + if fast_import_string not in updated_content: + updated_content = updated_content.replace(base_import_string, base_import_string + "\n\n" + fast_import_string) + + # get line starting with " image_processing_class = " and add a line after it starting with " fast_image_processing_class = " + image_processing_class_line = re.search(r" image_processing_class = .*", updated_content) + if not image_processing_class_line: + logger.warning(f"Couldn't find the 'image_processing_class' line in {test_file}. Skipping.") + return + + fast_image_processing_class_line = ( + f" fast_image_processing_class = {fast_image_processor_name} if is_torchvision_available() else None" + ) + if " fast_image_processing_class = " not in updated_content: + updated_content = updated_content.replace( + image_processing_class_line.group(0), + image_processing_class_line.group(0) + "\n" + fast_image_processing_class_line, + ) + + # write the updated content + with open(test_file, "w", encoding="utf-8") as f: + f.write(updated_content) + + +def get_fast_image_processing_content_header(content: str) -> str: + """ + Get the header of the slow image processor file. + """ + # get all the commented lines at the beginning of the file + content_header = re.search(r"^# coding=utf-8\n(#[^\n]*\n)*", content, re.MULTILINE) + if not content_header: + logger.warning("Couldn't find the content header in the slow image processor file. Using a default header.") + return ( + f"# coding=utf-8\n" + f"# Copyright {CURRENT_YEAR} The HuggingFace Team. All rights reserved.\n" + f"#\n" + f'# Licensed under the Apache License, Version 2.0 (the "License");\n' + f"# you may not use this file except in compliance with the License.\n" + f"# You may obtain a copy of the License at\n" + f"#\n" + f"# http://www.apache.org/licenses/LICENSE-2.0\n" + f"#\n" + f"# Unless required by applicable law or agreed to in writing, software\n" + f'# distributed under the License is distributed on an "AS IS" BASIS,\n' + f"# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n" + f"# See the License for the specific language governing permissions and\n" + f"# limitations under the License.\n" + f"\n" + ) + content_header = content_header.group(0) + # replace the year in the copyright + content_header = re.sub(r"# Copyright (\d+)\s", f"# Copyright {CURRENT_YEAR} ", content_header) + # get the line starting with """Image processor in content if it exists + match = re.search(r'^"""Image processor.*$', content, re.MULTILINE) + if match: + content_header += match.group(0).replace("Image processor", "Fast Image processor") + + return content_header + + +def write_default_fast_image_processor_file( + fast_image_processing_module_file: str, fast_image_processor_name: str, content_base_file: str +): + """ + Write a default fast image processor file. Used when encountering a problem while parsing the slow image processor file. + """ + imports = "\n\nfrom ...image_processing_utils_fast import BaseImageProcessorFast\n\n\n" + content_header = get_fast_image_processing_content_header(content_base_file) + content_base_file = ( + f"class {fast_image_processor_name}(BaseImageProcessorFast):\n" + " # To be implemented\n" + " resample = None\n" + " image_mean = None\n" + " image_std = None\n" + " size = None\n" + " default_to_square = None\n" + " crop_size = None\n" + " do_resize = None\n" + " do_center_crop = None\n" + " do_rescale = None\n" + " do_normalize = None\n" + " do_convert_rgb = None\n\n\n" + f'__all__ = ["{fast_image_processor_name}"]\n' + ) + + content = content_header + imports + content_base_file + + with open(fast_image_processing_module_file, "w", encoding="utf-8") as f: + f.write(content) + + +def add_fast_image_processor_file( + fast_image_processing_module_file: str, fast_image_processor_name: str, content_base_file: str +): + """ + Add the fast image processor file to the model's folder. + """ + # if the file already exists, do nothing + if os.path.exists(fast_image_processing_module_file): + print(f"{fast_image_processing_module_file} already exists. Skipping.") + return + + regex = rf"class {fast_image_processor_name[:-4]}.*?(\n\S|$)" + match = re.search(regex, content_base_file, re.DOTALL) + if not match: + print(f"Couldn't find the {fast_image_processor_name[:-4]} class in {fast_image_processing_module_file}") + print("Creating a new file with the default content.") + return write_default_fast_image_processor_file( + fast_image_processing_module_file, fast_image_processor_name, content_base_file + ) + # Exclude the last unindented line + slow_class_content = match.group(0).rstrip() + # get default args: + # find the __init__ block which start with def __init__ and ends with def + match = re.search(r"def __init__.*?def ", slow_class_content, re.DOTALL) + if not match: + print( + f"Couldn't find the __init__ block for {fast_image_processor_name[:-4]} in {fast_image_processing_module_file}" + ) + print("Creating a new file with the default content.") + return write_default_fast_image_processor_file( + fast_image_processing_module_file, fast_image_processor_name, content_base_file + ) + init = match.group(0) + init_signature_block = init.split(")")[0] + arg_names = init_signature_block.split(":") + arg_names = [arg_name.split("\n")[-1].strip() for arg_name in arg_names] + # get the default values + default_args = re.findall(r"= (.*?)(?:,|\))", init_signature_block) + + # build default args dict + default_args_dict = dict(zip(arg_names, default_args)) + pattern_default_size = r"size = size if size is not None else\s+(.*)" + match_default_size = re.findall(pattern_default_size, init) + default_args_dict["size"] = match_default_size[0] if match_default_size else None + pattern_default_crop_size = r"crop_size = crop_size if crop_size is not None else\s+(.*)" + match_default_crop_size = re.findall(pattern_default_crop_size, init) + default_args_dict["crop_size"] = match_default_crop_size[0] if match_default_crop_size else None + pattern_default_image_mean = r"self.image_mean = image_mean if image_mean is not None else\s+(.*)" + match_default_image_mean = re.findall(pattern_default_image_mean, init) + default_args_dict["image_mean"] = match_default_image_mean[0] if match_default_image_mean else None + pattern_default_image_std = r"self.image_std = image_std if image_std is not None else\s+(.*)" + match_default_image_std = re.findall(pattern_default_image_std, init) + default_args_dict["image_std"] = match_default_image_std[0] if match_default_image_std else None + default_args_dict["default_to_square"] = False if "(size, default_to_square=False" in init else None + + content_header = get_fast_image_processing_content_header(content_base_file) + content_base_file = ( + f"@add_start_docstrings(\n" + f' "Constructs a fast {fast_image_processor_name.replace("ImageProcessorFast", "")} image processor.",\n' + f" BASE_IMAGE_PROCESSOR_FAST_DOCSTRING,\n)\n" + f"class {fast_image_processor_name}(BaseImageProcessorFast):\n" + " # This generated class can be used as a starting point for the fast image processor.\n" + " # if the image processor is only used for simple augmentations, such as resizing, center cropping, rescaling, or normalizing,\n" + " # only the default values should be set in the class.\n" + " # If the image processor requires more complex augmentations, methods from BaseImageProcessorFast can be overridden.\n" + " # In most cases, only the `_preprocess` method should be overridden.\n\n" + " # For an example of a fast image processor requiring more complex augmentations, see `LlavaNextImageProcessorFast`.\n\n" + " # Default values should be checked against the slow image processor\n" + " # None values left after checking can be removed\n" + f" resample = {default_args_dict.get('resample')}\n" + f" image_mean = {default_args_dict.get('image_mean')}\n" + f" image_std = {default_args_dict.get('image_std')}\n" + f" size = {default_args_dict.get('size')}\n" + f" default_to_square = {default_args_dict.get('default_to_square')}\n" + f" crop_size = {default_args_dict.get('crop_size')}\n" + f" do_resize = {default_args_dict.get('do_resize')}\n" + f" do_center_crop = {default_args_dict.get('do_center_crop')}\n" + f" do_rescale = {default_args_dict.get('do_rescale')}\n" + f" do_normalize = {default_args_dict.get('do_normalize')}\n" + f" do_convert_rgb = {default_args_dict.get('do_convert_rgb')}\n\n\n" + f'__all__ = ["{fast_image_processor_name}"]\n' + ) + + imports = ( + "\n\nfrom ...image_processing_utils_fast import BASE_IMAGE_PROCESSOR_FAST_DOCSTRING, BaseImageProcessorFast\n" + ) + image_utils_imports = [] + if default_args_dict.get("resample") is not None and "PILImageResampling" in default_args_dict.get("resample"): + image_utils_imports.append("PILImageResampling") + if default_args_dict.get("image_mean") is not None and not any( + char.isdigit() for char in default_args_dict.get("image_mean") + ): + image_utils_imports.append(default_args_dict.get("image_mean")) + if default_args_dict.get("image_std") is not None and not any( + char.isdigit() for char in default_args_dict.get("image_std") + ): + image_utils_imports.append(default_args_dict.get("image_std")) + + if image_utils_imports: + # sort imports + image_utils_imports.sort() + imports += f"from ...image_utils import {', '.join(image_utils_imports)}\n" + + imports += "from ...utils import add_start_docstrings\n" + + content = content_header + imports + "\n\n" + content_base_file + + with open(fast_image_processing_module_file, "w", encoding="utf-8") as f: + f.write(content) + + +def add_fast_image_processor(model_name: str): + """ + Add the necessary references to the fast image processor in the transformers package, + and create the fast image processor file in the model's folder. + """ + model_module = TRANSFORMERS_PATH / "models" / model_name + image_processing_module_file = list(model_module.glob("image_processing*.py")) + if not image_processing_module_file: + raise ValueError(f"No image processing module found in {model_module}") + elif len(image_processing_module_file) > 1: + for file_name in image_processing_module_file: + if not str(file_name).endswith("_fast.py"): + image_processing_module_file = str(file_name) + break + else: + image_processing_module_file = str(image_processing_module_file[0]) + + with open(image_processing_module_file, "r", encoding="utf-8") as f: + content_base_file = f.read() + + # regex to find object starting with "class " and ending with "ImageProcessor", including "ImageProcessor" in the match + image_processor_name = re.findall(r"class (\w*ImageProcessor)", content_base_file) + if not image_processor_name: + raise ValueError(f"No ImageProcessor class found in {image_processing_module_file}") + elif len(image_processor_name) > 1: + raise ValueError(f"Multiple ImageProcessor classes found in {image_processing_module_file}") + + image_processor_name = image_processor_name[0] + fast_image_processor_name = image_processor_name + "Fast" + fast_image_processing_module_file = image_processing_module_file.replace(".py", "_fast.py") + + print(f"Adding {fast_image_processor_name} to {fast_image_processing_module_file}") + + add_fast_image_processor_to_model_init( + fast_image_processing_module_file=fast_image_processing_module_file, + fast_image_processor_name=fast_image_processor_name, + model_name=model_name, + ) + + add_fast_image_processor_to_auto( + image_processor_name=image_processor_name, + fast_image_processor_name=fast_image_processor_name, + ) + + add_fast_image_processor_to_doc( + fast_image_processor_name=fast_image_processor_name, + model_name=model_name, + ) + + add_fast_image_processor_to_tests( + fast_image_processor_name=fast_image_processor_name, + model_name=model_name, + ) + + add_fast_image_processor_file( + fast_image_processing_module_file=fast_image_processing_module_file, + fast_image_processor_name=fast_image_processor_name, + content_base_file=content_base_file, + ) + + +def add_new_model_like_command_factory(args: Namespace): + return AddFastImageProcessorCommand(model_name=args.model_name) + + +class AddFastImageProcessorCommand(BaseTransformersCLICommand): + @staticmethod + def register_subcommand(parser: ArgumentParser): + add_fast_image_processor_parser = parser.add_parser("add-fast-image-processor") + add_fast_image_processor_parser.add_argument( + "--model-name", + type=str, + required=True, + help="The name of the folder containing the model's implementation.", + ) + add_fast_image_processor_parser.set_defaults(func=add_new_model_like_command_factory) + + def __init__(self, model_name: str, *args): + self.model_name = model_name + + def run(self): + add_fast_image_processor(model_name=self.model_name) diff --git a/docs/transformers/build/lib/transformers/commands/add_new_model_like.py b/docs/transformers/build/lib/transformers/commands/add_new_model_like.py new file mode 100644 index 0000000000000000000000000000000000000000..bfb812340e72bdc4543581d1e53bafb6730eb622 --- /dev/null +++ b/docs/transformers/build/lib/transformers/commands/add_new_model_like.py @@ -0,0 +1,1791 @@ +# Copyright 2021 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import difflib +import json +import os +import re +from argparse import ArgumentParser, Namespace +from dataclasses import dataclass +from datetime import date +from itertools import chain +from pathlib import Path +from typing import Any, Callable, Dict, List, Optional, Pattern, Tuple, Union + +import yaml + +from ..models import auto as auto_module +from ..models.auto.configuration_auto import model_type_to_module_name +from ..utils import is_flax_available, is_tf_available, is_torch_available, logging +from . import BaseTransformersCLICommand +from .add_fast_image_processor import add_fast_image_processor + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +CURRENT_YEAR = date.today().year +TRANSFORMERS_PATH = Path(__file__).parent.parent +REPO_PATH = TRANSFORMERS_PATH.parent.parent + + +@dataclass +class ModelPatterns: + """ + Holds the basic information about a new model for the add-new-model-like command. + + Args: + model_name (`str`): The model name. + checkpoint (`str`): The checkpoint to use for doc examples. + model_type (`str`, *optional*): + The model type, the identifier used internally in the library like `bert` or `xlm-roberta`. Will default to + `model_name` lowercased with spaces replaced with minuses (-). + model_lower_cased (`str`, *optional*): + The lowercased version of the model name, to use for the module name or function names. Will default to + `model_name` lowercased with spaces and minuses replaced with underscores. + model_camel_cased (`str`, *optional*): + The camel-cased version of the model name, to use for the class names. Will default to `model_name` + camel-cased (with spaces and minuses both considered as word separators. + model_upper_cased (`str`, *optional*): + The uppercased version of the model name, to use for the constant names. Will default to `model_name` + uppercased with spaces and minuses replaced with underscores. + config_class (`str`, *optional*): + The tokenizer class associated with this model. Will default to `"{model_camel_cased}Config"`. + tokenizer_class (`str`, *optional*): + The tokenizer class associated with this model (leave to `None` for models that don't use a tokenizer). + image_processor_class (`str`, *optional*): + The image processor class associated with this model (leave to `None` for models that don't use an image + processor). + image_processor_fast_class (`str`, *optional*): + The fast image processor class associated with this model (leave to `None` for models that don't use a fast + image processor). + feature_extractor_class (`str`, *optional*): + The feature extractor class associated with this model (leave to `None` for models that don't use a feature + extractor). + processor_class (`str`, *optional*): + The processor class associated with this model (leave to `None` for models that don't use a processor). + """ + + model_name: str + checkpoint: str + model_type: Optional[str] = None + model_lower_cased: Optional[str] = None + model_camel_cased: Optional[str] = None + model_upper_cased: Optional[str] = None + config_class: Optional[str] = None + tokenizer_class: Optional[str] = None + image_processor_class: Optional[str] = None + image_processor_fast_class: Optional[str] = None + feature_extractor_class: Optional[str] = None + processor_class: Optional[str] = None + + def __post_init__(self): + if self.model_type is None: + self.model_type = self.model_name.lower().replace(" ", "-") + if self.model_lower_cased is None: + self.model_lower_cased = self.model_name.lower().replace(" ", "_").replace("-", "_") + if self.model_camel_cased is None: + # Split the model name on - and space + words = self.model_name.split(" ") + words = list(chain(*[w.split("-") for w in words])) + # Make sure each word is capitalized + words = [w[0].upper() + w[1:] for w in words] + self.model_camel_cased = "".join(words) + if self.model_upper_cased is None: + self.model_upper_cased = self.model_name.upper().replace(" ", "_").replace("-", "_") + if self.config_class is None: + self.config_class = f"{self.model_camel_cased}Config" + + +ATTRIBUTE_TO_PLACEHOLDER = { + "config_class": "[CONFIG_CLASS]", + "tokenizer_class": "[TOKENIZER_CLASS]", + "image_processor_class": "[IMAGE_PROCESSOR_CLASS]", + "image_processor_fast_class": "[IMAGE_PROCESSOR_FAST_CLASS]", + "feature_extractor_class": "[FEATURE_EXTRACTOR_CLASS]", + "processor_class": "[PROCESSOR_CLASS]", + "checkpoint": "[CHECKPOINT]", + "model_type": "[MODEL_TYPE]", + "model_upper_cased": "[MODEL_UPPER_CASED]", + "model_camel_cased": "[MODEL_CAMELCASED]", + "model_lower_cased": "[MODEL_LOWER_CASED]", + "model_name": "[MODEL_NAME]", +} + + +def is_empty_line(line: str) -> bool: + """ + Determines whether a line is empty or not. + """ + return len(line) == 0 or line.isspace() + + +def find_indent(line: str) -> int: + """ + Returns the number of spaces that start a line indent. + """ + search = re.search(r"^(\s*)(?:\S|$)", line) + if search is None: + return 0 + return len(search.groups()[0]) + + +def parse_module_content(content: str) -> List[str]: + """ + Parse the content of a module in the list of objects it defines. + + Args: + content (`str`): The content to parse + + Returns: + `List[str]`: The list of objects defined in the module. + """ + objects = [] + current_object = [] + lines = content.split("\n") + # Doc-styler takes everything between two triple quotes in docstrings, so we need a fake """ here to go with this. + end_markers = [")", "]", "}", '"""'] + + for line in lines: + # End of an object + is_valid_object = len(current_object) > 0 + if is_valid_object and len(current_object) == 1: + is_valid_object = not current_object[0].startswith("# Copied from") + if not is_empty_line(line) and find_indent(line) == 0 and is_valid_object: + # Closing parts should be included in current object + if line in end_markers: + current_object.append(line) + objects.append("\n".join(current_object)) + current_object = [] + else: + objects.append("\n".join(current_object)) + current_object = [line] + else: + current_object.append(line) + + # Add last object + if len(current_object) > 0: + objects.append("\n".join(current_object)) + + return objects + + +def extract_block(content: str, indent_level: int = 0) -> str: + """Return the first block in `content` with the indent level `indent_level`. + + The first line in `content` should be indented at `indent_level` level, otherwise an error will be thrown. + + This method will immediately stop the search when a (non-empty) line with indent level less than `indent_level` is + encountered. + + Args: + content (`str`): The content to parse + indent_level (`int`, *optional*, default to 0): The indent level of the blocks to search for + + Returns: + `str`: The first block in `content` with the indent level `indent_level`. + """ + current_object = [] + lines = content.split("\n") + # Doc-styler takes everything between two triple quotes in docstrings, so we need a fake """ here to go with this. + end_markers = [")", "]", "}", '"""'] + + for idx, line in enumerate(lines): + if idx == 0 and indent_level > 0 and not is_empty_line(line) and find_indent(line) != indent_level: + raise ValueError( + f"When `indent_level > 0`, the first line in `content` should have indent level {indent_level}. Got " + f"{find_indent(line)} instead." + ) + + if find_indent(line) < indent_level and not is_empty_line(line): + break + + # End of an object + is_valid_object = len(current_object) > 0 + if ( + not is_empty_line(line) + and not line.endswith(":") + and find_indent(line) == indent_level + and is_valid_object + ): + # Closing parts should be included in current object + if line.lstrip() in end_markers: + current_object.append(line) + return "\n".join(current_object) + else: + current_object.append(line) + + # Add last object + if len(current_object) > 0: + return "\n".join(current_object) + + +def add_content_to_text( + text: str, + content: str, + add_after: Optional[Union[str, Pattern]] = None, + add_before: Optional[Union[str, Pattern]] = None, + exact_match: bool = False, +) -> str: + """ + A utility to add some content inside a given text. + + Args: + text (`str`): The text in which we want to insert some content. + content (`str`): The content to add. + add_after (`str` or `Pattern`): + The pattern to test on a line of `text`, the new content is added after the first instance matching it. + add_before (`str` or `Pattern`): + The pattern to test on a line of `text`, the new content is added before the first instance matching it. + exact_match (`bool`, *optional*, defaults to `False`): + A line is considered a match with `add_after` or `add_before` if it matches exactly when `exact_match=True`, + otherwise, if `add_after`/`add_before` is present in the line. + + + + The arguments `add_after` and `add_before` are mutually exclusive, and one exactly needs to be provided. + + + + Returns: + `str`: The text with the new content added if a match was found. + """ + if add_after is None and add_before is None: + raise ValueError("You need to pass either `add_after` or `add_before`") + if add_after is not None and add_before is not None: + raise ValueError("You can't pass both `add_after` or `add_before`") + pattern = add_after if add_before is None else add_before + + def this_is_the_line(line): + if isinstance(pattern, Pattern): + return pattern.search(line) is not None + elif exact_match: + return pattern == line + else: + return pattern in line + + new_lines = [] + for line in text.split("\n"): + if this_is_the_line(line): + if add_before is not None: + new_lines.append(content) + new_lines.append(line) + if add_after is not None: + new_lines.append(content) + else: + new_lines.append(line) + + return "\n".join(new_lines) + + +def add_content_to_file( + file_name: Union[str, os.PathLike], + content: str, + add_after: Optional[Union[str, Pattern]] = None, + add_before: Optional[Union[str, Pattern]] = None, + exact_match: bool = False, +): + """ + A utility to add some content inside a given file. + + Args: + file_name (`str` or `os.PathLike`): The name of the file in which we want to insert some content. + content (`str`): The content to add. + add_after (`str` or `Pattern`): + The pattern to test on a line of `text`, the new content is added after the first instance matching it. + add_before (`str` or `Pattern`): + The pattern to test on a line of `text`, the new content is added before the first instance matching it. + exact_match (`bool`, *optional*, defaults to `False`): + A line is considered a match with `add_after` or `add_before` if it matches exactly when `exact_match=True`, + otherwise, if `add_after`/`add_before` is present in the line. + + + + The arguments `add_after` and `add_before` are mutually exclusive, and one exactly needs to be provided. + + + """ + with open(file_name, "r", encoding="utf-8") as f: + old_content = f.read() + + new_content = add_content_to_text( + old_content, content, add_after=add_after, add_before=add_before, exact_match=exact_match + ) + + with open(file_name, "w", encoding="utf-8") as f: + f.write(new_content) + + +def replace_model_patterns( + text: str, old_model_patterns: ModelPatterns, new_model_patterns: ModelPatterns +) -> Tuple[str, str]: + """ + Replace all patterns present in a given text. + + Args: + text (`str`): The text to treat. + old_model_patterns (`ModelPatterns`): The patterns for the old model. + new_model_patterns (`ModelPatterns`): The patterns for the new model. + + Returns: + `Tuple(str, str)`: A tuple of with the treated text and the replacement actually done in it. + """ + # The order is crucially important as we will check and replace in that order. For instance the config probably + # contains the camel-cased named, but will be treated before. + attributes_to_check = ["config_class"] + # Add relevant preprocessing classes + for attr in [ + "tokenizer_class", + "image_processor_class", + "image_processor_fast_class", + "feature_extractor_class", + "processor_class", + ]: + if getattr(old_model_patterns, attr) is not None and getattr(new_model_patterns, attr) is not None: + attributes_to_check.append(attr) + + # Special cases for checkpoint and model_type + if old_model_patterns.checkpoint not in [old_model_patterns.model_type, old_model_patterns.model_lower_cased]: + attributes_to_check.append("checkpoint") + if old_model_patterns.model_type != old_model_patterns.model_lower_cased: + attributes_to_check.append("model_type") + else: + text = re.sub( + rf'(\s*)model_type = "{old_model_patterns.model_type}"', + r'\1model_type = "[MODEL_TYPE]"', + text, + ) + + # Special case when the model camel cased and upper cased names are the same for the old model (like for GPT2) but + # not the new one. We can't just do a replace in all the text and will need a special regex + if old_model_patterns.model_upper_cased == old_model_patterns.model_camel_cased: + old_model_value = old_model_patterns.model_upper_cased + if re.search(rf"{old_model_value}_[A-Z_]*[^A-Z_]", text) is not None: + text = re.sub(rf"{old_model_value}([A-Z_]*)([^a-zA-Z_])", r"[MODEL_UPPER_CASED]\1\2", text) + else: + attributes_to_check.append("model_upper_cased") + + attributes_to_check.extend(["model_camel_cased", "model_lower_cased", "model_name"]) + + # Now let's replace every other attribute by their placeholder + for attr in attributes_to_check: + text = text.replace(getattr(old_model_patterns, attr), ATTRIBUTE_TO_PLACEHOLDER[attr]) + + # Finally we can replace the placeholder byt the new values. + replacements = [] + for attr, placeholder in ATTRIBUTE_TO_PLACEHOLDER.items(): + if placeholder in text: + replacements.append((getattr(old_model_patterns, attr), getattr(new_model_patterns, attr))) + text = text.replace(placeholder, getattr(new_model_patterns, attr)) + + # If we have two inconsistent replacements, we don't return anything (ex: GPT2->GPT_NEW and GPT2->GPTNew) + old_replacement_values = [old for old, new in replacements] + if len(set(old_replacement_values)) != len(old_replacement_values): + return text, "" + + replacements = simplify_replacements(replacements) + replacements = [f"{old}->{new}" for old, new in replacements] + return text, ",".join(replacements) + + +def simplify_replacements(replacements): + """ + Simplify a list of replacement patterns to make sure there are no needless ones. + + For instance in the sequence "Bert->BertNew, BertConfig->BertNewConfig, bert->bert_new", the replacement + "BertConfig->BertNewConfig" is implied by "Bert->BertNew" so not needed. + + Args: + replacements (`List[Tuple[str, str]]`): List of patterns (old, new) + + Returns: + `List[Tuple[str, str]]`: The list of patterns simplified. + """ + if len(replacements) <= 1: + # Nothing to simplify + return replacements + + # Next let's sort replacements by length as a replacement can only "imply" another replacement if it's shorter. + replacements.sort(key=lambda x: len(x[0])) + + idx = 0 + while idx < len(replacements): + old, new = replacements[idx] + # Loop through all replacements after + j = idx + 1 + while j < len(replacements): + old_2, new_2 = replacements[j] + # If the replacement is implied by the current one, we can drop it. + if old_2.replace(old, new) == new_2: + replacements.pop(j) + else: + j += 1 + idx += 1 + + return replacements + + +def get_module_from_file(module_file: Union[str, os.PathLike]) -> str: + """ + Returns the module name corresponding to a module file. + """ + full_module_path = Path(module_file).absolute() + module_parts = full_module_path.with_suffix("").parts + + # Find the first part named transformers, starting from the end. + idx = len(module_parts) - 1 + while idx >= 0 and module_parts[idx] != "transformers": + idx -= 1 + if idx < 0: + raise ValueError(f"{module_file} is not a transformers module.") + + return ".".join(module_parts[idx:]) + + +SPECIAL_PATTERNS = { + "_CHECKPOINT_FOR_DOC =": "checkpoint", + "_CONFIG_FOR_DOC =": "config_class", + "_TOKENIZER_FOR_DOC =": "tokenizer_class", + "_IMAGE_PROCESSOR_FOR_DOC =": "image_processor_class", + "_FEAT_EXTRACTOR_FOR_DOC =": "feature_extractor_class", + "_PROCESSOR_FOR_DOC =": "processor_class", +} + + +_re_class_func = re.compile(r"^(?:class|def)\s+([^\s:\(]+)\s*(?:\(|\:)", flags=re.MULTILINE) + + +def remove_attributes(obj, target_attr): + """Remove `target_attr` in `obj`.""" + lines = obj.split(os.linesep) + + target_idx = None + for idx, line in enumerate(lines): + # search for assignment + if line.lstrip().startswith(f"{target_attr} = "): + target_idx = idx + break + # search for function/method definition + elif line.lstrip().startswith(f"def {target_attr}("): + target_idx = idx + break + + # target not found + if target_idx is None: + return obj + + line = lines[target_idx] + indent_level = find_indent(line) + # forward pass to find the ending of the block (including empty lines) + parsed = extract_block("\n".join(lines[target_idx:]), indent_level) + num_lines = len(parsed.split("\n")) + for idx in range(num_lines): + lines[target_idx + idx] = None + + # backward pass to find comments or decorator + for idx in range(target_idx - 1, -1, -1): + line = lines[idx] + if (line.lstrip().startswith("#") or line.lstrip().startswith("@")) and find_indent(line) == indent_level: + lines[idx] = None + else: + break + + new_obj = os.linesep.join([x for x in lines if x is not None]) + + return new_obj + + +def duplicate_module( + module_file: Union[str, os.PathLike], + old_model_patterns: ModelPatterns, + new_model_patterns: ModelPatterns, + dest_file: Optional[str] = None, + add_copied_from: bool = True, + attrs_to_remove: List[str] = None, +): + """ + Create a new module from an existing one and adapting all function and classes names from old patterns to new ones. + + Args: + module_file (`str` or `os.PathLike`): Path to the module to duplicate. + old_model_patterns (`ModelPatterns`): The patterns for the old model. + new_model_patterns (`ModelPatterns`): The patterns for the new model. + dest_file (`str` or `os.PathLike`, *optional*): Path to the new module. + add_copied_from (`bool`, *optional*, defaults to `True`): + Whether or not to add `# Copied from` statements in the duplicated module. + """ + if dest_file is None: + dest_file = str(module_file).replace( + old_model_patterns.model_lower_cased, new_model_patterns.model_lower_cased + ) + + with open(module_file, "r", encoding="utf-8") as f: + content = f.read() + + content = re.sub(r"# Copyright (\d+)\s", f"# Copyright {CURRENT_YEAR} ", content) + objects = parse_module_content(content) + + # Loop and treat all objects + new_objects = [] + for obj in objects: + special_pattern = False + for pattern, attr in SPECIAL_PATTERNS.items(): + if pattern in obj: + obj = obj.replace(getattr(old_model_patterns, attr), getattr(new_model_patterns, attr)) + new_objects.append(obj) + special_pattern = True + break + + if special_pattern: + continue + + # Regular classes functions + old_obj = obj + obj, replacement = replace_model_patterns(obj, old_model_patterns, new_model_patterns) + has_copied_from = re.search(r"^#\s+Copied from", obj, flags=re.MULTILINE) is not None + if add_copied_from and not has_copied_from and _re_class_func.search(obj) is not None and len(replacement) > 0: + # Copied from statement must be added just before the class/function definition, which may not be the + # first line because of decorators. + module_name = get_module_from_file(module_file) + old_object_name = _re_class_func.search(old_obj).groups()[0] + obj = add_content_to_text( + obj, f"# Copied from {module_name}.{old_object_name} with {replacement}", add_before=_re_class_func + ) + # In all cases, we remove Copied from statement with indent on methods. + obj = re.sub("\n[ ]+# Copied from [^\n]*\n", "\n", obj) + + new_objects.append(obj) + + content = "\n".join(new_objects) + # Remove some attributes that we don't want to copy to the new file(s) + if attrs_to_remove is not None: + for attr in attrs_to_remove: + content = remove_attributes(content, target_attr=attr) + + with open(dest_file, "w", encoding="utf-8") as f: + f.write(content) + + +def filter_framework_files( + files: List[Union[str, os.PathLike]], frameworks: Optional[List[str]] = None +) -> List[Union[str, os.PathLike]]: + """ + Filter a list of files to only keep the ones corresponding to a list of frameworks. + + Args: + files (`List[Union[str, os.PathLike]]`): The list of files to filter. + frameworks (`List[str]`, *optional*): The list of allowed frameworks. + + Returns: + `List[Union[str, os.PathLike]]`: The list of filtered files. + """ + if frameworks is None: + frameworks = get_default_frameworks() + + framework_to_file = {} + others = [] + for f in files: + parts = Path(f).name.split("_") + if "modeling" not in parts: + others.append(f) + continue + if "tf" in parts: + framework_to_file["tf"] = f + elif "flax" in parts: + framework_to_file["flax"] = f + else: + framework_to_file["pt"] = f + + return [framework_to_file[f] for f in frameworks if f in framework_to_file] + others + + +def get_model_files(model_type: str, frameworks: Optional[List[str]] = None) -> Dict[str, Union[Path, List[Path]]]: + """ + Retrieves all the files associated to a model. + + Args: + model_type (`str`): A valid model type (like "bert" or "gpt2") + frameworks (`List[str]`, *optional*): + If passed, will only keep the model files corresponding to the passed frameworks. + + Returns: + `Dict[str, Union[Path, List[Path]]]`: A dictionary with the following keys: + - **doc_file** -- The documentation file for the model. + - **model_files** -- All the files in the model module. + - **test_files** -- The test files for the model. + """ + module_name = model_type_to_module_name(model_type) + + model_module = TRANSFORMERS_PATH / "models" / module_name + model_files = list(model_module.glob("*.py")) + model_files = filter_framework_files(model_files, frameworks=frameworks) + + doc_file = REPO_PATH / "docs" / "source" / "en" / "model_doc" / f"{model_type}.md" + + # Basic pattern for test files + test_files = [ + f"test_modeling_{module_name}.py", + f"test_modeling_tf_{module_name}.py", + f"test_modeling_flax_{module_name}.py", + f"test_tokenization_{module_name}.py", + f"test_image_processing_{module_name}.py", + f"test_feature_extraction_{module_name}.py", + f"test_processor_{module_name}.py", + ] + test_files = filter_framework_files(test_files, frameworks=frameworks) + # Add the test directory + test_files = [REPO_PATH / "tests" / "models" / module_name / f for f in test_files] + # Filter by existing files + test_files = [f for f in test_files if f.exists()] + + return {"doc_file": doc_file, "model_files": model_files, "module_name": module_name, "test_files": test_files} + + +_re_checkpoint_for_doc = re.compile(r"^_CHECKPOINT_FOR_DOC\s+=\s+(\S*)\s*$", flags=re.MULTILINE) + + +def find_base_model_checkpoint( + model_type: str, model_files: Optional[Dict[str, Union[Path, List[Path]]]] = None +) -> str: + """ + Finds the model checkpoint used in the docstrings for a given model. + + Args: + model_type (`str`): A valid model type (like "bert" or "gpt2") + model_files (`Dict[str, Union[Path, List[Path]]`, *optional*): + The files associated to `model_type`. Can be passed to speed up the function, otherwise will be computed. + + Returns: + `str`: The checkpoint used. + """ + if model_files is None: + model_files = get_model_files(model_type) + module_files = model_files["model_files"] + for fname in module_files: + if "modeling" not in str(fname): + continue + + with open(fname, "r", encoding="utf-8") as f: + content = f.read() + if _re_checkpoint_for_doc.search(content) is not None: + checkpoint = _re_checkpoint_for_doc.search(content).groups()[0] + # Remove quotes + checkpoint = checkpoint.replace('"', "") + checkpoint = checkpoint.replace("'", "") + return checkpoint + + # TODO: Find some kind of fallback if there is no _CHECKPOINT_FOR_DOC in any of the modeling file. + return "" + + +def get_default_frameworks(): + """ + Returns the list of frameworks (PyTorch, TensorFlow, Flax) that are installed in the environment. + """ + frameworks = [] + if is_torch_available(): + frameworks.append("pt") + if is_tf_available(): + frameworks.append("tf") + if is_flax_available(): + frameworks.append("flax") + return frameworks + + +_re_model_mapping = re.compile("MODEL_([A-Z_]*)MAPPING_NAMES") + + +def retrieve_model_classes(model_type: str, frameworks: Optional[List[str]] = None) -> Dict[str, List[str]]: + """ + Retrieve the model classes associated to a given model. + + Args: + model_type (`str`): A valid model type (like "bert" or "gpt2") + frameworks (`List[str]`, *optional*): + The frameworks to look for. Will default to `["pt", "tf", "flax"]`, passing a smaller list will restrict + the classes returned. + + Returns: + `Dict[str, List[str]]`: A dictionary with one key per framework and the list of model classes associated to + that framework as values. + """ + if frameworks is None: + frameworks = get_default_frameworks() + + modules = { + "pt": auto_module.modeling_auto if is_torch_available() else None, + "tf": auto_module.modeling_tf_auto if is_tf_available() else None, + "flax": auto_module.modeling_flax_auto if is_flax_available() else None, + } + + model_classes = {} + for framework in frameworks: + new_model_classes = [] + if modules[framework] is None: + raise ValueError(f"You selected {framework} in the frameworks, but it is not installed.") + model_mappings = [attr for attr in dir(modules[framework]) if _re_model_mapping.search(attr) is not None] + for model_mapping_name in model_mappings: + model_mapping = getattr(modules[framework], model_mapping_name) + if model_type in model_mapping: + new_model_classes.append(model_mapping[model_type]) + + if len(new_model_classes) > 0: + # Remove duplicates + model_classes[framework] = list(set(new_model_classes)) + + return model_classes + + +def retrieve_info_for_model(model_type, frameworks: Optional[List[str]] = None): + """ + Retrieves all the information from a given model_type. + + Args: + model_type (`str`): A valid model type (like "bert" or "gpt2") + frameworks (`List[str]`, *optional*): + If passed, will only keep the info corresponding to the passed frameworks. + + Returns: + `Dict`: A dictionary with the following keys: + - **frameworks** (`List[str]`): The list of frameworks that back this model type. + - **model_classes** (`Dict[str, List[str]]`): The model classes implemented for that model type. + - **model_files** (`Dict[str, Union[Path, List[Path]]]`): The files associated with that model type. + - **model_patterns** (`ModelPatterns`): The various patterns for the model. + """ + if model_type not in auto_module.MODEL_NAMES_MAPPING: + raise ValueError(f"{model_type} is not a valid model type.") + + model_name = auto_module.MODEL_NAMES_MAPPING[model_type] + config_class = auto_module.configuration_auto.CONFIG_MAPPING_NAMES[model_type] + if model_type in auto_module.tokenization_auto.TOKENIZER_MAPPING_NAMES: + tokenizer_classes = auto_module.tokenization_auto.TOKENIZER_MAPPING_NAMES[model_type] + tokenizer_class = tokenizer_classes[0] if tokenizer_classes[0] is not None else tokenizer_classes[1] + else: + tokenizer_class = None + image_processor_classes = auto_module.image_processing_auto.IMAGE_PROCESSOR_MAPPING_NAMES.get(model_type, None) + if isinstance(image_processor_classes, tuple): + image_processor_class, image_processor_fast_class = image_processor_classes + else: + image_processor_class = image_processor_classes + image_processor_fast_class = None + feature_extractor_class = auto_module.feature_extraction_auto.FEATURE_EXTRACTOR_MAPPING_NAMES.get(model_type, None) + processor_class = auto_module.processing_auto.PROCESSOR_MAPPING_NAMES.get(model_type, None) + + model_files = get_model_files(model_type, frameworks=frameworks) + model_camel_cased = config_class.replace("Config", "") + + available_frameworks = [] + for fname in model_files["model_files"]: + if "modeling_tf" in str(fname): + available_frameworks.append("tf") + elif "modeling_flax" in str(fname): + available_frameworks.append("flax") + elif "modeling" in str(fname): + available_frameworks.append("pt") + + if frameworks is None: + frameworks = get_default_frameworks() + + frameworks = [f for f in frameworks if f in available_frameworks] + + model_classes = retrieve_model_classes(model_type, frameworks=frameworks) + + model_upper_cased = model_camel_cased.upper() + model_patterns = ModelPatterns( + model_name, + checkpoint=find_base_model_checkpoint(model_type, model_files=model_files), + model_type=model_type, + model_camel_cased=model_camel_cased, + model_lower_cased=model_files["module_name"], + model_upper_cased=model_upper_cased, + config_class=config_class, + tokenizer_class=tokenizer_class, + image_processor_class=image_processor_class, + image_processor_fast_class=image_processor_fast_class, + feature_extractor_class=feature_extractor_class, + processor_class=processor_class, + ) + + return { + "frameworks": frameworks, + "model_classes": model_classes, + "model_files": model_files, + "model_patterns": model_patterns, + } + + +def clean_frameworks_in_init( + init_file: Union[str, os.PathLike], frameworks: Optional[List[str]] = None, keep_processing: bool = True +): + """ + Removes all the import lines that don't belong to a given list of frameworks or concern tokenizers/feature + extractors/image processors/processors in an init. + + Args: + init_file (`str` or `os.PathLike`): The path to the init to treat. + frameworks (`List[str]`, *optional*): + If passed, this will remove all imports that are subject to a framework not in frameworks + keep_processing (`bool`, *optional*, defaults to `True`): + Whether or not to keep the preprocessing (tokenizer, feature extractor, image processor, processor) imports + in the init. + """ + if frameworks is None: + frameworks = get_default_frameworks() + + names = {"pt": "torch"} + to_remove = [names.get(f, f) for f in ["pt", "tf", "flax"] if f not in frameworks] + if not keep_processing: + to_remove.extend(["sentencepiece", "tokenizers", "vision"]) + + if len(to_remove) == 0: + # Nothing to do + return + + remove_pattern = "|".join(to_remove) + re_conditional_imports = re.compile(rf"^\s*if not is_({remove_pattern})_available\(\):\s*$") + re_try = re.compile(r"\s*try:") + re_else = re.compile(r"\s*else:") + re_is_xxx_available = re.compile(rf"is_({remove_pattern})_available") + + with open(init_file, "r", encoding="utf-8") as f: + content = f.read() + + lines = content.split("\n") + new_lines = [] + idx = 0 + while idx < len(lines): + # Conditional imports in try-except-else blocks + if (re_conditional_imports.search(lines[idx]) is not None) and (re_try.search(lines[idx - 1]) is not None): + # Remove the preceding `try:` + new_lines.pop() + idx += 1 + # Iterate until `else:` + while is_empty_line(lines[idx]) or re_else.search(lines[idx]) is None: + idx += 1 + idx += 1 + indent = find_indent(lines[idx]) + while find_indent(lines[idx]) >= indent or is_empty_line(lines[idx]): + idx += 1 + # Remove the import from utils + elif re_is_xxx_available.search(lines[idx]) is not None: + line = lines[idx] + for framework in to_remove: + line = line.replace(f", is_{framework}_available", "") + line = line.replace(f"is_{framework}_available, ", "") + line = line.replace(f"is_{framework}_available,", "") + line = line.replace(f"is_{framework}_available", "") + + if len(line.strip()) > 0: + new_lines.append(line) + idx += 1 + # Otherwise we keep the line, except if it's a tokenizer import and we don't want to keep it. + elif keep_processing or ( + re.search(r'^\s*"(tokenization|processing|feature_extraction|image_processing)', lines[idx]) is None + and re.search(r"^\s*from .(tokenization|processing|feature_extraction|image_processing)", lines[idx]) + is None + ): + new_lines.append(lines[idx]) + idx += 1 + else: + idx += 1 + + with open(init_file, "w", encoding="utf-8") as f: + f.write("\n".join(new_lines)) + + +def add_model_to_main_init( + old_model_patterns: ModelPatterns, + new_model_patterns: ModelPatterns, + frameworks: Optional[List[str]] = None, + with_processing: bool = True, +): + """ + Add a model to the main init of Transformers. + + Args: + old_model_patterns (`ModelPatterns`): The patterns for the old model. + new_model_patterns (`ModelPatterns`): The patterns for the new model. + frameworks (`List[str]`, *optional*): + If specified, only the models implemented in those frameworks will be added. + with_processing (`bool`, *optional*, defaults to `True`): + Whether the tokenizer/feature extractor/processor of the model should also be added to the init or not. + """ + with open(TRANSFORMERS_PATH / "__init__.py", "r", encoding="utf-8") as f: + content = f.read() + + lines = content.split("\n") + idx = 0 + new_lines = [] + framework = None + while idx < len(lines): + new_framework = False + if not is_empty_line(lines[idx]) and find_indent(lines[idx]) == 0: + framework = None + elif lines[idx].lstrip().startswith("if not is_torch_available"): + framework = "pt" + new_framework = True + elif lines[idx].lstrip().startswith("if not is_tf_available"): + framework = "tf" + new_framework = True + elif lines[idx].lstrip().startswith("if not is_flax_available"): + framework = "flax" + new_framework = True + + if new_framework: + # For a new framework, we need to skip until the else: block to get where the imports are. + while lines[idx].strip() != "else:": + new_lines.append(lines[idx]) + idx += 1 + + # Skip if we are in a framework not wanted. + if framework is not None and frameworks is not None and framework not in frameworks: + new_lines.append(lines[idx]) + idx += 1 + elif re.search(rf'models.{old_model_patterns.model_lower_cased}( |")', lines[idx]) is not None: + block = [lines[idx]] + indent = find_indent(lines[idx]) + idx += 1 + while find_indent(lines[idx]) > indent: + block.append(lines[idx]) + idx += 1 + if lines[idx].strip() in [")", "]", "],"]: + block.append(lines[idx]) + idx += 1 + block = "\n".join(block) + new_lines.append(block) + + add_block = True + if not with_processing: + processing_classes = [ + old_model_patterns.tokenizer_class, + old_model_patterns.image_processor_class, + old_model_patterns.image_processor_fast_class, + old_model_patterns.feature_extractor_class, + old_model_patterns.processor_class, + ] + # Only keep the ones that are not None + processing_classes = [c for c in processing_classes if c is not None] + for processing_class in processing_classes: + block = block.replace(f' "{processing_class}",', "") + block = block.replace(f', "{processing_class}"', "") + block = block.replace(f" {processing_class},", "") + block = block.replace(f", {processing_class}", "") + + if processing_class in block: + add_block = False + if add_block: + new_lines.append(replace_model_patterns(block, old_model_patterns, new_model_patterns)[0]) + else: + new_lines.append(lines[idx]) + idx += 1 + + with open(TRANSFORMERS_PATH / "__init__.py", "w", encoding="utf-8") as f: + f.write("\n".join(new_lines)) + + +def insert_tokenizer_in_auto_module(old_model_patterns: ModelPatterns, new_model_patterns: ModelPatterns): + """ + Add a tokenizer to the relevant mappings in the auto module. + + Args: + old_model_patterns (`ModelPatterns`): The patterns for the old model. + new_model_patterns (`ModelPatterns`): The patterns for the new model. + """ + if old_model_patterns.tokenizer_class is None or new_model_patterns.tokenizer_class is None: + return + + with open(TRANSFORMERS_PATH / "models" / "auto" / "tokenization_auto.py", "r", encoding="utf-8") as f: + content = f.read() + + lines = content.split("\n") + idx = 0 + # First we get to the TOKENIZER_MAPPING_NAMES block. + while not lines[idx].startswith(" TOKENIZER_MAPPING_NAMES = OrderedDict("): + idx += 1 + idx += 1 + + # That block will end at this prompt: + while not lines[idx].startswith("TOKENIZER_MAPPING = _LazyAutoMapping"): + # Either all the tokenizer block is defined on one line, in which case, it ends with ")," + if lines[idx].endswith(","): + block = lines[idx] + # Otherwise it takes several lines until we get to a ")," + else: + block = [] + while not lines[idx].startswith(" ),"): + block.append(lines[idx]) + idx += 1 + block = "\n".join(block) + idx += 1 + + # If we find the model type and tokenizer class in that block, we have the old model tokenizer block + if f'"{old_model_patterns.model_type}"' in block and old_model_patterns.tokenizer_class in block: + break + + new_block = block.replace(old_model_patterns.model_type, new_model_patterns.model_type) + new_block = new_block.replace(old_model_patterns.tokenizer_class, new_model_patterns.tokenizer_class) + + new_lines = lines[:idx] + [new_block] + lines[idx:] + with open(TRANSFORMERS_PATH / "models" / "auto" / "tokenization_auto.py", "w", encoding="utf-8") as f: + f.write("\n".join(new_lines)) + + +AUTO_CLASSES_PATTERNS = { + "configuration_auto.py": [ + ' ("{model_type}", "{model_name}"),', + ' ("{model_type}", "{config_class}"),', + ' ("{model_type}", "{pretrained_archive_map}"),', + ], + "feature_extraction_auto.py": [' ("{model_type}", "{feature_extractor_class}"),'], + "image_processing_auto.py": [' ("{model_type}", "{image_processor_classes}"),'], + "modeling_auto.py": [' ("{model_type}", "{any_pt_class}"),'], + "modeling_tf_auto.py": [' ("{model_type}", "{any_tf_class}"),'], + "modeling_flax_auto.py": [' ("{model_type}", "{any_flax_class}"),'], + "processing_auto.py": [' ("{model_type}", "{processor_class}"),'], +} + + +def add_model_to_auto_classes( + old_model_patterns: ModelPatterns, new_model_patterns: ModelPatterns, model_classes: Dict[str, List[str]] +): + """ + Add a model to the relevant mappings in the auto module. + + Args: + old_model_patterns (`ModelPatterns`): The patterns for the old model. + new_model_patterns (`ModelPatterns`): The patterns for the new model. + model_classes (`Dict[str, List[str]]`): A dictionary framework to list of model classes implemented. + """ + for filename in AUTO_CLASSES_PATTERNS: + # Extend patterns with all model classes if necessary + new_patterns = [] + for pattern in AUTO_CLASSES_PATTERNS[filename]: + if re.search("any_([a-z]*)_class", pattern) is not None: + framework = re.search("any_([a-z]*)_class", pattern).groups()[0] + if framework in model_classes: + new_patterns.extend( + [ + pattern.replace("{" + f"any_{framework}_class" + "}", cls) + for cls in model_classes[framework] + ] + ) + elif "{config_class}" in pattern: + new_patterns.append(pattern.replace("{config_class}", old_model_patterns.config_class)) + elif "{image_processor_classes}" in pattern: + if ( + old_model_patterns.image_processor_class is not None + and new_model_patterns.image_processor_class is not None + ): + if ( + old_model_patterns.image_processor_fast_class is not None + and new_model_patterns.image_processor_fast_class is not None + ): + new_patterns.append( + pattern.replace( + '"{image_processor_classes}"', + f'("{old_model_patterns.image_processor_class}", "{old_model_patterns.image_processor_fast_class}")', + ) + ) + else: + new_patterns.append( + pattern.replace( + '"{image_processor_classes}"', f'("{old_model_patterns.image_processor_class}",)' + ) + ) + elif "{feature_extractor_class}" in pattern: + if ( + old_model_patterns.feature_extractor_class is not None + and new_model_patterns.feature_extractor_class is not None + ): + new_patterns.append( + pattern.replace("{feature_extractor_class}", old_model_patterns.feature_extractor_class) + ) + elif "{processor_class}" in pattern: + if old_model_patterns.processor_class is not None and new_model_patterns.processor_class is not None: + new_patterns.append(pattern.replace("{processor_class}", old_model_patterns.processor_class)) + else: + new_patterns.append(pattern) + + # Loop through all patterns. + for pattern in new_patterns: + full_name = TRANSFORMERS_PATH / "models" / "auto" / filename + old_model_line = pattern + new_model_line = pattern + for attr in ["model_type", "model_name"]: + old_model_line = old_model_line.replace("{" + attr + "}", getattr(old_model_patterns, attr)) + new_model_line = new_model_line.replace("{" + attr + "}", getattr(new_model_patterns, attr)) + new_model_line = new_model_line.replace( + old_model_patterns.model_camel_cased, new_model_patterns.model_camel_cased + ) + add_content_to_file(full_name, new_model_line, add_after=old_model_line) + + # Tokenizers require special handling + insert_tokenizer_in_auto_module(old_model_patterns, new_model_patterns) + + +DOC_OVERVIEW_TEMPLATE = """## Overview + +The {model_name} model was proposed in []() by . + + +The abstract from the paper is the following: + +** + +Tips: + + + +This model was contributed by [INSERT YOUR HF USERNAME HERE](https://huggingface.co/). +The original code can be found [here](). + +""" + + +def duplicate_doc_file( + doc_file: Union[str, os.PathLike], + old_model_patterns: ModelPatterns, + new_model_patterns: ModelPatterns, + dest_file: Optional[Union[str, os.PathLike]] = None, + frameworks: Optional[List[str]] = None, +): + """ + Duplicate a documentation file and adapts it for a new model. + + Args: + module_file (`str` or `os.PathLike`): Path to the doc file to duplicate. + old_model_patterns (`ModelPatterns`): The patterns for the old model. + new_model_patterns (`ModelPatterns`): The patterns for the new model. + dest_file (`str` or `os.PathLike`, *optional*): Path to the new doc file. + Will default to the a file named `{new_model_patterns.model_type}.md` in the same folder as `module_file`. + frameworks (`List[str]`, *optional*): + If passed, will only keep the model classes corresponding to this list of frameworks in the new doc file. + """ + with open(doc_file, "r", encoding="utf-8") as f: + content = f.read() + + content = re.sub(r" +""" + +AUTOGENERATED_KERAS_COMMENT = """ + +""" + + +TASK_TAG_TO_NAME_MAPPING = { + "fill-mask": "Masked Language Modeling", + "image-classification": "Image Classification", + "image-segmentation": "Image Segmentation", + "multiple-choice": "Multiple Choice", + "object-detection": "Object Detection", + "question-answering": "Question Answering", + "summarization": "Summarization", + "table-question-answering": "Table Question Answering", + "text-classification": "Text Classification", + "text-generation": "Causal Language Modeling", + "text2text-generation": "Sequence-to-sequence Language Modeling", + "token-classification": "Token Classification", + "translation": "Translation", + "zero-shot-classification": "Zero Shot Classification", + "automatic-speech-recognition": "Automatic Speech Recognition", + "audio-classification": "Audio Classification", +} + + +METRIC_TAGS = [ + "accuracy", + "bleu", + "f1", + "matthews_correlation", + "pearsonr", + "precision", + "recall", + "rouge", + "sacrebleu", + "spearmanr", + "wer", +] + + +def _listify(obj): + if obj is None: + return [] + elif isinstance(obj, str): + return [obj] + else: + return obj + + +def _insert_values_as_list(metadata, name, values): + if values is None: + return metadata + if isinstance(values, str): + values = [values] + values = [v for v in values if v is not None] + if len(values) == 0: + return metadata + metadata[name] = values + return metadata + + +def infer_metric_tags_from_eval_results(eval_results): + if eval_results is None: + return {} + result = {} + for key in eval_results.keys(): + if key.lower().replace(" ", "_") in METRIC_TAGS: + result[key.lower().replace(" ", "_")] = key + elif key.lower() == "rouge1": + result["rouge"] = key + return result + + +def _insert_value(metadata, name, value): + if value is None: + return metadata + metadata[name] = value + return metadata + + +def is_hf_dataset(dataset): + if not is_datasets_available(): + return False + + from datasets import Dataset, IterableDataset + + return isinstance(dataset, (Dataset, IterableDataset)) + + +def _get_mapping_values(mapping): + result = [] + for v in mapping.values(): + if isinstance(v, (tuple, list)): + result += list(v) + else: + result.append(v) + return result + + +@dataclass +class TrainingSummary: + model_name: str + language: Optional[Union[str, list[str]]] = None + license: Optional[str] = None + tags: Optional[Union[str, list[str]]] = None + finetuned_from: Optional[str] = None + tasks: Optional[Union[str, list[str]]] = None + dataset: Optional[Union[str, list[str]]] = None + dataset_tags: Optional[Union[str, list[str]]] = None + dataset_args: Optional[Union[str, list[str]]] = None + dataset_metadata: Optional[dict[str, Any]] = None + eval_results: Optional[dict[str, float]] = None + eval_lines: Optional[list[str]] = None + hyperparameters: Optional[dict[str, Any]] = None + source: Optional[str] = "trainer" + + def __post_init__(self): + # Infer default license from the checkpoint used, if possible. + if ( + self.license is None + and not is_offline_mode() + and self.finetuned_from is not None + and len(self.finetuned_from) > 0 + ): + try: + info = model_info(self.finetuned_from) + for tag in info.tags: + if tag.startswith("license:"): + self.license = tag[8:] + except (requests.exceptions.HTTPError, requests.exceptions.ConnectionError, HFValidationError): + pass + + def create_model_index(self, metric_mapping): + model_index = {"name": self.model_name} + + # Dataset mapping tag -> name + dataset_names = _listify(self.dataset) + dataset_tags = _listify(self.dataset_tags) + dataset_args = _listify(self.dataset_args) + dataset_metadata = _listify(self.dataset_metadata) + if len(dataset_args) < len(dataset_tags): + dataset_args = dataset_args + [None] * (len(dataset_tags) - len(dataset_args)) + dataset_mapping = dict(zip(dataset_tags, dataset_names)) + dataset_arg_mapping = dict(zip(dataset_tags, dataset_args)) + dataset_metadata_mapping = dict(zip(dataset_tags, dataset_metadata)) + + task_mapping = { + task: TASK_TAG_TO_NAME_MAPPING[task] for task in _listify(self.tasks) if task in TASK_TAG_TO_NAME_MAPPING + } + + model_index["results"] = [] + + if len(task_mapping) == 0 and len(dataset_mapping) == 0: + return [model_index] + if len(task_mapping) == 0: + task_mapping = {None: None} + if len(dataset_mapping) == 0: + dataset_mapping = {None: None} + + # One entry per dataset and per task + all_possibilities = [(task_tag, ds_tag) for task_tag in task_mapping for ds_tag in dataset_mapping] + for task_tag, ds_tag in all_possibilities: + result = {} + if task_tag is not None: + result["task"] = {"name": task_mapping[task_tag], "type": task_tag} + + if ds_tag is not None: + metadata = dataset_metadata_mapping.get(ds_tag, {}) + result["dataset"] = { + "name": dataset_mapping[ds_tag], + "type": ds_tag, + **metadata, + } + if dataset_arg_mapping[ds_tag] is not None: + result["dataset"]["args"] = dataset_arg_mapping[ds_tag] + + if len(metric_mapping) > 0: + result["metrics"] = [] + for metric_tag, metric_name in metric_mapping.items(): + result["metrics"].append( + { + "name": metric_name, + "type": metric_tag, + "value": self.eval_results[metric_name], + } + ) + + # Remove partial results to avoid the model card being rejected. + if "task" in result and "dataset" in result and "metrics" in result: + model_index["results"].append(result) + else: + logger.info(f"Dropping the following result as it does not have all the necessary fields:\n{result}") + + return [model_index] + + def create_metadata(self): + metric_mapping = infer_metric_tags_from_eval_results(self.eval_results) + + metadata = {} + metadata = _insert_value(metadata, "library_name", "transformers") + metadata = _insert_values_as_list(metadata, "language", self.language) + metadata = _insert_value(metadata, "license", self.license) + if self.finetuned_from is not None and isinstance(self.finetuned_from, str) and len(self.finetuned_from) > 0: + metadata = _insert_value(metadata, "base_model", self.finetuned_from) + metadata = _insert_values_as_list(metadata, "tags", self.tags) + metadata = _insert_values_as_list(metadata, "datasets", self.dataset_tags) + metadata = _insert_values_as_list(metadata, "metrics", list(metric_mapping.keys())) + metadata["model-index"] = self.create_model_index(metric_mapping) + + return metadata + + def to_model_card(self): + model_card = "" + + metadata = yaml.dump(self.create_metadata(), sort_keys=False) + if len(metadata) > 0: + model_card = f"---\n{metadata}---\n" + + # Now the model card for realsies. + if self.source == "trainer": + model_card += AUTOGENERATED_TRAINER_COMMENT + else: + model_card += AUTOGENERATED_KERAS_COMMENT + + model_card += f"\n# {self.model_name}\n\n" + + if self.finetuned_from is None: + model_card += "This model was trained from scratch on " + else: + model_card += ( + "This model is a fine-tuned version of" + f" [{self.finetuned_from}](https://huggingface.co/{self.finetuned_from}) on " + ) + + if self.dataset is None or (isinstance(self.dataset, list) and len(self.dataset) == 0): + model_card += "an unknown dataset." + else: + if isinstance(self.dataset, str): + model_card += f"the {self.dataset} dataset." + elif isinstance(self.dataset, (tuple, list)) and len(self.dataset) == 1: + model_card += f"the {self.dataset[0]} dataset." + else: + model_card += ( + ", ".join([f"the {ds}" for ds in self.dataset[:-1]]) + f" and the {self.dataset[-1]} datasets." + ) + + if self.eval_results is not None: + model_card += "\nIt achieves the following results on the evaluation set:\n" + model_card += "\n".join([f"- {name}: {_maybe_round(value)}" for name, value in self.eval_results.items()]) + model_card += "\n" + + model_card += "\n## Model description\n\nMore information needed\n" + model_card += "\n## Intended uses & limitations\n\nMore information needed\n" + model_card += "\n## Training and evaluation data\n\nMore information needed\n" + + model_card += "\n## Training procedure\n" + model_card += "\n### Training hyperparameters\n" + if self.hyperparameters is not None: + model_card += "\nThe following hyperparameters were used during training:\n" + model_card += "\n".join([f"- {name}: {value}" for name, value in self.hyperparameters.items()]) + model_card += "\n" + else: + model_card += "\nMore information needed\n" + + if self.eval_lines is not None: + model_card += "\n### Training results\n\n" + model_card += make_markdown_table(self.eval_lines) + model_card += "\n" + + model_card += "\n### Framework versions\n\n" + model_card += f"- Transformers {__version__}\n" + + if self.source == "trainer" and is_torch_available(): + import torch + + model_card += f"- Pytorch {torch.__version__}\n" + elif self.source == "keras" and is_tf_available(): + import tensorflow as tf + + model_card += f"- TensorFlow {tf.__version__}\n" + if is_datasets_available(): + import datasets + + model_card += f"- Datasets {datasets.__version__}\n" + if is_tokenizers_available(): + import tokenizers + + model_card += f"- Tokenizers {tokenizers.__version__}\n" + + return model_card + + @classmethod + def from_trainer( + cls, + trainer, + language=None, + license=None, + tags=None, + model_name=None, + finetuned_from=None, + tasks=None, + dataset_tags=None, + dataset_metadata=None, + dataset=None, + dataset_args=None, + ): + # Infer default from dataset + one_dataset = trainer.eval_dataset if trainer.eval_dataset is not None else trainer.train_dataset + if is_hf_dataset(one_dataset) and (dataset_tags is None or dataset_args is None or dataset_metadata is None): + default_tag = one_dataset.builder_name + # Those are not real datasets from the Hub so we exclude them. + if default_tag not in ["csv", "json", "pandas", "parquet", "text"]: + if dataset_metadata is None: + dataset_metadata = [{"config": one_dataset.config_name, "split": str(one_dataset.split)}] + if dataset_tags is None: + dataset_tags = [default_tag] + if dataset_args is None: + dataset_args = [one_dataset.config_name] + + if dataset is None and dataset_tags is not None: + dataset = dataset_tags + + # Infer default finetuned_from + if ( + finetuned_from is None + and hasattr(trainer.model.config, "_name_or_path") + and not os.path.isdir(trainer.model.config._name_or_path) + ): + finetuned_from = trainer.model.config._name_or_path + + # Infer default task tag: + if tasks is None: + model_class_name = trainer.model.__class__.__name__ + for task, mapping in TASK_MAPPING.items(): + if model_class_name in _get_mapping_values(mapping): + tasks = task + + if model_name is None: + model_name = Path(trainer.args.output_dir).name + if len(model_name) == 0: + model_name = finetuned_from + + # Add `generated_from_trainer` to the tags + if tags is None: + tags = ["generated_from_trainer"] + elif isinstance(tags, str) and tags != "generated_from_trainer": + tags = [tags, "generated_from_trainer"] + elif "generated_from_trainer" not in tags: + tags.append("generated_from_trainer") + + _, eval_lines, eval_results = parse_log_history(trainer.state.log_history) + hyperparameters = extract_hyperparameters_from_trainer(trainer) + + return cls( + language=language, + license=license, + tags=tags, + model_name=model_name, + finetuned_from=finetuned_from, + tasks=tasks, + dataset=dataset, + dataset_tags=dataset_tags, + dataset_args=dataset_args, + dataset_metadata=dataset_metadata, + eval_results=eval_results, + eval_lines=eval_lines, + hyperparameters=hyperparameters, + ) + + @classmethod + def from_keras( + cls, + model, + model_name, + keras_history=None, + language=None, + license=None, + tags=None, + finetuned_from=None, + tasks=None, + dataset_tags=None, + dataset=None, + dataset_args=None, + ): + # Infer default from dataset + if dataset is not None: + if is_hf_dataset(dataset) and (dataset_tags is None or dataset_args is None): + default_tag = dataset.builder_name + # Those are not real datasets from the Hub so we exclude them. + if default_tag not in ["csv", "json", "pandas", "parquet", "text"]: + if dataset_tags is None: + dataset_tags = [default_tag] + if dataset_args is None: + dataset_args = [dataset.config_name] + + if dataset is None and dataset_tags is not None: + dataset = dataset_tags + + # Infer default finetuned_from + if ( + finetuned_from is None + and hasattr(model.config, "_name_or_path") + and not os.path.isdir(model.config._name_or_path) + ): + finetuned_from = model.config._name_or_path + + # Infer default task tag: + if tasks is None: + model_class_name = model.__class__.__name__ + for task, mapping in TASK_MAPPING.items(): + if model_class_name in _get_mapping_values(mapping): + tasks = task + + # Add `generated_from_keras_callback` to the tags + if tags is None: + tags = ["generated_from_keras_callback"] + elif isinstance(tags, str) and tags != "generated_from_keras_callback": + tags = [tags, "generated_from_keras_callback"] + elif "generated_from_keras_callback" not in tags: + tags.append("generated_from_keras_callback") + + if keras_history is not None: + _, eval_lines, eval_results = parse_keras_history(keras_history) + else: + eval_lines = [] + eval_results = {} + hyperparameters = extract_hyperparameters_from_keras(model) + + return cls( + language=language, + license=license, + tags=tags, + model_name=model_name, + finetuned_from=finetuned_from, + tasks=tasks, + dataset_tags=dataset_tags, + dataset=dataset, + dataset_args=dataset_args, + eval_results=eval_results, + eval_lines=eval_lines, + hyperparameters=hyperparameters, + source="keras", + ) + + +def parse_keras_history(logs): + """ + Parse the `logs` of either a `keras.History` object returned by `model.fit()` or an accumulated logs `dict` + passed to the `PushToHubCallback`. Returns lines and logs compatible with those returned by `parse_log_history`. + """ + if hasattr(logs, "history"): + # This looks like a `History` object + if not hasattr(logs, "epoch"): + # This history looks empty, return empty results + return None, [], {} + logs.history["epoch"] = logs.epoch + logs = logs.history + else: + # Training logs is a list of dicts, let's invert it to a dict of lists to match a History object + logs = {log_key: [single_dict[log_key] for single_dict in logs] for log_key in logs[0]} + + lines = [] + for i in range(len(logs["epoch"])): + epoch_dict = {log_key: log_value_list[i] for log_key, log_value_list in logs.items()} + values = {} + for k, v in epoch_dict.items(): + if k.startswith("val_"): + k = "validation_" + k[4:] + elif k != "epoch": + k = "train_" + k + splits = k.split("_") + name = " ".join([part.capitalize() for part in splits]) + values[name] = v + lines.append(values) + + eval_results = lines[-1] + + return logs, lines, eval_results + + +def parse_log_history(log_history): + """ + Parse the `log_history` of a Trainer to get the intermediate and final evaluation results. + """ + idx = 0 + while idx < len(log_history) and "train_runtime" not in log_history[idx]: + idx += 1 + + # If there are no training logs + if idx == len(log_history): + idx -= 1 + while idx >= 0 and "eval_loss" not in log_history[idx]: + idx -= 1 + + if idx >= 0: + return None, None, log_history[idx] + else: + return None, None, None + + # From now one we can assume we have training logs: + train_log = log_history[idx] + lines = [] + training_loss = "No log" + for i in range(idx): + if "loss" in log_history[i]: + training_loss = log_history[i]["loss"] + if "eval_loss" in log_history[i]: + metrics = log_history[i].copy() + _ = metrics.pop("total_flos", None) + epoch = metrics.pop("epoch", None) + step = metrics.pop("step", None) + _ = metrics.pop("eval_runtime", None) + _ = metrics.pop("eval_samples_per_second", None) + _ = metrics.pop("eval_steps_per_second", None) + _ = metrics.pop("eval_jit_compilation_time", None) + values = {"Training Loss": training_loss, "Epoch": epoch, "Step": step} + for k, v in metrics.items(): + if k == "eval_loss": + values["Validation Loss"] = v + else: + splits = k.split("_") + name = " ".join([part.capitalize() for part in splits[1:]]) + values[name] = v + lines.append(values) + + idx = len(log_history) - 1 + while idx >= 0 and "eval_loss" not in log_history[idx]: + idx -= 1 + + if idx > 0: + eval_results = {} + for key, value in log_history[idx].items(): + if key.startswith("eval_"): + key = key[5:] + if key not in ["runtime", "samples_per_second", "steps_per_second", "epoch", "step"]: + camel_cased_key = " ".join([part.capitalize() for part in key.split("_")]) + eval_results[camel_cased_key] = value + return train_log, lines, eval_results + else: + return train_log, lines, None + + +def extract_hyperparameters_from_keras(model): + from .modeling_tf_utils import keras + + hyperparameters = {} + if hasattr(model, "optimizer") and model.optimizer is not None: + hyperparameters["optimizer"] = model.optimizer.get_config() + else: + hyperparameters["optimizer"] = None + hyperparameters["training_precision"] = keras.mixed_precision.global_policy().name + + return hyperparameters + + +def _maybe_round(v, decimals=4): + if isinstance(v, float) and len(str(v).split(".")) > 1 and len(str(v).split(".")[1]) > decimals: + return f"{v:.{decimals}f}" + return str(v) + + +def _regular_table_line(values, col_widths): + values_with_space = [f"| {v}" + " " * (w - len(v) + 1) for v, w in zip(values, col_widths)] + return "".join(values_with_space) + "|\n" + + +def _second_table_line(col_widths): + values = ["|:" + "-" * w + ":" for w in col_widths] + return "".join(values) + "|\n" + + +def make_markdown_table(lines): + """ + Create a nice Markdown table from the results in `lines`. + """ + if lines is None or len(lines) == 0: + return "" + col_widths = {key: len(str(key)) for key in lines[0].keys()} + for line in lines: + for key, value in line.items(): + if col_widths[key] < len(_maybe_round(value)): + col_widths[key] = len(_maybe_round(value)) + + table = _regular_table_line(list(lines[0].keys()), list(col_widths.values())) + table += _second_table_line(list(col_widths.values())) + for line in lines: + table += _regular_table_line([_maybe_round(v) for v in line.values()], list(col_widths.values())) + return table + + +_TRAINING_ARGS_KEYS = [ + "learning_rate", + "train_batch_size", + "eval_batch_size", + "seed", +] + + +def extract_hyperparameters_from_trainer(trainer): + hyperparameters = {k: getattr(trainer.args, k) for k in _TRAINING_ARGS_KEYS} + + if trainer.args.parallel_mode not in [ParallelMode.NOT_PARALLEL, ParallelMode.NOT_DISTRIBUTED]: + hyperparameters["distributed_type"] = ( + "multi-GPU" if trainer.args.parallel_mode == ParallelMode.DISTRIBUTED else trainer.args.parallel_mode.value + ) + if trainer.args.world_size > 1: + hyperparameters["num_devices"] = trainer.args.world_size + if trainer.args.gradient_accumulation_steps > 1: + hyperparameters["gradient_accumulation_steps"] = trainer.args.gradient_accumulation_steps + + total_train_batch_size = ( + trainer.args.train_batch_size * trainer.args.world_size * trainer.args.gradient_accumulation_steps + ) + if total_train_batch_size != hyperparameters["train_batch_size"]: + hyperparameters["total_train_batch_size"] = total_train_batch_size + total_eval_batch_size = trainer.args.eval_batch_size * trainer.args.world_size + if total_eval_batch_size != hyperparameters["eval_batch_size"]: + hyperparameters["total_eval_batch_size"] = total_eval_batch_size + + if trainer.args.optim: + optimizer_name = trainer.args.optim + optimizer_args = trainer.args.optim_args if trainer.args.optim_args else "No additional optimizer arguments" + + if "adam" in optimizer_name.lower(): + hyperparameters["optimizer"] = ( + f"Use {optimizer_name} with betas=({trainer.args.adam_beta1},{trainer.args.adam_beta2}) and" + f" epsilon={trainer.args.adam_epsilon} and optimizer_args={optimizer_args}" + ) + else: + hyperparameters["optimizer"] = f"Use {optimizer_name} and the args are:\n{optimizer_args}" + + hyperparameters["lr_scheduler_type"] = trainer.args.lr_scheduler_type.value + if trainer.args.warmup_ratio != 0.0: + hyperparameters["lr_scheduler_warmup_ratio"] = trainer.args.warmup_ratio + if trainer.args.warmup_steps != 0.0: + hyperparameters["lr_scheduler_warmup_steps"] = trainer.args.warmup_steps + if trainer.args.max_steps != -1: + hyperparameters["training_steps"] = trainer.args.max_steps + else: + hyperparameters["num_epochs"] = trainer.args.num_train_epochs + + if trainer.args.fp16: + if trainer.use_apex: + hyperparameters["mixed_precision_training"] = f"Apex, opt level {trainer.args.fp16_opt_level}" + else: + hyperparameters["mixed_precision_training"] = "Native AMP" + + if trainer.args.label_smoothing_factor != 0.0: + hyperparameters["label_smoothing_factor"] = trainer.args.label_smoothing_factor + + return hyperparameters diff --git a/docs/transformers/build/lib/transformers/modeling_attn_mask_utils.py b/docs/transformers/build/lib/transformers/modeling_attn_mask_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..dfdd976f0156139024c6f7788870a71e4a41754d --- /dev/null +++ b/docs/transformers/build/lib/transformers/modeling_attn_mask_utils.py @@ -0,0 +1,481 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from dataclasses import dataclass +from typing import Optional, Union + +import torch + +from .utils.import_utils import is_torchdynamo_compiling + + +@dataclass +class AttentionMaskConverter: + """ + A utility attention mask class that allows one to: + - Create a causal 4d mask + - Create a causal 4d mask with slided window + - Convert a 2d attention mask (batch_size, query_length) to a 4d attention mask (batch_size, 1, query_length, + key_value_length) that can be multiplied with attention scores + + Examples: + + ```python + >>> import torch + >>> from transformers.modeling_attn_mask_utils import AttentionMaskConverter + + >>> converter = AttentionMaskConverter(True) + >>> converter.to_4d(torch.tensor([[0, 0, 0, 1, 1]]), 5, key_value_length=5, dtype=torch.float32) + tensor([[[[-3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38], + [-3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38], + [-3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38, -3.4028e+38], + [-3.4028e+38, -3.4028e+38, -3.4028e+38, 0.0000e+00, -3.4028e+38], + [-3.4028e+38, -3.4028e+38, -3.4028e+38, 0.0000e+00, 0.0000e+00]]]]) + ``` + + Parameters: + is_causal (`bool`): + Whether the attention mask should be a uni-directional (causal) or bi-directional mask. + + sliding_window (`int`, *optional*): + Optionally, the sliding window masks can be created if `sliding_window` is defined to a positive integer. + """ + + is_causal: bool + sliding_window: int + + def __init__(self, is_causal: bool, sliding_window: Optional[int] = None): + self.is_causal = is_causal + self.sliding_window = sliding_window + + if self.sliding_window is not None and self.sliding_window <= 0: + raise ValueError( + f"Make sure that when passing `sliding_window` that its value is a strictly positive integer, not `{self.sliding_window}`" + ) + + def to_causal_4d( + self, + batch_size: int, + query_length: int, + key_value_length: int, + dtype: torch.dtype, + device: Union[torch.device, "str"] = "cpu", + ) -> Optional[torch.Tensor]: + """ + Creates a causal 4D mask of (bsz, head_dim=1, query_length, key_value_length) shape and adds large negative + bias to upper right hand triangular matrix (causal mask). + """ + if not self.is_causal: + raise ValueError(f"Please use `to_causal_4d` only if {self.__class__} has `is_causal` set to True.") + + # If shape is not cached, create a new causal mask and cache it + input_shape = (batch_size, query_length) + past_key_values_length = key_value_length - query_length + + # create causal mask + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + causal_4d_mask = None + if input_shape[-1] > 1 or self.sliding_window is not None: + causal_4d_mask = self._make_causal_mask( + input_shape, + dtype, + device=device, + past_key_values_length=past_key_values_length, + sliding_window=self.sliding_window, + ) + + return causal_4d_mask + + def to_4d( + self, + attention_mask_2d: torch.Tensor, + query_length: int, + dtype: torch.dtype, + key_value_length: Optional[int] = None, + ) -> torch.Tensor: + """ + Converts 2D attention mask to 4D attention mask by expanding mask to (bsz, head_dim=1, query_length, + key_value_length) shape and by adding a large negative bias to not-attended positions. If attention_mask is + causal, a causal mask will be added. + """ + input_shape = (attention_mask_2d.shape[0], query_length) + + # create causal mask + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + causal_4d_mask = None + if (input_shape[-1] > 1 or self.sliding_window is not None) and self.is_causal: + if key_value_length is None: + raise ValueError( + "This attention mask converter is causal. Make sure to pass `key_value_length` to correctly create a causal mask." + ) + + past_key_values_length = key_value_length - query_length + causal_4d_mask = self._make_causal_mask( + input_shape, + dtype, + device=attention_mask_2d.device, + past_key_values_length=past_key_values_length, + sliding_window=self.sliding_window, + ) + elif self.sliding_window is not None: + raise NotImplementedError("Sliding window is currently only implemented for causal masking") + + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + expanded_attn_mask = self._expand_mask(attention_mask_2d, dtype, tgt_len=input_shape[-1]).to( + attention_mask_2d.device + ) + + if causal_4d_mask is not None: + expanded_attn_mask = causal_4d_mask.masked_fill(expanded_attn_mask.bool(), torch.finfo(dtype).min) + + # expanded_attn_mask + causal_4d_mask can cause some overflow + expanded_4d_mask = expanded_attn_mask + + return expanded_4d_mask + + @staticmethod + def _make_causal_mask( + input_ids_shape: torch.Size, + dtype: torch.dtype, + device: torch.device, + past_key_values_length: int = 0, + sliding_window: Optional[int] = None, + ): + """ + Make causal mask used for bi-directional self-attention. + """ + bsz, tgt_len = input_ids_shape + mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device) + mask_cond = torch.arange(mask.size(-1), device=device) + mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) + + mask = mask.to(dtype) + + if past_key_values_length > 0: + mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1) + + # add lower triangular sliding window mask if necessary + if sliding_window is not None: + diagonal = past_key_values_length - sliding_window - 1 + + context_mask = torch.tril(torch.ones_like(mask, dtype=torch.bool), diagonal=diagonal) + # Recent changes in PyTorch prevent mutations on tensors converted with aten::_to_copy + # See https://github.com/pytorch/pytorch/issues/127571 + if is_torchdynamo_compiling(): + mask = mask.clone() + mask.masked_fill_(context_mask, torch.finfo(dtype).min) + + return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length) + + @staticmethod + def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. + """ + bsz, src_len = mask.size() + tgt_len = tgt_len if tgt_len is not None else src_len + + expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) + + inverted_mask = 1.0 - expanded_mask + + return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min) + + @staticmethod + def _unmask_unattended( + expanded_mask: torch.FloatTensor, + min_dtype: float, + ): + # fmt: off + """ + Attend to all tokens in masked rows from the expanded attention mask, for example the relevant first rows when + using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. + Details: https://github.com/pytorch/pytorch/issues/110213 + + `expanded_mask` is [bsz, num_masks, tgt_seq_len, src_seq_len] or [bsz, tgt_seq_len, src_seq_len]. + `attention_mask` is [bsz, src_seq_len]. + + The dimension num_masks of `expanded_mask` is most often 1, but it can also be the number of heads in the case of alibi attention bias. + + For example, if `expanded_mask` is (e.g. here left-padding case) + ``` + [[[[0, 0, 0], + [0, 0, 0], + [0, 0, 1]]], + [[[1, 0, 0], + [1, 1, 0], + [1, 1, 1]]], + [[[0, 0, 0], + [0, 1, 0], + [0, 1, 1]]]] + ``` + then the modified `expanded_mask` will be + ``` + [[[[1, 1, 1], <-- modified + [1, 1, 1], <-- modified + [0, 0, 1]]], + [[[1, 0, 0], + [1, 1, 0], + [1, 1, 1]]], + [[[1, 1, 1], <-- modified + [0, 1, 0], + [0, 1, 1]]]] + ``` + """ + # fmt: on + if expanded_mask.dtype == torch.bool: + raise ValueError( + "AttentionMaskConverter._unmask_unattended expects a float `expanded_mask`, got a BoolTensor." + ) + + return expanded_mask.mul(~torch.all(expanded_mask == min_dtype, dim=-1, keepdim=True)) + + @staticmethod + def _ignore_causal_mask_sdpa( + attention_mask: Optional[torch.Tensor], + inputs_embeds: torch.Tensor, + past_key_values_length: int, + sliding_window: Optional[int] = None, + is_training: bool = False, + ) -> bool: + """ + Detects whether the optional user-specified attention_mask & the automatically created causal mask can be + ignored in case PyTorch's SDPA is used, rather relying on SDPA's `is_causal` argument. + + In case no token is masked in the `attention_mask` argument, if `query_length == 1` or + `key_value_length == query_length`, we rather rely on SDPA `is_causal` argument to use causal/non-causal masks, + allowing to dispatch to the flash attention kernel (that can otherwise not be used if a custom `attn_mask` is + passed). + """ + + _, query_length = inputs_embeds.shape[0], inputs_embeds.shape[1] + key_value_length = query_length + past_key_values_length + + is_tracing = torch.jit.is_tracing() or isinstance(inputs_embeds, torch.fx.Proxy) or is_torchdynamo_compiling() + + ignore_causal_mask = False + + if attention_mask is None: + # TODO: When tracing with TorchDynamo with fullgraph=True, the model is recompiled depending on the input + # shape, thus SDPA's `is_causal` argument is rightfully updated + # (see https://gist.github.com/fxmarty/1313f39037fc1c112508989628c57363). However, when using + # `torch.export` or `torch.onnx.dynamo_export`, we must pass an example input, and `is_causal` behavior is + # hard-coded. If a user exports a model with q_len > 1, the exported model will hard-code `is_causal=True` + # which is in general wrong (see https://github.com/pytorch/pytorch/issues/108108). + # Thus, we only set `ignore_causal_mask = True` if the model is set to training. + # + # Besides, jit.trace can not handle the `q_len > 1` condition for `is_causal` + # ("TypeError: scaled_dot_product_attention(): argument 'is_causal' must be bool, not Tensor"). + if ( + (is_training or not is_tracing) + and (query_length == 1 or key_value_length == query_length) + and (sliding_window is None or key_value_length < sliding_window) + ): + ignore_causal_mask = True + elif sliding_window is None or key_value_length < sliding_window: + if len(attention_mask.shape) == 4: + return False + elif not is_tracing and torch.all(attention_mask == 1): + if query_length == 1 or key_value_length == query_length: + # For query_length == 1, causal attention and bi-directional attention are the same. + ignore_causal_mask = True + + # Unfortunately, for query_length > 1 and key_value_length != query_length, we cannot generally ignore + # the attention mask, as SDPA causal mask generation may be wrong. We will set `is_causal=False` in + # SDPA and rely on Transformers attention_mask instead, hence not setting it to None here. + # Reference: https://github.com/pytorch/pytorch/issues/108108 + # TODO: maybe revisit this with https://github.com/pytorch/pytorch/pull/114823 in PyTorch 2.3. + + return ignore_causal_mask + + +def _prepare_4d_causal_attention_mask( + attention_mask: Optional[torch.Tensor], + input_shape: Union[torch.Size, tuple, list], + inputs_embeds: torch.Tensor, + past_key_values_length: int, + sliding_window: Optional[int] = None, +): + """ + Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape + `(batch_size, key_value_length)` + + Args: + attention_mask (`torch.Tensor` or `None`): + A 2D attention mask of shape `(batch_size, key_value_length)` + input_shape (`tuple(int)` or `list(int)` or `torch.Size`): + The input shape should be a tuple that defines `(batch_size, query_length)`. + inputs_embeds (`torch.Tensor`): + The embedded inputs as a torch Tensor. + past_key_values_length (`int`): + The length of the key value cache. + sliding_window (`int`, *optional*): + If the model uses windowed attention, a sliding window should be passed. + """ + attn_mask_converter = AttentionMaskConverter(is_causal=True, sliding_window=sliding_window) + + key_value_length = input_shape[-1] + past_key_values_length + + # 4d mask is passed through the layers + if attention_mask is not None and len(attention_mask.shape) == 2: + attention_mask = attn_mask_converter.to_4d( + attention_mask, input_shape[-1], key_value_length=key_value_length, dtype=inputs_embeds.dtype + ) + elif attention_mask is not None and len(attention_mask.shape) == 4: + expected_shape = (input_shape[0], 1, input_shape[1], key_value_length) + if tuple(attention_mask.shape) != expected_shape: + raise ValueError( + f"Incorrect 4D attention_mask shape: {tuple(attention_mask.shape)}; expected: {expected_shape}." + ) + else: + # if the 4D mask has correct shape - invert it and fill with negative infinity + inverted_mask = 1.0 - attention_mask + attention_mask = inverted_mask.masked_fill( + inverted_mask.to(torch.bool), torch.finfo(inputs_embeds.dtype).min + ) + else: + attention_mask = attn_mask_converter.to_causal_4d( + input_shape[0], input_shape[-1], key_value_length, dtype=inputs_embeds.dtype, device=inputs_embeds.device + ) + + return attention_mask + + +# Adapted from _prepare_4d_causal_attention_mask +def _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask: Optional[torch.Tensor], + input_shape: Union[torch.Size, tuple, list], + inputs_embeds: torch.Tensor, + past_key_values_length: int, + sliding_window: Optional[int] = None, +): + """ + Prepares the correct `attn_mask` argument to be used by `torch.nn.functional.scaled_dot_product_attention`. + + In case no token is masked in the `attention_mask` argument, we simply set it to `None` for the cases `query_length == 1` and + `key_value_length == query_length`, and rely instead on SDPA `is_causal` argument to use causal/non-causal masks, + allowing to dispatch to the flash attention kernel (that can otherwise not be used if a custom `attn_mask` is passed). + """ + attn_mask_converter = AttentionMaskConverter(is_causal=True, sliding_window=sliding_window) + + key_value_length = input_shape[-1] + past_key_values_length + + # torch.jit.trace, symbolic_trace and torchdynamo with fullgraph=True are unable to capture the controlflow `is_causal=attention_mask is None and q_len > 1` + # used as an SDPA argument. We keep compatibility with these tracing tools by always using SDPA's `attn_mask` argument in case we are tracing. + # TODO: For dynamo, rather use a check on fullgraph=True once this is possible (https://github.com/pytorch/pytorch/pull/120400). + is_tracing = torch.jit.is_tracing() or isinstance(inputs_embeds, torch.fx.Proxy) or is_torchdynamo_compiling() + + ignore_causal_mask = AttentionMaskConverter._ignore_causal_mask_sdpa( + attention_mask=attention_mask, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + sliding_window=sliding_window, + ) + + if ignore_causal_mask: + expanded_4d_mask = None + elif attention_mask is None: + expanded_4d_mask = attn_mask_converter.to_causal_4d( + input_shape[0], input_shape[-1], key_value_length, dtype=inputs_embeds.dtype, device=inputs_embeds.device + ) + else: + if attention_mask.dim() == 4: + expanded_4d_mask = attention_mask + else: + expanded_4d_mask = attn_mask_converter.to_4d( + attention_mask, + input_shape[-1], + dtype=inputs_embeds.dtype, + key_value_length=key_value_length, + ) + + # Attend to all tokens in masked rows from the causal_mask, for example the relevant first rows when + # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. + # Details: https://github.com/pytorch/pytorch/issues/110213 + if not is_tracing and expanded_4d_mask.device.type == "cuda": + expanded_4d_mask = AttentionMaskConverter._unmask_unattended( + expanded_4d_mask, min_dtype=torch.finfo(inputs_embeds.dtype).min + ) + + return expanded_4d_mask + + +def _prepare_4d_attention_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): + """ + Creates a non-causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape + `(batch_size, key_value_length)` + + Args: + mask (`torch.Tensor`): + A 2D attention mask of shape `(batch_size, key_value_length)` + dtype (`torch.dtype`): + The torch dtype the created mask shall have. + tgt_len (`int`): + The target length or query length the created mask shall have. + """ + return AttentionMaskConverter._expand_mask(mask=mask, dtype=dtype, tgt_len=tgt_len) + + +def _prepare_4d_attention_mask_for_sdpa(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): + """ + Creates a non-causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape + `(batch_size, key_value_length)` + + Args: + mask (`torch.Tensor`): + A 2D attention mask of shape `(batch_size, key_value_length)` + dtype (`torch.dtype`): + The torch dtype the created mask shall have. + tgt_len (`int`): + The target length or query length the created mask shall have. + """ + _, key_value_length = mask.shape + tgt_len = tgt_len if tgt_len is not None else key_value_length + + is_tracing = torch.jit.is_tracing() or isinstance(mask, torch.fx.Proxy) or is_torchdynamo_compiling() + + # torch.jit.trace, symbolic_trace and torchdynamo with fullgraph=True are unable to capture data-dependent controlflows. + if not is_tracing and torch.all(mask == 1): + return None + else: + return AttentionMaskConverter._expand_mask(mask=mask, dtype=dtype, tgt_len=tgt_len) + + +def _create_4d_causal_attention_mask( + input_shape: Union[torch.Size, tuple, list], + dtype: torch.dtype, + device: torch.device, + past_key_values_length: int = 0, + sliding_window: Optional[int] = None, +) -> Optional[torch.Tensor]: + """ + Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` + + Args: + input_shape (`tuple(int)` or `list(int)` or `torch.Size`): + The input shape should be a tuple that defines `(batch_size, query_length)`. + dtype (`torch.dtype`): + The torch dtype the created mask shall have. + device (`int`): + The torch device the created mask shall have. + sliding_window (`int`, *optional*): + If the model uses windowed attention, a sliding window should be passed. + """ + attn_mask_converter = AttentionMaskConverter(is_causal=True, sliding_window=sliding_window) + + key_value_length = past_key_values_length + input_shape[-1] + attention_mask = attn_mask_converter.to_causal_4d( + input_shape[0], input_shape[-1], key_value_length, dtype=dtype, device=device + ) + + return attention_mask diff --git a/docs/transformers/build/lib/transformers/modeling_flash_attention_utils.py b/docs/transformers/build/lib/transformers/modeling_flash_attention_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..c7d54dc415146e0f289750c86caffa5021399a92 --- /dev/null +++ b/docs/transformers/build/lib/transformers/modeling_flash_attention_utils.py @@ -0,0 +1,437 @@ +# Copyright 2024 The Fairseq Authors and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +import os +from typing import Optional, TypedDict + +import torch +import torch.nn.functional as F + +from .utils import ( + is_flash_attn_2_available, + is_flash_attn_greater_or_equal, + is_flash_attn_greater_or_equal_2_10, + is_torch_npu_available, + logging, +) + + +logger = logging.get_logger(__name__) +flash_attn_func = None + + +if is_flash_attn_2_available(): + from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa + from flash_attn import flash_attn_func, flash_attn_varlen_func + from flash_attn.layers.rotary import apply_rotary_emb # noqa + + +# patch functions in package `flash-attn` when using flash-attention on Ascend NPU. +if is_torch_npu_available(): + from torch_npu import npu_rotary_mul as apply_rotary_emb # noqa + + from .integrations.npu_flash_attention import index_first_axis, pad_input, unpad_input + from .integrations.npu_flash_attention import npu_flash_attn_func as flash_attn_func + from .integrations.npu_flash_attention import npu_flash_attn_varlen_func as flash_attn_varlen_func + + +if flash_attn_func: + _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters) + + +def is_flash_attn_available(): + """Determine whether flash-attention can be used or not.""" + + # if package `flash-attn` is available, flash-attention can be used natively. + if is_flash_attn_2_available(): + return True + + # flash-attention can be used on Ascend NPU without package `flash-attn` + if is_torch_npu_available(): + return True + + return False + + +def flash_attn_supports_top_left_mask(): + """Determine whether flash-attention uses top-left or down-right mask""" + + if is_flash_attn_2_available(): + # top-left mask is used in package `flash-attn` with version lower than 2.1.0 + return not is_flash_attn_greater_or_equal_2_10() + + if is_torch_npu_available(): + # down-right mask is used on Ascend NPU by default, set env `NPU_FA2_SPARSE_MODE=2` to activate top-left mask. + from .integrations.npu_flash_attention import is_npu_fa2_top_left_aligned_causal_mask + + return is_npu_fa2_top_left_aligned_causal_mask() + + return False + + +def _get_unpad_data(attention_mask: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor, int]: + """ + Retrieves indexing data required to repad unpadded (ragged) tensors. + + Arguments: + attention_mask (`torch.Tensor`): + Boolean or int tensor of shape (batch_size, sequence_length), 1 means valid and 0 means not valid. + + Return: + indices (`torch.Tensor`): + The indices of non-masked tokens from the flattened input sequence. + cu_seqlens (`torch.Tensor`): + The cumulative sequence lengths, used to index into ragged (unpadded) tensors. `cu_seqlens` shape is (batch_size + 1,). + max_seqlen_in_batch (`int`): + Maximum sequence length in batch. + """ + seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32) + indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten() + max_seqlen_in_batch = seqlens_in_batch.max().item() + cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)) + return ( + indices, + cu_seqlens, + max_seqlen_in_batch, + ) + + +def _upad_input( + query_layer: torch.Tensor, + key_layer: torch.Tensor, + value_layer: torch.Tensor, + attention_mask: torch.Tensor, + query_length: int, +): + """ + Unpads query, key, and values tensors, using a single dimension for all tokens even though they belong to different batches. + + This function is used instead of `flash_attn.bert_padding.unpad_input` in order to avoid the recomputation of the same intermediary + tensors for query, key, value tensors. + + Arguments: + query_layer (`torch.Tensor`): + Query state with padding. Shape: (batch_size, query_length, num_heads, head_dim). + key_layer (`torch.Tensor`): + Key state with padding. Shape: (batch_size, kv_seq_len, num_key_value_heads, head_dim). + value_layer (`torch.Tensor`): + Value state with padding. Shape: (batch_size, kv_seq_len, num_key_value_heads, head_dim). + attention_mask (`torch.Tensor`): + Boolean or int tensor of shape (batch_size, sequence_length), 1 means valid and 0 means not valid. + query_length (`int`): + Target length. + + Return: + query_layer (`torch.Tensor`): + Query state without padding. Shape: (total_target_length, num_heads, head_dim). + key_layer (`torch.Tensor`): + Key state with padding. Shape: (total_source_length, num_key_value_heads, head_dim). + value_layer (`torch.Tensor`): + Value state with padding. Shape: (total_source_length, num_key_value_heads, head_dim). + indices_q (`torch.Tensor`): + The indices of non-masked tokens from the flattened input target sequence. + (cu_seqlens_q, cu_seqlens_k) (`Tuple[int]`): + The cumulative sequence lengths for the target (query) and source (key, value), used to index into ragged (unpadded) tensors. `cu_seqlens` shape is (batch_size + 1,). + (max_seqlen_in_batch_q, max_seqlen_in_batch_k) (`Tuple[int]`): + Maximum sequence length in batch (`max_seqlen_in_batch_q` for the target sequence i.e. query, `max_seqlen_in_batch_k` for the source sequence i.e. key/value). + """ + indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask) + batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape + + key_layer = index_first_axis(key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k) + value_layer = index_first_axis( + value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k + ) + if query_length == kv_seq_len: + query_layer = index_first_axis(query_layer.reshape(batch_size * kv_seq_len, -1, head_dim), indices_k) + cu_seqlens_q = cu_seqlens_k + max_seqlen_in_batch_q = max_seqlen_in_batch_k + indices_q = indices_k + elif query_length == 1: + max_seqlen_in_batch_q = 1 + cu_seqlens_q = torch.arange( + batch_size + 1, dtype=torch.int32, device=query_layer.device + ) # There is a memcpy here, that is very bad. + indices_q = cu_seqlens_q[:-1] + query_layer = query_layer.squeeze(1) + else: + # The -q_len: slice assumes left padding. + attention_mask = attention_mask[:, -query_length:] + query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q, *_ = unpad_input(query_layer, attention_mask) + + return ( + query_layer, + key_layer, + value_layer, + indices_q, + (cu_seqlens_q, cu_seqlens_k), + (max_seqlen_in_batch_q, max_seqlen_in_batch_k), + ) + + +def prepare_fa2_from_position_ids(query, key, value, position_ids): + """ + This function returns necessary arguments to call `flash_attn_varlen_func`. + All three query, key, value states will be flattened. + Cumulative lengths of each examples in the batch will be extracted from position_ids. + + NOTE: ideally cumulative lengths should be prepared at the data collator stage + + Arguments: + query (`torch.Tensor`): + Query state with padding. Shape: (batch_size, query_length, num_heads, head_dim). + key (`torch.Tensor`): + Key state with padding. Shape: (batch_size, kv_seq_len, num_key_value_heads, head_dim). + value (`torch.Tensor`): + Value state with padding. Shape: (batch_size, kv_seq_len, num_key_value_heads, head_dim). + position_ids (`torch.Tensor`): + Boolean or int tensor of shape (batch_size, sequence_length), 1 means valid and 0 means not valid. + + Return: + query (`torch.Tensor`): + Query state without padding. Shape: (total_target_length, num_heads, head_dim). + key (`torch.Tensor`): + Key state with padding. Shape: (total_source_length, num_key_value_heads, head_dim). + value (`torch.Tensor`): + Value state with padding. Shape: (total_source_length, num_key_value_heads, head_dim). + indices_q (`torch.Tensor`): + The indices of non-masked tokens from the flattened input target sequence. + (cu_seqlens_q, cu_seqlens_k) (`Tuple[int]`): + The cumulative sequence lengths for the target (query) and source (key, value), used to index into ragged (unpadded) tensors. `cu_seqlens` shape is (batch_size + 1,). + (max_seqlen_in_batch_q, max_seqlen_in_batch_k) (`Tuple[int]`): + Maximum sequence length in batch (`max_seqlen_in_batch_q` for the target sequence i.e. query, `max_seqlen_in_batch_k` for the source sequence i.e. key/value). + """ + query = query.view(-1, query.size(-2), query.size(-1)) + key = key.contiguous().view(-1, key.size(-2), key.size(-1)) + value = value.contiguous().view(-1, value.size(-2), value.size(-1)) + position_ids = position_ids.flatten() + indices_q = torch.arange(position_ids.size(0), device=position_ids.device, dtype=torch.int32) + + cu_seq_lens = torch.cat( + ( + indices_q[position_ids == 0], + torch.tensor(position_ids.size(), device=position_ids.device, dtype=torch.int32), + ) + ) + + max_length = position_ids.max() + 1 + + return (query, key, value, indices_q, (cu_seq_lens, cu_seq_lens), (max_length, max_length)) + + +def fa_peft_integration_check( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + target_dtype: Optional[torch.dtype] = None, +): + """ + PEFT usually casts the layer norms in float32 for training stability reasons + therefore the input hidden states gets silently casted in float32. Hence, we need + cast them back in float16 / bfloat16 just to be sure everything works as expected. + This might slowdown training & inference so it is recommended to not cast the LayerNorms! + + Args: + query (`torch.Tensor`): + Input query states to be passed to Flash Attention API + key (`torch.Tensor`): + Input key states to be passed to Flash Attention API + value (`torch.Tensor`): + Input value states to be passed to Flash Attention API + target_dtype (`torch.dtype`, *optional*): + The dtype to convert the attention tensors to. Conversion can be ignored by + not providing the target dtype. + """ + if target_dtype is None: + return query, key, value + + input_dtype = query.dtype + if input_dtype == torch.float32: + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}." + ) + + query = query.to(target_dtype) + key = key.to(target_dtype) + value = value.to(target_dtype) + + return query, key, value + + +flash_241 = is_flash_attn_greater_or_equal("2.4.1") +deterministic_g = os.environ.get("FLASH_ATTENTION_DETERMINISTIC", "0") == "1" + + +def _flash_attention_forward( + query_states: torch.Tensor, + key_states: torch.Tensor, + value_states: torch.Tensor, + attention_mask: Optional[torch.Tensor], + query_length: int, + is_causal: bool, + dropout: float = 0.0, + position_ids: Optional[torch.Tensor] = None, + softmax_scale: Optional[float] = None, + sliding_window: Optional[int] = None, + use_top_left_mask: bool = False, + softcap: Optional[float] = None, + deterministic: Optional[bool] = None, + cu_seq_lens_q: Optional[torch.LongTensor] = None, + cu_seq_lens_k: Optional[torch.LongTensor] = None, + max_length_q: Optional[int] = None, + max_length_k: Optional[int] = None, + target_dtype: Optional[torch.dtype] = None, + **kwargs, +): + """ + Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token + first unpad the input, then computes the attention scores and pad the final attention scores. + + Args: + query_states (`torch.Tensor`): + Input query states to be passed to Flash Attention API + key_states (`torch.Tensor`): + Input key states to be passed to Flash Attention API + value_states (`torch.Tensor`): + Input value states to be passed to Flash Attention API + attention_mask (`torch.Tensor`, *optional*): + The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the + position of padding tokens and 1 for the position of non-padding tokens. + dropout (`float`): + Attention dropout + softmax_scale (`float`, *optional*): + The scaling of QK^T before applying softmax. Default to 1 / sqrt(head_dim) + use_top_left_mask (`bool`, defaults to `False`): + flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. + softcap (`float`, *optional*): + Softcap for the attention logits, used e.g. in gemma2. + deterministic (`bool`, *optional*): + Determines if the deterministic option introduced in flash_attn>=2.4.1 is enabled. + """ + if not use_top_left_mask: + causal = is_causal + else: + # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. + causal = is_causal and query_length != 1 + + # Assuming 4D tensors, key_states.shape[1] is the key/value sequence length (source length). + use_sliding_windows = ( + _flash_supports_window_size and sliding_window is not None and key_states.shape[1] > sliding_window + ) + flash_kwargs = {"window_size": (sliding_window, sliding_window)} if use_sliding_windows else {} + + if flash_241: + if deterministic is None: + deterministic = deterministic_g + flash_kwargs["deterministic"] = deterministic + + if softcap is not None: + flash_kwargs["softcap"] = softcap + + # PEFT possibly silently casts tensors to fp32, this potentially reconverts to correct dtype or is a no op + query_states, key_states, value_states = fa_peft_integration_check( + query_states, key_states, value_states, target_dtype + ) + + # Contains at least one padding token in the sequence + if attention_mask is not None: + batch_size = query_states.shape[0] + query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = _upad_input( + query_states, key_states, value_states, attention_mask, query_length + ) + cu_seqlens_q, cu_seqlens_k = cu_seq_lens + max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens + + attn_output_unpad = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seqlens_q, + cu_seqlens_k=cu_seqlens_k, + max_seqlen_q=max_seqlen_in_batch_q, + max_seqlen_k=max_seqlen_in_batch_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + **flash_kwargs, + ) + attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) + + # If position_ids is provided and check all examples do not contain only 1 sequence, If tensor in increasing + # then we probably have one sequence, otherwise it is packed. Additionally check we are in pre-fill/training stage. + # Use `flash_attn_varlen_func` to prevent cross-example attention and also allow padding free approach + elif position_ids is not None and ( + max_length_q is not None or (query_length != 1 and not (torch.diff(position_ids, dim=-1) >= 0).all()) + ): + batch_size = query_states.size(0) + + if cu_seq_lens_q is None or cu_seq_lens_k is None: + query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = ( + prepare_fa2_from_position_ids(query_states, key_states, value_states, position_ids) + ) + + cu_seq_lens_q, cu_seq_lens_k = cu_seq_lens + max_length_q, max_length_k = max_seq_lens + + else: + query_states = query_states.reshape(-1, query_states.size(-2), query_states.size(-1)) + key_states = key_states.reshape(-1, key_states.size(-2), key_states.size(-1)) + value_states = value_states.reshape(-1, value_states.size(-2), value_states.size(-1)) + + attn_output = flash_attn_varlen_func( + query_states, + key_states, + value_states, + cu_seqlens_q=cu_seq_lens_q, + cu_seqlens_k=cu_seq_lens_k, + max_seqlen_q=max_length_q, + max_seqlen_k=max_length_k, + dropout_p=dropout, + softmax_scale=softmax_scale, + causal=causal, + **flash_kwargs, + ) + + attn_output = attn_output.view(batch_size, -1, attn_output.size(-2), attn_output.size(-1)) + + else: + attn_output = flash_attn_func( + query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal, **flash_kwargs + ) + + return attn_output + + +class FlashAttentionKwargs(TypedDict, total=False): + """ + Keyword arguments for Flash Attention with Compile. + + Attributes: + cu_seq_lens_q (`torch.LongTensor`, *optional*) + Gets cumulative sequence length for query state. + cu_seq_lens_k (`torch.LongTensor`, *optional*) + Gets cumulative sequence length for key state. + max_length_q (`int`, *optional*): + Maximum sequence length for query state. + max_length_k (`int`, *optional*): + Maximum sequence length for key state. + """ + + cu_seq_lens_q: Optional[torch.LongTensor] + cu_seq_lens_k: Optional[torch.LongTensor] + max_length_q: Optional[int] + max_length_k: Optional[int] diff --git a/docs/transformers/build/lib/transformers/modeling_flax_outputs.py b/docs/transformers/build/lib/transformers/modeling_flax_outputs.py new file mode 100644 index 0000000000000000000000000000000000000000..325f968571ed400903b6c3bb15a4faf10f7e56c9 --- /dev/null +++ b/docs/transformers/build/lib/transformers/modeling_flax_outputs.py @@ -0,0 +1,700 @@ +# Copyright 2021 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Dict, Optional, Tuple + +import flax +import jax.numpy as jnp + +from .utils import ModelOutput + + +@flax.struct.dataclass +class FlaxBaseModelOutput(ModelOutput): + """ + Base class for model's outputs, with potential hidden states and attentions. + + Args: + last_hidden_state (`jnp.ndarray` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + last_hidden_state: Optional[jnp.ndarray] = None + hidden_states: Optional[Tuple[jnp.ndarray]] = None + attentions: Optional[Tuple[jnp.ndarray]] = None + + +@flax.struct.dataclass +class FlaxBaseModelOutputWithNoAttention(ModelOutput): + """ + Base class for model's outputs, with potential hidden states. + + Args: + last_hidden_state (`jnp.ndarray` of shape `(batch_size, num_channels, height, width)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `jnp.ndarray` (one for the output of the embeddings, if the model has an embedding layer, + one + for the output of each layer) of shape `(batch_size, num_channels, height, width)`. Hidden-states of the + model at the output of each layer plus the optional initial embedding outputs. + """ + + last_hidden_state: Optional[jnp.ndarray] = None + hidden_states: Optional[Tuple[jnp.ndarray]] = None + + +@flax.struct.dataclass +class FlaxBaseModelOutputWithPoolingAndNoAttention(ModelOutput): + """ + Base class for model's outputs that also contains a pooling of the last hidden states. + + Args: + last_hidden_state (`jnp.ndarray` of shape `(batch_size, num_channels, height, width)`): + Sequence of hidden-states at the output of the last layer of the model. + pooler_output (`jnp.ndarray` of shape `(batch_size, hidden_size)`): + Last layer hidden-state after a pooling operation on the spatial dimensions. + hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `jnp.ndarray` (one for the output of the embeddings, if the model has an embedding layer, + one + for the output of each layer) of shape `(batch_size, num_channels, height, width)`. Hidden-states of the + model at the output of each layer plus the optional initial embedding outputs. + """ + + last_hidden_state: Optional[jnp.ndarray] = None + pooler_output: Optional[jnp.ndarray] = None + hidden_states: Optional[Tuple[jnp.ndarray]] = None + + +@flax.struct.dataclass +class FlaxImageClassifierOutputWithNoAttention(ModelOutput): + """ + Base class for outputs of image classification models. + + Args: + logits (`jnp.ndarray` of shape `(batch_size, config.num_labels)`): + Classification (or regression if config.num_labels==1) scores (before SoftMax). + hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when + `config.output_hidden_states=True`): + Tuple of `jnp.ndarray` (one for the output of the embeddings, if the model has an embedding layer, + one + for the output of each stage) of shape `(batch_size, num_channels, height, width)`. Hidden-states (also + called feature maps) of the model at the output of each stage. + """ + + logits: Optional[jnp.ndarray] = None + hidden_states: Optional[Tuple[jnp.ndarray]] = None + + +@flax.struct.dataclass +class FlaxBaseModelOutputWithPast(ModelOutput): + """ + Base class for model's outputs, with potential hidden states and attentions. + + Args: + last_hidden_state (`jnp.ndarray` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + past_key_values (`Dict[str, jnp.ndarray]`): + Dictionary of pre-computed hidden-states (key and values in the attention blocks) that can be used for fast + auto-regressive decoding. Pre-computed key and value hidden-states are of shape *[batch_size, max_length]*. + hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + last_hidden_state: Optional[jnp.ndarray] = None + past_key_values: Optional[Dict[str, jnp.ndarray]] = None + hidden_states: Optional[Tuple[jnp.ndarray]] = None + attentions: Optional[Tuple[jnp.ndarray]] = None + + +@flax.struct.dataclass +class FlaxBaseModelOutputWithPooling(ModelOutput): + """ + Base class for model's outputs that also contains a pooling of the last hidden states. + + Args: + last_hidden_state (`jnp.ndarray` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + pooler_output (`jnp.ndarray` of shape `(batch_size, hidden_size)`): + Last layer hidden-state of the first token of the sequence (classification token) further processed by a + Linear layer and a Tanh activation function. The Linear layer weights are trained from the next sentence + prediction (classification) objective during pretraining. + hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + last_hidden_state: Optional[jnp.ndarray] = None + pooler_output: Optional[jnp.ndarray] = None + hidden_states: Optional[Tuple[jnp.ndarray]] = None + attentions: Optional[Tuple[jnp.ndarray]] = None + + +@flax.struct.dataclass +class FlaxBaseModelOutputWithPoolingAndCrossAttentions(ModelOutput): + """ + Base class for model's outputs that also contains a pooling of the last hidden states. + + Args: + last_hidden_state (`jnp.ndarray` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + pooler_output (`jnp.ndarray` of shape `(batch_size, hidden_size)`): + Last layer hidden-state of the first token of the sequence (classification token) after further processing + through the layers used for the auxiliary pretraining task. E.g. for BERT-family of models, this returns + the classification token after processing through a linear layer and a tanh activation function. The linear + layer weights are trained from the next sentence prediction (classification) objective during pretraining. + hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `jnp.ndarray` (one for the output of the embeddings, if the model has an embedding layer, + one + for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + cross_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): + Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + past_key_values (`tuple(tuple(jnp.ndarray))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(jnp.ndarray)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if + `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads, + encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if + `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values` + input) to speed up sequential decoding. + """ + + last_hidden_state: Optional[jnp.ndarray] = None + pooler_output: Optional[jnp.ndarray] = None + hidden_states: Optional[Tuple[jnp.ndarray]] = None + past_key_values: Optional[Tuple[Tuple[jnp.ndarray]]] = None + attentions: Optional[Tuple[jnp.ndarray]] = None + cross_attentions: Optional[Tuple[jnp.ndarray]] = None + + +@flax.struct.dataclass +class FlaxBaseModelOutputWithPastAndCrossAttentions(ModelOutput): + """ + Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding). + + Args: + last_hidden_state (`jnp.ndarray` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + + If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1, + hidden_size)` is output. + past_key_values (`tuple(tuple(jnp.ndarray))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(jnp.ndarray)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if + `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads, + encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if + `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values` + input) to speed up sequential decoding. + hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + cross_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): + Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + """ + + last_hidden_state: Optional[jnp.ndarray] = None + past_key_values: Optional[Tuple[Tuple[jnp.ndarray]]] = None + hidden_states: Optional[Tuple[jnp.ndarray]] = None + attentions: Optional[Tuple[jnp.ndarray]] = None + cross_attentions: Optional[Tuple[jnp.ndarray]] = None + + +@flax.struct.dataclass +class FlaxSeq2SeqModelOutput(ModelOutput): + """ + Base class for model encoder's outputs that also contains : pre-computed hidden states that can speed up sequential + decoding. + + Args: + last_hidden_state (`jnp.ndarray` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the decoder of the model. + + If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1, + hidden_size)` is output. + past_key_values (`tuple(tuple(jnp.ndarray))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(jnp.ndarray)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + decoder_hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the decoder at the output of each layer plus the initial embedding outputs. + decoder_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + cross_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + encoder_last_hidden_state (`jnp.ndarray` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the encoder at the output of each layer plus the initial embedding outputs. + encoder_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + """ + + last_hidden_state: Optional[jnp.ndarray] = None + past_key_values: Optional[Tuple[Tuple[jnp.ndarray]]] = None + decoder_hidden_states: Optional[Tuple[jnp.ndarray]] = None + decoder_attentions: Optional[Tuple[jnp.ndarray]] = None + cross_attentions: Optional[Tuple[jnp.ndarray]] = None + encoder_last_hidden_state: Optional[jnp.ndarray] = None + encoder_hidden_states: Optional[Tuple[jnp.ndarray]] = None + encoder_attentions: Optional[Tuple[jnp.ndarray]] = None + + +@flax.struct.dataclass +class FlaxCausalLMOutputWithCrossAttentions(ModelOutput): + """ + Base class for causal language model (or autoregressive) outputs. + + Args: + logits (`jnp.ndarray` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + cross_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Cross attentions weights after the attention softmax, used to compute the weighted average in the + cross-attention heads. + past_key_values (`tuple(tuple(jnp.ndarray))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `jnp.ndarray` tuples of length `config.n_layers`, with each tuple containing the cached key, value + states of the self-attention and the cross-attention layers if model is used in encoder-decoder setting. + Only relevant if `config.is_decoder = True`. + + Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + """ + + logits: Optional[jnp.ndarray] = None + past_key_values: Optional[Tuple[Tuple[jnp.ndarray]]] = None + hidden_states: Optional[Tuple[jnp.ndarray]] = None + attentions: Optional[Tuple[jnp.ndarray]] = None + cross_attentions: Optional[Tuple[jnp.ndarray]] = None + + +@flax.struct.dataclass +class FlaxMaskedLMOutput(ModelOutput): + """ + Base class for masked language models outputs. + + Args: + logits (`jnp.ndarray` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + logits: Optional[jnp.ndarray] = None + hidden_states: Optional[Tuple[jnp.ndarray]] = None + attentions: Optional[Tuple[jnp.ndarray]] = None + + +FlaxCausalLMOutput = FlaxMaskedLMOutput + + +@flax.struct.dataclass +class FlaxSeq2SeqLMOutput(ModelOutput): + """ + Base class for sequence-to-sequence language models outputs. + + Args: + logits (`jnp.ndarray` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + past_key_values (`tuple(tuple(jnp.ndarray))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(jnp.ndarray)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + decoder_hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the decoder at the output of each layer plus the initial embedding outputs. + decoder_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + cross_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + encoder_last_hidden_state (`jnp.ndarray` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the encoder at the output of each layer plus the initial embedding outputs. + encoder_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + """ + + logits: Optional[jnp.ndarray] = None + past_key_values: Optional[Tuple[Tuple[jnp.ndarray]]] = None + decoder_hidden_states: Optional[Tuple[jnp.ndarray]] = None + decoder_attentions: Optional[Tuple[jnp.ndarray]] = None + cross_attentions: Optional[Tuple[jnp.ndarray]] = None + encoder_last_hidden_state: Optional[jnp.ndarray] = None + encoder_hidden_states: Optional[Tuple[jnp.ndarray]] = None + encoder_attentions: Optional[Tuple[jnp.ndarray]] = None + + +@flax.struct.dataclass +class FlaxNextSentencePredictorOutput(ModelOutput): + """ + Base class for outputs of models predicting if two sentences are consecutive or not. + + Args: + logits (`jnp.ndarray` of shape `(batch_size, 2)`): + Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation + before SoftMax). + hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + logits: Optional[jnp.ndarray] = None + hidden_states: Optional[Tuple[jnp.ndarray]] = None + attentions: Optional[Tuple[jnp.ndarray]] = None + + +@flax.struct.dataclass +class FlaxSequenceClassifierOutput(ModelOutput): + """ + Base class for outputs of sentence classification models. + + Args: + logits (`jnp.ndarray` of shape `(batch_size, config.num_labels)`): + Classification (or regression if config.num_labels==1) scores (before SoftMax). + hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + logits: Optional[jnp.ndarray] = None + hidden_states: Optional[Tuple[jnp.ndarray]] = None + attentions: Optional[Tuple[jnp.ndarray]] = None + + +@flax.struct.dataclass +class FlaxSeq2SeqSequenceClassifierOutput(ModelOutput): + """ + Base class for outputs of sequence-to-sequence sentence classification models. + + Args: + logits (`jnp.ndarray` of shape `(batch_size, config.num_labels)`): + Classification (or regression if config.num_labels==1) scores (before SoftMax). + past_key_values (`tuple(tuple(jnp.ndarray))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(jnp.ndarray)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + decoder_hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the decoder at the output of each layer plus the initial embedding outputs. + decoder_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + cross_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + encoder_last_hidden_state (`jnp.ndarray` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the encoder at the output of each layer plus the initial embedding outputs. + encoder_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + """ + + logits: Optional[jnp.ndarray] = None + past_key_values: Optional[Tuple[Tuple[jnp.ndarray]]] = None + decoder_hidden_states: Optional[Tuple[jnp.ndarray]] = None + decoder_attentions: Optional[Tuple[jnp.ndarray]] = None + cross_attentions: Optional[Tuple[jnp.ndarray]] = None + encoder_last_hidden_state: Optional[jnp.ndarray] = None + encoder_hidden_states: Optional[Tuple[jnp.ndarray]] = None + encoder_attentions: Optional[Tuple[jnp.ndarray]] = None + + +@flax.struct.dataclass +class FlaxMultipleChoiceModelOutput(ModelOutput): + """ + Base class for outputs of multiple choice models. + + Args: + logits (`jnp.ndarray` of shape `(batch_size, num_choices)`): + *num_choices* is the second dimension of the input tensors. (see *input_ids* above). + + Classification scores (before SoftMax). + hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + logits: Optional[jnp.ndarray] = None + hidden_states: Optional[Tuple[jnp.ndarray]] = None + attentions: Optional[Tuple[jnp.ndarray]] = None + + +@flax.struct.dataclass +class FlaxTokenClassifierOutput(ModelOutput): + """ + Base class for outputs of token classification models. + + Args: + logits (`jnp.ndarray` of shape `(batch_size, sequence_length, config.num_labels)`): + Classification scores (before SoftMax). + hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + logits: Optional[jnp.ndarray] = None + hidden_states: Optional[Tuple[jnp.ndarray]] = None + attentions: Optional[Tuple[jnp.ndarray]] = None + + +@flax.struct.dataclass +class FlaxQuestionAnsweringModelOutput(ModelOutput): + """ + Base class for outputs of question answering models. + + Args: + start_logits (`jnp.ndarray` of shape `(batch_size, sequence_length)`): + Span-start scores (before SoftMax). + end_logits (`jnp.ndarray` of shape `(batch_size, sequence_length)`): + Span-end scores (before SoftMax). + hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + start_logits: Optional[jnp.ndarray] = None + end_logits: Optional[jnp.ndarray] = None + hidden_states: Optional[Tuple[jnp.ndarray]] = None + attentions: Optional[Tuple[jnp.ndarray]] = None + + +@flax.struct.dataclass +class FlaxSeq2SeqQuestionAnsweringModelOutput(ModelOutput): + """ + Base class for outputs of sequence-to-sequence question answering models. + + Args: + start_logits (`jnp.ndarray` of shape `(batch_size, sequence_length)`): + Span-start scores (before SoftMax). + end_logits (`jnp.ndarray` of shape `(batch_size, sequence_length)`): + Span-end scores (before SoftMax). + past_key_values (`tuple(tuple(jnp.ndarray))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(jnp.ndarray)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + decoder_hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the decoder at the output of each layer plus the initial embedding outputs. + decoder_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + cross_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + encoder_last_hidden_state (`jnp.ndarray` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the encoder at the output of each layer plus the initial embedding outputs. + encoder_attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + """ + + start_logits: Optional[jnp.ndarray] = None + end_logits: Optional[jnp.ndarray] = None + past_key_values: Optional[Tuple[Tuple[jnp.ndarray]]] = None + decoder_hidden_states: Optional[Tuple[jnp.ndarray]] = None + decoder_attentions: Optional[Tuple[jnp.ndarray]] = None + cross_attentions: Optional[Tuple[jnp.ndarray]] = None + encoder_last_hidden_state: Optional[jnp.ndarray] = None + encoder_hidden_states: Optional[Tuple[jnp.ndarray]] = None + encoder_attentions: Optional[Tuple[jnp.ndarray]] = None diff --git a/docs/transformers/build/lib/transformers/modeling_flax_pytorch_utils.py b/docs/transformers/build/lib/transformers/modeling_flax_pytorch_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..072850657725fb41e54eb33d0a7ad9d489968b00 --- /dev/null +++ b/docs/transformers/build/lib/transformers/modeling_flax_pytorch_utils.py @@ -0,0 +1,490 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch - Flax general utilities.""" + +import os +from pickle import UnpicklingError +from typing import Dict, Tuple + +import jax +import jax.numpy as jnp +import numpy as np +from flax.serialization import from_bytes +from flax.traverse_util import flatten_dict, unflatten_dict + +import transformers + +from . import is_safetensors_available, is_torch_available +from .utils import logging + + +if is_torch_available(): + import torch + +if is_safetensors_available(): + from safetensors import safe_open + from safetensors.flax import load_file as safe_load_file + + +logger = logging.get_logger(__name__) + + +##################### +# PyTorch => Flax # +##################### + + +def load_pytorch_checkpoint_in_flax_state_dict( + flax_model, pytorch_checkpoint_path, is_sharded, allow_missing_keys=False +): + """Load pytorch checkpoints in a flax model""" + + if not is_sharded: + pt_path = os.path.abspath(pytorch_checkpoint_path) + logger.info(f"Loading PyTorch weights from {pt_path}") + + if pt_path.endswith(".safetensors"): + pt_state_dict = {} + with safe_open(pt_path, framework="flax") as f: + for k in f.keys(): + pt_state_dict[k] = f.get_tensor(k) + else: + try: + import torch # noqa: F401 + except (ImportError, ModuleNotFoundError): + logger.error( + "Loading a PyTorch model in Flax, requires both PyTorch and Flax to be installed. Please see" + " https://pytorch.org/ and https://flax.readthedocs.io/en/latest/installation.html for installation" + " instructions." + ) + raise + + pt_state_dict = torch.load(pt_path, map_location="cpu", weights_only=True) + logger.info(f"PyTorch checkpoint contains {sum(t.numel() for t in pt_state_dict.values()):,} parameters.") + + flax_state_dict = convert_pytorch_state_dict_to_flax(pt_state_dict, flax_model) + else: + # model is sharded and pytorch_checkpoint_path already contains the list of .pt shard files + flax_state_dict = convert_pytorch_sharded_state_dict_to_flax(pytorch_checkpoint_path, flax_model) + return flax_state_dict + + +def rename_key_and_reshape_tensor( + pt_tuple_key: Tuple[str], + pt_tensor: np.ndarray, + random_flax_state_dict: Dict[str, jnp.ndarray], + model_prefix: str, +) -> (Tuple[str], np.ndarray): + """Rename PT weight names to corresponding Flax weight names and reshape tensor if necessary""" + + def is_key_or_prefix_key_in_dict(key: Tuple[str]) -> bool: + """Checks if `key` of `(prefix,) + key` is in random_flax_state_dict""" + return len(set(random_flax_state_dict) & {key, (model_prefix,) + key}) > 0 + + # layer norm + renamed_pt_tuple_key = pt_tuple_key[:-1] + ("scale",) + if pt_tuple_key[-1] in ["weight", "gamma"] and is_key_or_prefix_key_in_dict(renamed_pt_tuple_key): + return renamed_pt_tuple_key, pt_tensor + + # batch norm layer mean + renamed_pt_tuple_key = pt_tuple_key[:-1] + ("mean",) + if pt_tuple_key[-1] == "running_mean" and not is_key_or_prefix_key_in_dict(pt_tuple_key): + return renamed_pt_tuple_key, pt_tensor + + # batch norm layer var + renamed_pt_tuple_key = pt_tuple_key[:-1] + ("var",) + if pt_tuple_key[-1] == "running_var" and not is_key_or_prefix_key_in_dict(pt_tuple_key): + return renamed_pt_tuple_key, pt_tensor + + # embedding + renamed_pt_tuple_key = pt_tuple_key[:-1] + ("embedding",) + if pt_tuple_key[-1] == "weight" and is_key_or_prefix_key_in_dict(renamed_pt_tuple_key): + return renamed_pt_tuple_key, pt_tensor + + # conv layer + renamed_pt_tuple_key = pt_tuple_key[:-1] + ("kernel",) + if pt_tuple_key[-1] == "weight" and pt_tensor.ndim == 4 and not is_key_or_prefix_key_in_dict(pt_tuple_key): + pt_tensor = pt_tensor.transpose(2, 3, 1, 0) + return renamed_pt_tuple_key, pt_tensor + + # linear layer + renamed_pt_tuple_key = pt_tuple_key[:-1] + ("kernel",) + if pt_tuple_key[-1] == "weight" and not is_key_or_prefix_key_in_dict(pt_tuple_key): + pt_tensor = pt_tensor.T + return renamed_pt_tuple_key, pt_tensor + + # old PyTorch layer norm weight + renamed_pt_tuple_key = pt_tuple_key[:-1] + ("weight",) + if pt_tuple_key[-1] == "gamma": + return renamed_pt_tuple_key, pt_tensor + + # old PyTorch layer norm bias + renamed_pt_tuple_key = pt_tuple_key[:-1] + ("bias",) + if pt_tuple_key[-1] == "beta": + return renamed_pt_tuple_key, pt_tensor + + # New `weight_norm` from https://github.com/huggingface/transformers/pull/24030 + name = None + if pt_tuple_key[-3::2] == ("parametrizations", "original0"): + name = pt_tuple_key[-2] + "_g" + elif pt_tuple_key[-3::2] == ("parametrizations", "original1"): + name = pt_tuple_key[-2] + "_v" + if name is not None: + renamed_pt_tuple_key = pt_tuple_key[:-3] + (name,) + return renamed_pt_tuple_key, pt_tensor + + return pt_tuple_key, pt_tensor + + +def convert_pytorch_state_dict_to_flax(pt_state_dict, flax_model): + # convert pytorch tensor to numpy + from_bin = is_torch_available() and isinstance(next(iter(pt_state_dict.values())), torch.Tensor) + bfloat16 = torch.bfloat16 if from_bin else "bfloat16" + + weight_dtypes = {k: v.dtype for k, v in pt_state_dict.items()} + + if from_bin: + for k, v in pt_state_dict.items(): + # numpy currently does not support bfloat16, need to go over float32 in this case to not lose precision + if v.dtype == bfloat16: + v = v.float() + pt_state_dict[k] = v.cpu().numpy() + + model_prefix = flax_model.base_model_prefix + + # use params dict if the model contains batch norm layers + if "params" in flax_model.params: + flax_model_params = flax_model.params["params"] + else: + flax_model_params = flax_model.params + random_flax_state_dict = flatten_dict(flax_model_params) + + # add batch_stats keys,values to dict + if "batch_stats" in flax_model.params: + flax_batch_stats = flatten_dict(flax_model.params["batch_stats"]) + random_flax_state_dict.update(flax_batch_stats) + + flax_state_dict = {} + + load_model_with_head_into_base_model = (model_prefix not in flax_model_params) and ( + model_prefix in {k.split(".")[0] for k in pt_state_dict.keys()} + ) + load_base_model_into_model_with_head = (model_prefix in flax_model_params) and ( + model_prefix not in {k.split(".")[0] for k in pt_state_dict.keys()} + ) + + # Need to change some parameters name to match Flax names + for pt_key, pt_tensor in pt_state_dict.items(): + pt_tuple_key = tuple(pt_key.split(".")) + is_bfloat_16 = weight_dtypes[pt_key] == bfloat16 + + # remove base model prefix if necessary + has_base_model_prefix = pt_tuple_key[0] == model_prefix + if load_model_with_head_into_base_model and has_base_model_prefix: + pt_tuple_key = pt_tuple_key[1:] + + # Correctly rename weight parameters + flax_key, flax_tensor = rename_key_and_reshape_tensor( + pt_tuple_key, pt_tensor, random_flax_state_dict, model_prefix + ) + + # add model prefix if necessary + require_base_model_prefix = (model_prefix,) + flax_key in random_flax_state_dict + if load_base_model_into_model_with_head and require_base_model_prefix: + flax_key = (model_prefix,) + flax_key + + if flax_key in random_flax_state_dict: + if flax_tensor.shape != random_flax_state_dict[flax_key].shape: + raise ValueError( + f"PyTorch checkpoint seems to be incorrect. Weight {pt_key} was expected to be of shape " + f"{random_flax_state_dict[flax_key].shape}, but is {flax_tensor.shape}." + ) + + # add batch stats if the model contains batchnorm layers + if "batch_stats" in flax_model.params: + if "mean" in flax_key[-1] or "var" in flax_key[-1]: + flax_state_dict[("batch_stats",) + flax_key] = jnp.asarray(flax_tensor) + continue + # remove num_batches_tracked key + if "num_batches_tracked" in flax_key[-1]: + flax_state_dict.pop(flax_key, None) + continue + + # also add unexpected weight so that warning is thrown + flax_state_dict[("params",) + flax_key] = ( + jnp.asarray(flax_tensor) if not is_bfloat_16 else jnp.asarray(flax_tensor, dtype=jnp.bfloat16) + ) + else: + # also add unexpected weight so that warning is thrown + flax_state_dict[flax_key] = ( + jnp.asarray(flax_tensor) if not is_bfloat_16 else jnp.asarray(flax_tensor, dtype=jnp.bfloat16) + ) + + return unflatten_dict(flax_state_dict) + + +############################ +# Sharded Pytorch => Flax # +############################ + + +def convert_pytorch_sharded_state_dict_to_flax(shard_filenames, flax_model): + import torch + + # Load the index + flax_state_dict = {} + for shard_file in shard_filenames: + # load using msgpack utils + pt_state_dict = torch.load(shard_file, weights_only=True) + weight_dtypes = {k: v.dtype for k, v in pt_state_dict.items()} + pt_state_dict = { + k: v.numpy() if v.dtype != torch.bfloat16 else v.float().numpy() for k, v in pt_state_dict.items() + } + + model_prefix = flax_model.base_model_prefix + + # use params dict if the model contains batch norm layers and then add batch_stats keys,values to dict + if "batch_stats" in flax_model.params: + flax_model_params = flax_model.params["params"] + + random_flax_state_dict = flatten_dict(flax_model_params) + random_flax_state_dict.update(flatten_dict(flax_model.params["batch_stats"])) + else: + flax_model_params = flax_model.params + random_flax_state_dict = flatten_dict(flax_model_params) + + load_model_with_head_into_base_model = (model_prefix not in flax_model_params) and ( + model_prefix in {k.split(".")[0] for k in pt_state_dict.keys()} + ) + load_base_model_into_model_with_head = (model_prefix in flax_model_params) and ( + model_prefix not in {k.split(".")[0] for k in pt_state_dict.keys()} + ) + # Need to change some parameters name to match Flax names + for pt_key, pt_tensor in pt_state_dict.items(): + pt_tuple_key = tuple(pt_key.split(".")) + is_bfloat_16 = weight_dtypes[pt_key] == torch.bfloat16 + + # remove base model prefix if necessary + has_base_model_prefix = pt_tuple_key[0] == model_prefix + if load_model_with_head_into_base_model and has_base_model_prefix: + pt_tuple_key = pt_tuple_key[1:] + + # Correctly rename weight parameters + flax_key, flax_tensor = rename_key_and_reshape_tensor( + pt_tuple_key, pt_tensor, random_flax_state_dict, model_prefix + ) + # add model prefix if necessary + require_base_model_prefix = (model_prefix,) + flax_key in random_flax_state_dict + if load_base_model_into_model_with_head and require_base_model_prefix: + flax_key = (model_prefix,) + flax_key + + if flax_key in random_flax_state_dict: + if flax_tensor.shape != random_flax_state_dict[flax_key].shape: + raise ValueError( + f"PyTorch checkpoint seems to be incorrect. Weight {pt_key} was expected to be of shape " + f"{random_flax_state_dict[flax_key].shape}, but is {flax_tensor.shape}." + ) + + # add batch stats if the model contains batchnorm layers + if "batch_stats" in flax_model.params: + if "mean" in flax_key[-1]: + flax_state_dict[("batch_stats",) + flax_key] = jnp.asarray(flax_tensor) + continue + if "var" in flax_key[-1]: + flax_state_dict[("batch_stats",) + flax_key] = jnp.asarray(flax_tensor) + continue + # remove num_batches_tracked key + if "num_batches_tracked" in flax_key[-1]: + flax_state_dict.pop(flax_key, None) + continue + + # also add unexpected weight so that warning is thrown + flax_state_dict[("params",) + flax_key] = ( + jnp.asarray(flax_tensor) if not is_bfloat_16 else jnp.asarray(flax_tensor, dtype=jnp.bfloat16) + ) + + else: + # also add unexpected weight so that warning is thrown + flax_state_dict[flax_key] = ( + jnp.asarray(flax_tensor) if not is_bfloat_16 else jnp.asarray(flax_tensor, dtype=jnp.bfloat16) + ) + return unflatten_dict(flax_state_dict) + + +##################### +# Flax => PyTorch # +##################### + + +def load_flax_checkpoint_in_pytorch_model(model, flax_checkpoint_path): + """Load flax checkpoints in a PyTorch model""" + flax_checkpoint_path = os.path.abspath(flax_checkpoint_path) + logger.info(f"Loading Flax weights from {flax_checkpoint_path}") + + # import correct flax class + flax_cls = getattr(transformers, "Flax" + model.__class__.__name__) + + # load flax weight dict + if flax_checkpoint_path.endswith(".safetensors"): + flax_state_dict = safe_load_file(flax_checkpoint_path) + flax_state_dict = unflatten_dict(flax_state_dict, sep=".") + else: + with open(flax_checkpoint_path, "rb") as state_f: + try: + flax_state_dict = from_bytes(flax_cls, state_f.read()) + except UnpicklingError: + raise EnvironmentError(f"Unable to convert {flax_checkpoint_path} to Flax deserializable object. ") + + return load_flax_weights_in_pytorch_model(model, flax_state_dict) + + +def load_flax_weights_in_pytorch_model(pt_model, flax_state): + """Load flax checkpoints in a PyTorch model""" + + try: + import torch # noqa: F401 + except (ImportError, ModuleNotFoundError): + logger.error( + "Loading a Flax weights in PyTorch, requires both PyTorch and Flax to be installed. Please see" + " https://pytorch.org/ and https://flax.readthedocs.io/en/latest/installation.html for installation" + " instructions." + ) + raise + + # check if we have bf16 weights + is_type_bf16 = flatten_dict(jax.tree_util.tree_map(lambda x: x.dtype == jnp.bfloat16, flax_state)).values() + if any(is_type_bf16): + # convert all weights to fp32 if the are bf16 since torch.from_numpy can-not handle bf16 + # and bf16 is not fully supported in PT yet. + logger.warning( + "Found ``bfloat16`` weights in Flax model. Casting all ``bfloat16`` weights to ``float32`` " + "before loading those in PyTorch model." + ) + flax_state = jax.tree_util.tree_map( + lambda params: params.astype(np.float32) if params.dtype == jnp.bfloat16 else params, flax_state + ) + + flax_state_dict = flatten_dict(flax_state) + pt_model_dict = pt_model.state_dict() + + load_model_with_head_into_base_model = (pt_model.base_model_prefix in flax_state) and ( + pt_model.base_model_prefix not in {k.split(".")[0] for k in pt_model_dict.keys()} + ) + load_base_model_into_model_with_head = (pt_model.base_model_prefix not in flax_state) and ( + pt_model.base_model_prefix in {k.split(".")[0] for k in pt_model_dict.keys()} + ) + + # keep track of unexpected & missing keys + unexpected_keys = [] + missing_keys = set(pt_model_dict.keys()) + + for flax_key_tuple, flax_tensor in flax_state_dict.items(): + has_base_model_prefix = flax_key_tuple[0] == pt_model.base_model_prefix + require_base_model_prefix = ".".join((pt_model.base_model_prefix,) + flax_key_tuple) in pt_model_dict + + # adapt flax_key to prepare for loading from/to base model only + if load_model_with_head_into_base_model and has_base_model_prefix: + flax_key_tuple = flax_key_tuple[1:] + elif load_base_model_into_model_with_head and require_base_model_prefix: + flax_key_tuple = (pt_model.base_model_prefix,) + flax_key_tuple + + # rename flax weights to PyTorch format + if flax_key_tuple[-1] == "kernel" and flax_tensor.ndim == 4 and ".".join(flax_key_tuple) not in pt_model_dict: + # conv layer + flax_key_tuple = flax_key_tuple[:-1] + ("weight",) + flax_tensor = jnp.transpose(flax_tensor, (3, 2, 0, 1)) + elif flax_key_tuple[-1] == "kernel" and ".".join(flax_key_tuple) not in pt_model_dict: + # linear layer + flax_key_tuple = flax_key_tuple[:-1] + ("weight",) + flax_tensor = flax_tensor.T + elif flax_key_tuple[-1] in ["scale", "embedding"]: + flax_key_tuple = flax_key_tuple[:-1] + ("weight",) + + # adding batch stats from flax batch norm to pt + elif "mean" in flax_key_tuple[-1]: + flax_key_tuple = flax_key_tuple[:-1] + ("running_mean",) + elif "var" in flax_key_tuple[-1]: + flax_key_tuple = flax_key_tuple[:-1] + ("running_var",) + + if "batch_stats" in flax_state: + flax_key = ".".join(flax_key_tuple[1:]) # Remove the params/batch_stats header + else: + flax_key = ".".join(flax_key_tuple) + + # We also need to look at `pt_model_dict` and see if there are keys requiring further transformation. + special_pt_names = {} + # New `weight_norm` from https://github.com/huggingface/transformers/pull/24030 + for key in pt_model_dict: + key_components = key.split(".") + name = None + if key_components[-3::2] == ["parametrizations", "original0"]: + name = key_components[-2] + "_g" + elif key_components[-3::2] == ["parametrizations", "original1"]: + name = key_components[-2] + "_v" + if name is not None: + key_components = key_components[:-3] + [name] + key_to_check = ".".join(key_components) + special_pt_names[key_to_check] = key + + if flax_key in special_pt_names: + flax_key = special_pt_names[flax_key] + + if flax_key in pt_model_dict: + if flax_tensor.shape != pt_model_dict[flax_key].shape: + raise ValueError( + f"Flax checkpoint seems to be incorrect. Weight {flax_key_tuple} was expected " + f"to be of shape {pt_model_dict[flax_key].shape}, but is {flax_tensor.shape}." + ) + else: + # add weight to pytorch dict + flax_tensor = np.asarray(flax_tensor) if not isinstance(flax_tensor, np.ndarray) else flax_tensor + pt_model_dict[flax_key] = torch.from_numpy(flax_tensor) + # remove from missing keys + missing_keys.remove(flax_key) + else: + # weight is not expected by PyTorch model + unexpected_keys.append(flax_key) + + pt_model.load_state_dict(pt_model_dict) + + # re-transform missing_keys to list + missing_keys = list(missing_keys) + + if len(unexpected_keys) > 0: + logger.warning( + "Some weights of the Flax model were not used when initializing the PyTorch model" + f" {pt_model.__class__.__name__}: {unexpected_keys}\n- This IS expected if you are initializing" + f" {pt_model.__class__.__name__} from a Flax model trained on another task or with another architecture" + " (e.g. initializing a BertForSequenceClassification model from a FlaxBertForPreTraining model).\n- This" + f" IS NOT expected if you are initializing {pt_model.__class__.__name__} from a Flax model that you expect" + " to be exactly identical (e.g. initializing a BertForSequenceClassification model from a" + " FlaxBertForSequenceClassification model)." + ) + else: + logger.warning(f"All Flax model weights were used when initializing {pt_model.__class__.__name__}.\n") + if len(missing_keys) > 0: + logger.warning( + f"Some weights of {pt_model.__class__.__name__} were not initialized from the Flax model and are newly" + f" initialized: {missing_keys}\nYou should probably TRAIN this model on a down-stream task to be able to" + " use it for predictions and inference." + ) + else: + logger.warning( + f"All the weights of {pt_model.__class__.__name__} were initialized from the Flax model.\n" + "If your task is similar to the task the model of the checkpoint was trained on, " + f"you can already use {pt_model.__class__.__name__} for predictions without further training." + ) + + return pt_model diff --git a/docs/transformers/build/lib/transformers/modeling_flax_utils.py b/docs/transformers/build/lib/transformers/modeling_flax_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..c775ee85bbefb6a70481863bb8c546ef3efa43d3 --- /dev/null +++ b/docs/transformers/build/lib/transformers/modeling_flax_utils.py @@ -0,0 +1,1273 @@ +# coding=utf-8 +# Copyright 2021 The Google Flax Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import gc +import json +import os +import warnings +from functools import partial +from pickle import UnpicklingError +from typing import Any, Dict, Optional, Set, Tuple, Union + +import flax.linen as nn +import jax +import jax.numpy as jnp +import msgpack.exceptions +from flax.core.frozen_dict import FrozenDict, unfreeze +from flax.serialization import from_bytes, to_bytes +from flax.traverse_util import flatten_dict, unflatten_dict +from jax.random import PRNGKey + +from .configuration_utils import PretrainedConfig +from .dynamic_module_utils import custom_object_save +from .generation import FlaxGenerationMixin, GenerationConfig +from .modeling_flax_pytorch_utils import load_pytorch_checkpoint_in_flax_state_dict +from .utils import ( + FLAX_WEIGHTS_INDEX_NAME, + FLAX_WEIGHTS_NAME, + SAFE_WEIGHTS_INDEX_NAME, + SAFE_WEIGHTS_NAME, + WEIGHTS_INDEX_NAME, + WEIGHTS_NAME, + PushToHubMixin, + add_code_sample_docstrings, + add_start_docstrings_to_model_forward, + cached_file, + copy_func, + download_url, + has_file, + is_offline_mode, + is_remote_url, + logging, + replace_return_docstrings, +) +from .utils.hub import convert_file_size_to_int, get_checkpoint_shard_files +from .utils.import_utils import is_safetensors_available + + +if is_safetensors_available(): + from safetensors import safe_open + from safetensors.flax import load_file as safe_load_file + from safetensors.flax import save_file as safe_save_file + +logger = logging.get_logger(__name__) + + +def quick_gelu(x): + return x * jax.nn.sigmoid(1.702 * x) + + +ACT2FN = { + "gelu": partial(nn.gelu, approximate=False), + "relu": nn.relu, + "silu": nn.swish, + "swish": nn.swish, + "gelu_new": partial(nn.gelu, approximate=True), + "quick_gelu": quick_gelu, + "gelu_pytorch_tanh": partial(nn.gelu, approximate=True), + "tanh": nn.tanh, +} + + +def flax_shard_checkpoint(params, max_shard_size="10GB"): + """ + Splits a model state dictionary in sub-checkpoints so that the final size of each sub-checkpoint does not exceed a + given size. The sub-checkpoints are determined by iterating through the `state_dict` in the order of its keys, so + there is no optimization made to make each sub-checkpoint as close as possible to the maximum size passed. For + example, if the limit is 10GB and we have weights of sizes [6GB, 6GB, 2GB, 6GB, 2GB, 2GB] they will get sharded as + [6GB], [6+2GB], [6+2+2GB] and not [6+2+2GB], [6+2GB], [6GB]. + + + + If one of the model's weight is bigger that `max_shard_size`, it will end up in its own sub-checkpoint which will + have a size greater than `max_shard_size`. + + + + Args: + params (`Union[Dict, FrozenDict]`): A `PyTree` of model parameters. + max_shard_size (`int` or `str`, *optional*, defaults to `"10GB"`): + The maximum size of each sub-checkpoint. If expressed as a string, needs to be digits followed by a unit + (like `"5MB"`). + """ + max_shard_size = convert_file_size_to_int(max_shard_size) + + sharded_state_dicts = [] + current_block = {} + current_block_size = 0 + total_size = 0 + + # flatten the weights to chunk + weights = flatten_dict(params, sep="/") + for item in weights: + weight_size = weights[item].size * weights[item].dtype.itemsize + + # If this weight is going to tip up over the maximal size, we split. + if current_block_size + weight_size > max_shard_size: + sharded_state_dicts.append(current_block) + current_block = {} + current_block_size = 0 + + current_block[item] = weights[item] + current_block_size += weight_size + total_size += weight_size + + # Add the last block + sharded_state_dicts.append(current_block) + + # If we only have one shard, we return it + if len(sharded_state_dicts) == 1: + return {FLAX_WEIGHTS_NAME: sharded_state_dicts[0]}, None + + # Otherwise, let's build the index + weight_map = {} + shards = {} + for idx, shard in enumerate(sharded_state_dicts): + shard_file = FLAX_WEIGHTS_NAME.replace(".msgpack", f"-{idx + 1:05d}-of-{len(sharded_state_dicts):05d}.msgpack") + shards[shard_file] = shard + for weight_name in shard.keys(): + weight_map[weight_name] = shard_file + + # Add the metadata + metadata = {"total_size": total_size} + index = {"metadata": metadata, "weight_map": weight_map} + return shards, index + + +class FlaxPreTrainedModel(PushToHubMixin, FlaxGenerationMixin): + r""" + Base class for all models. + + [`FlaxPreTrainedModel`] takes care of storing the configuration of the models and handles methods for loading, + downloading and saving models. + + Class attributes (overridden by derived classes): + + - **config_class** ([`PretrainedConfig`]) -- A subclass of [`PretrainedConfig`] to use as configuration class + for this model architecture. + - **base_model_prefix** (`str`) -- A string indicating the attribute associated to the base model in derived + classes of the same architecture adding modules on top of the base model. + - **main_input_name** (`str`) -- The name of the principal input to the model (often `input_ids` for NLP + models, `pixel_values` for vision models and `input_values` for speech models). + """ + + config_class = None + base_model_prefix = "" + main_input_name = "input_ids" + _auto_class = None + _missing_keys = set() + + def __init__( + self, + config: PretrainedConfig, + module: nn.Module, + input_shape: Tuple = (1, 1), + seed: int = 0, + dtype: jnp.dtype = jnp.float32, + _do_init: bool = True, + ): + if config is None: + raise ValueError("config cannot be None") + + if module is None: + raise ValueError("module cannot be None") + + # Those are private to be exposed as typed property on derived classes. + self._config = config + self._module = module + + # Those are public as their type is generic to every derived classes. + self.key = PRNGKey(seed) + self.dtype = dtype + self.input_shape = input_shape + self.generation_config = GenerationConfig.from_model_config(config) if self.can_generate() else None + + # To check if the model was initialized automatically. + self._is_initialized = _do_init + + if _do_init: + # randomly initialized parameters + random_params = self.init_weights(self.key, input_shape) + params_shape_tree = jax.eval_shape(lambda params: params, random_params) + else: + init_fn = partial(self.init_weights, input_shape=input_shape) + params_shape_tree = jax.eval_shape(init_fn, self.key) + + logger.info( + "Model weights are not initialized as `_do_init` is set to `False`. " + f"Make sure to call `{self.__class__.__name__}.init_weights` manually to initialize the weights." + ) + + # get the shape of the parameters + self._params_shape_tree = params_shape_tree + + # save required_params as set + self._required_params = set(flatten_dict(unfreeze(params_shape_tree)).keys()) + + # initialize the parameters + if _do_init: + self.params = random_params + + def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> Dict: + raise NotImplementedError(f"init method has to be implemented for {self}") + + def enable_gradient_checkpointing(self): + raise NotImplementedError(f"gradient checkpointing method has to be implemented for {self}") + + @classmethod + def _from_config(cls, config, **kwargs): + """ + All context managers that the model should be initialized under go here. + """ + return cls(config, **kwargs) + + @property + def framework(self) -> str: + """ + :str: Identifies that this is a Flax model. + """ + return "flax" + + @property + def config(self) -> PretrainedConfig: + return self._config + + @property + def module(self) -> nn.Module: + return self._module + + @property + def params(self) -> Union[Dict, FrozenDict]: + if not self._is_initialized: + raise ValueError( + "`params` cannot be accessed from model when the model is created with `_do_init=False`. " + "You must call `init_weights` manually and store the params outside of the model and " + "pass it explicitly where needed." + ) + return self._params + + @property + def required_params(self) -> Set: + return self._required_params + + @property + def params_shape_tree(self) -> Dict: + return self._params_shape_tree + + @params.setter + def params(self, params: Union[Dict, FrozenDict]): + # don't set params if the model is not initialized + if not self._is_initialized: + raise ValueError( + "`params` cannot be set from model when the model is created with `_do_init=False`. " + "You store the params outside of the model." + ) + + if isinstance(params, FrozenDict): + params = unfreeze(params) + param_keys = set(flatten_dict(params).keys()) + if len(self.required_params - param_keys) > 0: + raise ValueError( + "Some parameters are missing. Make sure that `params` include the following " + f"parameters {self.required_params - param_keys}" + ) + self._params = params + + def _cast_floating_to(self, params: Union[Dict, FrozenDict], dtype: jnp.dtype, mask: Any = None) -> Any: + """ + Helper method to cast floating-point values of given parameter `PyTree` to given `dtype`. + """ + + # taken from https://github.com/deepmind/jmp/blob/3a8318abc3292be38582794dbf7b094e6583b192/jmp/_src/policy.py#L27 + def conditional_cast(param): + if isinstance(param, jnp.ndarray) and jnp.issubdtype(param.dtype, jnp.floating): + param = param.astype(dtype) + return param + + if mask is None: + return jax.tree_util.tree_map(conditional_cast, params) + + flat_params = flatten_dict(params) + flat_mask, _ = jax.tree_util.tree_flatten(mask) + + for masked, key in zip(flat_mask, sorted(flat_params.keys())): + if masked: + flat_params[key] = conditional_cast(flat_params[key]) + + return unflatten_dict(flat_params) + + def to_bf16(self, params: Union[Dict, FrozenDict], mask: Any = None): + r""" + Cast the floating-point `params` to `jax.numpy.bfloat16`. This returns a new `params` tree and does not cast + the `params` in place. + + This method can be used on TPU to explicitly convert the model parameters to bfloat16 precision to do full + half-precision training or to save weights in bfloat16 for inference in order to save memory and improve speed. + + Arguments: + params (`Union[Dict, FrozenDict]`): + A `PyTree` of model parameters. + mask (`Union[Dict, FrozenDict]`): + A `PyTree` with same structure as the `params` tree. The leaves should be booleans, `True` for params + you want to cast, and should be `False` for those you want to skip. + + Examples: + + ```python + >>> from transformers import FlaxBertModel + + >>> # load model + >>> model = FlaxBertModel.from_pretrained("google-bert/bert-base-cased") + >>> # By default, the model parameters will be in fp32 precision, to cast these to bfloat16 precision + >>> model.params = model.to_bf16(model.params) + >>> # If you want don't want to cast certain parameters (for example layer norm bias and scale) + >>> # then pass the mask as follows + >>> from flax import traverse_util + + >>> model = FlaxBertModel.from_pretrained("google-bert/bert-base-cased") + >>> flat_params = traverse_util.flatten_dict(model.params) + >>> mask = { + ... path: (path[-2] != ("LayerNorm", "bias") and path[-2:] != ("LayerNorm", "scale")) + ... for path in flat_params + ... } + >>> mask = traverse_util.unflatten_dict(mask) + >>> model.params = model.to_bf16(model.params, mask) + ```""" + return self._cast_floating_to(params, jnp.bfloat16, mask) + + def to_fp32(self, params: Union[Dict, FrozenDict], mask: Any = None): + r""" + Cast the floating-point `params` to `jax.numpy.float32`. This method can be used to explicitly convert the + model parameters to fp32 precision. This returns a new `params` tree and does not cast the `params` in place. + + Arguments: + params (`Union[Dict, FrozenDict]`): + A `PyTree` of model parameters. + mask (`Union[Dict, FrozenDict]`): + A `PyTree` with same structure as the `params` tree. The leaves should be booleans, `True` for params + you want to cast, and should be `False` for those you want to skip + + Examples: + + ```python + >>> from transformers import FlaxBertModel + + >>> # Download model and configuration from huggingface.co + >>> model = FlaxBertModel.from_pretrained("google-bert/bert-base-cased") + >>> # By default, the model params will be in fp32, to illustrate the use of this method, + >>> # we'll first cast to fp16 and back to fp32 + >>> model.params = model.to_f16(model.params) + >>> # now cast back to fp32 + >>> model.params = model.to_fp32(model.params) + ```""" + return self._cast_floating_to(params, jnp.float32, mask) + + def to_fp16(self, params: Union[Dict, FrozenDict], mask: Any = None): + r""" + Cast the floating-point `params` to `jax.numpy.float16`. This returns a new `params` tree and does not cast the + `params` in place. + + This method can be used on GPU to explicitly convert the model parameters to float16 precision to do full + half-precision training or to save weights in float16 for inference in order to save memory and improve speed. + + Arguments: + params (`Union[Dict, FrozenDict]`): + A `PyTree` of model parameters. + mask (`Union[Dict, FrozenDict]`): + A `PyTree` with same structure as the `params` tree. The leaves should be booleans, `True` for params + you want to cast, and should be `False` for those you want to skip + + Examples: + + ```python + >>> from transformers import FlaxBertModel + + >>> # load model + >>> model = FlaxBertModel.from_pretrained("google-bert/bert-base-cased") + >>> # By default, the model params will be in fp32, to cast these to float16 + >>> model.params = model.to_fp16(model.params) + >>> # If you want don't want to cast certain parameters (for example layer norm bias and scale) + >>> # then pass the mask as follows + >>> from flax import traverse_util + + >>> model = FlaxBertModel.from_pretrained("google-bert/bert-base-cased") + >>> flat_params = traverse_util.flatten_dict(model.params) + >>> mask = { + ... path: (path[-2] != ("LayerNorm", "bias") and path[-2:] != ("LayerNorm", "scale")) + ... for path in flat_params + ... } + >>> mask = traverse_util.unflatten_dict(mask) + >>> model.params = model.to_fp16(model.params, mask) + ```""" + return self._cast_floating_to(params, jnp.float16, mask) + + @classmethod + def load_flax_weights(cls, resolved_archive_file): + try: + if resolved_archive_file.endswith(".safetensors"): + state = safe_load_file(resolved_archive_file) + state = unflatten_dict(state, sep=".") + else: + with open(resolved_archive_file, "rb") as state_f: + state = from_bytes(cls, state_f.read()) + except (UnpicklingError, msgpack.exceptions.ExtraData) as e: + try: + with open(resolved_archive_file) as f: + if f.read().startswith("version"): + raise OSError( + "You seem to have cloned a repository without having git-lfs installed. Please" + " install git-lfs and run `git lfs install` followed by `git lfs pull` in the" + " folder you cloned." + ) + else: + raise ValueError from e + except (UnicodeDecodeError, ValueError): + raise EnvironmentError(f"Unable to convert {resolved_archive_file} to Flax deserializable object. ") + + return state + + @classmethod + def load_flax_sharded_weights(cls, shard_files): + """ + This is the same as [`flax.serialization.from_bytes`] + (https:lax.readthedocs.io/en/latest/_modules/flax/serialization.html#from_bytes) but for a sharded checkpoint. + + This load is performed efficiently: each checkpoint shard is loaded one by one in RAM and deleted after being + loaded in the model. + + Args: + shard_files (`List[str]`: + The list of shard files to load. + + Returns: + `Dict`: A nested dictionary of the model parameters, in the expected format for flax models : `{'model': + {'params': {'...'}}}`. + """ + + # Load the index + state_sharded_dict = {} + + for shard_file in shard_files: + # load using msgpack utils + try: + with open(shard_file, "rb") as state_f: + state = from_bytes(cls, state_f.read()) + except (UnpicklingError, msgpack.exceptions.ExtraData) as e: + with open(shard_file) as f: + if f.read().startswith("version"): + raise OSError( + "You seem to have cloned a repository without having git-lfs installed. Please" + " install git-lfs and run `git lfs install` followed by `git lfs pull` in the" + " folder you cloned." + ) + else: + raise ValueError from e + except (UnicodeDecodeError, ValueError): + raise EnvironmentError(f"Unable to convert {shard_file} to Flax deserializable object. ") + + state = flatten_dict(state, sep="/") + state_sharded_dict.update(state) + del state + gc.collect() + + # the state dict is unflattened to the match the format of model.params + return unflatten_dict(state_sharded_dict, sep="/") + + @classmethod + def can_generate(cls) -> bool: + """ + Returns whether this model can generate sequences with `.generate()`. Returns: + `bool`: Whether this model can generate sequences with `.generate()`. + """ + # Detects whether `prepare_inputs_for_generation` has been overwritten, which is a requirement for generation. + # Alternatively, the model can also have a custom `generate` function. + if "GenerationMixin" in str(cls.prepare_inputs_for_generation) and "GenerationMixin" in str(cls.generate): + return False + return True + + @classmethod + def from_pretrained( + cls, + pretrained_model_name_or_path: Union[str, os.PathLike], + dtype: jnp.dtype = jnp.float32, + *model_args, + config: Optional[Union[PretrainedConfig, str, os.PathLike]] = None, + cache_dir: Optional[Union[str, os.PathLike]] = None, + ignore_mismatched_sizes: bool = False, + force_download: bool = False, + local_files_only: bool = False, + token: Optional[Union[str, bool]] = None, + revision: str = "main", + **kwargs, + ): + r""" + Instantiate a pretrained flax model from a pre-trained model configuration. + + The warning *Weights from XXX not initialized from pretrained model* means that the weights of XXX do not come + pretrained with the rest of the model. It is up to you to train those weights with a downstream fine-tuning + task. + + The warning *Weights from XXX not used in YYY* means that the layer XXX is not used by YYY, therefore those + weights are discarded. + + Parameters: + pretrained_model_name_or_path (`str` or `os.PathLike`): + Can be either: + + - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co. + - A path to a *directory* containing model weights saved using + [`~FlaxPreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`. + - A path or url to a *pt index checkpoint file* (e.g, `./tf_model/model.ckpt.index`). In this case, + `from_pt` should be set to `True`. + dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`): + The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and + `jax.numpy.bfloat16` (on TPUs). + + This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If + specified all the computation will be performed with the given `dtype`. + + **Note that this only specifies the dtype of the computation and does not influence the dtype of model + parameters.** + + If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and + [`~FlaxPreTrainedModel.to_bf16`]. + model_args (sequence of positional arguments, *optional*): + All remaining positional arguments will be passed to the underlying model's `__init__` method. + config (`Union[PretrainedConfig, str, os.PathLike]`, *optional*): + Can be either: + + - an instance of a class derived from [`PretrainedConfig`], + - a string or path valid as input to [`~PretrainedConfig.from_pretrained`]. + + Configuration for the model to use instead of an automatically loaded configuration. Configuration can + be automatically loaded when: + + - The model is a model provided by the library (loaded with the *model id* string of a pretrained + model). + - The model was saved using [`~PreTrainedModel.save_pretrained`] and is reloaded by supplying the + save directory. + - The model is loaded by supplying a local directory as `pretrained_model_name_or_path` and a + configuration JSON file named *config.json* is found in the directory. + cache_dir (`Union[str, os.PathLike]`, *optional*): + Path to a directory in which a downloaded pretrained model configuration should be cached if the + standard cache should not be used. + from_pt (`bool`, *optional*, defaults to `False`): + Load the model weights from a PyTorch checkpoint save file (see docstring of + `pretrained_model_name_or_path` argument). + ignore_mismatched_sizes (`bool`, *optional*, defaults to `False`): + Whether or not to raise an error if some of the weights from the checkpoint do not have the same size + as the weights of the model (if for instance, you are instantiating a model with 10 labels from a + checkpoint with 3 labels). + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force the (re-)download of the model weights and configuration files, overriding the + cached versions if they exist. + resume_download: + Deprecated and ignored. All downloads are now resumed by default when possible. + Will be removed in v5 of Transformers. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. + local_files_only(`bool`, *optional*, defaults to `False`): + Whether or not to only look at local files (i.e., do not try to download the model). + token (`str` or `bool`, *optional*): + The token to use as HTTP bearer authorization for remote files. If `True`, or not specified, will use + the token generated when running `huggingface-cli login` (stored in `~/.huggingface`). + revision (`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a + git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any + identifier allowed by git. + + + + + To test a pull request you made on the Hub, you can pass `revision="refs/pr/"`. + + + + subfolder (`str`, *optional*, defaults to `""`): + In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can + specify the folder name here. + kwargs (remaining dictionary of keyword arguments, *optional*): + Can be used to update the configuration object (after it being loaded) and initiate the model (e.g., + `output_attentions=True`). Behaves differently depending on whether a `config` is provided or + automatically loaded: + + - If a configuration is provided with `config`, `**kwargs` will be directly passed to the + underlying model's `__init__` method (we assume all relevant updates to the configuration have + already been done) + - If a configuration is not provided, `kwargs` will be first passed to the configuration class + initialization function ([`~PretrainedConfig.from_pretrained`]). Each key of `kwargs` that + corresponds to a configuration attribute will be used to override said attribute with the + supplied `kwargs` value. Remaining keys that do not correspond to any configuration attribute + will be passed to the underlying model's `__init__` function. + + Examples: + + ```python + >>> from transformers import BertConfig, FlaxBertModel + + >>> # Download model and configuration from huggingface.co and cache. + >>> model = FlaxBertModel.from_pretrained("google-bert/bert-base-cased") + >>> # Model was saved using *save_pretrained('./test/saved_model/')* (for example purposes, not runnable). + >>> model = FlaxBertModel.from_pretrained("./test/saved_model/") + >>> # Loading from a PyTorch checkpoint file instead of a PyTorch model (slower, for example purposes, not runnable). + >>> config = BertConfig.from_json_file("./pt_model/config.json") + >>> model = FlaxBertModel.from_pretrained("./pt_model/pytorch_model.bin", from_pt=True, config=config) + ```""" + from_pt = kwargs.pop("from_pt", False) + resume_download = kwargs.pop("resume_download", None) + proxies = kwargs.pop("proxies", None) + use_auth_token = kwargs.pop("use_auth_token", None) + trust_remote_code = kwargs.pop("trust_remote_code", None) + from_pipeline = kwargs.pop("_from_pipeline", None) + from_auto_class = kwargs.pop("_from_auto", False) + _do_init = kwargs.pop("_do_init", True) + subfolder = kwargs.pop("subfolder", "") + commit_hash = kwargs.pop("_commit_hash", None) + + # Not relevant for Flax Models + _ = kwargs.pop("adapter_kwargs", None) + + if use_auth_token is not None: + warnings.warn( + "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.", + FutureWarning, + ) + if token is not None: + raise ValueError( + "`token` and `use_auth_token` are both specified. Please set only the argument `token`." + ) + token = use_auth_token + + if trust_remote_code is True: + logger.warning( + "The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is" + " ignored." + ) + + user_agent = {"file_type": "model", "framework": "flax", "from_auto_class": from_auto_class} + if from_pipeline is not None: + user_agent["using_pipeline"] = from_pipeline + + if is_offline_mode() and not local_files_only: + logger.info("Offline mode: forcing local_files_only=True") + local_files_only = True + + # Load config if we don't provide a configuration + if not isinstance(config, PretrainedConfig): + config_path = config if config is not None else pretrained_model_name_or_path + config, model_kwargs = cls.config_class.from_pretrained( + config_path, + cache_dir=cache_dir, + return_unused_kwargs=True, + force_download=force_download, + resume_download=resume_download, + proxies=proxies, + local_files_only=local_files_only, + token=token, + revision=revision, + subfolder=subfolder, + _from_auto=from_auto_class, + _from_pipeline=from_pipeline, + _commit_hash=commit_hash, + **kwargs, + ) + else: + model_kwargs = kwargs.copy() + + if commit_hash is None: + commit_hash = getattr(config, "_commit_hash", None) + + # Add the dtype to model_kwargs + model_kwargs["dtype"] = dtype + + # This variable will flag if we're loading a sharded checkpoint. In this case the archive file is just the + # index of the files. + is_sharded = False + + # Load model + if pretrained_model_name_or_path is not None: + pretrained_model_name_or_path = str(pretrained_model_name_or_path) + is_local = os.path.isdir(pretrained_model_name_or_path) + if os.path.isdir(pretrained_model_name_or_path): + if os.path.isfile(os.path.join(pretrained_model_name_or_path, subfolder, FLAX_WEIGHTS_NAME)): + # Load from a Flax checkpoint + archive_file = os.path.join(pretrained_model_name_or_path, subfolder, FLAX_WEIGHTS_NAME) + elif os.path.isfile(os.path.join(pretrained_model_name_or_path, subfolder, FLAX_WEIGHTS_INDEX_NAME)): + # Load from a sharded Flax checkpoint + archive_file = os.path.join(pretrained_model_name_or_path, subfolder, FLAX_WEIGHTS_INDEX_NAME) + is_sharded = True + elif is_safetensors_available() and os.path.isfile( + os.path.join(pretrained_model_name_or_path, SAFE_WEIGHTS_NAME) + ): + # Load from a safetensors checkpoint + archive_file = os.path.join(pretrained_model_name_or_path, SAFE_WEIGHTS_NAME) + elif from_pt and os.path.isfile(os.path.join(pretrained_model_name_or_path, subfolder, WEIGHTS_NAME)): + # Load from a PyTorch checkpoint + archive_file = os.path.join(pretrained_model_name_or_path, subfolder, WEIGHTS_NAME) + elif from_pt and os.path.isfile( + os.path.join(pretrained_model_name_or_path, subfolder, WEIGHTS_INDEX_NAME) + ): + # Load from a sharded pytorch checkpoint + archive_file = os.path.join(pretrained_model_name_or_path, subfolder, WEIGHTS_INDEX_NAME) + is_sharded = True + # At this stage we don't have a weight file so we will raise an error. + elif is_safetensors_available() and os.path.isfile( + os.path.join(pretrained_model_name_or_path, SAFE_WEIGHTS_INDEX_NAME) + ): + # Load from a sharded safetensors checkpoint + archive_file = os.path.join(pretrained_model_name_or_path, SAFE_WEIGHTS_INDEX_NAME) + is_sharded = True + raise NotImplementedError("Support for sharded checkpoints using safetensors is coming soon!") + elif os.path.isfile(os.path.join(pretrained_model_name_or_path, subfolder, WEIGHTS_NAME)): + raise EnvironmentError( + f"Error no file named {FLAX_WEIGHTS_NAME} found in directory {pretrained_model_name_or_path} " + "but there is a file for PyTorch weights. Use `from_pt=True` to load this model from those " + "weights." + ) + else: + raise EnvironmentError( + f"Error no file named {FLAX_WEIGHTS_NAME} or {WEIGHTS_NAME} found in directory " + f"{pretrained_model_name_or_path}." + ) + elif os.path.isfile(os.path.join(subfolder, pretrained_model_name_or_path)): + archive_file = pretrained_model_name_or_path + is_local = True + elif is_remote_url(pretrained_model_name_or_path): + filename = pretrained_model_name_or_path + resolved_archive_file = download_url(pretrained_model_name_or_path) + else: + if from_pt: + filename = WEIGHTS_NAME + else: + filename = FLAX_WEIGHTS_NAME + + try: + # Load from URL or cache if already cached + cached_file_kwargs = { + "cache_dir": cache_dir, + "force_download": force_download, + "proxies": proxies, + "resume_download": resume_download, + "local_files_only": local_files_only, + "token": token, + "user_agent": user_agent, + "revision": revision, + "subfolder": subfolder, + "_raise_exceptions_for_gated_repo": False, + "_raise_exceptions_for_missing_entries": False, + "_commit_hash": commit_hash, + } + resolved_archive_file = cached_file(pretrained_model_name_or_path, filename, **cached_file_kwargs) + + # Maybe the checkpoint is sharded, we try to grab the index name in this case. + if resolved_archive_file is None and filename == FLAX_WEIGHTS_NAME: + resolved_archive_file = cached_file( + pretrained_model_name_or_path, FLAX_WEIGHTS_INDEX_NAME, **cached_file_kwargs + ) + if resolved_archive_file is not None: + is_sharded = True + + # Maybe the checkpoint is pytorch sharded, we try to grab the pytorch index name in this case. + if resolved_archive_file is None and from_pt: + resolved_archive_file = cached_file( + pretrained_model_name_or_path, WEIGHTS_INDEX_NAME, **cached_file_kwargs + ) + if resolved_archive_file is not None: + is_sharded = True + + # If we still haven't found anything, look for `safetensors`. + if resolved_archive_file is None: + # No support for sharded safetensors yet, so we'll raise an error if that's all we find. + filename = SAFE_WEIGHTS_NAME + resolved_archive_file = cached_file( + pretrained_model_name_or_path, SAFE_WEIGHTS_NAME, **cached_file_kwargs + ) + + # Since we set _raise_exceptions_for_missing_entries=False, we don't get an exception but a None + # result when internet is up, the repo and revision exist, but the file does not. + if resolved_archive_file is None: + # Otherwise, maybe there is a TF or Torch model file. We try those to give a helpful error + # message. + has_file_kwargs = { + "revision": revision, + "proxies": proxies, + "token": token, + "cache_dir": cache_dir, + "local_files_only": local_files_only, + } + if has_file(pretrained_model_name_or_path, SAFE_WEIGHTS_INDEX_NAME, **has_file_kwargs): + is_sharded = True + raise NotImplementedError( + "Support for sharded checkpoints using safetensors is coming soon!" + ) + elif has_file(pretrained_model_name_or_path, WEIGHTS_NAME, **has_file_kwargs): + raise EnvironmentError( + f"{pretrained_model_name_or_path} does not appear to have a file named" + f" {FLAX_WEIGHTS_NAME} but there is a file for PyTorch weights. Use `from_pt=True` to" + " load this model from those weights." + ) + elif has_file(pretrained_model_name_or_path, WEIGHTS_INDEX_NAME, **has_file_kwargs): + raise EnvironmentError( + f"{pretrained_model_name_or_path} does not appear to have a file named" + f" {FLAX_WEIGHTS_INDEX_NAME} but there is a sharded file for PyTorch weights. Use" + " `from_pt=True` to load this model from those weights." + ) + else: + raise EnvironmentError( + f"{pretrained_model_name_or_path} does not appear to have a file named" + f" {FLAX_WEIGHTS_NAME} or {WEIGHTS_NAME}." + ) + except EnvironmentError: + # Raise any environment error raise by `cached_file`. It will have a helpful error message adapted + # to the original exception. + raise + except Exception: + # For any other exception, we throw a generic error. + raise EnvironmentError( + f"Can't load the model for '{pretrained_model_name_or_path}'. If you were trying to load it" + " from 'https://huggingface.co/models', make sure you don't have a local directory with the" + f" same name. Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a" + f" directory containing a file named {FLAX_WEIGHTS_NAME} or {WEIGHTS_NAME}." + ) + + if is_local: + logger.info(f"loading weights file {archive_file}") + resolved_archive_file = archive_file + filename = resolved_archive_file.split(os.path.sep)[-1] + else: + logger.info(f"loading weights file {filename} from cache at {resolved_archive_file}") + else: + resolved_archive_file = None + + # We'll need to download and cache each checkpoint shard if the checkpoint is sharded. + if is_sharded: + # resolved_archive_file becomes a list of files that point to the different checkpoint shards in this case. + resolved_archive_file, _ = get_checkpoint_shard_files( + pretrained_model_name_or_path, + resolved_archive_file, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + resume_download=resume_download, + local_files_only=local_files_only, + token=token, + user_agent=user_agent, + revision=revision, + subfolder=subfolder, + _commit_hash=commit_hash, + ) + + safetensors_from_pt = False + if filename == SAFE_WEIGHTS_NAME: + with safe_open(resolved_archive_file, framework="flax") as f: + safetensors_metadata = f.metadata() + if safetensors_metadata is None or safetensors_metadata.get("format") not in ["pt", "tf", "flax"]: + raise OSError( + f"The safetensors archive passed at {resolved_archive_file} does not contain the valid metadata." + " Make sure you save your model with the `save_pretrained` method." + ) + safetensors_from_pt = safetensors_metadata.get("format") == "pt" + + # init random models + model = cls(config, *model_args, _do_init=_do_init, **model_kwargs) + + if from_pt or safetensors_from_pt: + state = load_pytorch_checkpoint_in_flax_state_dict(model, resolved_archive_file, is_sharded) + else: + if is_sharded: + state = cls.load_flax_sharded_weights(resolved_archive_file) + else: + state = cls.load_flax_weights(resolved_archive_file) + # make sure all arrays are stored as jnp.arrays + # NOTE: This is to prevent a bug this will be fixed in Flax >= v0.3.4: + # https://github.com/google/flax/issues/1261 + if _do_init: + state = jax.tree_util.tree_map(jnp.array, state) + else: + # keep the params on CPU if we don't want to initialize + state = jax.tree_util.tree_map(lambda x: jax.device_put(x, jax.local_devices(backend="cpu")[0]), state) + + if "batch_stats" in state: # if flax model contains batch norm layers + # if model is base model only use model_prefix key + if ( + cls.base_model_prefix not in dict(model.params_shape_tree["params"]) + and cls.base_model_prefix in state["params"] + ): + state["params"] = state["params"][cls.base_model_prefix] + state["batch_stats"] = state["batch_stats"][cls.base_model_prefix] + + # if model is head model and we are loading weights from base model + # we initialize new params dict with base_model_prefix + if ( + cls.base_model_prefix in dict(model.params_shape_tree["params"]) + and cls.base_model_prefix not in state["params"] + ): + state = { + "params": {cls.base_model_prefix: state["params"]}, + "batch_stats": {cls.base_model_prefix: state["batch_stats"]}, + } + + else: + # if model is base model only use model_prefix key + if cls.base_model_prefix not in dict(model.params_shape_tree) and cls.base_model_prefix in state: + state = state[cls.base_model_prefix] + + # if model is head model and we are loading weights from base model + # we initialize new params dict with base_model_prefix + if cls.base_model_prefix in dict(model.params_shape_tree) and cls.base_model_prefix not in state: + state = {cls.base_model_prefix: state} + + # flatten dicts + state = flatten_dict(state) + + random_state = flatten_dict(unfreeze(model.params if _do_init else model.params_shape_tree)) + + missing_keys = model.required_params - set(state.keys()) + unexpected_keys = set(state.keys()) - model.required_params + + # Disabling warning when porting pytorch weights to flax, flax does not uses num_batches_tracked + for unexpected_key in unexpected_keys.copy(): + if "num_batches_tracked" in unexpected_key[-1]: + unexpected_keys.remove(unexpected_key) + + if missing_keys and not _do_init: + logger.warning( + f"The checkpoint {pretrained_model_name_or_path} is missing required keys: {missing_keys}. " + "Make sure to call model.init_weights to initialize the missing weights." + ) + cls._missing_keys = missing_keys + + # Mismatched keys contains tuples key/shape1/shape2 of weights in the checkpoint that have a shape not + # matching the weights in the model. + mismatched_keys = [] + for key in state.keys(): + if key in random_state and state[key].shape != random_state[key].shape: + if ignore_mismatched_sizes: + mismatched_keys.append((key, state[key].shape, random_state[key].shape)) + state[key] = random_state[key] + else: + raise ValueError( + f"Trying to load the pretrained weight for {key} failed: checkpoint has shape " + f"{state[key].shape} which is incompatible with the model shape {random_state[key].shape}. " + "Using `ignore_mismatched_sizes=True` if you really want to load this checkpoint inside this " + "model." + ) + + # add missing keys as random parameters if we are initializing + if missing_keys and _do_init: + for missing_key in missing_keys: + state[missing_key] = random_state[missing_key] + + # remove unexpected keys to not be saved again + for unexpected_key in unexpected_keys: + del state[unexpected_key] + + if len(unexpected_keys) > 0: + logger.warning( + f"Some weights of the model checkpoint at {pretrained_model_name_or_path} were not used when" + f" initializing {model.__class__.__name__}: {unexpected_keys}\n- This IS expected if you are" + f" initializing {model.__class__.__name__} from the checkpoint of a model trained on another task or" + " with another architecture (e.g. initializing a BertForSequenceClassification model from a" + " BertForPreTraining model).\n- This IS NOT expected if you are initializing" + f" {model.__class__.__name__} from the checkpoint of a model that you expect to be exactly identical" + " (initializing a BertForSequenceClassification model from a BertForSequenceClassification model)." + ) + else: + logger.info(f"All model checkpoint weights were used when initializing {model.__class__.__name__}.\n") + + if len(missing_keys) > 0: + logger.warning( + f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at" + f" {pretrained_model_name_or_path} and are newly initialized: {missing_keys}\nYou should probably" + " TRAIN this model on a down-stream task to be able to use it for predictions and inference." + ) + elif len(mismatched_keys) == 0: + logger.info( + f"All the weights of {model.__class__.__name__} were initialized from the model checkpoint at" + f" {pretrained_model_name_or_path}.\nIf your task is similar to the task the model of the checkpoint" + f" was trained on, you can already use {model.__class__.__name__} for predictions without further" + " training." + ) + if len(mismatched_keys) > 0: + mismatched_warning = "\n".join( + [ + f"- {key}: found shape {shape1} in the checkpoint and {shape2} in the model instantiated" + for key, shape1, shape2 in mismatched_keys + ] + ) + logger.warning( + f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at" + f" {pretrained_model_name_or_path} and are newly initialized because the shapes did not" + f" match:\n{mismatched_warning}\nYou should probably TRAIN this model on a down-stream task to be able" + " to use it for predictions and inference." + ) + + # dictionary of key: dtypes for the model params + param_dtypes = jax.tree_util.tree_map(lambda x: x.dtype, state) + # extract keys of parameters not in jnp.float32 + fp16_params = [k for k in param_dtypes if param_dtypes[k] == jnp.float16] + bf16_params = [k for k in param_dtypes if param_dtypes[k] == jnp.bfloat16] + + # raise a warning if any of the parameters are not in jnp.float32 + if len(fp16_params) > 0: + logger.warning( + f"Some of the weights of {model.__class__.__name__} were initialized in float16 precision from " + f"the model checkpoint at {pretrained_model_name_or_path}:\n{fp16_params}\n" + "You should probably UPCAST the model weights to float32 if this was not intended. " + "See [`~FlaxPreTrainedModel.to_fp32`] for further information on how to do this." + ) + + if len(bf16_params) > 0: + logger.warning( + f"Some of the weights of {model.__class__.__name__} were initialized in bfloat16 precision from " + f"the model checkpoint at {pretrained_model_name_or_path}:\n{bf16_params}\n" + "You should probably UPCAST the model weights to float32 if this was not intended. " + "See [`~FlaxPreTrainedModel.to_fp32`] for further information on how to do this." + ) + + # If it is a model with generation capabilities, attempt to load the generation config + if model.can_generate(): + try: + model.generation_config = GenerationConfig.from_pretrained( + pretrained_model_name_or_path, + cache_dir=cache_dir, + force_download=force_download, + resume_download=resume_download, + proxies=proxies, + local_files_only=local_files_only, + token=token, + revision=revision, + subfolder=subfolder, + _from_auto=from_auto_class, + _from_pipeline=from_pipeline, + **kwargs, + ) + except OSError: + logger.info( + "Generation config file not found, using a generation config created from the model config." + ) + pass + + if _do_init: + # set correct parameters + model.params = unflatten_dict(state) + return model + else: + return model, unflatten_dict(state) + + def save_pretrained( + self, + save_directory: Union[str, os.PathLike], + params=None, + push_to_hub=False, + max_shard_size="10GB", + token: Optional[Union[str, bool]] = None, + safe_serialization: bool = False, + **kwargs, + ): + """ + Save a model and its configuration file to a directory, so that it can be re-loaded using the + `[`~FlaxPreTrainedModel.from_pretrained`]` class method + + Arguments: + save_directory (`str` or `os.PathLike`): + Directory to which to save. Will be created if it doesn't exist. + push_to_hub (`bool`, *optional*, defaults to `False`): + Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the + repository you want to push to with `repo_id` (will default to the name of `save_directory` in your + namespace). + max_shard_size (`int` or `str`, *optional*, defaults to `"10GB"`): + The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size + lower than this size. If expressed as a string, needs to be digits followed by a unit (like `"5MB"`). + + + + If a single weight of the model is bigger than `max_shard_size`, it will be in its own checkpoint shard + which will be bigger than `max_shard_size`. + + + + token (`str` or `bool`, *optional*): + The token to use as HTTP bearer authorization for remote files. If `True`, or not specified, will use + the token generated when running `huggingface-cli login` (stored in `~/.huggingface`). + kwargs (`Dict[str, Any]`, *optional*): + Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method. + safe_serialization (`bool`, *optional*, defaults to `False`): + Whether to save the model using `safetensors` or through msgpack. + """ + use_auth_token = kwargs.pop("use_auth_token", None) + + if use_auth_token is not None: + warnings.warn( + "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.", + FutureWarning, + ) + if token is not None: + raise ValueError( + "`token` and `use_auth_token` are both specified. Please set only the argument `token`." + ) + token = use_auth_token + + if token is not None: + kwargs["token"] = token + + if os.path.isfile(save_directory): + logger.error(f"Provided path ({save_directory}) should be a directory, not a file") + return + + os.makedirs(save_directory, exist_ok=True) + + if push_to_hub: + commit_message = kwargs.pop("commit_message", None) + repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1]) + repo_id = self._create_repo(repo_id, **kwargs) + files_timestamps = self._get_files_timestamps(save_directory) + + # get abs dir + save_directory = os.path.abspath(save_directory) + # save config as well + self.config.architectures = [self.__class__.__name__[4:]] + + # If we have a custom model, we copy the file defining it in the folder and set the attributes so it can be + # loaded from the Hub. + if self._auto_class is not None: + custom_object_save(self, save_directory, config=self.config) + + self.config.save_pretrained(save_directory) + if self.can_generate(): + self.generation_config.save_pretrained(save_directory) + + # save model + weights_name = SAFE_WEIGHTS_NAME if safe_serialization else FLAX_WEIGHTS_NAME + output_model_file = os.path.join(save_directory, weights_name) + + shards, index = flax_shard_checkpoint(params if params is not None else self.params, max_shard_size) + # Clean the folder from a previous save + for filename in os.listdir(save_directory): + full_filename = os.path.join(save_directory, filename) + weights_no_suffix = weights_name.replace(".bin", "").replace(".safetensors", "") + if ( + filename.startswith(weights_no_suffix) + and os.path.isfile(full_filename) + and filename not in shards.keys() + ): + os.remove(full_filename) + + if index is None: + if safe_serialization: + params = params if params is not None else self.params + flat_dict = flatten_dict(params, sep=".") + safe_save_file(flat_dict, output_model_file, metadata={"format": "flax"}) + else: + with open(output_model_file, "wb") as f: + params = params if params is not None else self.params + model_bytes = to_bytes(params) + f.write(model_bytes) + + else: + save_index_file = os.path.join(save_directory, FLAX_WEIGHTS_INDEX_NAME) + # Save the index as well + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + logger.info( + f"The model is bigger than the maximum size per checkpoint ({max_shard_size}) and is going to be " + f"split in {len(shards)} checkpoint shards. You can find where each parameters has been saved in the " + f"index located at {save_index_file}." + ) + for shard_file, shard in shards.items(): + # the shard item are unflattened, to save them we need to flatten them again + with open(os.path.join(save_directory, shard_file), mode="wb") as f: + params = unflatten_dict(shard, sep="/") + shard_bytes = to_bytes(params) + f.write(shard_bytes) + + logger.info(f"Model weights saved in {output_model_file}") + + if push_to_hub: + self._upload_modified_files( + save_directory, + repo_id, + files_timestamps, + commit_message=commit_message, + token=token, + ) + + @classmethod + def register_for_auto_class(cls, auto_class="FlaxAutoModel"): + """ + Register this class with a given auto class. This should only be used for custom models as the ones in the + library are already mapped with an auto class. + + + + This API is experimental and may have some slight breaking changes in the next releases. + + + + Args: + auto_class (`str` or `type`, *optional*, defaults to `"FlaxAutoModel"`): + The auto class to register this new model with. + """ + if not isinstance(auto_class, str): + auto_class = auto_class.__name__ + + import transformers.models.auto as auto_module + + if not hasattr(auto_module, auto_class): + raise ValueError(f"{auto_class} is not a valid auto class.") + + cls._auto_class = auto_class + + +# To update the docstring, we need to copy the method, otherwise we change the original docstring. +FlaxPreTrainedModel.push_to_hub = copy_func(FlaxPreTrainedModel.push_to_hub) +if FlaxPreTrainedModel.push_to_hub.__doc__ is not None: + FlaxPreTrainedModel.push_to_hub.__doc__ = FlaxPreTrainedModel.push_to_hub.__doc__.format( + object="model", object_class="FlaxAutoModel", object_files="model checkpoint" + ) + + +def overwrite_call_docstring(model_class, docstring): + # copy __call__ function to be sure docstring is changed only for this function + model_class.__call__ = copy_func(model_class.__call__) + # delete existing docstring + model_class.__call__.__doc__ = None + # set correct docstring + model_class.__call__ = add_start_docstrings_to_model_forward(docstring)(model_class.__call__) + + +def append_call_sample_docstring( + model_class, checkpoint, output_type, config_class, mask=None, revision=None, real_checkpoint=None +): + model_class.__call__ = copy_func(model_class.__call__) + model_class.__call__ = add_code_sample_docstrings( + checkpoint=checkpoint, + output_type=output_type, + config_class=config_class, + model_cls=model_class.__name__, + revision=revision, + real_checkpoint=real_checkpoint, + )(model_class.__call__) + + +def append_replace_return_docstrings(model_class, output_type, config_class): + model_class.__call__ = copy_func(model_class.__call__) + model_class.__call__ = replace_return_docstrings( + output_type=output_type, + config_class=config_class, + )(model_class.__call__) diff --git a/docs/transformers/build/lib/transformers/modeling_gguf_pytorch_utils.py b/docs/transformers/build/lib/transformers/modeling_gguf_pytorch_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3ce50f8fec2c9c044f076684e2934a9829be352c --- /dev/null +++ b/docs/transformers/build/lib/transformers/modeling_gguf_pytorch_utils.py @@ -0,0 +1,496 @@ +# Copyright 2024 The ggml.ai team and The HuggingFace Inc. team. and pygguf author (github.com/99991) +# https://github.com/99991/pygguf +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import re +from typing import NamedTuple, Optional + +import numpy as np +from tqdm.auto import tqdm + +from .integrations import ( + GGUF_CONFIG_MAPPING, + GGUF_TOKENIZER_MAPPING, + _gguf_parse_value, +) +from .utils import is_torch_available +from .utils.import_utils import is_gguf_available +from .utils.logging import get_logger + + +if is_torch_available(): + import torch + +logger = get_logger(__name__) + + +GGUF_TO_TRANSFORMERS_MAPPING = { + "ignore": { + "GGUF": { + "version": "version", + "tensor_count": "tensor_count", + "kv_count": "kv_count", + }, + "general": {"file_type": "file_type", "quantization_version": "quantization_version"}, + }, + "config": GGUF_CONFIG_MAPPING, + "tokenizer": {"tokenizer": GGUF_TOKENIZER_MAPPING["tokenizer"]}, + "tokenizer_config": {"tokenizer": GGUF_TOKENIZER_MAPPING["tokenizer_config"]}, +} + +GGUF_SUPPORTED_ARCHITECTURES = list(GGUF_TO_TRANSFORMERS_MAPPING["config"].keys()) + + +class GGUFTensor(NamedTuple): + weights: np.ndarray + name: str + metadata: dict + + +class TensorProcessor: + def __init__(self, config=None): + self.config = config or {} + + def process(self, weights, name, **kwargs): + return GGUFTensor(weights, name, {}) + + +class LlamaTensorProcessor(TensorProcessor): + def __init__(self, config=None): + super().__init__(config=config) + + def process(self, weights, name, **kwargs): + if ".attn_k." in name or ".attn_q." in name: + num_heads = self.config.get("num_attention_heads") + num_kv_heads = self.config.get("num_key_value_heads") + + if None in (num_heads, num_kv_heads): + return GGUFTensor(weights, name, {}) + if ".attn_q." in name: + weights = self._reverse_permute_weights(weights, num_heads, num_heads) + elif ".attn_k." in name: + weights = self._reverse_permute_weights(weights, num_heads, num_kv_heads) + return GGUFTensor(weights, name, {}) + + def _reverse_permute_weights( + self, weights: np.ndarray, n_head: int, num_kv_heads: Optional[int] = None + ) -> np.ndarray: + # Original permutation implementation + # https://github.com/ggerganov/llama.cpp/blob/a38b884c6c4b0c256583acfaaabdf556c62fabea/convert_hf_to_gguf.py#L1402-L1408 + if num_kv_heads is not None and n_head != num_kv_heads: + n_head = num_kv_heads + + dim = weights.shape[0] // n_head // 2 + w = weights.reshape(n_head, dim, 2, *weights.shape[1:]) + return w.swapaxes(2, 1).reshape(weights.shape) + + +class Qwen2MoeTensorProcessor(TensorProcessor): + def __init__(self, config=None): + super().__init__(config=config) + + def process(self, weights, name, **kwargs): + if "_exp" in name: + tensor_key_mapping = kwargs.get("tensor_key_mapping") + parsed_parameters = kwargs.get("parsed_parameters") + if tensor_key_mapping: + self._split_moe_expert_tensor(weights, parsed_parameters, name, tensor_key_mapping) + return GGUFTensor(weights, None, {}) + if "ffn_gate_inp_shexp" in name: + # for compatibility tensor shared_expert_gate must be (1, 2048) dim, + # quantized one is (2048) + weights = np.expand_dims(weights, axis=0) + return GGUFTensor(weights, name, {}) + + def _split_moe_expert_tensor( + self, weights: np.ndarray, parsed_parameters: dict[str, dict], name: str, tensor_key_mapping: dict + ): + # Original merge implementation + # https://github.com/ggerganov/llama.cpp/blob/master/convert_hf_to_gguf.py#L1994-L2022 + name = tensor_key_mapping[name] + w_counter = self.config.get("num_experts", 60) + for i in range(0, w_counter): + temp_name = name.replace("mlp.experts.", f"mlp.experts.{i}.") + exp_weight = weights[i] + parsed_parameters["tensors"][temp_name] = torch.from_numpy(np.copy(exp_weight)) + + +class BloomTensorProcessor(TensorProcessor): + def __init__(self, config=None): + super().__init__(config=config) + + def process(self, weights, name, **kwargs): + if "attn_qkv" in name: + num_heads = self.config["n_head"] + n_embed = self.config["hidden_size"] + if "weight" in name: + weights = self._reverse_reshape_weights(weights, num_heads, n_embed) + else: + weights = self._reverse_reshape_bias(weights, num_heads, n_embed) + return GGUFTensor(weights, name, {}) + + def _reverse_reshape_weights(self, weights: np.ndarray, n_head: int, n_embed: int): + # Original reshape implementation + # https://github.com/ggerganov/llama.cpp/blob/master/convert_hf_to_gguf.py#L972-L985 + q, k, v = np.array_split(weights, 3, axis=0) + + q = q.reshape(n_head, n_embed // n_head, n_embed) + k = k.reshape(n_head, n_embed // n_head, n_embed) + v = v.reshape(n_head, n_embed // n_head, n_embed) + qkv_weights = np.stack([q, k, v], axis=1) + + return qkv_weights.reshape(n_head * 3 * (n_embed // n_head), n_embed) + + def _reverse_reshape_bias(self, weights: np.ndarray, n_head: int, n_embed: int): + # Original reshape implementation + # https://github.com/ggerganov/llama.cpp/blob/master/convert_hf_to_gguf.py#L986-L998 + q_bias, k_bias, v_bias = np.array_split(weights, 3) + + q_bias = q_bias.reshape(n_head, n_embed // n_head) + k_bias = k_bias.reshape(n_head, n_embed // n_head) + v_bias = v_bias.reshape(n_head, n_embed // n_head) + + qkv_bias = np.stack([q_bias, k_bias, v_bias], axis=1).flatten() + return qkv_bias + + +class T5TensorProcessor(TensorProcessor): + def __init__(self, config=None): + super().__init__(config=config) + + def process(self, weights, name, **kwargs): + bid = None + for chunk in name.split("."): + if chunk.isdigit(): + bid = int(chunk) + break + return GGUFTensor(weights, name, {"bid": bid}) + + +class GPT2TensorProcessor(TensorProcessor): + def __init__(self, config=None): + super().__init__(config=config) + + def process(self, weights, name, **kwargs): + # Original transpose implementation + # https://github.com/ggerganov/llama.cpp/blob/a38b884c6c4b0c256583acfaaabdf556c62fabea/convert_hf_to_gguf.py#L2060-L2061 + if ( + "attn_qkv.weight" in name + or "ffn_down.weight" in name + or "ffn_up.weight" in name + or "attn_output.weight" in name + ): + weights = weights.T + + # Handle special case for output.weight + if name == "output.weight": + # output.weight has conflicts with attn_output.weight in name checking + # Store the tensor directly and signal to skip further processing + name = "lm_head.weight" + parsed_parameters = kwargs.get("parsed_parameters", {}) + parsed_parameters["tensors"][name] = torch.from_numpy(np.copy(weights)) + name = None # Signal to skip further processing + return GGUFTensor(weights, name, {}) + + +class MambaTensorProcessor(TensorProcessor): + def __init__(self, config=None): + super().__init__(config=config) + + def process(self, weights, name, **kwargs): + if "ssm_conv1d.weight" in name: + # for compatibility tensor ssm_conv1d must be (5120, 1, 4]) dim, + # quantized one is (5120, 4) + weights = np.expand_dims(weights, axis=1) + if "ssm_a" in name: + # Original exponential implementation + # https://github.com/ggerganov/llama.cpp/blob/master/convert_hf_to_gguf.py#L2975-L2977 + weights = np.log(-weights) + return GGUFTensor(weights, name, {}) + + +class NemotronTensorProcessor(TensorProcessor): + def __init__(self, config=None): + super().__init__(config=config) + + # ref : https://github.com/ggerganov/llama.cpp/blob/master/convert_hf_to_gguf.py#L4666 + def process(self, weights, name, **kwargs): + if "norm.weight" in name: + weights = weights - 1 + return GGUFTensor(weights, name, {}) + + +class Gemma2TensorProcessor(TensorProcessor): + def __init__(self, config=None): + super().__init__(config=config) + + # ref: https://github.com/ggerganov/llama.cpp/blob/d79d8f39b4da6deca4aea8bf130c6034c482b320/convert_hf_to_gguf.py#L3191 + # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89 + def process(self, weights, name, **kwargs): + if "norm.weight" in name: + weights = weights - 1 + return GGUFTensor(weights, name, {}) + + +TENSOR_PROCESSORS = { + "llama": LlamaTensorProcessor, + "qwen2moe": Qwen2MoeTensorProcessor, + "bloom": BloomTensorProcessor, + "t5": T5TensorProcessor, + "t5encoder": T5TensorProcessor, + "gpt2": GPT2TensorProcessor, + "mamba": MambaTensorProcessor, + "nemotron": NemotronTensorProcessor, + "gemma2": Gemma2TensorProcessor, + "gemma3": Gemma2TensorProcessor, +} + + +def read_field(reader, field): + if field not in reader.fields: + return [] + value = reader.fields[field] + return [_gguf_parse_value(value.parts[_data_index], value.types) for _data_index in value.data] + + +# modified from https://github.com/vllm-project/vllm/blob/v0.6.4.post1/vllm/model_executor/model_loader/loader.py#L1115-L1147 +def get_gguf_hf_weights_map( + hf_model, + model_type: Optional[str] = None, + num_layers: Optional[int] = None, + qual_name: str = "", +): + """ + GGUF uses this naming convention for their tensors from HF checkpoint: + `blk.N.BB.weight` and `blk.N.BB.bias` + where N signifies the block number of a layer, and BB signifies the + attention/mlp layer components. + See "Standardized tensor names" in + https://github.com/ggerganov/ggml/blob/master/docs/gguf.md for details. + """ + if is_gguf_available() and is_torch_available(): + from gguf import MODEL_ARCH_NAMES, get_tensor_name_map + else: + logger.error( + "Loading a GGUF checkpoint in PyTorch, requires both PyTorch and GGUF>=0.10.0 to be installed. Please see " + "https://pytorch.org/ and https://github.com/ggerganov/llama.cpp/tree/master/gguf-py for installation instructions." + ) + raise ImportError("Please install torch and gguf>=0.10.0 to load a GGUF checkpoint in PyTorch.") + + model_type = hf_model.config.model_type if model_type is None else model_type + num_layers = hf_model.config.num_hidden_layers if num_layers is None else num_layers + # hack: ggufs have a different name for cohere + if model_type == "cohere": + model_type = "command-r" + elif model_type == "qwen2_moe": + model_type = "qwen2moe" + elif model_type == "gemma3_text": + model_type = "gemma3" + arch = None + for key, value in MODEL_ARCH_NAMES.items(): + if value == model_type: + arch = key + break + if arch is None: + raise NotImplementedError( + f"Unknown gguf model_type: {model_type} in gguf-py. " + "This might because you're using an outdated version of gguf-py package, " + "you can install `gguf` package from source refer to " + "https://github.com/ggerganov/llama.cpp/tree/master/gguf-py#development" + ) + name_map = get_tensor_name_map(arch, num_layers) + + # Use a dummy conversion to get the mapping, because + # hf => gguf and gguf => hf mappings are reversed + gguf_to_hf_name_map = {} + state_dict = hf_model.state_dict() + for hf_name in state_dict.keys(): + # An exception for qwen2moe model, where the expert layers are packed + if model_type == "qwen2moe" and "mlp.experts." in hf_name: + hf_name = re.sub(r"mlp.experts.\d+.", "mlp.experts.", hf_name) + + name, suffix = hf_name, "" + if hf_name.endswith(".weight") or hf_name.endswith(".bias"): + name, suffix = hf_name.rsplit(".", 1) + suffix = "." + suffix + + gguf_name = name_map.get_name(name) + if gguf_name is None: + continue + + gguf_to_hf_name_map[gguf_name + suffix] = qual_name + hf_name + + # Some model like Bloom converted from BloomModel instead of BloomForCausalLM + # Therefore, we need to check submodule as well to get a correct mapping + if named_children := hf_model.named_children(): + for name, child in named_children: + sub_map = get_gguf_hf_weights_map(child, model_type, num_layers, qual_name=f"{qual_name}{name}.") + # Ignore the keys that are already in the main map to avoid overwriting + sub_map = {k: v for k, v in sub_map.items() if k not in gguf_to_hf_name_map} + gguf_to_hf_name_map.update(sub_map) + + return gguf_to_hf_name_map + + +def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False, model_to_load=None): + """ + Load a GGUF file and return a dictionary of parsed parameters containing tensors, the parsed + tokenizer and config attributes. + + Args: + gguf_checkpoint_path (`str`): + The path the to GGUF file to load + return_tensors (`bool`, defaults to `False`): + Whether to read the tensors from the file and return them. Not doing so is faster + and only loads the metadata in memory. + """ + if is_gguf_available() and is_torch_available(): + from gguf import GGUFReader, dequantize + else: + logger.error( + "Loading a GGUF checkpoint in PyTorch, requires both PyTorch and GGUF>=0.10.0 to be installed. Please see " + "https://pytorch.org/ and https://github.com/ggerganov/llama.cpp/tree/master/gguf-py for installation instructions." + ) + raise ImportError("Please install torch and gguf>=0.10.0 to load a GGUF checkpoint in PyTorch.") + + reader = GGUFReader(gguf_checkpoint_path) + fields = reader.fields + reader_keys = list(fields.keys()) + + parsed_parameters = {k: {} for k in GGUF_TO_TRANSFORMERS_MAPPING} + + architecture = read_field(reader, "general.architecture")[0] + # NOTE: Some GGUF checkpoints may miss `general.name` field in metadata + model_name = read_field(reader, "general.name") + + updated_architecture = None + # in llama.cpp mistral models use the same architecture as llama. We need + # to add this patch to ensure things work correctly on our side. + if "llama" in architecture and "mistral" in model_name: + updated_architecture = "mistral" + # FIXME: Currently this implementation is only for flan-t5 architecture. + # It needs to be developed for supporting legacy t5. + elif "t5" in architecture or "t5encoder" in architecture: + parsed_parameters["config"]["is_gated_act"] = True + if "t5encoder" in architecture: + parsed_parameters["config"]["architectures"] = ["T5EncoderModel"] + updated_architecture = "t5" + else: + updated_architecture = architecture + + if "qwen2moe" in architecture: + updated_architecture = "qwen2_moe" + + # For stablelm architecture, we need to set qkv_bias and use_parallel_residual from tensors + # If `qkv_bias=True`, qkv_proj with bias will be present in the tensors + # If `use_parallel_residual=False`, ffn_norm will be present in the tensors + if "stablelm" in architecture: + attn_bias_name = {"attn_q.bias", "attn_k.bias", "attn_v.bias"} + ffn_norm_name = "ffn_norm" + qkv_bias = any(bias_name in tensor.name for tensor in reader.tensors for bias_name in attn_bias_name) + use_parallel_residual = any(ffn_norm_name in tensor.name for tensor in reader.tensors) + parsed_parameters["config"]["use_qkv_bias"] = qkv_bias + parsed_parameters["config"]["use_parallel_residual"] = not use_parallel_residual + + if architecture not in GGUF_SUPPORTED_ARCHITECTURES and updated_architecture not in GGUF_SUPPORTED_ARCHITECTURES: + raise ValueError(f"GGUF model with architecture {architecture} is not supported yet.") + + # Handle tie_word_embeddings, if lm_head.weight is not present in tensors, + # tie_word_embeddings is true otherwise false + exceptions = ["falcon", "bloom"] + parsed_parameters["config"]["tie_word_embeddings"] = ( + all("output.weight" != tensor.name for tensor in reader.tensors) or architecture in exceptions + ) + + # List all key-value pairs in a columnized format + for gguf_key, field in reader.fields.items(): + gguf_key = gguf_key.replace(architecture, updated_architecture) + split = gguf_key.split(".") + prefix = split[0] + config_key = ".".join(split[1:]) + + value = [_gguf_parse_value(field.parts[_data_index], field.types) for _data_index in field.data] + + if len(value) == 1: + value = value[0] + + if isinstance(value, str) and architecture in value: + value = value.replace(architecture, updated_architecture) + + for parameter in GGUF_TO_TRANSFORMERS_MAPPING: + parameter_renames = GGUF_TO_TRANSFORMERS_MAPPING[parameter] + if prefix in parameter_renames and config_key in parameter_renames[prefix]: + renamed_config_key = parameter_renames[prefix][config_key] + if renamed_config_key == -1: + continue + + if renamed_config_key is not None: + parsed_parameters[parameter][renamed_config_key] = value + + if gguf_key in reader_keys: + reader_keys.remove(gguf_key) + + if gguf_key in reader_keys: + logger.info(f"Some keys were not parsed and added into account {gguf_key} | {value}") + + # Gemma3 GGUF checkpoint only contains weights of text backbone + if parsed_parameters["config"]["model_type"] == "gemma3": + parsed_parameters["config"]["model_type"] = "gemma3_text" + + # retrieve config vocab_size from tokenizer + # Please refer to https://github.com/huggingface/transformers/issues/32526 for more details + if "vocab_size" not in parsed_parameters["config"]: + tokenizer_parameters = parsed_parameters["tokenizer"] + if "tokens" in tokenizer_parameters: + parsed_parameters["config"]["vocab_size"] = len(tokenizer_parameters["tokens"]) + else: + logger.warning( + "Can't find a way to retrieve missing config vocab_size from tokenizer parameters. " + "This will use default value from model config class and cause unexpected behavior." + ) + + if return_tensors: + parsed_parameters["tensors"] = {} + + tensor_key_mapping = get_gguf_hf_weights_map(model_to_load) + config = parsed_parameters.get("config", {}) + + ProcessorClass = TENSOR_PROCESSORS.get(architecture, TensorProcessor) + processor = ProcessorClass(config=config) + + for tensor in tqdm(reader.tensors, desc="Converting and de-quantizing GGUF tensors..."): + name = tensor.name + weights = dequantize(tensor.data, tensor.tensor_type) + + result = processor.process( + weights=weights, + name=name, + tensor_key_mapping=tensor_key_mapping, + parsed_parameters=parsed_parameters, + ) + + weights = result.weights + name = result.name + + if name not in tensor_key_mapping: + continue + + name = tensor_key_mapping[name] + + parsed_parameters["tensors"][name] = torch.from_numpy(np.copy(weights)) + + if len(reader_keys) > 0: + logger.info(f"Some keys of the GGUF file were not considered: {reader_keys}") + + return parsed_parameters diff --git a/docs/transformers/build/lib/transformers/modeling_layers.py b/docs/transformers/build/lib/transformers/modeling_layers.py new file mode 100644 index 0000000000000000000000000000000000000000..57be2d8e0d7dc7af50f847f699bf80b707bbb98e --- /dev/null +++ b/docs/transformers/build/lib/transformers/modeling_layers.py @@ -0,0 +1,48 @@ +# Copyright 2025 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from functools import partial + +import torch.nn as nn + + +class GradientCheckpointingLayer(nn.Module): + """Base class for layers with gradient checkpointing. + + This class enables gradient checkpointing functionality for a layer. By default, gradient checkpointing is disabled + (`gradient_checkpointing = False`). When `model.set_gradient_checkpointing()` is called, gradient checkpointing is + enabled by setting `gradient_checkpointing = True` and assigning a checkpointing function to `_gradient_checkpointing_func`. + + Important: + + When using gradient checkpointing with `use_reentrant=True`, inputs that require gradients (e.g. hidden states) + must be passed as positional arguments (`*args`) rather than keyword arguments to properly propagate gradients. + + Example: + + ```python + >>> # Correct - hidden_states passed as positional arg + >>> out = self.layer(hidden_states, attention_mask=attention_mask) + + >>> # Incorrect - hidden_states passed as keyword arg + >>> out = self.layer(hidden_states=hidden_states, attention_mask=attention_mask) + ``` + """ + + gradient_checkpointing = False + + def __call__(self, *args, **kwargs): + if self.gradient_checkpointing and self.training: + return self._gradient_checkpointing_func(partial(super().__call__, **kwargs), *args) + return super().__call__(*args, **kwargs) diff --git a/docs/transformers/build/lib/transformers/modeling_outputs.py b/docs/transformers/build/lib/transformers/modeling_outputs.py new file mode 100644 index 0000000000000000000000000000000000000000..60a3642f87c6b64771584b361ff6aef3ac06cb87 --- /dev/null +++ b/docs/transformers/build/lib/transformers/modeling_outputs.py @@ -0,0 +1,1753 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings +from dataclasses import dataclass +from typing import Optional, Tuple + +import torch + +from .utils import ModelOutput + + +@dataclass +class BaseModelOutput(ModelOutput): + """ + Base class for model's outputs, with potential hidden states and attentions. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + last_hidden_state: Optional[torch.FloatTensor] = None + hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + + +@dataclass +class BaseModelOutputWithNoAttention(ModelOutput): + """ + Base class for model's outputs, with potential hidden states. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, num_channels, height, width)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + """ + + last_hidden_state: Optional[torch.FloatTensor] = None + hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + + +@dataclass +class BaseModelOutputWithPooling(ModelOutput): + """ + Base class for model's outputs that also contains a pooling of the last hidden states. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`): + Last layer hidden-state of the first token of the sequence (classification token) after further processing + through the layers used for the auxiliary pretraining task. E.g. for BERT-family of models, this returns + the classification token after processing through a linear layer and a tanh activation function. The linear + layer weights are trained from the next sentence prediction (classification) objective during pretraining. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + last_hidden_state: Optional[torch.FloatTensor] = None + pooler_output: Optional[torch.FloatTensor] = None + hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + + +@dataclass +class BaseModelOutputWithPoolingAndNoAttention(ModelOutput): + """ + Base class for model's outputs that also contains a pooling of the last hidden states. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Sequence of hidden-states at the output of the last layer of the model. + pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`): + Last layer hidden-state after a pooling operation on the spatial dimensions. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, num_channels, height, width)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + """ + + last_hidden_state: Optional[torch.FloatTensor] = None + pooler_output: Optional[torch.FloatTensor] = None + hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + + +@dataclass +class BaseModelOutputWithPast(ModelOutput): + """ + Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding). + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + + If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1, + hidden_size)` is output. + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if + `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads, + encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if + `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values` + input) to speed up sequential decoding. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + last_hidden_state: Optional[torch.FloatTensor] = None + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + + +@dataclass +class BaseModelOutputWithCrossAttentions(ModelOutput): + """ + Base class for model's outputs, with potential hidden states and attentions. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + """ + + last_hidden_state: Optional[torch.FloatTensor] = None + hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + cross_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + + +@dataclass +class BaseModelOutputWithPoolingAndCrossAttentions(ModelOutput): + """ + Base class for model's outputs that also contains a pooling of the last hidden states. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`): + Last layer hidden-state of the first token of the sequence (classification token) after further processing + through the layers used for the auxiliary pretraining task. E.g. for BERT-family of models, this returns + the classification token after processing through a linear layer and a tanh activation function. The linear + layer weights are trained from the next sentence prediction (classification) objective during pretraining. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if + `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads, + encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if + `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values` + input) to speed up sequential decoding. + """ + + last_hidden_state: Optional[torch.FloatTensor] = None + pooler_output: Optional[torch.FloatTensor] = None + hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + cross_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + + +@dataclass +class BaseModelOutputWithPastAndCrossAttentions(ModelOutput): + """ + Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding). + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + + If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1, + hidden_size)` is output. + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if + `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads, + encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if + `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values` + input) to speed up sequential decoding. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + """ + + last_hidden_state: Optional[torch.FloatTensor] = None + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + cross_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + + +@dataclass +class MoECausalLMOutputWithPast(ModelOutput): + """ + Base class for causal language model (or autoregressive) outputs as well as Mixture of Expert's router hidden + states terms, to train a MoE model. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss (for next-token prediction). + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + z_loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided): + z_loss for the sparse modules. + aux_loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided): + aux_loss for the sparse modules. + router_logits (`tuple(torch.FloatTensor)`, *optional*, returned when `output_router_logits=True` is passed or when `config.add_router_probs=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_experts)`. + + Router logits of the encoder model, useful to compute the auxiliary loss and the z_loss for the sparse + modules. + """ + + loss: Optional[torch.FloatTensor] = None + logits: Optional[torch.FloatTensor] = None + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + z_loss: Optional[torch.FloatTensor] = None + aux_loss: Optional[torch.FloatTensor] = None + router_logits: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +class MoEModelOutput(ModelOutput): + """ + Base class for model's outputs, with potential hidden states and attentions. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + router_probs (`tuple(torch.FloatTensor)`, *optional*, returned when `output_router_probs=True` and `config.add_router_probs=True` is passed or when `config.output_router_probs=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_experts)`. + + Raw router probabilities that are computed by MoE routers, these terms are used to compute the auxiliary + loss and the z_loss for Mixture of Experts models. + """ + + last_hidden_state: Optional[torch.FloatTensor] = None + hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + router_probs: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +class MoeModelOutputWithPast(ModelOutput): + """ + Base class for model's outputs, with potential hidden states and attentions. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if + `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads, + encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if + `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values` + input) to speed up sequential decoding. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + router_logits (`tuple(torch.FloatTensor)`, *optional*, returned when `output_router_probs=True` and `config.add_router_probs=True` is passed or when `config.output_router_probs=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_experts)`. + + Raw router logtis (post-softmax) that are computed by MoE routers, these terms are used to compute the auxiliary + loss for Mixture of Experts models. + """ + + last_hidden_state: Optional[torch.FloatTensor] = None + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + router_logits: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +class MoeCausalLMOutputWithPast(ModelOutput): + """ + Base class for causal language model (or autoregressive) with mixture of experts outputs. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss (for next-token prediction). + + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + + aux_loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided): + aux_loss for the sparse modules. + + router_logits (`tuple(torch.FloatTensor)`, *optional*, returned when `output_router_probs=True` and `config.add_router_probs=True` is passed or when `config.output_router_probs=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_experts)`. + + Raw router logtis (post-softmax) that are computed by MoE routers, these terms are used to compute the auxiliary + loss for Mixture of Experts models. + + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[torch.FloatTensor] = None + aux_loss: Optional[torch.FloatTensor] = None + logits: Optional[torch.FloatTensor] = None + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + router_logits: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +class MoEModelOutputWithPastAndCrossAttentions(ModelOutput): + """ + Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding) as well as + Mixture of Expert's router hidden states terms, to train a MoE model. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + + If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1, + hidden_size)` is output. + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if + `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads, + encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if + `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values` + input) to speed up sequential decoding. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + router_probs (`tuple(torch.FloatTensor)`, *optional*, returned when `output_router_probs=True` and `config.add_router_probs=True` is passed or when `config.output_router_probs=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_experts)`. + + Raw router probabilities that are computed by MoE routers, these terms are used to compute the auxiliary + loss and the z_loss for Mixture of Experts models. + """ + + last_hidden_state: Optional[torch.FloatTensor] = None + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + cross_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + router_probs: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +class Seq2SeqModelOutput(ModelOutput): + """ + Base class for model encoder's outputs that also contains : pre-computed hidden states that can speed up sequential + decoding. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the decoder of the model. + + If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1, + hidden_size)` is output. + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the decoder at the output of each layer plus the optional initial embedding outputs. + decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the encoder at the output of each layer plus the optional initial embedding outputs. + encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + """ + + last_hidden_state: Optional[torch.FloatTensor] = None + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + decoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + decoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + cross_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + encoder_last_hidden_state: Optional[torch.FloatTensor] = None + encoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + encoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + + +@dataclass +class Seq2SeqMoEModelOutput(ModelOutput): + """ + Base class for model encoder's outputs that also contains : pre-computed hidden states that can speed up sequential + decoding. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the decoder of the model. + + If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1, + hidden_size)` is output. + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the decoder at the output of each layer plus the optional initial embedding outputs. + decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + decoder_router_logits (`tuple(torch.FloatTensor)`, *optional*, returned when `output_router_logits=True` is passed or when `config.add_router_probs=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_experts)`. + + Router logits of the decoder model, useful to compute the auxiliary loss for Mixture of Experts models. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the encoder at the output of each layer plus the optional initial embedding outputs. + encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + encoder_router_logits (`tuple(torch.FloatTensor)`, *optional*, returned when `output_router_logits=True` is passed or when `config.add_router_probs=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_experts)`. + + Router logits of the encoder model, useful to compute the auxiliary loss and the z_loss for the sparse + modules. + """ + + last_hidden_state: Optional[torch.FloatTensor] = None + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + decoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + decoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + decoder_router_logits: Optional[Tuple[torch.FloatTensor]] = None + cross_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + encoder_last_hidden_state: Optional[torch.FloatTensor] = None + encoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + encoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + encoder_router_logits: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +class CausalLMOutput(ModelOutput): + """ + Base class for causal language model (or autoregressive) outputs. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss (for next-token prediction). + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[torch.FloatTensor] = None + logits: Optional[torch.FloatTensor] = None + hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + + +@dataclass +class CausalLMOutputWithPast(ModelOutput): + """ + Base class for causal language model (or autoregressive) outputs. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss (for next-token prediction). + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[torch.FloatTensor] = None + logits: Optional[torch.FloatTensor] = None + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + + +@dataclass +class CausalLMOutputWithCrossAttentions(ModelOutput): + """ + Base class for causal language model (or autoregressive) outputs. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss (for next-token prediction). + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Cross attentions weights after the attention softmax, used to compute the weighted average in the + cross-attention heads. + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `torch.FloatTensor` tuples of length `config.n_layers`, with each tuple containing the cached key, + value states of the self-attention and the cross-attention layers if model is used in encoder-decoder + setting. Only relevant if `config.is_decoder = True`. + + Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + """ + + loss: Optional[torch.FloatTensor] = None + logits: Optional[torch.FloatTensor] = None + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + cross_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + + +@dataclass +class SequenceClassifierOutputWithPast(ModelOutput): + """ + Base class for outputs of sentence classification models. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Classification (or regression if config.num_labels==1) loss. + logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`): + Classification (or regression if config.num_labels==1) scores (before SoftMax). + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[torch.FloatTensor] = None + logits: Optional[torch.FloatTensor] = None + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + + +@dataclass +class MaskedLMOutput(ModelOutput): + """ + Base class for masked language models outputs. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Masked language modeling (MLM) loss. + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[torch.FloatTensor] = None + logits: Optional[torch.FloatTensor] = None + hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + + +@dataclass +class Seq2SeqLMOutput(ModelOutput): + """ + Base class for sequence-to-sequence language models outputs. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss. + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the decoder at the output of each layer plus the initial embedding outputs. + decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the encoder at the output of each layer plus the initial embedding outputs. + encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + """ + + loss: Optional[torch.FloatTensor] = None + logits: Optional[torch.FloatTensor] = None + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + decoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + decoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + cross_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + encoder_last_hidden_state: Optional[torch.FloatTensor] = None + encoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + encoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + + +@dataclass +class Seq2SeqMoEOutput(ModelOutput): + """ + Base class for sequence-to-sequence language models outputs. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss. + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the decoder at the output of each layer plus the initial embedding outputs. + decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + decoder_router_logits (`tuple(torch.FloatTensor)`, *optional*, returned when `output_router_logits=True` is passed or when `config.add_router_probs=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_experts)`. + + Router logits of the decoder model, useful to compute the auxiliary loss for Mixture of Experts models. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the encoder at the output of each layer plus the initial embedding outputs. + encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + encoder_router_logits (`tuple(torch.FloatTensor)`, *optional*, returned when `output_router_logits=True` is passed or when `config.add_router_probs=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_experts)`. + + Router logits of the encoder model, useful to compute the auxiliary loss and z_loss for Mixture of Experts + models. + """ + + loss: Optional[torch.FloatTensor] = None + logits: Optional[torch.FloatTensor] = None + encoder_z_loss: Optional[torch.FloatTensor] = None + decoder_z_loss: Optional[torch.FloatTensor] = None + encoder_aux_loss: Optional[torch.FloatTensor] = None + decoder_aux_loss: Optional[torch.FloatTensor] = None + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + decoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + decoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + decoder_router_logits: Optional[Tuple[torch.FloatTensor]] = None + cross_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + encoder_last_hidden_state: Optional[torch.FloatTensor] = None + encoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + encoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + encoder_router_logits: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +class NextSentencePredictorOutput(ModelOutput): + """ + Base class for outputs of models predicting if two sentences are consecutive or not. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `next_sentence_label` is provided): + Next sequence prediction (classification) loss. + logits (`torch.FloatTensor` of shape `(batch_size, 2)`): + Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation + before SoftMax). + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[torch.FloatTensor] = None + logits: Optional[torch.FloatTensor] = None + hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + + +@dataclass +class SequenceClassifierOutput(ModelOutput): + """ + Base class for outputs of sentence classification models. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Classification (or regression if config.num_labels==1) loss. + logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`): + Classification (or regression if config.num_labels==1) scores (before SoftMax). + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[torch.FloatTensor] = None + logits: Optional[torch.FloatTensor] = None + hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + + +@dataclass +class Seq2SeqSequenceClassifierOutput(ModelOutput): + """ + Base class for outputs of sequence-to-sequence sentence classification models. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `label` is provided): + Classification (or regression if config.num_labels==1) loss. + logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`): + Classification (or regression if config.num_labels==1) scores (before SoftMax). + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the decoder at the output of each layer plus the initial embedding outputs. + decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the encoder at the output of each layer plus the initial embedding outputs. + encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + """ + + loss: Optional[torch.FloatTensor] = None + logits: Optional[torch.FloatTensor] = None + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + decoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + decoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + cross_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + encoder_last_hidden_state: Optional[torch.FloatTensor] = None + encoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + encoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + + +@dataclass +class MultipleChoiceModelOutput(ModelOutput): + """ + Base class for outputs of multiple choice models. + + Args: + loss (`torch.FloatTensor` of shape *(1,)*, *optional*, returned when `labels` is provided): + Classification loss. + logits (`torch.FloatTensor` of shape `(batch_size, num_choices)`): + *num_choices* is the second dimension of the input tensors. (see *input_ids* above). + + Classification scores (before SoftMax). + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[torch.FloatTensor] = None + logits: Optional[torch.FloatTensor] = None + hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + + +@dataclass +class TokenClassifierOutput(ModelOutput): + """ + Base class for outputs of token classification models. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided) : + Classification loss. + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`): + Classification scores (before SoftMax). + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[torch.FloatTensor] = None + logits: Optional[torch.FloatTensor] = None + hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + + +@dataclass +class QuestionAnsweringModelOutput(ModelOutput): + """ + Base class for outputs of question answering models. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. + start_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): + Span-start scores (before SoftMax). + end_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): + Span-end scores (before SoftMax). + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[torch.FloatTensor] = None + start_logits: Optional[torch.FloatTensor] = None + end_logits: Optional[torch.FloatTensor] = None + hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + + +@dataclass +class Seq2SeqQuestionAnsweringModelOutput(ModelOutput): + """ + Base class for outputs of sequence-to-sequence question answering models. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. + start_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): + Span-start scores (before SoftMax). + end_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): + Span-end scores (before SoftMax). + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the decoder at the output of each layer plus the initial embedding outputs. + decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the encoder at the output of each layer plus the initial embedding outputs. + encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + """ + + loss: Optional[torch.FloatTensor] = None + start_logits: Optional[torch.FloatTensor] = None + end_logits: Optional[torch.FloatTensor] = None + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + decoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + decoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + cross_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + encoder_last_hidden_state: Optional[torch.FloatTensor] = None + encoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + encoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + + +@dataclass +class SemanticSegmenterOutput(ModelOutput): + """ + Base class for outputs of semantic segmentation models. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Classification (or regression if config.num_labels==1) loss. + logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels, logits_height, logits_width)`): + Classification scores for each pixel. + + + + The logits returned do not necessarily have the same size as the `pixel_values` passed as inputs. This is + to avoid doing two interpolations and lose some quality when a user needs to resize the logits to the + original image size as post-processing. You should always check your logits shape and resize as needed. + + + + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, patch_size, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, patch_size, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[torch.FloatTensor] = None + logits: Optional[torch.FloatTensor] = None + hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + + +@dataclass +class ImageClassifierOutput(ModelOutput): + """ + Base class for outputs of image classification models. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Classification (or regression if config.num_labels==1) loss. + logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`): + Classification (or regression if config.num_labels==1) scores (before SoftMax). + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each stage) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states + (also called feature maps) of the model at the output of each stage. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, patch_size, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[torch.FloatTensor] = None + logits: Optional[torch.FloatTensor] = None + hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + + +@dataclass +class ImageClassifierOutputWithNoAttention(ModelOutput): + """ + Base class for outputs of image classification models. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Classification (or regression if config.num_labels==1) loss. + logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`): + Classification (or regression if config.num_labels==1) scores (before SoftMax). + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each stage) of shape `(batch_size, num_channels, height, width)`. Hidden-states (also + called feature maps) of the model at the output of each stage. + """ + + loss: Optional[torch.FloatTensor] = None + logits: Optional[torch.FloatTensor] = None + hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + + +@dataclass +class DepthEstimatorOutput(ModelOutput): + """ + Base class for outputs of depth estimation models. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Classification (or regression if config.num_labels==1) loss. + predicted_depth (`torch.FloatTensor` of shape `(batch_size, height, width)`): + Predicted depth for each pixel. + + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, num_channels, height, width)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, patch_size, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[torch.FloatTensor] = None + predicted_depth: Optional[torch.FloatTensor] = None + hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + + +@dataclass +class ImageSuperResolutionOutput(ModelOutput): + """ + Base class for outputs of image super resolution models. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Reconstruction loss. + reconstruction (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Reconstructed images, possibly upscaled. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each stage) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states + (also called feature maps) of the model at the output of each stage. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, patch_size, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[torch.FloatTensor] = None + reconstruction: Optional[torch.FloatTensor] = None + hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + + +@dataclass +class Wav2Vec2BaseModelOutput(ModelOutput): + """ + Base class for models that have been trained with the Wav2Vec2 loss objective. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + extract_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, conv_dim[-1])`): + Sequence of extracted feature vectors of the last convolutional layer of the model. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + last_hidden_state: Optional[torch.FloatTensor] = None + extract_features: Optional[torch.FloatTensor] = None + hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + + +@dataclass +class XVectorOutput(ModelOutput): + """ + Output type of [`Wav2Vec2ForXVector`]. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Classification loss. + logits (`torch.FloatTensor` of shape `(batch_size, config.xvector_output_dim)`): + Classification hidden states before AMSoftmax. + embeddings (`torch.FloatTensor` of shape `(batch_size, config.xvector_output_dim)`): + Utterance embeddings used for vector similarity-based retrieval. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[torch.FloatTensor] = None + logits: Optional[torch.FloatTensor] = None + embeddings: Optional[torch.FloatTensor] = None + hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + + +@dataclass +class BackboneOutput(ModelOutput): + """ + Base class for outputs of backbones. + + Args: + feature_maps (`tuple(torch.FloatTensor)` of shape `(batch_size, num_channels, height, width)`): + Feature maps of the stages. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)` or `(batch_size, num_channels, height, width)`, + depending on the backbone. + + Hidden-states of the model at the output of each stage plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Only applicable if the backbone uses attention. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + feature_maps: Optional[Tuple[torch.FloatTensor]] = None + hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + + +@dataclass +class BaseModelOutputWithPoolingAndProjection(ModelOutput): + """ + Base class for model's outputs that also contains a pooling of the last hidden states. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`): + Last layer hidden-state of the first token of the sequence (classification token) after further processing + through the layers used for the auxiliary pretraining task. E.g. for BERT-family of models, this returns + the classification token after processing through a linear layer and a tanh activation function. The linear + layer weights are trained from the next sentence prediction (classification) objective during pretraining. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + projection_state (`tuple(torch.FloatTensor)`, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` of shape `(batch_size,config.project_dim)`. + + Text embeddings before the projection layer, used to mimic the last hidden state of the teacher encoder. + """ + + last_hidden_state: Optional[torch.FloatTensor] = None + pooler_output: Optional[torch.FloatTensor] = None + hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + projection_state: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +class Seq2SeqSpectrogramOutput(ModelOutput): + """ + Base class for sequence-to-sequence spectrogram outputs. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Spectrogram generation loss. + spectrogram (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_bins)`): + The predicted spectrogram. + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the decoder at the output of each layer plus the initial embedding outputs. + decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the encoder at the output of each layer plus the initial embedding outputs. + encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + """ + + loss: Optional[torch.FloatTensor] = None + spectrogram: Optional[torch.FloatTensor] = None + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + decoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + decoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + cross_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + encoder_last_hidden_state: Optional[torch.FloatTensor] = None + encoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + encoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + + +@dataclass +class Seq2SeqTSModelOutput(ModelOutput): + """ + Base class for time series model's encoder outputs that also contains pre-computed hidden states that can speed up + sequential decoding. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the decoder of the model. + + If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1, + hidden_size)` is output. + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the decoder at the output of each layer plus the optional initial embedding outputs. + decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the encoder at the output of each layer plus the optional initial embedding outputs. + encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + loc (`torch.FloatTensor` of shape `(batch_size,)` or `(batch_size, input_size)`, *optional*): + Shift values of each time series' context window which is used to give the model inputs of the same + magnitude and then used to shift back to the original magnitude. + scale (`torch.FloatTensor` of shape `(batch_size,)` or `(batch_size, input_size)`, *optional*): + Scaling values of each time series' context window which is used to give the model inputs of the same + magnitude and then used to rescale back to the original magnitude. + static_features (`torch.FloatTensor` of shape `(batch_size, feature size)`, *optional*): + Static features of each time series' in a batch which are copied to the covariates at inference time. + """ + + last_hidden_state: Optional[torch.FloatTensor] = None + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + decoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + decoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + cross_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + encoder_last_hidden_state: Optional[torch.FloatTensor] = None + encoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + encoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + loc: Optional[torch.FloatTensor] = None + scale: Optional[torch.FloatTensor] = None + static_features: Optional[torch.FloatTensor] = None + + +@dataclass +class Seq2SeqTSPredictionOutput(ModelOutput): + """ + Base class for time series model's decoder outputs that also contain the loss as well as the parameters of the + chosen distribution. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when a `future_values` is provided): + Distributional loss. + params (`torch.FloatTensor` of shape `(batch_size, num_samples, num_params)`): + Parameters of the chosen distribution. + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the decoder at the output of each layer plus the initial embedding outputs. + decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the encoder at the output of each layer plus the initial embedding outputs. + encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + loc (`torch.FloatTensor` of shape `(batch_size,)` or `(batch_size, input_size)`, *optional*): + Shift values of each time series' context window which is used to give the model inputs of the same + magnitude and then used to shift back to the original magnitude. + scale (`torch.FloatTensor` of shape `(batch_size,)` or `(batch_size, input_size)`, *optional*): + Scaling values of each time series' context window which is used to give the model inputs of the same + magnitude and then used to rescale back to the original magnitude. + static_features (`torch.FloatTensor` of shape `(batch_size, feature size)`, *optional*): + Static features of each time series' in a batch which are copied to the covariates at inference time. + """ + + loss: Optional[torch.FloatTensor] = None + params: Optional[Tuple[torch.FloatTensor]] = None + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + decoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + decoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + cross_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + encoder_last_hidden_state: Optional[torch.FloatTensor] = None + encoder_hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + encoder_attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + loc: Optional[torch.FloatTensor] = None + scale: Optional[torch.FloatTensor] = None + static_features: Optional[torch.FloatTensor] = None + + +@dataclass +class SampleTSPredictionOutput(ModelOutput): + """ + Base class for time series model's predictions outputs that contains the sampled values from the chosen + distribution. + + Args: + sequences (`torch.FloatTensor` of shape `(batch_size, num_samples, prediction_length)` or `(batch_size, num_samples, prediction_length, input_size)`): + Sampled values from the chosen distribution. + """ + + sequences: Optional[torch.FloatTensor] = None + + +@dataclass +class MaskedImageModelingOutput(ModelOutput): + """ + Base class for outputs of masked image completion / in-painting models. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `bool_masked_pos` is provided): + Reconstruction loss. + reconstruction (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Reconstructed / completed images. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or + when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each stage) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states + (also called feature maps) of the model at the output of each stage. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when + `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, patch_size, + sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in + the self-attention heads. + """ + + loss: Optional[torch.FloatTensor] = None + reconstruction: Optional[torch.FloatTensor] = None + hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + + @property + def logits(self): + warnings.warn( + "logits attribute is deprecated and will be removed in version 5 of Transformers." + " Please use the reconstruction attribute to retrieve the final output instead.", + FutureWarning, + ) + return self.reconstruction diff --git a/docs/transformers/build/lib/transformers/modeling_rope_utils.py b/docs/transformers/build/lib/transformers/modeling_rope_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..ccb961edea9d6e297b14531188320d72bbf4ef66 --- /dev/null +++ b/docs/transformers/build/lib/transformers/modeling_rope_utils.py @@ -0,0 +1,655 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from functools import wraps +from typing import Optional + +from .configuration_utils import PretrainedConfig +from .utils import is_torch_available, logging + + +logger = logging.get_logger(__name__) + + +if is_torch_available(): + import torch + + +def dynamic_rope_update(rope_forward): + """ + Decorator function to update the RoPE parameters in the forward pass, if the model is using a dynamic RoPE + (i.e. a RoPE implementation that may recompute its frequencies in the forward pass). + + Args: + rope_forward (Callable): + The forward pass of the RoPE implementation. + + Returns: + The decorated forward pass. + """ + + def longrope_frequency_update(self, position_ids, device): + """Longrope uses long factor if sequence is larger than original pretraining length, short otherwise.""" + seq_len = torch.max(position_ids) + 1 + if hasattr(self.config, "original_max_position_embeddings"): + original_max_position_embeddings = self.config.original_max_position_embeddings + else: + original_max_position_embeddings = self.config.max_position_embeddings + if seq_len > original_max_position_embeddings: + if not hasattr(self, "long_inv_freq"): + self.long_inv_freq, _ = self.rope_init_fn( + self.config, device, seq_len=original_max_position_embeddings + 1 + ) + self.register_buffer("inv_freq", self.long_inv_freq, persistent=False) + else: + # This .to() is needed if the model has been moved to a device after being initialized (because + # the buffer is automatically moved, but not the original copy) + self.original_inv_freq = self.original_inv_freq.to(device) + self.register_buffer("inv_freq", self.original_inv_freq, persistent=False) + + def dynamic_frequency_update(self, position_ids, device): + """ + dynamic RoPE layers should recompute `inv_freq` in the following situations: + 1 - growing beyond the cached sequence length (allow scaling) + 2 - the current sequence length is in the original scale (avoid losing precision with small sequences) + """ + seq_len = torch.max(position_ids) + 1 + if seq_len > self.max_seq_len_cached: # growth + inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device, seq_len=seq_len) + self.register_buffer("inv_freq", inv_freq, persistent=False) # TODO joao: may break with compilation + self.max_seq_len_cached = seq_len + + if seq_len < self.original_max_seq_len and self.max_seq_len_cached > self.original_max_seq_len: # reset + # This .to() is needed if the model has been moved to a device after being initialized (because + # the buffer is automatically moved, but not the original copy) + self.original_inv_freq = self.original_inv_freq.to(device) + self.register_buffer("inv_freq", self.original_inv_freq, persistent=False) + self.max_seq_len_cached = self.original_max_seq_len + + @wraps(rope_forward) + def wrapper(self, x, position_ids): + if "dynamic" in self.rope_type: + dynamic_frequency_update(self, position_ids, device=x.device) + elif self.rope_type == "longrope": + longrope_frequency_update(self, position_ids, device=x.device) + return rope_forward(self, x, position_ids) + + return wrapper + + +def _compute_default_rope_parameters( + config: Optional[PretrainedConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + **rope_kwargs, +) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies according to the original RoPE implementation + Args: + config ([`~transformers.PretrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + rope_kwargs (`Dict`, *optional*): + BC compatibility with the previous RoPE class instantiation, will be removed in v4.45. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + if config is not None and len(rope_kwargs) > 0: + raise ValueError( + "Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in " + f"`_compute_default_rope_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}" + ) + if len(rope_kwargs) > 0: + base = rope_kwargs["base"] + dim = rope_kwargs["dim"] + elif config is not None: + base = config.rope_theta + partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 + head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads + dim = int(head_dim * partial_rotary_factor) + + attention_factor = 1.0 # Unused in this type of RoPE + + # Compute the inverse frequencies + inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim)) + return inv_freq, attention_factor + + +def _compute_linear_scaling_rope_parameters( + config: Optional[PretrainedConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + **rope_kwargs, +) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies with linear scaling. Credits to the Reddit user /u/kaiokendev + Args: + config ([`~transformers.PretrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + rope_kwargs (`Dict`, *optional*): + BC compatibility with the previous RoPE class instantiation, will be removed in v4.45. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + if config is not None and len(rope_kwargs) > 0: + raise ValueError( + "Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in " + f"`_compute_linear_scaling_rope_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}" + ) + if len(rope_kwargs) > 0: + factor = rope_kwargs["factor"] + elif config is not None: + factor = config.rope_scaling["factor"] + + # Gets the default RoPE parameters + inv_freq, attention_factor = _compute_default_rope_parameters(config, device, seq_len, **rope_kwargs) + + # Then applies linear scaling to the frequencies. + # NOTE: originally, scaling was applied to the position_ids. However, we get `embs = inv_freq @ position_ids`, so + # applying scaling to the inverse frequencies is equivalent. + inv_freq /= factor + return inv_freq, attention_factor + + +def _compute_dynamic_ntk_parameters( + config: Optional[PretrainedConfig] = None, + device: Optional["torch.device"] = None, + seq_len: Optional[int] = None, + **rope_kwargs, +) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies with NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla + Args: + config ([`~transformers.PretrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length, used to update the dynamic RoPE at inference time. + rope_kwargs (`Dict`, *optional*): + BC compatibility with the previous RoPE class instantiation, will be removed in v4.45. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE). + """ + # TODO (joao): use the new `original_max_position_embeddings` from rope_scaling + if config is not None and len(rope_kwargs) > 0: + raise ValueError( + "Unexpected arguments: `**rope_kwargs` and `config` are mutually exclusive in " + f"`_compute_dynamic_ntk_parameters`, got `rope_kwargs`={rope_kwargs} and `config`={config}" + ) + if len(rope_kwargs) > 0: + base = rope_kwargs["base"] + dim = rope_kwargs["dim"] + max_position_embeddings = rope_kwargs["max_position_embeddings"] + factor = rope_kwargs["factor"] + elif config is not None: + base = config.rope_theta + partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 + head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) + dim = int(head_dim * partial_rotary_factor) + max_position_embeddings = config.max_position_embeddings + factor = config.rope_scaling["factor"] + + attention_factor = 1.0 # Unused in this type of RoPE + + # seq_len: default to max_position_embeddings, e.g. at init time + seq_len = seq_len if seq_len is not None and seq_len > max_position_embeddings else max_position_embeddings + + # Compute the inverse frequencies + base = base * ((factor * seq_len / max_position_embeddings) - (factor - 1)) ** (dim / (dim - 2)) + inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / dim)) + return inv_freq, attention_factor + + +def _compute_yarn_parameters( + config: PretrainedConfig, device: "torch.device", seq_len: Optional[int] = None, **rope_kwargs +) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies with NTK scaling. Please refer to the + [original paper](https://arxiv.org/abs/2309.00071) + Args: + config ([`~transformers.PretrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + rope_kwargs (`Dict`, *optional*): + BC compatibility with the previous RoPE class instantiation, will be removed in v4.45. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin. + """ + # No need to keep BC with yarn, unreleased when this new pattern was created. + if len(rope_kwargs) > 0: + raise ValueError( + f"Unexpected arguments: `**rope_kwargs` should be unset in `_compute_yarn_parameters`, got {rope_kwargs}" + ) + + base = config.rope_theta + partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 + head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) + dim = int(head_dim * partial_rotary_factor) + factor = config.rope_scaling["factor"] + attention_factor = config.rope_scaling.get("attention_factor") + mscale = config.rope_scaling.get("mscale") + mscale_all_dim = config.rope_scaling.get("mscale_all_dim") + + # NOTE: DeekSeek-V3 (and potentially other models) modify `max_position_embeddings` and have a + # `original_max_position_embeddings` field containing the pretrained value. They use the ratio between these two + # values to compute the default attention scaling factor, instead of using `factor`. + if "original_max_position_embeddings" in config.rope_scaling: + original_max_position_embeddings = config.rope_scaling["original_max_position_embeddings"] + factor = config.max_position_embeddings / original_max_position_embeddings + else: + original_max_position_embeddings = config.max_position_embeddings + + def get_mscale(scale, mscale=1): + if scale <= 1: + return 1.0 + return 0.1 * mscale * math.log(scale) + 1.0 + + # Sets the attention factor as suggested in the paper + if attention_factor is None: + if mscale and mscale_all_dim: + attention_factor = float(get_mscale(factor, mscale) / get_mscale(factor, mscale_all_dim)) + else: + attention_factor = get_mscale(factor) + + # Optional config options + # beta_fast/beta_slow: as suggested in the paper, default to 32/1 (correspondingly) + beta_fast = config.rope_scaling.get("beta_fast") or 32 + beta_slow = config.rope_scaling.get("beta_slow") or 1 + + # Compute the inverse frequencies + def find_correction_dim(num_rotations, dim, base, max_position_embeddings): + """Inverse dimension formula to find the dimension based on the number of rotations""" + return (dim * math.log(max_position_embeddings / (num_rotations * 2 * math.pi))) / (2 * math.log(base)) + + def find_correction_range(low_rot, high_rot, dim, base, max_position_embeddings): + """Find dimension range bounds based on rotations""" + low = math.floor(find_correction_dim(low_rot, dim, base, max_position_embeddings)) + high = math.ceil(find_correction_dim(high_rot, dim, base, max_position_embeddings)) + return max(low, 0), min(high, dim - 1) + + def linear_ramp_factor(min, max, dim): + if min == max: + max += 0.001 # Prevent singularity + + linear_func = (torch.arange(dim, dtype=torch.float32) - min) / (max - min) + ramp_func = torch.clamp(linear_func, 0, 1) + return ramp_func + + # Note on variable naming: "interpolation" comes from the original technique, where we interpolate the position IDs + # to expand the possible context length. In other words, interpolation = apply scaling factor. + pos_freqs = base ** (torch.arange(0, dim, 2).to(device=device, dtype=torch.float) / dim) + inv_freq_extrapolation = 1.0 / pos_freqs + inv_freq_interpolation = 1.0 / (factor * pos_freqs) + + low, high = find_correction_range(beta_fast, beta_slow, dim, base, original_max_position_embeddings) + + # Get n-dimensional rotational scaling corrected for extrapolation + inv_freq_extrapolation_factor = 1 - linear_ramp_factor(low, high, dim // 2).to(device=device, dtype=torch.float) + inv_freq = ( + inv_freq_interpolation * (1 - inv_freq_extrapolation_factor) + + inv_freq_extrapolation * inv_freq_extrapolation_factor + ) + return inv_freq, attention_factor + + +def _compute_longrope_parameters( + config: PretrainedConfig, device: "torch.device", seq_len: Optional[int] = None, **rope_kwargs +) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies with LongRoPE scaling. Please refer to the + [original implementation](https://github.com/microsoft/LongRoPE) + Args: + config ([`~transformers.PretrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. + rope_kwargs (`Dict`, *optional*): + BC compatibility with the previous RoPE class instantiation, will be removed in v4.45. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin. + """ + # TODO (joao): use the new `original_max_position_embeddings` from rope_scaling + # No need to keep BC with longrope, unreleased when this new pattern was created. + if len(rope_kwargs) > 0: + raise ValueError( + "Unexpected arguments: `**rope_kwargs` should be unset in `_compute_longrope_parameters`, got " + f"{rope_kwargs}" + ) + + base = config.rope_theta + partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 + head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) + dim = int(head_dim * partial_rotary_factor) + long_factor = config.rope_scaling["long_factor"] + short_factor = config.rope_scaling["short_factor"] + factor = config.rope_scaling.get("factor") + attention_factor = config.rope_scaling.get("attention_factor") + + # NOTE: Phi3 (and potentially other models) modify `max_position_embeddings` and have a + # `original_max_position_embeddings` field containing the pretrained value. They use the ratio between these two + # values to compute the default attention scaling factor, instead of using `factor`. + if hasattr(config, "original_max_position_embeddings"): + original_max_position_embeddings = config.original_max_position_embeddings + factor = config.max_position_embeddings / config.original_max_position_embeddings + else: + original_max_position_embeddings = config.max_position_embeddings + + # Sets the attention factor as suggested in the paper + if attention_factor is None: + if factor <= 1.0: + attention_factor = 1.0 + else: + attention_factor = math.sqrt(1 + math.log(factor) / math.log(original_max_position_embeddings)) + + # Compute the inverse frequencies -- scaled based on the target sequence length + if seq_len and seq_len > original_max_position_embeddings: + ext_factors = torch.tensor(long_factor, dtype=torch.float32, device=device) + else: + ext_factors = torch.tensor(short_factor, dtype=torch.float32, device=device) + inv_freq_shape = torch.arange(0, dim, 2, dtype=torch.int64, device=device).float() / dim + inv_freq = 1.0 / (ext_factors * base**inv_freq_shape) + + return inv_freq, attention_factor + + +def _compute_llama3_parameters( + config: PretrainedConfig, device: "torch.device", seq_len: Optional[int] = None, **rope_kwargs +) -> tuple["torch.Tensor", float]: + """ + Computes the inverse frequencies for llama 3.1. + + Args: + config ([`~transformers.PretrainedConfig`]): + The model configuration. + device (`torch.device`): + The device to use for initialization of the inverse frequencies. + seq_len (`int`, *optional*): + The current sequence length. Unused for this type of RoPE. + rope_kwargs (`Dict`, *optional*): + BC compatibility with the previous RoPE class instantiation, will be removed in v4.45. + Returns: + Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the + post-processing scaling factor applied to the computed cos/sin. + """ + # Gets the default RoPE parameters + inv_freq, attention_factor = _compute_default_rope_parameters(config, device, seq_len, **rope_kwargs) + + factor = config.rope_scaling["factor"] # `8` in the original implementation + low_freq_factor = config.rope_scaling["low_freq_factor"] # `1` in the original implementation + high_freq_factor = config.rope_scaling["high_freq_factor"] # `4` in the original implementation + old_context_len = config.rope_scaling["original_max_position_embeddings"] # `8192` in the original implementation + + low_freq_wavelen = old_context_len / low_freq_factor + high_freq_wavelen = old_context_len / high_freq_factor + + wavelen = 2 * math.pi / inv_freq + # wavelen < high_freq_wavelen: do nothing + # wavelen > low_freq_wavelen: divide by factor + inv_freq_llama = torch.where(wavelen > low_freq_wavelen, inv_freq / factor, inv_freq) + # otherwise: interpolate between the two, using a smooth factor + smooth_factor = (old_context_len / wavelen - low_freq_factor) / (high_freq_factor - low_freq_factor) + smoothed_inv_freq = (1 - smooth_factor) * inv_freq_llama / factor + smooth_factor * inv_freq_llama + is_medium_freq = ~(wavelen < high_freq_wavelen) * ~(wavelen > low_freq_wavelen) + inv_freq_llama = torch.where(is_medium_freq, smoothed_inv_freq, inv_freq_llama) + + return inv_freq_llama, attention_factor + + +# This maps the "rope_type" string field in rope config to the corresponding function to compute the RoPE parameters +# from the model config. You can append new {'rope_type': callable} pairs to this dictionary to enable custom RoPE +# parameterizations, as long as the callable has the same signature. +ROPE_INIT_FUNCTIONS = { + "default": _compute_default_rope_parameters, + "linear": _compute_linear_scaling_rope_parameters, + "dynamic": _compute_dynamic_ntk_parameters, + "yarn": _compute_yarn_parameters, + "longrope": _compute_longrope_parameters, + "llama3": _compute_llama3_parameters, +} + + +def _check_received_keys( + rope_type: str, + received_keys: set, + required_keys: set, + optional_keys: Optional[set] = None, + ignore_keys: Optional[set] = None, +): + """Compare the received keys in `config.rope_scaling` against the expected and optional keys""" + # BC: "rope_type" was originally "type" -- let's check for "rope_type" when "type" is present + if "type" in received_keys: + received_keys -= {"type"} + required_keys.add("rope_type") + + # Some models need to store model-specific keys, and we don't want to throw warning at them + if ignore_keys is not None: + received_keys -= ignore_keys + + missing_keys = required_keys - received_keys + if missing_keys: + raise KeyError(f"Missing required keys in `rope_scaling` for 'rope_type'='{rope_type}': {missing_keys}") + + if optional_keys is not None: + unused_keys = received_keys - required_keys - optional_keys + else: + unused_keys = received_keys - required_keys + if unused_keys: + logger.warning(f"Unrecognized keys in `rope_scaling` for 'rope_type'='{rope_type}': {unused_keys}") + + +def _validate_default_rope_parameters(config: PretrainedConfig, ignore_keys: Optional[set] = None): + rope_scaling = config.rope_scaling + rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None)) # BC: "rope_type" was originally "type" + required_keys = {"rope_type"} + received_keys = set(rope_scaling.keys()) + _check_received_keys(rope_type, received_keys, required_keys, ignore_keys=ignore_keys) + + +def _validate_linear_scaling_rope_parameters(config: PretrainedConfig, ignore_keys: Optional[set] = None): + rope_scaling = config.rope_scaling + rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None)) # BC: "rope_type" was originally "type" + required_keys = {"rope_type", "factor"} + received_keys = set(rope_scaling.keys()) + _check_received_keys(rope_type, received_keys, required_keys, ignore_keys=ignore_keys) + + factor = rope_scaling["factor"] + if factor is None or not isinstance(factor, float) or factor < 1.0: + logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}") + + +def _validate_dynamic_scaling_rope_parameters(config: PretrainedConfig, ignore_keys: Optional[set] = None): + rope_scaling = config.rope_scaling + rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None)) # BC: "rope_type" was originally "type" + required_keys = {"rope_type", "factor"} + # TODO (joao): update logic for the inclusion of `original_max_position_embeddings` + optional_keys = {"original_max_position_embeddings"} + received_keys = set(rope_scaling.keys()) + _check_received_keys(rope_type, received_keys, required_keys, optional_keys, ignore_keys=ignore_keys) + + factor = rope_scaling["factor"] + if factor is None or not isinstance(factor, float) or factor < 1.0: + logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}") + + +def _validate_yarn_parameters(config: PretrainedConfig, ignore_keys: Optional[set] = None): + rope_scaling = config.rope_scaling + rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None)) # BC: "rope_type" was originally "type" + required_keys = {"rope_type", "factor"} + optional_keys = { + "attention_factor", + "beta_fast", + "beta_slow", + "original_max_position_embeddings", + "mscale", + "mscale_all_dim", + } + received_keys = set(rope_scaling.keys()) + _check_received_keys(rope_type, received_keys, required_keys, optional_keys, ignore_keys=ignore_keys) + + factor = rope_scaling["factor"] + if factor is None or not isinstance(factor, float) or factor < 1.0: + logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}") + + attention_factor = rope_scaling.get("attention_factor") + if attention_factor is not None and (not isinstance(attention_factor, float) or attention_factor < 0): + logger.warning( + f"`rope_scaling`'s attention_factor field must be a float greater than 0, got {attention_factor}" + ) + beta_fast = rope_scaling.get("beta_fast") + if beta_fast is not None and not isinstance(beta_fast, float): + logger.warning(f"`rope_scaling`'s beta_fast field must be a float, got {beta_fast}") + beta_slow = rope_scaling.get("beta_slow") + if beta_slow is not None and not isinstance(beta_slow, float): + logger.warning(f"`rope_scaling`'s beta_slow field must be a float, got {beta_slow}") + + if (beta_fast or 32) < (beta_slow or 1): + logger.warning( + f"`rope_scaling`'s beta_fast field must be greater than beta_slow, got beta_fast={beta_fast} " + f"(defaults to 32 if None) and beta_slow={beta_slow} (defaults to 1 if None)" + ) + + +def _validate_longrope_parameters(config: PretrainedConfig, ignore_keys: Optional[set] = None): + rope_scaling = config.rope_scaling + rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None)) # BC: "rope_type" was originally "type" + required_keys = {"rope_type", "short_factor", "long_factor"} + # TODO (joao): update logic for the inclusion of `original_max_position_embeddings` + optional_keys = {"attention_factor", "factor", "original_max_position_embeddings"} + received_keys = set(rope_scaling.keys()) + _check_received_keys(rope_type, received_keys, required_keys, optional_keys, ignore_keys=ignore_keys) + + partial_rotary_factor = config.partial_rotary_factor if hasattr(config, "partial_rotary_factor") else 1.0 + head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) + dim = int(head_dim * partial_rotary_factor) + + short_factor = rope_scaling.get("short_factor") + if not isinstance(short_factor, list) and all(isinstance(x, (int, float)) for x in short_factor): + logger.warning(f"`rope_scaling`'s short_factor field must be a list of numbers, got {short_factor}") + if not len(short_factor) == dim // 2: + logger.warning(f"`rope_scaling`'s short_factor field must have length {dim // 2}, got {len(short_factor)}") + + long_factor = rope_scaling.get("long_factor") + if not isinstance(long_factor, list) and all(isinstance(x, (int, float)) for x in long_factor): + logger.warning(f"`rope_scaling`'s long_factor field must be a list of numbers, got {long_factor}") + if not len(long_factor) == dim // 2: + logger.warning(f"`rope_scaling`'s long_factor field must have length {dim // 2}, got {len(long_factor)}") + + # Handle Phi3 divergence: prefer the use of `attention_factor` and/or `factor` over + # `original_max_position_embeddings` to compute internal variables. The latter lives outside `rope_scaling` and is + # unique to longrope (= undesirable) + if hasattr(config, "original_max_position_embeddings"): + logger.warning_once( + "This model has set a `original_max_position_embeddings` field, to be used together with " + "`max_position_embeddings` to determine a scaling factor. Please set the `factor` field of `rope_scaling`" + "with this ratio instead -- we recommend the use of this field over `original_max_position_embeddings`, " + "as it is compatible with most model architectures." + ) + else: + factor = rope_scaling.get("factor") + if factor is None: + logger.warning("Missing required keys in `rope_scaling`: 'factor'") + elif not isinstance(factor, float) or factor < 1.0: + logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}") + + attention_factor = rope_scaling.get("attention_factor") + if attention_factor is not None: + if not isinstance(attention_factor, float) or attention_factor < 0.0: + logger.warning( + f"`rope_scaling`'s attention_factor field must be a float greater than 0, got {attention_factor}" + ) + + +def _validate_llama3_parameters(config: PretrainedConfig, ignore_keys: Optional[set] = None): + rope_scaling = config.rope_scaling + rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", None)) # BC: "rope_type" was originally "type" + required_keys = {"rope_type", "factor", "original_max_position_embeddings", "low_freq_factor", "high_freq_factor"} + received_keys = set(rope_scaling.keys()) + _check_received_keys(rope_type, received_keys, required_keys, ignore_keys=ignore_keys) + + factor = rope_scaling["factor"] + if factor is None or not isinstance(factor, float) or factor < 1.0: + logger.warning(f"`rope_scaling`'s factor field must be a float >= 1, got {factor}") + + low_freq_factor = rope_scaling["low_freq_factor"] + high_freq_factor = rope_scaling["high_freq_factor"] + if low_freq_factor is None or not isinstance(low_freq_factor, float): + logger.warning(f"`rope_scaling`'s low_freq_factor field must be a float, got {low_freq_factor}") + if high_freq_factor is None or not isinstance(high_freq_factor, float): + logger.warning(f"`rope_scaling`'s high_freq_factor field must be a float, got {high_freq_factor}") + if high_freq_factor <= low_freq_factor: + logger.warning( + "`rope_scaling`'s high_freq_factor field must be greater than low_freq_factor, got high_freq_factor=" + f"{high_freq_factor} and low_freq_factor={low_freq_factor}" + ) + + original_max_position_embeddings = rope_scaling["original_max_position_embeddings"] + if original_max_position_embeddings is None or not isinstance(original_max_position_embeddings, int): + logger.warning( + "`rope_scaling`'s original_max_position_embeddings field must be an integer, got " + f"{original_max_position_embeddings}" + ) + if original_max_position_embeddings >= config.max_position_embeddings: + logger.warning( + "`rope_scaling`'s original_max_position_embeddings field must be less than max_position_embeddings, got " + f"{original_max_position_embeddings} and max_position_embeddings={config.max_position_embeddings}" + ) + + +# Like `ROPE_INIT_FUNCTIONS`, this validation function mapping can be dynamically updated for custom RoPE types. +ROPE_VALIDATION_FUNCTIONS = { + "default": _validate_default_rope_parameters, + "linear": _validate_linear_scaling_rope_parameters, + "dynamic": _validate_dynamic_scaling_rope_parameters, + "yarn": _validate_yarn_parameters, + "longrope": _validate_longrope_parameters, + "llama3": _validate_llama3_parameters, +} + + +def rope_config_validation(config: PretrainedConfig, ignore_keys: Optional[set] = None): + """ + Validate the RoPE config arguments, given a `PretrainedConfig` object + """ + rope_scaling = getattr(config, "rope_scaling", None) # not a default parameter in `PretrainedConfig` + if rope_scaling is None: + return + + # BC: "rope_type" was originally "type" + rope_type = rope_scaling.get("rope_type", rope_scaling.get("type", "default")) + validation_fn = ROPE_VALIDATION_FUNCTIONS.get(rope_type) + if validation_fn is not None: + validation_fn(config, ignore_keys=ignore_keys) + else: + logger.warning( + f"Missing validation function mapping in `ROPE_VALIDATION_FUNCTIONS` for 'rope_type'='{rope_type}'" + ) diff --git a/docs/transformers/build/lib/transformers/modeling_tf_outputs.py b/docs/transformers/build/lib/transformers/modeling_tf_outputs.py new file mode 100644 index 0000000000000000000000000000000000000000..cbc8b3682af608d0db6da02ef36764d0be10ca9c --- /dev/null +++ b/docs/transformers/build/lib/transformers/modeling_tf_outputs.py @@ -0,0 +1,991 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import warnings +from dataclasses import dataclass +from typing import List, Optional, Tuple + +import tensorflow as tf + +from .utils import ModelOutput + + +@dataclass +class TFBaseModelOutput(ModelOutput): + """ + Base class for model's outputs, with potential hidden states and attentions. + + Args: + last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (`tuple(tf.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + last_hidden_state: Optional[tf.Tensor] = None + hidden_states: Tuple[tf.Tensor] | None = None + attentions: Tuple[tf.Tensor] | None = None + + +@dataclass +class TFBaseModelOutputWithNoAttention(ModelOutput): + """ + Base class for model's outputs, with potential hidden states. + + Args: + last_hidden_state (`tf.Tensor` shape `(batch_size, num_channels, height, width)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for + the output of each layer) of shape `(batch_size, num_channels, height, width)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + """ + + last_hidden_state: Optional[tf.Tensor] = None + hidden_states: Optional[Tuple[tf.Tensor, ...]] = None + + +@dataclass +class TFBaseModelOutputWithPooling(ModelOutput): + """ + Base class for model's outputs that also contains a pooling of the last hidden states. + + Args: + last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + pooler_output (`tf.Tensor` of shape `(batch_size, hidden_size)`): + Last layer hidden-state of the first token of the sequence (classification token) further processed by a + Linear layer and a Tanh activation function. The Linear layer weights are trained from the next sentence + prediction (classification) objective during pretraining. + + This output is usually *not* a good summary of the semantic content of the input, you're often better with + averaging or pooling the sequence of hidden-states for the whole input sequence. + hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + last_hidden_state: Optional[tf.Tensor] = None + pooler_output: Optional[tf.Tensor] = None + hidden_states: Tuple[tf.Tensor] | None = None + attentions: Tuple[tf.Tensor] | None = None + + +@dataclass +class TFBaseModelOutputWithPoolingAndNoAttention(ModelOutput): + """ + Base class for model's outputs that also contains a pooling of the last hidden states. + + Args: + last_hidden_state (`tf.Tensor` of shape `(batch_size, num_channels, height, width)`): + Sequence of hidden-states at the output of the last layer of the model. + pooler_output (`tf.Tensor` of shape `(batch_size, hidden_size)`): + Last layer hidden-state after a pooling operation on the spatial dimensions. + hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for + the output of each layer) of shape `(batch_size, num_channels, height, width)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + """ + + last_hidden_state: Optional[tf.Tensor] = None + pooler_output: Optional[tf.Tensor] = None + hidden_states: Optional[Tuple[tf.Tensor, ...]] = None + + +@dataclass +class TFBaseModelOutputWithPoolingAndCrossAttentions(ModelOutput): + """ + Base class for model's outputs that also contains a pooling of the last hidden states. + + Args: + last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + pooler_output (`tf.Tensor` of shape `(batch_size, hidden_size)`): + Last layer hidden-state of the first token of the sequence (classification token) further processed by a + Linear layer and a Tanh activation function. The Linear layer weights are trained from the next sentence + prediction (classification) objective during pretraining. + + This output is usually *not* a good summary of the semantic content of the input, you're often better with + averaging or pooling the sequence of hidden-states for the whole input sequence. + past_key_values (`List[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads, + sequence_length, embed_size_per_head)`). + + Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + cross_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + """ + + last_hidden_state: Optional[tf.Tensor] = None + pooler_output: Optional[tf.Tensor] = None + past_key_values: List[tf.Tensor] | None = None + hidden_states: Tuple[tf.Tensor] | None = None + attentions: Tuple[tf.Tensor] | None = None + cross_attentions: Tuple[tf.Tensor] | None = None + + +@dataclass +class TFBaseModelOutputWithPast(ModelOutput): + """ + Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding). + + Args: + last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + + If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1, + hidden_size)` is output. + past_key_values (`List[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads, + sequence_length, embed_size_per_head)`). + + Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + last_hidden_state: Optional[tf.Tensor] = None + past_key_values: List[tf.Tensor] | None = None + hidden_states: Tuple[tf.Tensor] | None = None + attentions: Tuple[tf.Tensor] | None = None + + +@dataclass +class TFBaseModelOutputWithCrossAttentions(ModelOutput): + """ + Base class for model's outputs, with potential hidden states and attentions. + + Args: + last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (`tuple(tf.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + cross_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + """ + + last_hidden_state: Optional[tf.Tensor] = None + hidden_states: Tuple[tf.Tensor] | None = None + attentions: Tuple[tf.Tensor] | None = None + cross_attentions: Tuple[tf.Tensor] | None = None + + +@dataclass +class TFBaseModelOutputWithPastAndCrossAttentions(ModelOutput): + """ + Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding). + + Args: + last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + + If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1, + hidden_size)` is output. + past_key_values (`List[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads, + sequence_length, embed_size_per_head)`). + + Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + hidden_states (`tuple(tf.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + cross_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + """ + + last_hidden_state: Optional[tf.Tensor] = None + past_key_values: List[tf.Tensor] | None = None + hidden_states: Tuple[tf.Tensor] | None = None + attentions: Tuple[tf.Tensor] | None = None + cross_attentions: Tuple[tf.Tensor] | None = None + + +@dataclass +class TFSeq2SeqModelOutput(ModelOutput): + """ + Base class for model encoder's outputs that also contains : pre-computed hidden states that can speed up sequential + decoding. + + Args: + last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the decoder of the model. + + If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1, + hidden_size)` is output. + past_key_values (`List[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads, + sequence_length, embed_size_per_head)`). + + Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be + used (see `past_key_values` input) to speed up sequential decoding. + decoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the decoder at the output of each layer plus the initial embedding outputs. + decoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + cross_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + encoder_last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the encoder at the output of each layer plus the initial embedding outputs. + encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + """ + + last_hidden_state: Optional[tf.Tensor] = None + past_key_values: List[tf.Tensor] | None = None + decoder_hidden_states: Tuple[tf.Tensor] | None = None + decoder_attentions: Tuple[tf.Tensor] | None = None + cross_attentions: Tuple[tf.Tensor] | None = None + encoder_last_hidden_state: tf.Tensor | None = None + encoder_hidden_states: Tuple[tf.Tensor] | None = None + encoder_attentions: Tuple[tf.Tensor] | None = None + + +@dataclass +class TFCausalLMOutput(ModelOutput): + """ + Base class for causal language model (or autoregressive) outputs. + + Args: + loss (`tf.Tensor` of shape `(n,)`, *optional*, where n is the number of non-masked labels, returned when `labels` is provided): + Language modeling loss (for next-token prediction). + logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: tf.Tensor | None = None + logits: Optional[tf.Tensor] = None + hidden_states: Tuple[tf.Tensor] | None = None + attentions: Tuple[tf.Tensor] | None = None + + +@dataclass +class TFCausalLMOutputWithPast(ModelOutput): + """ + Base class for causal language model (or autoregressive) outputs. + + Args: + loss (`tf.Tensor` of shape `(n,)`, *optional*, where n is the number of non-masked labels, returned when `labels` is provided): + Language modeling loss (for next-token prediction). + logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + past_key_values (`List[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads, + sequence_length, embed_size_per_head)`). + + Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: tf.Tensor | None = None + logits: Optional[tf.Tensor] = None + past_key_values: List[tf.Tensor] | None = None + hidden_states: Tuple[tf.Tensor] | None = None + attentions: Tuple[tf.Tensor] | None = None + + +@dataclass +class TFCausalLMOutputWithCrossAttentions(ModelOutput): + """ + Base class for causal language model (or autoregressive) outputs. + + Args: + loss (`tf.Tensor` of shape `(n,)`, *optional*, where n is the number of non-masked labels, returned when `labels` is provided): + Language modeling loss (for next-token prediction). + logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + cross_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + past_key_values (`List[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads, + sequence_length, embed_size_per_head)`). + + Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + """ + + loss: tf.Tensor | None = None + logits: Optional[tf.Tensor] = None + past_key_values: List[tf.Tensor] | None = None + hidden_states: Tuple[tf.Tensor] | None = None + attentions: Tuple[tf.Tensor] | None = None + cross_attentions: Tuple[tf.Tensor] | None = None + + +@dataclass +class TFMaskedLMOutput(ModelOutput): + """ + Base class for masked language models outputs. + + Args: + loss (`tf.Tensor` of shape `(n,)`, *optional*, where n is the number of non-masked labels, returned when `labels` is provided): + Masked language modeling (MLM) loss. + logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: tf.Tensor | None = None + logits: Optional[tf.Tensor] = None + hidden_states: Tuple[tf.Tensor] | None = None + attentions: Tuple[tf.Tensor] | None = None + + +@dataclass +class TFSeq2SeqLMOutput(ModelOutput): + """ + Base class for sequence-to-sequence language models outputs. + + Args: + loss (`tf.Tensor` of shape `(n,)`, *optional*, where n is the number of non-masked labels, returned when `labels` is provided): + Language modeling loss. + logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + past_key_values (`List[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads, + sequence_length, embed_size_per_head)`). + + Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be + used (see `past_key_values` input) to speed up sequential decoding. + decoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the decoder at the output of each layer plus the initial embedding outputs. + decoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + cross_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + encoder_last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the encoder at the output of each layer plus the initial embedding outputs. + encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + """ + + loss: tf.Tensor | None = None + logits: Optional[tf.Tensor] = None + past_key_values: List[tf.Tensor] | None = None + decoder_hidden_states: Tuple[tf.Tensor] | None = None + decoder_attentions: Tuple[tf.Tensor] | None = None + cross_attentions: Tuple[tf.Tensor] | None = None + encoder_last_hidden_state: tf.Tensor | None = None + encoder_hidden_states: Tuple[tf.Tensor] | None = None + encoder_attentions: Tuple[tf.Tensor] | None = None + + +@dataclass +class TFNextSentencePredictorOutput(ModelOutput): + """ + Base class for outputs of models predicting if two sentences are consecutive or not. + + Args: + loss (`tf.Tensor` of shape `(n,)`, *optional*, where n is the number of non-masked labels, returned when `next_sentence_label` is provided): + Next sentence prediction loss. + logits (`tf.Tensor` of shape `(batch_size, 2)`): + Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation + before SoftMax). + hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: tf.Tensor | None = None + logits: Optional[tf.Tensor] = None + hidden_states: Tuple[tf.Tensor] | None = None + attentions: Tuple[tf.Tensor] | None = None + + +@dataclass +class TFSequenceClassifierOutput(ModelOutput): + """ + Base class for outputs of sentence classification models. + + Args: + loss (`tf.Tensor` of shape `(batch_size, )`, *optional*, returned when `labels` is provided): + Classification (or regression if config.num_labels==1) loss. + logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`): + Classification (or regression if config.num_labels==1) scores (before SoftMax). + hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: tf.Tensor | None = None + logits: Optional[tf.Tensor] = None + hidden_states: Tuple[tf.Tensor] | None = None + attentions: Tuple[tf.Tensor] | None = None + + +@dataclass +class TFSeq2SeqSequenceClassifierOutput(ModelOutput): + """ + Base class for outputs of sequence-to-sequence sentence classification models. + + Args: + loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `label` is provided): + Classification (or regression if config.num_labels==1) loss. + logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`): + Classification (or regression if config.num_labels==1) scores (before SoftMax). + past_key_values (`List[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads, + sequence_length, embed_size_per_head)`). + + Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be + used (see `past_key_values` input) to speed up sequential decoding. + decoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the decoder at the output of each layer plus the initial embedding outputs. + decoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + cross_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)` + encoder_last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the encoder at the output of each layer plus the initial embedding outputs. + encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + """ + + loss: tf.Tensor | None = None + logits: Optional[tf.Tensor] = None + past_key_values: List[tf.Tensor] | None = None + decoder_hidden_states: Tuple[tf.Tensor] | None = None + decoder_attentions: Tuple[tf.Tensor] | None = None + cross_attentions: Tuple[tf.Tensor] | None = None + encoder_last_hidden_state: tf.Tensor | None = None + encoder_hidden_states: Tuple[tf.Tensor] | None = None + encoder_attentions: Tuple[tf.Tensor] | None = None + + +@dataclass +class TFSemanticSegmenterOutput(ModelOutput): + """ + Base class for outputs of semantic segmentation models. + + Args: + loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Classification (or regression if config.num_labels==1) loss. + logits (`tf.Tensor` of shape `(batch_size, config.num_labels, logits_height, logits_width)`): + Classification scores for each pixel. + + + + The logits returned do not necessarily have the same size as the `pixel_values` passed as inputs. This is + to avoid doing two interpolations and lose some quality when a user needs to resize the logits to the + original image size as post-processing. You should always check your logits shape and resize as needed. + + + + hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for + the output of each layer) of shape `(batch_size, patch_size, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, patch_size, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: tf.Tensor | None = None + logits: Optional[tf.Tensor] = None + hidden_states: Tuple[tf.Tensor] | None = None + attentions: Tuple[tf.Tensor] | None = None + + +@dataclass +class TFSemanticSegmenterOutputWithNoAttention(ModelOutput): + """ + Base class for outputs of semantic segmentation models that do not output attention scores. + + Args: + loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Classification (or regression if config.num_labels==1) loss. + logits (`tf.Tensor` of shape `(batch_size, config.num_labels, logits_height, logits_width)`): + Classification scores for each pixel. + + + + The logits returned do not necessarily have the same size as the `pixel_values` passed as inputs. This is + to avoid doing two interpolations and lose some quality when a user needs to resize the logits to the + original image size as post-processing. You should always check your logits shape and resize as needed. + + + + hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for + the output of each layer) of shape `(batch_size, patch_size, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + """ + + loss: tf.Tensor | None = None + logits: Optional[tf.Tensor] = None + hidden_states: Tuple[tf.Tensor] | None = None + + +@dataclass +class TFImageClassifierOutput(ModelOutput): + """ + Base class for outputs of image classification models. + + Args: + loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Classification (or regression if config.num_labels==1) loss. + logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`): + Classification (or regression if config.num_labels==1) scores (before SoftMax). + hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for + the output of each stage) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states (also called + feature maps) of the model at the output of each stage. + attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, patch_size, sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: tf.Tensor | None = None + logits: Optional[tf.Tensor] = None + hidden_states: Tuple[tf.Tensor] | None = None + attentions: Tuple[tf.Tensor] | None = None + + +@dataclass +class TFMultipleChoiceModelOutput(ModelOutput): + """ + Base class for outputs of multiple choice models. + + Args: + loss (`tf.Tensor` of shape *(batch_size, )*, *optional*, returned when `labels` is provided): + Classification loss. + logits (`tf.Tensor` of shape `(batch_size, num_choices)`): + *num_choices* is the second dimension of the input tensors. (see *input_ids* above). + + Classification scores (before SoftMax). + hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: tf.Tensor | None = None + logits: Optional[tf.Tensor] = None + hidden_states: Tuple[tf.Tensor] | None = None + attentions: Tuple[tf.Tensor] | None = None + + +@dataclass +class TFTokenClassifierOutput(ModelOutput): + """ + Base class for outputs of token classification models. + + Args: + loss (`tf.Tensor` of shape `(n,)`, *optional*, where n is the number of unmasked labels, returned when `labels` is provided) : + Classification loss. + logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.num_labels)`): + Classification scores (before SoftMax). + hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: tf.Tensor | None = None + logits: Optional[tf.Tensor] = None + hidden_states: Tuple[tf.Tensor] | None = None + attentions: Tuple[tf.Tensor] | None = None + + +@dataclass +class TFQuestionAnsweringModelOutput(ModelOutput): + """ + Base class for outputs of question answering models. + + Args: + loss (`tf.Tensor` of shape `(batch_size, )`, *optional*, returned when `start_positions` and `end_positions` are provided): + Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. + start_logits (`tf.Tensor` of shape `(batch_size, sequence_length)`): + Span-start scores (before SoftMax). + end_logits (`tf.Tensor` of shape `(batch_size, sequence_length)`): + Span-end scores (before SoftMax). + hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: tf.Tensor | None = None + start_logits: Optional[tf.Tensor] = None + end_logits: Optional[tf.Tensor] = None + hidden_states: Tuple[tf.Tensor] | None = None + attentions: Tuple[tf.Tensor] | None = None + + +@dataclass +class TFSeq2SeqQuestionAnsweringModelOutput(ModelOutput): + """ + Base class for outputs of sequence-to-sequence question answering models. + + Args: + loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. + start_logits (`tf.Tensor` of shape `(batch_size, sequence_length)`): + Span-start scores (before SoftMax). + end_logits (`tf.Tensor` of shape `(batch_size, sequence_length)`): + Span-end scores (before SoftMax). + past_key_values (`List[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads, + sequence_length, embed_size_per_head)`). + + Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be + used (see `past_key_values` input) to speed up sequential decoding. + decoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the decoder at the output of each layer plus the initial embedding outputs. + decoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + encoder_last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the encoder at the output of each layer plus the initial embedding outputs. + encoder_attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + """ + + loss: tf.Tensor | None = None + start_logits: Optional[tf.Tensor] = None + end_logits: Optional[tf.Tensor] = None + past_key_values: List[tf.Tensor] | None = None + decoder_hidden_states: Tuple[tf.Tensor] | None = None + decoder_attentions: Tuple[tf.Tensor] | None = None + encoder_last_hidden_state: tf.Tensor | None = None + encoder_hidden_states: Tuple[tf.Tensor] | None = None + encoder_attentions: Tuple[tf.Tensor] | None = None + + +@dataclass +class TFSequenceClassifierOutputWithPast(ModelOutput): + """ + Base class for outputs of sentence classification models. + + Args: + loss (`tf.Tensor` of shape `(batch_size, )`, *optional*, returned when `labels` is provided): + Classification (or regression if config.num_labels==1) loss. + logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`): + Classification (or regression if config.num_labels==1) scores (before SoftMax). + past_key_values (`List[tf.Tensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + List of `tf.Tensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size, num_heads, + sequence_length, embed_size_per_head)`). + + Contains pre-computed hidden-states (key and values in the attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: tf.Tensor | None = None + logits: Optional[tf.Tensor] = None + past_key_values: List[tf.Tensor] | None = None + hidden_states: Tuple[tf.Tensor] | None = None + attentions: Tuple[tf.Tensor] | None = None + + +@dataclass +class TFImageClassifierOutputWithNoAttention(ModelOutput): + """ + Base class for outputs of image classification models. + + Args: + loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Classification (or regression if config.num_labels==1) loss. + logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`): + Classification (or regression if config.num_labels==1) scores (before SoftMax). + hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for + the output of each stage) of shape `(batch_size, num_channels, height, width)`. Hidden-states (also called + feature maps) of the model at the output of each stage. + """ + + loss: tf.Tensor | None = None + logits: Optional[tf.Tensor] = None + hidden_states: Optional[Tuple[tf.Tensor, ...]] = None + + +@dataclass +class TFMaskedImageModelingOutput(ModelOutput): + """ + Base class for outputs of masked image completion / in-painting models. + + Args: + loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `bool_masked_pos` is provided): + Reconstruction loss. + reconstruction (`tf.Tensor` of shape `(batch_size, num_channels, height, width)`): + Reconstructed / completed images. + hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when + `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for + the output of each stage) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states (also called + feature maps) of the model at the output of each stage. + attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when + `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, patch_size, sequence_length)`. + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: tf.Tensor | None = None + reconstruction: Optional[tf.Tensor] = None + hidden_states: Tuple[tf.Tensor] | None = None + attentions: Tuple[tf.Tensor] | None = None + + @property + def logits(self): + warnings.warn( + "logits attribute is deprecated and will be removed in version 5 of Transformers." + " Please use the reconstruction attribute to retrieve the final output instead.", + FutureWarning, + ) + return self.reconstruction diff --git a/docs/transformers/build/lib/transformers/modeling_tf_pytorch_utils.py b/docs/transformers/build/lib/transformers/modeling_tf_pytorch_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..84a6ddaebcc46c12a2a08e4612fde2f4f20cf648 --- /dev/null +++ b/docs/transformers/build/lib/transformers/modeling_tf_pytorch_utils.py @@ -0,0 +1,671 @@ +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch - TF 2.0 general utilities.""" + +import os +import re + +import numpy + +from .utils import ( + ExplicitEnum, + expand_dims, + is_numpy_array, + is_safetensors_available, + is_torch_tensor, + logging, + reshape, + squeeze, + tensor_size, +) +from .utils import transpose as transpose_func + + +if is_safetensors_available(): + from safetensors import safe_open + + +logger = logging.get_logger(__name__) + + +class TransposeType(ExplicitEnum): + """ + Possible ... + """ + + NO = "no" + SIMPLE = "simple" + CONV1D = "conv1d" + CONV2D = "conv2d" + + +def convert_tf_weight_name_to_pt_weight_name( + tf_name, start_prefix_to_remove="", tf_weight_shape=None, name_scope=None +): + """ + Convert a TF 2.0 model variable name in a pytorch model weight name. + + Conventions for TF2.0 scopes -> PyTorch attribute names conversions: + + - '$1___$2' is replaced by $2 (can be used to duplicate or remove layers in TF2.0 vs PyTorch) + - '_._' is replaced by a new level separation (can be used to convert TF2.0 lists in PyTorch nn.ModulesList) + + return tuple with: + + - pytorch model weight name + - transpose: `TransposeType` member indicating whether and how TF2.0 and PyTorch weights matrices should be + transposed with regards to each other + """ + if name_scope is not None: + if not tf_name.startswith(name_scope) and "final_logits_bias" not in tf_name: + raise ValueError( + f"Weight name {tf_name} does not start with name_scope {name_scope}. This is an internal error " + "in Transformers, so (unless you were doing something really evil) please open an issue to report it!" + ) + tf_name = tf_name[len(name_scope) :] + tf_name = tf_name.lstrip("/") + tf_name = tf_name.replace(":0", "") # device ids + tf_name = re.sub( + r"/[^/]*___([^/]*)/", r"/\1/", tf_name + ) # '$1___$2' is replaced by $2 (can be used to duplicate or remove layers in TF2.0 vs PyTorch) + tf_name = tf_name.replace( + "_._", "/" + ) # '_._' is replaced by a level separation (can be used to convert TF2.0 lists in PyTorch nn.ModulesList) + tf_name = re.sub(r"//+", "/", tf_name) # Remove empty levels at the end + tf_name = tf_name.split("/") # Convert from TF2.0 '/' separators to PyTorch '.' separators + # Some weights have a single name without "/" such as final_logits_bias in BART + if len(tf_name) > 1: + tf_name = tf_name[1:] # Remove level zero + + tf_weight_shape = list(tf_weight_shape) + + # When should we transpose the weights + if tf_name[-1] == "kernel" and tf_weight_shape is not None and len(tf_weight_shape) == 4: + transpose = TransposeType.CONV2D + elif tf_name[-1] == "kernel" and tf_weight_shape is not None and len(tf_weight_shape) == 3: + transpose = TransposeType.CONV1D + elif bool( + tf_name[-1] in ["kernel", "pointwise_kernel", "depthwise_kernel"] + or "emb_projs" in tf_name + or "out_projs" in tf_name + ): + transpose = TransposeType.SIMPLE + else: + transpose = TransposeType.NO + + # Convert standard TF2.0 names in PyTorch names + if tf_name[-1] == "kernel" or tf_name[-1] == "embeddings" or tf_name[-1] == "gamma": + tf_name[-1] = "weight" + if tf_name[-1] == "beta": + tf_name[-1] = "bias" + + # The SeparableConv1D TF layer contains two weights that are translated to PyTorch Conv1D here + if tf_name[-1] == "pointwise_kernel" or tf_name[-1] == "depthwise_kernel": + tf_name[-1] = tf_name[-1].replace("_kernel", ".weight") + + # Remove prefix if needed + tf_name = ".".join(tf_name) + if start_prefix_to_remove: + tf_name = tf_name.replace(start_prefix_to_remove, "", 1) + + return tf_name, transpose + + +def apply_transpose(transpose: TransposeType, weight, match_shape=None, pt_to_tf=True): + """ + Apply a transpose to some weight then tries to reshape the weight to the same shape as a given shape, all in a + framework agnostic way. + """ + if transpose is TransposeType.CONV2D: + # Conv2D weight: + # PT: (num_out_channel, num_in_channel, kernel[0], kernel[1]) + # -> TF: (kernel[0], kernel[1], num_in_channel, num_out_channel) + axes = (2, 3, 1, 0) if pt_to_tf else (3, 2, 0, 1) + weight = transpose_func(weight, axes=axes) + elif transpose is TransposeType.CONV1D: + # Conv1D weight: + # PT: (num_out_channel, num_in_channel, kernel) + # -> TF: (kernel, num_in_channel, num_out_channel) + weight = transpose_func(weight, axes=(2, 1, 0)) + elif transpose is TransposeType.SIMPLE: + weight = transpose_func(weight) + + if match_shape is None: + return weight + + if len(match_shape) < len(weight.shape): + weight = squeeze(weight) + elif len(match_shape) > len(weight.shape): + weight = expand_dims(weight, axis=0) + + if list(match_shape) != list(weight.shape): + try: + weight = reshape(weight, match_shape) + except AssertionError as e: + e.args += (match_shape, match_shape) + raise e + + return weight + + +##################### +# PyTorch => TF 2.0 # +##################### + + +def load_pytorch_checkpoint_in_tf2_model( + tf_model, + pytorch_checkpoint_path, + tf_inputs=None, + allow_missing_keys=False, + output_loading_info=False, + _prefix=None, + tf_to_pt_weight_rename=None, +): + """Load pytorch checkpoints in a TF 2.0 model""" + try: + import tensorflow as tf # noqa: F401 + import torch # noqa: F401 + from safetensors.torch import load_file as safe_load_file # noqa: F401 + except ImportError: + logger.error( + "Loading a PyTorch model in TensorFlow, requires both PyTorch and TensorFlow to be installed. Please see " + "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions." + ) + raise + + # Treats a single file as a collection of shards with 1 shard. + if isinstance(pytorch_checkpoint_path, str): + pytorch_checkpoint_path = [pytorch_checkpoint_path] + + # Loads all shards into a single state dictionary + pt_state_dict = {} + for path in pytorch_checkpoint_path: + pt_path = os.path.abspath(path) + logger.info(f"Loading PyTorch weights from {pt_path}") + if pt_path.endswith(".safetensors"): + state_dict = safe_load_file(pt_path) + else: + state_dict = torch.load(pt_path, map_location="cpu", weights_only=True) + + pt_state_dict.update(state_dict) + + logger.info(f"PyTorch checkpoint contains {sum(t.numel() for t in pt_state_dict.values()):,} parameters") + + return load_pytorch_weights_in_tf2_model( + tf_model, + pt_state_dict, + tf_inputs=tf_inputs, + allow_missing_keys=allow_missing_keys, + output_loading_info=output_loading_info, + _prefix=_prefix, + tf_to_pt_weight_rename=tf_to_pt_weight_rename, + ) + + +def load_pytorch_model_in_tf2_model(tf_model, pt_model, tf_inputs=None, allow_missing_keys=False): + """Load pytorch checkpoints in a TF 2.0 model""" + pt_state_dict = pt_model.state_dict() + + return load_pytorch_weights_in_tf2_model( + tf_model, pt_state_dict, tf_inputs=tf_inputs, allow_missing_keys=allow_missing_keys + ) + + +def load_pytorch_weights_in_tf2_model( + tf_model, + pt_state_dict, + tf_inputs=None, + allow_missing_keys=False, + output_loading_info=False, + _prefix=None, + tf_to_pt_weight_rename=None, +): + """Load pytorch state_dict in a TF 2.0 model.""" + try: + import tensorflow as tf # noqa: F401 + import torch # noqa: F401 + except ImportError: + logger.error( + "Loading a PyTorch model in TensorFlow, requires both PyTorch and TensorFlow to be installed. Please see " + "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions." + ) + raise + + # Numpy doesn't understand bfloat16, so upcast to a dtype that doesn't lose precision + pt_state_dict = { + k: v.numpy() if v.dtype != torch.bfloat16 else v.float().numpy() for k, v in pt_state_dict.items() + } + return load_pytorch_state_dict_in_tf2_model( + tf_model, + pt_state_dict, + tf_inputs=tf_inputs, + allow_missing_keys=allow_missing_keys, + output_loading_info=output_loading_info, + _prefix=_prefix, + tf_to_pt_weight_rename=tf_to_pt_weight_rename, + ) + + +def _log_key_warnings(missing_keys, unexpected_keys, mismatched_keys, class_name): + if len(unexpected_keys) > 0: + logger.warning( + "Some weights of the PyTorch model were not used when initializing the TF 2.0 model" + f" {class_name}: {unexpected_keys}\n- This IS expected if you are initializing" + f" {class_name} from a PyTorch model trained on another task or with another architecture" + " (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).\n- This IS" + f" NOT expected if you are initializing {class_name} from a PyTorch model that you expect" + " to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a" + " BertForSequenceClassification model)." + ) + else: + logger.warning(f"All PyTorch model weights were used when initializing {class_name}.\n") + if len(missing_keys) > 0: + logger.warning( + f"Some weights or buffers of the TF 2.0 model {class_name} were not initialized from the" + f" PyTorch model and are newly initialized: {missing_keys}\nYou should probably TRAIN this model on a" + " down-stream task to be able to use it for predictions and inference." + ) + else: + logger.warning( + f"All the weights of {class_name} were initialized from the PyTorch model.\n" + "If your task is similar to the task the model of the checkpoint was trained on, " + f"you can already use {class_name} for predictions without further training." + ) + + if len(mismatched_keys) > 0: + mismatched_warning = "\n".join( + [ + f"- {key}: found shape {shape1} in the checkpoint and {shape2} in the model instantiated" + for key, shape1, shape2 in mismatched_keys + ] + ) + logger.warning( + f"Some weights of {class_name} were not initialized from the model checkpoint" + f" are newly initialized because the shapes did not" + f" match:\n{mismatched_warning}\nYou should probably TRAIN this model on a down-stream task to be able" + " to use it for predictions and inference." + ) + + +def load_pytorch_state_dict_in_tf2_model( + tf_model, + pt_state_dict, + tf_inputs=None, + allow_missing_keys=False, + output_loading_info=False, + _prefix=None, + tf_to_pt_weight_rename=None, + ignore_mismatched_sizes=False, + skip_logger_warnings=False, +): + """Load a pytorch state_dict in a TF 2.0 model. pt_state_dict can be either an actual dict or a lazy-loading + safetensors archive created with the safe_open() function.""" + import tensorflow as tf + + if tf_inputs is None: + tf_inputs = tf_model.dummy_inputs + + if _prefix is None: + _prefix = "" + if tf_inputs: + with tf.name_scope(_prefix): + tf_model(tf_inputs, training=False) # Make sure model is built + # Convert old format to new format if needed from a PyTorch state_dict + tf_keys_to_pt_keys = {} + for key in pt_state_dict.keys(): + new_key = None + if "gamma" in key: + new_key = key.replace("gamma", "weight") + if "beta" in key: + new_key = key.replace("beta", "bias") + if "running_var" in key: + new_key = key.replace("running_var", "moving_variance") + if "running_mean" in key: + new_key = key.replace("running_mean", "moving_mean") + + # New `weight_norm` from https://github.com/huggingface/transformers/pull/24030 + key_components = key.split(".") + name = None + if key_components[-3::2] == ["parametrizations", "original0"]: + name = key_components[-2] + "_g" + elif key_components[-3::2] == ["parametrizations", "original1"]: + name = key_components[-2] + "_v" + if name is not None: + key_components = key_components[:-3] + [name] + new_key = ".".join(key_components) + + if new_key is None: + new_key = key + tf_keys_to_pt_keys[new_key] = key + + # Matt: All TF models store the actual model stem in a MainLayer class, including the base model. + # In PT, the derived models (with heads) use the base model class as the stem instead, + # and there is no MainLayer class. This means that TF base classes have one + # extra layer in their weight names, corresponding to the MainLayer class. This code block compensates for that. + start_prefix_to_remove = "" + if not any(s.startswith(tf_model.base_model_prefix) for s in tf_keys_to_pt_keys.keys()): + start_prefix_to_remove = tf_model.base_model_prefix + "." + + symbolic_weights = tf_model.trainable_weights + tf_model.non_trainable_weights + tf_loaded_numel = 0 + all_pytorch_weights = set(tf_keys_to_pt_keys.keys()) + missing_keys = [] + mismatched_keys = [] + is_safetensor_archive = hasattr(pt_state_dict, "get_tensor") + for symbolic_weight in symbolic_weights: + sw_name = symbolic_weight.name + name, transpose = convert_tf_weight_name_to_pt_weight_name( + sw_name, + start_prefix_to_remove=start_prefix_to_remove, + tf_weight_shape=symbolic_weight.shape, + name_scope=_prefix, + ) + if tf_to_pt_weight_rename is not None: + aliases = tf_to_pt_weight_rename(name) # Is a tuple to account for possible name aliasing + for alias in aliases: # The aliases are in priority order, take the first one that matches + if alias in tf_keys_to_pt_keys: + name = alias + break + else: + # If none of the aliases match, just use the first one (it'll be reported as missing) + name = aliases[0] + + # Find associated numpy array in pytorch model state dict + if name not in tf_keys_to_pt_keys: + if allow_missing_keys: + missing_keys.append(name) + continue + elif tf_model._keys_to_ignore_on_load_missing is not None: + # authorized missing keys don't have to be loaded + if any(re.search(pat, name) is not None for pat in tf_model._keys_to_ignore_on_load_missing): + continue + raise AttributeError(f"{name} not found in PyTorch model") + state_dict_name = tf_keys_to_pt_keys[name] + if is_safetensor_archive: + array = pt_state_dict.get_tensor(state_dict_name) + else: + array = pt_state_dict[state_dict_name] + try: + array = apply_transpose(transpose, array, symbolic_weight.shape) + except tf.errors.InvalidArgumentError as e: + if not ignore_mismatched_sizes: + error_msg = str(e) + error_msg += ( + "\n\tYou may consider adding `ignore_mismatched_sizes=True` in the model `from_pretrained` method." + ) + raise tf.errors.InvalidArgumentError(error_msg) + else: + mismatched_keys.append((name, array.shape, symbolic_weight.shape)) + continue + + tf_loaded_numel += tensor_size(array) + + symbolic_weight.assign(tf.cast(array, symbolic_weight.dtype)) + del array # Immediately free memory to keep peak usage as low as possible + all_pytorch_weights.discard(name) + + logger.info(f"Loaded {tf_loaded_numel:,} parameters in the TF 2.0 model.") + + unexpected_keys = list(all_pytorch_weights) + + if tf_model._keys_to_ignore_on_load_missing is not None: + for pat in tf_model._keys_to_ignore_on_load_missing: + missing_keys = [k for k in missing_keys if re.search(pat, k) is None] + if tf_model._keys_to_ignore_on_load_unexpected is not None: + for pat in tf_model._keys_to_ignore_on_load_unexpected: + unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None] + if not skip_logger_warnings: + _log_key_warnings(missing_keys, unexpected_keys, mismatched_keys, class_name=tf_model.__class__.__name__) + + if output_loading_info: + loading_info = { + "missing_keys": missing_keys, + "unexpected_keys": unexpected_keys, + "mismatched_keys": mismatched_keys, + } + return tf_model, loading_info + + return tf_model + + +def load_sharded_pytorch_safetensors_in_tf2_model( + tf_model, + safetensors_shards, + tf_inputs=None, + allow_missing_keys=False, + output_loading_info=False, + _prefix=None, + tf_to_pt_weight_rename=None, + ignore_mismatched_sizes=False, +): + all_loading_infos = [] + for shard in safetensors_shards: + with safe_open(shard, framework="tf") as safetensors_archive: + tf_model, loading_info = load_pytorch_state_dict_in_tf2_model( + tf_model, + safetensors_archive, + tf_inputs=tf_inputs, + allow_missing_keys=allow_missing_keys, + output_loading_info=True, + _prefix=_prefix, + tf_to_pt_weight_rename=tf_to_pt_weight_rename, + ignore_mismatched_sizes=ignore_mismatched_sizes, + skip_logger_warnings=True, # We will emit merged warnings at the end + ) + all_loading_infos.append(loading_info) + # Now we just need to merge the loading info + # Keys are missing only if they're missing in *every* shard + missing_keys = sorted(set.intersection(*[set(info["missing_keys"]) for info in all_loading_infos])) + # Keys are unexpected/mismatched if they're unexpected/mismatched in *any* shard + unexpected_keys = sum([info["unexpected_keys"] for info in all_loading_infos], []) + mismatched_keys = sum([info["mismatched_keys"] for info in all_loading_infos], []) + + _log_key_warnings(missing_keys, unexpected_keys, mismatched_keys, class_name=tf_model.__class__.__name__) + + if output_loading_info: + loading_info = { + "missing_keys": missing_keys, + "unexpected_keys": unexpected_keys, + "mismatched_keys": mismatched_keys, + } + return tf_model, loading_info + + return tf_model + + +##################### +# TF 2.0 => PyTorch # +##################### + + +def load_tf2_checkpoint_in_pytorch_model( + pt_model, tf_checkpoint_path, tf_inputs=None, allow_missing_keys=False, output_loading_info=False +): + """ + Load TF 2.0 HDF5 checkpoint in a PyTorch model We use HDF5 to easily do transfer learning (see + https://github.com/tensorflow/tensorflow/blob/ee16fcac960ae660e0e4496658a366e2f745e1f0/tensorflow/python/keras/engine/network.py#L1352-L1357). + """ + try: + import tensorflow as tf # noqa: F401 + import torch # noqa: F401 + except ImportError: + logger.error( + "Loading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed. Please see " + "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions." + ) + raise + + import transformers + + from .modeling_tf_utils import load_tf_weights + + logger.info(f"Loading TensorFlow weights from {tf_checkpoint_path}") + + # Instantiate and load the associated TF 2.0 model + tf_model_class_name = "TF" + pt_model.__class__.__name__ # Add "TF" at the beginning + tf_model_class = getattr(transformers, tf_model_class_name) + tf_model = tf_model_class(pt_model.config) + + if tf_inputs is None: + tf_inputs = tf_model.dummy_inputs + + if tf_inputs is not None: + tf_model(tf_inputs, training=False) # Make sure model is built + + load_tf_weights(tf_model, tf_checkpoint_path) + + return load_tf2_model_in_pytorch_model( + pt_model, tf_model, allow_missing_keys=allow_missing_keys, output_loading_info=output_loading_info + ) + + +def load_tf2_model_in_pytorch_model(pt_model, tf_model, allow_missing_keys=False, output_loading_info=False): + """Load TF 2.0 model in a pytorch model""" + weights = tf_model.weights + + return load_tf2_weights_in_pytorch_model( + pt_model, weights, allow_missing_keys=allow_missing_keys, output_loading_info=output_loading_info + ) + + +def load_tf2_weights_in_pytorch_model(pt_model, tf_weights, allow_missing_keys=False, output_loading_info=False): + """Load TF2.0 symbolic weights in a PyTorch model""" + try: + import tensorflow as tf # noqa: F401 + import torch # noqa: F401 + except ImportError: + logger.error( + "Loading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed. Please see " + "https://pytorch.org/ and https://www.tensorflow.org/install/ for installation instructions." + ) + raise + + tf_state_dict = {tf_weight.name: tf_weight.numpy() for tf_weight in tf_weights} + return load_tf2_state_dict_in_pytorch_model( + pt_model, tf_state_dict, allow_missing_keys=allow_missing_keys, output_loading_info=output_loading_info + ) + + +def load_tf2_state_dict_in_pytorch_model(pt_model, tf_state_dict, allow_missing_keys=False, output_loading_info=False): + import torch + + new_pt_params_dict = {} + current_pt_params_dict = dict(pt_model.named_parameters()) + + # Make sure we are able to load PyTorch base models as well as derived models (with heads) + # TF models always have a prefix, some of PyTorch models (base ones) don't + start_prefix_to_remove = "" + if not any(s.startswith(pt_model.base_model_prefix) for s in current_pt_params_dict.keys()): + start_prefix_to_remove = pt_model.base_model_prefix + "." + + # Build a map from potential PyTorch weight names to TF 2.0 Variables + tf_weights_map = {} + for name, tf_weight in tf_state_dict.items(): + pt_name, transpose = convert_tf_weight_name_to_pt_weight_name( + name, start_prefix_to_remove=start_prefix_to_remove, tf_weight_shape=tf_weight.shape + ) + tf_weights_map[pt_name] = (tf_weight, transpose) + + all_tf_weights = set(tf_weights_map.keys()) + loaded_pt_weights_data_ptr = {} + missing_keys_pt = [] + for pt_weight_name, pt_weight in current_pt_params_dict.items(): + # Handle PyTorch shared weight ()not duplicated in TF 2.0 + if pt_weight.data_ptr() in loaded_pt_weights_data_ptr: + new_pt_params_dict[pt_weight_name] = loaded_pt_weights_data_ptr[pt_weight.data_ptr()] + continue + + pt_weight_name_to_check = pt_weight_name + # New `weight_norm` from https://github.com/huggingface/transformers/pull/24030 + key_components = pt_weight_name.split(".") + name = None + if key_components[-3::2] == ["parametrizations", "original0"]: + name = key_components[-2] + "_g" + elif key_components[-3::2] == ["parametrizations", "original1"]: + name = key_components[-2] + "_v" + if name is not None: + key_components = key_components[:-3] + [name] + pt_weight_name_to_check = ".".join(key_components) + + # Find associated numpy array in pytorch model state dict + if pt_weight_name_to_check not in tf_weights_map: + if allow_missing_keys: + missing_keys_pt.append(pt_weight_name) + continue + + raise AttributeError(f"{pt_weight_name} not found in TF 2.0 model") + + array, transpose = tf_weights_map[pt_weight_name_to_check] + + array = apply_transpose(transpose, array, pt_weight.shape, pt_to_tf=False) + + if numpy.isscalar(array): + array = numpy.array(array) + if not is_torch_tensor(array) and not is_numpy_array(array): + array = array.numpy() + if is_numpy_array(array): + # Convert to torch tensor + array = torch.from_numpy(array) + + new_pt_params_dict[pt_weight_name] = array + loaded_pt_weights_data_ptr[pt_weight.data_ptr()] = array + all_tf_weights.discard(pt_weight_name) + + missing_keys, unexpected_keys = pt_model.load_state_dict(new_pt_params_dict, strict=False) + missing_keys += missing_keys_pt + + # Some models may have keys that are not in the state by design, removing them before needlessly warning + # the user. + if pt_model._keys_to_ignore_on_load_missing is not None: + for pat in pt_model._keys_to_ignore_on_load_missing: + missing_keys = [k for k in missing_keys if re.search(pat, k) is None] + + if pt_model._keys_to_ignore_on_load_unexpected is not None: + for pat in pt_model._keys_to_ignore_on_load_unexpected: + unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None] + + if len(unexpected_keys) > 0: + logger.warning( + "Some weights of the TF 2.0 model were not used when initializing the PyTorch model" + f" {pt_model.__class__.__name__}: {unexpected_keys}\n- This IS expected if you are initializing" + f" {pt_model.__class__.__name__} from a TF 2.0 model trained on another task or with another architecture" + " (e.g. initializing a BertForSequenceClassification model from a TFBertForPreTraining model).\n- This IS" + f" NOT expected if you are initializing {pt_model.__class__.__name__} from a TF 2.0 model that you expect" + " to be exactly identical (e.g. initializing a BertForSequenceClassification model from a" + " TFBertForSequenceClassification model)." + ) + else: + logger.warning(f"All TF 2.0 model weights were used when initializing {pt_model.__class__.__name__}.\n") + if len(missing_keys) > 0: + logger.warning( + f"Some weights of {pt_model.__class__.__name__} were not initialized from the TF 2.0 model and are newly" + f" initialized: {missing_keys}\nYou should probably TRAIN this model on a down-stream task to be able to" + " use it for predictions and inference." + ) + else: + logger.warning( + f"All the weights of {pt_model.__class__.__name__} were initialized from the TF 2.0 model.\n" + "If your task is similar to the task the model of the checkpoint was trained on, " + f"you can already use {pt_model.__class__.__name__} for predictions without further training." + ) + + logger.info(f"Weights or buffers not loaded from TF 2.0 model: {all_tf_weights}") + + if output_loading_info: + loading_info = {"missing_keys": missing_keys, "unexpected_keys": unexpected_keys} + return pt_model, loading_info + + return pt_model diff --git a/docs/transformers/build/lib/transformers/modeling_tf_utils.py b/docs/transformers/build/lib/transformers/modeling_tf_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..c0add5bb6672bf77ca0fca34a5bc194ab80f89f5 --- /dev/null +++ b/docs/transformers/build/lib/transformers/modeling_tf_utils.py @@ -0,0 +1,3535 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""TF general model utils.""" + +from __future__ import annotations + +import functools +import gc +import inspect +import json +import os +import pickle +import re +import warnings +from collections.abc import Mapping +from pathlib import Path +from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union + +import h5py +import numpy as np +import tensorflow as tf +from packaging.version import parse + +from . import DataCollatorWithPadding, DefaultDataCollator +from .activations_tf import get_tf_activation +from .configuration_utils import PretrainedConfig +from .dynamic_module_utils import custom_object_save +from .generation import GenerationConfig, TFGenerationMixin +from .tf_utils import ( + convert_batch_encoding, + expand_1d, + load_attributes_from_hdf5_group, + save_attributes_to_hdf5_group, + shape_list, +) +from .utils import ( + SAFE_WEIGHTS_INDEX_NAME, + SAFE_WEIGHTS_NAME, + TF2_WEIGHTS_INDEX_NAME, + TF2_WEIGHTS_NAME, + TF_WEIGHTS_NAME, + WEIGHTS_INDEX_NAME, + WEIGHTS_NAME, + ModelOutput, + PushToHubMixin, + cached_file, + download_url, + find_labels, + has_file, + is_offline_mode, + is_remote_url, + is_safetensors_available, + is_tf_symbolic_tensor, + logging, + requires_backends, + working_or_temp_dir, +) +from .utils.hub import convert_file_size_to_int, get_checkpoint_shard_files + + +if is_safetensors_available(): + from safetensors import safe_open + from safetensors.tensorflow import save_file as safe_save_file + +if TYPE_CHECKING: + from . import PreTrainedTokenizerBase + +logger = logging.get_logger(__name__) + +if "TF_USE_LEGACY_KERAS" not in os.environ: + os.environ["TF_USE_LEGACY_KERAS"] = "1" # Compatibility fix to make sure tf.keras stays at Keras 2 +elif os.environ["TF_USE_LEGACY_KERAS"] != "1": + logger.warning( + "Transformers is only compatible with Keras 2, but you have explicitly set `TF_USE_LEGACY_KERAS` to `0`. " + "This may result in unexpected behaviour or errors if Keras 3 objects are passed to Transformers models." + ) + +try: + import tf_keras as keras + from tf_keras import backend as K +except (ModuleNotFoundError, ImportError): + import keras + from keras import backend as K + + if parse(keras.__version__).major > 2: + raise ValueError( + "Your currently installed version of Keras is Keras 3, but this is not yet supported in " + "Transformers. Please install the backwards-compatible tf-keras package with " + "`pip install tf-keras`." + ) + + +tf_logger = tf.get_logger() + +TFModelInputType = Union[ + List[tf.Tensor], + List[np.ndarray], + Dict[str, tf.Tensor], + Dict[str, np.ndarray], + tf.Tensor, + np.ndarray, +] + + +def dummy_loss(y_true, y_pred): + if y_pred.shape.rank <= 1: + return y_pred + else: + reduction_axes = list(range(1, y_pred.shape.rank)) + return tf.reduce_mean(y_pred, axis=reduction_axes) + + +class TFModelUtilsMixin: + """ + A few utilities for `keras.Model`, to be used as a mixin. + """ + + def num_parameters(self, only_trainable: bool = False) -> int: + """ + Get the number of (optionally, trainable) parameters in the model. + + Args: + only_trainable (`bool`, *optional*, defaults to `False`): + Whether or not to return only the number of trainable parameters + + Returns: + `int`: The number of parameters. + """ + if only_trainable: + return int(sum(np.prod(w.shape.as_list()) for w in self.trainable_variables)) + else: + return self.count_params() + + +def keras_serializable(cls): + """ + Decorate a Keras Layer class to support Keras serialization. + + This is done by: + + 1. Adding a `transformers_config` dict to the Keras config dictionary in `get_config` (called by Keras at + serialization time. + 2. Wrapping `__init__` to accept that `transformers_config` dict (passed by Keras at deserialization time) and + convert it to a config object for the actual layer initializer. + 3. Registering the class as a custom object in Keras (if the Tensorflow version supports this), so that it does not + need to be supplied in `custom_objects` in the call to `keras.models.load_model`. + + Args: + cls (a `keras.layers.Layers subclass`): + Typically a `TF.MainLayer` class in this project, in general must accept a `config` argument to its + initializer. + + Returns: + The same class object, with modifications for Keras deserialization. + """ + initializer = cls.__init__ + + config_class = getattr(cls, "config_class", None) + if config_class is None: + raise AttributeError("Must set `config_class` to use @keras_serializable") + + @functools.wraps(initializer) + def wrapped_init(self, *args, **kwargs): + config = args[0] if args and isinstance(args[0], PretrainedConfig) else kwargs.pop("config", None) + + if isinstance(config, dict): + config = config_class.from_dict(config) + initializer(self, config, *args, **kwargs) + elif isinstance(config, PretrainedConfig): + if len(args) > 0: + initializer(self, *args, **kwargs) + else: + initializer(self, config, *args, **kwargs) + else: + raise ValueError("Must pass either `config` (PretrainedConfig) or `config` (dict)") + + self._config = config + self._kwargs = kwargs + + cls.__init__ = wrapped_init + + if not hasattr(cls, "get_config"): + raise TypeError("Only use @keras_serializable on keras.layers.Layer subclasses") + if hasattr(cls.get_config, "_is_default"): + + def get_config(self): + cfg = super(cls, self).get_config() + cfg["config"] = self._config.to_dict() + cfg.update(self._kwargs) + return cfg + + cls.get_config = get_config + + cls._keras_serializable = True + if hasattr(keras.utils, "register_keras_serializable"): + cls = keras.utils.register_keras_serializable()(cls) + return cls + + +class TFCausalLanguageModelingLoss: + """ + Loss function suitable for causal language modeling (CLM), that is, the task of guessing the next token. + + + + Any label of -100 will be ignored (along with the corresponding logits) in the loss computation. + + + """ + + def hf_compute_loss(self, labels, logits): + loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=keras.losses.Reduction.NONE) + if self.config.tf_legacy_loss: + # make sure only labels that are not equal to -100 affect the loss + active_loss = tf.not_equal(tf.reshape(labels, (-1,)), -100) + reduced_logits = tf.boolean_mask(tf.reshape(logits, (-1, shape_list(logits)[2])), active_loss) + labels = tf.boolean_mask(tf.reshape(labels, (-1,)), active_loss) + return loss_fn(labels, reduced_logits) + + # Clip negative labels to zero here to avoid NaNs and errors - those positions will get masked later anyway + unmasked_loss = loss_fn(tf.nn.relu(labels), logits) + # make sure only labels that are not equal to -100 affect the loss + loss_mask = tf.cast(labels != -100, dtype=unmasked_loss.dtype) + masked_loss = unmasked_loss * loss_mask + reduced_masked_loss = tf.reduce_sum(masked_loss) / tf.reduce_sum(loss_mask) + return tf.reshape(reduced_masked_loss, (1,)) + + +class TFQuestionAnsweringLoss: + """ + Loss function suitable for question answering. + """ + + def hf_compute_loss(self, labels, logits): + loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=keras.losses.Reduction.NONE) + start_loss = loss_fn(labels["start_position"], logits[0]) + end_loss = loss_fn(labels["end_position"], logits[1]) + + return (start_loss + end_loss) / 2.0 + + +class TFTokenClassificationLoss: + """ + Loss function suitable for token classification. + + + + Any label of -100 will be ignored (along with the corresponding logits) in the loss computation. + + + """ + + def hf_compute_loss(self, labels, logits): + loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=keras.losses.Reduction.NONE) + if tf.executing_eagerly(): # Data-dependent conditionals are forbidden in XLA + if tf.math.reduce_any(labels == -1): + tf.print("Using `-1` to mask the loss for the token is deprecated. Please use `-100` instead.") + + if self.config.tf_legacy_loss: + # make sure only labels that are not equal to -100 + # are taken into account as loss + if tf.math.reduce_any(labels == -1): + tf.print("Using `-1` to mask the loss for the token is deprecated. Please use `-100` instead.") + active_loss = tf.reshape(labels, (-1,)) != -1 + else: + active_loss = tf.reshape(labels, (-1,)) != -100 + reduced_logits = tf.boolean_mask(tf.reshape(logits, (-1, shape_list(logits)[2])), active_loss) + labels = tf.boolean_mask(tf.reshape(labels, (-1,)), active_loss) + + return loss_fn(labels, reduced_logits) + + # Clip negative labels to zero here to avoid NaNs and errors - those positions will get masked later anyway + unmasked_loss = loss_fn(tf.nn.relu(labels), logits) + # make sure only labels that are not equal to -100 or -1 + # are taken into account as loss + loss_mask = tf.cast(labels >= 0, dtype=unmasked_loss.dtype) + # Avoid possible division by zero later + # Masked positions will have a loss of NaN because -100 and -1 are not valid labels + masked_loss = unmasked_loss * loss_mask + reduced_masked_loss = tf.reduce_sum(masked_loss) / tf.reduce_sum(loss_mask) + return tf.reshape(reduced_masked_loss, (1,)) + + +class TFSequenceClassificationLoss: + """ + Loss function suitable for sequence classification. + """ + + def hf_compute_loss(self, labels, logits): + if logits.shape.rank == 1 or logits.shape[1] == 1: + loss_fn = keras.losses.MeanSquaredError(reduction=keras.losses.Reduction.NONE) + if labels.shape.rank == 1: + # MeanSquaredError returns a scalar loss if the labels are 1D, so avoid that + labels = tf.expand_dims(labels, axis=-1) + else: + loss_fn = keras.losses.SparseCategoricalCrossentropy( + from_logits=True, reduction=keras.losses.Reduction.NONE + ) + + return loss_fn(labels, logits) + + +class TFMultipleChoiceLoss: + """Loss function suitable for multiple choice tasks.""" + + def hf_compute_loss(self, labels, logits): + loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=keras.losses.Reduction.NONE) + return loss_fn(labels, logits) + + +class TFMaskedLanguageModelingLoss(TFCausalLanguageModelingLoss): + """ + Loss function suitable for masked language modeling (MLM), that is, the task of guessing the masked tokens. + + + + Any label of -100 will be ignored (along with the corresponding logits) in the loss computation. + + + """ + + +class TFNextSentencePredictionLoss: + """ + Loss function suitable for next sentence prediction (NSP), that is, the task of guessing the next sentence. + + + + Any label of -100 will be ignored (along with the corresponding logits) in the loss computation. + + + """ + + def hf_compute_loss(self, labels, logits): + loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=keras.losses.Reduction.NONE) + if self.config.tf_legacy_loss: + # make sure only labels that are not equal to -100 + # are taken into account as loss + next_sentence_active_loss = tf.not_equal(tf.reshape(labels, (-1,)), -100) + next_sentence_reduced_logits = tf.boolean_mask(tf.reshape(logits, (-1, 2)), next_sentence_active_loss) + next_sentence_label = tf.boolean_mask(tf.reshape(labels, (-1,)), next_sentence_active_loss) + + return loss_fn(next_sentence_label, next_sentence_reduced_logits) + + # make sure only labels that are not equal to -100 + # are taken into account as loss + + # Clip negative labels to zero here to avoid NaNs and errors - those positions will get masked later anyway + unmasked_ns_loss = loss_fn(y_true=tf.nn.relu(labels), y_pred=logits) + ns_loss_mask = tf.cast(labels != -100, dtype=unmasked_ns_loss.dtype) + # Just zero out samples where label is -100, no reduction + masked_ns_loss = unmasked_ns_loss * ns_loss_mask + + return masked_ns_loss + + +def booleans_processing(config, **kwargs): + """ + Process the input booleans of each model. + + Args: + config ([`PretrainedConfig`]): + The config of the running model. + **kwargs: + The boolean parameters + + Returns: + A dictionary with the proper values for each boolean + """ + final_booleans = {} + + # Pure conv models (such as ConvNext) do not have `output_attentions`. If the signature has + # `output_attentions`, it will be present here in `kwargs`, even if unset (in that case, as `None`) + if "output_attentions" in kwargs: + final_booleans["output_attentions"] = ( + kwargs["output_attentions"] if kwargs["output_attentions"] is not None else config.output_attentions + ) + final_booleans["output_hidden_states"] = ( + kwargs["output_hidden_states"] if kwargs["output_hidden_states"] is not None else config.output_hidden_states + ) + final_booleans["return_dict"] = kwargs["return_dict"] if kwargs["return_dict"] is not None else config.return_dict + + if "use_cache" in kwargs: + final_booleans["use_cache"] = ( + kwargs["use_cache"] if kwargs["use_cache"] is not None else getattr(config, "use_cache", None) + ) + return final_booleans + + +def unpack_inputs(func): + """ + Decorator that processes the inputs to a Keras layer, passing them to the layer as keyword arguments. This enables + downstream use of the inputs by their variable name, even if they arrive packed as a dictionary in the first input + (common case in Keras). + + Args: + func (`callable`): + The callable function of the TensorFlow model. + + + Returns: + A callable that wraps the original `func` with the behavior described above. + """ + + original_signature = inspect.signature(func) + + @functools.wraps(func) + def run_call_with_unpacked_inputs(self, *args, **kwargs): + # isolates the actual `**kwargs` for the decorated function + kwargs_call = {key: val for key, val in kwargs.items() if key not in dict(original_signature.parameters)} + fn_args_and_kwargs = {key: val for key, val in kwargs.items() if key not in kwargs_call} + fn_args_and_kwargs.update({"kwargs_call": kwargs_call}) + + # move any arg into kwargs, if they exist + fn_args_and_kwargs.update(dict(zip(func.__code__.co_varnames[1:], args))) + + # Encoder Decoder models delegate the application of the configuration options to their inner models. + if "EncoderDecoder" in self.__class__.__name__: + config = None + else: + config = self.config + + unpacked_inputs = input_processing(func, config, **fn_args_and_kwargs) + return func(self, **unpacked_inputs) + + # Keras enforces the first layer argument to be passed, and checks it through `inspect.getfullargspec()`. This + # function does not follow wrapper chains (i.e. ignores `functools.wraps()`), meaning that without the line below + # Keras would attempt to check the first argument against the literal signature of the wrapper. + run_call_with_unpacked_inputs.__signature__ = original_signature + + return run_call_with_unpacked_inputs + + +def input_processing(func, config, **kwargs): + """ + Process the input of each TensorFlow model including the booleans. In case of a list of symbolic inputs, each input + has to be named accordingly to the parameters name, i.e. `input_ids = keras.Input(shape=(128,), dtype='int32', + name="input_ids")` otherwise the order of the tensors will not be guaranteed during the training. + + Args: + func (`callable`): + The callable function of the TensorFlow model. + config ([`PretrainedConfig`]): + The config of the running model. + **kwargs: + The inputs of the model. + + Returns: + Two lists, one for the missing layers, and another one for the unexpected layers. + """ + signature = dict(inspect.signature(func).parameters) + has_kwargs = bool(signature.pop("kwargs", None)) + signature.pop("self", None) + parameter_names = list(signature.keys()) + main_input_name = parameter_names[0] + main_input = kwargs.pop(main_input_name, None) + output = {} + allowed_types = (tf.Tensor, bool, int, ModelOutput, tuple, list, dict, np.ndarray) + + if "inputs" in kwargs["kwargs_call"]: + warnings.warn( + "The `inputs` argument is deprecated and will be removed in a future version, use `input_ids` instead.", + FutureWarning, + ) + + output["input_ids"] = kwargs["kwargs_call"].pop("inputs") + + if "decoder_cached_states" in kwargs["kwargs_call"]: + warnings.warn( + "The `decoder_cached_states` argument is deprecated and will be removed in a future version, use" + " `past_key_values` instead.", + FutureWarning, + ) + output["past_key_values"] = kwargs["kwargs_call"].pop("decoder_cached_states") + + if "past" in kwargs["kwargs_call"] and "past_key_values" in parameter_names: + warnings.warn( + "The `past` argument is deprecated and will be removed in a future version, use `past_key_values`" + " instead.", + FutureWarning, + ) + kwargs["past_key_values"] = kwargs["kwargs_call"].pop("past") + elif "past_key_values" in kwargs["kwargs_call"] and "past" in parameter_names: + kwargs["past"] = kwargs["kwargs_call"].pop("past_key_values") + + if has_kwargs: + output["kwargs"] = kwargs.pop("kwargs_call", {}) + else: + if len(kwargs["kwargs_call"]) > 0: + raise ValueError( + "The following keyword arguments are not supported by this model:" + f" {list(kwargs['kwargs_call'].keys())}." + ) + kwargs.pop("kwargs_call") + + for k, v in kwargs.items(): + if isinstance(v, allowed_types) or tf.is_tensor(v) or v is None: + output[k] = v + else: + raise ValueError(f"Data of type {type(v)} is not allowed only {allowed_types} is accepted for {k}.") + + if isinstance(main_input, (tuple, list)): + for i, input in enumerate(main_input): + # EagerTensors don't allow to use the .name property so we check for a real Tensor + if is_tf_symbolic_tensor(input): + # Tensor names have always the pattern `name:id` then we check only the + # `name` part + tensor_name = input.name.split(":")[0] + + if tensor_name in parameter_names: + output[tensor_name] = input + else: + output[parameter_names[i]] = input + elif isinstance(input, allowed_types) or input is None: + output[parameter_names[i]] = input + else: + raise ValueError( + f"Data of type {type(input)} is not allowed only {allowed_types} is accepted for" + f" {parameter_names[i]}." + ) + elif isinstance(main_input, Mapping): + if "inputs" in main_input: + warnings.warn( + "The `inputs` argument is deprecated and will be removed in a future version, use `input_ids`" + " instead.", + FutureWarning, + ) + + output["input_ids"] = main_input.pop("inputs") + + if "decoder_cached_states" in main_input: + warnings.warn( + "The `decoder_cached_states` argument is deprecated and will be removed in a future version, use" + " `past_key_values` instead.", + FutureWarning, + ) + output["past_key_values"] = main_input.pop("decoder_cached_states") + + for k, v in dict(main_input).items(): + if isinstance(v, allowed_types) or v is None: + output[k] = v + elif k not in parameter_names and "args" not in parameter_names: + logger.warning( + f"The parameter {k} does not belongs to the parameter list {parameter_names} and will be ignored." + ) + continue + else: + raise ValueError(f"Data of type {type(v)} is not allowed only {allowed_types} is accepted for {k}.") + else: + if tf.is_tensor(main_input) or main_input is None: + output[main_input_name] = main_input + else: + raise ValueError( + f"Data of type {type(main_input)} is not allowed only {allowed_types} is accepted for" + f" {main_input_name}." + ) + + # Populates any unspecified argument with their default value, according to the signature. + for name in parameter_names: + if name not in list(output.keys()) and name != "args": + output[name] = kwargs.pop(name, signature[name].default) + + # When creating a SavedModel TF calls the method with LayerCall.__call__(args, **kwargs) + # So to respect the proper output we have to add this exception + if "args" in output: + if output["args"] is not None and is_tf_symbolic_tensor(output["args"]): + tensor_name = output["args"].name.split(":")[0] + output[tensor_name] = output["args"] + else: + # `args` in this case is always the first parameter, then `input_ids` + output["input_ids"] = output["args"] + + del output["args"] + + if "kwargs" in output: + del output["kwargs"] + + cast_output = {} + for key, val in output.items(): + if isinstance(val, tf.Tensor) and val.dtype == tf.int64: + cast_output[key] = tf.cast(val, tf.int32) + elif isinstance(val, np.ndarray) and val.dtype == np.int64: + cast_output[key] = val.astype(np.int32) + else: + cast_output[key] = val + + output = cast_output + del cast_output + + if config is not None: + boolean_dict = { + k: v + for k, v in output.items() + if k in ["return_dict", "output_attentions", "output_hidden_states", "use_cache"] + } + + output.update( + booleans_processing( + config=config, + **boolean_dict, + ) + ) + + return output + + +def strip_model_name_and_prefix(name, _prefix=None): + if _prefix is not None and name.startswith(_prefix): + name = name[len(_prefix) :] + if name.startswith("/"): + name = name[1:] + if "model." not in name and len(name.split("/")) > 1: + name = "/".join(name.split("/")[1:]) + return name + + +def tf_shard_checkpoint(weights, max_shard_size="10GB", weights_name: str = TF2_WEIGHTS_NAME): + """ + Splits a model state dictionary in sub-checkpoints so that the final size of each sub-checkpoint does not exceed a + given size. + + The sub-checkpoints are determined by iterating through the `state_dict` in the order of its keys, so there is no + optimization made to make each sub-checkpoint as close as possible to the maximum size passed. For example, if the + limit is 10GB and we have weights of sizes [6GB, 6GB, 2GB, 6GB, 2GB, 2GB] they will get sharded as [6GB], [6+2GB], + [6+2+2GB] and not [6+2+2GB], [6+2GB], [6GB]. + + + + If one of the model's weight is bigger that `max_shard_size`, it will end up in its own sub-checkpoint which will + have a size greater than `max_shard_size`. + + + + Args: + weights (`Dict[str, tf.RessourceVariable]`): The list of tf.RessourceVariable of a model to save. + max_shard_size (`int` or `str`, *optional*, defaults to `"10GB"`): + The maximum size of each sub-checkpoint. If expressed as a string, needs to be digits followed by a unit + (like `"5MB"`). + """ + max_shard_size = convert_file_size_to_int(max_shard_size) + + sharded_state_dicts = [] + current_block = [] + current_block_size = 0 + total_size = 0 + + for item in weights: + weight_size = item.numpy().size * item.dtype.size + + # If this weight is going to tip up over the maximal size, we split. + if current_block_size + weight_size > max_shard_size: + sharded_state_dicts.append(current_block) + current_block = [] + current_block_size = 0 + + current_block.append(item) + current_block_size += weight_size + total_size += weight_size + + # Add the last block + sharded_state_dicts.append(current_block) + + # If we only have one shard, we return it + if len(sharded_state_dicts) == 1: + return {weights_name: sharded_state_dicts[0]}, None + + # Otherwise, let's build the index + weight_map = {} + shards = {} + for idx, shard in enumerate(sharded_state_dicts): + shard_file = weights_name.replace(".h5", f"-{idx + 1:05d}-of-{len(sharded_state_dicts):05d}.h5") + shard_file = shard_file.replace( + ".safetensors", f"-{idx + 1:05d}-of-{len(sharded_state_dicts):05d}.safetensors" + ) + shards[shard_file] = shard + for weight in shard: + weight_name = weight.name + weight_map[weight_name] = shard_file + + # Add the metadata + metadata = {"total_size": total_size} + index = {"metadata": metadata, "weight_map": weight_map} + return shards, index + + +def load_tf_sharded_weights(model, shard_files, ignore_mismatched_sizes=False, strict=False, _prefix=None): + """ + This is the same as `load_tf_weights` but for a sharded checkpoint. Detect missing and unexpected layers and load + the TF weights from the shard file accordingly to their names and shapes. + + This load is performed efficiently: each checkpoint shard is loaded one by one in RAM and deleted after being + loaded in the model. + + Args: + model (`keras.models.Model`): The model in which to load the checkpoint. + shard_files (`str` or `os.PathLike`): A list containing the sharded checkpoint names. + ignore_mismatched_sizes`bool`, *optional`, defaults to `True`): + Whether or not to ignore the mismatch between the sizes + strict (`bool`, *optional*, defaults to `True`): + Whether to strictly enforce that the keys in the model state dict match the keys in the sharded checkpoint. + + Returns: + Three lists, one for the missing layers, another one for the unexpected layers, and a last one for the + mismatched layers. + """ + + # Load the index + unexpected_keys = set() + saved_keys = set() + mismatched_keys = set() + + # Since TF adds the name of the class to its weights, and uses the index and not the name of the layer to load + # the weight, we have to get rid of the first prefix of the name of the layer. + model_keys = set() + model_layer_map = {} + for i, k in enumerate(model.weights): + layer_name = k.name + if _prefix is not None and layer_name.startswith(_prefix): + layer_name = layer_name[len(_prefix) :] + layer_name = layer_name.lstrip("/") + if not ("model." in layer_name or len(layer_name.split("/")) == 1): + layer_name = "/".join(layer_name.split("/")[1:]) + model_keys.add(layer_name) + model_layer_map[layer_name] = i + + for shard_file in shard_files: + saved_weight_names_set, unexpected_keys_set, mismatched_keys_set = load_tf_shard( + model, + model_layer_map, + shard_file, + ignore_mismatched_sizes=ignore_mismatched_sizes, + _prefix=_prefix, + ) + saved_keys.update(saved_weight_names_set) + unexpected_keys.update(unexpected_keys_set) + mismatched_keys.update(mismatched_keys_set) + gc.collect() + + missing_keys = model_keys - saved_keys + if strict and (len(missing_keys) > 0 or len(unexpected_keys) > 0): + error_message = f"Error(s) in loading state_dict for {model.__class__.__name__}" + if len(missing_keys) > 0: + str_missing_keys = ",".join([f'"{k}"' for k in missing_keys]) + error_message += f"\nMissing key(s): {str_missing_keys}." + if len(unexpected_keys) > 0: + str_unexpected_keys = ",".join([f'"{k}"' for k in unexpected_keys]) + error_message += f"\nMissing key(s): {str_unexpected_keys}." + raise RuntimeError(error_message) + + return missing_keys, unexpected_keys, mismatched_keys + + +def load_tf_shard(model, model_layer_map, resolved_archive_file, ignore_mismatched_sizes=False, _prefix=None): + """ + Loads a shard from a sharded checkpoint file. Can be either H5 or Safetensors. + Handles missing keys and unexpected keys. + + Args: + model (`keras.models.Model`): Model in which the weights are loaded + model_layer_map (`Dict`): A dictionary mapping the layer name to the index of the layer in the model. + resolved_archive_file (`str`): Path to the checkpoint file from which the weights will be loaded + ignore_mismatched_sizes (`bool`, *optional*, defaults to `False`): Whether to ignore the mismatched keys + + Returns: + `keras.models.Model`: Three lists, one for the layers that were found and successfully restored (from the + shard file), one for the mismatched layers, and another one for the unexpected layers. + """ + saved_weight_names_set = set() + saved_weights = {} + mismatched_keys = set() + unexpected_keys = set() + # Read the H5 file + try: + with h5py.File(resolved_archive_file, "r") as sharded_checkpoint_file: + # Retrieve the name of each layer from the H5 file + saved_h5_model_layers_name = set(load_attributes_from_hdf5_group(sharded_checkpoint_file, "layer_names")) + weight_value_tuples = [] + + # Compute missing and unexpected sub layers + # Store the weights in list of tuples that looks like [(weight_object, value_of_weight),...] + for layer_name in saved_h5_model_layers_name: + h5_layer_object = sharded_checkpoint_file[layer_name] + saved_weights[layer_name] = np.asarray(h5_layer_object) + + saved_weight_names_set.add(layer_name) + + if layer_name not in model_layer_map: + unexpected_keys.add(layer_name) + else: + symbolic_weight = model.weights[model_layer_map[layer_name]] + + saved_weight_value = saved_weights[layer_name] + # If the current weight is found + if saved_weight_value is not None: + # Check if the shape of the current weight and the one from the H5 file are different + if K.int_shape(symbolic_weight) != saved_weight_value.shape: + # If yes we reshape the weight from the H5 file accordingly to the current weight + # If the two shapes are not compatible we raise an issue + try: + array = np.reshape(saved_weight_value, K.int_shape(symbolic_weight)) + except ValueError as e: + if ignore_mismatched_sizes: + mismatched_keys.add( + (layer_name, saved_weight_value.shape, K.int_shape(symbolic_weight)) + ) + continue + else: + raise e + else: + array = saved_weight_value + + # We create the tuple that will be loaded and add it to the final list + weight_value_tuples.append((symbolic_weight, array)) + + K.batch_set_value(weight_value_tuples) + + return saved_weight_names_set, unexpected_keys, mismatched_keys + + except Exception as e: + try: + with open(resolved_archive_file) as f: + if f.read().startswith("version"): + raise OSError( + "You seem to have cloned a repository without having git-lfs installed. Please install " + "git-lfs and run `git lfs install` followed by `git lfs pull` in the folder " + "you cloned." + ) + else: + raise ValueError( + f"Unable to locate the file {resolved_archive_file} which is necessary to load this pretrained" + " model. Make sure you have saved the model properly." + ) from e + except (UnicodeDecodeError, ValueError): + raise OSError( + f"Unable to load weights from TF checkpoint file for '{resolved_archive_file}' " + f"at '{resolved_archive_file}'. " + "If you tried to load a TF model from a sharded checkpoint, you should try converting the model " + "by loading it in pytorch and saving it locally. A convertion script should be released soon." + ) + + +def load_tf_sharded_weights_from_safetensors( + model, shard_files, ignore_mismatched_sizes=False, strict=False, _prefix=None +): + """ + This is the same as `load_tf_weights_from_safetensors` but for a sharded TF-format safetensors checkpoint. + Detect missing and unexpected layers and load the TF weights from the shard file accordingly to their names and + shapes. + + This load is performed efficiently: each checkpoint shard is loaded one by one in RAM and deleted after being + loaded in the model. + + Args: + model (`keras.models.Model`): The model in which to load the checkpoint. + shard_files (`str` or `os.PathLike`): A list containing the sharded checkpoint names. + ignore_mismatched_sizes`bool`, *optional`, defaults to `True`): + Whether or not to ignore the mismatch between the sizes + strict (`bool`, *optional*, defaults to `True`): + Whether to strictly enforce that the keys in the model state dict match the keys in the sharded checkpoint. + + Returns: + Three lists, one for the missing layers, another one for the unexpected layers, and a last one for the + mismatched layers. + """ + + # Load the index + unexpected_keys = set() + all_missing_keys = [] + mismatched_keys = set() + + for shard_file in shard_files: + missing_layers, unexpected_layers, mismatched_layers = load_tf_weights_from_safetensors( + model, + shard_file, + ignore_mismatched_sizes=ignore_mismatched_sizes, + _prefix=_prefix, + ) + all_missing_keys.append(set(missing_layers)) + unexpected_keys.update(unexpected_layers) + mismatched_keys.update(mismatched_layers) + gc.collect() + missing_keys = set.intersection(*all_missing_keys) + + if strict and (len(missing_keys) > 0 or len(unexpected_keys) > 0): + error_message = f"Error(s) in loading state_dict for {model.__class__.__name__}" + if len(missing_keys) > 0: + str_missing_keys = ",".join([f'"{k}"' for k in missing_keys]) + error_message += f"\nMissing key(s): {str_missing_keys}." + if len(unexpected_keys) > 0: + str_unexpected_keys = ",".join([f'"{k}"' for k in unexpected_keys]) + error_message += f"\nMissing key(s): {str_unexpected_keys}." + raise RuntimeError(error_message) + + return missing_keys, unexpected_keys, mismatched_keys + + +def load_tf_weights(model, resolved_archive_file, ignore_mismatched_sizes=False, _prefix=None): + """ + Detect missing and unexpected layers and load the TF weights from the shard file accordingly to their names and + shapes. + + Args: + model (`keras.models.Model`): + The model to load the weights into. + resolved_archive_file (`str`): + The location of the H5 file. + ignore_mismatched_sizes (`bool`, *optional*, defaults to `False`): + Whether or not to ignore weights with shapes that don't match between the checkpoint of the model. + + Returns: + Three lists, one for the missing layers, another one for the unexpected layers, and a last one for the + mismatched layers. + """ + if resolved_archive_file.endswith(".safetensors"): + load_function = load_tf_weights_from_safetensors + else: + load_function = load_tf_weights_from_h5 + + return load_function( + model, resolved_archive_file, ignore_mismatched_sizes=ignore_mismatched_sizes, _prefix=_prefix + ) + + +def load_tf_weights_from_h5(model, resolved_archive_file, ignore_mismatched_sizes=False, _prefix=None): + mismatched_layers = [] + + # Read the H5 file + with h5py.File(resolved_archive_file, "r") as sharded_checkpoint_file: + # Retrieve the name of each layer from the H5 file + saved_h5_model_layers_name = set(load_attributes_from_hdf5_group(sharded_checkpoint_file, "layer_names")) + + # Find the missing layers from the high level list of layers + missing_layers = list({layer.name for layer in model.layers} - saved_h5_model_layers_name) + + # Find the unexpected layers from the high level list of layers + unexpected_layers = list(saved_h5_model_layers_name - {layer.name for layer in model.layers}) + saved_weight_names_set = set() + symbolic_weights_names = set() + weight_value_tuples = [] + + # Compute missing and unexpected sub layers + # Store the weights in list of tuples that looks like [(weight_object, value_of_weight),...] + for layer in model.layers: + # if layer_name from the H5 file belongs to the layers from the instantiated model + if layer.name in saved_h5_model_layers_name: + # Get the H5 layer object from its name + h5_layer_object = sharded_checkpoint_file[layer.name] + # Get all the weights as a list from the layer object + symbolic_weights = layer.trainable_weights + layer.non_trainable_weights + saved_weights = {} + + # Create a dict from the H5 saved model that looks like {"weight_name": weight_value} + # And a set with only the names + for weight_name in load_attributes_from_hdf5_group(h5_layer_object, "weight_names"): + # TF names always start with the model name so we ignore it + name = "/".join(weight_name.split("/")[1:]) + + if _prefix is not None: + name = _prefix + "/" + name + + saved_weights[name] = np.asarray(h5_layer_object[weight_name]) + + # Add the updated name to the final list for computing missing/unexpected values + saved_weight_names_set.add(name) + + # Loop over each weights from the instantiated model and compare with the weights from the H5 file + for symbolic_weight in symbolic_weights: + # TF names always start with the model name so we ignore it + if _prefix is not None: + delimeter = len(_prefix.split("/")) + symbolic_weight_name = "/".join( + symbolic_weight.name.split("/")[:delimeter] + + symbolic_weight.name.split("/")[delimeter + 1 :] + ) + else: + symbolic_weight_name = "/".join(symbolic_weight.name.split("/")[1:]) + + # here we check if the current weight is among the weights from the H5 file + # If yes, get the weight_value of the corresponding weight from the H5 file + # If not, make the value to None + saved_weight_value = saved_weights.get(symbolic_weight_name, None) + + # Retrocompatibility patch: some embeddings are stored with the weights name (e.g. Bart's + # `model.shared/embeddings:0` are stored as `model.shared/weights:0`) + if saved_weight_value is None and symbolic_weight_name.endswith("embeddings:0"): + symbolic_weight_name = symbolic_weight_name[:-12] + "weight:0" + saved_weight_value = saved_weights.get(symbolic_weight_name, None) + + # Add the updated name to the final list for computing missing/unexpected values + symbolic_weights_names.add(symbolic_weight_name) + + # If the current weight is found + if saved_weight_value is not None: + # Check if the shape of the current weight and the one from the H5 file are different + if K.int_shape(symbolic_weight) != saved_weight_value.shape: + # If yes we reshape the weight from the H5 file accordingly to the current weight + # If the two shapes are not compatible we raise an issue + try: + array = np.reshape(saved_weight_value, K.int_shape(symbolic_weight)) + except ValueError as e: + if ignore_mismatched_sizes: + mismatched_layers.append( + (symbolic_weight_name, saved_weight_value.shape, K.int_shape(symbolic_weight)) + ) + continue + else: + raise e + else: + array = saved_weight_value + + # We create the tuple that will be loaded and add it to the final list + weight_value_tuples.append((symbolic_weight, array)) + + # Load all the weights + K.batch_set_value(weight_value_tuples) + + # Compute the missing and unexpected layers + missing_layers.extend(list(symbolic_weights_names - saved_weight_names_set)) + unexpected_layers.extend(list(saved_weight_names_set - symbolic_weights_names)) + + return missing_layers, unexpected_layers, mismatched_layers + + +def load_tf_weights_from_safetensors(model, resolved_archive_file, ignore_mismatched_sizes=False, _prefix=None): + # Read the safetensors file + with safe_open(resolved_archive_file, framework="tf") as safetensors_archive: + mismatched_layers = [] + weight_names = [strip_model_name_and_prefix(w.name, _prefix=_prefix) for w in model.weights] + loaded_weight_names = list(safetensors_archive.keys()) + # Find the missing layers from the high level list of layers + missing_layers = list(set(weight_names) - set(loaded_weight_names)) + # Find the unexpected layers from the high level list of layers + unexpected_layers = list(set(loaded_weight_names) - set(weight_names)) + + for weight in model.weights: + weight_name = strip_model_name_and_prefix(weight.name, _prefix=_prefix) + if weight_name in loaded_weight_names: + weight_value = safetensors_archive.get_tensor(weight_name) + # Check if the shape of the current weight and the one from the H5 file are different + if K.int_shape(weight) != weight_value.shape: + # If yes we reshape the weight from the H5 file accordingly to the current weight + # If the two shapes are not compatible we raise an issue + try: + weight_value = tf.reshape(weight_value, K.int_shape(weight)) + except (ValueError, tf.errors.InvalidArgumentError) as e: + if ignore_mismatched_sizes: + mismatched_layers.append((weight_name, weight_value.shape, K.int_shape(weight))) + continue + else: + raise e + + K.set_value(weight, weight_value) # weight.assign() might break if weight is a DTensor + return missing_layers, unexpected_layers, mismatched_layers + + +def init_copy_embeddings(old_embeddings, new_num_tokens): + r""" + This function aims to reduce the embeddings in case new_num_tokens < old_num_tokens or to pad with -1 in case + new_num_tokens > old_num_tokens. A mask is also computed in order to know which weight in the embeddings should be + kept or not. Example: + + - if new_num_tokens=5 and old_num_tokens=4 and old_embeddings=[w1,w2,w3,w4] + + - mask=[True,True,True,True,False] and current_weights=[w1,w2,w3,w4,-1] + - if new_num_tokens=4 and old_num_tokens=5 and old_embeddings=[w1,w2,w3,w4,w5] + + - mask=[True,True,True,True] and current_weights=[w1,w2,w3,w4] + """ + old_num_tokens, old_embedding_dim = shape_list(old_embeddings) + size_diff = new_num_tokens - old_num_tokens + + # initialize new embeddings + # Copy token embeddings from the previous ones + if tf.math.greater(size_diff, 0): + # if the new size is greater than the old one, we extend the current embeddings with a padding until getting new size + # and we create a mask to properly identify the padded values and be replaced by the values of the newly created + # embeddings + current_weights = tf.pad( + old_embeddings.value(), tf.convert_to_tensor([[0, size_diff], [0, 0]]), constant_values=-1 + ) + num_tokens_to_copy = min(old_num_tokens, new_num_tokens) + mask = tf.fill(tf.convert_to_tensor([num_tokens_to_copy, 1]), True) + mask = tf.pad(mask, tf.convert_to_tensor([[0, size_diff], [0, 0]]), constant_values=False) + else: + # if the new size if lower than the old one, we take the current embeddings until the new size + current_weights = tf.slice( + old_embeddings.value(), + tf.convert_to_tensor([0, 0]), + tf.convert_to_tensor([new_num_tokens, old_embedding_dim]), + ) + mask = tf.fill(tf.convert_to_tensor([new_num_tokens, 1]), True) + + return mask, current_weights + + +class TFPreTrainedModel(keras.Model, TFModelUtilsMixin, TFGenerationMixin, PushToHubMixin): + r""" + Base class for all TF models. + + [`TFPreTrainedModel`] takes care of storing the configuration of the models and handles methods for loading, + downloading and saving models as well as a few methods common to all models to: + + - resize the input embeddings, + - prune heads in the self-attention heads. + + Class attributes (overridden by derived classes): + + - **config_class** ([`PretrainedConfig`]) -- A subclass of [`PretrainedConfig`] to use as configuration class + for this model architecture. + - **base_model_prefix** (`str`) -- A string indicating the attribute associated to the base model in derived + classes of the same architecture adding modules on top of the base model. + - **main_input_name** (`str`) -- The name of the principal input to the model (often `input_ids` for NLP + models, `pixel_values` for vision models and `input_values` for speech models). + """ + + config_class = None + base_model_prefix = "" + main_input_name = "input_ids" + _auto_class = None + _using_dummy_loss = None + _label_to_output_map = None + + # a list of re pattern of tensor names to ignore from the model when loading the model weights + # (and avoid unnecessary warnings). + _keys_to_ignore_on_load_missing = None + # a list of re pattern of tensor names to ignore from the weights when loading the model weights + # (and avoid unnecessary warnings). + _keys_to_ignore_on_load_unexpected = None + _requires_load_weight_prefix = False + + @property + def dummy_inputs(self) -> Dict[str, tf.Tensor]: + """ + Dummy inputs to build the network. + + Returns: + `Dict[str, tf.Tensor]`: The dummy inputs. + """ + dummies = {} + for key, spec in self.input_signature.items(): + # 2 is the most correct arbitrary size. I will not be taking questions + dummy_shape = [dim if dim is not None else 2 for dim in spec.shape] + if spec.shape[0] is None: + # But let's make the batch size 1 to save memory anyway + dummy_shape[0] = 1 + dummies[key] = tf.ones(shape=dummy_shape, dtype=spec.dtype) + if key == "token_type_ids": + # Some models have token_type_ids but with a vocab_size of 1 + dummies[key] = tf.zeros_like(dummies[key]) + if self.config.add_cross_attention and "encoder_hidden_states" in inspect.signature(self.call).parameters: + if "encoder_hidden_states" not in dummies: + if self.main_input_name == "input_ids": + dummies["encoder_hidden_states"] = tf.ones( + shape=(1, 2, self.config.hidden_size), dtype=tf.float32, name="encoder_hidden_states" + ) + else: + raise NotImplementedError( + "Model has cross-attention but we couldn't infer the shape for the encoder hidden states. Please manually override dummy_inputs!" + ) + return dummies + + def build_in_name_scope(self): + with tf.name_scope(self.name): + self.build(input_shape=None) + + @property + def framework(self) -> str: + """ + :str: Identifies that this is a TensorFlow model. + """ + return "tf" + + def build(self, input_shape=None): + pass # This is just here to make sure we don't call the superclass build() + + def __init__(self, config, *inputs, **kwargs): + super().__init__(*inputs, **kwargs) + if not isinstance(config, PretrainedConfig): + raise TypeError( + f"Parameter config in `{self.__class__.__name__}(config)` should be an instance of class " + "`PretrainedConfig`. To create a model from a pretrained model use " + f"`model = {self.__class__.__name__}.from_pretrained(PRETRAINED_MODEL_NAME)`" + ) + # Save config and origin of the pretrained weights if given in model + self.config = config + self.name_or_path = config.name_or_path + self.generation_config = GenerationConfig.from_model_config(config) if self.can_generate() else None + self._set_save_spec(self.input_signature) + + def get_config(self): + return self.config.to_dict() + + @functools.wraps(keras.Model.fit) + def fit(self, *args, **kwargs): + args, kwargs = convert_batch_encoding(*args, **kwargs) + return super().fit(*args, **kwargs) + + @functools.wraps(keras.Model.train_on_batch) + def train_on_batch(self, *args, **kwargs): + args, kwargs = convert_batch_encoding(*args, **kwargs) + return super().train_on_batch(*args, **kwargs) + + @functools.wraps(keras.Model.test_on_batch) + def test_on_batch(self, *args, **kwargs): + args, kwargs = convert_batch_encoding(*args, **kwargs) + return super().test_on_batch(*args, **kwargs) + + @functools.wraps(keras.Model.predict_on_batch) + def predict_on_batch(self, *args, **kwargs): + args, kwargs = convert_batch_encoding(*args, **kwargs) + return super().predict_on_batch(*args, **kwargs) + + @functools.wraps(keras.Model.predict) + def predict(self, *args, **kwargs): + args, kwargs = convert_batch_encoding(*args, **kwargs) + return super().predict(*args, **kwargs) + + @functools.wraps(keras.Model.evaluate) + def evaluate(self, *args, **kwargs): + args, kwargs = convert_batch_encoding(*args, **kwargs) + return super().evaluate(*args, **kwargs) + + @classmethod + def from_config(cls, config, **kwargs): + if isinstance(config, PretrainedConfig): + return cls._from_config(config, **kwargs) + return cls._from_config(cls.config_class.from_dict(config, **kwargs)) + + @classmethod + def _from_config(cls, config, **kwargs): + """ + All context managers that the model should be initialized under go here. + """ + return cls(config, **kwargs) + + def get_head_mask(self, head_mask: tf.Tensor | None, num_hidden_layers: int) -> tf.Tensor: + """ + Prepare the head mask if needed. + + Args: + head_mask (`tf.Tensor` with shape `[num_heads]` or `[num_hidden_layers x num_heads]`, *optional*): + The mask indicating if we should keep the heads or not (1.0 for keep, 0.0 for discard). + num_hidden_layers (`int`): + The number of hidden layers in the model. + + Returns: + `tf.Tensor` with shape `[num_hidden_layers x batch x num_heads x seq_length x seq_length]` or list with + `[None]` for each layer. + """ + if head_mask is not None: + head_mask = self._convert_head_mask_to_5d(head_mask, num_hidden_layers) + else: + head_mask = [None] * num_hidden_layers + + return head_mask + + def _convert_head_mask_to_5d(self, head_mask, num_hidden_layers): + """-> [num_hidden_layers x batch x num_heads x seq_length x seq_length]""" + if head_mask.shape.rank == 1: + head_mask = head_mask[None, None, :, None, None] + head_mask = tf.repeat(head_mask, repeats=num_hidden_layers, axis=0) + elif head_mask.shape.rank == 2: + head_mask = head_mask[:, None, :, None, None] + assert head_mask.shape.rank == 5, f"head_mask.dim != 5, instead {head_mask.dim()}" + head_mask = tf.cast(head_mask, tf.float32) # switch to float if need + fp16 compatibility + return head_mask + + @tf.function + def serving(self, inputs): + """ + Args: + Method used for serving the model. Does not have a specific signature, but will be specialized as concrete + functions when saving with `save_pretrained`. + inputs (`Dict[str, tf.Tensor]`): + The input of the saved model as a dictionary of tensors. + """ + output = self.call(inputs) + + return self.serving_output(output) + + @property + def input_signature(self) -> Dict[str, tf.TensorSpec]: + """ + This property should return a dict mapping input names to tf.TensorSpec objects, representing the expected + shape and dtype for model inputs. It is used for both serving and for generating dummy inputs. + """ + model_inputs = list(inspect.signature(self.call).parameters) + sig = {} + if "input_ids" in model_inputs: + if self.__class__.__name__.endswith("ForMultipleChoice"): + text_dims = 3 + else: + text_dims = 2 + for input_name in ( + "input_ids", + "attention_mask", + "token_type_ids", + "decoder_input_ids", + "decoder_attention_mask", + ): + if input_name in model_inputs: + sig[input_name] = tf.TensorSpec([None] * text_dims, tf.int32, name=input_name) + if "pixel_values" in model_inputs: + pixel_values_shape = [None, None, None, None] + if hasattr(self.config, "vision_config"): + vision_config = self.config.vision_config + else: + vision_config = self.config + if hasattr(vision_config, "num_channels"): + pixel_values_shape[1] = vision_config.num_channels + else: + raise NotImplementedError( + "Could not infer number of channels from config, please override input_signature to specify input shapes." + ) + if hasattr(vision_config, "image_size"): + pixel_values_shape[2] = pixel_values_shape[3] = vision_config.image_size + elif hasattr(vision_config, "input_size"): + pixel_values_shape[2] = pixel_values_shape[3] = vision_config.input_size + else: + raise NotImplementedError( + "Could not infer input image shape from config, please override input_signature to specify input shapes." + ) + sig["pixel_values"] = tf.TensorSpec(pixel_values_shape, tf.float32, name="pixel_values") + if "input_features" in model_inputs: + raise NotImplementedError("Audio models need a manually defined input_signature") + return sig + + def serving_output(self, output): + """ + Prepare the output of the saved model. Can be overridden if specific serving modifications are required. + """ + if not isinstance(output, ModelOutput): + return output + for key in output: + if key.endswith("hidden_states") and not getattr(self.config, "output_hidden_states", False): + output[key] = None + elif key.endswith("attentions") and not getattr(self.config, "output_attentions", False): + output[key] = None + elif key == "past_key_values" and not getattr(self.config, "use_cache", False): + output[key] = None + elif key == "cross_attentions" and not ( + getattr(self.config, "output_attentions", False) and getattr(self.config, "add_cross_attention", False) + ): + output[key] = None + if isinstance(output[key], (tuple, list)): + try: + output[key] = tf.convert_to_tensor(output[key]) + except (ValueError, tf.errors.InvalidArgumentError): + pass # Layers may not have the same dimensions + return output + + @classmethod + def can_generate(cls) -> bool: + """ + Returns whether this model can generate sequences with `.generate()`. + + Returns: + `bool`: Whether this model can generate sequences with `.generate()`. + """ + # Detects whether `prepare_inputs_for_generation` has been overwritten, which is a requirement for generation. + # Alternatively, the model can also have a custom `generate` function. + if "GenerationMixin" in str(cls.prepare_inputs_for_generation) and "GenerationMixin" in str(cls.generate): + return False + return True + + def get_input_embeddings(self) -> keras.layers.Layer: + """ + Returns the model's input embeddings layer. + + Returns: + `tf.Variable`: The embeddings layer mapping vocabulary to hidden states. + """ + main_layer = getattr(self, self.base_model_prefix, self) + + if main_layer is not self: + return main_layer.get_input_embeddings() + else: + raise NotImplementedError + + def _save_checkpoint(self, checkpoint_dir, epoch): + if not os.path.isdir(checkpoint_dir): + os.mkdir(checkpoint_dir) + # We avoid tf.train.checkpoint or saving weights in TF format, even though that includes optimizer + # state for us, because it requires special handling for objects like custom losses, which we use + # internally and which users are likely to use too + weights_path = os.path.join(checkpoint_dir, "weights.h5") + self.save_weights(weights_path) + extra_data = {"epoch": epoch, "optimizer_state": self.optimizer.get_weights()} + extra_data_path = os.path.join(checkpoint_dir, "extra_data.pickle") + with open(extra_data_path, "wb") as f: + pickle.dump(extra_data, f) + + def prepare_tf_dataset( + self, + dataset: "datasets.Dataset", # noqa:F821 + batch_size: int = 8, + shuffle: bool = True, + tokenizer: Optional["PreTrainedTokenizerBase"] = None, + collate_fn: Optional[Callable] = None, + collate_fn_args: Optional[Dict[str, Any]] = None, + drop_remainder: Optional[bool] = None, + prefetch: bool = True, + ): + """ + Wraps a HuggingFace [`~datasets.Dataset`] as a `tf.data.Dataset` with collation and batching. This method is + designed to create a "ready-to-use" dataset that can be passed directly to Keras methods like `fit()` without + further modification. The method will drop columns from the dataset if they don't match input names for the + model. If you want to specify the column names to return rather than using the names that match this model, we + recommend using `Dataset.to_tf_dataset()` instead. + + Args: + dataset (`Any`): + A [~`datasets.Dataset`] to be wrapped as a `tf.data.Dataset`. + batch_size (`int`, *optional*, defaults to 8): + The size of batches to return. + shuffle (`bool`, defaults to `True`): + Whether to return samples from the dataset in random order. Usually `True` for training datasets and + `False` for validation/test datasets. + tokenizer ([`PreTrainedTokenizerBase`], *optional*): + A `PreTrainedTokenizer` that will be used to pad samples to create batches. Has no effect if a specific + `collate_fn` is passed instead. + collate_fn (`Callable`, *optional*): + A function that collates samples from the dataset into a single batch. Defaults to + `DefaultDataCollator` if no `tokenizer` is supplied or `DataCollatorWithPadding` if a `tokenizer` is + passed. + collate_fn_args (`Dict[str, Any]`, *optional*): + A dict of arguments to pass to the `collate_fn` alongside the list of samples. + drop_remainder (`bool`, *optional*): + Whether to drop the final batch, if the batch_size does not evenly divide the dataset length. Defaults + to the same setting as `shuffle`. + prefetch (`bool`, defaults to `True`): + Whether to add prefetching to the end of the `tf.data` pipeline. This is almost always beneficial for + performance, but can be disabled in edge cases. + + + Returns: + `Dataset`: A `tf.data.Dataset` which is ready to pass to the Keras API. + """ + requires_backends(self, ["datasets"]) + import datasets + + if collate_fn is None: + if tokenizer is None: + collate_fn = DefaultDataCollator(return_tensors="np") + else: + collate_fn = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="np") + if collate_fn_args is None: + collate_fn_args = {} + + if not isinstance(dataset, datasets.Dataset): + raise TypeError("Dataset argument should be a datasets.Dataset!") + model_inputs = list(inspect.signature(self.call).parameters) + model_labels = find_labels(self.__class__) + if "cols_to_retain" in list(inspect.signature(dataset._get_output_signature).parameters.keys()): + output_signature, _ = dataset._get_output_signature( + dataset, + batch_size=None, + collate_fn=collate_fn, + collate_fn_args=collate_fn_args, + cols_to_retain=model_inputs, + ) + else: + # TODO Matt: This is a workaround for older versions of datasets that are missing the `cols_to_retain` + # argument. We should remove this once the minimum supported version of datasets is > 2.3.2 + unwanted_columns = [ + feature + for feature in dataset.features + if feature not in model_inputs and feature not in ("label_ids", "label") + ] + dataset = dataset.remove_columns(unwanted_columns) + output_signature, _ = dataset._get_output_signature( + dataset, batch_size=None, collate_fn=collate_fn, collate_fn_args=collate_fn_args + ) + output_columns = list(output_signature.keys()) + feature_cols = [col for col in output_columns if col in model_inputs and col not in model_labels] + label_cols = [col for col in output_columns if col in model_labels] + + # Backwards compatibility for older versions of datasets. Previously, if `columns` or `label_cols` + # were a single element list, the returned element spec would be a single element. Now, passing [feature] + # will return a dict structure {"feature": feature}, and passing a single string will return a single element. + feature_cols = feature_cols[0] if len(feature_cols) == 1 else feature_cols + label_cols = label_cols[0] if len(label_cols) == 1 else label_cols + + if drop_remainder is None: + drop_remainder = shuffle + tf_dataset = dataset.to_tf_dataset( + columns=feature_cols, + label_cols=label_cols, + batch_size=batch_size, + shuffle=shuffle, + drop_remainder=drop_remainder, + collate_fn=collate_fn, + collate_fn_args=collate_fn_args, + prefetch=prefetch, + ) + return tf_dataset + + def compile( + self, + optimizer="rmsprop", + loss="auto_with_warning", + metrics=None, + loss_weights=None, + weighted_metrics=None, + run_eagerly=None, + steps_per_execution=None, + **kwargs, + ): + """ + This is a thin wrapper that sets the model's loss output head as the loss if the user does not specify a loss + function themselves. + """ + if loss in ("auto_with_warning", "passthrough"): # "passthrough" for workflow backward compatibility + logger.info( + "No loss specified in compile() - the model's internal loss computation will be used as the " + "loss. Don't panic - this is a common way to train TensorFlow models in Transformers! " + "To disable this behaviour please pass a loss argument, or explicitly pass " + "`loss=None` if you do not want your model to compute a loss. You can also specify `loss='auto'` to " + "get the internal loss without printing this info string." + ) + loss = "auto" + if loss == "auto": + loss = dummy_loss + self._using_dummy_loss = True + else: + self._using_dummy_loss = False + parent_args = list(inspect.signature(keras.Model.compile).parameters.keys()) + # This argument got renamed, we need to support both versions + if "steps_per_execution" in parent_args: + super().compile( + optimizer=optimizer, + loss=loss, + metrics=metrics, + loss_weights=loss_weights, + weighted_metrics=weighted_metrics, + run_eagerly=run_eagerly, + steps_per_execution=steps_per_execution, + **kwargs, + ) + else: + super().compile( + optimizer=optimizer, + loss=loss, + metrics=metrics, + loss_weights=loss_weights, + weighted_metrics=weighted_metrics, + run_eagerly=run_eagerly, + experimental_steps_per_execution=steps_per_execution, + **kwargs, + ) + + def compute_loss(self, *args, **kwargs): + if hasattr(keras.Model, "compute_loss"): + # This will be true in TF 2.8 or greater + return super().compute_loss(*args, **kwargs) + else: + warnings.warn( + "The old compute_loss method is deprecated as it conflicts with the Keras compute_loss " + "method added in TF 2.8. If you want the original HF compute_loss, please call " + "hf_compute_loss() instead. From TF versions >= 2.8, or Transformers versions >= 5, " + "calling compute_loss() will get the Keras method instead.", + FutureWarning, + ) + return self.hf_compute_loss(*args, **kwargs) + + def get_label_to_output_name_mapping(self): + arg_names = list(inspect.signature(self.call).parameters) + if self._label_to_output_map is not None: + return self._label_to_output_map + elif "start_positions" in arg_names: + return {"start_positions": "start_logits", "end_positions": "end_logits"} + elif "sentence_order_label" in arg_names: + return {"labels": "prediction_logits", "sentence_order_label": "sop_logits"} + elif "next_sentence_label" in arg_names: + return {"labels": "prediction_logits", "next_sentence_label": "seq_relationship_logits"} + elif "mc_labels" in arg_names: + return {"labels": "logits", "mc_labels": "mc_logits"} + else: + return {} + + def train_step(self, data): + """ + A modification of Keras's default `train_step` that correctly handles matching outputs to labels for our models + and supports directly training on the loss output head. In addition, it ensures input keys are copied to the + labels where appropriate. It will also copy label keys into the input dict when using the dummy loss, to ensure + that they are available to the model during the forward pass. + """ + + # We hardcode the most common renamings; models with weirder names can set `self._label_to_output_map` + arg_names = list(inspect.signature(self.call).parameters) + label_kwargs = find_labels(self.__class__) + label_to_output = self.get_label_to_output_name_mapping() + output_to_label = {val: key for key, val in label_to_output.items()} + if not self._using_dummy_loss and parse(tf.__version__) < parse("2.11.0"): + # Newer TF train steps leave this out + data = expand_1d(data) + x, y, sample_weight = keras.utils.unpack_x_y_sample_weight(data) + # If the inputs are mutable dictionaries, make a shallow copy of them because we will modify + # them during input/label pre-processing. This avoids surprising the user by wrecking their data. + # In addition, modifying mutable Python inputs makes XLA compilation impossible. + if isinstance(x, dict): + x = x.copy() + if isinstance(y, dict): + y = y.copy() + + # When using a dummy loss, we ensure that separate labels are copied to the correct model arguments, + # if those keys are not already present in the input dict + if self._using_dummy_loss and y is not None: + # If y is a tensor and the model only has one label-like input, map y to that input + if len(label_kwargs) == 1 and isinstance(y, tf.Tensor): + if isinstance(x, tf.Tensor): + x = {arg_names[0]: x} + label_kwarg = next(iter(label_kwargs)) + if label_kwarg not in x: + x[label_kwarg] = y + # Otherwise, copy keys from y to x as long as they weren't already present in x + elif isinstance(y, dict): + if isinstance(x, tf.Tensor): + x = {arg_names[0]: x} + for key, val in y.items(): + if key in arg_names and key not in x: + x[key] = val + elif output_to_label.get(key, None) in arg_names and key not in x: + x[output_to_label[key]] = val + if y is None: + y = {key: val for key, val in x.items() if key in label_kwargs} + if not y and not self._using_dummy_loss: + raise ValueError("Could not find label column(s) in input dict and no separate labels were provided!") + + if isinstance(y, dict): + # Rename labels at this point to match output heads + y = {label_to_output.get(key, key): val for key, val in y.items()} + + # Run forward pass. + with tf.GradientTape() as tape: + if self._using_dummy_loss and "return_loss" in arg_names: + y_pred = self(x, training=True, return_loss=True) + else: + y_pred = self(x, training=True) + if self._using_dummy_loss: + loss = self.compiled_loss(y_pred.loss, y_pred.loss, sample_weight, regularization_losses=self.losses) + else: + loss = None + + # This next block matches outputs to label keys. Tensorflow's standard method for doing this + # can get very confused if any of the keys contain nested values (e.g. lists/tuples of Tensors) + if isinstance(y, dict) and len(y) == 1: + if list(y.keys())[0] in y_pred.keys(): + y_pred = y_pred[list(y.keys())[0]] + elif list(y_pred.keys())[0] == "loss": + y_pred = y_pred[1] + else: + y_pred = y_pred[0] + _, y = y.popitem() + elif isinstance(y, dict): + # If the labels are a dict, match keys from the output by name + y_pred = {key: val for key, val in y_pred.items() if key in y} + elif isinstance(y, tuple) or isinstance(y, list): + # If the labels are a tuple/list, match keys to the output by order, skipping the loss. + if list(y_pred.keys())[0] == "loss": + y_pred = y_pred.to_tuple()[1:] + else: + y_pred = y_pred.to_tuple() + y_pred = y_pred[: len(y)] # Remove unused fields in case those cause problems + else: + # If the labels are a single tensor, match them to the first non-loss tensor in the output + if list(y_pred.keys())[0] == "loss": + y_pred = y_pred[1] + else: + y_pred = y_pred[0] + + if loss is None: + loss = self.compiled_loss(y, y_pred, sample_weight, regularization_losses=self.losses) + + # Run backwards pass. + self.optimizer.minimize(loss, self.trainable_variables, tape=tape) + + self.compiled_metrics.update_state(y, y_pred, sample_weight) + # Collect metrics to return + return_metrics = {} + for metric in self.metrics: + result = metric.result() + if isinstance(result, dict): + return_metrics.update(result) + else: + return_metrics[metric.name] = result + return return_metrics + + def test_step(self, data): + """ + A modification of Keras's default `train_step` that correctly handles matching outputs to labels for our models + and supports directly training on the loss output head. In addition, it ensures input keys are copied to the + labels where appropriate. It will also copy label keys into the input dict when using the dummy loss, to ensure + that they are available to the model during the forward pass. + """ + # We hardcode the most common renamings; models with weirder names can set `self._label_to_output_map` + arg_names = list(inspect.signature(self.call).parameters) + label_kwargs = find_labels(self.__class__) + label_to_output = self.get_label_to_output_name_mapping() + output_to_label = {val: key for key, val in label_to_output.items()} + if not self._using_dummy_loss and parse(tf.__version__) < parse("2.11.0"): + # Newer versions leave this out + data = expand_1d(data) + x, y, sample_weight = keras.utils.unpack_x_y_sample_weight(data) + # If the inputs are mutable dictionaries, make a shallow copy of them because we will modify + # them during input/label pre-processing. This avoids surprising the user by wrecking their data. + # In addition, modifying mutable Python inputs makes XLA compilation impossible. + if isinstance(x, dict): + x = x.copy() + if isinstance(y, dict): + y = y.copy() + + # When using a dummy loss, we ensure that separate labels are copied to the correct model arguments, + # if those keys are not already present in the input dict + if self._using_dummy_loss and y is not None: + arg_names = list(inspect.signature(self.call).parameters) + # If y is a tensor and the model only has one label-like input, map y to that input + if len(label_kwargs) == 1 and isinstance(y, tf.Tensor): + if isinstance(x, tf.Tensor): + x = {arg_names[0]: x} + label_kwarg = next(iter(label_kwargs)) + if label_kwarg not in x: + x[label_kwarg] = y + # Otherwise, copy keys from y to x as long as they weren't already present in x + elif isinstance(y, dict): + if isinstance(x, tf.Tensor): + x = {arg_names[0]: x} + for key, val in y.items(): + if key in arg_names and key not in x: + x[key] = val + elif output_to_label.get(key, None) in arg_names and key not in x: + x[output_to_label[key]] = val + if y is None: + y = {key: val for key, val in x.items() if key in label_kwargs} + if not y and not self._using_dummy_loss: + raise ValueError("Could not find label column(s) in input dict and no separate labels were provided!") + + if isinstance(y, dict): + # Rename labels at this point to match output heads + y = {label_to_output.get(key, key): val for key, val in y.items()} + + # Run forward pass. + if self._using_dummy_loss and "return_loss" in arg_names: + y_pred = self(x, return_loss=True, training=False) + else: + y_pred = self(x, training=False) + if self._using_dummy_loss: + loss = self.compiled_loss(y_pred.loss, y_pred.loss, sample_weight, regularization_losses=self.losses) + else: + loss = None + + # This next block matches outputs to label keys. Tensorflow's standard method for doing this + # can get very confused if any of the keys contain nested values (e.g. lists/tuples of Tensors) + if isinstance(y, dict) and len(y) == 1: + if list(y.keys())[0] in y_pred.keys(): + y_pred = y_pred[list(y.keys())[0]] + elif list(y_pred.keys())[0] == "loss": + y_pred = y_pred[1] + else: + y_pred = y_pred[0] + _, y = y.popitem() + elif isinstance(y, dict): + # If the labels are a dict, match keys from the output by name + y_pred = {key: val for key, val in y_pred.items() if key in y} + elif isinstance(y, tuple) or isinstance(y, list): + # If the labels are a tuple/list, match keys to the output by order, skipping the loss. + if list(y_pred.keys())[0] == "loss": + y_pred = y_pred.to_tuple()[1:] + else: + y_pred = y_pred.to_tuple() + y_pred = y_pred[: len(y)] # Remove unused fields in case those cause problems + else: + # If the labels are a single tensor, match them to the first non-loss tensor in the output + if list(y_pred.keys())[0] == "loss": + y_pred = y_pred[1] + else: + y_pred = y_pred[0] + + if loss is None: + loss = self.compiled_loss(y, y_pred, sample_weight, regularization_losses=self.losses) + + self.compiled_metrics.update_state(y, y_pred, sample_weight) + # Collect metrics to return + return_metrics = {} + for metric in self.metrics: + result = metric.result() + if isinstance(result, dict): + return_metrics.update(result) + else: + return_metrics[metric.name] = result + return return_metrics + + def create_model_card( + self, + output_dir, + model_name: str, + language: Optional[str] = None, + license: Optional[str] = None, + tags: Optional[str] = None, + finetuned_from: Optional[str] = None, + tasks: Optional[str] = None, + dataset_tags: Optional[Union[str, List[str]]] = None, + dataset: Optional[Union[str, List[str]]] = None, + dataset_args: Optional[Union[str, List[str]]] = None, + ): + """ + Creates a draft of a model card using the information available to the `Trainer`. + + Args: + output_dir (`str` or `os.PathLike`): + The folder in which to create the model card. + model_name (`str`, *optional*): + The name of the model. + language (`str`, *optional*): + The language of the model (if applicable) + license (`str`, *optional*): + The license of the model. Will default to the license of the pretrained model used, if the original + model given to the `Trainer` comes from a repo on the Hub. + tags (`str` or `List[str]`, *optional*): + Some tags to be included in the metadata of the model card. + finetuned_from (`str`, *optional*): + The name of the model used to fine-tune this one (if applicable). Will default to the name of the repo + of the original model given to the `Trainer` (if it comes from the Hub). + tasks (`str` or `List[str]`, *optional*): + One or several task identifiers, to be included in the metadata of the model card. + dataset_tags (`str` or `List[str]`, *optional*): + One or several dataset tags, to be included in the metadata of the model card. + dataset (`str` or `List[str]`, *optional*): + One or several dataset identifiers, to be included in the metadata of the model card. + dataset_args (`str` or `List[str]`, *optional*): + One or several dataset arguments, to be included in the metadata of the model card. + """ + # Avoids a circular import by doing this when necessary. + from .modelcard import TrainingSummary # tests_ignore + + training_summary = TrainingSummary.from_keras( + self, + keras_history=self.history, + language=language, + license=license, + tags=tags, + model_name=model_name, + finetuned_from=finetuned_from, + tasks=tasks, + dataset_tags=dataset_tags, + dataset=dataset, + dataset_args=dataset_args, + ) + model_card = training_summary.to_model_card() + with open(os.path.join(output_dir, "README.md"), "w") as f: + f.write(model_card) + + def set_input_embeddings(self, value): + """ + Set model's input embeddings + + Args: + value (`tf.Variable`): + The new weights mapping hidden states to vocabulary. + """ + main_layer = getattr(self, self.base_model_prefix) + + if main_layer is None: + raise NotImplementedError("The model does not implements the base_model_prefix attribute.") + + try: + main_layer.set_input_embeddings(value) + except AttributeError: + logger.info("Building the model") + self.build_in_name_scope() + main_layer.set_input_embeddings(value) + + def get_output_embeddings(self) -> Union[None, keras.layers.Layer]: + """ + Returns the model's output embeddings + + Returns: + `tf.Variable`: The new weights mapping vocabulary to hidden states. + """ + if self.get_lm_head() is not None: + lm_head = self.get_lm_head() + + try: + return lm_head.get_output_embeddings() + except AttributeError: + logger.info("Building the model") + self.build_in_name_scope() + + return lm_head().get_output_embeddings() + + return None # Overwrite for models with output embeddings + + def set_output_embeddings(self, value): + """ + Set model's output embeddings + + Args: + value (`tf.Variable`): + The new weights mapping hidden states to vocabulary. + """ + if self.get_lm_head() is not None: + lm_head = self.get_lm_head() + try: + lm_head.set_output_embeddings(value) + except AttributeError: + logger.info("Building the model") + self.build_in_name_scope() + lm_head.set_output_embeddings(value) + + def get_output_layer_with_bias(self) -> Union[None, keras.layers.Layer]: + """ + Get the layer that handles a bias attribute in case the model has an LM head with weights tied to the + embeddings + + Return: + `keras.layers.Layer`: The layer that handles the bias, None if not an LM model. + """ + warnings.warn( + "The method get_output_layer_with_bias is deprecated. Please use `get_lm_head` instead.", FutureWarning + ) + return self.get_lm_head() + + def get_prefix_bias_name(self) -> Union[None, str]: + """ + Get the concatenated _prefix name of the bias from the model name to the parent layer + + Return: + `str`: The _prefix name of the bias. + """ + warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning) + return None + + def get_bias(self) -> Union[None, Dict[str, tf.Variable]]: + """ + Dict of bias attached to an LM head. The key represents the name of the bias attribute. + + Return: + `tf.Variable`: The weights representing the bias, None if not an LM model. + """ + if self.get_lm_head() is not None: + lm_head = self.get_lm_head() + try: + return lm_head.get_bias() + except AttributeError: + self.build_in_name_scope() + + return lm_head.get_bias() + return None + + def set_bias(self, value): + """ + Set all the bias in the LM head. + + Args: + value (`Dict[tf.Variable]`): + All the new bias attached to an LM head. + """ + if self.get_lm_head() is not None: + lm_head = self.get_lm_head() + try: + lm_head.set_bias(value) + except AttributeError: + self.build_in_name_scope() + lm_head.set_bias(value) + + def get_lm_head(self) -> keras.layers.Layer: + """ + The LM Head layer. This method must be overwritten by all the models that have a lm head. + + Return: + `keras.layers.Layer`: The LM head layer if the model has one, None if not. + """ + return None + + def resize_token_embeddings( + self, new_num_tokens: Optional[int] = None + ) -> Union[keras.layers.Embedding, tf.Variable]: + """ + Resizes input token embeddings matrix of the model if `new_num_tokens != config.vocab_size`. + + Takes care of tying weights embeddings afterwards if the model class has a `tie_weights()` method. + + Arguments: + new_num_tokens (`int`, *optional*): + The number of new tokens in the embedding matrix. Increasing the size will add newly initialized + vectors at the end. Reducing the size will remove vectors from the end. If not provided or `None`, just + returns a pointer to the input tokens without doing anything. + + Return: + `tf.Variable` or `keras.layers.Embedding`: Pointer to the input tokens of the model. + """ + # TODO (joao): flagged for replacement (by `_v2_resized_token_embeddings`) due to embeddings refactor + + # Run the new code path if the model has a keras embeddings layer + if isinstance(self.get_input_embeddings(), keras.layers.Embedding): + return self._v2_resized_token_embeddings(new_num_tokens) + + if new_num_tokens is None or new_num_tokens == self.config.vocab_size: + return self._get_word_embedding_weight(self.get_input_embeddings()) + + model_embeds = self._resize_token_embeddings(new_num_tokens) + + # Update base model and current model config + self.config.vocab_size = new_num_tokens + + return model_embeds + + def _v2_resized_token_embeddings(self, new_num_tokens: Optional[int] = None) -> keras.layers.Embedding: + """ + Resizes input token embeddings matrix of the model if `new_num_tokens != config.vocab_size`. + + Arguments: + new_num_tokens (`int`, *optional*): + The number of new tokens in the embedding matrix. Increasing the size will add newly initialized + vectors at the end. Reducing the size will remove vectors from the end. If not provided or `None`, just + returns a pointer to the input tokens without doing anything. + + Return: + `keras.layers.Embedding`: Pointer to the input tokens of the model. + """ + if new_num_tokens is None or new_num_tokens == self.config.vocab_size: + return self.get_input_embeddings() + + model_embeds = self._v2_resize_token_embeddings(new_num_tokens) + + # Update base model and current model config + self.config.vocab_size = new_num_tokens + + return model_embeds + + def _get_word_embedding_weight(model, embedding_layer): + # TODO (joao): flagged for delection due to embeddings refactor + + # If the variable holds the weights themselves, return them + if isinstance(embedding_layer, tf.Tensor): + return embedding_layer + # Otherwise, try to get them from the layer's attributes + + embeds = getattr(embedding_layer, "weight", None) + if embeds is not None: + return embeds + + embeds = getattr(embedding_layer, "decoder", None) + if embeds is not None: + return embeds + + # The reason why the attributes don't exist might be + # because the model is not built, so retry getting + # the argument after building the model + model.build_in_name_scope() + + embeds = getattr(embedding_layer, "weight", None) + if embeds is not None: + return embeds + + embeds = getattr(embedding_layer, "decoder", None) + if embeds is not None: + return embeds + + return None + + def _resize_token_embeddings(self, new_num_tokens): + # TODO (joao): flagged for replacement (by `_v2_resize_token_embeddings`) due to embeddings refactor + old_embeddings = self._get_word_embedding_weight(self.get_input_embeddings()) + new_embeddings = self._get_resized_embeddings(old_embeddings, new_num_tokens) + + # if word embeddings are not tied, make sure that lm head bias is resized as well + if self.get_bias() is not None: + old_lm_head_bias = self.get_bias() + new_lm_head_bias = self._get_resized_lm_head_bias(old_lm_head_bias, new_num_tokens) + + self.set_bias(new_lm_head_bias) + + # if word embeddings are not tied, make sure that lm head decoder is resized as well + if self.get_output_embeddings() is not None: + old_lm_head_decoder = self._get_word_embedding_weight(self.get_output_embeddings()) + new_lm_head_decoder = self._get_resized_lm_head_decoder(old_lm_head_decoder, new_num_tokens) + + self.set_output_embeddings(new_lm_head_decoder) + + self.set_input_embeddings(new_embeddings) + + return self.get_input_embeddings() + + def _v2_resize_token_embeddings(self, new_num_tokens): + old_embeddings = self.get_input_embeddings() + new_embeddings = self._v2_get_resized_embeddings(old_embeddings, new_num_tokens) + self.set_input_embeddings(new_embeddings) + + # If word embeddings are not tied, make sure that lm head bias is resized as well + if self.get_bias() is not None: + old_lm_head_bias = self.get_bias() + new_lm_head_bias = self._v2_get_resized_lm_head_bias(old_lm_head_bias, new_num_tokens) + self.set_bias(new_lm_head_bias) + + # If word embeddings are not tied, make sure that lm head decoder is resized as well. + tied_weights = self.get_input_embeddings() == self.get_output_embeddings() + if self.get_output_embeddings() is not None and not tied_weights: + old_lm_head_decoder = self._get_word_embedding_weight(self.get_output_embeddings()) + # TODO (joao): this one probably needs a v2 version with other models + new_lm_head_decoder = self._get_resized_lm_head_decoder(old_lm_head_decoder, new_num_tokens) + self.set_output_embeddings(new_lm_head_decoder) + + return self.get_input_embeddings() + + def _get_resized_lm_head_bias(self, old_lm_head_bias, new_num_tokens): + """ + Build a resized bias from the old ones. Increasing the size will add newly initialized vectors at the end. + Reducing the size will remove vectors from the end + + Args: + old_lm_head_bias (`tf.Variable`): + Old lm head bias to be resized. + new_num_tokens (`int`, *optional*): + New number of tokens in the linear matrix. + + Increasing the size will add newly initialized vectors at the end. Reducing the size will remove + vectors from the end. If not provided or `None`, just returns None + + Return: + `tf.Variable`: Pointer to the resized bias. + """ + # TODO (joao): flagged for replacement (by `_v2_get_resized_lm_head_bias`) due to embeddings refactor + new_lm_head_bias = {} + + for attr, weight in old_lm_head_bias.items(): + first_dim, old_num_tokens = (None, shape_list(weight)[0]) if tf.rank(weight) == 1 else shape_list(weight) + size_diff = new_num_tokens - old_num_tokens + final_shape = [new_num_tokens] if first_dim is None else [first_dim, new_num_tokens] + + # initialize new bias + if tf.math.greater(size_diff, 0): + padding_shape = [[0, size_diff]] if first_dim is None else [[0, 0], [0, size_diff]] + current_bias = tf.pad(weight.value(), tf.convert_to_tensor(padding_shape), constant_values=-1) + num_tokens_to_copy = min(old_num_tokens, new_num_tokens) + mask_shape = [num_tokens_to_copy] if first_dim is None else [1, num_tokens_to_copy] + bias_mask = tf.fill(tf.convert_to_tensor(mask_shape), True) + bias_mask = tf.pad(bias_mask, tf.convert_to_tensor(padding_shape), constant_values=False) + else: + slice_from = [0] if first_dim is None else [0, 0] + current_bias = tf.slice( + weight.value(), tf.convert_to_tensor(slice_from), tf.convert_to_tensor(final_shape) + ) + bias_mask = tf.fill(tf.convert_to_tensor(final_shape), True) + + new_bias = self.add_weight( + shape=final_shape, + initializer="zeros", + trainable=True, + name=weight.name.split(":")[0], + ) + init_bias = tf.where(bias_mask, current_bias, new_bias.value()) + + new_bias.assign(init_bias) + new_lm_head_bias[attr] = new_bias + + return new_lm_head_bias + + def _v2_get_resized_lm_head_bias( + self, old_lm_head_bias: Dict[str, tf.Variable], new_num_tokens: int + ) -> Dict[str, tf.Tensor]: + """ + Build a resized bias from the old ones. Increasing the size will add newly initialized vectors at the end. + Reducing the size will remove vectors from the end + + Args: + old_lm_head_bias (`Dict[str, tf.Variable]`): + Old lm head bias to be resized. + new_num_tokens (`int`): + New number of tokens in the linear matrix. Increasing the size will add newly initialized vectors at + the end. Reducing the size will remove vectors from the end. + + Return: + `tf.Tensor`: Values for the resized bias. + """ + new_lm_head_bias = {} + + for attr, weight in old_lm_head_bias.items(): + # Determine the size difference (depending on the shape) + first_dim, old_num_tokens = (None, shape_list(weight)[0]) if tf.rank(weight) == 1 else shape_list(weight) + size_diff = new_num_tokens - old_num_tokens + + # Copy the old bias values to the new bias + if old_num_tokens > new_num_tokens: + new_bias = weight.value()[..., :new_num_tokens] + else: + padding_shape = [[0, size_diff]] if first_dim is None else [[0, 0], [0, size_diff]] + new_bias = tf.pad(weight.value(), tf.convert_to_tensor(padding_shape)) + + new_lm_head_bias[attr] = new_bias + return new_lm_head_bias + + def _get_resized_lm_head_decoder(self, old_lm_head_decoder, new_num_tokens): + """ + Build a resized decoder from the old ones. Increasing the size will add newly initialized vectors at the end. + Reducing the size will remove vectors from the end + + Args: + old_lm_head_decoder (`tf.Variable`): + Old lm head decoder to be resized. + new_num_tokens (`int`, *optional*): + New number of tokens in the linear matrix. + + Increasing the size will add newly initialized vectors at the end. Reducing the size will remove + vectors from the end. If not provided or `None`, just returns None + + Return: + `tf.Variable`: Pointer to the resized decoder or None if the output embeddings are different from the input + ones. + """ + new_lm_head_decoder = old_lm_head_decoder + is_input_output_equals = tf.reduce_any( + self._get_word_embedding_weight(self.get_input_embeddings()) == old_lm_head_decoder + ) + + if old_lm_head_decoder is not None and not is_input_output_equals: + old_embedding_dim = shape_list(old_lm_head_decoder)[1] + decoder_mask, current_decoder = init_copy_embeddings(old_lm_head_decoder, new_num_tokens) + new_lm_head_decoder = self.add_weight( + shape=(new_num_tokens, old_embedding_dim), + initializer="zeros", + trainable=True, + name=old_lm_head_decoder.name.split(":")[0], + ) + init_decoder = tf.where(decoder_mask, current_decoder, new_lm_head_decoder.value()) + + new_lm_head_decoder.assign(init_decoder) + + return new_lm_head_decoder + + def _get_resized_embeddings(self, old_embeddings, new_num_tokens=None) -> tf.Variable: + """ + Build a resized Embedding weights from a provided token Embedding weights. Increasing the size will add newly + initialized vectors at the end. Reducing the size will remove vectors from the end + + Args: + old_embeddings (`tf.Variable`): + Old embeddings to be resized. + new_num_tokens (`int`, *optional*): + New number of tokens in the embedding matrix. + + Increasing the size will add newly initialized vectors at the end. Reducing the size will remove + vectors from the end. If not provided or `None`, just returns a pointer to the input tokens + `tf.Variable` module of the model without doing anything. + + Return: + `tf.Variable`: Pointer to the resized Embedding Module or the old Embedding Module if `new_num_tokens` is + `None` + """ + # TODO (joao): flagged for replacement (by `_v2_get_resized_embeddings`) due to embeddings refactor + old_embedding_dim = shape_list(old_embeddings)[1] + init_range = getattr(self.config, "initializer_range", 0.02) + embeddings_mask, current_embeddings = init_copy_embeddings(old_embeddings, new_num_tokens) + new_embeddings = self.add_weight( + name=old_embeddings.name.split(":")[0], + shape=[new_num_tokens, old_embedding_dim], + initializer=get_initializer(init_range), + dtype=tf.float32, + ) + init_embeddings = tf.where(embeddings_mask, current_embeddings, new_embeddings.value()) + + new_embeddings.assign(init_embeddings) + + return new_embeddings + + def _v2_get_resized_embeddings( + self, old_embeddings: keras.layers.Embedding, new_num_tokens: int + ) -> keras.layers.Embedding: + """ + Build a resized Embedding layer from a provided Embedding layer. Increasing the size will add newly initialized + vectors at the end. Reducing the size will remove vectors from the end. + + Args: + old_embeddings (`keras.layers.Embedding`): + Old embeddings to be resized. + new_num_tokens (`int`, *optional*): + New number of tokens in the embedding matrix. + + Return: + `keras.layers.Embedding`: Resized Embedding layer. + """ + + # Get the initialization range for the embeddings + init_range = 0.02 # default value + potential_initialization_variable_names = [ + "initializer_range", # most common + "initializer_factor", # e.g. T5 + "init_std", # e.g BART + ] + for var_name in potential_initialization_variable_names: + if hasattr(self.config, var_name): + init_range = getattr(self.config, var_name) + + # Get a new (initialized) embeddings layer + new_embeddings = keras.layers.Embedding( + input_dim=new_num_tokens, + output_dim=old_embeddings.output_dim, + embeddings_initializer=keras.initializers.TruncatedNormal(stddev=init_range), + name=old_embeddings.embeddings.name[:-13], # exact same scoped name except "/embeddings:0" + ) + new_embeddings(tf.constant([[0]])) + + # Copy the old embeddings to the new embeddings + if old_embeddings.input_dim >= new_num_tokens: + init_embeddings = old_embeddings.embeddings[:new_num_tokens] + else: + init_embeddings = tf.concat( + [old_embeddings.embeddings, new_embeddings.embeddings[old_embeddings.input_dim :]], axis=0 + ) + new_embeddings.embeddings.assign(init_embeddings) + return new_embeddings + + def prune_heads(self, heads_to_prune): + """ + Prunes heads of the base model. + + Arguments: + heads_to_prune (`Dict[int, List[int]]`): + Dictionary with keys being selected layer indices (`int`) and associated values being the list of heads + to prune in said layer (list of `int`). For instance {1: [0, 2], 2: [2, 3]} will prune heads 0 and 2 on + layer 1 and heads 2 and 3 on layer 2. + """ + raise NotImplementedError + + def save_pretrained( + self, + save_directory, + saved_model=False, + version=1, + push_to_hub=False, + signatures=None, + max_shard_size: Union[int, str] = "5GB", + create_pr: bool = False, + safe_serialization: bool = False, + token: Optional[Union[str, bool]] = None, + **kwargs, + ): + """ + Save a model and its configuration file to a directory, so that it can be re-loaded using the + [`~TFPreTrainedModel.from_pretrained`] class method. + + Arguments: + save_directory (`str`): + Directory to which to save. Will be created if it doesn't exist. + saved_model (`bool`, *optional*, defaults to `False`): + If the model has to be saved in saved model format as well or not. + version (`int`, *optional*, defaults to 1): + The version of the saved model. A saved model needs to be versioned in order to be properly loaded by + TensorFlow Serving as detailed in the official documentation + https://www.tensorflow.org/tfx/serving/serving_basic + push_to_hub (`bool`, *optional*, defaults to `False`): + Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the + repository you want to push to with `repo_id` (will default to the name of `save_directory` in your + namespace). + signatures (`dict` or `tf.function`, *optional*): + Model's signature used for serving. This will be passed to the `signatures` argument of model.save(). + max_shard_size (`int` or `str`, *optional*, defaults to `"10GB"`): + The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size + lower than this size. If expressed as a string, needs to be digits followed by a unit (like `"5MB"`). + + + + If a single weight of the model is bigger than `max_shard_size`, it will be in its own checkpoint shard + which will be bigger than `max_shard_size`. + + + + create_pr (`bool`, *optional*, defaults to `False`): + Whether or not to create a PR with the uploaded files or directly commit. + safe_serialization (`bool`, *optional*, defaults to `False`): + Whether to save the model using `safetensors` or the traditional TensorFlow way (that uses `h5`). + token (`str` or `bool`, *optional*): + The token to use as HTTP bearer authorization for remote files. If `True`, or not specified, will use + the token generated when running `huggingface-cli login` (stored in `~/.huggingface`). + kwargs (`Dict[str, Any]`, *optional*): + Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method. + """ + use_auth_token = kwargs.pop("use_auth_token", None) + + if use_auth_token is not None: + warnings.warn( + "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.", + FutureWarning, + ) + if token is not None: + raise ValueError( + "`token` and `use_auth_token` are both specified. Please set only the argument `token`." + ) + token = use_auth_token + + if token is not None: + kwargs["token"] = token + + if os.path.isfile(save_directory): + logger.error(f"Provided path ({save_directory}) should be a directory, not a file") + return + + os.makedirs(save_directory, exist_ok=True) + + if push_to_hub: + commit_message = kwargs.pop("commit_message", None) + repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1]) + repo_id = self._create_repo(repo_id, **kwargs) + files_timestamps = self._get_files_timestamps(save_directory) + + if saved_model: + # If `torch_dtype` is in the config with a torch dtype class as the value, we need to change it to string. + # (Although TF doesn't care about this attribute, we can't just remove it or set it to `None`.) + if getattr(self.config, "torch_dtype", None) is not None and not isinstance(self.config.torch_dtype, str): + self.config.torch_dtype = str(self.config.torch_dtype).split(".")[1] + if signatures is None: + serving_default = self.serving.get_concrete_function(self.input_signature) + if any(spec.dtype == tf.int32 for spec in self.input_signature.values()): + int64_spec = { + key: tf.TensorSpec( + shape=spec.shape, dtype=tf.int64 if spec.dtype == tf.int32 else spec.dtype, name=spec.name + ) + for key, spec in self.input_signature.items() + } + int64_serving = self.serving.get_concrete_function(int64_spec) + signatures = {"serving_default": serving_default, "int64_serving": int64_serving} + else: + signatures = serving_default + saved_model_dir = os.path.join(save_directory, "saved_model", str(version)) + self.save(saved_model_dir, include_optimizer=False, signatures=signatures) + logger.info(f"Saved model created in {saved_model_dir}") + + # Save configuration file + self.config.architectures = [self.__class__.__name__[2:]] + + # If we have a custom model, we copy the file defining it in the folder and set the attributes so it can be + # loaded from the Hub. + if self._auto_class is not None: + custom_object_save(self, save_directory, config=self.config) + + self.config.save_pretrained(save_directory) + if self.can_generate(): + self.generation_config.save_pretrained(save_directory) + + # If we save using the predefined names, we can load using `from_pretrained` + weights_name = SAFE_WEIGHTS_NAME if safe_serialization else TF2_WEIGHTS_NAME + output_model_file = os.path.join(save_directory, weights_name) + + shards, index = tf_shard_checkpoint(self.weights, max_shard_size, weights_name=weights_name) + + # Clean the folder from a previous save + for filename in os.listdir(save_directory): + full_filename = os.path.join(save_directory, filename) + # If we have a shard file that is not going to be replaced, we delete it, but only from the main process + # in distributed settings to avoid race conditions. + weights_no_suffix = weights_name.replace(".bin", "").replace(".safetensors", "") + if ( + filename.startswith(weights_no_suffix) + and os.path.isfile(full_filename) + and filename not in shards.keys() + ): + os.remove(full_filename) + + if index is None: + if safe_serialization: + state_dict = {strip_model_name_and_prefix(w.name): w.value() for w in self.weights} + safe_save_file(state_dict, output_model_file, metadata={"format": "tf"}) + else: + self.save_weights(output_model_file) + logger.info(f"Model weights saved in {output_model_file}") + else: + save_index_file = SAFE_WEIGHTS_INDEX_NAME if safe_serialization else TF2_WEIGHTS_INDEX_NAME + save_index_file = os.path.join(save_directory, save_index_file) + # Save the index as well + with open(save_index_file, "w", encoding="utf-8") as index_file: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + index_file.write(content) + logger.info( + f"The model is bigger than the maximum size per checkpoint ({max_shard_size}) and is going to be " + f"split in {len(shards)} checkpoint shards. You can find where each parameters has been saved in the " + f"index located at {save_index_file}." + ) + for shard_file, shard in shards.items(): + if safe_serialization: + shard_state_dict = {strip_model_name_and_prefix(w.name): w.value() for w in shard} + safe_save_file( + shard_state_dict, os.path.join(save_directory, shard_file), metadata={"format": "tf"} + ) + else: + with h5py.File(os.path.join(save_directory, shard_file), mode="w") as shard_file: + layers = [] + for layer in sorted(shard, key=lambda x: x.name): + if "model." in layer.name or len(layer.name.split("/")) == 1: + layer_name = layer.name + else: + layer_name = "/".join(layer.name.split("/")[1:]) + param_dset = shard_file.create_dataset( + layer_name, layer.numpy().shape, dtype=layer.numpy().dtype + ) + param_dset[:] = layer.numpy() + layers.append(layer_name.encode("utf8")) + save_attributes_to_hdf5_group(shard_file, "layer_names", layers) + + if push_to_hub: + self._upload_modified_files( + save_directory, + repo_id, + files_timestamps, + commit_message=commit_message, + token=token, + ) + + @classmethod + def from_pretrained( + cls, + pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], + *model_args, + config: Optional[Union[PretrainedConfig, str, os.PathLike]] = None, + cache_dir: Optional[Union[str, os.PathLike]] = None, + ignore_mismatched_sizes: bool = False, + force_download: bool = False, + local_files_only: bool = False, + token: Optional[Union[str, bool]] = None, + revision: str = "main", + use_safetensors: Optional[bool] = None, + **kwargs, + ): + r""" + Instantiate a pretrained TF 2.0 model from a pre-trained model configuration. + + The warning *Weights from XXX not initialized from pretrained model* means that the weights of XXX do not come + pretrained with the rest of the model. It is up to you to train those weights with a downstream fine-tuning + task. + + The warning *Weights from XXX not used in YYY* means that the layer XXX is not used by YYY, therefore those + weights are discarded. + + Parameters: + pretrained_model_name_or_path (`str`, *optional*): + Can be either: + + - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co. + - A path to a *directory* containing model weights saved using + [`~TFPreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`. + - A path or url to a *PyTorch state_dict save file* (e.g, `./pt_model/pytorch_model.bin`). In this + case, `from_pt` should be set to `True` and a configuration object should be provided as `config` + argument. This loading path is slower than converting the PyTorch model in a TensorFlow model + using the provided conversion scripts and loading the TensorFlow model afterwards. + - `None` if you are both providing the configuration and state dictionary (resp. with keyword + arguments `config` and `state_dict`). + model_args (sequence of positional arguments, *optional*): + All remaining positional arguments will be passed to the underlying model's `__init__` method. + config (`Union[PretrainedConfig, str]`, *optional*): + Can be either: + + - an instance of a class derived from [`PretrainedConfig`], + - a string valid as input to [`~PretrainedConfig.from_pretrained`]. + + Configuration for the model to use instead of an automatically loaded configuration. Configuration can + be automatically loaded when: + + - The model is a model provided by the library (loaded with the *model id* string of a pretrained + model). + - The model was saved using [`~TFPreTrainedModel.save_pretrained`] and is reloaded by supplying the + save directory. + - The model is loaded by supplying a local directory as `pretrained_model_name_or_path` and a + configuration JSON file named *config.json* is found in the directory. + from_pt (`bool`, *optional*, defaults to `False`): + Load the model weights from a PyTorch state_dict save file (see docstring of + `pretrained_model_name_or_path` argument). + ignore_mismatched_sizes (`bool`, *optional*, defaults to `False`): + Whether or not to raise an error if some of the weights from the checkpoint do not have the same size + as the weights of the model (if for instance, you are instantiating a model with 10 labels from a + checkpoint with 3 labels). + cache_dir (`str`, *optional*): + Path to a directory in which a downloaded pretrained model configuration should be cached if the + standard cache should not be used. + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force the (re-)download of the model weights and configuration files, overriding the + cached versions if they exist. + resume_download: + Deprecated and ignored. All downloads are now resumed by default when possible. + Will be removed in v5 of Transformers. + proxies: + (`Dict[str, str], `optional`): A dictionary of proxy servers to use by protocol or endpoint, e.g., + `{'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. + output_loading_info(`bool`, *optional*, defaults to `False`): Whether ot not to also return a + dictionary containing missing keys, unexpected keys and error messages. + local_files_only(`bool`, *optional*, defaults to `False`): + Whether or not to only look at local files (e.g., not try downloading the model). + token (`str` or `bool`, *optional*): + The token to use as HTTP bearer authorization for remote files. If `True`, or not specified, will use + the token generated when running `huggingface-cli login` (stored in `~/.huggingface`). + revision (`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a + git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any + identifier allowed by git. + + + + + To test a pull request you made on the Hub, you can pass `revision="refs/pr/"`. + + + + mirror (`str`, *optional*): + Mirror source to accelerate downloads in China. If you are from China and have an accessibility + problem, you can set this option to resolve it. Note that we do not guarantee the timeliness or safety. + Please refer to the mirror site for more information. + subfolder (`str`, *optional*, defaults to `""`): + In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can + specify the folder name here. + tf_to_pt_weight_rename (`Callable`, *optional*): + A function that is called to transform the names of weights during the PyTorch to TensorFlow + crossloading process. This is not necessary for most models, but is useful to allow composite models to + be crossloaded correctly. + use_safetensors (`bool`, *optional*, defaults to `None`): + Whether or not to use `safetensors` checkpoints. Defaults to `None`. If not specified and `safetensors` + is not installed, it will be set to `False`. + kwargs (remaining dictionary of keyword arguments, *optional*): + Can be used to update the configuration object (after it being loaded) and initiate the model (e.g., + `output_attentions=True`). Behaves differently depending on whether a `config` is provided or + automatically loaded: + + - If a configuration is provided with `config`, `**kwargs` will be directly passed to the + underlying model's `__init__` method (we assume all relevant updates to the configuration have + already been done) + - If a configuration is not provided, `kwargs` will be first passed to the configuration class + initialization function ([`~PretrainedConfig.from_pretrained`]). Each key of `kwargs` that + corresponds to a configuration attribute will be used to override said attribute with the + supplied `kwargs` value. Remaining keys that do not correspond to any configuration attribute + will be passed to the underlying model's `__init__` function. + + Examples: + + ```python + >>> from transformers import BertConfig, TFBertModel + + >>> # Download model and configuration from huggingface.co and cache. + >>> model = TFBertModel.from_pretrained("google-bert/bert-base-uncased") + >>> # Model was saved using *save_pretrained('./test/saved_model/')* (for example purposes, not runnable). + >>> model = TFBertModel.from_pretrained("./test/saved_model/") + >>> # Update configuration during loading. + >>> model = TFBertModel.from_pretrained("google-bert/bert-base-uncased", output_attentions=True) + >>> assert model.config.output_attentions == True + >>> # Loading from a Pytorch model file instead of a TensorFlow checkpoint (slower, for example purposes, not runnable). + >>> config = BertConfig.from_json_file("./pt_model/my_pt_model_config.json") + >>> model = TFBertModel.from_pretrained("./pt_model/my_pytorch_model.bin", from_pt=True, config=config) + ```""" + from_pt = kwargs.pop("from_pt", False) + resume_download = kwargs.pop("resume_download", None) + proxies = kwargs.pop("proxies", None) + output_loading_info = kwargs.pop("output_loading_info", False) + use_auth_token = kwargs.pop("use_auth_token", None) + trust_remote_code = kwargs.pop("trust_remote_code", None) + _ = kwargs.pop("mirror", None) + load_weight_prefix = kwargs.pop("load_weight_prefix", None) + from_pipeline = kwargs.pop("_from_pipeline", None) + from_auto_class = kwargs.pop("_from_auto", False) + subfolder = kwargs.pop("subfolder", "") + commit_hash = kwargs.pop("_commit_hash", None) + tf_to_pt_weight_rename = kwargs.pop("tf_to_pt_weight_rename", None) + + # Not relevant for TF models + _ = kwargs.pop("adapter_kwargs", None) + + if use_auth_token is not None: + warnings.warn( + "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.", + FutureWarning, + ) + if token is not None: + raise ValueError( + "`token` and `use_auth_token` are both specified. Please set only the argument `token`." + ) + token = use_auth_token + + if trust_remote_code is True: + logger.warning( + "The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is" + " ignored." + ) + + user_agent = {"file_type": "model", "framework": "tensorflow", "from_auto_class": from_auto_class} + if from_pipeline is not None: + user_agent["using_pipeline"] = from_pipeline + + if is_offline_mode() and not local_files_only: + logger.info("Offline mode: forcing local_files_only=True") + local_files_only = True + + if use_safetensors is None and not is_safetensors_available(): + use_safetensors = False + + # Load config if we don't provide a configuration + if not isinstance(config, PretrainedConfig): + config_path = config if config is not None else pretrained_model_name_or_path + config, model_kwargs = cls.config_class.from_pretrained( + config_path, + cache_dir=cache_dir, + return_unused_kwargs=True, + force_download=force_download, + resume_download=resume_download, + proxies=proxies, + local_files_only=local_files_only, + token=token, + revision=revision, + _from_auto=from_auto_class, + _from_pipeline=from_pipeline, + _commit_hash=commit_hash, + **kwargs, + ) + else: + model_kwargs = kwargs + + if commit_hash is None: + commit_hash = getattr(config, "_commit_hash", None) + + # This variable will flag if we're loading a sharded checkpoint. In this case the archive file is just the + # index of the files. + is_sharded = False + # Load model + if pretrained_model_name_or_path is not None: + pretrained_model_name_or_path = str(pretrained_model_name_or_path) + is_local = os.path.isdir(pretrained_model_name_or_path) + if is_local: + if from_pt and os.path.isfile(os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)): + # Load from a PyTorch checkpoint in priority if from_pt + archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME) + elif from_pt and os.path.isfile(os.path.join(pretrained_model_name_or_path, WEIGHTS_INDEX_NAME)): + # Load from a sharded PyTorch checkpoint + archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_INDEX_NAME) + is_sharded = True + elif use_safetensors is not False and os.path.isfile( + os.path.join(pretrained_model_name_or_path, SAFE_WEIGHTS_NAME) + ): + # Load from a safetensors checkpoint + archive_file = os.path.join(pretrained_model_name_or_path, SAFE_WEIGHTS_NAME) + elif use_safetensors is not False and os.path.isfile( + os.path.join(pretrained_model_name_or_path, SAFE_WEIGHTS_INDEX_NAME) + ): + # Load from a sharded safetensors checkpoint + archive_file = os.path.join(pretrained_model_name_or_path, SAFE_WEIGHTS_INDEX_NAME) + is_sharded = True + elif os.path.isfile(os.path.join(pretrained_model_name_or_path, TF2_WEIGHTS_NAME)): + # Load from a TF 2.0 checkpoint + archive_file = os.path.join(pretrained_model_name_or_path, TF2_WEIGHTS_NAME) + elif os.path.isfile(os.path.join(pretrained_model_name_or_path, TF2_WEIGHTS_INDEX_NAME)): + # Load from a sharded TF 2.0 checkpoint + archive_file = os.path.join(pretrained_model_name_or_path, TF2_WEIGHTS_INDEX_NAME) + is_sharded = True + + # At this stage we don't have a weight file so we will raise an error. + elif use_safetensors: + raise EnvironmentError( + f"Error no file named {SAFE_WEIGHTS_NAME} or {SAFE_WEIGHTS_INDEX_NAME} found in directory {pretrained_model_name_or_path}. " + f"Please make sure that the model has been saved with `safe_serialization=True` or do not " + f"set `use_safetensors=True`." + ) + elif os.path.isfile(os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)) or os.path.isfile( + os.path.join(pretrained_model_name_or_path, WEIGHTS_INDEX_NAME) + ): + raise EnvironmentError( + f"Error no file named {TF2_WEIGHTS_NAME} or {SAFE_WEIGHTS_NAME} found in directory {pretrained_model_name_or_path} " + "but there is a file for PyTorch weights. Use `from_pt=True` to load this model from those " + "weights." + ) + else: + raise EnvironmentError( + f"Error no file named {TF2_WEIGHTS_NAME}, {SAFE_WEIGHTS_NAME} or {WEIGHTS_NAME} found in directory " + f"{pretrained_model_name_or_path}." + ) + elif os.path.isfile(pretrained_model_name_or_path): + archive_file = pretrained_model_name_or_path + is_local = True + elif os.path.isfile(pretrained_model_name_or_path + ".index"): + archive_file = pretrained_model_name_or_path + ".index" + is_local = True + elif is_remote_url(pretrained_model_name_or_path): + filename = pretrained_model_name_or_path + resolved_archive_file = download_url(pretrained_model_name_or_path) + else: + # set correct filename + if from_pt: + filename = WEIGHTS_NAME + elif use_safetensors is not False: + filename = SAFE_WEIGHTS_NAME + else: + filename = TF2_WEIGHTS_NAME + + try: + # Load from URL or cache if already cached + cached_file_kwargs = { + "cache_dir": cache_dir, + "force_download": force_download, + "proxies": proxies, + "resume_download": resume_download, + "local_files_only": local_files_only, + "token": token, + "user_agent": user_agent, + "revision": revision, + "subfolder": subfolder, + "_raise_exceptions_for_gated_repo": False, + "_raise_exceptions_for_missing_entries": False, + "_commit_hash": commit_hash, + } + resolved_archive_file = cached_file(pretrained_model_name_or_path, filename, **cached_file_kwargs) + + # Since we set _raise_exceptions_for_missing_entries=False, we don't get an exception but a None + # result when internet is up, the repo and revision exist, but the file does not. + if resolved_archive_file is None and filename == SAFE_WEIGHTS_NAME: + # Did not find the safetensors file, let's fallback to TF. + # No support for sharded safetensors yet, so we'll raise an error if that's all we find. + filename = TF2_WEIGHTS_NAME + resolved_archive_file = cached_file( + pretrained_model_name_or_path, TF2_WEIGHTS_NAME, **cached_file_kwargs + ) + if resolved_archive_file is None and filename == TF2_WEIGHTS_NAME: + # Maybe the checkpoint is sharded, we try to grab the index name in this case. + resolved_archive_file = cached_file( + pretrained_model_name_or_path, TF2_WEIGHTS_INDEX_NAME, **cached_file_kwargs + ) + if resolved_archive_file is not None: + is_sharded = True + if resolved_archive_file is None and filename == WEIGHTS_NAME: + # Maybe the checkpoint is sharded, we try to grab the index name in this case. + resolved_archive_file = cached_file( + pretrained_model_name_or_path, WEIGHTS_INDEX_NAME, **cached_file_kwargs + ) + if resolved_archive_file is not None: + is_sharded = True + if resolved_archive_file is None: + # Otherwise, maybe there is a PyTorch or Flax model file. We try those to give a helpful error + # message. + has_file_kwargs = { + "revision": revision, + "proxies": proxies, + "token": token, + "cache_dir": cache_dir, + "local_files_only": local_files_only, + } + if has_file(pretrained_model_name_or_path, SAFE_WEIGHTS_INDEX_NAME, **has_file_kwargs): + is_sharded = True + elif has_file(pretrained_model_name_or_path, WEIGHTS_NAME, **has_file_kwargs): + raise EnvironmentError( + f"{pretrained_model_name_or_path} does not appear to have a file named" + f" {TF2_WEIGHTS_NAME} but there is a file for PyTorch weights. Use `from_pt=True` to" + " load this model from those weights." + ) + else: + raise EnvironmentError( + f"{pretrained_model_name_or_path} does not appear to have a file named {WEIGHTS_NAME}," + f" {TF2_WEIGHTS_NAME} or {TF_WEIGHTS_NAME}" + ) + + except EnvironmentError: + # Raise any environment error raise by `cached_file`. It will have a helpful error message adapted + # to the original exception. + raise + except Exception: + # For any other exception, we throw a generic error. + + raise EnvironmentError( + f"Can't load the model for '{pretrained_model_name_or_path}'. If you were trying to load it" + " from 'https://huggingface.co/models', make sure you don't have a local directory with the" + f" same name. Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a" + f" directory containing a file named {WEIGHTS_NAME}, {TF2_WEIGHTS_NAME} or {TF_WEIGHTS_NAME}" + ) + if is_local: + logger.info(f"loading weights file {archive_file}") + resolved_archive_file = archive_file + filename = resolved_archive_file.split(os.path.sep)[-1] + else: + logger.info(f"loading weights file {filename} from cache at {resolved_archive_file}") + else: + resolved_archive_file = None + + # We'll need to download and cache each checkpoint shard if the checkpoint is sharded. + if is_sharded: + # resolved_archive_file becomes a list of files that point to the different checkpoint shards in this case. + resolved_archive_file, sharded_metadata = get_checkpoint_shard_files( + pretrained_model_name_or_path, + resolved_archive_file, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + resume_download=resume_download, + local_files_only=local_files_only, + token=token, + user_agent=user_agent, + revision=revision, + _commit_hash=commit_hash, + ) + + safetensors_from_pt = False + if filename == SAFE_WEIGHTS_NAME: + with safe_open(resolved_archive_file, framework="tf") as f: + safetensors_metadata = f.metadata() + if safetensors_metadata is None or safetensors_metadata.get("format") not in ["pt", "tf", "flax", "mlx"]: + raise OSError( + f"The safetensors archive passed at {resolved_archive_file} does not contain the valid metadata." + " Make sure you save your model with the `save_pretrained` method." + ) + safetensors_from_pt = safetensors_metadata.get("format") == "pt" + elif filename == SAFE_WEIGHTS_INDEX_NAME: + with safe_open(resolved_archive_file[0], framework="tf") as f: + safetensors_metadata = f.metadata() + if safetensors_metadata is None or safetensors_metadata.get("format") not in ["pt", "tf", "flax", "mlx"]: + raise OSError( + f"The safetensors archive passed at {resolved_archive_file} does not contain the valid metadata." + " Make sure you save your model with the `save_pretrained` method." + ) + safetensors_from_pt = safetensors_metadata.get("format") == "pt" + + config.name_or_path = pretrained_model_name_or_path + + # composed models, *e.g.* TFRag, require special treatment when it comes to loading + # pre-trained weights. + if cls._requires_load_weight_prefix and model_kwargs.get("name") is not None: + model_kwargs["load_weight_prefix"] = load_weight_prefix + "/" + model_kwargs.get("name") + + # Instantiate model. + model = cls(config, *model_args, **model_kwargs) + + if tf_to_pt_weight_rename is None and hasattr(model, "tf_to_pt_weight_rename"): + # TODO Matt: This is a temporary workaround to allow weight renaming, but requires a method + # to be defined for each class that requires a rename. We can probably just have a class-level + # dict and a single top-level method or something and cut down a lot of boilerplate code + tf_to_pt_weight_rename = model.tf_to_pt_weight_rename + + if from_pt: + from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model + + # Load from a PyTorch checkpoint + return load_pytorch_checkpoint_in_tf2_model( + model, + resolved_archive_file, + allow_missing_keys=True, + output_loading_info=output_loading_info, + _prefix=load_weight_prefix, + tf_to_pt_weight_rename=tf_to_pt_weight_rename, + ) + + # we might need to extend the variable scope for composite models + if load_weight_prefix is not None: + with tf.compat.v1.variable_scope(load_weight_prefix): + model.build_in_name_scope() # build the network with dummy inputs + else: + model.build_in_name_scope() # build the network with dummy inputs + + if safetensors_from_pt and not is_sharded: + from .modeling_tf_pytorch_utils import load_pytorch_state_dict_in_tf2_model + + with safe_open(resolved_archive_file, framework="tf") as safetensors_archive: + # Load from a PyTorch safetensors checkpoint + # We load in TF format here because PT weights often need to be transposed, and this is much + # faster on GPU. Loading as numpy and transposing on CPU adds several seconds to load times. + return load_pytorch_state_dict_in_tf2_model( + model, + safetensors_archive, + tf_inputs=False, # No need to build the model again + allow_missing_keys=True, + output_loading_info=output_loading_info, + _prefix=load_weight_prefix, + ignore_mismatched_sizes=ignore_mismatched_sizes, + tf_to_pt_weight_rename=tf_to_pt_weight_rename, + ) + elif safetensors_from_pt: + from .modeling_tf_pytorch_utils import load_sharded_pytorch_safetensors_in_tf2_model + + return load_sharded_pytorch_safetensors_in_tf2_model( + model, + resolved_archive_file, + tf_inputs=False, + allow_missing_keys=True, + output_loading_info=output_loading_info, + _prefix=load_weight_prefix, + ignore_mismatched_sizes=ignore_mismatched_sizes, + tf_to_pt_weight_rename=tf_to_pt_weight_rename, + ) + + # 'by_name' allow us to do transfer learning by skipping/adding layers + # see https://github.com/tensorflow/tensorflow/blob/00fad90125b18b80fe054de1055770cfb8fe4ba3/tensorflow/python/keras/engine/network.py#L1339-L1357 + try: + if is_sharded: + for file in resolved_archive_file: + os.path.isfile(file), f"Error retrieving files {file}" + if filename == SAFE_WEIGHTS_INDEX_NAME: + missing_keys, unexpected_keys, mismatched_keys = load_tf_sharded_weights_from_safetensors( + model, + resolved_archive_file, + ignore_mismatched_sizes=ignore_mismatched_sizes, + _prefix=load_weight_prefix, + ) + else: + missing_keys, unexpected_keys, mismatched_keys = load_tf_sharded_weights( + model, + resolved_archive_file, + ignore_mismatched_sizes=ignore_mismatched_sizes, + _prefix=load_weight_prefix, + ) + else: + # Handles both H5 and safetensors + missing_keys, unexpected_keys, mismatched_keys = load_tf_weights( + model, + resolved_archive_file, + ignore_mismatched_sizes=ignore_mismatched_sizes, + _prefix=load_weight_prefix, + ) + except OSError as e: + try: + with open(resolved_archive_file) as f: + if f.read().startswith("version"): + raise OSError( + "You seem to have cloned a repository without having git-lfs installed. Please install " + "git-lfs and run `git lfs install` followed by `git lfs pull` in the folder " + "you cloned." + ) + else: + raise ValueError from e + except (UnicodeDecodeError, ValueError): + raise OSError( + "Unable to load weights from h5 file. " + "If you tried to load a TF 2.0 model from a PyTorch checkpoint, please set from_pt=True. " + ) + + if cls._keys_to_ignore_on_load_missing is not None: + for pat in cls._keys_to_ignore_on_load_missing: + missing_keys = [k for k in missing_keys if re.search(pat, k) is None] + + if cls._keys_to_ignore_on_load_unexpected is not None: + for pat in cls._keys_to_ignore_on_load_unexpected: + unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None] + + if len(unexpected_keys) > 0: + logger.warning( + f"Some layers from the model checkpoint at {pretrained_model_name_or_path} were not used when" + f" initializing {model.__class__.__name__}: {unexpected_keys}\n- This IS expected if you are" + f" initializing {model.__class__.__name__} from the checkpoint of a model trained on another task or" + " with another architecture (e.g. initializing a BertForSequenceClassification model from a" + " BertForPreTraining model).\n- This IS NOT expected if you are initializing" + f" {model.__class__.__name__} from the checkpoint of a model that you expect to be exactly identical" + " (initializing a BertForSequenceClassification model from a BertForSequenceClassification model)." + ) + else: + logger.warning(f"All model checkpoint layers were used when initializing {model.__class__.__name__}.\n") + + if len(missing_keys) > 0: + logger.warning( + f"Some layers of {model.__class__.__name__} were not initialized from the model checkpoint at" + f" {pretrained_model_name_or_path} and are newly initialized: {missing_keys}\nYou should probably" + " TRAIN this model on a down-stream task to be able to use it for predictions and inference." + ) + elif len(mismatched_keys) == 0: + logger.warning( + f"All the layers of {model.__class__.__name__} were initialized from the model checkpoint at" + f" {pretrained_model_name_or_path}.\nIf your task is similar to the task the model of the checkpoint" + f" was trained on, you can already use {model.__class__.__name__} for predictions without further" + " training." + ) + if len(mismatched_keys) > 0: + mismatched_warning = "\n".join( + [ + f"- {key}: found shape {shape1} in the checkpoint and {shape2} in the model instantiated" + for key, shape1, shape2 in mismatched_keys + ] + ) + logger.warning( + f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at" + f" {pretrained_model_name_or_path} and are newly initialized because the shapes did not" + f" match:\n{mismatched_warning}\nYou should probably TRAIN this model on a down-stream task to be able" + " to use it for predictions and inference." + ) + + # If it is a model with generation capabilities, attempt to load the generation config + if model.can_generate(): + try: + model.generation_config = GenerationConfig.from_pretrained( + pretrained_model_name_or_path, + cache_dir=cache_dir, + force_download=force_download, + resume_download=resume_download, + proxies=proxies, + local_files_only=local_files_only, + token=token, + revision=revision, + subfolder=subfolder, + _from_auto=from_auto_class, + _from_pipeline=from_pipeline, + **kwargs, + ) + except OSError: + logger.info( + "Generation config file not found, using a generation config created from the model config." + ) + pass + + if output_loading_info: + loading_info = { + "missing_keys": missing_keys, + "unexpected_keys": unexpected_keys, + "mismatched_keys": mismatched_keys, + } + + return model, loading_info + + return model + + def push_to_hub( + self, + repo_id: str, + use_temp_dir: Optional[bool] = None, + commit_message: Optional[str] = None, + private: Optional[bool] = None, + max_shard_size: Optional[Union[int, str]] = "10GB", + token: Optional[Union[bool, str]] = None, + # (`use_auth_token` is deprecated: we have to keep it here as we don't have **kwargs) + use_auth_token: Optional[Union[bool, str]] = None, + create_pr: bool = False, + **base_model_card_args, + ) -> str: + """ + Upload the model files to the 🤗 Model Hub while synchronizing a local clone of the repo in `repo_path_or_name`. + + Parameters: + repo_id (`str`): + The name of the repository you want to push your model to. It should contain your organization name + when pushing to a given organization. + use_temp_dir (`bool`, *optional*): + Whether or not to use a temporary directory to store the files saved before they are pushed to the Hub. + Will default to `True` if there is no directory named like `repo_id`, `False` otherwise. + commit_message (`str`, *optional*): + Message to commit while pushing. Will default to `"Upload model"`. + private (`bool`, *optional*): + Whether to make the repo private. If `None` (default), the repo will be public unless the organization's default is private. This value is ignored if the repo already exists. + token (`bool` or `str`, *optional*): + The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated + when running `huggingface-cli login` (stored in `~/.huggingface`). Will default to `True` if `repo_url` + is not specified. + max_shard_size (`int` or `str`, *optional*, defaults to `"10GB"`): + Only applicable for models. The maximum size for a checkpoint before being sharded. Checkpoints shard + will then be each of size lower than this size. If expressed as a string, needs to be digits followed + by a unit (like `"5MB"`). + create_pr (`bool`, *optional*, defaults to `False`): + Whether or not to create a PR with the uploaded files or directly commit. + + Examples: + + ```python + from transformers import TFAutoModel + + model = TFAutoModel.from_pretrained("google-bert/bert-base-cased") + + # Push the model to your namespace with the name "my-finetuned-bert". + model.push_to_hub("my-finetuned-bert") + + # Push the model to an organization with the name "my-finetuned-bert". + model.push_to_hub("huggingface/my-finetuned-bert") + ``` + """ + if use_auth_token is not None: + warnings.warn( + "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.", + FutureWarning, + ) + if token is not None: + raise ValueError( + "`token` and `use_auth_token` are both specified. Please set only the argument `token`." + ) + token = use_auth_token + + if "repo_path_or_name" in base_model_card_args: + warnings.warn( + "The `repo_path_or_name` argument is deprecated and will be removed in v5 of Transformers. Use " + "`repo_id` instead." + ) + repo_id = base_model_card_args.pop("repo_path_or_name") + # Deprecation warning will be sent after for repo_url and organization + repo_url = base_model_card_args.pop("repo_url", None) + organization = base_model_card_args.pop("organization", None) + + if os.path.isdir(repo_id): + working_dir = repo_id + repo_id = repo_id.split(os.path.sep)[-1] + else: + working_dir = repo_id.split("/")[-1] + + repo_id = self._create_repo( + repo_id, private=private, token=token, repo_url=repo_url, organization=organization + ) + + if use_temp_dir is None: + use_temp_dir = not os.path.isdir(working_dir) + + with working_or_temp_dir(working_dir=working_dir, use_temp_dir=use_temp_dir) as work_dir: + files_timestamps = self._get_files_timestamps(work_dir) + + # Save all files. + self.save_pretrained(work_dir, max_shard_size=max_shard_size) + if hasattr(self, "history") and hasattr(self, "create_model_card"): + # This is a Keras model and we might be able to fish out its History and make a model card out of it + base_model_card_args = { + "output_dir": work_dir, + "model_name": Path(repo_id).name, + } + base_model_card_args.update(base_model_card_args) + self.create_model_card(**base_model_card_args) + + self._upload_modified_files( + work_dir, + repo_id, + files_timestamps, + commit_message=commit_message, + token=token, + create_pr=create_pr, + ) + + @classmethod + def register_for_auto_class(cls, auto_class="TFAutoModel"): + """ + Register this class with a given auto class. This should only be used for custom models as the ones in the + library are already mapped with an auto class. + + + + This API is experimental and may have some slight breaking changes in the next releases. + + + + Args: + auto_class (`str` or `type`, *optional*, defaults to `"TFAutoModel"`): + The auto class to register this new model with. + """ + if not isinstance(auto_class, str): + auto_class = auto_class.__name__ + + import transformers.models.auto as auto_module + + if not hasattr(auto_module, auto_class): + raise ValueError(f"{auto_class} is not a valid auto class.") + + cls._auto_class = auto_class + + +class TFConv1D(keras.layers.Layer): + """ + 1D-convolutional layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2). + + Basically works like a linear layer but the weights are transposed. + + Args: + nf (`int`): + The number of output features. + nx (`int`): + The number of input features. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation to use to initialize the weights. + kwargs (`Dict[str, Any]`, *optional*): + Additional keyword arguments passed along to the `__init__` of `keras.layers.Layer`. + """ + + def __init__(self, nf, nx, initializer_range=0.02, **kwargs): + super().__init__(**kwargs) + self.nf = nf + self.nx = nx + self.initializer_range = initializer_range + + def build(self, input_shape): + if self.built: + return + self.built = True + self.weight = self.add_weight( + "weight", shape=[self.nx, self.nf], initializer=get_initializer(self.initializer_range) + ) + self.bias = self.add_weight("bias", shape=[1, self.nf], initializer=tf.zeros_initializer()) + + def call(self, x): + bz, sl = shape_list(x)[:2] + + x = tf.reshape(x, [-1, self.nx]) + x = tf.matmul(x, self.weight) + self.bias + + x = tf.reshape(x, [bz, sl, self.nf]) + + return x + + +class TFSharedEmbeddings(keras.layers.Layer): + r""" + Construct shared token embeddings. + + The weights of the embedding layer is usually shared with the weights of the linear decoder when doing language + modeling. + + Args: + vocab_size (`int`): + The size of the vocabulary, e.g., the number of unique tokens. + hidden_size (`int`): + The size of the embedding vectors. + initializer_range (`float`, *optional*): + The standard deviation to use when initializing the weights. If no value is provided, it will default to + \\(1/\sqrt{hidden\_size}\\). + kwargs (`Dict[str, Any]`, *optional*): + Additional keyword arguments passed along to the `__init__` of `keras.layers.Layer`. + """ + + # TODO (joao): flagged for delection due to embeddings refactor + + def __init__(self, vocab_size: int, hidden_size: int, initializer_range: Optional[float] = None, **kwargs): + super().__init__(**kwargs) + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.initializer_range = hidden_size**-0.5 if initializer_range is None else initializer_range + warnings.warn( + "`TFSharedEmbeddings` is scheduled for deletion in v4.32, use `keras.layers.Embedding` instead.", + DeprecationWarning, + ) + + def build(self, input_shape): + """ + Build shared token embedding layer Shared weights logic adapted from + https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24 + """ + self.weight = self.add_weight( + "weight", shape=[self.vocab_size, self.hidden_size], initializer=get_initializer(self.initializer_range) + ) + super().build(input_shape) + + def get_config(self): + config = { + "vocab_size": self.vocab_size, + "hidden_size": self.hidden_size, + "initializer_range": self.initializer_range, + } + base_config = super().get_config() + + return dict(list(base_config.items()) + list(config.items())) + + def call(self, inputs: tf.Tensor, mode: str = "embedding") -> tf.Tensor: + """ + Get token embeddings of inputs or decode final hidden state. + + Args: + inputs (`tf.Tensor`): + In embedding mode, should be an int64 tensor with shape `[batch_size, length]`. + + In linear mode, should be a float tensor with shape `[batch_size, length, hidden_size]`. + mode (`str`, defaults to `"embedding"`): + A valid value is either `"embedding"` or `"linear"`, the first one indicates that the layer should be + used as an embedding layer, the second one that the layer should be used as a linear decoder. + + Returns: + `tf.Tensor`: In embedding mode, the output is a float32 embedding tensor, with shape `[batch_size, length, + embedding_size]`. + + In linear mode, the output is a float32 with shape `[batch_size, length, vocab_size]`. + + Raises: + ValueError: if `mode` is not valid. + + Shared weights logic is adapted from + [here](https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24). + """ + if mode == "embedding": + return self._embedding(inputs) + elif mode == "linear": + return self._linear(inputs) + else: + raise ValueError(f"mode {mode} is not valid.") + + def _embedding(self, input_ids): + """Applies embedding based on inputs tensor.""" + return tf.gather(self.weight, input_ids) + + def _linear(self, inputs): + """ + Computes logits by running inputs through a linear layer. + + Args: + inputs: A float32 tensor with shape [..., hidden_size] + + Returns: + float32 tensor with shape [..., vocab_size]. + """ + first_dims = shape_list(inputs)[:-1] + x = tf.reshape(inputs, [-1, self.hidden_size]) + logits = tf.matmul(x, self.weight, transpose_b=True) + + return tf.reshape(logits, first_dims + [self.vocab_size]) + + +class TFSequenceSummary(keras.layers.Layer): + """ + Compute a single vector summary of a sequence hidden states. + + Args: + config ([`PretrainedConfig`]): + The config used by the model. Relevant arguments in the config class of the model are (refer to the actual + config class of your model for the default values it uses): + + - **summary_type** (`str`) -- The method to use to make this summary. Accepted values are: + + - `"last"` -- Take the last token hidden state (like XLNet) + - `"first"` -- Take the first token hidden state (like Bert) + - `"mean"` -- Take the mean of all tokens hidden states + - `"cls_index"` -- Supply a Tensor of classification token position (GPT/GPT-2) + - `"attn"` -- Not implemented now, use multi-head attention + + - **summary_use_proj** (`bool`) -- Add a projection after the vector extraction. + - **summary_proj_to_labels** (`bool`) -- If `True`, the projection outputs to `config.num_labels` classes + (otherwise to `config.hidden_size`). + - **summary_activation** (`Optional[str]`) -- Set to `"tanh"` to add a tanh activation to the output, + another string or `None` will add no activation. + - **summary_first_dropout** (`float`) -- Optional dropout probability before the projection and activation. + - **summary_last_dropout** (`float`)-- Optional dropout probability after the projection and activation. + + initializer_range (`float`, *optional*, defaults to 0.02): The standard deviation to use to initialize the weights. + kwargs (`Dict[str, Any]`, *optional*): + Additional keyword arguments passed along to the `__init__` of `keras.layers.Layer`. + """ + + def __init__(self, config: PretrainedConfig, initializer_range: float = 0.02, **kwargs): + super().__init__(**kwargs) + + self.summary_type = config.summary_type if hasattr(config, "summary_use_proj") else "last" + if self.summary_type == "attn": + # We should use a standard multi-head attention module with absolute positional embedding for that. + # Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276 + # We can probably just use the multi-head attention module of PyTorch >=1.1.0 + raise NotImplementedError + + self.has_summary = hasattr(config, "summary_use_proj") and config.summary_use_proj + if self.has_summary: + if hasattr(config, "summary_proj_to_labels") and config.summary_proj_to_labels and config.num_labels > 0: + num_classes = config.num_labels + else: + num_classes = config.hidden_size + self.summary = keras.layers.Dense( + num_classes, kernel_initializer=get_initializer(initializer_range), name="summary" + ) + + self.has_activation = False + activation_string = getattr(config, "summary_activation", None) + if activation_string is not None: + self.has_activation = True + self.activation = get_tf_activation(activation_string) + + self.has_first_dropout = hasattr(config, "summary_first_dropout") and config.summary_first_dropout > 0 + if self.has_first_dropout: + self.first_dropout = keras.layers.Dropout(config.summary_first_dropout) + + self.has_last_dropout = hasattr(config, "summary_last_dropout") and config.summary_last_dropout > 0 + if self.has_last_dropout: + self.last_dropout = keras.layers.Dropout(config.summary_last_dropout) + self.hidden_size = config.hidden_size + + def call(self, inputs, cls_index=None, training=False): + if not isinstance(inputs, (dict, tuple, list)): + hidden_states = inputs + elif isinstance(inputs, (tuple, list)): + hidden_states = inputs[0] + cls_index = inputs[1] if len(inputs) > 1 else None + assert len(inputs) <= 2, "Too many inputs." + else: + hidden_states = inputs.get("hidden_states") + cls_index = inputs.get("cls_index", None) + + if self.summary_type == "last": + output = hidden_states[:, -1] + elif self.summary_type == "first": + output = hidden_states[:, 0] + elif self.summary_type == "mean": + output = tf.reduce_mean(hidden_states, axis=1) + elif self.summary_type == "cls_index": + hidden_shape = shape_list(hidden_states) # e.g. [batch, num choices, seq length, hidden dims] + if cls_index is None: + cls_index = tf.fill( + hidden_shape[:-2], hidden_shape[-2] - 1 + ) # A tensor full of shape [batch] or [batch, num choices] full of sequence length + cls_shape = shape_list(cls_index) + if len(cls_shape) <= len(hidden_shape) - 2: + cls_index = tf.expand_dims(cls_index, axis=-1) + # else: + # cls_index = cls_index[..., tf.newaxis] + # cls_index = cls_index.expand((-1,) * (cls_index.dim()-1) + (hidden_states.size(-1),)) + # shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states + output = tf.gather(hidden_states, cls_index, batch_dims=len(hidden_shape) - 2) + output = tf.squeeze( + output, axis=len(hidden_shape) - 2 + ) # shape of output: (batch, num choices, hidden_size) + elif self.summary_type == "attn": + raise NotImplementedError + + if self.has_first_dropout: + output = self.first_dropout(output, training=training) + + if self.has_summary: + output = self.summary(output) + + if self.has_activation: + output = self.activation(output) + + if self.has_last_dropout: + output = self.last_dropout(output, training=training) + + return output + + def build(self, input_shape): + if self.built: + return + self.built = True + if getattr(self, "summary", None) is not None: + with tf.name_scope("summary"): + self.summary.build(self.hidden_size) + + +def get_initializer(initializer_range: float = 0.02) -> keras.initializers.TruncatedNormal: + """ + Creates a `keras.initializers.TruncatedNormal` with the given range. + + Args: + initializer_range (*float*, defaults to 0.02): Standard deviation of the initializer range. + + Returns: + `keras.initializers.TruncatedNormal`: The truncated normal initializer. + """ + return keras.initializers.TruncatedNormal(stddev=initializer_range) diff --git a/docs/transformers/build/lib/transformers/modeling_utils.py b/docs/transformers/build/lib/transformers/modeling_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..adac0890e6dbaedb32e6bb6cdf64c081e75f25b6 --- /dev/null +++ b/docs/transformers/build/lib/transformers/modeling_utils.py @@ -0,0 +1,6005 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors, Facebook AI Research authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import collections +import copy +import functools +import gc +import importlib.metadata +import inspect +import itertools +import json +import os +import re +import shutil +import tempfile +import warnings +from collections import defaultdict +from collections.abc import MutableMapping +from contextlib import contextmanager +from dataclasses import dataclass +from enum import Enum +from functools import partial, wraps +from threading import Thread +from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Type, TypeVar, Union +from zipfile import is_zipfile + +import torch +import torch.distributed.tensor +from huggingface_hub import split_torch_state_dict_into_shards +from packaging import version +from torch import Tensor, nn +from torch.distributions import constraints +from torch.nn import CrossEntropyLoss, Identity +from torch.utils.checkpoint import checkpoint + +from transformers.utils import is_torchao_available + + +if is_torchao_available(): + from torchao.quantization import Int4WeightOnlyConfig + +from .activations import get_activation +from .configuration_utils import PretrainedConfig +from .dynamic_module_utils import custom_object_save +from .generation import CompileConfig, GenerationConfig +from .integrations import PeftAdapterMixin, deepspeed_config, is_deepspeed_zero3_enabled +from .integrations.accelerate import find_tied_parameters, init_empty_weights +from .integrations.deepspeed import _load_state_dict_into_zero3_model, is_deepspeed_available +from .integrations.flash_attention import flash_attention_forward +from .integrations.flex_attention import flex_attention_forward +from .integrations.sdpa_attention import sdpa_attention_forward +from .integrations.tensor_parallel import ( + SUPPORTED_TP_STYLES, + shard_and_distribute_module, +) +from .loss.loss_utils import LOSS_MAPPING +from .pytorch_utils import ( # noqa: F401 + Conv1D, + apply_chunking_to_forward, + find_pruneable_heads_and_indices, + id_tensor_storage, + prune_conv1d_layer, + prune_layer, + prune_linear_layer, +) +from .quantizers import AutoHfQuantizer, HfQuantizer +from .quantizers.quantizers_utils import get_module_from_name +from .safetensors_conversion import auto_conversion +from .utils import ( + ADAPTER_SAFE_WEIGHTS_NAME, + ADAPTER_WEIGHTS_NAME, + CONFIG_NAME, + DUMMY_INPUTS, + FLAX_WEIGHTS_NAME, + SAFE_WEIGHTS_INDEX_NAME, + SAFE_WEIGHTS_NAME, + TF2_WEIGHTS_NAME, + TF_WEIGHTS_NAME, + WEIGHTS_INDEX_NAME, + WEIGHTS_NAME, + ContextManagers, + ModelOutput, + PushToHubMixin, + cached_file, + copy_func, + download_url, + extract_commit_hash, + has_file, + is_accelerate_available, + is_bitsandbytes_available, + is_flash_attn_2_available, + is_kernels_available, + is_offline_mode, + is_optimum_available, + is_peft_available, + is_remote_url, + is_safetensors_available, + is_torch_flex_attn_available, + is_torch_greater_or_equal, + is_torch_mlu_available, + is_torch_npu_available, + is_torch_sdpa_available, + is_torch_xla_available, + logging, + replace_return_docstrings, + strtobool, +) +from .utils.hub import create_and_tag_model_card, get_checkpoint_shard_files +from .utils.import_utils import ( + ENV_VARS_TRUE_VALUES, + is_sagemaker_mp_enabled, + is_torch_fx_proxy, + is_torchdynamo_compiling, +) +from .utils.quantization_config import BitsAndBytesConfig, QuantizationMethod + + +XLA_USE_BF16 = os.environ.get("XLA_USE_BF16", "0").upper() +XLA_DOWNCAST_BF16 = os.environ.get("XLA_DOWNCAST_BF16", "0").upper() + + +if is_accelerate_available(): + from accelerate import dispatch_model, infer_auto_device_map + from accelerate.hooks import add_hook_to_module + from accelerate.utils import ( + check_tied_parameters_on_same_device, + extract_model_from_parallel, + get_balanced_memory, + get_max_memory, + load_offloaded_weights, + offload_weight, + save_offload_index, + ) + + accelerate_version = version.parse(importlib.metadata.version("accelerate")) + if accelerate_version >= version.parse("0.31"): + from accelerate.utils.modeling import get_state_dict_from_offload + +if is_safetensors_available(): + from safetensors import safe_open + from safetensors.torch import load_file as safe_load_file + from safetensors.torch import save_file as safe_save_file + + +if is_deepspeed_available(): + import deepspeed + +if is_kernels_available(): + from kernels import get_kernel + +logger = logging.get_logger(__name__) + + +_init_weights = True +_is_quantized = False +_is_ds_init_called = False +_torch_distributed_available = torch.distributed.is_available() + + +def is_fsdp_enabled(): + return ( + torch.distributed.is_available() + and torch.distributed.is_initialized() + and strtobool(os.environ.get("ACCELERATE_USE_FSDP", "False")) == 1 + and strtobool(os.environ.get("FSDP_CPU_RAM_EFFICIENT_LOADING", "False")) == 1 + ) + + +def is_local_dist_rank_0(): + return ( + torch.distributed.is_available() + and torch.distributed.is_initialized() + and int(os.environ.get("LOCAL_RANK", -1)) == 0 + ) + + +if is_sagemaker_mp_enabled(): + import smdistributed.modelparallel.torch as smp + from smdistributed.modelparallel import __version__ as SMP_VERSION + + IS_SAGEMAKER_MP_POST_1_10 = version.parse(SMP_VERSION) >= version.parse("1.10") +else: + IS_SAGEMAKER_MP_POST_1_10 = False + +if is_peft_available(): + from .utils import find_adapter_config_file + + +SpecificPreTrainedModelType = TypeVar("SpecificPreTrainedModelType", bound="PreTrainedModel") + +TORCH_INIT_FUNCTIONS = { + "uniform_": nn.init.uniform_, + "normal_": nn.init.normal_, + "trunc_normal_": nn.init.trunc_normal_, + "constant_": nn.init.constant_, + "xavier_uniform_": nn.init.xavier_uniform_, + "xavier_normal_": nn.init.xavier_normal_, + "kaiming_uniform_": nn.init.kaiming_uniform_, + "kaiming_normal_": nn.init.kaiming_normal_, + "uniform": nn.init.uniform, + "normal": nn.init.normal, + "xavier_uniform": nn.init.xavier_uniform, + "xavier_normal": nn.init.xavier_normal, + "kaiming_uniform": nn.init.kaiming_uniform, + "kaiming_normal": nn.init.kaiming_normal, +} + + +@contextmanager +def no_init_weights(): + """ + Context manager to globally disable weight initialization to speed up loading large models. + """ + global _init_weights + old_init_weights = _init_weights + + _init_weights = False + + def _skip_init(*args, **kwargs): + pass + + # Save the original initialization functions + for name, init_func in TORCH_INIT_FUNCTIONS.items(): + setattr(torch.nn.init, name, _skip_init) + + try: + yield + finally: + _init_weights = old_init_weights + # Restore the original initialization functions + for name, init_func in TORCH_INIT_FUNCTIONS.items(): + setattr(torch.nn.init, name, init_func) + + +@contextmanager +def set_quantized_state(): + global _is_quantized + _is_quantized = True + try: + yield + finally: + _is_quantized = False + + +# Skip recursive calls to deepspeed.zero.Init to avoid pinning errors. +# This issue occurs with ZeRO stage 3 when using NVMe offloading. +# For more details, refer to issue #34429. +@contextmanager +def set_zero3_state(): + global _is_ds_init_called + _is_ds_init_called = True + try: + yield + finally: + _is_ds_init_called = False + + +def restore_default_torch_dtype(func): + """ + Decorator to restore the default torch dtype + at the end of the function. Serves + as a backup in case calling the function raises + an error after the function has changed the default dtype but before it could restore it. + """ + + @wraps(func) + def _wrapper(*args, **kwargs): + old_dtype = torch.get_default_dtype() + try: + return func(*args, **kwargs) + finally: + torch.set_default_dtype(old_dtype) + + return _wrapper + + +def get_torch_context_manager_or_global_device(): + """ + Test if a device context manager is currently in use, or if it is not the case, check if the default device + is not "cpu". This is used to infer the correct device to load the model on, in case `device_map` is not provided. + """ + device_in_context = torch.tensor([]).device + default_device = torch.get_default_device() + # This case means no context manager was used -> we still check if the default that was potentially set is not cpu + if device_in_context == default_device: + if default_device != torch.device("cpu"): + return default_device + return None + return device_in_context + + +def get_parameter_device(parameter: Union[nn.Module, "ModuleUtilsMixin"]): + try: + return next(parameter.parameters()).device + except StopIteration: + # For nn.DataParallel compatibility in PyTorch 1.5 + + def find_tensor_attributes(module: nn.Module) -> List[Tuple[str, Tensor]]: + tuples = [(k, v) for k, v in module.__dict__.items() if torch.is_tensor(v)] + return tuples + + gen = parameter._named_members(get_members_fn=find_tensor_attributes) + first_tuple = next(gen) + return first_tuple[1].device + + +def get_parameter_dtype(parameter: Union[nn.Module, "ModuleUtilsMixin"]): + """ + Returns the first found floating dtype in parameters if there is one, otherwise returns the last dtype it found. + """ + last_dtype = None + for t in parameter.parameters(): + last_dtype = t.dtype + if t.is_floating_point(): + # Adding fix for https://github.com/pytorch/xla/issues/4152 + # Fixes issue where the model code passes a value that is out of range for XLA_USE_BF16=1 + # and XLA_DOWNCAST_BF16=1 so the conversion would cast it to -inf + # NOTE: `is_torch_xla_available()` is checked last as it induces a graph break in torch dynamo + if XLA_USE_BF16 in ENV_VARS_TRUE_VALUES and is_torch_xla_available(): + return torch.bfloat16 + if XLA_DOWNCAST_BF16 in ENV_VARS_TRUE_VALUES and is_torch_xla_available(): + if t.dtype == torch.float: + return torch.bfloat16 + if t.dtype == torch.double: + return torch.float32 + return t.dtype + + if last_dtype is not None: + # if no floating dtype was found return whatever the first dtype is + return last_dtype + + # For nn.DataParallel compatibility in PyTorch > 1.5 + def find_tensor_attributes(module: nn.Module) -> List[Tuple[str, Tensor]]: + tuples = [(k, v) for k, v in module.__dict__.items() if torch.is_tensor(v)] + return tuples + + gen = parameter._named_members(get_members_fn=find_tensor_attributes) + last_tuple = None + for tuple in gen: + last_tuple = tuple + if tuple[1].is_floating_point(): + return tuple[1].dtype + + if last_tuple is not None: + # fallback to the last dtype + return last_tuple[1].dtype + + # fallback to buffer dtype + for t in parameter.buffers(): + last_dtype = t.dtype + if t.is_floating_point(): + return t.dtype + return last_dtype + + +def get_state_dict_dtype(state_dict): + """ + Returns the first found floating dtype in `state_dict` if there is one, otherwise returns the first dtype. + """ + for t in state_dict.values(): + if t.is_floating_point(): + return t.dtype + + # if no floating dtype was found return whatever the first dtype is + else: + return next(state_dict.values()).dtype + + +def load_sharded_checkpoint(model, folder, strict=True, prefer_safe=True): + """ + This is the same as + [`torch.nn.Module.load_state_dict`](https://pytorch.org/docs/stable/generated/torch.nn.Module.html?highlight=load_state_dict#torch.nn.Module.load_state_dict) + but for a sharded checkpoint. + + This load is performed efficiently: each checkpoint shard is loaded one by one in RAM and deleted after being + loaded in the model. + + Args: + model (`torch.nn.Module`): The model in which to load the checkpoint. + folder (`str` or `os.PathLike`): A path to a folder containing the sharded checkpoint. + strict (`bool`, *optional*, defaults to `True`): + Whether to strictly enforce that the keys in the model state dict match the keys in the sharded checkpoint. + prefer_safe (`bool`, *optional*, defaults to `False`): + If both safetensors and PyTorch save files are present in checkpoint and `prefer_safe` is True, the + safetensors files will be loaded. Otherwise, PyTorch files are always loaded when possible. + + Returns: + `NamedTuple`: A named tuple with `missing_keys` and `unexpected_keys` fields + - `missing_keys` is a list of str containing the missing keys + - `unexpected_keys` is a list of str containing the unexpected keys + """ + # Load the index + index_file = os.path.join(folder, WEIGHTS_INDEX_NAME) + safe_index_file = os.path.join(folder, SAFE_WEIGHTS_INDEX_NAME) + + index_present = os.path.isfile(index_file) + safe_index_present = os.path.isfile(safe_index_file) + + if not index_present and not (safe_index_present and is_safetensors_available()): + filenames = ( + (WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_INDEX_NAME) if is_safetensors_available() else (WEIGHTS_INDEX_NAME,) + ) + raise ValueError(f"Can't find a checkpoint index ({' or '.join(filenames)}) in {folder}.") + + load_safe = False + if safe_index_present: + if prefer_safe: + if is_safetensors_available(): + load_safe = True # load safe due to preference + else: + logger.warning( + f"Cannot load sharded checkpoint at {folder} safely since safetensors is not installed!" + ) + elif not index_present: + load_safe = True # load safe since we have no other choice + + load_index = safe_index_file if load_safe else index_file + + with open(load_index, "r", encoding="utf-8") as f: + index = json.load(f) + + shard_files = list(set(index["weight_map"].values())) + + # If strict=True, error before loading any of the state dicts. + loaded_keys = index["weight_map"].keys() + model_keys = model.state_dict().keys() + missing_keys = [key for key in model_keys if key not in loaded_keys] + unexpected_keys = [key for key in loaded_keys if key not in model_keys] + if strict and (len(missing_keys) > 0 or len(unexpected_keys) > 0): + error_message = f"Error(s) in loading state_dict for {model.__class__.__name__}" + if len(missing_keys) > 0: + str_missing_keys = ",".join([f'"{k}"' for k in missing_keys]) + error_message += f"\nMissing key(s): {str_missing_keys}." + if len(unexpected_keys) > 0: + str_unexpected_keys = ",".join([f'"{k}"' for k in unexpected_keys]) + error_message += f"\nMissing key(s): {str_unexpected_keys}." + raise RuntimeError(error_message) + + loader = safe_load_file if load_safe else partial(torch.load, map_location="cpu", weights_only=True) + + for shard_file in shard_files: + state_dict = loader(os.path.join(folder, shard_file)) + model.load_state_dict(state_dict, strict=False) + + # Make sure memory is freed before we load the next state dict. + del state_dict + gc.collect() + + # Return the same thing as PyTorch load_state_dict function. + return torch.nn.modules.module._IncompatibleKeys(missing_keys, unexpected_keys) + + +str_to_torch_dtype = { + "BOOL": torch.bool, + "U8": torch.uint8, + "I8": torch.int8, + "I16": torch.int16, + "F16": torch.float16, + "BF16": torch.bfloat16, + "I32": torch.int32, + "F32": torch.float32, + "F64": torch.float64, + "I64": torch.int64, + "F8_E4M3": torch.float8_e4m3fn, + "F8_E5M2": torch.float8_e5m2, +} + + +if is_torch_greater_or_equal("2.3.0"): + str_to_torch_dtype["U16"] = torch.uint16 + str_to_torch_dtype["U32"] = torch.uint32 + str_to_torch_dtype["U64"] = torch.uint64 + + +def load_state_dict( + checkpoint_file: Union[str, os.PathLike], + is_quantized: bool = False, + map_location: Optional[Union[str, torch.device]] = "cpu", + weights_only: bool = True, +): + """ + Reads a `safetensor` or a `.bin` checkpoint file. We load the checkpoint on "cpu" by default. + """ + if checkpoint_file.endswith(".safetensors") and is_safetensors_available(): + with safe_open(checkpoint_file, framework="pt") as f: + metadata = f.metadata() + + if metadata is not None and metadata.get("format") not in ["pt", "tf", "flax", "mlx"]: + raise OSError( + f"The safetensors archive passed at {checkpoint_file} does not contain the valid metadata. Make sure " + "you save your model with the `save_pretrained` method." + ) + state_dict = {} + for k in f.keys(): + k_dtype = f.get_slice(k).get_dtype() + if k_dtype in str_to_torch_dtype: + dtype = str_to_torch_dtype[k_dtype] + else: + raise ValueError(f"Cannot load safetensors of unknown dtype {k_dtype}") + if map_location == "meta": + state_dict[k] = torch.empty(size=f.get_slice(k).get_shape(), dtype=dtype, device="meta") + else: + state_dict[k] = f.get_tensor(k) + return state_dict + + try: + if map_location is None: + if ( + ( + is_deepspeed_zero3_enabled() + and torch.distributed.is_initialized() + and torch.distributed.get_rank() > 0 + ) + or (is_fsdp_enabled() and not is_local_dist_rank_0()) + ) and not is_quantized: + map_location = "meta" + else: + map_location = "cpu" + extra_args = {} + # mmap can only be used with files serialized with zipfile-based format. + if isinstance(checkpoint_file, str) and map_location != "meta" and is_zipfile(checkpoint_file): + extra_args = {"mmap": True} + return torch.load( + checkpoint_file, + map_location=map_location, + weights_only=weights_only, + **extra_args, + ) + except Exception as e: + try: + with open(checkpoint_file) as f: + if f.read(7) == "version": + raise OSError( + "You seem to have cloned a repository without having git-lfs installed. Please install " + "git-lfs and run `git lfs install` followed by `git lfs pull` in the folder " + "you cloned." + ) + else: + raise ValueError( + f"Unable to locate the file {checkpoint_file} which is necessary to load this pretrained " + "model. Make sure you have saved the model properly." + ) from e + except (UnicodeDecodeError, ValueError): + raise OSError( + f"Unable to load weights from pytorch checkpoint file for '{checkpoint_file}' " + f"at '{checkpoint_file}'. " + "If you tried to load a PyTorch model from a TF 2.0 checkpoint, please set from_tf=True." + ) + + +def set_initialized_submodules(model, state_dict_keys): + """ + Sets the `_is_hf_initialized` flag in all submodules of a given model when all its weights are in the loaded state + dict. + """ + state_dict_keys = set(state_dict_keys) + not_initialized_submodules = {} + for module_name, module in model.named_modules(): + if module_name == "": + # When checking if the root module is loaded there's no need to prepend module_name. + module_keys = set(module.state_dict()) + else: + module_keys = {f"{module_name}.{k}" for k in module.state_dict()} + if module_keys.issubset(state_dict_keys): + module._is_hf_initialized = True + else: + not_initialized_submodules[module_name] = module + return not_initialized_submodules + + +def _end_ptr(tensor: torch.Tensor) -> int: + # extract the end of the pointer if the tensor is a slice of a bigger tensor + if tensor.nelement(): + stop = tensor.view(-1)[-1].data_ptr() + tensor.element_size() + else: + stop = tensor.data_ptr() + return stop + + +def _get_tied_weight_keys(module: nn.Module, prefix=""): + tied_weight_keys = [] + if getattr(module, "_tied_weights_keys", None) is not None: + names = [f"{prefix}.{k}" if prefix else k for k in module._tied_weights_keys] + tied_weight_keys.extend(names) + if getattr(module, "_dynamic_tied_weights_keys", None) is not None: + names = [f"{prefix}.{k}" if prefix else k for k in module._dynamic_tied_weights_keys] + tied_weight_keys.extend(names) + for name, submodule in module.named_children(): + local_prefix = f"{prefix}.{name}" if prefix else name + tied_weight_keys.extend(_get_tied_weight_keys(submodule, prefix=local_prefix)) + return tied_weight_keys + + +def _find_disjoint(tensors: List[Set[str]], state_dict: Dict[str, torch.Tensor]) -> Tuple[List[Set[str]], List[str]]: + filtered_tensors = [] + for shared in tensors: + if len(shared) < 2: + filtered_tensors.append(shared) + continue + + areas = [] + for name in shared: + tensor = state_dict[name] + areas.append((tensor.data_ptr(), _end_ptr(tensor), name)) + areas.sort() + + _, last_stop, last_name = areas[0] + filtered_tensors.append({last_name}) + for start, stop, name in areas[1:]: + if start >= last_stop: + filtered_tensors.append({name}) + else: + filtered_tensors[-1].add(name) + last_stop = stop + disjoint_tensors = [] + shared_tensors = [] + for tensors in filtered_tensors: + if len(tensors) == 1: + disjoint_tensors.append(tensors.pop()) + else: + shared_tensors.append(tensors) + return shared_tensors, disjoint_tensors + + +def _find_identical(tensors: List[Set[str]], state_dict: Dict[str, torch.Tensor]) -> Tuple[List[Set[str]], Set[str]]: + shared_tensors = [] + identical = [] + for shared in tensors: + if len(shared) < 2: + continue + + areas = collections.defaultdict(set) + for name in shared: + tensor = state_dict[name] + area = (tensor.device, tensor.data_ptr(), _end_ptr(tensor)) + areas[area].add(name) + if len(areas) == 1: + identical.append(shared) + else: + shared_tensors.append(shared) + return shared_tensors, identical + + +def _infer_parameter_dtype( + model: "PreTrainedModel", + param_name: str, + empty_param: torch.Tensor, + keep_in_fp32_regex: Optional[re.Pattern] = None, + hf_quantizer: Optional[HfQuantizer] = None, +) -> Union[bool, Optional[torch.dtype]]: + try: + old_param = model.get_parameter_or_buffer(param_name) + except Exception as e: + if hf_quantizer is not None and hf_quantizer.quantization_config.quant_method in { + QuantizationMethod.HQQ, + QuantizationMethod.QUARK, + }: + return True, None + else: + raise e + is_torch_e4m3fn_available = hasattr(torch, "float8_e4m3fn") + # We convert floating dtypes to the `dtype` passed except for float8_e4m3fn type. We also want to keep the buffers/params + # in int/uint/bool and not cast them. + casting_dtype = None + is_param_float8_e4m3fn = is_torch_e4m3fn_available and empty_param.dtype == torch.float8_e4m3fn + if empty_param.dtype.is_floating_point and not is_param_float8_e4m3fn: + # First fp32 if part of the exception list + if keep_in_fp32_regex is not None and keep_in_fp32_regex.search(param_name): + casting_dtype = torch.float32 + # Then dtype that was instantiated in the meta model -- note that this respects subconfigs dtypes + elif hf_quantizer is not None: + casting_dtype = model.config._pre_quantization_dtype + else: + casting_dtype = old_param.dtype + return old_param is not None and old_param.is_contiguous(), casting_dtype + + +def _load_parameter_into_model(model: "PreTrainedModel", param_name: str, tensor: torch.Tensor): + """Cast a single parameter `param_name` into the `model`, with value `tensor`.""" + module, param_type = get_module_from_name(model, param_name) + # This will check potential shape mismatch if skipped before + module.load_state_dict({param_type: tensor}, strict=False, assign=True) + + +@torch.no_grad() +def _load_state_dict_into_meta_model( + model: "PreTrainedModel", + state_dict: Dict, + shard_file: str, + expected_keys: List[str], + reverse_renaming_mapping: Dict[str, str], + device_map: Optional[Dict] = None, + disk_offload_folder: Optional[str] = None, + disk_offload_index: Optional[Dict] = None, + cpu_offload_folder: Optional[str] = None, + cpu_offload_index: Optional[Dict] = None, + hf_quantizer: Optional[HfQuantizer] = None, + is_safetensors: bool = False, + keep_in_fp32_regex: Optional[re.Pattern] = None, + unexpected_keys: Optional[List[str]] = None, # passing `unexpected` for cleanup from quantization items + device_mesh: Optional["torch.distributed.device_mesh.DeviceMesh"] = None, +) -> Tuple[Optional[Dict], Optional[Dict]]: + """Load parameters from `meta_state_dict` into the model. The parameters of the `meta_state_dict` are on the meta + device in order to easily infer the shapes and dtypes that they will have. Then proper parameters are then loaded + from `shard_file`, which is the actual state dict file on disk. + This function takes care of correctly casting dtypes, devices, and sharding tensors in case of tensor parallelism. + """ + tensor_device = "cpu" + if device_map is not None and device_map.get("", None) is not None: + if device_map[""] not in ("cpu", torch.device("cpu")): + tensor_device = device_map[""].index if isinstance(device_map[""], torch.device) else device_map[""] + if device_map is not None: + device_map_regex = "|".join([re.escape(k) for k in sorted(device_map.keys(), reverse=True)]) + + is_quantized = hf_quantizer is not None + is_hqq_or_bnb = is_quantized and hf_quantizer.quantization_config.quant_method in { + QuantizationMethod.HQQ, + QuantizationMethod.BITS_AND_BYTES, + } + is_meta_state_dict = shard_file.endswith(".safetensors") and not is_hqq_or_bnb + file_pointer = None + if is_meta_state_dict: + file_pointer = safe_open(shard_file, framework="pt", device=tensor_device) + + for param_name, empty_param in state_dict.items(): + if param_name not in expected_keys: + continue + + # we need to use serialized_param_name as file pointer is untouched + if is_meta_state_dict: + # This is the name of the parameter as it appears on disk file + serialized_param_name = reverse_renaming_mapping[param_name] + param = file_pointer.get_slice(serialized_param_name) + else: + param = empty_param.to(tensor_device) # It is actually not empty! + + to_contiguous, casting_dtype = _infer_parameter_dtype( + model, + param_name, + empty_param, + keep_in_fp32_regex, + hf_quantizer, + ) + + if device_mesh is not None: # In this case, the param is already on the correct device! + shard_and_distribute_module( + model, + param, + empty_param, + param_name, + casting_dtype, + to_contiguous, + int(os.environ["RANK"]), # the rank + device_mesh, + ) + else: + param = param[...] + if casting_dtype is not None: + param = param.to(casting_dtype) + if to_contiguous: + param = param.contiguous() + + if device_map is None: + param_device = "cpu" + else: + module_layer = re.search(device_map_regex, param_name) + if not module_layer: + raise ValueError(f"{param_name} doesn't have any device set.") + else: + param_device = device_map[module_layer.group()] + + if param_device == "disk": + if not is_safetensors: + disk_offload_index = offload_weight(param, param_name, disk_offload_folder, disk_offload_index) + elif param_device == "cpu" and cpu_offload_index is not None: + cpu_offload_index = offload_weight(param, param_name, cpu_offload_folder, cpu_offload_index) + elif ( + not is_quantized + or (not hf_quantizer.requires_parameters_quantization) + or ( + not hf_quantizer.check_quantized_param( + model, + param, + param_name, + state_dict, + param_device=param_device, + device_map=device_map, + ) + ) + ): + if is_fsdp_enabled(): + param_device = "cpu" if is_local_dist_rank_0() else "meta" + + _load_parameter_into_model(model, param_name, param.to(param_device)) + + else: + hf_quantizer.create_quantized_param( + model, param, param_name, param_device, state_dict, unexpected_keys + ) + # For quantized modules with FSDP/DeepSpeed Stage 3, we need to quantize the parameter on the GPU + # and then cast it to CPU to avoid excessive memory usage on each GPU + # in comparison to the sharded model across GPUs. + if is_fsdp_enabled() or is_deepspeed_zero3_enabled(): + module, param_type = get_module_from_name(model, param_name) + value = getattr(module, param_type) + param_to = "cpu" + if is_fsdp_enabled() and not is_local_dist_rank_0(): + param_to = "meta" + val_kwargs = {} + if hasattr(module, "weight") and module.weight.__class__.__name__ == "Int8Params": + val_kwargs["requires_grad"] = False + value = type(value)(value.data.to(param_to), **val_kwargs, **value.__dict__) + setattr(module, param_type, value) + + if file_pointer is not None: + file_pointer.__exit__(None, None, None) + + return disk_offload_index, cpu_offload_index + + +def _add_variant(weights_name: str, variant: Optional[str] = None) -> str: + if variant is not None: + path, name = weights_name.rsplit(".", 1) + weights_name = f"{path}.{variant}.{name}" + return weights_name + + +def _get_resolved_checkpoint_files( + pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], + subfolder: str, + variant: Optional[str], + gguf_file: Optional[str], + from_tf: bool, + from_flax: bool, + use_safetensors: bool, + cache_dir: str, + force_download: bool, + proxies: Optional[Dict[str, str]], + local_files_only: bool, + token: Optional[Union[str, bool]], + user_agent: dict, + revision: str, + commit_hash: Optional[str], +) -> Tuple[Optional[List[str]], Optional[Dict]]: + """Get all the checkpoint filenames based on `pretrained_model_name_or_path`, and optional metadata if the + checkpoints are sharded. + This function will download the data if necesary. + """ + is_sharded = False + + if pretrained_model_name_or_path is not None and gguf_file is None: + pretrained_model_name_or_path = str(pretrained_model_name_or_path) + is_local = os.path.isdir(pretrained_model_name_or_path) + if is_local: + if from_tf and os.path.isfile( + os.path.join(pretrained_model_name_or_path, subfolder, TF_WEIGHTS_NAME + ".index") + ): + # Load from a TF 1.0 checkpoint in priority if from_tf + archive_file = os.path.join(pretrained_model_name_or_path, subfolder, TF_WEIGHTS_NAME + ".index") + elif from_tf and os.path.isfile(os.path.join(pretrained_model_name_or_path, subfolder, TF2_WEIGHTS_NAME)): + # Load from a TF 2.0 checkpoint in priority if from_tf + archive_file = os.path.join(pretrained_model_name_or_path, subfolder, TF2_WEIGHTS_NAME) + elif from_flax and os.path.isfile( + os.path.join(pretrained_model_name_or_path, subfolder, FLAX_WEIGHTS_NAME) + ): + # Load from a Flax checkpoint in priority if from_flax + archive_file = os.path.join(pretrained_model_name_or_path, subfolder, FLAX_WEIGHTS_NAME) + elif use_safetensors is not False and os.path.isfile( + os.path.join(pretrained_model_name_or_path, subfolder, _add_variant(SAFE_WEIGHTS_NAME, variant)) + ): + # Load from a safetensors checkpoint + archive_file = os.path.join( + pretrained_model_name_or_path, subfolder, _add_variant(SAFE_WEIGHTS_NAME, variant) + ) + elif use_safetensors is not False and os.path.isfile( + os.path.join(pretrained_model_name_or_path, subfolder, _add_variant(SAFE_WEIGHTS_INDEX_NAME, variant)) + ): + # Load from a sharded safetensors checkpoint + archive_file = os.path.join( + pretrained_model_name_or_path, subfolder, _add_variant(SAFE_WEIGHTS_INDEX_NAME, variant) + ) + is_sharded = True + elif not use_safetensors and os.path.isfile( + os.path.join(pretrained_model_name_or_path, subfolder, _add_variant(WEIGHTS_NAME, variant)) + ): + # Load from a PyTorch checkpoint + archive_file = os.path.join( + pretrained_model_name_or_path, subfolder, _add_variant(WEIGHTS_NAME, variant) + ) + elif not use_safetensors and os.path.isfile( + os.path.join(pretrained_model_name_or_path, subfolder, _add_variant(WEIGHTS_INDEX_NAME, variant)) + ): + # Load from a sharded PyTorch checkpoint + archive_file = os.path.join( + pretrained_model_name_or_path, subfolder, _add_variant(WEIGHTS_INDEX_NAME, variant) + ) + is_sharded = True + # At this stage we don't have a weight file so we will raise an error. + elif not use_safetensors and ( + os.path.isfile(os.path.join(pretrained_model_name_or_path, subfolder, TF_WEIGHTS_NAME + ".index")) + or os.path.isfile(os.path.join(pretrained_model_name_or_path, subfolder, TF2_WEIGHTS_NAME)) + ): + raise EnvironmentError( + f"Error no file named {_add_variant(WEIGHTS_NAME, variant)} found in directory" + f" {pretrained_model_name_or_path} but there is a file for TensorFlow weights. Use" + " `from_tf=True` to load this model from those weights." + ) + elif not use_safetensors and os.path.isfile( + os.path.join(pretrained_model_name_or_path, subfolder, FLAX_WEIGHTS_NAME) + ): + raise EnvironmentError( + f"Error no file named {_add_variant(WEIGHTS_NAME, variant)} found in directory" + f" {pretrained_model_name_or_path} but there is a file for Flax weights. Use `from_flax=True`" + " to load this model from those weights." + ) + elif use_safetensors: + raise EnvironmentError( + f"Error no file named {_add_variant(SAFE_WEIGHTS_NAME, variant)} found in directory" + f" {pretrained_model_name_or_path}." + ) + else: + raise EnvironmentError( + f"Error no file named {_add_variant(WEIGHTS_NAME, variant)}, {_add_variant(SAFE_WEIGHTS_NAME, variant)}," + f" {TF2_WEIGHTS_NAME}, {TF_WEIGHTS_NAME + '.index'} or {FLAX_WEIGHTS_NAME} found in directory" + f" {pretrained_model_name_or_path}." + ) + elif os.path.isfile(os.path.join(subfolder, pretrained_model_name_or_path)): + archive_file = pretrained_model_name_or_path + is_local = True + elif os.path.isfile(os.path.join(subfolder, pretrained_model_name_or_path + ".index")): + if not from_tf: + raise ValueError( + f"We found a TensorFlow checkpoint at {pretrained_model_name_or_path + '.index'}, please set " + "from_tf to True to load from this checkpoint." + ) + archive_file = os.path.join(subfolder, pretrained_model_name_or_path + ".index") + is_local = True + elif is_remote_url(pretrained_model_name_or_path): + filename = pretrained_model_name_or_path + resolved_archive_file = download_url(pretrained_model_name_or_path) + else: + # set correct filename + if from_tf: + filename = TF2_WEIGHTS_NAME + elif from_flax: + filename = FLAX_WEIGHTS_NAME + elif use_safetensors is not False: + filename = _add_variant(SAFE_WEIGHTS_NAME, variant) + else: + filename = _add_variant(WEIGHTS_NAME, variant) + + try: + # Load from URL or cache if already cached + cached_file_kwargs = { + "cache_dir": cache_dir, + "force_download": force_download, + "proxies": proxies, + "local_files_only": local_files_only, + "token": token, + "user_agent": user_agent, + "revision": revision, + "subfolder": subfolder, + "_raise_exceptions_for_gated_repo": False, + "_raise_exceptions_for_missing_entries": False, + "_commit_hash": commit_hash, + } + resolved_archive_file = cached_file(pretrained_model_name_or_path, filename, **cached_file_kwargs) + + # Since we set _raise_exceptions_for_missing_entries=False, we don't get an exception but a None + # result when internet is up, the repo and revision exist, but the file does not. + if resolved_archive_file is None and filename == _add_variant(SAFE_WEIGHTS_NAME, variant): + # Maybe the checkpoint is sharded, we try to grab the index name in this case. + resolved_archive_file = cached_file( + pretrained_model_name_or_path, + _add_variant(SAFE_WEIGHTS_INDEX_NAME, variant), + **cached_file_kwargs, + ) + if resolved_archive_file is not None: + is_sharded = True + elif use_safetensors: + if revision == "main": + resolved_archive_file, revision, is_sharded = auto_conversion( + pretrained_model_name_or_path, **cached_file_kwargs + ) + cached_file_kwargs["revision"] = revision + if resolved_archive_file is None: + raise EnvironmentError( + f"{pretrained_model_name_or_path} does not appear to have a file named" + f" {_add_variant(SAFE_WEIGHTS_NAME, variant)} or {_add_variant(SAFE_WEIGHTS_INDEX_NAME, variant)} " + "and thus cannot be loaded with `safetensors`. Please make sure that the model has " + "been saved with `safe_serialization=True` or do not set `use_safetensors=True`." + ) + else: + # This repo has no safetensors file of any kind, we switch to PyTorch. + filename = _add_variant(WEIGHTS_NAME, variant) + resolved_archive_file = cached_file( + pretrained_model_name_or_path, filename, **cached_file_kwargs + ) + if resolved_archive_file is None and filename == _add_variant(WEIGHTS_NAME, variant): + # Maybe the checkpoint is sharded, we try to grab the index name in this case. + resolved_archive_file = cached_file( + pretrained_model_name_or_path, + _add_variant(WEIGHTS_INDEX_NAME, variant), + **cached_file_kwargs, + ) + if resolved_archive_file is not None: + is_sharded = True + if not local_files_only and not is_offline_mode(): + if resolved_archive_file is not None: + if filename in [WEIGHTS_NAME, WEIGHTS_INDEX_NAME]: + # If the PyTorch file was found, check if there is a safetensors file on the repository + # If there is no safetensors file on the repositories, start an auto conversion + safe_weights_name = SAFE_WEIGHTS_INDEX_NAME if is_sharded else SAFE_WEIGHTS_NAME + has_file_kwargs = { + "revision": revision, + "proxies": proxies, + "token": token, + "cache_dir": cache_dir, + "local_files_only": local_files_only, + } + cached_file_kwargs = { + "cache_dir": cache_dir, + "force_download": force_download, + "local_files_only": local_files_only, + "user_agent": user_agent, + "subfolder": subfolder, + "_raise_exceptions_for_gated_repo": False, + "_raise_exceptions_for_missing_entries": False, + "_commit_hash": commit_hash, + **has_file_kwargs, + } + if not has_file(pretrained_model_name_or_path, safe_weights_name, **has_file_kwargs): + Thread( + target=auto_conversion, + args=(pretrained_model_name_or_path,), + kwargs={"ignore_errors_during_conversion": True, **cached_file_kwargs}, + name="Thread-auto_conversion", + ).start() + else: + # Otherwise, no PyTorch file was found, maybe there is a TF or Flax model file. + # We try those to give a helpful error message. + has_file_kwargs = { + "revision": revision, + "proxies": proxies, + "token": token, + "cache_dir": cache_dir, + "local_files_only": local_files_only, + } + if has_file(pretrained_model_name_or_path, TF2_WEIGHTS_NAME, **has_file_kwargs): + raise EnvironmentError( + f"{pretrained_model_name_or_path} does not appear to have a file named" + f" {_add_variant(WEIGHTS_NAME, variant)} but there is a file for TensorFlow weights." + " Use `from_tf=True` to load this model from those weights." + ) + elif has_file(pretrained_model_name_or_path, FLAX_WEIGHTS_NAME, **has_file_kwargs): + raise EnvironmentError( + f"{pretrained_model_name_or_path} does not appear to have a file named" + f" {_add_variant(WEIGHTS_NAME, variant)} but there is a file for Flax weights. Use" + " `from_flax=True` to load this model from those weights." + ) + elif variant is not None and has_file( + pretrained_model_name_or_path, WEIGHTS_NAME, **has_file_kwargs + ): + raise EnvironmentError( + f"{pretrained_model_name_or_path} does not appear to have a file named" + f" {_add_variant(WEIGHTS_NAME, variant)} but there is a file without the variant" + f" {variant}. Use `variant=None` to load this model from those weights." + ) + else: + raise EnvironmentError( + f"{pretrained_model_name_or_path} does not appear to have a file named" + f" {_add_variant(WEIGHTS_NAME, variant)}, {_add_variant(SAFE_WEIGHTS_NAME, variant)}," + f" {TF2_WEIGHTS_NAME}, {TF_WEIGHTS_NAME} or {FLAX_WEIGHTS_NAME}." + ) + + except EnvironmentError: + # Raise any environment error raise by `cached_file`. It will have a helpful error message adapted + # to the original exception. + raise + except Exception as e: + # For any other exception, we throw a generic error. + raise EnvironmentError( + f"Can't load the model for '{pretrained_model_name_or_path}'. If you were trying to load it" + " from 'https://huggingface.co/models', make sure you don't have a local directory with the" + f" same name. Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a" + f" directory containing a file named {_add_variant(WEIGHTS_NAME, variant)}," + f" {TF2_WEIGHTS_NAME}, {TF_WEIGHTS_NAME} or {FLAX_WEIGHTS_NAME}." + ) from e + + if is_local: + logger.info(f"loading weights file {archive_file}") + resolved_archive_file = archive_file + else: + logger.info(f"loading weights file {filename} from cache at {resolved_archive_file}") + + elif gguf_file: + # Case 1: the GGUF file is present locally + if os.path.isfile(gguf_file): + resolved_archive_file = gguf_file + # Case 2: The GGUF path is a location on the Hub + # Load from URL or cache if already cached + else: + cached_file_kwargs = { + "cache_dir": cache_dir, + "force_download": force_download, + "proxies": proxies, + "local_files_only": local_files_only, + "token": token, + "user_agent": user_agent, + "revision": revision, + "subfolder": subfolder, + "_raise_exceptions_for_gated_repo": False, + "_raise_exceptions_for_missing_entries": False, + "_commit_hash": commit_hash, + } + + resolved_archive_file = cached_file(pretrained_model_name_or_path, gguf_file, **cached_file_kwargs) + + # We now download and resolve all checkpoint files if the checkpoint is sharded + sharded_metadata = None + if is_sharded: + checkpoint_files, sharded_metadata = get_checkpoint_shard_files( + pretrained_model_name_or_path, + resolved_archive_file, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + local_files_only=local_files_only, + token=token, + user_agent=user_agent, + revision=revision, + subfolder=subfolder, + _commit_hash=commit_hash, + ) + else: + checkpoint_files = [resolved_archive_file] if pretrained_model_name_or_path is not None else None + + return checkpoint_files, sharded_metadata + + +def _get_torch_dtype( + cls, + torch_dtype: Optional[Union[str, torch.dtype, Dict]], + checkpoint_files: Optional[List[str]], + config: PretrainedConfig, + sharded_metadata: Optional[Dict], + state_dict: Optional[Dict], + weights_only: bool, +) -> Tuple[PretrainedConfig, Optional[torch.dtype], Optional[torch.dtype]]: + """Find the correct `torch_dtype` to use based on provided arguments. Also update the `config` based on the + inferred dtype. We do the following: + 1. If torch_dtype is not None, we use that dtype + 2. If torch_dtype is "auto", we auto-detect dtype from the loaded state_dict, by checking its first + weights entry that is of a floating type - we assume all floating dtype weights are of the same dtype + we also may have config.torch_dtype available, but we won't rely on it till v5 + """ + dtype_orig = None + is_sharded = sharded_metadata is not None + + if torch_dtype is not None: + if isinstance(torch_dtype, str): + if torch_dtype == "auto": + if hasattr(config, "torch_dtype") and config.torch_dtype is not None: + torch_dtype = config.torch_dtype + logger.info(f"Will use torch_dtype={torch_dtype} as defined in model's config object") + else: + if is_sharded and "dtype" in sharded_metadata: + torch_dtype = sharded_metadata["dtype"] + elif state_dict is not None: + torch_dtype = get_state_dict_dtype(state_dict) + else: + state_dict = load_state_dict( + checkpoint_files[0], map_location="meta", weights_only=weights_only + ) + torch_dtype = get_state_dict_dtype(state_dict) + logger.info( + "Since the `torch_dtype` attribute can't be found in model's config object, " + "will use torch_dtype={torch_dtype} as derived from model's weights" + ) + elif hasattr(torch, torch_dtype): + torch_dtype = getattr(torch, torch_dtype) + config.torch_dtype = torch_dtype + for sub_config_key in config.sub_configs.keys(): + sub_config = getattr(config, sub_config_key) + sub_config.torch_dtype = torch_dtype + elif isinstance(torch_dtype, torch.dtype): + config.torch_dtype = torch_dtype + for sub_config_key in config.sub_configs.keys(): + sub_config = getattr(config, sub_config_key) + sub_config.torch_dtype = torch_dtype + elif isinstance(torch_dtype, dict): + for key, curr_dtype in torch_dtype.items(): + if hasattr(config, key): + value = getattr(config, key) + curr_dtype = curr_dtype if not isinstance(curr_dtype, str) else getattr(torch, curr_dtype) + value.torch_dtype = curr_dtype + # main torch dtype for modules that aren't part of any sub-config + torch_dtype = torch_dtype.get("") + torch_dtype = torch_dtype if not isinstance(torch_dtype, str) else getattr(torch, torch_dtype) + config.torch_dtype = torch_dtype + if torch_dtype is None: + torch_dtype = torch.float32 + else: + raise ValueError( + f"`torch_dtype` can be one of: `torch.dtype`, `'auto'`, a string of a valid `torch.dtype` or a `dict` with valid `torch_dtype` " + f"for each sub-config in composite configs, but received {torch_dtype}" + ) + + dtype_orig = cls._set_default_torch_dtype(torch_dtype) + else: + # set fp32 as the default dtype for BC + default_dtype = torch.get_default_dtype() + config.torch_dtype = default_dtype + for key in config.sub_configs.keys(): + value = getattr(config, key) + value.torch_dtype = default_dtype + + return config, torch_dtype, dtype_orig + + +def _get_device_map( + model: "PreTrainedModel", + device_map: Optional[Union[str, Dict]], + max_memory: Optional[Dict], + hf_quantizer: Optional[HfQuantizer], + torch_dtype: Optional[torch.dtype], + keep_in_fp32_regex: Optional[re.Pattern], +) -> Dict: + """Compute the final `device_map` to use if we passed a value in ['auto', 'balanced', 'balanced_low_0', 'sequential']. + Otherwise, we check for any device inconsistencies in the device_map. + """ + if isinstance(device_map, str): + special_dtypes = {} + if hf_quantizer is not None: + special_dtypes.update(hf_quantizer.get_special_dtypes_update(model, torch_dtype)) + if keep_in_fp32_regex is not None: + special_dtypes.update( + {name: torch.float32 for name, _ in model.named_parameters() if keep_in_fp32_regex.search(name)} + ) + + target_dtype = torch_dtype + + if hf_quantizer is not None: + target_dtype = hf_quantizer.adjust_target_dtype(target_dtype) + + no_split_modules = model._get_no_split_modules(device_map) + device_map_kwargs = {"no_split_module_classes": no_split_modules} + + if "special_dtypes" in inspect.signature(infer_auto_device_map).parameters: + device_map_kwargs["special_dtypes"] = special_dtypes + elif len(special_dtypes) > 0: + logger.warning( + "This model has some weights that should be kept in higher precision, you need to upgrade " + "`accelerate` to properly deal with them (`pip install --upgrade accelerate`)." + ) + + if device_map != "sequential": + max_memory = get_balanced_memory( + model, + dtype=target_dtype, + low_zero=(device_map == "balanced_low_0"), + max_memory=max_memory, + **device_map_kwargs, + ) + else: + max_memory = get_max_memory(max_memory) + if hf_quantizer is not None: + max_memory = hf_quantizer.adjust_max_memory(max_memory) + device_map_kwargs["max_memory"] = max_memory + + device_map = infer_auto_device_map(model, dtype=target_dtype, **device_map_kwargs) + + if hf_quantizer is not None: + hf_quantizer.validate_environment(device_map=device_map) + + elif device_map is not None: + tied_params = find_tied_parameters(model) + # check if we don't have tied param in different devices + check_tied_parameters_on_same_device(tied_params, device_map) + + return device_map + + +def _find_missing_and_unexpected_keys( + cls, + model: "PreTrainedModel", + original_checkpoint_keys: List[str], + checkpoint_keys: List[str], + loading_base_model_from_task_state_dict: bool, + hf_quantizer: Optional[HfQuantizer], + device_map: Dict, +) -> Tuple[List[str], List[str]]: + """Find missing keys (keys that are part of the model parameters but were NOT found in the loaded state dict keys) and unexpected keys + (keys found in the loaded state dict keys, but that are NOT part of the model parameters) + """ + prefix = model.base_model_prefix + + # Compute expected keys, i.e. keys that the FULL model (not model_to_load) expects + expected_keys = list(model.state_dict().keys()) + if hf_quantizer is not None: + expected_keys = hf_quantizer.update_expected_keys(model, expected_keys, checkpoint_keys) + + # Adjust prefix of the keys to make them match loaded keys before removing them + missing_keys = sorted(set(expected_keys) - set(checkpoint_keys)) + unexpected_keys = set(checkpoint_keys) - set(expected_keys) + # If a module has the same name under the base and task specific model, we have to re-add it to unexpected keys + if loading_base_model_from_task_state_dict: + task_specific_keys = [k for k in original_checkpoint_keys if not k.startswith(f"{prefix}.")] + unexpected_keys.update(task_specific_keys) + + # Remove nonpersistent buffers from unexpected keys: they are not in the expected keys (model state dict), but + # may be in the loaded keys. Note that removing all buffers does the job, as they were part of the expected keys anyway + model_buffers = {n for n, _ in model.named_buffers()} + unexpected_keys = sorted(unexpected_keys - model_buffers) + + # Old checkpoints may have keys for rotary_emb.inv_freq for each layer, however we moved this buffer to the main model + # (so the buffer name has changed). Remove them in such a case + has_inv_freq_buffers = any(buffer.endswith("rotary_emb.inv_freq") for buffer in model_buffers) + if has_inv_freq_buffers: + unexpected_keys = [k for k in unexpected_keys if "rotary_emb.inv_freq" not in k] + + tied_params = find_tied_parameters(model) + for group in tied_params: + missing_in_group = [k for k in missing_keys if k in group] + if len(missing_in_group) > 0 and len(missing_in_group) < len(group): + missing_keys = [k for k in missing_keys if k not in missing_in_group] + + if hf_quantizer is not None: + missing_keys = hf_quantizer.update_missing_keys(model, missing_keys, prefix) + unexpected_keys = hf_quantizer.update_unexpected_keys(model, unexpected_keys, prefix) + + # Model-specific exceptions for missing and unexpected keys (e.g. if the modeling change over time, or any other reason...) + if cls._keys_to_ignore_on_load_missing is not None: + for pattern in cls._keys_to_ignore_on_load_missing: + missing_keys = [k for k in missing_keys if re.search(pattern, k) is None] + + if cls._keys_to_ignore_on_load_unexpected is not None: + for pattern in cls._keys_to_ignore_on_load_unexpected: + unexpected_keys = [k for k in unexpected_keys if re.search(pattern, k) is None] + + return missing_keys, unexpected_keys + + +def _find_mismatched_keys( + model: "PreTrainedModel", + state_dict: Optional[Dict], + checkpoint_files: Optional[List[str]], + ignore_mismatched_sizes: bool, + keys_to_rename_mapping: Dict[str, str], + is_quantized: bool, + weights_only: bool, +) -> Tuple[List[str], List[Tuple[int, int]]]: + """ + Find potential shape mismatch between the different state dicts and the model parameters, but only if `ignore_mismatched_sizes` + is True. Otherwise, return immediately and any shape mismatch that may exist will be raised later on. This avoids checking + every parameter in advance, as shape mismatch are extremely rare in practice. If we want to ignore them however, we do + need to check in advance as we need to know which parameters we need to move back from meta to cpu, and initialize + correctly. Indeed, as our model initialization takes place at the module level, and not the weight level, in the + case of a sharded checkpoint we cannot correctly initialize the weights according to `model._init_weights()` if we perform + this check on each state dict at loading time (after the first loaded checkpoint, there are no way to initialize only the + mismatched weights if any, without overwriting the previously loaded weights as well because all the module will be + initialized, not only the weights that are mismatched). + """ + + # An error will be raised later on anyway if there is a mismatch - this avoids running the rest of this function + # if there are no mismatch (which is almost always the case) + if not ignore_mismatched_sizes: + return [], [] + + if state_dict is not None: + checkpoint_files = [""] + + model_state_dict = model.state_dict() + mismatched_keys = [] + mismatched_shapes = [] + for shard_file in checkpoint_files: + # If shard_file is "", we use the existing state_dict instead of loading it + if shard_file != "": + state_dict = load_state_dict( + shard_file, is_quantized=is_quantized, map_location="meta", weights_only=weights_only + ) + + # Fix the key names + new_state_dict = {keys_to_rename_mapping[k]: v for k, v in state_dict.items() if k in keys_to_rename_mapping} + + for key in new_state_dict.keys(): + if key in model_state_dict and new_state_dict[key].shape != model_state_dict[key].shape: + # This skips size mismatches for 4-bit weights. Two 4-bit values share an 8-bit container, causing size differences. + # Without matching with module type or paramter type it seems like a practical way to detect valid 4bit weights. + if not ( + new_state_dict[key].shape[-1] == 1 + and new_state_dict[key].numel() * 2 == model_state_dict[key].numel() + ): + mismatched_keys.append(key) + mismatched_shapes.append((new_state_dict[key].shape, model_state_dict[key].shape)) + + return mismatched_keys, mismatched_shapes + + +class PipelineParallel(Enum): + inputs: 0 + outputs: 1 + + +class ModuleUtilsMixin: + """ + A few utilities for `torch.nn.Modules`, to be used as a mixin. + """ + + @staticmethod + def _hook_rss_memory_pre_forward(module, *args, **kwargs): + try: + import psutil + except ImportError: + raise ImportError("You need to install psutil (pip install psutil) to use memory tracing.") + + process = psutil.Process(os.getpid()) + mem = process.memory_info() + module.mem_rss_pre_forward = mem.rss + return None + + @staticmethod + def _hook_rss_memory_post_forward(module, *args, **kwargs): + try: + import psutil + except ImportError: + raise ImportError("You need to install psutil (pip install psutil) to use memory tracing.") + + process = psutil.Process(os.getpid()) + mem = process.memory_info() + module.mem_rss_post_forward = mem.rss + mem_rss_diff = module.mem_rss_post_forward - module.mem_rss_pre_forward + module.mem_rss_diff = mem_rss_diff + (module.mem_rss_diff if hasattr(module, "mem_rss_diff") else 0) + return None + + def add_memory_hooks(self): + """ + Add a memory hook before and after each sub-module forward pass to record increase in memory consumption. + + Increase in memory consumption is stored in a `mem_rss_diff` attribute for each module and can be reset to zero + with `model.reset_memory_hooks_state()`. + """ + for module in self.modules(): + module.register_forward_pre_hook(self._hook_rss_memory_pre_forward) + module.register_forward_hook(self._hook_rss_memory_post_forward) + self.reset_memory_hooks_state() + + def reset_memory_hooks_state(self): + """ + Reset the `mem_rss_diff` attribute of each module (see [`~modeling_utils.ModuleUtilsMixin.add_memory_hooks`]). + """ + for module in self.modules(): + module.mem_rss_diff = 0 + module.mem_rss_post_forward = 0 + module.mem_rss_pre_forward = 0 + + @property + def device(self) -> torch.device: + """ + `torch.device`: The device on which the module is (assuming that all the module parameters are on the same + device). + """ + return get_parameter_device(self) + + @property + def dtype(self) -> torch.dtype: + """ + `torch.dtype`: The dtype of the module (assuming that all the module parameters have the same dtype). + """ + return get_parameter_dtype(self) + + def invert_attention_mask(self, encoder_attention_mask: Tensor) -> Tensor: + """ + Invert an attention mask (e.g., switches 0. and 1.). + + Args: + encoder_attention_mask (`torch.Tensor`): An attention mask. + + Returns: + `torch.Tensor`: The inverted attention mask. + """ + if encoder_attention_mask.dim() == 3: + encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :] + if encoder_attention_mask.dim() == 2: + encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :] + # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition + # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow + # /transformer/transformer_layers.py#L270 + # encoder_extended_attention_mask = (encoder_extended_attention_mask == + # encoder_extended_attention_mask.transpose(-1, -2)) + encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility + encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * torch.finfo(self.dtype).min + + return encoder_extended_attention_mask + + @staticmethod + def create_extended_attention_mask_for_decoder(input_shape, attention_mask, device=None): + if device is not None: + warnings.warn( + "The `device` argument is deprecated and will be removed in v5 of Transformers.", FutureWarning + ) + else: + device = attention_mask.device + batch_size, seq_length = input_shape + seq_ids = torch.arange(seq_length, device=device) + causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None] + # in case past_key_values are used we need to add a prefix ones mask to the causal mask + causal_mask = causal_mask.to(attention_mask.dtype) + + if causal_mask.shape[1] < attention_mask.shape[1]: + prefix_seq_len = attention_mask.shape[1] - causal_mask.shape[1] + causal_mask = torch.cat( + [ + torch.ones((batch_size, seq_length, prefix_seq_len), device=device, dtype=causal_mask.dtype), + causal_mask, + ], + axis=-1, + ) + + extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :] + return extended_attention_mask + + def get_extended_attention_mask( + self, attention_mask: Tensor, input_shape: Tuple[int], device: torch.device = None, dtype: torch.float = None + ) -> Tensor: + """ + Makes broadcastable attention and causal masks so that future and masked tokens are ignored. + + Arguments: + attention_mask (`torch.Tensor`): + Mask with ones indicating tokens to attend to, zeros for tokens to ignore. + input_shape (`Tuple[int]`): + The shape of the input to the model. + + Returns: + `torch.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`. + """ + if dtype is None: + dtype = self.dtype + + if not (attention_mask.dim() == 2 and self.config.is_decoder): + # show warning only if it won't be shown in `create_extended_attention_mask_for_decoder` + if device is not None: + warnings.warn( + "The `device` argument is deprecated and will be removed in v5 of Transformers.", FutureWarning + ) + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + if attention_mask.dim() == 3: + extended_attention_mask = attention_mask[:, None, :, :] + elif attention_mask.dim() == 2: + # Provided a padding mask of dimensions [batch_size, seq_length] + # - if the model is a decoder, apply a causal mask in addition to the padding mask + # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder: + extended_attention_mask = ModuleUtilsMixin.create_extended_attention_mask_for_decoder( + input_shape, attention_mask, device + ) + else: + extended_attention_mask = attention_mask[:, None, None, :] + else: + raise ValueError( + f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})" + ) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and the dtype's smallest value for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + extended_attention_mask = extended_attention_mask.to(dtype=dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * torch.finfo(dtype).min + return extended_attention_mask + + def get_head_mask( + self, head_mask: Optional[Tensor], num_hidden_layers: int, is_attention_chunked: bool = False + ) -> Tensor: + """ + Prepare the head mask if needed. + + Args: + head_mask (`torch.Tensor` with shape `[num_heads]` or `[num_hidden_layers x num_heads]`, *optional*): + The mask indicating if we should keep the heads or not (1.0 for keep, 0.0 for discard). + num_hidden_layers (`int`): + The number of hidden layers in the model. + is_attention_chunked (`bool`, *optional*, defaults to `False`): + Whether or not the attentions scores are computed by chunks or not. + + Returns: + `torch.Tensor` with shape `[num_hidden_layers x batch x num_heads x seq_length x seq_length]` or list with + `[None]` for each layer. + """ + if head_mask is not None: + head_mask = self._convert_head_mask_to_5d(head_mask, num_hidden_layers) + if is_attention_chunked is True: + head_mask = head_mask.unsqueeze(-1) + else: + head_mask = [None] * num_hidden_layers + + return head_mask + + def _convert_head_mask_to_5d(self, head_mask, num_hidden_layers): + """-> [num_hidden_layers x batch x num_heads x seq_length x seq_length]""" + if head_mask.dim() == 1: + head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1) + head_mask = head_mask.expand(num_hidden_layers, -1, -1, -1, -1) + elif head_mask.dim() == 2: + head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1) # We can specify head_mask for each layer + assert head_mask.dim() == 5, f"head_mask.dim != 5, instead {head_mask.dim()}" + head_mask = head_mask.to(dtype=self.dtype) # switch to float if need + fp16 compatibility + return head_mask + + def num_parameters(self, only_trainable: bool = False, exclude_embeddings: bool = False) -> int: + """ + Get number of (optionally, trainable or non-embeddings) parameters in the module. + + Args: + only_trainable (`bool`, *optional*, defaults to `False`): + Whether or not to return only the number of trainable parameters + + exclude_embeddings (`bool`, *optional*, defaults to `False`): + Whether or not to return only the number of non-embeddings parameters + + Returns: + `int`: The number of parameters. + """ + + if exclude_embeddings: + embedding_param_names = [ + f"{name}.weight" for name, module_type in self.named_modules() if isinstance(module_type, nn.Embedding) + ] + total_parameters = [ + parameter for name, parameter in self.named_parameters() if name not in embedding_param_names + ] + else: + total_parameters = list(self.parameters()) + + total_numel = [] + is_loaded_in_4bit = getattr(self, "is_loaded_in_4bit", False) + + if is_loaded_in_4bit: + if is_bitsandbytes_available(): + import bitsandbytes as bnb + else: + raise ValueError( + "bitsandbytes is not installed but it seems that the model has been loaded in 4bit precision, something went wrong" + " make sure to install bitsandbytes with `pip install bitsandbytes`. You also need a GPU. " + ) + + for param in total_parameters: + if param.requires_grad or not only_trainable: + # For 4bit models, we need to multiply the number of parameters by 2 as half of the parameters are + # used for the 4bit quantization (uint8 tensors are stored) + if is_loaded_in_4bit and isinstance(param, bnb.nn.Params4bit): + if hasattr(param, "element_size"): + num_bytes = param.element_size() + elif hasattr(param, "quant_storage"): + num_bytes = param.quant_storage.itemsize + else: + num_bytes = 1 + total_numel.append(param.numel() * 2 * num_bytes) + else: + total_numel.append(param.numel()) + + return sum(total_numel) + + def estimate_tokens(self, input_dict: Dict[str, Union[torch.Tensor, Any]]) -> int: + """ + Helper function to estimate the total number of tokens from the model inputs. + + Args: + inputs (`dict`): The model inputs. + + Returns: + `int`: The total number of tokens. + """ + if not hasattr(self, "warnings_issued"): + self.warnings_issued = {} + if self.main_input_name in input_dict: + return input_dict[self.main_input_name].numel() + elif "estimate_tokens" not in self.warnings_issued: + logger.warning( + "Could not estimate the number of tokens of the input, floating-point operations will not be computed" + ) + self.warnings_issued["estimate_tokens"] = True + return 0 + + def floating_point_ops( + self, input_dict: Dict[str, Union[torch.Tensor, Any]], exclude_embeddings: bool = True + ) -> int: + """ + Get number of (optionally, non-embeddings) floating-point operations for the forward and backward passes of a + batch with this transformer model. Default approximation neglects the quadratic dependency on the number of + tokens (valid if `12 * d_model << sequence_length`) as laid out in [this + paper](https://arxiv.org/pdf/2001.08361.pdf) section 2.1. Should be overridden for transformers with parameter + re-use e.g. Albert or Universal Transformers, or if doing long-range modeling with very high sequence lengths. + + Args: + batch_size (`int`): + The batch size for the forward pass. + + sequence_length (`int`): + The number of tokens in each line of the batch. + + exclude_embeddings (`bool`, *optional*, defaults to `True`): + Whether or not to count embedding and softmax operations. + + Returns: + `int`: The number of floating-point operations. + """ + + return 6 * self.estimate_tokens(input_dict) * self.num_parameters(exclude_embeddings=exclude_embeddings) + + +class PreTrainedModel(nn.Module, ModuleUtilsMixin, PushToHubMixin, PeftAdapterMixin): + r""" + Base class for all models. + + [`PreTrainedModel`] takes care of storing the configuration of the models and handles methods for loading, + downloading and saving models as well as a few methods common to all models to: + + - resize the input embeddings, + - prune heads in the self-attention heads. + + Class attributes (overridden by derived classes): + + - **config_class** ([`PretrainedConfig`]) -- A subclass of [`PretrainedConfig`] to use as configuration class + for this model architecture. + - **load_tf_weights** (`Callable`) -- A python *method* for loading a TensorFlow checkpoint in a PyTorch model, + taking as arguments: + + - **model** ([`PreTrainedModel`]) -- An instance of the model on which to load the TensorFlow checkpoint. + - **config** ([`PreTrainedConfig`]) -- An instance of the configuration associated to the model. + - **path** (`str`) -- A path to the TensorFlow checkpoint. + + - **base_model_prefix** (`str`) -- A string indicating the attribute associated to the base model in derived + classes of the same architecture adding modules on top of the base model. + - **is_parallelizable** (`bool`) -- A flag indicating whether this model supports model parallelization. + - **main_input_name** (`str`) -- The name of the principal input to the model (often `input_ids` for NLP + models, `pixel_values` for vision models and `input_values` for speech models). + """ + + config_class = None + base_model_prefix = "" + main_input_name = "input_ids" + model_tags = None + + _auto_class = None + _no_split_modules = None + _skip_keys_device_placement = None + _keep_in_fp32_modules = None + + # a list of `re` patterns of `state_dict` keys that should be removed from the list of missing + # keys we find (keys inside the model but not in the checkpoint) and avoid unnecessary warnings. + _keys_to_ignore_on_load_missing = None + # a list of `re` patterns of `state_dict` keys that should be removed from the list of + # unexpected keys we find (keys inside the checkpoint but not the model) and avoid unnecessary + # warnings. + _keys_to_ignore_on_load_unexpected = None + # a list of `state_dict` keys to ignore when saving the model (useful for keys that aren't + # trained, but which are either deterministic or tied variables) + _keys_to_ignore_on_save = None + # a list of `state_dict` keys that are potentially tied to another key in the state_dict. + _tied_weights_keys = None + + is_parallelizable = False + supports_gradient_checkpointing = False + _is_stateful = False + + # Flash Attention 2 support + _supports_flash_attn_2 = False + + # SDPA support + _supports_sdpa = False + + # Flex Attention support + _supports_flex_attn = False + + # Has support for a `Cache` instance as `past_key_values`? Does it support a `StaticCache`? + _supports_cache_class = False + _supports_static_cache = False + + # Has support for a `QuantoQuantizedCache` instance as `past_key_values` + _supports_quantized_cache = False + + # A tensor parallel plan to be applied to the model when TP is enabled. For + # top-level models, this attribute is currently defined in respective model + # code. For base models, this attribute comes from + # `config.base_model_tp_plan` during `__init__`. + # It should identify the layers exactly: if you want to TP model.language_model.layers.fc1 + # by passing `tp_plan` to the init, it should be {"model.language_model.layers.fc1":"colwise"} + # for example. + _tp_plan = None + + # tensor parallel degree to which model is sharded to. + _tp_size = None + + # A pipeline parallel plan specifying the layers which may not be present + # on all ranks when PP is enabled. For top-level models, this attribute is + # currently defined in respective model code. For base models, this + # attribute comes from `config.base_model_pp_plan` during `post_init`. + # + # The variable names for the inputs and outputs of the specified layers can + # be indexed using the `PipelineParallel` enum as follows: + # - `_pp_plan["layers"][PipelineParallel.inputs]` + # - `_pp_plan["layers"][PipelineParallel.outputs]` + _pp_plan = None + + # This flag signal that the model can be used as an efficient backend in TGI and vLLM + # In practice, it means that they support attention interface functions, fully pass the kwargs + # through all modules up to the Attention layer, can slice logits with Tensor, and have a default TP plan + _supports_attention_backend = False + + @property + def dummy_inputs(self) -> Dict[str, torch.Tensor]: + """ + `Dict[str, torch.Tensor]`: Dummy inputs to do a forward pass in the network. + """ + return {"input_ids": torch.tensor(DUMMY_INPUTS)} + + @property + def framework(self) -> str: + """ + :str: Identifies that this is a PyTorch model. + """ + return "pt" + + def __init__(self, config: PretrainedConfig, *inputs, **kwargs): + super().__init__() + if not isinstance(config, PretrainedConfig): + raise ValueError( + f"Parameter config in `{self.__class__.__name__}(config)` should be an instance of class " + "`PretrainedConfig`. To create a model from a pretrained model use " + f"`model = {self.__class__.__name__}.from_pretrained(PRETRAINED_MODEL_NAME)`" + ) + if not getattr(config, "_attn_implementation_autoset", False): + # config usually has a `torch_dtype` but we need the next line for the `no_super_init` tests + dtype = config.torch_dtype if hasattr(config, "torch_dtype") else torch.get_default_dtype() + config = self._autoset_attn_implementation(config, torch_dtype=dtype, check_device_map=False) + self.config = config + + # for initialization of the loss + loss_type = self.__class__.__name__ + if loss_type not in LOSS_MAPPING: + loss_groups = f"({'|'.join(LOSS_MAPPING)})" + loss_type = re.findall(loss_groups, self.__class__.__name__) + if len(loss_type) > 0: + loss_type = loss_type[0] + else: + loss_type = None + self.loss_type = loss_type + + self.name_or_path = config.name_or_path + self.warnings_issued = {} + self.generation_config = GenerationConfig.from_model_config(config) if self.can_generate() else None + # Overwrite the class attribute to make it an instance attribute, so models like + # `InstructBlipForConditionalGeneration` can dynamically update it without modifying the class attribute + # when a different component (e.g. language_model) is used. + self._keep_in_fp32_modules = copy.copy(self.__class__._keep_in_fp32_modules) + + self._no_split_modules = self._no_split_modules or [] + + def post_init(self): + """ + A method executed at the end of each Transformer model initialization, to execute code that needs the model's + modules properly initialized (such as weight initialization). + """ + self.init_weights() + self._backward_compatibility_gradient_checkpointing() + + # Make sure the modules correctly exist if the flag is active + if self._keep_in_fp32_modules is not None: + all_parameters = {name for name, _ in self.named_parameters() if len(name) > 0} + unique_module_names = set() + # Get all unique module names in the module graph, without the prefixes + for param in all_parameters: + unique_module_names.update( + [name for name in param.split(".") if not name.isnumeric() and name not in ["weight", "bias"]] + ) + # Check that every module in the keep_in_fp32 list is part of the module graph + for module in self._keep_in_fp32_modules: + if module not in unique_module_names: + raise ValueError( + f"{module} was specified in the `_keep_in_fp32_modules` list, but is not part of the modules in" + f" {self.__class__.__name__}" + ) + + # If current model is a base model, attach `base_model_tp_plan` and `base_model_pp_plan` from config + self._pp_plan = self.config.base_model_pp_plan.copy() if self.config.base_model_pp_plan is not None else None + self._tp_plan = self.config.base_model_tp_plan.copy() if self.config.base_model_tp_plan is not None else {} + for name, module in self.named_children(): + if plan := getattr(module, "_tp_plan", None): + self._tp_plan.update({f"{name}.{k}": v for k, v in plan.copy().items()}) + + if self._tp_plan is not None and is_torch_greater_or_equal("2.3"): + for _, v in self._tp_plan.items(): + if v not in SUPPORTED_TP_STYLES: + raise ValueError( + f"Unsupported tensor parallel style {v}. Supported styles are {SUPPORTED_TP_STYLES}" + ) + + def dequantize(self): + """ + Potentially dequantize the model in case it has been quantized by a quantization method that support + dequantization. + """ + hf_quantizer = getattr(self, "hf_quantizer", None) + + if hf_quantizer is None: + raise ValueError("You need to first quantize your model in order to dequantize it") + + return hf_quantizer.dequantize(self) + + def _backward_compatibility_gradient_checkpointing(self): + if self.supports_gradient_checkpointing and getattr(self.config, "gradient_checkpointing", False): + self.gradient_checkpointing_enable() + # Remove the attribute now that is has been consumed, so it's no saved in the config. + delattr(self.config, "gradient_checkpointing") + + def add_model_tags(self, tags: Union[List[str], str]) -> None: + r""" + Add custom tags into the model that gets pushed to the Hugging Face Hub. Will + not overwrite existing tags in the model. + + Args: + tags (`Union[List[str], str]`): + The desired tags to inject in the model + + Examples: + + ```python + from transformers import AutoModel + + model = AutoModel.from_pretrained("google-bert/bert-base-cased") + + model.add_model_tags(["custom", "custom-bert"]) + + # Push the model to your namespace with the name "my-custom-bert". + model.push_to_hub("my-custom-bert") + ``` + """ + if isinstance(tags, str): + tags = [tags] + + if self.model_tags is None: + self.model_tags = [] + + for tag in tags: + if tag not in self.model_tags: + self.model_tags.append(tag) + + @classmethod + @restore_default_torch_dtype + def _from_config(cls, config, **kwargs): + """ + All context managers that the model should be initialized under go here. + + Args: + torch_dtype (`torch.dtype`, *optional*): + Override the default `torch.dtype` and load the model under this dtype. + """ + # when we init a model from within another model (e.g. VLMs) and dispatch on FA2 + # a warning is raised that dtype should be fp16. Since we never pass dtype from within + # modeling code, we can try to infer it here same way as done in `from_pretrained` + torch_dtype = kwargs.pop("torch_dtype", config.torch_dtype) + if isinstance(torch_dtype, str): + torch_dtype = getattr(torch, torch_dtype) + + use_flash_attention_2 = kwargs.pop("use_flash_attention_2", False) + + # override default dtype if needed + dtype_orig = None + if torch_dtype is not None: + dtype_orig = cls._set_default_torch_dtype(torch_dtype) + + config = copy.deepcopy(config) # We do not want to modify the config inplace in _from_config. + + if config._attn_implementation_internal is not None: + # In this case, the config has been created with the attn_implementation set by the user, which we + # should respect. + attn_implementation = config._attn_implementation_internal + else: + attn_implementation = None + + config._attn_implementation = kwargs.pop("attn_implementation", attn_implementation) + if not getattr(config, "_attn_implementation_autoset", False): + config = cls._autoset_attn_implementation( + config, + use_flash_attention_2=use_flash_attention_2, + check_device_map=False, + torch_dtype=torch_dtype, + ) + + if is_deepspeed_zero3_enabled() and not _is_quantized and not _is_ds_init_called: + logger.info("Detected DeepSpeed ZeRO-3: activating zero.init() for this model") + # this immediately partitions the model across all gpus, to avoid the overhead in time + # and memory copying it on CPU or each GPU first + init_contexts = [deepspeed.zero.Init(config_dict_or_path=deepspeed_config()), set_zero3_state()] + with ContextManagers(init_contexts): + model = cls(config, **kwargs) + + else: + model = cls(config, **kwargs) + + # restore default dtype if it was modified + if dtype_orig is not None: + torch.set_default_dtype(dtype_orig) + + return model + + @classmethod + def _autoset_attn_implementation( + cls, + config, + use_flash_attention_2: bool = False, + torch_dtype: Optional[torch.dtype] = None, + device_map: Optional[Union[str, Dict[str, int]]] = None, + check_device_map: bool = True, + ): + """ + Automatically checks and dispatches to a default attention implementation. In order of priority: + 1. An implementation specified in `config._attn_implementation` (due for example to the argument attn_implementation="sdpa" in from_pretrained). + 2. DEPRECATED: if use_flash_attention_2 is set to `True` and `flash_attn` is available, flash attention. (`LlamaFlashAttention` for example) + 3. SDPA implementation, if available and supported by the model type. (`LlamaSdpaAttention` for example) + 4. The default model's implementation otherwise (`LlamaAttention` for example) . + """ + # Here we use config._attn_implementation_internal to check whether the attention implementation was explicitly set by the user. + # The property `PretrainedConfig._attn_implementation` is never `None`, for backward compatibility (always fall back on "eager"). + # The `hasattr` here is used as some Transformers tests for some reason do not call PretrainedConfig __init__ (e.g. test_no_super_init_config_and_model) + requested_attn_implementation = None + if hasattr(config, "_attn_implementation_internal") and config._attn_implementation_internal is not None: + if config._attn_implementation != "flash_attention_2" and use_flash_attention_2: + raise ValueError( + f'Both attn_implementation="{config._attn_implementation}" and `use_flash_attention_2=True` were used when loading the model, which are not compatible.' + ' We recommend to just use `attn_implementation="flash_attention_2"` when loading the model.' + ) + + if isinstance(config._attn_implementation, str) and re.match( + r"^[^/:]+/[^/:]+:[^/:]+$", config._attn_implementation + ): + if not is_kernels_available(): + raise ValueError("kernels is not installed. Please install it with `pip install kernels`.") + + # Extract repo_id and kernel_name from the string + repo_id, kernel_name = config._attn_implementation.split(":") + kernel_name = kernel_name.strip() + repo_id = repo_id.strip() + + try: + kernel = get_kernel(repo_id) + ALL_ATTENTION_FUNCTIONS.register( + f"kernel_{repo_id.replace('/', '_')}", getattr(kernel, kernel_name) + ) + config._attn_implementation = f"kernel_{repo_id.replace('/', '_')}" + except FileNotFoundError as e: + logger.warning( + f"Could not find a kernel repository '{repo_id}' compatible with your devicein the hub: {e}. Using eager attention implementation instead." + ) + config._attn_implementation = "eager" + except AttributeError: + raise ValueError( + "the kernel function name or class specified in the attn_implementation argument is not valid. \ + Please check the documentation for the correct format, \ + and check that the kernel exports the class and the function correctly." + ) + + if ( + not isinstance(config._attn_implementation, dict) + and config._attn_implementation not in ["eager"] + ALL_ATTENTION_FUNCTIONS.valid_keys() + ): + message = f'Specified `attn_implementation="{config._attn_implementation}"` is not supported. The only possible arguments are `attn_implementation="eager"` (manual attention implementation)' + if cls._supports_flash_attn_2: + message += ', `"attn_implementation=flash_attention_2"` (implementation using flash attention 2)' + if cls._supports_sdpa: + message += ', `"attn_implementation=sdpa"` (implementation using torch.nn.functional.scaled_dot_product_attention)' + if cls._supports_flex_attn: + message += ( + ', `"attn_implementation=flex_attention"` (implementation using torch\'s flex_attention)' + ) + raise ValueError(message + ".") + + # If a config is passed with a preset attn_implementation, we skip the automatic dispatch and use the user-provided config, with hard checks that the requested attention implementation is available. + requested_attn_implementation = config._attn_implementation_internal + + # Composite models consisting of several PretrainedModels have to specify attention impl as a dict + # where keys are sub-config names. But most people will specify one `str` which means that should dispatch it + # for all sub-models. + # Below we check if a config is composite and manually prepare a dict of attn impl if not already passed as a dict. + # Later each sub-module will dispatch with its own attn impl, by calling `XXXModel._from_config(config.text_config)` + # If any of sub-modules doesn't support requested attn, an error will be raised. See https://github.com/huggingface/transformers/pull/32238 + for key in config.sub_configs.keys(): + sub_config = getattr(config, key) + curr_attn_implementation = ( + requested_attn_implementation + if not isinstance(requested_attn_implementation, dict) + else requested_attn_implementation.get(key, None) + ) + # For models with backbone sub-config might be not initialized + if sub_config is not None: + sub_config._attn_implementation_internal = curr_attn_implementation + + if use_flash_attention_2: + logger.warning_once( + 'The model was loaded with use_flash_attention_2=True, which is deprecated and may be removed in a future release. Please use `attn_implementation="flash_attention_2"` instead.' + ) + config._attn_implementation = "flash_attention_2" + + if config._attn_implementation == "flash_attention_2": + cls._check_and_enable_flash_attn_2( + config, + torch_dtype=torch_dtype, + device_map=device_map, + hard_check_only=False, + check_device_map=check_device_map, + ) + elif requested_attn_implementation == "flex_attention": + config = cls._check_and_enable_flex_attn(config, hard_check_only=True) + elif requested_attn_implementation in [None, "sdpa"] and not is_torch_xla_available(): + # use_flash_attention_2 takes priority over SDPA, hence SDPA treated in this elif. + config = cls._check_and_enable_sdpa( + config, + hard_check_only=False if requested_attn_implementation is None else True, + ) + + if ( + torch.version.hip is not None + and config._attn_implementation == "sdpa" + and torch.cuda.device_count() > 1 + and version.parse(torch.__version__) < version.parse("2.4.1") + ): + logger.warning_once( + "Using the `SDPA` attention implementation on multi-gpu setup with ROCM may lead to performance issues due to the FA backend. Disabling it to use alternative backends." + ) + torch.backends.cuda.enable_flash_sdp(False) + elif requested_attn_implementation in ALL_ATTENTION_FUNCTIONS.valid_keys(): + config._attn_implementation = requested_attn_implementation + elif isinstance(requested_attn_implementation, dict): + config._attn_implementation = None + else: + config._attn_implementation = "eager" + + config._attn_implementation_autoset = True + return config + + @classmethod + def _set_default_torch_dtype(cls, dtype: torch.dtype) -> torch.dtype: + """ + Change the default dtype and return the previous one. This is needed when wanting to instantiate the model + under specific dtype. + + Args: + dtype (`torch.dtype`): + a floating dtype to set to. + + Returns: + `torch.dtype`: the original `dtype` that can be used to restore `torch.set_default_dtype(dtype)` if it was + modified. If it wasn't, returns `None`. + + Note `set_default_dtype` currently only works with floating-point types and asserts if for example, + `torch.int64` is passed. So if a non-float `dtype` is passed this functions will throw an exception. + """ + if not dtype.is_floating_point: + raise ValueError( + f"Can't instantiate {cls.__name__} model under dtype={dtype} since it is not a floating point dtype" + ) + + logger.info(f"Instantiating {cls.__name__} model under default dtype {dtype}.") + dtype_orig = torch.get_default_dtype() + torch.set_default_dtype(dtype) + return dtype_orig + + @property + def base_model(self) -> nn.Module: + """ + `torch.nn.Module`: The main body of the model. + """ + return getattr(self, self.base_model_prefix, self) + + @classmethod + def can_generate(cls) -> bool: + """ + Returns whether this model can generate sequences with `.generate()` from the `GenerationMixin`. + + Under the hood, on classes where this function returns True, some generation-specific changes are triggered: + for instance, the model instance will have a populated `generation_config` attribute. + + Returns: + `bool`: Whether this model can generate sequences with `.generate()`. + """ + # Directly inherits `GenerationMixin` -> can generate + if "GenerationMixin" in str(cls.__bases__): + return True + # The class inherits from a class that can generate (recursive check) -> can generate + for base in cls.__bases__: + if not hasattr(base, "can_generate"): + continue + if "PreTrainedModel" not in str(base) and base.can_generate(): + return True + # Detects whether `prepare_inputs_for_generation` has been overwritten in the model. Prior to v4.45, this + # was how we detected whether a model could generate. + if hasattr(cls, "prepare_inputs_for_generation"): # implicit: doesn't inherit `GenerationMixin` + logger.warning( + f"{cls.__name__} has generative capabilities, as `prepare_inputs_for_generation` is explicitly " + "defined. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, " + "`PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability " + "to call `generate` and other related functions." + "\n - If you're using `trust_remote_code=True`, you can get rid of this warning by loading the " + "model with an auto class. See https://huggingface.co/docs/transformers/en/model_doc/auto#auto-classes" + "\n - If you are the owner of the model architecture code, please modify your model class such that " + "it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception)." + "\n - If you are not the owner of the model architecture class, please contact the model code owner " + "to update it." + ) + # Otherwise, can't generate + return False + + @classmethod + def _check_and_enable_flash_attn_2( + cls, + config, + torch_dtype: Optional[torch.dtype] = None, + device_map: Optional[Union[str, Dict[str, int]]] = None, + check_device_map: bool = True, + hard_check_only: bool = False, + ) -> PretrainedConfig: + """ + Checks the availability of Flash Attention 2 and compatibility with the current model. + + If all checks pass and `hard_check_only` is False, the method will set the config attribute `attn_implementation` to "flash_attention_2" so that the model can initialize the correct attention module. + """ + if not cls._supports_flash_attn_2: + raise ValueError( + f"{cls.__name__} does not support Flash Attention 2.0 yet. Please request to add support where" + f" the model is hosted, on its model hub page: https://huggingface.co/{config._name_or_path}/discussions/new" + " or in the Transformers GitHub repo: https://github.com/huggingface/transformers/issues/new" + ) + + if not is_flash_attn_2_available(): + preface = "FlashAttention2 has been toggled on, but it cannot be used due to the following error:" + install_message = "Please refer to the documentation of https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-2 to install Flash Attention 2." + + if importlib.util.find_spec("flash_attn") is None: + # package `flash-attn` can not be installed on Ascend NPU, ignore related validation logic and early exit. + if is_torch_npu_available(): + if not hard_check_only: + config._attn_implementation = "flash_attention_2" + + logger.info("Detect using FlashAttention2 on Ascend NPU.") + return config + else: + raise ImportError(f"{preface} the package flash_attn seems to be not installed. {install_message}") + + flash_attention_version = version.parse(importlib.metadata.version("flash_attn")) + if torch.version.cuda: + if flash_attention_version < version.parse("2.1.0"): + raise ImportError( + f"{preface} you need flash_attn package version to be greater or equal than 2.1.0. Detected version {flash_attention_version}. {install_message}" + ) + elif not torch.cuda.is_available(): + raise ValueError( + f"{preface} Flash Attention 2 is not available on CPU. Please make sure torch can access a CUDA device." + ) + else: + raise ImportError(f"{preface} Flash Attention 2 is not available. {install_message}") + elif torch.version.hip: + if flash_attention_version < version.parse("2.0.4"): + raise ImportError( + f"{preface} you need flash_attn package version to be greater or equal than 2.0.4. Make sure to have that version installed - detected version {flash_attention_version}. {install_message}" + ) + else: + raise ImportError(f"{preface} Flash Attention 2 is not available. {install_message}") + + _is_bettertransformer = getattr(cls, "use_bettertransformer", False) + + if _is_bettertransformer: + raise ValueError( + "Flash Attention 2 and BetterTransformer API are not compatible. Please make sure to disable BetterTransformers by doing model.reverse_bettertransformer()" + ) + + if torch_dtype is None: + logger.warning_once( + "You are attempting to use Flash Attention 2.0 without specifying a torch dtype. This might lead to unexpected behaviour" + ) + elif torch_dtype is not None and torch_dtype not in [torch.float16, torch.bfloat16]: + logger.warning_once( + "Flash Attention 2.0 only supports torch.float16 and torch.bfloat16 dtypes, but" + f" the current dype in {cls.__name__} is {torch_dtype}. You should run training or inference using Automatic Mixed-Precision via the `with torch.autocast(device_type='torch_device'):` decorator," + ' or load the model with the `torch_dtype` argument. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="flash_attention_2", torch_dtype=torch.float16)`' + ) + + # The check `torch.empty(0).device.type != "cuda"` is needed as the model may be initialized after `torch.set_default_device` has been called, + # or the model may be initialized under the context manager `with torch.device("cuda"):`. + if check_device_map and device_map is None and torch.empty(0).device.type not in ["cuda", "mlu"]: + if torch.cuda.is_available(): + logger.warning_once( + "You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU" + " after initializing it on CPU with `model.to('cuda')`." + ) + elif is_torch_mlu_available(): + logger.warning_once( + "You are attempting to use Flash Attention 2.0 with a model not initialized on MLU. Make sure to move the model to MLU" + " after initializing it on CPU with `model.to('mlu')`." + ) + else: + raise ValueError( + "You are attempting to use Flash Attention 2.0 with a model not initialized on GPU and with no GPU available. " + "This is not supported yet. Please make sure to have access to a GPU and either initialise the model on a GPU by passing a device_map " + "or initialising the model on CPU and then moving it to GPU." + ) + elif ( + check_device_map + and device_map is not None + and isinstance(device_map, dict) + and ("cpu" in device_map.values() or "disk" in device_map.values()) + ): + raise ValueError( + "You are attempting to use Flash Attention 2.0 with a model dispatched on CPU or disk. This is not supported. Please make sure to " + "initialise the model on a GPU by passing a device_map that contains only GPU devices as keys." + ) + if not hard_check_only: + config._attn_implementation = "flash_attention_2" + return config + + @classmethod + def _check_and_enable_sdpa(cls, config, hard_check_only: bool = False) -> PretrainedConfig: + """ + Checks the availability of SDPA for a given model. + + If all checks pass and `hard_check_only` is False, the method will set the config attribute `_attn_implementation` to "sdpa" so that the model can initialize the correct attention module. + """ + if hard_check_only: + if not cls._supports_sdpa: + raise ValueError( + f"{cls.__name__} does not support an attention implementation through torch.nn.functional.scaled_dot_product_attention yet." + " Please request the support for this architecture: https://github.com/huggingface/transformers/issues/28005. If you believe" + ' this error is a bug, please open an issue in Transformers GitHub repository and load your model with the argument `attn_implementation="eager"` meanwhile. Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="eager")`' + ) + if not is_torch_sdpa_available(): + raise ImportError( + "PyTorch SDPA requirements in Transformers are not met. Please install torch>=2.1.1." + ) + + if not is_torch_sdpa_available() or not cls._supports_sdpa: + return config + + _is_bettertransformer = getattr(cls, "use_bettertransformer", False) + if _is_bettertransformer: + return config + + if not hard_check_only: + config._attn_implementation = "sdpa" + return config + + @classmethod + def _check_and_enable_flex_attn(cls, config, hard_check_only: bool = False) -> PretrainedConfig: + """ + Checks the availability of Flex Attention for a given model. + + If all checks pass and `hard_check_only` is False, the method will set the config attribute `_attn_implementation` to "flex_attention" so that the model can initialize the correct attention module. + """ + if hard_check_only: + if not cls._supports_flex_attn: + raise ValueError( + f"{cls.__name__} does not support an attention implementation through torch's flex_attention." + " Please request the support for this architecture: https://github.com/huggingface/transformers/issues/34809." + " If you believe this error is a bug, please open an issue in Transformers GitHub repository" + ' and load your model with the argument `attn_implementation="eager"` meanwhile.' + ' Example: `model = AutoModel.from_pretrained("openai/whisper-tiny", attn_implementation="eager")`' + ) + if not is_torch_flex_attn_available(): + raise ImportError( + "PyTorch Flex Attention requirements in Transformers are not met. Please install torch>=2.5.0." + ) + + if not is_torch_flex_attn_available() or not cls._supports_flex_attn: + return config + + if not hard_check_only: + config._attn_implementation = "flex_attention" + + return config + + def enable_input_require_grads(self): + """ + Enables the gradients for the input embeddings. This is useful for fine-tuning adapter weights while keeping + the model weights fixed. + """ + + def make_inputs_require_grads(module, input, output): + output.requires_grad_(True) + + self._require_grads_hook = self.get_input_embeddings().register_forward_hook(make_inputs_require_grads) + + def disable_input_require_grads(self): + """ + Removes the `_require_grads_hook`. + """ + self._require_grads_hook.remove() + + def get_input_embeddings(self) -> nn.Module: + """ + Returns the model's input embeddings. + + Returns: + `nn.Module`: A torch module mapping vocabulary to hidden states. + """ + base_model = getattr(self, self.base_model_prefix, self) + if base_model is not self: + return base_model.get_input_embeddings() + else: + raise NotImplementedError + + def set_input_embeddings(self, value: nn.Module): + """ + Set model's input embeddings. + + Args: + value (`nn.Module`): A module mapping vocabulary to hidden states. + """ + base_model = getattr(self, self.base_model_prefix, self) + if base_model is not self: + base_model.set_input_embeddings(value) + else: + raise NotImplementedError + + def get_output_embeddings(self) -> nn.Module: + """ + Returns the model's output embeddings. + + Returns: + `nn.Module`: A torch module mapping hidden states to vocabulary. + """ + return None # Overwrite for models with output embeddings + + def _init_weights(self, module): + """ + Initialize the weights. This method should be overridden by derived class and is + the only initialization method that will be called when loading a checkpoint + using `from_pretrained`. Any attempt to initialize outside of this function + will be useless as the torch.nn.init function are all replaced with skip. + """ + pass + + def _initialize_weights(self, module): + """ + Initialize the weights if they are not already initialized. + """ + if getattr(module, "_is_hf_initialized", False): + return + self._init_weights(module) + module._is_hf_initialized = True + + @torch.no_grad() + def initialize_weights(self): + """ + This is equivalent to calling `self.apply(self._initialize_weights)`, but correctly handles composite models. + This function dynamically dispatches the correct `init_weights` function to the modules as we advance in the + module graph along the recursion. It can handle an arbitrary number of sub-models. Without it, every composite + model would have to recurse a second time on all sub-models explicitly in the outer-most `_init_weights`, which + is extremely error prone and inefficient. + + Note that the `torch.no_grad()` decorator is very important as well, as most of our `_init_weights` do not use + `torch.nn.init` functions (which are all no_grad by default), but simply do in-place ops such as + `module.weight.data.zero_()`. + """ + if not hasattr(torch.nn.Module, "smart_apply"): + # This function is equivalent to `torch.nn.Module.apply`, except that it dynamically adjust the function + # to apply as we go down the graph + def smart_apply(self, fn): + for module in self.children(): + # We found a sub-model: recursively dispatch its own init function now! + if hasattr(module, "_init_weights"): + module.smart_apply(module._initialize_weights) + else: + module.smart_apply(fn) + fn(self) + return self + + torch.nn.Module.smart_apply = smart_apply + + # Let the magic happen with this simple call + self.smart_apply(self._initialize_weights) + + def tie_weights(self): + """ + Tie the weights between the input embeddings and the output embeddings. + + If the `torchscript` flag is set in the configuration, can't handle parameter sharing so we are cloning the + weights instead. + """ + if getattr(self.config.get_text_config(decoder=True), "tie_word_embeddings", True): + output_embeddings = self.get_output_embeddings() + if output_embeddings is not None: + self._tie_or_clone_weights(output_embeddings, self.get_input_embeddings()) + + if getattr(self.config, "is_encoder_decoder", False) and getattr(self.config, "tie_encoder_decoder", False): + if hasattr(self, self.base_model_prefix): + self = getattr(self, self.base_model_prefix) + tied_weights = self._tie_encoder_decoder_weights( + self.encoder, self.decoder, self.base_model_prefix, "encoder" + ) + # Setting a dynamic variable instead of `_tied_weights_keys` because it's a class + # attributed not an instance member, therefore modifying it will modify the entire class + # Leading to issues on subsequent calls by different tests or subsequent calls. + self._dynamic_tied_weights_keys = tied_weights + + for module in self.modules(): + if hasattr(module, "_tie_weights"): + module._tie_weights() + + @staticmethod + def _tie_encoder_decoder_weights( + encoder: nn.Module, decoder: nn.Module, base_model_prefix: str, base_encoder_name: str + ): + uninitialized_encoder_weights: List[str] = [] + tied_weights: List[str] = [] + if decoder.__class__ != encoder.__class__: + logger.info( + f"{decoder.__class__} and {encoder.__class__} are not equal. In this case make sure that all encoder" + " weights are correctly initialized." + ) + + def tie_encoder_to_decoder_recursively( + decoder_pointer: nn.Module, + encoder_pointer: nn.Module, + module_name: str, + base_encoder_name: str, + uninitialized_encoder_weights: List[str], + depth=0, + total_decoder_name="", + total_encoder_name="", + ): + assert isinstance(decoder_pointer, nn.Module) and isinstance(encoder_pointer, nn.Module), ( + f"{decoder_pointer} and {encoder_pointer} have to be of type nn.Module" + ) + if hasattr(decoder_pointer, "weight"): + assert hasattr(encoder_pointer, "weight") + encoder_pointer.weight = decoder_pointer.weight + tied_weights.append(f"{base_encoder_name}{total_encoder_name}.weight") + if hasattr(decoder_pointer, "bias"): + assert hasattr(encoder_pointer, "bias") + tied_weights.append(f"{base_encoder_name}{total_encoder_name}.bias") + encoder_pointer.bias = decoder_pointer.bias + return + + encoder_modules = encoder_pointer._modules + decoder_modules = decoder_pointer._modules + if len(decoder_modules) > 0: + assert len(encoder_modules) > 0, ( + f"Encoder module {encoder_pointer} does not match decoder module {decoder_pointer}" + ) + + all_encoder_weights = {module_name + "/" + sub_name for sub_name in encoder_modules.keys()} + encoder_layer_pos = 0 + for name, module in decoder_modules.items(): + if name.isdigit(): + encoder_name = str(int(name) + encoder_layer_pos) + decoder_name = name + if not isinstance(decoder_modules[decoder_name], type(encoder_modules[encoder_name])) and len( + encoder_modules + ) != len(decoder_modules): + # this can happen if the name corresponds to the position in a list module list of layers + # in this case the decoder has added a cross-attention that the encoder does not have + # thus skip this step and subtract one layer pos from encoder + encoder_layer_pos -= 1 + continue + elif name not in encoder_modules: + continue + elif depth > 500: + raise ValueError( + "Max depth of recursive function `tie_encoder_to_decoder` reached. It seems that there is" + " a circular dependency between two or more `nn.Modules` of your model." + ) + else: + decoder_name = encoder_name = name + tie_encoder_to_decoder_recursively( + decoder_modules[decoder_name], + encoder_modules[encoder_name], + module_name + "/" + name, + base_encoder_name, + uninitialized_encoder_weights, + depth=depth + 1, + total_encoder_name=f"{total_encoder_name}.{encoder_name}", + total_decoder_name=f"{total_decoder_name}.{decoder_name}", + ) + all_encoder_weights.remove(module_name + "/" + encoder_name) + + uninitialized_encoder_weights += list(all_encoder_weights) + + # tie weights recursively + tie_encoder_to_decoder_recursively( + decoder, encoder, base_model_prefix, base_encoder_name, uninitialized_encoder_weights + ) + + if len(uninitialized_encoder_weights) > 0: + logger.warning( + f"The following encoder weights were not tied to the decoder {uninitialized_encoder_weights}" + ) + return tied_weights + + def _tie_or_clone_weights(self, output_embeddings, input_embeddings): + """Tie or clone module weights depending of whether we are using TorchScript or not""" + if self.config.torchscript: + output_embeddings.weight = nn.Parameter(input_embeddings.weight.clone()) + else: + output_embeddings.weight = input_embeddings.weight + + if getattr(output_embeddings, "bias", None) is not None: + output_embeddings.bias.data = nn.functional.pad( + output_embeddings.bias.data, + ( + 0, + output_embeddings.weight.shape[0] - output_embeddings.bias.shape[0], + ), + "constant", + 0, + ) + if hasattr(output_embeddings, "out_features") and hasattr(input_embeddings, "num_embeddings"): + output_embeddings.out_features = input_embeddings.num_embeddings + + def _get_no_split_modules(self, device_map: str): + """ + Get the modules of the model that should not be spit when using device_map. We iterate through the modules to + get the underlying `_no_split_modules`. + + Args: + device_map (`str`): + The device map value. Options are ["auto", "balanced", "balanced_low_0", "sequential"] + + Returns: + `List[str]`: List of modules that should not be split + """ + _no_split_modules = set() + modules_to_check = [self] + while len(modules_to_check) > 0: + module = modules_to_check.pop(-1) + # if the module does not appear in _no_split_modules, we also check the children + if module.__class__.__name__ not in _no_split_modules: + if isinstance(module, PreTrainedModel): + if module._no_split_modules is None: + raise ValueError( + f"{module.__class__.__name__} does not support `device_map='{device_map}'`. To implement support, the model " + "class needs to implement the `_no_split_modules` attribute." + ) + else: + _no_split_modules = _no_split_modules | set(module._no_split_modules) + modules_to_check += list(module.children()) + return list(_no_split_modules) + + def resize_token_embeddings( + self, + new_num_tokens: Optional[int] = None, + pad_to_multiple_of: Optional[int] = None, + mean_resizing: bool = True, + ) -> nn.Embedding: + """ + Resizes input token embeddings matrix of the model if `new_num_tokens != config.vocab_size`. + + Takes care of tying weights embeddings afterwards if the model class has a `tie_weights()` method. + + Arguments: + new_num_tokens (`int`, *optional*): + The new number of tokens in the embedding matrix. Increasing the size will add newly initialized + vectors at the end. Reducing the size will remove vectors from the end. If not provided or `None`, just + returns a pointer to the input tokens `torch.nn.Embedding` module of the model without doing anything. + pad_to_multiple_of (`int`, *optional*): + If set will pad the embedding matrix to a multiple of the provided value.If `new_num_tokens` is set to + `None` will just pad the embedding to a multiple of `pad_to_multiple_of`. + + This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability + `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128. For more + details about this, or help on choosing the correct value for resizing, refer to this guide: + https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc + mean_resizing (`bool`): + Whether to initialize the added embeddings from a multivariate normal distribution that has old embeddings' mean and + covariance or to initialize them with a normal distribution that has a mean of zero and std equals `config.initializer_range`. + + Setting `mean_resizing` to `True` is useful when increasing the size of the embeddings of causal language models, + where the generated tokens' probabilities won't be affected by the added embeddings because initializing the new embeddings with the + old embeddings' mean will reduce the kl-divergence between the next token probability before and after adding the new embeddings. + Refer to this article for more information: https://nlp.stanford.edu/~johnhew/vocab-expansion.html + + Return: + `torch.nn.Embedding`: Pointer to the input tokens Embeddings Module of the model. + """ + model_embeds = self._resize_token_embeddings(new_num_tokens, pad_to_multiple_of, mean_resizing) + if new_num_tokens is None and pad_to_multiple_of is None: + return model_embeds + + # Since we are basically reusing the same old embeddings with new weight values, gathering is required + is_quantized = hasattr(self, "hf_quantizer") and self.hf_quantizer is not None + if is_deepspeed_zero3_enabled() and not is_quantized: + with deepspeed.zero.GatheredParameters(model_embeds.weight, modifier_rank=None): + vocab_size = model_embeds.weight.shape[0] + else: + vocab_size = model_embeds.weight.shape[0] + + # Update base model and current model config. + self.config.get_text_config().vocab_size = vocab_size + self.vocab_size = vocab_size + + # Tie weights again if needed + self.tie_weights() + + return model_embeds + + def _resize_token_embeddings(self, new_num_tokens, pad_to_multiple_of=None, mean_resizing=True): + old_embeddings = self.get_input_embeddings() + new_embeddings = self._get_resized_embeddings( + old_embeddings, new_num_tokens, pad_to_multiple_of, mean_resizing + ) + if hasattr(old_embeddings, "_hf_hook"): + hook = old_embeddings._hf_hook + add_hook_to_module(new_embeddings, hook) + old_embeddings_requires_grad = old_embeddings.weight.requires_grad + new_embeddings.requires_grad_(old_embeddings_requires_grad) + self.set_input_embeddings(new_embeddings) + is_quantized = hasattr(self, "hf_quantizer") and self.hf_quantizer is not None + + # Update new_num_tokens with the actual size of new_embeddings + if pad_to_multiple_of is not None: + if is_deepspeed_zero3_enabled() and not is_quantized: + with deepspeed.zero.GatheredParameters(new_embeddings.weight, modifier_rank=None): + new_num_tokens = new_embeddings.weight.shape[0] + else: + new_num_tokens = new_embeddings.weight.shape[0] + + # if word embeddings are not tied, make sure that lm head is resized as well + if ( + self.get_output_embeddings() is not None + and not self.config.get_text_config(decoder=True).tie_word_embeddings + ): + old_lm_head = self.get_output_embeddings() + if isinstance(old_lm_head, torch.nn.Embedding): + new_lm_head = self._get_resized_embeddings(old_lm_head, new_num_tokens, mean_resizing=mean_resizing) + else: + new_lm_head = self._get_resized_lm_head(old_lm_head, new_num_tokens, mean_resizing=mean_resizing) + if hasattr(old_lm_head, "_hf_hook"): + hook = old_lm_head._hf_hook + add_hook_to_module(new_lm_head, hook) + old_lm_head_requires_grad = old_lm_head.weight.requires_grad + new_lm_head.requires_grad_(old_lm_head_requires_grad) + self.set_output_embeddings(new_lm_head) + + return self.get_input_embeddings() + + def _get_resized_embeddings( + self, + old_embeddings: nn.Embedding, + new_num_tokens: Optional[int] = None, + pad_to_multiple_of: Optional[int] = None, + mean_resizing: bool = True, + ) -> nn.Embedding: + """ + Build a resized Embedding Module from a provided token Embedding Module. Increasing the size will add newly + initialized vectors at the end. Reducing the size will remove vectors from the end + + Args: + old_embeddings (`torch.nn.Embedding`): + Old embeddings to be resized. + new_num_tokens (`int`, *optional*): + New number of tokens in the embedding matrix. + + Increasing the size will add newly initialized vectors at the end. Reducing the size will remove + vectors from the end. If not provided or `None`, just returns a pointer to the input tokens + `torch.nn.Embedding` module of the model without doing anything. + pad_to_multiple_of (`int`, *optional*): + If set will pad the embedding matrix to a multiple of the provided value. If `new_num_tokens` is set to + `None` will just pad the embedding to a multiple of `pad_to_multiple_of`. + + This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability + `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128. For more + details about this, or help on choosing the correct value for resizing, refer to this guide: + https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc + mean_resizing (`bool`): + Whether to initialize the added embeddings from a multivariate normal distribution that has old embeddings' mean and + covariance or to initialize them with a normal distribution that has a mean of zero and std equals `config.initializer_range`. + + Setting `mean_resizing` to `True` is useful when increasing the size of the embeddings of causal language models, + where the generated tokens' probabilities will not be affected by the added embeddings because initializing the new embeddings with the + old embeddings' mean will reduce the kl-divergence between the next token probability before and after adding the new embeddings. + Refer to this article for more information: https://nlp.stanford.edu/~johnhew/vocab-expansion.html + + + Return: + `torch.nn.Embedding`: Pointer to the resized Embedding Module or the old Embedding Module if + `new_num_tokens` is `None` + """ + + if pad_to_multiple_of is not None: + if not isinstance(pad_to_multiple_of, int): + raise ValueError( + f"Asking to pad the embedding matrix to a multiple of `{pad_to_multiple_of}`, which is not and integer. Please make sure to pass an integer" + ) + if new_num_tokens is None: + new_num_tokens = old_embeddings.weight.shape[0] + new_num_tokens = ((new_num_tokens + pad_to_multiple_of - 1) // pad_to_multiple_of) * pad_to_multiple_of + else: + logger.info( + "You are resizing the embedding layer without providing a `pad_to_multiple_of` parameter. This means that the new embedding" + f" dimension will be {new_num_tokens}. This might induce some performance reduction as *Tensor Cores* will not be available." + " For more details about this, or help on choosing the correct value for resizing, refer to this guide:" + " https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc" + ) + + if new_num_tokens is None: + return old_embeddings + + is_quantized = hasattr(self, "hf_quantizer") and self.hf_quantizer is not None + if is_deepspeed_zero3_enabled() and not is_quantized: + with deepspeed.zero.GatheredParameters(old_embeddings.weight, modifier_rank=None): + old_num_tokens, old_embedding_dim = old_embeddings.weight.size() + else: + old_num_tokens, old_embedding_dim = old_embeddings.weight.size() + + if old_num_tokens == new_num_tokens and not is_deepspeed_zero3_enabled(): + return old_embeddings + + if not isinstance(old_embeddings, nn.Embedding): + raise TypeError( + f"Old embeddings are of type {type(old_embeddings)}, which is not an instance of {nn.Embedding}. You" + " should either use a different resize function or make sure that `old_embeddings` are an instance of" + f" {nn.Embedding}." + ) + + # Build new embeddings + + # When using DeepSpeed ZeRO-3, we shouldn't create new embeddings with DeepSpeed init + # because the shape of the new embedding layer is used across various modeling files + # as well as to update config vocab size. Shape will be 0 when using DeepSpeed init leading + # to errors when training. + new_embeddings = nn.Embedding( + new_num_tokens, + old_embedding_dim, + device=old_embeddings.weight.device, + dtype=old_embeddings.weight.dtype, + ) + + if new_num_tokens > old_num_tokens and not mean_resizing: + # initialize new embeddings (in particular added tokens) with a mean of 0 and std equals `config.initializer_range`. + self._init_weights(new_embeddings) + + elif new_num_tokens > old_num_tokens and mean_resizing: + # initialize new embeddings (in particular added tokens). The new embeddings will be initialized + # from a multivariate normal distribution that has old embeddings' mean and covariance. + # as described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html + logger.warning_once( + "The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. " + "As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. " + "To disable this, use `mean_resizing=False`" + ) + + added_num_tokens = new_num_tokens - old_num_tokens + if is_deepspeed_zero3_enabled() and not is_quantized: + with deepspeed.zero.GatheredParameters([old_embeddings.weight], modifier_rank=None): + self._init_added_embeddings_weights_with_mean( + old_embeddings, new_embeddings, old_embedding_dim, old_num_tokens, added_num_tokens + ) + else: + self._init_added_embeddings_weights_with_mean( + old_embeddings, new_embeddings, old_embedding_dim, old_num_tokens, added_num_tokens + ) + + # Copy token embeddings from the previous weights + + # numbers of tokens to copy + n = min(old_num_tokens, new_num_tokens) + + if is_deepspeed_zero3_enabled() and not is_quantized: + params = [old_embeddings.weight, new_embeddings.weight] + with deepspeed.zero.GatheredParameters(params, modifier_rank=0): + new_embeddings.weight.data[:n, :] = old_embeddings.weight.data[:n, :] + else: + new_embeddings.weight.data[:n, :] = old_embeddings.weight.data[:n, :] + + # Replace weights in old_embeddings and return to maintain the same embedding type. + # This ensures correct functionality when a Custom Embedding class is passed as input. + # The input and output embedding types remain consistent. (c.f. https://github.com/huggingface/transformers/pull/31979) + if is_deepspeed_zero3_enabled() and not is_quantized: + params = [old_embeddings.weight, new_embeddings.weight] + with deepspeed.zero.GatheredParameters(params, modifier_rank=0): + old_embeddings.weight = new_embeddings.weight + old_embeddings.num_embeddings = new_embeddings.weight.data.shape[0] + + # If the new number of tokens is smaller than the original `padding_idx`, the `padding_idx` + # will be set to `None` in the resized embeddings. + if old_embeddings.padding_idx is not None and (new_num_tokens - 1) < old_embeddings.padding_idx: + old_embeddings.padding_idx = None + else: + old_embeddings.weight.data = new_embeddings.weight.data + old_embeddings.num_embeddings = new_embeddings.weight.data.shape[0] + if old_embeddings.padding_idx is not None and (new_num_tokens - 1) < old_embeddings.padding_idx: + old_embeddings.padding_idx = None + + return old_embeddings + + def _get_resized_lm_head( + self, + old_lm_head: nn.Linear, + new_num_tokens: Optional[int] = None, + transposed: Optional[bool] = False, + mean_resizing: bool = True, + ) -> nn.Linear: + """ + Build a resized Linear Module from a provided old Linear Module. Increasing the size will add newly initialized + vectors at the end. Reducing the size will remove vectors from the end + + Args: + old_lm_head (`torch.nn.Linear`): + Old lm head liner layer to be resized. + new_num_tokens (`int`, *optional*): + New number of tokens in the linear matrix. + + Increasing the size will add newly initialized vectors at the end. Reducing the size will remove + vectors from the end. If not provided or `None`, just returns a pointer to the input tokens + `torch.nn.Linear` module of the model without doing anything. transposed (`bool`, *optional*, defaults + to `False`): Whether `old_lm_head` is transposed or not. If True `old_lm_head.size()` is `lm_head_dim, + vocab_size` else `vocab_size, lm_head_dim`. + mean_resizing (`bool`): + Whether to initialize the added embeddings from a multivariate normal distribution that has old embeddings' mean and + covariance or to initialize them with a normal distribution that has a mean of zero and std equals `config.initializer_range`. + + Setting `mean_resizing` to `True` is useful when increasing the size of the embeddings of causal language models, + where the generated tokens' probabilities will not be affected by the added embeddings because initializing the new embeddings with the + old embeddings' mean will reduce the kl-divergence between the next token probability before and after adding the new embeddings. + Refer to this article for more information: https://nlp.stanford.edu/~johnhew/vocab-expansion.html + + Return: + `torch.nn.Linear`: Pointer to the resized Linear Module or the old Linear Module if `new_num_tokens` is + `None` + """ + if new_num_tokens is None: + return old_lm_head + + is_quantized = hasattr(self, "hf_quantizer") and self.hf_quantizer is not None + if is_deepspeed_zero3_enabled() and not is_quantized: + with deepspeed.zero.GatheredParameters(old_lm_head.weight, modifier_rank=None): + old_num_tokens, old_lm_head_dim = ( + old_lm_head.weight.size() if not transposed else old_lm_head.weight.t().size() + ) + else: + old_num_tokens, old_lm_head_dim = ( + old_lm_head.weight.size() if not transposed else old_lm_head.weight.t().size() + ) + + if old_num_tokens == new_num_tokens and not is_deepspeed_zero3_enabled(): + return old_lm_head + + if not isinstance(old_lm_head, nn.Linear): + raise TypeError( + f"Old language model head is of type {type(old_lm_head)}, which is not an instance of {nn.Linear}. You" + " should either use a different resize function or make sure that `old_lm_head` are an instance of" + f" {nn.Linear}." + ) + + # Build new lm head + new_lm_head_shape = (old_lm_head_dim, new_num_tokens) if not transposed else (new_num_tokens, old_lm_head_dim) + has_new_lm_head_bias = old_lm_head.bias is not None + + # When using DeepSpeed ZeRO-3, we shouldn't create new embeddings with DeepSpeed init + # because the shape of the new embedding layer is used across various modeling files + # as well as to update config vocab size. Shape will be 0 when using DeepSpeed init leading + # to errors when training. + new_lm_head = nn.Linear( + *new_lm_head_shape, + bias=has_new_lm_head_bias, + device=old_lm_head.weight.device, + dtype=old_lm_head.weight.dtype, + ) + + if new_num_tokens > old_num_tokens and not mean_resizing: + # initialize new embeddings (in particular added tokens) with a mean of 0 and std equals `config.initializer_range`. + self._init_weights(new_lm_head) + + elif new_num_tokens > old_num_tokens and mean_resizing: + # initialize new lm_head weights (in particular added tokens). The new lm_head weights + # will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. + # as described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html + logger.warning_once( + "The new lm_head weights will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. " + "As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. " + "To disable this, use `mean_resizing=False`" + ) + + added_num_tokens = new_num_tokens - old_num_tokens + if is_deepspeed_zero3_enabled() and not is_quantized: + params = [old_lm_head.weight] + if has_new_lm_head_bias: + params += [old_lm_head.bias] + with deepspeed.zero.GatheredParameters(params, modifier_rank=None): + self._init_added_lm_head_weights_with_mean( + old_lm_head, new_lm_head, old_lm_head_dim, old_num_tokens, added_num_tokens, transposed + ) + if has_new_lm_head_bias: + self._init_added_lm_head_bias_with_mean(old_lm_head, new_lm_head, added_num_tokens) + + else: + self._init_added_lm_head_weights_with_mean( + old_lm_head, new_lm_head, old_lm_head_dim, old_num_tokens, added_num_tokens, transposed + ) + if has_new_lm_head_bias: + self._init_added_lm_head_bias_with_mean(old_lm_head, new_lm_head, added_num_tokens) + + num_tokens_to_copy = min(old_num_tokens, new_num_tokens) + + if is_deepspeed_zero3_enabled() and not is_quantized: + params = [old_lm_head.weight, old_lm_head.bias, new_lm_head.weight, new_lm_head.bias] + with deepspeed.zero.GatheredParameters(params, modifier_rank=0): + self._copy_lm_head_original_to_resized( + new_lm_head, old_lm_head, num_tokens_to_copy, transposed, has_new_lm_head_bias + ) + else: + self._copy_lm_head_original_to_resized( + new_lm_head, old_lm_head, num_tokens_to_copy, transposed, has_new_lm_head_bias + ) + + return new_lm_head + + def _init_added_embeddings_weights_with_mean( + self, old_embeddings, new_embeddings, old_embedding_dim, old_num_tokens, added_num_tokens + ): + old_embeddings_weight = old_embeddings.weight.data.to(torch.float32) + mean_embeddings = torch.mean(old_embeddings_weight, axis=0) + old_centered_embeddings = old_embeddings_weight - mean_embeddings + covariance = old_centered_embeddings.T @ old_centered_embeddings / old_num_tokens + + # Check if the covariance is positive definite. + epsilon = 1e-9 + is_covariance_psd = constraints.positive_definite.check(epsilon * covariance).all() + if is_covariance_psd: + # If covariances is positive definite, a distribution can be created. and we can sample new weights from it. + distribution = torch.distributions.multivariate_normal.MultivariateNormal( + mean_embeddings, covariance_matrix=epsilon * covariance + ) + new_embeddings.weight.data[-1 * added_num_tokens :, :] = distribution.sample( + sample_shape=(added_num_tokens,) + ).to(old_embeddings.weight.dtype) + else: + # Otherwise, just initialize with the mean. because distribution will not be created. + new_embeddings.weight.data[-1 * added_num_tokens :, :] = ( + mean_embeddings[None, :].repeat(added_num_tokens, 1).to(old_embeddings.weight.dtype) + ) + + def _init_added_lm_head_weights_with_mean( + self, + old_lm_head, + new_lm_head, + old_lm_head_dim, + old_num_tokens, + added_num_tokens, + transposed=False, + ): + if transposed: + # Transpose to the desired shape for the function. + new_lm_head.weight.data = new_lm_head.weight.data.T + old_lm_head.weight.data = old_lm_head.weight.data.T + + # The same initialization logic as Embeddings. + self._init_added_embeddings_weights_with_mean( + old_lm_head, new_lm_head, old_lm_head_dim, old_num_tokens, added_num_tokens + ) + + if transposed: + # Transpose again to the correct shape. + new_lm_head.weight.data = new_lm_head.weight.data.T + old_lm_head.weight.data = old_lm_head.weight.data.T + + def _init_added_lm_head_bias_with_mean(self, old_lm_head, new_lm_head, added_num_tokens): + bias_mean = torch.mean(old_lm_head.bias.data, axis=0, dtype=torch.float32) + bias_std = torch.std(old_lm_head.bias.data, axis=0).to(torch.float32) + new_lm_head.bias.data[-1 * added_num_tokens :].normal_(mean=bias_mean, std=1e-9 * bias_std) + + def _copy_lm_head_original_to_resized( + self, new_lm_head, old_lm_head, num_tokens_to_copy, transposed, has_new_lm_head_bias + ): + # Copy old lm head weights to new lm head + if not transposed: + new_lm_head.weight.data[:num_tokens_to_copy, :] = old_lm_head.weight.data[:num_tokens_to_copy, :] + else: + new_lm_head.weight.data[:, :num_tokens_to_copy] = old_lm_head.weight.data[:, :num_tokens_to_copy] + + # Copy bias weights to new lm head + if has_new_lm_head_bias: + new_lm_head.bias.data[:num_tokens_to_copy] = old_lm_head.bias.data[:num_tokens_to_copy] + + def resize_position_embeddings(self, new_num_position_embeddings: int): + raise NotImplementedError( + f"`resize_position_embeddings` is not implemented for {self.__class__}`. To implement it, you should " + f"overwrite this method in the class {self.__class__} in `modeling_{self.__class__.__module__}.py`" + ) + + def get_position_embeddings(self) -> Union[nn.Embedding, Tuple[nn.Embedding]]: + raise NotImplementedError( + f"`get_position_embeddings` is not implemented for {self.__class__}`. To implement it, you should " + f"overwrite this method in the class {self.__class__} in `modeling_{self.__class__.__module__}.py`" + ) + + def init_weights(self): + """ + If needed prunes and maybe initializes weights. If using a custom `PreTrainedModel`, you need to implement any + initialization logic in `_init_weights`. + """ + # Prune heads if needed + if self.config.pruned_heads: + self.prune_heads(self.config.pruned_heads) + + if _init_weights: + # Initialize weights + self.initialize_weights() + + # Tie weights should be skipped when not initializing all weights + # since from_pretrained(...) calls tie weights anyways + self.tie_weights() + + def prune_heads(self, heads_to_prune: Dict[int, List[int]]): + """ + Prunes heads of the base model. + + Arguments: + heads_to_prune (`Dict[int, List[int]]`): + Dictionary with keys being selected layer indices (`int`) and associated values being the list of heads + to prune in said layer (list of `int`). For instance {1: [0, 2], 2: [2, 3]} will prune heads 0 and 2 on + layer 1 and heads 2 and 3 on layer 2. + """ + # save new sets of pruned heads as union of previously stored pruned heads and newly pruned heads + for layer, heads in heads_to_prune.items(): + union_heads = set(self.config.pruned_heads.get(layer, [])) | set(heads) + self.config.pruned_heads[layer] = list(union_heads) # Unfortunately we have to store it as list for JSON + + self.base_model._prune_heads(heads_to_prune) + + def gradient_checkpointing_enable(self, gradient_checkpointing_kwargs=None): + """ + Activates gradient checkpointing for the current model. + + Note that in other frameworks this feature can be referred to as "activation checkpointing" or "checkpoint + activations". + + We pass the `__call__` method of the modules instead of `forward` because `__call__` attaches all the hooks of + the module. https://discuss.pytorch.org/t/any-different-between-model-input-and-model-forward-input/3690/2 + + Args: + gradient_checkpointing_kwargs (dict, *optional*): + Additional keyword arguments passed along to the `torch.utils.checkpoint.checkpoint` function. + """ + if not self.supports_gradient_checkpointing: + raise ValueError(f"{self.__class__.__name__} does not support gradient checkpointing.") + + if gradient_checkpointing_kwargs is None: + gradient_checkpointing_kwargs = {"use_reentrant": True} + + gradient_checkpointing_func = functools.partial(checkpoint, **gradient_checkpointing_kwargs) + + # For old GC format (transformers < 4.35.0) for models that live on the Hub + # we will fall back to the overwritten `_set_gradient_checkpointing` method + _is_using_old_format = "value" in inspect.signature(self._set_gradient_checkpointing).parameters + + if not _is_using_old_format: + self._set_gradient_checkpointing(enable=True, gradient_checkpointing_func=gradient_checkpointing_func) + else: + self.apply(partial(self._set_gradient_checkpointing, value=True)) + logger.warning( + "You are using an old version of the checkpointing format that is deprecated (We will also silently ignore `gradient_checkpointing_kwargs` in case you passed it)." + "Please update to the new format on your modeling file. To use the new format, you need to completely remove the definition of the method `_set_gradient_checkpointing` in your model." + ) + + if getattr(self, "_hf_peft_config_loaded", False): + # When using PEFT + gradient checkpointing + Trainer we need to make sure the input has requires_grad=True + # we do it also on PEFT: https://github.com/huggingface/peft/blob/85013987aa82aa1af3da1236b6902556ce3e483e/src/peft/peft_model.py#L334 + # When training with PEFT, only LoRA layers will have requires grad set to True, but the output of frozen layers need to propagate + # the gradients to make sure the gradient flows. + self.enable_input_require_grads() + + def _set_gradient_checkpointing(self, enable: bool = True, gradient_checkpointing_func: Callable = checkpoint): + is_gradient_checkpointing_set = False + + # Apply it on the top-level module in case the top-level modules supports it + # for example, LongT5Stack inherits from `PreTrainedModel`. + if hasattr(self, "gradient_checkpointing"): + self._gradient_checkpointing_func = gradient_checkpointing_func + self.gradient_checkpointing = enable + is_gradient_checkpointing_set = True + + for module in self.modules(): + if hasattr(module, "gradient_checkpointing"): + module._gradient_checkpointing_func = gradient_checkpointing_func + module.gradient_checkpointing = enable + is_gradient_checkpointing_set = True + + if not is_gradient_checkpointing_set: + raise ValueError( + f"{self.__class__.__name__} is not compatible with gradient checkpointing. Make sure all the architecture support it by setting a boolean attribute" + " `gradient_checkpointing` to modules of the model that uses checkpointing." + ) + + def gradient_checkpointing_disable(self): + """ + Deactivates gradient checkpointing for the current model. + + Note that in other frameworks this feature can be referred to as "activation checkpointing" or "checkpoint + activations". + """ + if self.supports_gradient_checkpointing: + # For old GC format (transformers < 4.35.0) for models that live on the Hub + # we will fall back to the overwritten `_set_gradient_checkpointing` method + _is_using_old_format = "value" in inspect.signature(self._set_gradient_checkpointing).parameters + if not _is_using_old_format: + self._set_gradient_checkpointing(enable=False) + else: + logger.warning( + "You are using an old version of the checkpointing format that is deprecated (We will also silently ignore `gradient_checkpointing_kwargs` in case you passed it)." + "Please update to the new format on your modeling file. To use the new format, you need to completely remove the definition of the method `_set_gradient_checkpointing` in your model." + ) + self.apply(partial(self._set_gradient_checkpointing, value=False)) + + if getattr(self, "_hf_peft_config_loaded", False): + self.disable_input_require_grads() + + @property + def is_gradient_checkpointing(self) -> bool: + """ + Whether gradient checkpointing is activated for this model or not. + + Note that in other frameworks this feature can be referred to as "activation checkpointing" or "checkpoint + activations". + """ + return any(hasattr(m, "gradient_checkpointing") and m.gradient_checkpointing for m in self.modules()) + + def save_pretrained( + self, + save_directory: Union[str, os.PathLike], + is_main_process: bool = True, + state_dict: Optional[dict] = None, + save_function: Callable = torch.save, + push_to_hub: bool = False, + max_shard_size: Union[int, str] = "5GB", + safe_serialization: bool = True, + variant: Optional[str] = None, + token: Optional[Union[str, bool]] = None, + save_peft_format: bool = True, + **kwargs, + ): + """ + Save a model and its configuration file to a directory, so that it can be re-loaded using the + [`~PreTrainedModel.from_pretrained`] class method. + + Arguments: + save_directory (`str` or `os.PathLike`): + Directory to which to save. Will be created if it doesn't exist. + is_main_process (`bool`, *optional*, defaults to `True`): + Whether the process calling this is the main process or not. Useful when in distributed training like + TPUs and need to call this function on all processes. In this case, set `is_main_process=True` only on + the main process to avoid race conditions. + state_dict (nested dictionary of `torch.Tensor`): + The state dictionary of the model to save. Will default to `self.state_dict()`, but can be used to only + save parts of the model or if special precautions need to be taken when recovering the state dictionary + of a model (like when using model parallelism). + save_function (`Callable`): + The function to use to save the state dictionary. Useful on distributed training like TPUs when one + need to replace `torch.save` by another method. + push_to_hub (`bool`, *optional*, defaults to `False`): + Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the + repository you want to push to with `repo_id` (will default to the name of `save_directory` in your + namespace). + max_shard_size (`int` or `str`, *optional*, defaults to `"5GB"`): + The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size + lower than this size. If expressed as a string, needs to be digits followed by a unit (like `"5MB"`). + We default it to 5GB in order for models to be able to run easily on free-tier google colab instances + without CPU OOM issues. + + + + If a single weight of the model is bigger than `max_shard_size`, it will be in its own checkpoint shard + which will be bigger than `max_shard_size`. + + + + safe_serialization (`bool`, *optional*, defaults to `True`): + Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`). + variant (`str`, *optional*): + If specified, weights are saved in the format pytorch_model..bin. + token (`str` or `bool`, *optional*): + The token to use as HTTP bearer authorization for remote files. If `True`, or not specified, will use + the token generated when running `huggingface-cli login` (stored in `~/.huggingface`). + save_peft_format (`bool`, *optional*, defaults to `True`): + For backward compatibility with PEFT library, in case adapter weights are attached to the model, all + keys of the state dict of adapters needs to be pre-pended with `base_model.model`. Advanced users can + disable this behaviours by setting `save_peft_format` to `False`. + kwargs (`Dict[str, Any]`, *optional*): + Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method. + """ + use_auth_token = kwargs.pop("use_auth_token", None) + ignore_metadata_errors = kwargs.pop("ignore_metadata_errors", False) + + if use_auth_token is not None: + warnings.warn( + "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.", + FutureWarning, + ) + if token is not None: + raise ValueError( + "`token` and `use_auth_token` are both specified. Please set only the argument `token`." + ) + token = use_auth_token + + if token is not None: + kwargs["token"] = token + + _hf_peft_config_loaded = getattr(self, "_hf_peft_config_loaded", False) + + hf_quantizer = getattr(self, "hf_quantizer", None) + quantization_serializable = ( + hf_quantizer is not None + and isinstance(hf_quantizer, HfQuantizer) + and hf_quantizer.is_serializable(safe_serialization=safe_serialization) + ) + + if hf_quantizer is not None and not _hf_peft_config_loaded and not quantization_serializable: + raise ValueError( + f"The model is quantized with {hf_quantizer.quantization_config.quant_method} and is not serializable - check out the warnings from" + " the logger on the traceback to understand the reason why the quantized model is not serializable." + ) + + if "save_config" in kwargs: + warnings.warn( + "`save_config` is deprecated and will be removed in v5 of Transformers. Use `is_main_process` instead." + ) + is_main_process = kwargs.pop("save_config") + if safe_serialization and not is_safetensors_available(): + raise ImportError("`safe_serialization` requires the `safetensors library: `pip install safetensors`.") + + if os.path.isfile(save_directory): + logger.error(f"Provided path ({save_directory}) should be a directory, not a file") + return + + os.makedirs(save_directory, exist_ok=True) + + if push_to_hub: + commit_message = kwargs.pop("commit_message", None) + repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1]) + repo_id = self._create_repo(repo_id, **kwargs) + files_timestamps = self._get_files_timestamps(save_directory) + + # Only save the model itself if we are using distributed training + model_to_save = unwrap_model(self) + + # save the string version of dtype to the config, e.g. convert torch.float32 => "float32" + # we currently don't use this setting automatically, but may start to use with v5 + dtype = get_parameter_dtype(model_to_save) + model_to_save.config.torch_dtype = str(dtype).split(".")[1] + + # Attach architecture to the config + model_to_save.config.architectures = [model_to_save.__class__.__name__] + + # Unset attn implementation so it can be set to another one when loading back + model_to_save.config._attn_implementation_autoset = False + + # If we have a custom model, we copy the file defining it in the folder and set the attributes so it can be + # loaded from the Hub. + if self._auto_class is not None: + custom_object_save(self, save_directory, config=self.config) + + # Save the config + if is_main_process: + if not _hf_peft_config_loaded: + # If the model config has set attributes that should be in the generation config, move them there. + misplaced_generation_parameters = model_to_save.config._get_non_default_generation_parameters() + if self.can_generate() and len(misplaced_generation_parameters) > 0: + warnings.warn( + "Moving the following attributes in the config to the generation config: " + f"{misplaced_generation_parameters}. You are seeing this warning because you've set " + "generation parameters in the model config, as opposed to in the generation config.", + UserWarning, + ) + for param_name, param_value in misplaced_generation_parameters.items(): + setattr(model_to_save.generation_config, param_name, param_value) + setattr(model_to_save.config, param_name, None) + + model_to_save.config.save_pretrained(save_directory) + if self.can_generate(): + model_to_save.generation_config.save_pretrained(save_directory) + + if _hf_peft_config_loaded: + logger.info( + "Detected adapters on the model, saving the model in the PEFT format, only adapter weights will be saved." + ) + state_dict = model_to_save.get_adapter_state_dict(state_dict=state_dict) + + if save_peft_format: + logger.info( + "To match the expected format of the PEFT library, all keys of the state dict of adapters will be pre-pended with `base_model.model`." + ) + peft_state_dict = {} + for key, value in state_dict.items(): + peft_state_dict[f"base_model.model.{key}"] = value + state_dict = peft_state_dict + + active_adapter = self.active_adapters() + + if len(active_adapter) > 1: + raise ValueError( + "Multiple active adapters detected, saving multiple active adapters is not supported yet. You can save adapters separately one by one " + "by iteratively calling `model.set_adapter(adapter_name)` then `model.save_pretrained(...)`" + ) + active_adapter = active_adapter[0] + + current_peft_config = self.peft_config[active_adapter] + current_peft_config.save_pretrained(save_directory) + + # for offloaded modules + module_map = {} + + # Save the model + if state_dict is None: + # if any model parameters are offloaded, make module map + if ( + hasattr(self, "hf_device_map") + and len(set(self.hf_device_map.values())) > 1 + and ("cpu" in self.hf_device_map.values() or "disk" in self.hf_device_map.values()) + ): + warnings.warn( + "Attempting to save a model with offloaded modules. Ensure that unallocated cpu memory exceeds the `shard_size` (5GB default)" + ) + for name, module in model_to_save.named_modules(): + if name == "": + continue + module_state_dict = module.state_dict() + + for key in module_state_dict: + module_map[name + f".{key}"] = module + state_dict = model_to_save.state_dict() + + # Translate state_dict from smp to hf if saving with smp >= 1.10 + if IS_SAGEMAKER_MP_POST_1_10: + for smp_to_hf, _ in smp.state.module_manager.translate_functions: + state_dict = smp_to_hf(state_dict) + + # Handle the case where some state_dict keys shouldn't be saved + if self._keys_to_ignore_on_save is not None: + for ignore_key in self._keys_to_ignore_on_save: + if ignore_key in state_dict.keys(): + del state_dict[ignore_key] + + # Rename state_dict keys before saving to file. Do nothing unless overridden in a particular model. + # (initially introduced with TimmWrapperModel to remove prefix and make checkpoints compatible with timm) + state_dict = self._fix_state_dict_keys_on_save(state_dict) + + if safe_serialization: + # Safetensors does not allow tensor aliasing. + # We're going to remove aliases before saving + ptrs = collections.defaultdict(list) + for name, tensor in state_dict.items(): + # Sometimes in the state_dict we have non-tensor objects. + # e.g. in bitsandbytes we have some `str` objects in the state_dict + if isinstance(tensor, torch.Tensor): + ptrs[id_tensor_storage(tensor)].append(name) + else: + # In the non-tensor case, fall back to the pointer of the object itself + ptrs[id(tensor)].append(name) + + # These are all the pointers of shared tensors + if hasattr(self, "hf_device_map"): + # if the model has offloaded parameters, we must check using find_tied_parameters() + tied_params = find_tied_parameters(self) + if tied_params: + tied_names = tied_params[0] + shared_ptrs = { + ptr: names for ptr, names in ptrs.items() if any(name in tied_names for name in names) + } + else: + shared_ptrs = {} + else: + shared_ptrs = {ptr: names for ptr, names in ptrs.items() if len(names) > 1} + + # Recursively descend to find tied weight keys + _tied_weights_keys = _get_tied_weight_keys(self) + error_names = [] + to_delete_names = set() + for names in shared_ptrs.values(): + # Removing the keys which are declared as known duplicates on + # load. This allows to make sure the name which is kept is consistent. + if _tied_weights_keys is not None: + found = 0 + for name in sorted(names): + matches_pattern = any(re.search(pat, name) for pat in _tied_weights_keys) + if matches_pattern and name in state_dict: + found += 1 + if found < len(names): + to_delete_names.add(name) + # We are entering a place where the weights and the transformers configuration do NOT match. + shared_names, disjoint_names = _find_disjoint(shared_ptrs.values(), state_dict) + # Those are actually tensor sharing but disjoint from each other, we can safely clone them + # Reloaded won't have the same property, but it shouldn't matter in any meaningful way. + for name in disjoint_names: + state_dict[name] = state_dict[name].clone() + + # When not all duplicates have been cleaned, still remove those keys, but put a clear warning. + # If the link between tensors was done at runtime then `from_pretrained` will not get + # the key back leading to random tensor. A proper warning will be shown + # during reload (if applicable), but since the file is not necessarily compatible with + # the config, better show a proper warning. + shared_names, identical_names = _find_identical(shared_names, state_dict) + # delete tensors that have identical storage + for inames in identical_names: + known = inames.intersection(to_delete_names) + for name in known: + del state_dict[name] + unknown = inames.difference(to_delete_names) + if len(unknown) > 1: + error_names.append(unknown) + + if shared_names: + error_names.append(set(shared_names)) + + if len(error_names) > 0: + raise RuntimeError( + f"The weights trying to be saved contained shared tensors {error_names} that are mismatching the transformers base configuration. Try saving using `safe_serialization=False` or remove this tensor sharing.", + ) + + # Shard the model if it is too big. + if not _hf_peft_config_loaded: + weights_name = SAFE_WEIGHTS_NAME if safe_serialization else WEIGHTS_NAME + weights_name = _add_variant(weights_name, variant) + else: + weights_name = ADAPTER_SAFE_WEIGHTS_NAME if safe_serialization else ADAPTER_WEIGHTS_NAME + + filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors") + state_dict_split = split_torch_state_dict_into_shards( + state_dict, filename_pattern=filename_pattern, max_shard_size=max_shard_size + ) + # Save index if sharded + index = None + if state_dict_split.is_sharded: + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + + # Clean the folder from a previous save + for filename in os.listdir(save_directory): + full_filename = os.path.join(save_directory, filename) + # If we have a shard file that is not going to be replaced, we delete it, but only from the main process + # in distributed settings to avoid race conditions. + weights_no_suffix = weights_name.replace(".bin", "").replace(".safetensors", "") + + # make sure that file to be deleted matches format of sharded file, e.g. pytorch_model-00001-of-00005 + filename_no_suffix = filename.replace(".bin", "").replace(".safetensors", "") + reg = re.compile(r"(.*?)-\d{5}-of-\d{5}") + + if ( + filename.startswith(weights_no_suffix) + and os.path.isfile(full_filename) + and filename not in state_dict_split.filename_to_tensors.keys() + and is_main_process + and reg.fullmatch(filename_no_suffix) is not None + ): + os.remove(full_filename) + # Save the model + filename_to_tensors = state_dict_split.filename_to_tensors.items() + if module_map: + filename_to_tensors = logging.tqdm(filename_to_tensors, desc="Saving checkpoint shards") + for shard_file, tensors in filename_to_tensors: + shard = {} + for tensor in tensors: + shard[tensor] = state_dict[tensor].contiguous() + # delete reference, see https://github.com/huggingface/transformers/pull/34890 + del state_dict[tensor] + + # remake shard with onloaded parameters if necessary + if module_map: + if accelerate_version < version.parse("0.31"): + raise ImportError( + f"You need accelerate version to be greater or equal than 0.31 to save models with offloaded parameters. Detected version {accelerate_version}. " + f"Please upgrade accelerate with `pip install -U accelerate`" + ) + # init state_dict for this shard + shard_state_dict = dict.fromkeys(shard, "") + for module_name in shard: + # skip to collect this weight again + if shard_state_dict.get(module_name) != "": + continue + module = module_map[module_name] + # update state dict with onloaded parameters + shard_state_dict = get_state_dict_from_offload(module, module_name, shard_state_dict) + + # assign shard to be the completed state dict + shard = shard_state_dict + del shard_state_dict + gc.collect() + + if safe_serialization: + # At some point we will need to deal better with save_function (used for TPU and other distributed + # joyfulness), but for now this enough. + safe_save_file(shard, os.path.join(save_directory, shard_file), metadata={"format": "pt"}) + else: + save_function(shard, os.path.join(save_directory, shard_file)) + + del state_dict + + if index is None: + path_to_weights = os.path.join(save_directory, weights_name) + logger.info(f"Model weights saved in {path_to_weights}") + else: + save_index_file = SAFE_WEIGHTS_INDEX_NAME if safe_serialization else WEIGHTS_INDEX_NAME + save_index_file = os.path.join(save_directory, _add_variant(save_index_file, variant)) + # Save the index as well + with open(save_index_file, "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + logger.info( + f"The model is bigger than the maximum size per checkpoint ({max_shard_size}) and is going to be " + f"split in {len(state_dict_split.filename_to_tensors)} checkpoint shards. You can find where each parameters has been saved in the " + f"index located at {save_index_file}." + ) + + if push_to_hub: + # Eventually create an empty model card + model_card = create_and_tag_model_card( + repo_id, self.model_tags, token=token, ignore_metadata_errors=ignore_metadata_errors + ) + + # Update model card if needed: + model_card.save(os.path.join(save_directory, "README.md")) + + self._upload_modified_files( + save_directory, + repo_id, + files_timestamps, + commit_message=commit_message, + token=token, + ) + + @wraps(PushToHubMixin.push_to_hub) + def push_to_hub(self, *args, **kwargs): + tags = self.model_tags if self.model_tags is not None else [] + + tags_kwargs = kwargs.get("tags", []) + if isinstance(tags_kwargs, str): + tags_kwargs = [tags_kwargs] + + for tag in tags_kwargs: + if tag not in tags: + tags.append(tag) + + if tags: + kwargs["tags"] = tags + return super().push_to_hub(*args, **kwargs) + + def get_memory_footprint(self, return_buffers=True): + r""" + Get the memory footprint of a model. This will return the memory footprint of the current model in bytes. + Useful to benchmark the memory footprint of the current model and design some tests. Solution inspired from the + PyTorch discussions: https://discuss.pytorch.org/t/gpu-memory-that-model-uses/56822/2 + + Arguments: + return_buffers (`bool`, *optional*, defaults to `True`): + Whether to return the size of the buffer tensors in the computation of the memory footprint. Buffers + are tensors that do not require gradients and not registered as parameters. E.g. mean and std in batch + norm layers. Please see: https://discuss.pytorch.org/t/what-pytorch-means-by-buffers/120266/2 + """ + mem = sum([param.nelement() * param.element_size() for param in self.parameters()]) + if return_buffers: + mem_bufs = sum([buf.nelement() * buf.element_size() for buf in self.buffers()]) + mem = mem + mem_bufs + return mem + + @wraps(torch.nn.Module.cuda) + def cuda(self, *args, **kwargs): + if getattr(self, "quantization_method", None) == QuantizationMethod.HQQ: + raise ValueError("`.cuda` is not supported for HQQ-quantized models.") + # Checks if the model has been loaded in 4-bit or 8-bit with BNB + if getattr(self, "quantization_method", None) == QuantizationMethod.BITS_AND_BYTES: + if getattr(self, "is_loaded_in_8bit", False): + raise ValueError( + "Calling `cuda()` is not supported for `8-bit` quantized models. " + " Please use the model as it is, since the model has already been set to the correct devices." + ) + elif version.parse(importlib.metadata.version("bitsandbytes")) < version.parse("0.43.2"): + raise ValueError( + "Calling `cuda()` is not supported for `4-bit` quantized models with the installed version of bitsandbytes. " + f"The current device is `{self.device}`. If you intended to move the model, please install bitsandbytes >= 0.43.2." + ) + else: + return super().cuda(*args, **kwargs) + + @wraps(torch.nn.Module.to) + def to(self, *args, **kwargs): + # For BNB/GPTQ models, we prevent users from casting the model to another dtype to restrict unwanted behaviours. + # the correct API should be to load the model with the desired dtype directly through `from_pretrained`. + dtype_present_in_args = "dtype" in kwargs + + if not dtype_present_in_args: + for arg in args: + if isinstance(arg, torch.dtype): + dtype_present_in_args = True + break + + if getattr(self, "quantization_method", None) == QuantizationMethod.HQQ: + raise ValueError("`.to` is not supported for HQQ-quantized models.") + + if dtype_present_in_args and getattr(self, "quantization_method", None) == QuantizationMethod.QUARK: + raise ValueError("Casting a Quark quantized model to a new `dtype` is not supported.") + + # Checks if the model has been loaded in 4-bit or 8-bit with BNB + if getattr(self, "quantization_method", None) == QuantizationMethod.BITS_AND_BYTES: + if dtype_present_in_args: + raise ValueError( + "You cannot cast a bitsandbytes model in a new `dtype`. Make sure to load the model using `from_pretrained` using the" + " desired `dtype` by passing the correct `torch_dtype` argument." + ) + + if getattr(self, "is_loaded_in_8bit", False): + raise ValueError( + "`.to` is not supported for `8-bit` bitsandbytes models. Please use the model as it is, since the" + " model has already been set to the correct devices and casted to the correct `dtype`." + ) + elif version.parse(importlib.metadata.version("bitsandbytes")) < version.parse("0.43.2"): + raise ValueError( + "Calling `to()` is not supported for `4-bit` quantized models with the installed version of bitsandbytes. " + f"The current device is `{self.device}`. If you intended to move the model, please install bitsandbytes >= 0.43.2." + ) + elif getattr(self, "quantization_method", None) == QuantizationMethod.GPTQ: + if dtype_present_in_args: + raise ValueError( + "You cannot cast a GPTQ model in a new `dtype`. Make sure to load the model using `from_pretrained` using the desired" + " `dtype` by passing the correct `torch_dtype` argument." + ) + return super().to(*args, **kwargs) + + def half(self, *args): + # Checks if the model is quantized + if getattr(self, "is_quantized", False): + raise ValueError( + "`.half()` is not supported for quantized model. Please use the model as it is, since the" + " model has already been casted to the correct `dtype`." + ) + else: + return super().half(*args) + + def float(self, *args): + # Checks if the model is quantized + if getattr(self, "is_quantized", False): + raise ValueError( + "`.float()` is not supported for quantized model. Please use the model as it is, since the" + " model has already been casted to the correct `dtype`." + ) + else: + return super().float(*args) + + @classmethod + def get_init_context(cls, is_quantized: bool, _is_ds_init_called: bool): + if is_deepspeed_zero3_enabled(): + init_contexts = [no_init_weights()] + # We cannot initialize the model on meta device with deepspeed when not quantized + if not is_quantized and not _is_ds_init_called: + logger.info("Detected DeepSpeed ZeRO-3: activating zero.init() for this model") + init_contexts.extend([deepspeed.zero.Init(config_dict_or_path=deepspeed_config()), set_zero3_state()]) + elif is_quantized: + init_contexts.extend([init_empty_weights(), set_quantized_state()]) + else: + init_contexts = [no_init_weights(), init_empty_weights()] + + return init_contexts + + @classmethod + @restore_default_torch_dtype + def from_pretrained( + cls: Type[SpecificPreTrainedModelType], + pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], + *model_args, + config: Optional[Union[PretrainedConfig, str, os.PathLike]] = None, + cache_dir: Optional[Union[str, os.PathLike]] = None, + ignore_mismatched_sizes: bool = False, + force_download: bool = False, + local_files_only: bool = False, + token: Optional[Union[str, bool]] = None, + revision: str = "main", + use_safetensors: Optional[bool] = None, + weights_only: bool = True, + **kwargs, + ) -> SpecificPreTrainedModelType: + r""" + Instantiate a pretrained pytorch model from a pre-trained model configuration. + + The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated). To train + the model, you should first set it back in training mode with `model.train()`. + + The warning *Weights from XXX not initialized from pretrained model* means that the weights of XXX do not come + pretrained with the rest of the model. It is up to you to train those weights with a downstream fine-tuning + task. + + The warning *Weights from XXX not used in YYY* means that the layer XXX is not used by YYY, therefore those + weights are discarded. + + Parameters: + pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*): + Can be either: + + - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co. + - A path to a *directory* containing model weights saved using + [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`. + - A path or url to a *tensorflow index checkpoint file* (e.g, `./tf_model/model.ckpt.index`). In + this case, `from_tf` should be set to `True` and a configuration object should be provided as + `config` argument. This loading path is slower than converting the TensorFlow checkpoint in a + PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. + - A path or url to a model folder containing a *flax checkpoint file* in *.msgpack* format (e.g, + `./flax_model/` containing `flax_model.msgpack`). In this case, `from_flax` should be set to + `True`. + - `None` if you are both providing the configuration and state dictionary (resp. with keyword + arguments `config` and `state_dict`). + model_args (sequence of positional arguments, *optional*): + All remaining positional arguments will be passed to the underlying model's `__init__` method. + config (`Union[PretrainedConfig, str, os.PathLike]`, *optional*): + Can be either: + + - an instance of a class derived from [`PretrainedConfig`], + - a string or path valid as input to [`~PretrainedConfig.from_pretrained`]. + + Configuration for the model to use instead of an automatically loaded configuration. Configuration can + be automatically loaded when: + + - The model is a model provided by the library (loaded with the *model id* string of a pretrained + model). + - The model was saved using [`~PreTrainedModel.save_pretrained`] and is reloaded by supplying the + save directory. + - The model is loaded by supplying a local directory as `pretrained_model_name_or_path` and a + configuration JSON file named *config.json* is found in the directory. + state_dict (`Dict[str, torch.Tensor]`, *optional*): + A state dictionary to use instead of a state dictionary loaded from saved weights file. + + This option can be used if you want to create a model from a pretrained configuration but load your own + weights. In this case though, you should check if using [`~PreTrainedModel.save_pretrained`] and + [`~PreTrainedModel.from_pretrained`] is not a simpler option. + cache_dir (`Union[str, os.PathLike]`, *optional*): + Path to a directory in which a downloaded pretrained model configuration should be cached if the + standard cache should not be used. + from_tf (`bool`, *optional*, defaults to `False`): + Load the model weights from a TensorFlow checkpoint save file (see docstring of + `pretrained_model_name_or_path` argument). + from_flax (`bool`, *optional*, defaults to `False`): + Load the model weights from a Flax checkpoint save file (see docstring of + `pretrained_model_name_or_path` argument). + ignore_mismatched_sizes (`bool`, *optional*, defaults to `False`): + Whether or not to raise an error if some of the weights from the checkpoint do not have the same size + as the weights of the model (if for instance, you are instantiating a model with 10 labels from a + checkpoint with 3 labels). + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force the (re-)download of the model weights and configuration files, overriding the + cached versions if they exist. + resume_download: + Deprecated and ignored. All downloads are now resumed by default when possible. + Will be removed in v5 of Transformers. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. + output_loading_info(`bool`, *optional*, defaults to `False`): + Whether ot not to also return a dictionary containing missing keys, unexpected keys and error messages. + local_files_only(`bool`, *optional*, defaults to `False`): + Whether or not to only look at local files (i.e., do not try to download the model). + token (`str` or `bool`, *optional*): + The token to use as HTTP bearer authorization for remote files. If `True`, or not specified, will use + the token generated when running `huggingface-cli login` (stored in `~/.huggingface`). + revision (`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a + git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any + identifier allowed by git. + + + + To test a pull request you made on the Hub, you can pass `revision="refs/pr/"`. + + + attn_implementation (`str`, *optional*): + The attention implementation to use in the model (if relevant). Can be any of `"eager"` (manual implementation of the attention), `"sdpa"` (using [`F.scaled_dot_product_attention`](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention.html)), or `"flash_attention_2"` (using [Dao-AILab/flash-attention](https://github.com/Dao-AILab/flash-attention)). By default, if available, SDPA will be used for torch>=2.1.1. The default is otherwise the manual `"eager"` implementation. + + > Parameters for big model inference + + torch_dtype (`str` or `torch.dtype`, *optional*): + Override the default `torch.dtype` and load the model under a specific `dtype`. The different options + are: + + 1. `torch.float16` or `torch.bfloat16` or `torch.float`: load in a specified + `dtype`, ignoring the model's `config.torch_dtype` if one exists. If not specified + - the model will get loaded in `torch.float` (fp32). + + 2. `"auto"` - A `torch_dtype` entry in the `config.json` file of the model will be + attempted to be used. If this entry isn't found then next check the `dtype` of the first weight in + the checkpoint that's of a floating point type and use that as `dtype`. This will load the model + using the `dtype` it was saved in at the end of the training. It can't be used as an indicator of how + the model was trained. Since it could be trained in one of half precision dtypes, but saved in fp32. + + 3. A string that is a valid `torch.dtype`. E.g. "float32" loads the model in `torch.float32`, "float16" loads in `torch.float16` etc. + + + + For some models the `dtype` they were trained in is unknown - you may try to check the model's paper or + reach out to the authors and ask them to add this information to the model's card and to insert the + `torch_dtype` entry in `config.json` on the hub. + + + + device_map (`str` or `Dict[str, Union[int, str, torch.device]]` or `int` or `torch.device`, *optional*): + A map that specifies where each submodule should go. It doesn't need to be refined to each + parameter/buffer name, once a given module name is inside, every submodule of it will be sent to the + same device. If we only pass the device (*e.g.*, `"cpu"`, `"cuda:1"`, `"mps"`, or a GPU ordinal rank + like `1`) on which the model will be allocated, the device map will map the entire model to this + device. Passing `device_map = 0` means put the whole model on GPU 0. + + To have Accelerate compute the most optimized `device_map` automatically, set `device_map="auto"`. For + more information about each option see [designing a device + map](https://hf.co/docs/accelerate/main/en/usage_guides/big_modeling#designing-a-device-map). + max_memory (`Dict`, *optional*): + A dictionary device identifier to maximum memory if using `device_map`. Will default to the maximum memory available for each + GPU and the available CPU RAM if unset. + tp_plan (`str`, *optional*): + A torch tensor parallel plan, see [here](https://pytorch.org/tutorials/intermediate/TP_tutorial.html). Currently, it only accepts + `tp_plan="auto"` to use predefined plan based on the model. Note that if you use it, you should launch your script accordingly with + `torchrun [args] script.py`. This will be much faster than using a `device_map`, but has limitations. + tp_size (`str`, *optional*): + A torch tensor parallel degree. If not provided would default to world size. + offload_folder (`str` or `os.PathLike`, *optional*): + If the `device_map` contains any value `"disk"`, the folder where we will offload weights. + offload_state_dict (`bool`, *optional*): + If `True`, will temporarily offload the CPU state dict to the hard drive to avoid getting out of CPU + RAM if the weight of the CPU state dict + the biggest shard of the checkpoint does not fit. Defaults to + `True` when there is some disk offload. + offload_buffers (`bool`, *optional*): + Whether or not to offload the buffers with the model parameters. + quantization_config (`Union[QuantizationConfigMixin,Dict]`, *optional*): + A dictionary of configuration parameters or a QuantizationConfigMixin object for quantization (e.g + bitsandbytes, gptq). There may be other quantization-related kwargs, including `load_in_4bit` and + `load_in_8bit`, which are parsed by QuantizationConfigParser. Supported only for bitsandbytes + quantizations and not preferred. consider inserting all such arguments into quantization_config + instead. + subfolder (`str`, *optional*, defaults to `""`): + In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can + specify the folder name here. + variant (`str`, *optional*): + If specified load weights from `variant` filename, *e.g.* pytorch_model..bin. `variant` is + ignored when using `from_tf` or `from_flax`. + use_safetensors (`bool`, *optional*, defaults to `None`): + Whether or not to use `safetensors` checkpoints. Defaults to `None`. If not specified and `safetensors` + is not installed, it will be set to `False`. + weights_only (`bool`, *optional*, defaults to `True`): + Indicates whether unpickler should be restricted to loading only tensors, primitive types, + dictionaries and any types added via torch.serialization.add_safe_globals(). + When set to False, we can load wrapper tensor subclass weights. + key_mapping (`Dict[str, str], *optional*): + A potential mapping of the weight names if using a model on the Hub which is compatible to a Transformers + architecture, but was not converted accordingly. + kwargs (remaining dictionary of keyword arguments, *optional*): + Can be used to update the configuration object (after it being loaded) and initiate the model (e.g., + `output_attentions=True`). Behaves differently depending on whether a `config` is provided or + automatically loaded: + + - If a configuration is provided with `config`, `**kwargs` will be directly passed to the + underlying model's `__init__` method (we assume all relevant updates to the configuration have + already been done) + - If a configuration is not provided, `kwargs` will be first passed to the configuration class + initialization function ([`~PretrainedConfig.from_pretrained`]). Each key of `kwargs` that + corresponds to a configuration attribute will be used to override said attribute with the + supplied `kwargs` value. Remaining keys that do not correspond to any configuration attribute + will be passed to the underlying model's `__init__` function. + + + + Activate the special ["offline-mode"](https://huggingface.co/transformers/installation.html#offline-mode) to + use this method in a firewalled environment. + + + + Examples: + + ```python + >>> from transformers import BertConfig, BertModel + + >>> # Download model and configuration from huggingface.co and cache. + >>> model = BertModel.from_pretrained("google-bert/bert-base-uncased") + >>> # Model was saved using *save_pretrained('./test/saved_model/')* (for example purposes, not runnable). + >>> model = BertModel.from_pretrained("./test/saved_model/") + >>> # Update configuration during loading. + >>> model = BertModel.from_pretrained("google-bert/bert-base-uncased", output_attentions=True) + >>> assert model.config.output_attentions == True + >>> # Loading from a TF checkpoint file instead of a PyTorch model (slower, for example purposes, not runnable). + >>> config = BertConfig.from_json_file("./tf_model/my_tf_model_config.json") + >>> model = BertModel.from_pretrained("./tf_model/my_tf_checkpoint.ckpt.index", from_tf=True, config=config) + >>> # Loading from a Flax checkpoint file instead of a PyTorch model (slower) + >>> model = BertModel.from_pretrained("google-bert/bert-base-uncased", from_flax=True) + ``` + """ + state_dict = kwargs.pop("state_dict", None) + from_tf = kwargs.pop("from_tf", False) + from_flax = kwargs.pop("from_flax", False) + proxies = kwargs.pop("proxies", None) + output_loading_info = kwargs.pop("output_loading_info", False) + use_auth_token = kwargs.pop("use_auth_token", None) + from_pipeline = kwargs.pop("_from_pipeline", None) + from_auto_class = kwargs.pop("_from_auto", False) + torch_dtype = kwargs.pop("torch_dtype", None) + device_map = kwargs.pop("device_map", None) + max_memory = kwargs.pop("max_memory", None) + offload_folder = kwargs.pop("offload_folder", None) + offload_state_dict = kwargs.pop("offload_state_dict", False) + offload_buffers = kwargs.pop("offload_buffers", False) + load_in_8bit = kwargs.pop("load_in_8bit", False) + load_in_4bit = kwargs.pop("load_in_4bit", False) + quantization_config = kwargs.pop("quantization_config", None) + subfolder = kwargs.pop("subfolder", "") + commit_hash = kwargs.pop("_commit_hash", None) + variant = kwargs.pop("variant", None) + adapter_kwargs = kwargs.pop("adapter_kwargs", {}) + adapter_name = kwargs.pop("adapter_name", "default") + use_flash_attention_2 = kwargs.pop("use_flash_attention_2", False) + generation_config = kwargs.pop("generation_config", None) + gguf_file = kwargs.pop("gguf_file", None) + tp_plan = kwargs.pop("tp_plan", None) + tp_size = kwargs.pop("tp_size", None) + key_mapping = kwargs.pop("key_mapping", None) + # Not used anymore -- remove them from the kwargs + _ = kwargs.pop("resume_download", None) + _ = kwargs.pop("trust_remote_code", None) + _ = kwargs.pop("mirror", None) + _ = kwargs.pop("_fast_init", True) + _ = kwargs.pop("low_cpu_mem_usage", None) + + if state_dict is not None and (pretrained_model_name_or_path is not None or gguf_file is not None): + raise ValueError( + "`state_dict` cannot be passed together with a model name or a `gguf_file`. Use one of the two loading strategies." + ) + if tp_size is not None and tp_plan is None: + raise ValueError("tp_plan has to be set when tp_size is passed.") + if tp_plan is not None and tp_plan != "auto": + # TODO: we can relax this check when we support taking tp_plan from a json file, for example. + raise ValueError(f"tp_plan supports 'auto' only for now but got {tp_plan}.") + if tp_plan is not None and device_map is not None: + raise ValueError( + "`tp_plan` and `device_map` are mutually exclusive. Choose either one for parallelization." + ) + + # If torchrun was used, make sure to TP by default. This way people don't need to change tp or device map + if device_map == "auto" and tp_plan is None and int(os.environ.get("WORLD_SIZE", 0)): + tp_plan = "auto" # device_map = "auto" in torchrun equivalent to TP plan = AUTO! + device_map = None + + # We need to correctly dispatch the model on the current process device. The easiest way for this is to use a simple + # `device_map` pointing to the correct device + device_mesh = None + if tp_plan is not None: + if not is_torch_greater_or_equal("2.5"): + raise EnvironmentError("tensor parallel is only supported for `torch>=2.5`.") + + # Detect the accelerator on the machine. If no accelerator is available, it returns CPU. + device_type = torch._C._get_accelerator().type + + if not torch.distributed.is_initialized(): + try: + rank = int(os.environ["RANK"]) + world_size = int(os.environ["WORLD_SIZE"]) + if device_type == "cuda": + torch.distributed.init_process_group( + "nccl", rank=rank, world_size=world_size, init_method="env://" + ) + torch.cuda.set_device(int(os.environ["LOCAL_RANK"])) + elif device_type == "cpu": + cpu_backend = "ccl" if int(os.environ.get("CCL_WORKER_COUNT", 0)) else "gloo" + torch.distributed.init_process_group(cpu_backend, rank=rank, world_size=world_size) + elif device_type == "xpu": + torch.distributed.init_process_group("ccl", rank=rank, world_size=world_size) + torch.xpu.set_device(int(os.environ["LOCAL_RANK"])) + + except Exception as e: + raise EnvironmentError( + "We tried to initialize torch.distributed for you, but it failed, make" + "sure you init torch distributed in your script to use `tp_plan='auto'`" + ) from e + + # Get device with index assuming equal number of devices per host + if device_type == "xpu": + index = torch.xpu.current_device() + else: + index = None if device_type == "cpu" else torch.cuda.current_device() + tp_device = torch.device(device_type, index) + + if index is not None and index > 0: + import sys + + sys.stdout = open(os.devnull, "w") + sys.stderr = open(os.devnull, "w") + # This is the easiest way to dispatch to the current process device + device_map = tp_device + + # Assuming sharding the model onto the world when tp_size not provided + tp_size = tp_size if tp_size is not None else torch.distributed.get_world_size() + device_mesh = torch.distributed.init_device_mesh(tp_device.type, (tp_size,)) + + if use_auth_token is not None: + warnings.warn( + "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.", + FutureWarning, + ) + if token is not None: + raise ValueError( + "`token` and `use_auth_token` are both specified. Please set only the argument `token`." + ) + token = use_auth_token + + if token is not None and adapter_kwargs is not None and "token" not in adapter_kwargs: + adapter_kwargs["token"] = token + + if use_safetensors is None and not is_safetensors_available(): + use_safetensors = False + + if gguf_file is not None and not is_accelerate_available(): + raise ValueError("accelerate is required when loading a GGUF file `pip install accelerate`.") + + if commit_hash is None: + if not isinstance(config, PretrainedConfig): + # We make a call to the config file first (which may be absent) to get the commit hash as soon as possible + resolved_config_file = cached_file( + pretrained_model_name_or_path, + CONFIG_NAME, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + local_files_only=local_files_only, + token=token, + revision=revision, + subfolder=subfolder, + _raise_exceptions_for_gated_repo=False, + _raise_exceptions_for_missing_entries=False, + _raise_exceptions_for_connection_errors=False, + ) + commit_hash = extract_commit_hash(resolved_config_file, commit_hash) + else: + commit_hash = getattr(config, "_commit_hash", None) + + if is_peft_available(): + _adapter_model_path = adapter_kwargs.pop("_adapter_model_path", None) + + if _adapter_model_path is None: + _adapter_model_path = find_adapter_config_file( + pretrained_model_name_or_path, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + local_files_only=local_files_only, + _commit_hash=commit_hash, + **adapter_kwargs, + ) + if _adapter_model_path is not None and os.path.isfile(_adapter_model_path): + with open(_adapter_model_path, "r", encoding="utf-8") as f: + _adapter_model_path = pretrained_model_name_or_path + pretrained_model_name_or_path = json.load(f)["base_model_name_or_path"] + else: + _adapter_model_path = None + + # Potentially detect context manager or global device, and use it (only if no device_map was provided) + if device_map is None and not is_deepspeed_zero3_enabled(): + device_in_context = get_torch_context_manager_or_global_device() + if device_in_context == torch.device("meta"): + # TODO Cyril: raise an error instead of the warning in v4.53 (and change the test to check for raise instead of success) + logger.warning( + "We detected that you are using `from_pretrained` with a meta device context manager or `torch.set_default_device('meta')`\n" + "This is an anti-pattern and will raise an Error in version v4.53\nIf you want to initialize a model on the meta device, use " + "the context manager or global device with `from_config`, or `ModelClass(config)`" + ) + device_map = device_in_context + + # change device_map into a map if we passed an int, a str or a torch.device + if isinstance(device_map, torch.device): + device_map = {"": device_map} + elif isinstance(device_map, str) and device_map not in ["auto", "balanced", "balanced_low_0", "sequential"]: + try: + device_map = {"": torch.device(device_map)} + except RuntimeError: + raise ValueError( + "When passing device_map as a string, the value needs to be a device name (e.g. cpu, cuda:0) or " + f"'auto', 'balanced', 'balanced_low_0', 'sequential' but found {device_map}." + ) + elif isinstance(device_map, int): + if device_map < 0: + raise ValueError( + "You can't pass device_map as a negative int. If you want to put the model on the cpu, pass device_map = 'cpu' " + ) + else: + device_map = {"": device_map} + + if device_map is not None: + if is_deepspeed_zero3_enabled(): + raise ValueError("DeepSpeed Zero-3 is not compatible with passing a `device_map`.") + if not is_accelerate_available(): + raise ValueError( + ( + "Using a `device_map`, `tp_plan`, `torch.device` context manager or setting `torch.set_default_device(device)` " + "requires `accelerate`. You can install it with `pip install accelerate`" + ) + ) + + # handling bnb config from kwargs, remove after `load_in_{4/8}bit` deprecation. + if load_in_4bit or load_in_8bit: + if quantization_config is not None: + raise ValueError( + "You can't pass `load_in_4bit`or `load_in_8bit` as a kwarg when passing " + "`quantization_config` argument at the same time." + ) + + # preparing BitsAndBytesConfig from kwargs + config_dict = {k: v for k, v in kwargs.items() if k in inspect.signature(BitsAndBytesConfig).parameters} + config_dict = {**config_dict, "load_in_4bit": load_in_4bit, "load_in_8bit": load_in_8bit} + quantization_config, kwargs = BitsAndBytesConfig.from_dict( + config_dict=config_dict, return_unused_kwargs=True, **kwargs + ) + logger.warning( + "The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. " + "Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead." + ) + + from_pt = not (from_tf | from_flax) + + user_agent = {"file_type": "model", "framework": "pytorch", "from_auto_class": from_auto_class} + if from_pipeline is not None: + user_agent["using_pipeline"] = from_pipeline + + if is_offline_mode() and not local_files_only: + logger.info("Offline mode: forcing local_files_only=True") + local_files_only = True + + # Load config if we don't provide a configuration + if not isinstance(config, PretrainedConfig): + config_path = config if config is not None else pretrained_model_name_or_path + config, model_kwargs = cls.config_class.from_pretrained( + config_path, + cache_dir=cache_dir, + return_unused_kwargs=True, + force_download=force_download, + proxies=proxies, + local_files_only=local_files_only, + token=token, + revision=revision, + subfolder=subfolder, + gguf_file=gguf_file, + _from_auto=from_auto_class, + _from_pipeline=from_pipeline, + **kwargs, + ) + if "gguf_file" in model_kwargs: + model_kwargs.pop("gguf_file") + else: + # In case one passes a config to `from_pretrained` + "attn_implementation" + # override the `_attn_implementation` attribute to `attn_implementation` of the kwargs + # Please see: https://github.com/huggingface/transformers/issues/28038 + + # Overwrite `config._attn_implementation` by the one from the kwargs --> in auto-factory + # we pop attn_implementation from the kwargs but this handles the case where users + # passes manually the config to `from_pretrained`. + config = copy.deepcopy(config) + + kwarg_attn_imp = kwargs.pop("attn_implementation", None) + if kwarg_attn_imp is not None: + config._attn_implementation = kwarg_attn_imp + + model_kwargs = kwargs + + pre_quantized = hasattr(config, "quantization_config") + if pre_quantized and not AutoHfQuantizer.supports_quant_method(config.quantization_config): + pre_quantized = False + + if pre_quantized or quantization_config is not None: + if pre_quantized: + config.quantization_config = AutoHfQuantizer.merge_quantization_configs( + config.quantization_config, quantization_config + ) + else: + config.quantization_config = quantization_config + + hf_quantizer = AutoHfQuantizer.from_config( + config.quantization_config, + pre_quantized=pre_quantized, + ) + else: + hf_quantizer = None + + if hf_quantizer is not None: + hf_quantizer.validate_environment( + torch_dtype=torch_dtype, + from_tf=from_tf, + from_flax=from_flax, + device_map=device_map, + weights_only=weights_only, + ) + torch_dtype = hf_quantizer.update_torch_dtype(torch_dtype) + device_map = hf_quantizer.update_device_map(device_map) + config = hf_quantizer.update_tp_plan(config) + + # In order to ensure popular quantization methods are supported. Can be disable with `disable_telemetry` + if hasattr(hf_quantizer.quantization_config.quant_method, "value"): + user_agent["quant"] = hf_quantizer.quantization_config.quant_method.value + else: + user_agent["quant"] = hf_quantizer.quantization_config.quant_method + + if gguf_file is not None and hf_quantizer is not None: + raise ValueError( + "You cannot combine Quantization and loading a model from a GGUF file, try again by making sure you did not passed a `quantization_config` or that you did not load a quantized model from the Hub." + ) + + if ( + gguf_file + and device_map is not None + and ((isinstance(device_map, dict) and "disk" in device_map.values()) or "disk" in device_map) + ): + raise RuntimeError( + "One or more modules is configured to be mapped to disk. Disk offload is not supported for models " + "loaded from GGUF files." + ) + + checkpoint_files, sharded_metadata = _get_resolved_checkpoint_files( + pretrained_model_name_or_path=pretrained_model_name_or_path, + subfolder=subfolder, + variant=variant, + gguf_file=gguf_file, + from_tf=from_tf, + from_flax=from_flax, + use_safetensors=use_safetensors, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + local_files_only=local_files_only, + token=token, + user_agent=user_agent, + revision=revision, + commit_hash=commit_hash, + ) + + is_sharded = sharded_metadata is not None + is_quantized = hf_quantizer is not None + is_from_file = pretrained_model_name_or_path is not None or gguf_file is not None + + if ( + is_safetensors_available() + and is_from_file + and not is_sharded + and checkpoint_files[0].endswith(".safetensors") + ): + with safe_open(checkpoint_files[0], framework="pt") as f: + metadata = f.metadata() + + if metadata is None: + # Assume it's a pytorch checkpoint (introduced for timm checkpoints) + pass + elif metadata.get("format") == "pt": + pass + elif metadata.get("format") == "tf": + from_tf = True + logger.info("A TensorFlow safetensors file is being loaded in a PyTorch model.") + elif metadata.get("format") == "flax": + from_flax = True + logger.info("A Flax safetensors file is being loaded in a PyTorch model.") + elif metadata.get("format") == "mlx": + # This is a mlx file, we assume weights are compatible with pt + pass + else: + raise ValueError( + f"Incompatible safetensors file. File metadata is not ['pt', 'tf', 'flax', 'mlx'] but {metadata.get('format')}" + ) + + from_pt = not (from_tf | from_flax) + + if from_pt: + if gguf_file: + from .modeling_gguf_pytorch_utils import load_gguf_checkpoint + + # we need a dummy model to get the state_dict - for this reason, we keep the state_dict as if it was + # passed directly as a kwarg from now on + with torch.device("meta"): + dummy_model = cls(config) + state_dict = load_gguf_checkpoint(checkpoint_files[0], return_tensors=True, model_to_load=dummy_model)[ + "tensors" + ] + + # Find the correct dtype based on current state + config, torch_dtype, dtype_orig = _get_torch_dtype( + cls, torch_dtype, checkpoint_files, config, sharded_metadata, state_dict, weights_only + ) + + config.name_or_path = pretrained_model_name_or_path + + # Instantiate model. + model_init_context = cls.get_init_context(is_quantized, _is_ds_init_called) + + config = copy.deepcopy(config) # We do not want to modify the config inplace in from_pretrained. + if not getattr(config, "_attn_implementation_autoset", False): + config = cls._autoset_attn_implementation( + config, + use_flash_attention_2=use_flash_attention_2, + torch_dtype=torch_dtype, + device_map=device_map, + ) + + with ContextManagers(model_init_context): + # Let's make sure we don't run the init function of buffer modules + model = cls(config, *model_args, **model_kwargs) + + # Make sure to tie the weights correctly + model.tie_weights() + + # Last check for tp + if device_mesh is not None and not model.supports_tp_plan: + if config.base_model_tp_plan is None and config.get_text_config().base_model_tp_plan is None: + raise NotImplementedError("This model does not have a tensor parallel plan.") + + # make sure we use the model's config since the __init__ call might have copied it + config = model.config + + # Find fp32 modules if needed + keep_in_fp32_regex = None + # The _keep_in_fp32_modules flag is only used to avoid bf16 -> fp16 casting precision issues. It was introduced + # in case of force loading a model that should stay bf16 in fp16 (which includes a few quantizers as this is a pre-processing + # step for e.g. bitsandbytes). See https://github.com/huggingface/transformers/issues/20287 for details. + if model._keep_in_fp32_modules is not None and ( + torch_dtype == torch.float16 or getattr(hf_quantizer, "use_keep_in_fp32_modules", False) + ): + # We need to match exact layers, so we add either `.` on each side, or start/end of string + keep_in_fp32_regex = re.compile( + "|".join([rf"((^|\.){module}($|\.))" for module in model._keep_in_fp32_modules]) + ) + + if hf_quantizer is not None: + hf_quantizer.preprocess_model( + model=model, device_map=device_map, keep_in_fp32_modules=model._keep_in_fp32_modules, config=config + ) + # We store the original dtype for quantized models as we cannot easily retrieve it + # once the weights have been quantized + # Note that once you have loaded a quantized model, you can't change its dtype so this will + # remain a single source of truth + original_dtype = torch_dtype if torch_dtype is not None else torch.get_default_dtype() + + def _assign_original_dtype(module): + for child in module.children(): + if isinstance(child, PreTrainedModel): + child.config._pre_quantization_dtype = original_dtype + _assign_original_dtype(child) + + config._pre_quantization_dtype = original_dtype + _assign_original_dtype(model) + + # Prepare the full device map + if device_map is not None: + device_map = _get_device_map(model, device_map, max_memory, hf_quantizer, torch_dtype, keep_in_fp32_regex) + + # Finalize model weight initialization + if from_tf: + model, loading_info = cls._load_from_tf(model, config, checkpoint_files) + elif from_flax: + model = cls._load_from_flax(model, checkpoint_files) + elif from_pt: + # restore default dtype + if dtype_orig is not None: + torch.set_default_dtype(dtype_orig) + + ( + model, + missing_keys, + unexpected_keys, + mismatched_keys, + offload_index, + error_msgs, + ) = cls._load_pretrained_model( + model, + state_dict, + checkpoint_files, + pretrained_model_name_or_path, + ignore_mismatched_sizes=ignore_mismatched_sizes, + sharded_metadata=sharded_metadata, + device_map=device_map, + disk_offload_folder=offload_folder, + offload_state_dict=offload_state_dict, + dtype=torch_dtype, + hf_quantizer=hf_quantizer, + keep_in_fp32_regex=keep_in_fp32_regex, + device_mesh=device_mesh, + key_mapping=key_mapping, + weights_only=weights_only, + ) + + # record tp degree the model sharded to + model._tp_size = tp_size + + # make sure token embedding weights are still tied if needed + model.tie_weights() + + # Set model in evaluation mode to deactivate DropOut modules by default + model.eval() + + # If it is a model with generation capabilities, attempt to load the generation config + if model.can_generate() and generation_config is not None: + logger.info("The user-defined `generation_config` will be used to override the default generation config.") + model.generation_config = model.generation_config.from_dict(generation_config.to_dict()) + elif model.can_generate() and pretrained_model_name_or_path is not None: + try: + model.generation_config = GenerationConfig.from_pretrained( + pretrained_model_name_or_path, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + local_files_only=local_files_only, + token=token, + revision=revision, + subfolder=subfolder, + _from_auto=from_auto_class, + _from_pipeline=from_pipeline, + **kwargs, + ) + except OSError: + logger.info( + "Generation config file not found, using a generation config created from the model config." + ) + pass + + # Dispatch model with hooks on all devices if necessary (not needed with a tp_plan, so we skip it as it slightly + # harm performances) + if device_map is not None and device_mesh is None: + device_map_kwargs = { + "device_map": device_map, + "offload_dir": offload_folder, + "offload_index": offload_index, + "offload_buffers": offload_buffers, + } + if "skip_keys" in inspect.signature(dispatch_model).parameters: + device_map_kwargs["skip_keys"] = model._skip_keys_device_placement + # For HQQ method we force-set the hooks for single GPU envs + if ( + "force_hooks" in inspect.signature(dispatch_model).parameters + and hf_quantizer is not None + and hf_quantizer.quantization_config.quant_method == QuantizationMethod.HQQ + ): + device_map_kwargs["force_hooks"] = True + if ( + hf_quantizer is not None + and hf_quantizer.quantization_config.quant_method == QuantizationMethod.FBGEMM_FP8 + and isinstance(device_map, dict) + and ("cpu" in device_map.values() or "disk" in device_map.values()) + ): + device_map_kwargs["offload_buffers"] = True + + if not is_fsdp_enabled() and not is_deepspeed_zero3_enabled(): + dispatch_model(model, **device_map_kwargs) + + if hf_quantizer is not None: + hf_quantizer.postprocess_model(model, config=config) + model.hf_quantizer = hf_quantizer + + if _adapter_model_path is not None: + model.load_adapter( + _adapter_model_path, + adapter_name=adapter_name, + token=token, + adapter_kwargs=adapter_kwargs, + ) + + if output_loading_info: + if from_pt: + loading_info = { + "missing_keys": missing_keys, + "unexpected_keys": unexpected_keys, + "mismatched_keys": mismatched_keys, + "error_msgs": error_msgs, + } + elif from_flax: + loading_info = None + return model, loading_info + return model + + @staticmethod + def _fix_state_dict_key_on_load(key: str) -> Tuple[str, bool]: + """Replace legacy parameter names with their modern equivalents. E.g. beta -> bias, gamma -> weight.""" + # Rename LayerNorm beta & gamma params for some early models ported from Tensorflow (e.g. Bert) + # This rename is logged. + if key.endswith("LayerNorm.beta"): + return key.replace("LayerNorm.beta", "LayerNorm.bias"), True + if key.endswith("LayerNorm.gamma"): + return key.replace("LayerNorm.gamma", "LayerNorm.weight"), True + + # Rename weight norm parametrizations to match changes across torch versions. + # Impacts a number of speech/wav2vec models. e.g. Hubert, Wav2Vec2, and others. + # This rename is not logged. + if hasattr(nn.utils.parametrizations, "weight_norm"): + if key.endswith("weight_g"): + return key.replace("weight_g", "parametrizations.weight.original0"), True + if key.endswith("weight_v"): + return key.replace("weight_v", "parametrizations.weight.original1"), True + else: + if key.endswith("parametrizations.weight.original0"): + return key.replace("parametrizations.weight.original0", "weight_g"), True + if key.endswith("parametrizations.weight.original1"): + return key.replace("parametrizations.weight.original1", "weight_v"), True + + return key, False + + def _get_key_renaming_mapping( + self, + checkpoint_keys: List[str], + key_mapping: Optional[Dict[str, str]] = None, + loading_base_model_from_task_state_dict: bool = False, + loading_task_model_from_base_state_dict: bool = False, + ): + """ + Compute a mapping between the serialized keys on disk `checkpoint_keys`, and the keys that the model + that we are loading expects. This is the single entry point for key renaming that will be used during + loading. + Log if any parameters have been renamed. + """ + prefix = self.base_model_prefix + _prefix = f"{prefix}." + + renamed_keys = {} + key_renaming_mapping = {} + for key in checkpoint_keys: + # Class specific rename + new_key, has_changed = self._fix_state_dict_key_on_load(key) + + # Optionally map the key according to `key_mapping` + if key_mapping is not None: + for pattern, replacement in key_mapping.items(): + new_key, n_replace = re.subn(pattern, replacement, new_key) + # Early exit of the loop + if n_replace > 0: + has_changed = True + break + + # In this case, we need to add the prefix to the keys, to match them to the expected keys + if loading_task_model_from_base_state_dict: + new_key = ".".join([prefix, new_key]) + # In this case we need to remove the prefix from the key to match them to the expected keys, and use + # only the keys starting with the prefix + elif loading_base_model_from_task_state_dict: + if not new_key.startswith(_prefix): + continue + new_key = new_key[len(_prefix) :] + + key_renaming_mapping[key] = new_key + + # track gamma/beta rename for logging + if has_changed: + if key.endswith("LayerNorm.gamma"): + renamed_keys["LayerNorm.gamma"] = (key, new_key) + elif key.endswith("LayerNorm.beta"): + renamed_keys["LayerNorm.beta"] = (key, new_key) + + if renamed_keys: + warning_msg = f"A pretrained model of type `{self.__class__.__name__}` " + warning_msg += "contains parameters that have been renamed internally (a few are listed below but more are present in the model):\n" + for old_key, new_key in renamed_keys.values(): + warning_msg += f"* `{old_key}` -> `{new_key}`\n" + warning_msg += "If you are using a model from the Hub, consider submitting a PR to adjust these weights and help future users." + logger.info_once(warning_msg) + + return key_renaming_mapping + + @staticmethod + def _fix_state_dict_key_on_save(key) -> Tuple[str, bool]: + """ + Similar to `_fix_state_dict_key_on_load` allows to define hook for state dict key renaming on model save. + Do nothing by default, but can be overridden in particular models. + """ + return key, False + + def _fix_state_dict_keys_on_save(self, state_dict): + """ + Similar to `_fix_state_dict_keys_on_load` allows to define hook for state dict key renaming on model save. + Apply `_fix_state_dict_key_on_save` to all keys in `state_dict`. + """ + return {self._fix_state_dict_key_on_save(key)[0]: value for key, value in state_dict.items()} + + @classmethod + def _load_pretrained_model( + cls, + model: "PreTrainedModel", + state_dict: Optional[Dict], + checkpoint_files: Optional[List[str]], + pretrained_model_name_or_path: Optional[str], + ignore_mismatched_sizes: bool = False, + sharded_metadata: Optional[Dict] = None, + device_map: Optional[Dict] = None, + disk_offload_folder: Optional[str] = None, + offload_state_dict: Optional[bool] = None, + dtype: Optional[torch.dtype] = None, + hf_quantizer: Optional[HfQuantizer] = None, + keep_in_fp32_regex: Optional[re.Pattern] = None, + device_mesh: Optional["torch.distributed.device_mesh.DeviceMesh"] = None, + key_mapping: Optional[Dict[str, str]] = None, + weights_only: bool = True, + ): + # Useful flags + is_quantized = hf_quantizer is not None + is_hqq_or_quark = is_quantized and hf_quantizer.quantization_config.quant_method in { + QuantizationMethod.HQQ, + QuantizationMethod.QUARK, + } + is_hqq_or_bnb = is_quantized and hf_quantizer.quantization_config.quant_method in { + QuantizationMethod.HQQ, + QuantizationMethod.BITS_AND_BYTES, + } + + # Get all the keys of the state dicts that we have to initialize the model + if sharded_metadata is not None: + original_checkpoint_keys = sharded_metadata["all_checkpoint_keys"] + elif state_dict is not None: + original_checkpoint_keys = list(state_dict.keys()) + else: + original_checkpoint_keys = list( + load_state_dict(checkpoint_files[0], map_location="meta", weights_only=weights_only).keys() + ) + + # Check if we are in a special state, i.e. loading from a state dict coming from a different architecture + prefix = model.base_model_prefix + _prefix = f"{prefix}." + has_prefix_module = any(s.startswith(prefix) for s in original_checkpoint_keys) if len(prefix) > 0 else False + expects_prefix_module = hasattr(model, prefix) if len(prefix) > 0 else False + loading_task_model_from_base_state_dict = not has_prefix_module and expects_prefix_module + loading_base_model_from_task_state_dict = has_prefix_module and not expects_prefix_module + + # Find the key names that the model expects from the serialized keys + key_renaming_mapping = model._get_key_renaming_mapping( + original_checkpoint_keys, + key_mapping, + loading_base_model_from_task_state_dict, + loading_task_model_from_base_state_dict, + ) + checkpoint_keys = list(key_renaming_mapping.values()) + + # Find missing and unexpected keys from the state dict + missing_keys, unexpected_keys = _find_missing_and_unexpected_keys( + cls, + model, + original_checkpoint_keys, + checkpoint_keys, + loading_base_model_from_task_state_dict, + hf_quantizer, + device_map, + ) + # Find all the keys with shape mismatch (if we ignore the mismatch, the weights need to be newly initialized the + # same way as missing keys) + mismatched_keys, mismatched_shapes = _find_mismatched_keys( + model, + state_dict, + checkpoint_files, + ignore_mismatched_sizes, + key_renaming_mapping, + is_quantized, + weights_only, + ) + + # We need to update both the mapping and the list of checkpoint keys to remove the mismatched ones + key_renaming_mapping = {k: v for k, v in key_renaming_mapping.items() if v not in mismatched_keys} + checkpoint_keys = list(key_renaming_mapping.values()) + + # Move missing (and potentially mismatched) keys back to cpu from meta device (because they won't be moved when + # loading the weights as they are not in the loaded state dict) + model._move_missing_keys_from_meta_to_cpu(missing_keys + mismatched_keys, unexpected_keys, dtype, hf_quantizer) + + # correctly initialize the missing (and potentially mismatched) keys + model._initialize_missing_keys(checkpoint_keys, ignore_mismatched_sizes, is_quantized) + + # Set some modules to fp32 if needed + if keep_in_fp32_regex is not None: + for name, param in model.named_parameters(): + if keep_in_fp32_regex.search(name): + # param = param.to(torch.float32) does not work here as only in the local scope. + param.data = param.data.to(torch.float32) + + # Make sure we are able to load base models as well as derived models (specific task models, with heads) + model_to_load = model + # In this case, we load a ForTaskModel with keys from a BaseModel -> only load keys to the BaseModel + if loading_task_model_from_base_state_dict: + model_to_load = getattr(model, prefix) + # Here we need to remove the prefix we added to correctly find missing/unexpected keys, as we will load + # in the submodule + key_renaming_mapping = {k: v[len(_prefix) :] for k, v in key_renaming_mapping.items()} + checkpoint_keys = list(key_renaming_mapping.values()) + # We need to update the device map as well + if device_map is not None: + device_map = {k[len(_prefix) :] if k.startswith(_prefix) else k: v for k, v in device_map.items()} + # small sanity check: the base model should not contain task-specific head keys + task_specific_expected_keys = [s for s in model.state_dict().keys() if not s.startswith(_prefix)] + base_model_expected_keys = list(model_to_load.state_dict().keys()) + if any( + key in task_specific_expected_keys and key not in base_model_expected_keys for key in checkpoint_keys + ): + raise ValueError( + "The state dictionary of the model you are trying to load is corrupted. Are you sure it was " + "properly saved?" + ) + + # Get reverse key mapping + reverse_key_renaming_mapping = {v: k for k, v in key_renaming_mapping.items()} + + is_offloaded_safetensors = False + # This offload index if for params explicitly on the "disk" in the device_map + disk_offload_index = None + disk_only_shard_files = [] + # Prepare parameters offloading if needed + if device_map is not None and "disk" in device_map.values(): + if offload_state_dict is None: + offload_state_dict = True + if disk_offload_folder is not None: + os.makedirs(disk_offload_folder, exist_ok=True) + is_offloaded_safetensors = checkpoint_files is not None and checkpoint_files[0].endswith(".safetensors") + if disk_offload_folder is None and not is_offloaded_safetensors: + raise ValueError( + "The current `device_map` had weights offloaded to the disk. Please provide an `offload_folder`" + " for them. Alternatively, make sure you have `safetensors` installed if the model you are using" + " offers the weights in this format." + ) + if is_offloaded_safetensors: + param_device_map = expand_device_map(device_map, checkpoint_keys) + str_dtype = str(dtype).replace("torch.", "") if dtype is not None else "float32" + if sharded_metadata is None: + weight_map = dict.fromkeys(checkpoint_keys, checkpoint_files[0]) + else: + folder = os.path.sep.join(checkpoint_files[0].split(os.path.sep)[:-1]) + # Fix the weight map keys according to the key mapping + weight_map = { + key_renaming_mapping[k]: v + for k, v in sharded_metadata["weight_map"].items() + if k in key_renaming_mapping + } + weight_map = {k: os.path.join(folder, v) for k, v in weight_map.items()} + # Find potential checkpoints containing only offloaded weights + disk_only_shard_files = get_disk_only_shard_files(device_map, weight_map) + disk_offload_index = { + name: { + "safetensors_file": file, + "weight_name": reverse_key_renaming_mapping[name], + "dtype": str_dtype, + } + for name, file in weight_map.items() + if param_device_map[name] == "disk" + } + else: + disk_offload_index = {} + + # This offload index if for params that are supposed to be on the "cpu", either with or without a device_map + # It allows to load parameters one-by-one from the state dict, avoiding a memory peak of 2 x state_dict_size, + # i.e. 1x to load it, and 1x to copy it to model + cpu_offload_folder = None + cpu_offload_index = None + if offload_state_dict: + cpu_offload_folder = tempfile.mkdtemp() + cpu_offload_index = {} + + # For nice tqdm bars + if checkpoint_files is not None and len(checkpoint_files) > 1: + checkpoint_files = logging.tqdm(checkpoint_files, desc="Loading checkpoint shards") + # To be able to iterate, even if we don't use it if the state_dict is already provided + elif state_dict is not None: + checkpoint_files = [""] + + # Compute expected model keys + expected_keys = list(model_to_load.state_dict().keys()) + if hf_quantizer is not None: + expected_keys = hf_quantizer.update_expected_keys(model_to_load, expected_keys, checkpoint_keys) + + # Warmup cuda to load the weights much faster on devices + if device_map is not None and not is_hqq_or_quark: + expanded_device_map = expand_device_map(device_map, expected_keys) + caching_allocator_warmup(model_to_load, expanded_device_map, hf_quantizer) + + error_msgs = [] + # Iterate on all the shards to load the weights + for shard_file in checkpoint_files: + # Skip the load for shards that only contain disk-offloaded weights + if shard_file in disk_only_shard_files: + continue + + map_location = "cpu" + if ( + shard_file.endswith(".safetensors") + and not is_hqq_or_bnb + and not (is_deepspeed_zero3_enabled() and not is_quantized) + ): + map_location = "meta" + elif ( + device_map is not None + and hf_quantizer is not None + and hf_quantizer.quantization_config.quant_method == QuantizationMethod.TORCHAO + and ( + hf_quantizer.quantization_config.quant_type in ["int4_weight_only", "autoquant"] + or isinstance(hf_quantizer.quantization_config.quant_type, Int4WeightOnlyConfig) + ) + ): + map_location = torch.device([d for d in device_map.values() if d not in ["cpu", "disk"]][0]) + + # If shard_file is "", we use the existing state_dict instead of loading it + if shard_file != "": + state_dict = load_state_dict( + shard_file, is_quantized=is_quantized, map_location=map_location, weights_only=weights_only + ) + + # Fix the key names + state_dict = {key_renaming_mapping[k]: v for k, v in state_dict.items() if k in key_renaming_mapping} + + if is_deepspeed_zero3_enabled() and not is_quantized: + error_msgs += _load_state_dict_into_zero3_model(model_to_load, state_dict) + # Skip it with fsdp on ranks other than 0 + elif not (is_fsdp_enabled() and not is_local_dist_rank_0() and not is_quantized): + disk_offload_index, cpu_offload_index = _load_state_dict_into_meta_model( + model_to_load, + state_dict, + shard_file, + expected_keys, + reverse_key_renaming_mapping, + device_map=device_map, + disk_offload_folder=disk_offload_folder, + disk_offload_index=disk_offload_index, + cpu_offload_folder=cpu_offload_folder, + cpu_offload_index=cpu_offload_index, + hf_quantizer=hf_quantizer, + is_safetensors=is_offloaded_safetensors, + keep_in_fp32_regex=keep_in_fp32_regex, + unexpected_keys=unexpected_keys, + device_mesh=device_mesh, + ) + + # force memory release if loading multiple shards, to avoid having 2 state dicts in memory in next loop + del state_dict + + # Adjust offloaded weights name and save if needed + if disk_offload_index is not None and len(disk_offload_index) > 0: + if loading_task_model_from_base_state_dict: + # We need to add the prefix of the base model + prefix = cls.base_model_prefix + if not is_offloaded_safetensors: + for weight_name in disk_offload_index: + shutil.move( + os.path.join(disk_offload_folder, f"{weight_name}.dat"), + os.path.join(disk_offload_folder, f"{prefix}.{weight_name}.dat"), + ) + disk_offload_index = {f"{prefix}.{key}": value for key, value in disk_offload_index.items()} + if not is_offloaded_safetensors: + save_offload_index(disk_offload_index, disk_offload_folder) + disk_offload_index = None + + # one-at-a-time param loading for the cpu offloaded params + if offload_state_dict: + # Load back temporarily offloaded state dict + load_offloaded_weights(model_to_load, cpu_offload_index, cpu_offload_folder) + shutil.rmtree(cpu_offload_folder) + + if hf_quantizer is not None: + missing_keys = hf_quantizer.update_missing_keys_after_loading(model_to_load, missing_keys, prefix) + + # Post-processing for tensor parallelism + if device_mesh is not None: + # When using TP, the device map is a single device for all parameters + tp_device = list(device_map.values())[0] + # This is needed for the RotaryEmbedding, which was not initialized on the correct device as it is + # not part of the state_dict (persistent=False) + for buffer in model.buffers(): + if buffer.device != tp_device: + buffer.data = buffer.to(tp_device) + + # In this case, the top-most task module weights were not moved to device and parallelized as they + # were not part of the loaded weights: do it now + if loading_task_model_from_base_state_dict: + parameters_to_initialize = { + name: param for name, param in model.named_parameters() if not name.startswith(prefix) + } + for name, param in parameters_to_initialize.items(): + # First move data to correct + to_contiguous, casting_dtype = _infer_parameter_dtype(model, name, param, keep_in_fp32_regex) + shard_and_distribute_module( + model, + param.to(tp_device), + param, + name, + casting_dtype, + to_contiguous, + os.environ["RANK"], + device_mesh, + ) + + # All potential warnings/infos + if len(error_msgs) > 0: + error_msg = "\n\t".join(error_msgs) + if "size mismatch" in error_msg: + error_msg += ( + "\n\tYou may consider adding `ignore_mismatched_sizes=True` in the model `from_pretrained` method." + ) + raise RuntimeError(f"Error(s) in loading state_dict for {model.__class__.__name__}:\n\t{error_msg}") + if len(unexpected_keys) > 0: + archs = [] if model.config.architectures is None else model.config.architectures + warner = logger.warning if model.__class__.__name__ in archs else logger.info + warner( + f"Some weights of the model checkpoint at {pretrained_model_name_or_path} were not used when" + f" initializing {model.__class__.__name__}: {unexpected_keys}\n- This IS expected if you are" + f" initializing {model.__class__.__name__} from the checkpoint of a model trained on another task or" + " with another architecture (e.g. initializing a BertForSequenceClassification model from a" + " BertForPreTraining model).\n- This IS NOT expected if you are initializing" + f" {model.__class__.__name__} from the checkpoint of a model that you expect to be exactly identical" + " (initializing a BertForSequenceClassification model from a BertForSequenceClassification model)." + ) + else: + logger.info(f"All model checkpoint weights were used when initializing {model.__class__.__name__}.\n") + if len(missing_keys) > 0: + logger.warning( + f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at" + f" {pretrained_model_name_or_path} and are newly initialized: {missing_keys}\nYou should probably" + " TRAIN this model on a down-stream task to be able to use it for predictions and inference." + ) + elif len(mismatched_keys) == 0: + logger.info( + f"All the weights of {model.__class__.__name__} were initialized from the model checkpoint at" + f" {pretrained_model_name_or_path}.\nIf your task is similar to the task the model of the checkpoint" + f" was trained on, you can already use {model.__class__.__name__} for predictions without further" + " training." + ) + if len(mismatched_keys) > 0: + mismatched_warning = "\n".join( + [ + f"- {key}: found shape {shape1} in the checkpoint and {shape2} in the model instantiated" + for key, (shape1, shape2) in zip(mismatched_keys, mismatched_shapes) + ] + ) + logger.warning( + f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at" + f" {pretrained_model_name_or_path} and are newly initialized because the shapes did not" + f" match:\n{mismatched_warning}\nYou should probably TRAIN this model on a down-stream task to be able" + " to use it for predictions and inference." + ) + + return model, missing_keys, unexpected_keys, mismatched_keys, disk_offload_index, error_msgs + + @classmethod + def _load_from_tf(cls, model, config, checkpoint_files): + if checkpoint_files[0].endswith(".index"): + # Load from a TensorFlow 1.X checkpoint - provided by original authors + model = cls.load_tf_weights(model, config, checkpoint_files[0][:-6]) # Remove the '.index' + loading_info = None + else: + # Load from our TensorFlow 2.0 checkpoints + try: + from .modeling_tf_pytorch_utils import load_tf2_checkpoint_in_pytorch_model + + model, loading_info = load_tf2_checkpoint_in_pytorch_model( + model, checkpoint_files[0], allow_missing_keys=True, output_loading_info=True + ) + except ImportError: + logger.error( + "Loading a TensorFlow model in PyTorch, requires both PyTorch and TensorFlow to be installed." + " Please see https://pytorch.org/ and https://www.tensorflow.org/install/ for installation" + " instructions." + ) + raise + return model, loading_info + + @classmethod + def _load_from_flax(cls, model, checkpoint_files): + try: + from .modeling_flax_pytorch_utils import load_flax_checkpoint_in_pytorch_model + + model = load_flax_checkpoint_in_pytorch_model(model, checkpoint_files[0]) + except ImportError: + logger.error( + "Loading a Flax model in PyTorch, requires both PyTorch and Flax to be installed. Please see" + " https://pytorch.org/ and https://flax.readthedocs.io/en/latest/installation.html for" + " installation instructions." + ) + raise + return model + + def retrieve_modules_from_names(self, names, add_prefix=False, remove_prefix=False): + module_keys = {".".join(key.split(".")[:-1]) for key in names} + + # torch.nn.ParameterList is a special case where two parameter keywords + # are appended to the module name, *e.g.* bert.special_embeddings.0 + module_keys = module_keys.union( + {".".join(key.split(".")[:-2]) for key in names if len(key) > 0 and key[-1].isdigit()} + ) + + retrieved_modules = [] + # retrieve all modules that has at least one missing weight name + for name, module in self.named_modules(): + if remove_prefix: + _prefix = f"{self.base_model_prefix}." + name = name[len(_prefix) :] if name.startswith(_prefix) else name + elif add_prefix: + name = ".".join([self.base_model_prefix, name]) if len(name) > 0 else self.base_model_prefix + + if name in module_keys: + retrieved_modules.append(module) + + return retrieved_modules + + @classmethod + def register_for_auto_class(cls, auto_class="AutoModel"): + """ + Register this class with a given auto class. This should only be used for custom models as the ones in the + library are already mapped with an auto class. + + + + This API is experimental and may have some slight breaking changes in the next releases. + + + + Args: + auto_class (`str` or `type`, *optional*, defaults to `"AutoModel"`): + The auto class to register this new model with. + """ + if not isinstance(auto_class, str): + auto_class = auto_class.__name__ + + import transformers.models.auto as auto_module + + if not hasattr(auto_module, auto_class): + raise ValueError(f"{auto_class} is not a valid auto class.") + + cls._auto_class = auto_class + + def to_bettertransformer(self) -> "PreTrainedModel": + """ + Converts the model to use [PyTorch's native attention + implementation](https://pytorch.org/docs/stable/generated/torch.nn.MultiheadAttention.html), integrated to + Transformers through [Optimum library](https://huggingface.co/docs/optimum/bettertransformer/overview). Only a + subset of all Transformers models are supported. + + PyTorch's attention fastpath allows to speed up inference through kernel fusions and the use of [nested + tensors](https://pytorch.org/docs/stable/nested.html). Detailed benchmarks can be found in [this blog + post](https://medium.com/pytorch/bettertransformer-out-of-the-box-performance-for-huggingface-transformers-3fbe27d50ab2). + + Returns: + [`PreTrainedModel`]: The model converted to BetterTransformer. + """ + if not is_optimum_available(): + raise ImportError("The package `optimum` is required to use Better Transformer.") + + from optimum.version import __version__ as optimum_version + + if version.parse(optimum_version) < version.parse("1.7.0"): + raise ImportError( + f"Please install optimum>=1.7.0 to use Better Transformer. The version {optimum_version} was found." + ) + + from optimum.bettertransformer import BetterTransformer + + return BetterTransformer.transform(self) + + def reverse_bettertransformer(self): + """ + Reverts the transformation from [`~PreTrainedModel.to_bettertransformer`] so that the original modeling is + used, for example in order to save the model. + + Returns: + [`PreTrainedModel`]: The model converted back to the original modeling. + """ + if not is_optimum_available(): + raise ImportError("The package `optimum` is required to use Better Transformer.") + + from optimum.version import __version__ as optimum_version + + if version.parse(optimum_version) < version.parse("1.7.0"): + raise ImportError( + f"Please install optimum>=1.7.0 to use Better Transformer. The version {optimum_version} was found." + ) + + from optimum.bettertransformer import BetterTransformer + + return BetterTransformer.reverse(self) + + def warn_if_padding_and_no_attention_mask(self, input_ids, attention_mask): + """ + Shows a one-time warning if the input_ids appear to contain padding and no attention mask was given. + """ + + # Skip the check during tracing. + if is_torch_fx_proxy(input_ids) or torch.jit.is_tracing() or is_torchdynamo_compiling(): + return + + if (attention_mask is not None) or (self.config.pad_token_id is None): + return + + # Check only the first and last input IDs to reduce overhead. + if self.config.pad_token_id in input_ids[:, [-1, 0]]: + warn_string = ( + "We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See " + "https://huggingface.co/docs/transformers/troubleshooting" + "#incorrect-output-when-padding-tokens-arent-masked." + ) + + # If the pad token is equal to either BOS, EOS, or SEP, we do not know whether the user should use an + # attention_mask or not. In this case, we should still show a warning because this is a rare case. + if ( + (self.config.bos_token_id is not None and self.config.bos_token_id == self.config.pad_token_id) + or (self.config.eos_token_id is not None and self.config.eos_token_id == self.config.pad_token_id) + or (self.config.sep_token_id is not None and self.config.sep_token_id == self.config.pad_token_id) + ): + warn_string += ( + f"\nYou may ignore this warning if your `pad_token_id` ({self.config.pad_token_id}) is identical " + f"to the `bos_token_id` ({self.config.bos_token_id}), `eos_token_id` ({self.config.eos_token_id}), " + f"or the `sep_token_id` ({self.config.sep_token_id}), and your input is not padded." + ) + + logger.warning_once(warn_string) + + @property + def supports_tp_plan(self): + """ + Returns whether the model has a tensor parallelism plan. + """ + if self._tp_plan is not None: + return True + # Check if base model has a TP plan + if getattr(self.base_model, "_tp_plan", None) is not None: + return True + return False + + @property + def tp_size(self): + """ + Returns the model's tensor parallelism degree. + """ + # if None, the model didn't undergo tensor parallel sharding + return self._tp_size + + @property + def supports_pp_plan(self): + if self._pp_plan is not None: + return True + # Check if base model has PP plan + if getattr(self.base_model, "_pp_plan", None) is not None: + return True + return False + + @property + def loss_function(self): + if hasattr(self, "_loss_function"): + return self._loss_function + + loss_type = getattr(self, "loss_type", None) + + if loss_type is None or loss_type not in LOSS_MAPPING: + logger.warning_once( + f"`loss_type={loss_type}` was set in the config but it is unrecognised." + f"Using the default loss: `ForCausalLMLoss`." + ) + loss_type = "ForCausalLM" + return LOSS_MAPPING[loss_type] + + @loss_function.setter + def loss_function(self, value): + self._loss_function = value + + def get_compiled_call(self, compile_config: Optional[CompileConfig]) -> Callable: + """Return a `torch.compile`'d version of `self.__call__`. This is useful to dynamically choose between + non-compiled/compiled `forward` during inference, especially to switch between prefill (where we don't + want to use compiled version to avoid recomputing the graph with new shapes) and iterative decoding + (where we want the speed-ups of compiled version with static shapes).""" + # Only reset it if not present or different from previous config + if "llama4" in self.config.model_type: # TODO try to enable for FULL COMPILE HYBRID CACHE SUPPORT + return self.__call__ + default_config = getattr(self.generation_config, "compile_config", None) or CompileConfig() + if ( + not hasattr(self, "_compiled_call") + or getattr(self, "_last_compile_config", default_config) != compile_config + ): + self._last_compile_config = compile_config + self._compiled_call = torch.compile(self.__call__, **compile_config.to_dict()) + return self._compiled_call + + @classmethod + def is_backend_compatible(cls): + return cls._supports_attention_backend + + def _move_missing_keys_from_meta_to_cpu( + self, + missing_keys: List[str], + unexpected_keys: List[str], + dtype: Optional[torch.dtype], + hf_quantizer: Optional[HfQuantizer], + ) -> "PreTrainedModel": + """Move the missing keys (keys that are part of the model parameters, but were NOT found in the loaded state dicts) back + from meta device to cpu. + """ + is_quantized = hf_quantizer is not None + + # In this case we need to move everything back + if is_fsdp_enabled() and not is_local_dist_rank_0() and not is_quantized: + # We only do it for the parameters, as the buffers are not initialized on the meta device by default + for key, param in self.named_parameters(): + value = torch.empty_like(param, dtype=dtype, device="cpu") + _load_parameter_into_model(self, key, value) + return + + model_state_dict = self.state_dict() + for key in missing_keys: + param = model_state_dict[key] + # Buffers are not initialized on the meta device, so we still need this check to avoid overwriting them + if param.device == torch.device("meta"): + value = torch.empty_like(param, dtype=dtype, device="cpu") + if ( + not is_quantized + or (getattr(hf_quantizer, "requires_parameters_quantization", False)) + or not hf_quantizer.check_quantized_param(self, param_value=value, param_name=key, state_dict={}) + ): + _load_parameter_into_model(self, key, value) + else: + hf_quantizer.create_quantized_param(self, value, key, "cpu", model_state_dict, unexpected_keys) + + def _initialize_missing_keys( + self, + loaded_keys: List[str], + ignore_mismatched_sizes: bool, + is_quantized: bool, + ) -> "PreTrainedModel": + """Initialize the missing keys (keys that are part of the model parameters, but were NOT found in the loaded state dicts), according to + `_initialize_weights`. Indeed, since the corresponding weights are missing from the state dict, they will not be replaced and need to + be initialized correctly (i.e. weight initialization distribution). + Also take care of setting the `_is_hf_initialized` flag for keys that are not missing. + """ + if not ignore_mismatched_sizes: + not_initialized_submodules = set_initialized_submodules(self, loaded_keys) + # If we're about to tie the output embeds to the input embeds we don't need to init them + if ( + hasattr(self.config.get_text_config(decoder=True), "tie_word_embeddings") + and self.config.get_text_config(decoder=True).tie_word_embeddings + ): + output_embeddings = self.get_output_embeddings() + if output_embeddings is not None: + # Still need to initialize if there is a bias term since biases are not tied. + if not hasattr(output_embeddings, "bias") or output_embeddings.bias is None: + output_embeddings._is_hf_initialized = True + else: + not_initialized_submodules = dict(self.named_modules()) + # This will only initialize submodules that are not marked as initialized by the line above. + if is_deepspeed_zero3_enabled() and not is_quantized: + not_initialized_parameters = list( + set( + itertools.chain.from_iterable( + submodule.parameters(recurse=False) for submodule in not_initialized_submodules.values() + ) + ) + ) + with deepspeed.zero.GatheredParameters(not_initialized_parameters, modifier_rank=0): + self.initialize_weights() + else: + self.initialize_weights() + + def get_parameter_or_buffer(self, target: str): + """ + Return the parameter or buffer given by `target` if it exists, otherwise throw an error. This combines + `get_parameter()` and `get_buffer()` in a single handy function. Note that it only work if `target` is a + leaf of the model. + """ + try: + return self.get_parameter(target) + except AttributeError: + pass + try: + return self.get_buffer(target) + except AttributeError: + pass + raise AttributeError(f"`{target}` is neither a parameter nor a buffer.") + + +PreTrainedModel.push_to_hub = copy_func(PreTrainedModel.push_to_hub) +if PreTrainedModel.push_to_hub.__doc__ is not None: + PreTrainedModel.push_to_hub.__doc__ = PreTrainedModel.push_to_hub.__doc__.format( + object="model", object_class="AutoModel", object_files="model file" + ) + + +class PoolerStartLogits(nn.Module): + """ + Compute SQuAD start logits from sequence hidden states. + + Args: + config ([`PretrainedConfig`]): + The config used by the model, will be used to grab the `hidden_size` of the model. + """ + + def __init__(self, config: PretrainedConfig): + super().__init__() + self.dense = nn.Linear(config.hidden_size, 1) + logger.warning_once( + "[DEPRECATION WARNING] `PoolerStartLogits` is deprecated and will be removed in v4.53. " + "Please use model-specific class, e.g. `XLMPoolerStartLogits`." + ) + + def forward( + self, hidden_states: torch.FloatTensor, p_mask: Optional[torch.FloatTensor] = None + ) -> torch.FloatTensor: + """ + Args: + hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`): + The final hidden states of the model. + p_mask (`torch.FloatTensor` of shape `(batch_size, seq_len)`, *optional*): + Mask for tokens at invalid position, such as query and special symbols (PAD, SEP, CLS). 1.0 means token + should be masked. + + Returns: + `torch.FloatTensor`: The start logits for SQuAD. + """ + x = self.dense(hidden_states).squeeze(-1) + + if p_mask is not None: + if get_parameter_dtype(self) == torch.float16: + x = x * (1 - p_mask) - 65500 * p_mask + else: + x = x * (1 - p_mask) - 1e30 * p_mask + + return x + + +class PoolerEndLogits(nn.Module): + """ + Compute SQuAD end logits from sequence hidden states. + + Args: + config ([`PretrainedConfig`]): + The config used by the model, will be used to grab the `hidden_size` of the model and the `layer_norm_eps` + to use. + """ + + def __init__(self, config: PretrainedConfig): + super().__init__() + self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size) + self.activation = nn.Tanh() + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dense_1 = nn.Linear(config.hidden_size, 1) + logger.warning_once( + "[DEPRECATION WARNING] `PoolerEndLogits` is deprecated and will be removed in v4.53. " + "Please use model-specific class, e.g. `XLMPoolerEndLogits`." + ) + + def forward( + self, + hidden_states: torch.FloatTensor, + start_states: Optional[torch.FloatTensor] = None, + start_positions: Optional[torch.LongTensor] = None, + p_mask: Optional[torch.FloatTensor] = None, + ) -> torch.FloatTensor: + """ + Args: + hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`): + The final hidden states of the model. + start_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`, *optional*): + The hidden states of the first tokens for the labeled span. + start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + The position of the first token for the labeled span. + p_mask (`torch.FloatTensor` of shape `(batch_size, seq_len)`, *optional*): + Mask for tokens at invalid position, such as query and special symbols (PAD, SEP, CLS). 1.0 means token + should be masked. + + + + One of `start_states` or `start_positions` should be not `None`. If both are set, `start_positions` overrides + `start_states`. + + + + Returns: + `torch.FloatTensor`: The end logits for SQuAD. + """ + assert start_states is not None or start_positions is not None, ( + "One of start_states, start_positions should be not None" + ) + if start_positions is not None: + slen, hsz = hidden_states.shape[-2:] + start_positions = start_positions[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz) + start_states = hidden_states.gather(-2, start_positions) # shape (bsz, 1, hsz) + start_states = start_states.expand(-1, slen, -1) # shape (bsz, slen, hsz) + + x = self.dense_0(torch.cat([hidden_states, start_states], dim=-1)) + x = self.activation(x) + x = self.LayerNorm(x) + x = self.dense_1(x).squeeze(-1) + + if p_mask is not None: + if get_parameter_dtype(self) == torch.float16: + x = x * (1 - p_mask) - 65500 * p_mask + else: + x = x * (1 - p_mask) - 1e30 * p_mask + + return x + + +class PoolerAnswerClass(nn.Module): + """ + Compute SQuAD 2.0 answer class from classification and start tokens hidden states. + + Args: + config ([`PretrainedConfig`]): + The config used by the model, will be used to grab the `hidden_size` of the model. + """ + + def __init__(self, config): + super().__init__() + self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size) + self.activation = nn.Tanh() + self.dense_1 = nn.Linear(config.hidden_size, 1, bias=False) + logger.warning_once( + "[DEPRECATION WARNING] `PoolerAnswerClass` is deprecated and will be removed in v4.53. " + "Please use model-specific class, e.g. `XLMPoolerAnswerClass`." + ) + + def forward( + self, + hidden_states: torch.FloatTensor, + start_states: Optional[torch.FloatTensor] = None, + start_positions: Optional[torch.LongTensor] = None, + cls_index: Optional[torch.LongTensor] = None, + ) -> torch.FloatTensor: + """ + Args: + hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`): + The final hidden states of the model. + start_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`, *optional*): + The hidden states of the first tokens for the labeled span. + start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + The position of the first token for the labeled span. + cls_index (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Position of the CLS token for each sentence in the batch. If `None`, takes the last token. + + + + One of `start_states` or `start_positions` should be not `None`. If both are set, `start_positions` overrides + `start_states`. + + + + Returns: + `torch.FloatTensor`: The SQuAD 2.0 answer class. + """ + # No dependency on end_feature so that we can obtain one single `cls_logits` for each sample. + hsz = hidden_states.shape[-1] + assert start_states is not None or start_positions is not None, ( + "One of start_states, start_positions should be not None" + ) + if start_positions is not None: + start_positions = start_positions[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz) + start_states = hidden_states.gather(-2, start_positions).squeeze(-2) # shape (bsz, hsz) + + if cls_index is not None: + cls_index = cls_index[:, None, None].expand(-1, -1, hsz) # shape (bsz, 1, hsz) + cls_token_state = hidden_states.gather(-2, cls_index).squeeze(-2) # shape (bsz, hsz) + else: + cls_token_state = hidden_states[:, -1, :] # shape (bsz, hsz) + + x = self.dense_0(torch.cat([start_states, cls_token_state], dim=-1)) + x = self.activation(x) + x = self.dense_1(x).squeeze(-1) + + return x + + +@dataclass +class SquadHeadOutput(ModelOutput): + """ + Base class for outputs of question answering models using a [`~modeling_utils.SQuADHead`]. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned if both `start_positions` and `end_positions` are provided): + Classification loss as the sum of start token, end token (and is_impossible if provided) classification + losses. + start_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided): + Log probabilities for the top config.start_n_top start token possibilities (beam-search). + start_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided): + Indices for the top config.start_n_top start token possibilities (beam-search). + end_top_log_probs (`torch.FloatTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided): + Log probabilities for the top `config.start_n_top * config.end_n_top` end token possibilities + (beam-search). + end_top_index (`torch.LongTensor` of shape `(batch_size, config.start_n_top * config.end_n_top)`, *optional*, returned if `start_positions` or `end_positions` is not provided): + Indices for the top `config.start_n_top * config.end_n_top` end token possibilities (beam-search). + cls_logits (`torch.FloatTensor` of shape `(batch_size,)`, *optional*, returned if `start_positions` or `end_positions` is not provided): + Log probabilities for the `is_impossible` label of the answers. + + """ + + loss: Optional[torch.FloatTensor] = None + start_top_log_probs: Optional[torch.FloatTensor] = None + start_top_index: Optional[torch.LongTensor] = None + end_top_log_probs: Optional[torch.FloatTensor] = None + end_top_index: Optional[torch.LongTensor] = None + cls_logits: Optional[torch.FloatTensor] = None + + def __post_init__(self): + logger.warning_once( + "[DEPRECATION WARNING] `SquadHeadOutput` is deprecated and will be removed in v4.53. " + "Please use model-specific class, e.g. `XLMSquadHeadOutput`." + ) + + +class SQuADHead(nn.Module): + r""" + A SQuAD head inspired by XLNet. + + Args: + config ([`PretrainedConfig`]): + The config used by the model, will be used to grab the `hidden_size` of the model and the `layer_norm_eps` + to use. + """ + + def __init__(self, config): + super().__init__() + self.start_n_top = config.start_n_top + self.end_n_top = config.end_n_top + + self.start_logits = PoolerStartLogits(config) + self.end_logits = PoolerEndLogits(config) + self.answer_class = PoolerAnswerClass(config) + + logger.warning_once( + "[DEPRECATION WARNING] `SQuADHead` is deprecated and will be removed in v4.53. " + "Please use model-specific class, e.g. `XLMSQuADHead`." + ) + + @replace_return_docstrings(output_type=SquadHeadOutput, config_class=PretrainedConfig) + def forward( + self, + hidden_states: torch.FloatTensor, + start_positions: Optional[torch.LongTensor] = None, + end_positions: Optional[torch.LongTensor] = None, + cls_index: Optional[torch.LongTensor] = None, + is_impossible: Optional[torch.LongTensor] = None, + p_mask: Optional[torch.FloatTensor] = None, + return_dict: bool = False, + ) -> Union[SquadHeadOutput, Tuple[torch.FloatTensor]]: + """ + Args: + hidden_states (`torch.FloatTensor` of shape `(batch_size, seq_len, hidden_size)`): + Final hidden states of the model on the sequence tokens. + start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Positions of the first token for the labeled span. + end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Positions of the last token for the labeled span. + cls_index (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Position of the CLS token for each sentence in the batch. If `None`, takes the last token. + is_impossible (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Whether the question has a possible answer in the paragraph or not. + p_mask (`torch.FloatTensor` of shape `(batch_size, seq_len)`, *optional*): + Mask for tokens at invalid position, such as query and special symbols (PAD, SEP, CLS). 1.0 means token + should be masked. + return_dict (`bool`, *optional*, defaults to `False`): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + + Returns: + """ + start_logits = self.start_logits(hidden_states, p_mask=p_mask) + + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, let's remove the dimension added by batch splitting + for x in (start_positions, end_positions, cls_index, is_impossible): + if x is not None and x.dim() > 1: + x.squeeze_(-1) + + # during training, compute the end logits based on the ground truth of the start position + end_logits = self.end_logits(hidden_states, start_positions=start_positions, p_mask=p_mask) + + loss_fct = CrossEntropyLoss() + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if cls_index is not None and is_impossible is not None: + # Predict answerability from the representation of CLS and START + cls_logits = self.answer_class(hidden_states, start_positions=start_positions, cls_index=cls_index) + loss_fct_cls = nn.BCEWithLogitsLoss() + cls_loss = loss_fct_cls(cls_logits, is_impossible) + + # note(zhiliny): by default multiply the loss by 0.5 so that the scale is comparable to start_loss and end_loss + total_loss += cls_loss * 0.5 + + return SquadHeadOutput(loss=total_loss) if return_dict else (total_loss,) + + else: + # during inference, compute the end logits based on beam search + bsz, slen, hsz = hidden_states.size() + start_log_probs = nn.functional.softmax(start_logits, dim=-1) # shape (bsz, slen) + + start_top_log_probs, start_top_index = torch.topk( + start_log_probs, self.start_n_top, dim=-1 + ) # shape (bsz, start_n_top) + start_top_index_exp = start_top_index.unsqueeze(-1).expand(-1, -1, hsz) # shape (bsz, start_n_top, hsz) + start_states = torch.gather(hidden_states, -2, start_top_index_exp) # shape (bsz, start_n_top, hsz) + start_states = start_states.unsqueeze(1).expand(-1, slen, -1, -1) # shape (bsz, slen, start_n_top, hsz) + + hidden_states_expanded = hidden_states.unsqueeze(2).expand_as( + start_states + ) # shape (bsz, slen, start_n_top, hsz) + p_mask = p_mask.unsqueeze(-1) if p_mask is not None else None + end_logits = self.end_logits(hidden_states_expanded, start_states=start_states, p_mask=p_mask) + end_log_probs = nn.functional.softmax(end_logits, dim=1) # shape (bsz, slen, start_n_top) + + end_top_log_probs, end_top_index = torch.topk( + end_log_probs, self.end_n_top, dim=1 + ) # shape (bsz, end_n_top, start_n_top) + end_top_log_probs = end_top_log_probs.view(-1, self.start_n_top * self.end_n_top) + end_top_index = end_top_index.view(-1, self.start_n_top * self.end_n_top) + + start_states = torch.einsum("blh,bl->bh", hidden_states, start_log_probs) + cls_logits = self.answer_class(hidden_states, start_states=start_states, cls_index=cls_index) + + if not return_dict: + return (start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits) + else: + return SquadHeadOutput( + start_top_log_probs=start_top_log_probs, + start_top_index=start_top_index, + end_top_log_probs=end_top_log_probs, + end_top_index=end_top_index, + cls_logits=cls_logits, + ) + + +class SequenceSummary(nn.Module): + r""" + Compute a single vector summary of a sequence hidden states. + + Args: + config ([`PretrainedConfig`]): + The config used by the model. Relevant arguments in the config class of the model are (refer to the actual + config class of your model for the default values it uses): + + - **summary_type** (`str`) -- The method to use to make this summary. Accepted values are: + + - `"last"` -- Take the last token hidden state (like XLNet) + - `"first"` -- Take the first token hidden state (like Bert) + - `"mean"` -- Take the mean of all tokens hidden states + - `"cls_index"` -- Supply a Tensor of classification token position (GPT/GPT-2) + - `"attn"` -- Not implemented now, use multi-head attention + + - **summary_use_proj** (`bool`) -- Add a projection after the vector extraction. + - **summary_proj_to_labels** (`bool`) -- If `True`, the projection outputs to `config.num_labels` classes + (otherwise to `config.hidden_size`). + - **summary_activation** (`Optional[str]`) -- Set to `"tanh"` to add a tanh activation to the output, + another string or `None` will add no activation. + - **summary_first_dropout** (`float`) -- Optional dropout probability before the projection and activation. + - **summary_last_dropout** (`float`)-- Optional dropout probability after the projection and activation. + """ + + def __init__(self, config: PretrainedConfig): + super().__init__() + + self.summary_type = getattr(config, "summary_type", "last") + if self.summary_type == "attn": + # We should use a standard multi-head attention module with absolute positional embedding for that. + # Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276 + # We can probably just use the multi-head attention module of PyTorch >=1.1.0 + raise NotImplementedError + + self.summary = Identity() + if hasattr(config, "summary_use_proj") and config.summary_use_proj: + if hasattr(config, "summary_proj_to_labels") and config.summary_proj_to_labels and config.num_labels > 0: + num_classes = config.num_labels + else: + num_classes = config.hidden_size + self.summary = nn.Linear(config.hidden_size, num_classes) + + activation_string = getattr(config, "summary_activation", None) + self.activation: Callable = get_activation(activation_string) if activation_string else Identity() + + self.first_dropout = Identity() + if hasattr(config, "summary_first_dropout") and config.summary_first_dropout > 0: + self.first_dropout = nn.Dropout(config.summary_first_dropout) + + self.last_dropout = Identity() + if hasattr(config, "summary_last_dropout") and config.summary_last_dropout > 0: + self.last_dropout = nn.Dropout(config.summary_last_dropout) + + logger.warning_once( + "[DEPRECATION WARNING] `SequenceSummary` is deprecated and will be removed in v4.53. " + "Please use model-specific class, e.g. `XLMSequenceSummary`." + ) + + def forward( + self, hidden_states: torch.FloatTensor, cls_index: Optional[torch.LongTensor] = None + ) -> torch.FloatTensor: + """ + Compute a single vector summary of a sequence hidden states. + + Args: + hidden_states (`torch.FloatTensor` of shape `[batch_size, seq_len, hidden_size]`): + The hidden states of the last layer. + cls_index (`torch.LongTensor` of shape `[batch_size]` or `[batch_size, ...]` where ... are optional leading dimensions of `hidden_states`, *optional*): + Used if `summary_type == "cls_index"` and takes the last token of the sequence as classification token. + + Returns: + `torch.FloatTensor`: The summary of the sequence hidden states. + """ + if self.summary_type == "last": + output = hidden_states[:, -1] + elif self.summary_type == "first": + output = hidden_states[:, 0] + elif self.summary_type == "mean": + output = hidden_states.mean(dim=1) + elif self.summary_type == "cls_index": + if cls_index is None: + cls_index = torch.full_like( + hidden_states[..., :1, :], + hidden_states.shape[-2] - 1, + dtype=torch.long, + ) + else: + cls_index = cls_index.unsqueeze(-1).unsqueeze(-1) + cls_index = cls_index.expand((-1,) * (cls_index.dim() - 1) + (hidden_states.size(-1),)) + # shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states + output = hidden_states.gather(-2, cls_index).squeeze(-2) # shape (bsz, XX, hidden_size) + elif self.summary_type == "attn": + raise NotImplementedError + + output = self.first_dropout(output) + output = self.summary(output) + output = self.activation(output) + output = self.last_dropout(output) + + return output + + +def unwrap_model(model: nn.Module, recursive: bool = False) -> nn.Module: + """ + Recursively unwraps a model from potential containers (as used in distributed training). + + Args: + model (`torch.nn.Module`): The model to unwrap. + recursive (`bool`, *optional*, defaults to `False`): + Whether to recursively extract all cases of `module.module` from `model` as well as unwrap child sublayers + recursively, not just the top-level distributed containers. + """ + # Use accelerate implementation if available (should always be the case when using torch) + # This is for pytorch, as we also have to handle things like dynamo + if is_accelerate_available(): + kwargs = {} + if recursive: + if not is_accelerate_available("0.29.0"): + raise RuntimeError( + "Setting `recursive=True` to `unwrap_model` requires `accelerate` v0.29.0. Please upgrade your version of accelerate" + ) + else: + kwargs["recursive"] = recursive + return extract_model_from_parallel(model, **kwargs) + else: + # since there could be multiple levels of wrapping, unwrap recursively + if hasattr(model, "module"): + return unwrap_model(model.module) + else: + return model + + +def expand_device_map(device_map, param_names): + """ + Expand a device map to return the correspondence parameter name to device. + """ + new_device_map = {} + for module, device in device_map.items(): + new_device_map.update( + {p: device for p in param_names if p == module or p.startswith(f"{module}.") or module == ""} + ) + return new_device_map + + +def is_accelerator_device(device: Union[str, int, torch.device]) -> bool: + """Check if the device is an accelerator. We need to function, as device_map can be "disk" as well, which is not + a proper `torch.device`. + """ + if device == "disk": + return False + else: + return torch.device(device).type not in ["meta", "cpu"] + + +def caching_allocator_warmup(model: PreTrainedModel, expanded_device_map: Dict, hf_quantizer: Optional[HfQuantizer]): + """This function warm-ups the caching allocator based on the size of the model tensors that will reside on each + device. It allows to have one large call to Malloc, instead of recursively calling it later when loading + the model, which is actually the loading speed botteneck. + Calling this function allows to cut the model loading time by a very large margin. + + A few facts related to loading speed (taking into account the use of this function): + - When loading a model the first time, it is usually slower than the subsequent times, because the OS is very likely + to cache the different state dicts (if enough ressources/RAM are available) + - Trying to force the OS to cache the files in advance (by e.g. accessing a small portion of them) is really hard, + and not a good idea in general as this is low level OS optimizations that depend on ressource usage anyway + - As of 18/03/2025, loading a Llama 70B model with TP takes ~1 min without file cache, and ~13s with full file cache. + The baseline, i.e. only loading the tensor shards on device and adjusting dtype (i.e. copying them) is ~5s with full cache. + These numbers are reported for TP on 4 H100 GPUs. + - It is useless to pre-allocate more than the model size in this function (i.e. using an `allocation_factor` > 1) as + cudaMalloc is not a bottleneck at all anymore + - Loading speed bottleneck is now almost only tensor copy (i.e. changing the dtype) and moving the tensors to the devices. + However, we cannot really improve on those aspects obviously, as the data needs to be moved/copied in the end. + """ + factor = 2 if hf_quantizer is None else hf_quantizer.get_cuda_warm_up_factor() + + # Remove disk, cpu and meta devices, and cast to proper torch.device + accelerator_device_map = { + param: torch.device(device) for param, device in expanded_device_map.items() if is_accelerator_device(device) + } + if not len(accelerator_device_map): + return + + tp_plan_regex = ( + re.compile("|".join([re.escape(plan) for plan in model._tp_plan])) + if _torch_distributed_available and torch.distributed.is_initialized() + else None + ) + total_byte_count = defaultdict(lambda: 0) + for param_name, device in accelerator_device_map.items(): + param = model.get_parameter_or_buffer(param_name) + # The dtype of different parameters may be different with composite models or `keep_in_fp32_modules` + param_byte_count = param.numel() * param.element_size() + + if tp_plan_regex is not None: + generic_name = re.sub(r"\.\d+\.", ".*.", param_name) + param_byte_count //= torch.distributed.get_world_size() if tp_plan_regex.search(generic_name) else 1 + + total_byte_count[device] += param_byte_count + + # This will kick off the caching allocator to avoid having to Malloc afterwards + for device, byte_count in total_byte_count.items(): + if device.type == "cuda": + index = device.index if device.index is not None else torch.cuda.current_device() + device_memory = torch.cuda.mem_get_info(index)[0] + # Allow up to (max device memory - 1.2 GiB) in resource-constrained hardware configurations. Trying to reserve more + # than that amount might sometimes lead to unecesary cuda OOM, if the last parameter to be loaded on the device is large, + # and the remaining reserved memory portion is smaller than the param size -> torch will then try to fully re-allocate all + # the param size, instead of using the remaining reserved part, and allocating only the difference, which can lead + # to OOM. See https://github.com/huggingface/transformers/issues/37436#issuecomment-2808982161 for more details. + # Note that we use an absolute value instead of device proportion here, as a 8GiB device could still allocate too much + # if using e.g. 90% of device size, while a 140GiB device would allocate too little + byte_count = min(byte_count, max(0, int(device_memory - 1.2 * 1024**3))) + # Allocate memory + _ = torch.empty(byte_count // factor, dtype=torch.float16, device=device, requires_grad=False) + + +def get_disk_only_shard_files(device_map, weight_map): + """ + Returns the list of shard files containing only weights offloaded to disk. + """ + files_content = collections.defaultdict(list) + for weight_name, filename in weight_map.items(): + while len(weight_name) > 0 and weight_name not in device_map: + weight_name = ".".join(weight_name.split(".")[:-1]) + files_content[filename].append(device_map[weight_name]) + + return [fname for fname, devices in files_content.items() if set(devices) == {"disk"}] + + +class AttentionInterface(MutableMapping): + """ + Dict-like object keeping track of allowed attention functions. You can easily add a new attention function + with a call to `register()`. If a model needs to locally overwrite an existing attention function, say `sdpa`, + it needs to declare a new instance of this class inside the `modeling_.py`, and declare it on that instance. + """ + + # Class instance object, so that a call to `register` can be reflected into all other files correctly, even if + # a new instance is created (in order to locally override a given function) + _global_mapping = { + "flash_attention_2": flash_attention_forward, + "flex_attention": flex_attention_forward, + "sdpa": sdpa_attention_forward, + } + + def __init__(self): + self._local_mapping = {} + + def __getitem__(self, key): + # First check if instance has a local override + if key in self._local_mapping: + return self._local_mapping[key] + return self._global_mapping[key] + + def __setitem__(self, key, value): + # Allow local update of the default functions without impacting other instances + self._local_mapping.update({key: value}) + + def __delitem__(self, key): + del self._local_mapping[key] + + def __iter__(self): + # Ensure we use all keys, with the overwritten ones on top + return iter({**self._global_mapping, **self._local_mapping}) + + def __len__(self): + return len(self._global_mapping.keys() | self._local_mapping.keys()) + + @classmethod + def register(cls, key: str, value: Callable): + cls._global_mapping.update({key: value}) + + def valid_keys(self) -> List[str]: + return list(self.keys()) + + +# Global AttentionInterface shared by all models which do not need to overwrite any of the existing ones +ALL_ATTENTION_FUNCTIONS: AttentionInterface = AttentionInterface() diff --git a/docs/transformers/build/lib/transformers/models/__init__.py b/docs/transformers/build/lib/transformers/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..26e6e0a97994aa1db773ecc59cdd5e4b48555a14 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/__init__.py @@ -0,0 +1,335 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ..utils import _LazyModule +from ..utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .albert import * + from .align import * + from .altclip import * + from .aria import * + from .audio_spectrogram_transformer import * + from .auto import * + from .autoformer import * + from .aya_vision import * + from .bamba import * + from .bark import * + from .bart import * + from .barthez import * + from .bartpho import * + from .beit import * + from .bert import * + from .bert_generation import * + from .bert_japanese import * + from .bertweet import * + from .big_bird import * + from .bigbird_pegasus import * + from .biogpt import * + from .bit import * + from .blenderbot import * + from .blenderbot_small import * + from .blip import * + from .blip_2 import * + from .bloom import * + from .bridgetower import * + from .bros import * + from .byt5 import * + from .camembert import * + from .canine import * + from .chameleon import * + from .chinese_clip import * + from .clap import * + from .clip import * + from .clipseg import * + from .clvp import * + from .code_llama import * + from .codegen import * + from .cohere import * + from .cohere2 import * + from .colpali import * + from .conditional_detr import * + from .convbert import * + from .convnext import * + from .convnextv2 import * + from .cpm import * + from .cpmant import * + from .ctrl import * + from .cvt import * + from .dab_detr import * + from .dac import * + from .data2vec import * + from .dbrx import * + from .deberta import * + from .deberta_v2 import * + from .decision_transformer import * + from .deformable_detr import * + from .deit import * + from .deprecated import * + from .depth_anything import * + from .depth_pro import * + from .detr import * + from .dialogpt import * + from .diffllama import * + from .dinat import * + from .dinov2 import * + from .dinov2_with_registers import * + from .distilbert import * + from .dit import * + from .donut import * + from .dpr import * + from .dpt import * + from .efficientnet import * + from .electra import * + from .emu3 import * + from .encodec import * + from .encoder_decoder import * + from .ernie import * + from .esm import * + from .falcon import * + from .falcon_mamba import * + from .fastspeech2_conformer import * + from .flaubert import * + from .flava import * + from .fnet import * + from .focalnet import * + from .fsmt import * + from .funnel import * + from .fuyu import * + from .gemma import * + from .gemma2 import * + from .gemma3 import * + from .git import * + from .glm import * + from .glm4 import * + from .glpn import * + from .got_ocr2 import * + from .gpt2 import * + from .gpt_bigcode import * + from .gpt_neo import * + from .gpt_neox import * + from .gpt_neox_japanese import * + from .gpt_sw3 import * + from .gptj import * + from .granite import * + from .granite_speech import * + from .granitemoe import * + from .granitemoeshared import * + from .grounding_dino import * + from .groupvit import * + from .helium import * + from .herbert import * + from .hiera import * + from .hubert import * + from .ibert import * + from .idefics import * + from .idefics2 import * + from .idefics3 import * + from .ijepa import * + from .imagegpt import * + from .informer import * + from .instructblip import * + from .instructblipvideo import * + from .internvl import * + from .jamba import * + from .janus import * + from .jetmoe import * + from .kosmos2 import * + from .layoutlm import * + from .layoutlmv2 import * + from .layoutlmv3 import * + from .layoutxlm import * + from .led import * + from .levit import * + from .lilt import * + from .llama import * + from .llama4 import * + from .llava import * + from .llava_next import * + from .llava_next_video import * + from .llava_onevision import * + from .longformer import * + from .longt5 import * + from .luke import * + from .lxmert import * + from .m2m_100 import * + from .mamba import * + from .mamba2 import * + from .marian import * + from .markuplm import * + from .mask2former import * + from .maskformer import * + from .mbart import * + from .mbart50 import * + from .megatron_bert import * + from .megatron_gpt2 import * + from .mgp_str import * + from .mimi import * + from .mistral import * + from .mistral3 import * + from .mixtral import * + from .mlcd import * + from .mllama import * + from .mluke import * + from .mobilebert import * + from .mobilenet_v1 import * + from .mobilenet_v2 import * + from .mobilevit import * + from .mobilevitv2 import * + from .modernbert import * + from .moonshine import * + from .moshi import * + from .mpnet import * + from .mpt import * + from .mra import * + from .mt5 import * + from .musicgen import * + from .musicgen_melody import * + from .mvp import * + from .myt5 import * + from .nemotron import * + from .nllb import * + from .nllb_moe import * + from .nougat import * + from .nystromformer import * + from .olmo import * + from .olmo2 import * + from .olmoe import * + from .omdet_turbo import * + from .oneformer import * + from .openai import * + from .opt import * + from .owlv2 import * + from .owlvit import * + from .paligemma import * + from .patchtsmixer import * + from .patchtst import * + from .pegasus import * + from .pegasus_x import * + from .perceiver import * + from .persimmon import * + from .phi import * + from .phi3 import * + from .phi4_multimodal import * + from .phimoe import * + from .phobert import * + from .pix2struct import * + from .pixtral import * + from .plbart import * + from .poolformer import * + from .pop2piano import * + from .prophetnet import * + from .pvt import * + from .pvt_v2 import * + from .qwen2 import * + from .qwen2_5_vl import * + from .qwen2_audio import * + from .qwen2_moe import * + from .qwen2_vl import * + from .rag import * + from .recurrent_gemma import * + from .reformer import * + from .regnet import * + from .rembert import * + from .resnet import * + from .roberta import * + from .roberta_prelayernorm import * + from .roc_bert import * + from .roformer import * + from .rt_detr import * + from .rt_detr_v2 import * + from .rwkv import * + from .sam import * + from .seamless_m4t import * + from .seamless_m4t_v2 import * + from .segformer import * + from .seggpt import * + from .sew import * + from .sew_d import * + from .siglip import * + from .siglip2 import * + from .smolvlm import * + from .speech_encoder_decoder import * + from .speech_to_text import * + from .speecht5 import * + from .splinter import * + from .squeezebert import * + from .stablelm import * + from .starcoder2 import * + from .superglue import * + from .superpoint import * + from .swiftformer import * + from .swin import * + from .swin2sr import * + from .swinv2 import * + from .switch_transformers import * + from .t5 import * + from .table_transformer import * + from .tapas import * + from .textnet import * + from .time_series_transformer import * + from .timesfm import * + from .timesformer import * + from .timm_backbone import * + from .timm_wrapper import * + from .trocr import * + from .tvp import * + from .udop import * + from .umt5 import * + from .unispeech import * + from .unispeech_sat import * + from .univnet import * + from .upernet import * + from .video_llava import * + from .videomae import * + from .vilt import * + from .vipllava import * + from .vision_encoder_decoder import * + from .vision_text_dual_encoder import * + from .visual_bert import * + from .vit import * + from .vit_mae import * + from .vit_msn import * + from .vitdet import * + from .vitmatte import * + from .vitpose import * + from .vitpose_backbone import * + from .vits import * + from .vivit import * + from .wav2vec2 import * + from .wav2vec2_bert import * + from .wav2vec2_conformer import * + from .wav2vec2_phoneme import * + from .wav2vec2_with_lm import * + from .wavlm import * + from .whisper import * + from .x_clip import * + from .xglm import * + from .xlm import * + from .xlm_roberta import * + from .xlm_roberta_xl import * + from .xlnet import * + from .xmod import * + from .yolos import * + from .yoso import * + from .zamba import * + from .zamba2 import * + from .zoedepth import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/docs/transformers/build/lib/transformers/models/albert/__init__.py b/docs/transformers/build/lib/transformers/models/albert/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..57b5747909e091ede05ff07c98254224fbebed97 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/albert/__init__.py @@ -0,0 +1,31 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_albert import * + from .modeling_albert import * + from .modeling_flax_albert import * + from .modeling_tf_albert import * + from .tokenization_albert import * + from .tokenization_albert_fast import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/docs/transformers/build/lib/transformers/models/albert/configuration_albert.py b/docs/transformers/build/lib/transformers/models/albert/configuration_albert.py new file mode 100644 index 0000000000000000000000000000000000000000..e1e2d4547cc4e285634a8e9fcfac68092c8ded68 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/albert/configuration_albert.py @@ -0,0 +1,170 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""ALBERT model configuration""" + +from collections import OrderedDict +from typing import Mapping + +from ...configuration_utils import PretrainedConfig +from ...onnx import OnnxConfig + + +class AlbertConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`AlbertModel`] or a [`TFAlbertModel`]. It is used + to instantiate an ALBERT model according to the specified arguments, defining the model architecture. Instantiating + a configuration with the defaults will yield a similar configuration to that of the ALBERT + [albert/albert-xxlarge-v2](https://huggingface.co/albert/albert-xxlarge-v2) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vocab_size (`int`, *optional*, defaults to 30000): + Vocabulary size of the ALBERT model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`AlbertModel`] or [`TFAlbertModel`]. + embedding_size (`int`, *optional*, defaults to 128): + Dimensionality of vocabulary embeddings. + hidden_size (`int`, *optional*, defaults to 4096): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_hidden_groups (`int`, *optional*, defaults to 1): + Number of groups for the hidden layers, parameters in the same group are shared. + num_attention_heads (`int`, *optional*, defaults to 64): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 16384): + The dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + inner_group_num (`int`, *optional*, defaults to 1): + The number of inner repetition of attention and ffn. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu_new"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + (e.g., 512 or 1024 or 2048). + type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the `token_type_ids` passed when calling [`AlbertModel`] or [`TFAlbertModel`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + classifier_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for attached classifiers. + position_embedding_type (`str`, *optional*, defaults to `"absolute"`): + Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For + positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to + [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). + For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models + with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658). + pad_token_id (`int`, *optional*, defaults to 0): + Padding token id. + bos_token_id (`int`, *optional*, defaults to 2): + Beginning of stream token id. + eos_token_id (`int`, *optional*, defaults to 3): + End of stream token id. + + Examples: + + ```python + >>> from transformers import AlbertConfig, AlbertModel + + >>> # Initializing an ALBERT-xxlarge style configuration + >>> albert_xxlarge_configuration = AlbertConfig() + + >>> # Initializing an ALBERT-base style configuration + >>> albert_base_configuration = AlbertConfig( + ... hidden_size=768, + ... num_attention_heads=12, + ... intermediate_size=3072, + ... ) + + >>> # Initializing a model (with random weights) from the ALBERT-base style configuration + >>> model = AlbertModel(albert_xxlarge_configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "albert" + + def __init__( + self, + vocab_size=30000, + embedding_size=128, + hidden_size=4096, + num_hidden_layers=12, + num_hidden_groups=1, + num_attention_heads=64, + intermediate_size=16384, + inner_group_num=1, + hidden_act="gelu_new", + hidden_dropout_prob=0, + attention_probs_dropout_prob=0, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + classifier_dropout_prob=0.1, + position_embedding_type="absolute", + pad_token_id=0, + bos_token_id=2, + eos_token_id=3, + **kwargs, + ): + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + + self.vocab_size = vocab_size + self.embedding_size = embedding_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_hidden_groups = num_hidden_groups + self.num_attention_heads = num_attention_heads + self.inner_group_num = inner_group_num + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.classifier_dropout_prob = classifier_dropout_prob + self.position_embedding_type = position_embedding_type + + +# Copied from transformers.models.bert.configuration_bert.BertOnnxConfig with Roberta->Albert +class AlbertOnnxConfig(OnnxConfig): + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + if self.task == "multiple-choice": + dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"} + else: + dynamic_axis = {0: "batch", 1: "sequence"} + return OrderedDict( + [ + ("input_ids", dynamic_axis), + ("attention_mask", dynamic_axis), + ("token_type_ids", dynamic_axis), + ] + ) + + +__all__ = ["AlbertConfig", "AlbertOnnxConfig"] diff --git a/docs/transformers/build/lib/transformers/models/albert/convert_albert_original_tf_checkpoint_to_pytorch.py b/docs/transformers/build/lib/transformers/models/albert/convert_albert_original_tf_checkpoint_to_pytorch.py new file mode 100644 index 0000000000000000000000000000000000000000..df2a22610187586dc63511581a1cf28416bfd0c2 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/albert/convert_albert_original_tf_checkpoint_to_pytorch.py @@ -0,0 +1,62 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert ALBERT checkpoint.""" + +import argparse + +import torch + +from ...utils import logging +from . import AlbertConfig, AlbertForPreTraining, load_tf_weights_in_albert + + +logging.set_verbosity_info() + + +def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, albert_config_file, pytorch_dump_path): + # Initialise PyTorch model + config = AlbertConfig.from_json_file(albert_config_file) + print(f"Building PyTorch model from configuration: {config}") + model = AlbertForPreTraining(config) + + # Load weights from tf checkpoint + load_tf_weights_in_albert(model, config, tf_checkpoint_path) + + # Save pytorch-model + print(f"Save PyTorch model to {pytorch_dump_path}") + torch.save(model.state_dict(), pytorch_dump_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument( + "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." + ) + parser.add_argument( + "--albert_config_file", + default=None, + type=str, + required=True, + help=( + "The config json file corresponding to the pre-trained ALBERT model. \n" + "This specifies the model architecture." + ), + ) + parser.add_argument( + "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." + ) + args = parser.parse_args() + convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.albert_config_file, args.pytorch_dump_path) diff --git a/docs/transformers/build/lib/transformers/models/albert/modeling_albert.py b/docs/transformers/build/lib/transformers/models/albert/modeling_albert.py new file mode 100644 index 0000000000000000000000000000000000000000..68abb317ee3f7ac8c16963fd87685802096c0d13 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/albert/modeling_albert.py @@ -0,0 +1,1483 @@ +# coding=utf-8 +# Copyright 2018 Google AI, Google Brain and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch ALBERT model.""" + +import math +import os +from dataclasses import dataclass +from typing import Dict, List, Optional, Tuple, Union + +import torch +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ...activations import ACT2FN +from ...modeling_attn_mask_utils import _prepare_4d_attention_mask_for_sdpa +from ...modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPooling, + MaskedLMOutput, + MultipleChoiceModelOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import ( + apply_chunking_to_forward, + find_pruneable_heads_and_indices, + is_torch_greater_or_equal_than_2_2, + prune_linear_layer, +) +from ...utils import ( + ModelOutput, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from .configuration_albert import AlbertConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "albert/albert-base-v2" +_CONFIG_FOR_DOC = "AlbertConfig" + + +def load_tf_weights_in_albert(model, config, tf_checkpoint_path): + """Load tf checkpoints in a pytorch model.""" + try: + import re + + import numpy as np + import tensorflow as tf + except ImportError: + logger.error( + "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions." + ) + raise + tf_path = os.path.abspath(tf_checkpoint_path) + logger.info(f"Converting TensorFlow checkpoint from {tf_path}") + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + names = [] + arrays = [] + for name, shape in init_vars: + logger.info(f"Loading TF weight {name} with shape {shape}") + array = tf.train.load_variable(tf_path, name) + names.append(name) + arrays.append(array) + + for name, array in zip(names, arrays): + print(name) + + for name, array in zip(names, arrays): + original_name = name + + # If saved from the TF HUB module + name = name.replace("module/", "") + + # Renaming and simplifying + name = name.replace("ffn_1", "ffn") + name = name.replace("bert/", "albert/") + name = name.replace("attention_1", "attention") + name = name.replace("transform/", "") + name = name.replace("LayerNorm_1", "full_layer_layer_norm") + name = name.replace("LayerNorm", "attention/LayerNorm") + name = name.replace("transformer/", "") + + # The feed forward layer had an 'intermediate' step which has been abstracted away + name = name.replace("intermediate/dense/", "") + name = name.replace("ffn/intermediate/output/dense/", "ffn_output/") + + # ALBERT attention was split between self and output which have been abstracted away + name = name.replace("/output/", "/") + name = name.replace("/self/", "/") + + # The pooler is a linear layer + name = name.replace("pooler/dense", "pooler") + + # The classifier was simplified to predictions from cls/predictions + name = name.replace("cls/predictions", "predictions") + name = name.replace("predictions/attention", "predictions") + + # Naming was changed to be more explicit + name = name.replace("embeddings/attention", "embeddings") + name = name.replace("inner_group_", "albert_layers/") + name = name.replace("group_", "albert_layer_groups/") + + # Classifier + if len(name.split("/")) == 1 and ("output_bias" in name or "output_weights" in name): + name = "classifier/" + name + + # No ALBERT model currently handles the next sentence prediction task + if "seq_relationship" in name: + name = name.replace("seq_relationship/output_", "sop_classifier/classifier/") + name = name.replace("weights", "weight") + + name = name.split("/") + + # Ignore the gradients applied by the LAMB/ADAM optimizers. + if ( + "adam_m" in name + or "adam_v" in name + or "AdamWeightDecayOptimizer" in name + or "AdamWeightDecayOptimizer_1" in name + or "global_step" in name + ): + logger.info(f"Skipping {'/'.join(name)}") + continue + + pointer = model + for m_name in name: + if re.fullmatch(r"[A-Za-z]+_\d+", m_name): + scope_names = re.split(r"_(\d+)", m_name) + else: + scope_names = [m_name] + + if scope_names[0] == "kernel" or scope_names[0] == "gamma": + pointer = getattr(pointer, "weight") + elif scope_names[0] == "output_bias" or scope_names[0] == "beta": + pointer = getattr(pointer, "bias") + elif scope_names[0] == "output_weights": + pointer = getattr(pointer, "weight") + elif scope_names[0] == "squad": + pointer = getattr(pointer, "classifier") + else: + try: + pointer = getattr(pointer, scope_names[0]) + except AttributeError: + logger.info(f"Skipping {'/'.join(name)}") + continue + if len(scope_names) >= 2: + num = int(scope_names[1]) + pointer = pointer[num] + + if m_name[-11:] == "_embeddings": + pointer = getattr(pointer, "weight") + elif m_name == "kernel": + array = np.transpose(array) + try: + if pointer.shape != array.shape: + raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched") + except ValueError as e: + e.args += (pointer.shape, array.shape) + raise + print(f"Initialize PyTorch weight {name} from {original_name}") + pointer.data = torch.from_numpy(array) + + return model + + +class AlbertEmbeddings(nn.Module): + """ + Construct the embeddings from word, position and token_type embeddings. + """ + + def __init__(self, config: AlbertConfig): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.embedding_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.embedding_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.embedding_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.embedding_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + self.register_buffer( + "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False + ) + + # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.forward + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + past_key_values_length: int = 0, + ) -> torch.Tensor: + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length] + + # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs + # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves + # issue #5664 + if token_type_ids is None: + if hasattr(self, "token_type_ids"): + buffered_token_type_ids = self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class AlbertAttention(nn.Module): + def __init__(self, config: AlbertConfig): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads}" + ) + + self.num_attention_heads = config.num_attention_heads + self.hidden_size = config.hidden_size + self.attention_head_size = config.hidden_size // config.num_attention_heads + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.attention_dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.output_dropout = nn.Dropout(config.hidden_dropout_prob) + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.pruned_heads = set() + + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + + # Copied from transformers.models.bert.modeling_bert.BertSelfAttention.transpose_for_scores + def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) + + def prune_heads(self, heads: List[int]) -> None: + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.num_attention_heads, self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.query = prune_linear_layer(self.query, index) + self.key = prune_linear_layer(self.key, index) + self.value = prune_linear_layer(self.value, index) + self.dense = prune_linear_layer(self.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.num_attention_heads = self.num_attention_heads - len(heads) + self.all_head_size = self.attention_head_size * self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: bool = False, + ) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]: + mixed_query_layer = self.query(hidden_states) + mixed_key_layer = self.key(hidden_states) + mixed_value_layer = self.value(hidden_states) + + query_layer = self.transpose_for_scores(mixed_query_layer) + key_layer = self.transpose_for_scores(mixed_key_layer) + value_layer = self.transpose_for_scores(mixed_value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + seq_length = hidden_states.size()[1] + position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.attention_dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + context_layer = context_layer.transpose(2, 1).flatten(2) + + projected_context_layer = self.dense(context_layer) + projected_context_layer_dropout = self.output_dropout(projected_context_layer) + layernormed_context_layer = self.LayerNorm(hidden_states + projected_context_layer_dropout) + return (layernormed_context_layer, attention_probs) if output_attentions else (layernormed_context_layer,) + + +class AlbertSdpaAttention(AlbertAttention): + def __init__(self, config): + super().__init__(config) + self.dropout_prob = config.attention_probs_dropout_prob + self.require_contiguous_qkv = not is_torch_greater_or_equal_than_2_2 + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: bool = False, + ) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]: + if self.position_embedding_type != "absolute" or output_attentions or head_mask is not None: + logger.warning( + "AlbertSdpaAttention is used but `torch.nn.functional.scaled_dot_product_attention` does not support " + "non-absolute `position_embedding_type` or `output_attentions=True` or `head_mask`. Falling back to " + "the eager attention implementation, but specifying the eager implementation will be required from " + "Transformers version v5.0.0 onwards. This warning can be removed using the argument " + '`attn_implementation="eager"` when loading the model.' + ) + return super().forward(hidden_states, attention_mask, head_mask, output_attentions) + + batch_size, seq_len, _ = hidden_states.size() + query_layer = self.transpose_for_scores(self.query(hidden_states)) + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + # SDPA with memory-efficient backend is broken in torch==2.1.2 when using non-contiguous inputs and a custom + # attn_mask, so we need to call `.contiguous()` here. This was fixed in torch==2.2.0. + # Reference: https://github.com/pytorch/pytorch/issues/112577 + if self.require_contiguous_qkv and query_layer.device.type == "cuda" and attention_mask is not None: + query_layer = query_layer.contiguous() + key_layer = key_layer.contiguous() + value_layer = value_layer.contiguous() + + attention_output = torch.nn.functional.scaled_dot_product_attention( + query=query_layer, + key=key_layer, + value=value_layer, + attn_mask=attention_mask, + dropout_p=self.dropout_prob if self.training else 0.0, + is_causal=False, + ) + + attention_output = attention_output.transpose(1, 2) + attention_output = attention_output.reshape(batch_size, seq_len, self.all_head_size) + + projected_context_layer = self.dense(attention_output) + projected_context_layer_dropout = self.output_dropout(projected_context_layer) + layernormed_context_layer = self.LayerNorm(hidden_states + projected_context_layer_dropout) + return (layernormed_context_layer,) + + +ALBERT_ATTENTION_CLASSES = { + "eager": AlbertAttention, + "sdpa": AlbertSdpaAttention, +} + + +class AlbertLayer(nn.Module): + def __init__(self, config: AlbertConfig): + super().__init__() + + self.config = config + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.full_layer_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.attention = ALBERT_ATTENTION_CLASSES[config._attn_implementation](config) + self.ffn = nn.Linear(config.hidden_size, config.intermediate_size) + self.ffn_output = nn.Linear(config.intermediate_size, config.hidden_size) + self.activation = ACT2FN[config.hidden_act] + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + ) -> Tuple[torch.Tensor, torch.Tensor]: + attention_output = self.attention(hidden_states, attention_mask, head_mask, output_attentions) + + ffn_output = apply_chunking_to_forward( + self.ff_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output[0], + ) + hidden_states = self.full_layer_layer_norm(ffn_output + attention_output[0]) + + return (hidden_states,) + attention_output[1:] # add attentions if we output them + + def ff_chunk(self, attention_output: torch.Tensor) -> torch.Tensor: + ffn_output = self.ffn(attention_output) + ffn_output = self.activation(ffn_output) + ffn_output = self.ffn_output(ffn_output) + return ffn_output + + +class AlbertLayerGroup(nn.Module): + def __init__(self, config: AlbertConfig): + super().__init__() + + self.albert_layers = nn.ModuleList([AlbertLayer(config) for _ in range(config.inner_group_num)]) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]], ...]: + layer_hidden_states = () + layer_attentions = () + + for layer_index, albert_layer in enumerate(self.albert_layers): + layer_output = albert_layer(hidden_states, attention_mask, head_mask[layer_index], output_attentions) + hidden_states = layer_output[0] + + if output_attentions: + layer_attentions = layer_attentions + (layer_output[1],) + + if output_hidden_states: + layer_hidden_states = layer_hidden_states + (hidden_states,) + + outputs = (hidden_states,) + if output_hidden_states: + outputs = outputs + (layer_hidden_states,) + if output_attentions: + outputs = outputs + (layer_attentions,) + return outputs # last-layer hidden state, (layer hidden states), (layer attentions) + + +class AlbertTransformer(nn.Module): + def __init__(self, config: AlbertConfig): + super().__init__() + + self.config = config + self.embedding_hidden_mapping_in = nn.Linear(config.embedding_size, config.hidden_size) + self.albert_layer_groups = nn.ModuleList([AlbertLayerGroup(config) for _ in range(config.num_hidden_groups)]) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ) -> Union[BaseModelOutput, Tuple]: + hidden_states = self.embedding_hidden_mapping_in(hidden_states) + + all_hidden_states = (hidden_states,) if output_hidden_states else None + all_attentions = () if output_attentions else None + + head_mask = [None] * self.config.num_hidden_layers if head_mask is None else head_mask + + for i in range(self.config.num_hidden_layers): + # Number of layers in a hidden group + layers_per_group = int(self.config.num_hidden_layers / self.config.num_hidden_groups) + + # Index of the hidden group + group_idx = int(i / (self.config.num_hidden_layers / self.config.num_hidden_groups)) + + layer_group_output = self.albert_layer_groups[group_idx]( + hidden_states, + attention_mask, + head_mask[group_idx * layers_per_group : (group_idx + 1) * layers_per_group], + output_attentions, + output_hidden_states, + ) + hidden_states = layer_group_output[0] + + if output_attentions: + all_attentions = all_attentions + layer_group_output[-1] + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions + ) + + +class AlbertPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = AlbertConfig + load_tf_weights = load_tf_weights_in_albert + base_model_prefix = "albert" + _supports_sdpa = True + + def _init_weights(self, module): + """Initialize the weights.""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, AlbertMLMHead): + module.bias.data.zero_() + + +@dataclass +class AlbertForPreTrainingOutput(ModelOutput): + """ + Output type of [`AlbertForPreTraining`]. + + Args: + loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): + Total loss as the sum of the masked language modeling loss and the next sequence prediction + (classification) loss. + prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + sop_logits (`torch.FloatTensor` of shape `(batch_size, 2)`): + Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation + before SoftMax). + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[torch.FloatTensor] = None + prediction_logits: Optional[torch.FloatTensor] = None + sop_logits: Optional[torch.FloatTensor] = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +ALBERT_START_DOCSTRING = r""" + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Args: + config ([`AlbertConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +ALBERT_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and + [`PreTrainedTokenizer.encode`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + position_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare ALBERT Model transformer outputting raw hidden-states without any specific head on top.", + ALBERT_START_DOCSTRING, +) +class AlbertModel(AlbertPreTrainedModel): + config_class = AlbertConfig + base_model_prefix = "albert" + + def __init__(self, config: AlbertConfig, add_pooling_layer: bool = True): + super().__init__(config) + + self.config = config + self.embeddings = AlbertEmbeddings(config) + self.encoder = AlbertTransformer(config) + if add_pooling_layer: + self.pooler = nn.Linear(config.hidden_size, config.hidden_size) + self.pooler_activation = nn.Tanh() + else: + self.pooler = None + self.pooler_activation = None + + self.attn_implementation = config._attn_implementation + self.position_embedding_type = config.position_embedding_type + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> nn.Embedding: + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value: nn.Embedding) -> None: + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None: + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} ALBERT has + a different architecture in that its layers are shared across groups, which then has inner groups. If an ALBERT + model has 12 hidden layers and 2 hidden groups, with two inner groups, there is a total of 4 different layers. + + These layers are flattened: the indices [0,1] correspond to the two inner groups of the first hidden layer, + while [2,3] correspond to the two inner groups of the second hidden layer. + + Any layer with in index other than [0,1,2,3] will result in an error. See base class PreTrainedModel for more + information about head pruning + """ + for layer, heads in heads_to_prune.items(): + group_idx = int(layer / self.config.inner_group_num) + inner_group_idx = int(layer - group_idx * self.config.inner_group_num) + self.encoder.albert_layer_groups[group_idx].albert_layers[inner_group_idx].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutputWithPooling, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[BaseModelOutputWithPooling, Tuple]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + batch_size, seq_length = input_shape + device = input_ids.device if input_ids is not None else inputs_embeds.device + + if attention_mask is None: + attention_mask = torch.ones(input_shape, device=device) + if token_type_ids is None: + if hasattr(self.embeddings, "token_type_ids"): + buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + embedding_output = self.embeddings( + input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds + ) + + use_sdpa_attention_mask = ( + self.attn_implementation == "sdpa" + and self.position_embedding_type == "absolute" + and head_mask is None + and not output_attentions + ) + + if use_sdpa_attention_mask: + extended_attention_mask = _prepare_4d_attention_mask_for_sdpa( + attention_mask, embedding_output.dtype, tgt_len=seq_length + ) + else: + extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2) + extended_attention_mask = extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * torch.finfo(self.dtype).min + + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + encoder_outputs = self.encoder( + embedding_output, + extended_attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = encoder_outputs[0] + + pooled_output = self.pooler_activation(self.pooler(sequence_output[:, 0])) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +@add_start_docstrings( + """ + Albert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a + `sentence order prediction (classification)` head. + """, + ALBERT_START_DOCSTRING, +) +class AlbertForPreTraining(AlbertPreTrainedModel): + _tied_weights_keys = ["predictions.decoder.bias", "predictions.decoder.weight"] + + def __init__(self, config: AlbertConfig): + super().__init__(config) + + self.albert = AlbertModel(config) + self.predictions = AlbertMLMHead(config) + self.sop_classifier = AlbertSOPHead(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self) -> nn.Linear: + return self.predictions.decoder + + def set_output_embeddings(self, new_embeddings: nn.Linear) -> None: + self.predictions.decoder = new_embeddings + + def get_input_embeddings(self) -> nn.Embedding: + return self.albert.embeddings.word_embeddings + + @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=AlbertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + sentence_order_label: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[AlbertForPreTrainingOutput, Tuple]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + sentence_order_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair + (see `input_ids` docstring) Indices should be in `[0, 1]`. `0` indicates original order (sequence A, then + sequence B), `1` indicates switched order (sequence B, then sequence A). + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, AlbertForPreTraining + >>> import torch + + >>> tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2") + >>> model = AlbertForPreTraining.from_pretrained("albert/albert-base-v2") + + >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0) + >>> # Batch size 1 + >>> outputs = model(input_ids) + + >>> prediction_logits = outputs.prediction_logits + >>> sop_logits = outputs.sop_logits + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.albert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output, pooled_output = outputs[:2] + + prediction_scores = self.predictions(sequence_output) + sop_scores = self.sop_classifier(pooled_output) + + total_loss = None + if labels is not None and sentence_order_label is not None: + loss_fct = CrossEntropyLoss() + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + sentence_order_loss = loss_fct(sop_scores.view(-1, 2), sentence_order_label.view(-1)) + total_loss = masked_lm_loss + sentence_order_loss + + if not return_dict: + output = (prediction_scores, sop_scores) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return AlbertForPreTrainingOutput( + loss=total_loss, + prediction_logits=prediction_scores, + sop_logits=sop_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class AlbertMLMHead(nn.Module): + def __init__(self, config: AlbertConfig): + super().__init__() + + self.LayerNorm = nn.LayerNorm(config.embedding_size, eps=config.layer_norm_eps) + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + self.dense = nn.Linear(config.hidden_size, config.embedding_size) + self.decoder = nn.Linear(config.embedding_size, config.vocab_size) + self.activation = ACT2FN[config.hidden_act] + self.decoder.bias = self.bias + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.activation(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + hidden_states = self.decoder(hidden_states) + + prediction_scores = hidden_states + + return prediction_scores + + def _tie_weights(self) -> None: + # For accelerate compatibility and to not break backward compatibility + if self.decoder.bias.device.type == "meta": + self.decoder.bias = self.bias + else: + # To tie those two weights if they get disconnected (on TPU or when the bias is resized) + self.bias = self.decoder.bias + + +class AlbertSOPHead(nn.Module): + def __init__(self, config: AlbertConfig): + super().__init__() + + self.dropout = nn.Dropout(config.classifier_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + def forward(self, pooled_output: torch.Tensor) -> torch.Tensor: + dropout_pooled_output = self.dropout(pooled_output) + logits = self.classifier(dropout_pooled_output) + return logits + + +@add_start_docstrings( + "Albert Model with a `language modeling` head on top.", + ALBERT_START_DOCSTRING, +) +class AlbertForMaskedLM(AlbertPreTrainedModel): + _tied_weights_keys = ["predictions.decoder.bias", "predictions.decoder.weight"] + + def __init__(self, config): + super().__init__(config) + + self.albert = AlbertModel(config, add_pooling_layer=False) + self.predictions = AlbertMLMHead(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self) -> nn.Linear: + return self.predictions.decoder + + def set_output_embeddings(self, new_embeddings: nn.Linear) -> None: + self.predictions.decoder = new_embeddings + self.predictions.bias = new_embeddings.bias + + def get_input_embeddings(self) -> nn.Embedding: + return self.albert.embeddings.word_embeddings + + @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[MaskedLMOutput, Tuple]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + + Returns: + + Example: + + ```python + >>> import torch + >>> from transformers import AutoTokenizer, AlbertForMaskedLM + + >>> tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2") + >>> model = AlbertForMaskedLM.from_pretrained("albert/albert-base-v2") + + >>> # add mask_token + >>> inputs = tokenizer("The capital of [MASK] is Paris.", return_tensors="pt") + >>> with torch.no_grad(): + ... logits = model(**inputs).logits + + >>> # retrieve index of [MASK] + >>> mask_token_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0] + >>> predicted_token_id = logits[0, mask_token_index].argmax(axis=-1) + >>> tokenizer.decode(predicted_token_id) + 'france' + ``` + + ```python + >>> labels = tokenizer("The capital of France is Paris.", return_tensors="pt")["input_ids"] + >>> labels = torch.where(inputs.input_ids == tokenizer.mask_token_id, labels, -100) + >>> outputs = model(**inputs, labels=labels) + >>> round(outputs.loss.item(), 2) + 0.81 + ``` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.albert( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_outputs = outputs[0] + + prediction_scores = self.predictions(sequence_outputs) + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return MaskedLMOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled + output) e.g. for GLUE tasks. + """, + ALBERT_START_DOCSTRING, +) +class AlbertForSequenceClassification(AlbertPreTrainedModel): + def __init__(self, config: AlbertConfig): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + + self.albert = AlbertModel(config) + self.dropout = nn.Dropout(config.classifier_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, self.config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint="textattack/albert-base-v2-imdb", + output_type=SequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + expected_output="'LABEL_1'", + expected_loss=0.12, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[SequenceClassifierOutput, Tuple]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.albert( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Albert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for + Named-Entity-Recognition (NER) tasks. + """, + ALBERT_START_DOCSTRING, +) +class AlbertForTokenClassification(AlbertPreTrainedModel): + def __init__(self, config: AlbertConfig): + super().__init__(config) + self.num_labels = config.num_labels + + self.albert = AlbertModel(config, add_pooling_layer=False) + classifier_dropout_prob = ( + config.classifier_dropout_prob + if config.classifier_dropout_prob is not None + else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, self.config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[TokenClassifierOutput, Tuple]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.albert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Albert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layers on top of the hidden-states output to compute `span start logits` and `span end logits`). + """, + ALBERT_START_DOCSTRING, +) +class AlbertForQuestionAnswering(AlbertPreTrainedModel): + def __init__(self, config: AlbertConfig): + super().__init__(config) + self.num_labels = config.num_labels + + self.albert = AlbertModel(config, add_pooling_layer=False) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint="twmkn9/albert-base-v2-squad2", + output_type=QuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + qa_target_start_index=12, + qa_target_end_index=13, + expected_output="'a nice puppet'", + expected_loss=7.36, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + start_positions: Optional[torch.LongTensor] = None, + end_positions: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[AlbertForPreTrainingOutput, Tuple]: + r""" + start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.albert( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits: torch.Tensor = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Albert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a + softmax) e.g. for RocStories/SWAG tasks. + """, + ALBERT_START_DOCSTRING, +) +class AlbertForMultipleChoice(AlbertPreTrainedModel): + def __init__(self, config: AlbertConfig): + super().__init__(config) + + self.albert = AlbertModel(config) + self.dropout = nn.Dropout(config.classifier_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, 1) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=MultipleChoiceModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[AlbertForPreTrainingOutput, Tuple]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., + num_choices-1]` where *num_choices* is the size of the second dimension of the input tensors. (see + *input_ids* above) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] + + input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None + attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None + token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None + position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None + inputs_embeds = ( + inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) + if inputs_embeds is not None + else None + ) + outputs = self.albert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits: torch.Tensor = self.classifier(pooled_output) + reshaped_logits = logits.view(-1, num_choices) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + + if not return_dict: + output = (reshaped_logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return MultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +__all__ = [ + "load_tf_weights_in_albert", + "AlbertPreTrainedModel", + "AlbertModel", + "AlbertForPreTraining", + "AlbertForMaskedLM", + "AlbertForSequenceClassification", + "AlbertForTokenClassification", + "AlbertForQuestionAnswering", + "AlbertForMultipleChoice", +] diff --git a/docs/transformers/build/lib/transformers/models/albert/modeling_flax_albert.py b/docs/transformers/build/lib/transformers/models/albert/modeling_flax_albert.py new file mode 100644 index 0000000000000000000000000000000000000000..df2ebddc7e6275a35524dd72381c3518e34a1b95 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/albert/modeling_flax_albert.py @@ -0,0 +1,1132 @@ +# coding=utf-8 +# Copyright 2021 Google AI, Google Brain and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Callable, Optional, Tuple + +import flax +import flax.linen as nn +import jax +import jax.numpy as jnp +import numpy as np +from flax.core.frozen_dict import FrozenDict, freeze, unfreeze +from flax.linen.attention import dot_product_attention_weights +from flax.traverse_util import flatten_dict, unflatten_dict +from jax import lax + +from ...modeling_flax_outputs import ( + FlaxBaseModelOutput, + FlaxBaseModelOutputWithPooling, + FlaxMaskedLMOutput, + FlaxMultipleChoiceModelOutput, + FlaxQuestionAnsweringModelOutput, + FlaxSequenceClassifierOutput, + FlaxTokenClassifierOutput, +) +from ...modeling_flax_utils import ( + ACT2FN, + FlaxPreTrainedModel, + append_call_sample_docstring, + append_replace_return_docstrings, + overwrite_call_docstring, +) +from ...utils import ModelOutput, add_start_docstrings, add_start_docstrings_to_model_forward, logging +from .configuration_albert import AlbertConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "albert/albert-base-v2" +_CONFIG_FOR_DOC = "AlbertConfig" + + +@flax.struct.dataclass +class FlaxAlbertForPreTrainingOutput(ModelOutput): + """ + Output type of [`FlaxAlbertForPreTraining`]. + + Args: + prediction_logits (`jnp.ndarray` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + sop_logits (`jnp.ndarray` of shape `(batch_size, 2)`): + Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation + before SoftMax). + hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + prediction_logits: jnp.ndarray = None + sop_logits: jnp.ndarray = None + hidden_states: Optional[Tuple[jnp.ndarray]] = None + attentions: Optional[Tuple[jnp.ndarray]] = None + + +ALBERT_START_DOCSTRING = r""" + + This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading, saving and converting weights from PyTorch models) + + This model is also a + [flax.linen.Module](https://flax.readthedocs.io/en/latest/api_reference/flax.linen/module.html) subclass. Use it as + a regular Flax linen Module and refer to the Flax documentation for all matter related to general usage and + behavior. + + Finally, this model supports inherent JAX features such as: + + - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit) + - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation) + - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap) + - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap) + + Parameters: + config ([`AlbertConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights. + dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`): + The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and + `jax.numpy.bfloat16` (on TPUs). + + This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If + specified all the computation will be performed with the given `dtype`. + + **Note that this only specifies the dtype of the computation and does not influence the dtype of model + parameters.** + + If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and + [`~FlaxPreTrainedModel.to_bf16`]. +""" + +ALBERT_INPUTS_DOCSTRING = r""" + Args: + input_ids (`numpy.ndarray` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`numpy.ndarray` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + token_type_ids (`numpy.ndarray` of shape `({0})`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + position_ids (`numpy.ndarray` of shape `({0})`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + +""" + + +class FlaxAlbertEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings.""" + + config: AlbertConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.word_embeddings = nn.Embed( + self.config.vocab_size, + self.config.embedding_size, + embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range), + ) + self.position_embeddings = nn.Embed( + self.config.max_position_embeddings, + self.config.embedding_size, + embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range), + ) + self.token_type_embeddings = nn.Embed( + self.config.type_vocab_size, + self.config.embedding_size, + embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range), + ) + self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype) + self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob) + + def __call__(self, input_ids, token_type_ids, position_ids, deterministic: bool = True): + # Embed + inputs_embeds = self.word_embeddings(input_ids.astype("i4")) + position_embeds = self.position_embeddings(position_ids.astype("i4")) + token_type_embeddings = self.token_type_embeddings(token_type_ids.astype("i4")) + + # Sum all embeddings + hidden_states = inputs_embeds + token_type_embeddings + position_embeds + + # Layer Norm + hidden_states = self.LayerNorm(hidden_states) + hidden_states = self.dropout(hidden_states, deterministic=deterministic) + return hidden_states + + +class FlaxAlbertSelfAttention(nn.Module): + config: AlbertConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + if self.config.hidden_size % self.config.num_attention_heads != 0: + raise ValueError( + "`config.hidden_size`: {self.config.hidden_size} has to be a multiple of `config.num_attention_heads` " + " : {self.config.num_attention_heads}" + ) + + self.query = nn.Dense( + self.config.hidden_size, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + ) + self.key = nn.Dense( + self.config.hidden_size, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + ) + self.value = nn.Dense( + self.config.hidden_size, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + ) + self.dense = nn.Dense( + self.config.hidden_size, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + dtype=self.dtype, + ) + self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype) + self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob) + + def __call__(self, hidden_states, attention_mask, deterministic=True, output_attentions: bool = False): + head_dim = self.config.hidden_size // self.config.num_attention_heads + + query_states = self.query(hidden_states).reshape( + hidden_states.shape[:2] + (self.config.num_attention_heads, head_dim) + ) + value_states = self.value(hidden_states).reshape( + hidden_states.shape[:2] + (self.config.num_attention_heads, head_dim) + ) + key_states = self.key(hidden_states).reshape( + hidden_states.shape[:2] + (self.config.num_attention_heads, head_dim) + ) + + # Convert the boolean attention mask to an attention bias. + if attention_mask is not None: + # attention mask in the form of attention bias + attention_mask = jnp.expand_dims(attention_mask, axis=(-3, -2)) + attention_bias = lax.select( + attention_mask > 0, + jnp.full(attention_mask.shape, 0.0).astype(self.dtype), + jnp.full(attention_mask.shape, jnp.finfo(self.dtype).min).astype(self.dtype), + ) + else: + attention_bias = None + + dropout_rng = None + if not deterministic and self.config.attention_probs_dropout_prob > 0.0: + dropout_rng = self.make_rng("dropout") + + attn_weights = dot_product_attention_weights( + query_states, + key_states, + bias=attention_bias, + dropout_rng=dropout_rng, + dropout_rate=self.config.attention_probs_dropout_prob, + broadcast_dropout=True, + deterministic=deterministic, + dtype=self.dtype, + precision=None, + ) + + attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, value_states) + attn_output = attn_output.reshape(attn_output.shape[:2] + (-1,)) + + projected_attn_output = self.dense(attn_output) + projected_attn_output = self.dropout(projected_attn_output, deterministic=deterministic) + layernormed_attn_output = self.LayerNorm(projected_attn_output + hidden_states) + outputs = (layernormed_attn_output, attn_weights) if output_attentions else (layernormed_attn_output,) + return outputs + + +class FlaxAlbertLayer(nn.Module): + config: AlbertConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.attention = FlaxAlbertSelfAttention(self.config, dtype=self.dtype) + self.ffn = nn.Dense( + self.config.intermediate_size, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + dtype=self.dtype, + ) + self.activation = ACT2FN[self.config.hidden_act] + self.ffn_output = nn.Dense( + self.config.hidden_size, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + dtype=self.dtype, + ) + self.full_layer_layer_norm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype) + self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob) + + def __call__( + self, + hidden_states, + attention_mask, + deterministic: bool = True, + output_attentions: bool = False, + ): + attention_outputs = self.attention( + hidden_states, attention_mask, deterministic=deterministic, output_attentions=output_attentions + ) + attention_output = attention_outputs[0] + ffn_output = self.ffn(attention_output) + ffn_output = self.activation(ffn_output) + ffn_output = self.ffn_output(ffn_output) + ffn_output = self.dropout(ffn_output, deterministic=deterministic) + hidden_states = self.full_layer_layer_norm(ffn_output + attention_output) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attention_outputs[1],) + return outputs + + +class FlaxAlbertLayerCollection(nn.Module): + config: AlbertConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.layers = [ + FlaxAlbertLayer(self.config, name=str(i), dtype=self.dtype) for i in range(self.config.inner_group_num) + ] + + def __call__( + self, + hidden_states, + attention_mask, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + ): + layer_hidden_states = () + layer_attentions = () + + for layer_index, albert_layer in enumerate(self.layers): + layer_output = albert_layer( + hidden_states, + attention_mask, + deterministic=deterministic, + output_attentions=output_attentions, + ) + hidden_states = layer_output[0] + + if output_attentions: + layer_attentions = layer_attentions + (layer_output[1],) + + if output_hidden_states: + layer_hidden_states = layer_hidden_states + (hidden_states,) + + outputs = (hidden_states,) + if output_hidden_states: + outputs = outputs + (layer_hidden_states,) + if output_attentions: + outputs = outputs + (layer_attentions,) + return outputs # last-layer hidden state, (layer hidden states), (layer attentions) + + +class FlaxAlbertLayerCollections(nn.Module): + config: AlbertConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + layer_index: Optional[str] = None + + def setup(self): + self.albert_layers = FlaxAlbertLayerCollection(self.config, dtype=self.dtype) + + def __call__( + self, + hidden_states, + attention_mask, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + ): + outputs = self.albert_layers( + hidden_states, + attention_mask, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + return outputs + + +class FlaxAlbertLayerGroups(nn.Module): + config: AlbertConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.layers = [ + FlaxAlbertLayerCollections(self.config, name=str(i), layer_index=str(i), dtype=self.dtype) + for i in range(self.config.num_hidden_groups) + ] + + def __call__( + self, + hidden_states, + attention_mask, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + all_attentions = () if output_attentions else None + all_hidden_states = (hidden_states,) if output_hidden_states else None + + for i in range(self.config.num_hidden_layers): + # Index of the hidden group + group_idx = int(i / (self.config.num_hidden_layers / self.config.num_hidden_groups)) + layer_group_output = self.layers[group_idx]( + hidden_states, + attention_mask, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + hidden_states = layer_group_output[0] + + if output_attentions: + all_attentions = all_attentions + layer_group_output[-1] + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None) + return FlaxBaseModelOutput( + last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions + ) + + +class FlaxAlbertEncoder(nn.Module): + config: AlbertConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.embedding_hidden_mapping_in = nn.Dense( + self.config.hidden_size, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + dtype=self.dtype, + ) + self.albert_layer_groups = FlaxAlbertLayerGroups(self.config, dtype=self.dtype) + + def __call__( + self, + hidden_states, + attention_mask, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + hidden_states = self.embedding_hidden_mapping_in(hidden_states) + return self.albert_layer_groups( + hidden_states, + attention_mask, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + + +class FlaxAlbertOnlyMLMHead(nn.Module): + config: AlbertConfig + dtype: jnp.dtype = jnp.float32 + bias_init: Callable[..., np.ndarray] = jax.nn.initializers.zeros + + def setup(self): + self.dense = nn.Dense(self.config.embedding_size, dtype=self.dtype) + self.activation = ACT2FN[self.config.hidden_act] + self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype) + self.decoder = nn.Dense(self.config.vocab_size, dtype=self.dtype, use_bias=False) + self.bias = self.param("bias", self.bias_init, (self.config.vocab_size,)) + + def __call__(self, hidden_states, shared_embedding=None): + hidden_states = self.dense(hidden_states) + hidden_states = self.activation(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + + if shared_embedding is not None: + hidden_states = self.decoder.apply({"params": {"kernel": shared_embedding.T}}, hidden_states) + else: + hidden_states = self.decoder(hidden_states) + + hidden_states += self.bias + return hidden_states + + +class FlaxAlbertSOPHead(nn.Module): + config: AlbertConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.dropout = nn.Dropout(self.config.classifier_dropout_prob) + self.classifier = nn.Dense(2, dtype=self.dtype) + + def __call__(self, pooled_output, deterministic=True): + pooled_output = self.dropout(pooled_output, deterministic=deterministic) + logits = self.classifier(pooled_output) + return logits + + +class FlaxAlbertPreTrainedModel(FlaxPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = AlbertConfig + base_model_prefix = "albert" + module_class: nn.Module = None + + def __init__( + self, + config: AlbertConfig, + input_shape: Tuple = (1, 1), + seed: int = 0, + dtype: jnp.dtype = jnp.float32, + _do_init: bool = True, + **kwargs, + ): + module = self.module_class(config=config, dtype=dtype, **kwargs) + super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) + + def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict: + # init input tensors + input_ids = jnp.zeros(input_shape, dtype="i4") + token_type_ids = jnp.zeros_like(input_ids) + position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_shape) + attention_mask = jnp.ones_like(input_ids) + + params_rng, dropout_rng = jax.random.split(rng) + rngs = {"params": params_rng, "dropout": dropout_rng} + + random_params = self.module.init( + rngs, input_ids, attention_mask, token_type_ids, position_ids, return_dict=False + )["params"] + + if params is not None: + random_params = flatten_dict(unfreeze(random_params)) + params = flatten_dict(unfreeze(params)) + for missing_key in self._missing_keys: + params[missing_key] = random_params[missing_key] + self._missing_keys = set() + return freeze(unflatten_dict(params)) + else: + return random_params + + @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + def __call__( + self, + input_ids, + attention_mask=None, + token_type_ids=None, + position_ids=None, + params: dict = None, + dropout_rng: jax.random.PRNGKey = None, + train: bool = False, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.return_dict + + # init input tensors if not passed + if token_type_ids is None: + token_type_ids = jnp.zeros_like(input_ids) + + if position_ids is None: + position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape) + + if attention_mask is None: + attention_mask = jnp.ones_like(input_ids) + + # Handle any PRNG if needed + rngs = {} + if dropout_rng is not None: + rngs["dropout"] = dropout_rng + + return self.module.apply( + {"params": params or self.params}, + jnp.array(input_ids, dtype="i4"), + jnp.array(attention_mask, dtype="i4"), + jnp.array(token_type_ids, dtype="i4"), + jnp.array(position_ids, dtype="i4"), + not train, + output_attentions, + output_hidden_states, + return_dict, + rngs=rngs, + ) + + +class FlaxAlbertModule(nn.Module): + config: AlbertConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + add_pooling_layer: bool = True + + def setup(self): + self.embeddings = FlaxAlbertEmbeddings(self.config, dtype=self.dtype) + self.encoder = FlaxAlbertEncoder(self.config, dtype=self.dtype) + if self.add_pooling_layer: + self.pooler = nn.Dense( + self.config.hidden_size, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + dtype=self.dtype, + name="pooler", + ) + self.pooler_activation = nn.tanh + else: + self.pooler = None + self.pooler_activation = None + + def __call__( + self, + input_ids, + attention_mask, + token_type_ids: Optional[np.ndarray] = None, + position_ids: Optional[np.ndarray] = None, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + # make sure `token_type_ids` is correctly initialized when not passed + if token_type_ids is None: + token_type_ids = jnp.zeros_like(input_ids) + + # make sure `position_ids` is correctly initialized when not passed + if position_ids is None: + position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape) + + hidden_states = self.embeddings(input_ids, token_type_ids, position_ids, deterministic=deterministic) + + outputs = self.encoder( + hidden_states, + attention_mask, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = outputs[0] + if self.add_pooling_layer: + pooled = self.pooler(hidden_states[:, 0]) + pooled = self.pooler_activation(pooled) + else: + pooled = None + + if not return_dict: + # if pooled is None, don't return it + if pooled is None: + return (hidden_states,) + outputs[1:] + return (hidden_states, pooled) + outputs[1:] + + return FlaxBaseModelOutputWithPooling( + last_hidden_state=hidden_states, + pooler_output=pooled, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + "The bare Albert Model transformer outputting raw hidden-states without any specific head on top.", + ALBERT_START_DOCSTRING, +) +class FlaxAlbertModel(FlaxAlbertPreTrainedModel): + module_class = FlaxAlbertModule + + +append_call_sample_docstring(FlaxAlbertModel, _CHECKPOINT_FOR_DOC, FlaxBaseModelOutputWithPooling, _CONFIG_FOR_DOC) + + +class FlaxAlbertForPreTrainingModule(nn.Module): + config: AlbertConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.albert = FlaxAlbertModule(config=self.config, dtype=self.dtype) + self.predictions = FlaxAlbertOnlyMLMHead(config=self.config, dtype=self.dtype) + self.sop_classifier = FlaxAlbertSOPHead(config=self.config, dtype=self.dtype) + + def __call__( + self, + input_ids, + attention_mask, + token_type_ids, + position_ids, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + # Model + outputs = self.albert( + input_ids, + attention_mask, + token_type_ids, + position_ids, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if self.config.tie_word_embeddings: + shared_embedding = self.albert.variables["params"]["embeddings"]["word_embeddings"]["embedding"] + else: + shared_embedding = None + + hidden_states = outputs[0] + pooled_output = outputs[1] + + prediction_scores = self.predictions(hidden_states, shared_embedding=shared_embedding) + sop_scores = self.sop_classifier(pooled_output, deterministic=deterministic) + + if not return_dict: + return (prediction_scores, sop_scores) + outputs[2:] + + return FlaxAlbertForPreTrainingOutput( + prediction_logits=prediction_scores, + sop_logits=sop_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Albert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a + `sentence order prediction (classification)` head. + """, + ALBERT_START_DOCSTRING, +) +class FlaxAlbertForPreTraining(FlaxAlbertPreTrainedModel): + module_class = FlaxAlbertForPreTrainingModule + + +FLAX_ALBERT_FOR_PRETRAINING_DOCSTRING = """ + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, FlaxAlbertForPreTraining + + >>> tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2") + >>> model = FlaxAlbertForPreTraining.from_pretrained("albert/albert-base-v2") + + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="np") + >>> outputs = model(**inputs) + + >>> prediction_logits = outputs.prediction_logits + >>> seq_relationship_logits = outputs.sop_logits + ``` +""" + +overwrite_call_docstring( + FlaxAlbertForPreTraining, + ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length") + FLAX_ALBERT_FOR_PRETRAINING_DOCSTRING, +) +append_replace_return_docstrings( + FlaxAlbertForPreTraining, output_type=FlaxAlbertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC +) + + +class FlaxAlbertForMaskedLMModule(nn.Module): + config: AlbertConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.albert = FlaxAlbertModule(config=self.config, add_pooling_layer=False, dtype=self.dtype) + self.predictions = FlaxAlbertOnlyMLMHead(config=self.config, dtype=self.dtype) + + def __call__( + self, + input_ids, + attention_mask, + token_type_ids, + position_ids, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + # Model + outputs = self.albert( + input_ids, + attention_mask, + token_type_ids, + position_ids, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + if self.config.tie_word_embeddings: + shared_embedding = self.albert.variables["params"]["embeddings"]["word_embeddings"]["embedding"] + else: + shared_embedding = None + + # Compute the prediction scores + logits = self.predictions(hidden_states, shared_embedding=shared_embedding) + + if not return_dict: + return (logits,) + outputs[1:] + + return FlaxMaskedLMOutput( + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings("""Albert Model with a `language modeling` head on top.""", ALBERT_START_DOCSTRING) +class FlaxAlbertForMaskedLM(FlaxAlbertPreTrainedModel): + module_class = FlaxAlbertForMaskedLMModule + + +append_call_sample_docstring( + FlaxAlbertForMaskedLM, _CHECKPOINT_FOR_DOC, FlaxMaskedLMOutput, _CONFIG_FOR_DOC, revision="refs/pr/11" +) + + +class FlaxAlbertForSequenceClassificationModule(nn.Module): + config: AlbertConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.albert = FlaxAlbertModule(config=self.config, dtype=self.dtype) + classifier_dropout = ( + self.config.classifier_dropout_prob + if self.config.classifier_dropout_prob is not None + else self.config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(rate=classifier_dropout) + self.classifier = nn.Dense( + self.config.num_labels, + dtype=self.dtype, + ) + + def __call__( + self, + input_ids, + attention_mask, + token_type_ids, + position_ids, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + # Model + outputs = self.albert( + input_ids, + attention_mask, + token_type_ids, + position_ids, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + pooled_output = self.dropout(pooled_output, deterministic=deterministic) + logits = self.classifier(pooled_output) + + if not return_dict: + return (logits,) + outputs[2:] + + return FlaxSequenceClassifierOutput( + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled + output) e.g. for GLUE tasks. + """, + ALBERT_START_DOCSTRING, +) +class FlaxAlbertForSequenceClassification(FlaxAlbertPreTrainedModel): + module_class = FlaxAlbertForSequenceClassificationModule + + +append_call_sample_docstring( + FlaxAlbertForSequenceClassification, + _CHECKPOINT_FOR_DOC, + FlaxSequenceClassifierOutput, + _CONFIG_FOR_DOC, +) + + +class FlaxAlbertForMultipleChoiceModule(nn.Module): + config: AlbertConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.albert = FlaxAlbertModule(config=self.config, dtype=self.dtype) + self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob) + self.classifier = nn.Dense(1, dtype=self.dtype) + + def __call__( + self, + input_ids, + attention_mask, + token_type_ids, + position_ids, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + num_choices = input_ids.shape[1] + input_ids = input_ids.reshape(-1, input_ids.shape[-1]) if input_ids is not None else None + attention_mask = attention_mask.reshape(-1, attention_mask.shape[-1]) if attention_mask is not None else None + token_type_ids = token_type_ids.reshape(-1, token_type_ids.shape[-1]) if token_type_ids is not None else None + position_ids = position_ids.reshape(-1, position_ids.shape[-1]) if position_ids is not None else None + + # Model + outputs = self.albert( + input_ids, + attention_mask, + token_type_ids, + position_ids, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + pooled_output = self.dropout(pooled_output, deterministic=deterministic) + logits = self.classifier(pooled_output) + + reshaped_logits = logits.reshape(-1, num_choices) + + if not return_dict: + return (reshaped_logits,) + outputs[2:] + + return FlaxMultipleChoiceModelOutput( + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Albert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a + softmax) e.g. for RocStories/SWAG tasks. + """, + ALBERT_START_DOCSTRING, +) +class FlaxAlbertForMultipleChoice(FlaxAlbertPreTrainedModel): + module_class = FlaxAlbertForMultipleChoiceModule + + +overwrite_call_docstring( + FlaxAlbertForMultipleChoice, ALBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length") +) +append_call_sample_docstring( + FlaxAlbertForMultipleChoice, + _CHECKPOINT_FOR_DOC, + FlaxMultipleChoiceModelOutput, + _CONFIG_FOR_DOC, +) + + +class FlaxAlbertForTokenClassificationModule(nn.Module): + config: AlbertConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.albert = FlaxAlbertModule(config=self.config, dtype=self.dtype, add_pooling_layer=False) + classifier_dropout = ( + self.config.classifier_dropout_prob + if self.config.classifier_dropout_prob is not None + else self.config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(rate=classifier_dropout) + self.classifier = nn.Dense(self.config.num_labels, dtype=self.dtype) + + def __call__( + self, + input_ids, + attention_mask, + token_type_ids, + position_ids, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + # Model + outputs = self.albert( + input_ids, + attention_mask, + token_type_ids, + position_ids, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + hidden_states = self.dropout(hidden_states, deterministic=deterministic) + logits = self.classifier(hidden_states) + + if not return_dict: + return (logits,) + outputs[1:] + + return FlaxTokenClassifierOutput( + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Albert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for + Named-Entity-Recognition (NER) tasks. + """, + ALBERT_START_DOCSTRING, +) +class FlaxAlbertForTokenClassification(FlaxAlbertPreTrainedModel): + module_class = FlaxAlbertForTokenClassificationModule + + +append_call_sample_docstring( + FlaxAlbertForTokenClassification, + _CHECKPOINT_FOR_DOC, + FlaxTokenClassifierOutput, + _CONFIG_FOR_DOC, +) + + +class FlaxAlbertForQuestionAnsweringModule(nn.Module): + config: AlbertConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.albert = FlaxAlbertModule(config=self.config, dtype=self.dtype, add_pooling_layer=False) + self.qa_outputs = nn.Dense(self.config.num_labels, dtype=self.dtype) + + def __call__( + self, + input_ids, + attention_mask, + token_type_ids, + position_ids, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + # Model + outputs = self.albert( + input_ids, + attention_mask, + token_type_ids, + position_ids, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + + logits = self.qa_outputs(hidden_states) + start_logits, end_logits = jnp.split(logits, self.config.num_labels, axis=-1) + start_logits = start_logits.squeeze(-1) + end_logits = end_logits.squeeze(-1) + + if not return_dict: + return (start_logits, end_logits) + outputs[1:] + + return FlaxQuestionAnsweringModelOutput( + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Albert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layers on top of the hidden-states output to compute `span start logits` and `span end logits`). + """, + ALBERT_START_DOCSTRING, +) +class FlaxAlbertForQuestionAnswering(FlaxAlbertPreTrainedModel): + module_class = FlaxAlbertForQuestionAnsweringModule + + +append_call_sample_docstring( + FlaxAlbertForQuestionAnswering, + _CHECKPOINT_FOR_DOC, + FlaxQuestionAnsweringModelOutput, + _CONFIG_FOR_DOC, +) + +__all__ = [ + "FlaxAlbertPreTrainedModel", + "FlaxAlbertModel", + "FlaxAlbertForPreTraining", + "FlaxAlbertForMaskedLM", + "FlaxAlbertForSequenceClassification", + "FlaxAlbertForMultipleChoice", + "FlaxAlbertForTokenClassification", + "FlaxAlbertForQuestionAnswering", +] diff --git a/docs/transformers/build/lib/transformers/models/albert/modeling_tf_albert.py b/docs/transformers/build/lib/transformers/models/albert/modeling_tf_albert.py new file mode 100644 index 0000000000000000000000000000000000000000..6800cfa8d16a84a422e804ada3e0a7d9b9e9ef40 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/albert/modeling_tf_albert.py @@ -0,0 +1,1573 @@ +# coding=utf-8 +# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""TF 2.0 ALBERT model.""" + +from __future__ import annotations + +import math +from dataclasses import dataclass +from typing import Dict, Optional, Tuple, Union + +import numpy as np +import tensorflow as tf + +from ...activations_tf import get_tf_activation +from ...modeling_tf_outputs import ( + TFBaseModelOutput, + TFBaseModelOutputWithPooling, + TFMaskedLMOutput, + TFMultipleChoiceModelOutput, + TFQuestionAnsweringModelOutput, + TFSequenceClassifierOutput, + TFTokenClassifierOutput, +) +from ...modeling_tf_utils import ( + TFMaskedLanguageModelingLoss, + TFModelInputType, + TFMultipleChoiceLoss, + TFPreTrainedModel, + TFQuestionAnsweringLoss, + TFSequenceClassificationLoss, + TFTokenClassificationLoss, + get_initializer, + keras, + keras_serializable, + unpack_inputs, +) +from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax +from ...utils import ( + ModelOutput, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from .configuration_albert import AlbertConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "albert/albert-base-v2" +_CONFIG_FOR_DOC = "AlbertConfig" + + +class TFAlbertPreTrainingLoss: + """ + Loss function suitable for ALBERT pretraining, that is, the task of pretraining a language model by combining SOP + + MLM. .. note:: Any label of -100 will be ignored (along with the corresponding logits) in the loss computation. + """ + + def hf_compute_loss(self, labels: tf.Tensor, logits: tf.Tensor) -> tf.Tensor: + loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=keras.losses.Reduction.NONE) + if self.config.tf_legacy_loss: + # make sure only labels that are not equal to -100 + # are taken into account as loss + masked_lm_active_loss = tf.not_equal(tf.reshape(tensor=labels["labels"], shape=(-1,)), -100) + masked_lm_reduced_logits = tf.boolean_mask( + tensor=tf.reshape(tensor=logits[0], shape=(-1, shape_list(logits[0])[2])), + mask=masked_lm_active_loss, + ) + masked_lm_labels = tf.boolean_mask( + tensor=tf.reshape(tensor=labels["labels"], shape=(-1,)), mask=masked_lm_active_loss + ) + sentence_order_active_loss = tf.not_equal( + tf.reshape(tensor=labels["sentence_order_label"], shape=(-1,)), -100 + ) + sentence_order_reduced_logits = tf.boolean_mask( + tensor=tf.reshape(tensor=logits[1], shape=(-1, 2)), mask=sentence_order_active_loss + ) + sentence_order_label = tf.boolean_mask( + tensor=tf.reshape(tensor=labels["sentence_order_label"], shape=(-1,)), mask=sentence_order_active_loss + ) + masked_lm_loss = loss_fn(y_true=masked_lm_labels, y_pred=masked_lm_reduced_logits) + sentence_order_loss = loss_fn(y_true=sentence_order_label, y_pred=sentence_order_reduced_logits) + masked_lm_loss = tf.reshape(tensor=masked_lm_loss, shape=(-1, shape_list(sentence_order_loss)[0])) + masked_lm_loss = tf.reduce_mean(input_tensor=masked_lm_loss, axis=0) + + return masked_lm_loss + sentence_order_loss + + # Clip negative labels to zero here to avoid NaNs and errors - those positions will get masked later anyway + unmasked_lm_losses = loss_fn(y_true=tf.nn.relu(labels["labels"]), y_pred=logits[0]) + # make sure only labels that are not equal to -100 + # are taken into account for the loss computation + lm_loss_mask = tf.cast(labels["labels"] != -100, dtype=unmasked_lm_losses.dtype) + masked_lm_losses = unmasked_lm_losses * lm_loss_mask + reduced_masked_lm_loss = tf.reduce_sum(masked_lm_losses) / tf.reduce_sum(lm_loss_mask) + + sop_logits = tf.reshape(logits[1], (-1, 2)) + # Clip negative labels to zero here to avoid NaNs and errors - those positions will get masked later anyway + unmasked_sop_loss = loss_fn(y_true=tf.nn.relu(labels["sentence_order_label"]), y_pred=sop_logits) + sop_loss_mask = tf.cast(labels["sentence_order_label"] != -100, dtype=unmasked_sop_loss.dtype) + + masked_sop_loss = unmasked_sop_loss * sop_loss_mask + reduced_masked_sop_loss = tf.reduce_sum(masked_sop_loss) / tf.reduce_sum(sop_loss_mask) + + return tf.reshape(reduced_masked_lm_loss + reduced_masked_sop_loss, (1,)) + + +class TFAlbertEmbeddings(keras.layers.Layer): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config: AlbertConfig, **kwargs): + super().__init__(**kwargs) + + self.config = config + self.embedding_size = config.embedding_size + self.max_position_embeddings = config.max_position_embeddings + self.initializer_range = config.initializer_range + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) + + def build(self, input_shape=None): + with tf.name_scope("word_embeddings"): + self.weight = self.add_weight( + name="weight", + shape=[self.config.vocab_size, self.embedding_size], + initializer=get_initializer(self.initializer_range), + ) + + with tf.name_scope("token_type_embeddings"): + self.token_type_embeddings = self.add_weight( + name="embeddings", + shape=[self.config.type_vocab_size, self.embedding_size], + initializer=get_initializer(self.initializer_range), + ) + + with tf.name_scope("position_embeddings"): + self.position_embeddings = self.add_weight( + name="embeddings", + shape=[self.max_position_embeddings, self.embedding_size], + initializer=get_initializer(self.initializer_range), + ) + + if self.built: + return + self.built = True + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.embedding_size]) + + # Copied from transformers.models.bert.modeling_tf_bert.TFBertEmbeddings.call + def call( + self, + input_ids: Optional[tf.Tensor] = None, + position_ids: Optional[tf.Tensor] = None, + token_type_ids: Optional[tf.Tensor] = None, + inputs_embeds: Optional[tf.Tensor] = None, + past_key_values_length=0, + training: bool = False, + ) -> tf.Tensor: + """ + Applies embedding based on inputs tensor. + + Returns: + final_embeddings (`tf.Tensor`): output embedding tensor. + """ + if input_ids is None and inputs_embeds is None: + raise ValueError("Need to provide either `input_ids` or `input_embeds`.") + + if input_ids is not None: + check_embeddings_within_bounds(input_ids, self.config.vocab_size) + inputs_embeds = tf.gather(params=self.weight, indices=input_ids) + + input_shape = shape_list(inputs_embeds)[:-1] + + if token_type_ids is None: + token_type_ids = tf.fill(dims=input_shape, value=0) + + if position_ids is None: + position_ids = tf.expand_dims( + tf.range(start=past_key_values_length, limit=input_shape[1] + past_key_values_length), axis=0 + ) + + position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids) + token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids) + final_embeddings = inputs_embeds + position_embeds + token_type_embeds + final_embeddings = self.LayerNorm(inputs=final_embeddings) + final_embeddings = self.dropout(inputs=final_embeddings, training=training) + + return final_embeddings + + +class TFAlbertAttention(keras.layers.Layer): + """Contains the complete attention sublayer, including both dropouts and layer norm.""" + + def __init__(self, config: AlbertConfig, **kwargs): + super().__init__(**kwargs) + + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number " + f"of attention heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.sqrt_att_head_size = math.sqrt(self.attention_head_size) + self.output_attentions = config.output_attentions + + self.query = keras.layers.Dense( + units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query" + ) + self.key = keras.layers.Dense( + units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key" + ) + self.value = keras.layers.Dense( + units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value" + ) + self.dense = keras.layers.Dense( + units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + # Two different dropout probabilities; see https://github.com/google-research/albert/blob/master/modeling.py#L971-L993 + self.attention_dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob) + self.output_dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.config = config + + def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor: + # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size] + tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.num_attention_heads, self.attention_head_size)) + + # Transpose the tensor from [batch_size, seq_length, num_attention_heads, attention_head_size] to [batch_size, num_attention_heads, seq_length, attention_head_size] + return tf.transpose(tensor, perm=[0, 2, 1, 3]) + + def call( + self, + input_tensor: tf.Tensor, + attention_mask: tf.Tensor, + head_mask: tf.Tensor, + output_attentions: bool, + training: bool = False, + ) -> Tuple[tf.Tensor]: + batch_size = shape_list(input_tensor)[0] + mixed_query_layer = self.query(inputs=input_tensor) + mixed_key_layer = self.key(inputs=input_tensor) + mixed_value_layer = self.value(inputs=input_tensor) + query_layer = self.transpose_for_scores(mixed_query_layer, batch_size) + key_layer = self.transpose_for_scores(mixed_key_layer, batch_size) + value_layer = self.transpose_for_scores(mixed_value_layer, batch_size) + + # Take the dot product between "query" and "key" to get the raw attention scores. + # (batch size, num_heads, seq_len_q, seq_len_k) + attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True) + dk = tf.cast(self.sqrt_att_head_size, dtype=attention_scores.dtype) + attention_scores = tf.divide(attention_scores, dk) + + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in TFAlbertModel call() function) + attention_scores = tf.add(attention_scores, attention_mask) + + # Normalize the attention scores to probabilities. + attention_probs = stable_softmax(logits=attention_scores, axis=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.attention_dropout(inputs=attention_probs, training=training) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = tf.multiply(attention_probs, head_mask) + + context_layer = tf.matmul(attention_probs, value_layer) + context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3]) + + # (batch_size, seq_len_q, all_head_size) + context_layer = tf.reshape(tensor=context_layer, shape=(batch_size, -1, self.all_head_size)) + self_outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + hidden_states = self_outputs[0] + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.output_dropout(inputs=hidden_states, training=training) + attention_output = self.LayerNorm(inputs=hidden_states + input_tensor) + + # add attentions if we output them + outputs = (attention_output,) + self_outputs[1:] + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "query", None) is not None: + with tf.name_scope(self.query.name): + self.query.build([None, None, self.config.hidden_size]) + if getattr(self, "key", None) is not None: + with tf.name_scope(self.key.name): + self.key.build([None, None, self.config.hidden_size]) + if getattr(self, "value", None) is not None: + with tf.name_scope(self.value.name): + self.value.build([None, None, self.config.hidden_size]) + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.hidden_size]) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) + + +class TFAlbertLayer(keras.layers.Layer): + def __init__(self, config: AlbertConfig, **kwargs): + super().__init__(**kwargs) + + self.attention = TFAlbertAttention(config, name="attention") + self.ffn = keras.layers.Dense( + units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="ffn" + ) + + if isinstance(config.hidden_act, str): + self.activation = get_tf_activation(config.hidden_act) + else: + self.activation = config.hidden_act + + self.ffn_output = keras.layers.Dense( + units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="ffn_output" + ) + self.full_layer_layer_norm = keras.layers.LayerNormalization( + epsilon=config.layer_norm_eps, name="full_layer_layer_norm" + ) + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.config = config + + def call( + self, + hidden_states: tf.Tensor, + attention_mask: tf.Tensor, + head_mask: tf.Tensor, + output_attentions: bool, + training: bool = False, + ) -> Tuple[tf.Tensor]: + attention_outputs = self.attention( + input_tensor=hidden_states, + attention_mask=attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + training=training, + ) + ffn_output = self.ffn(inputs=attention_outputs[0]) + ffn_output = self.activation(ffn_output) + ffn_output = self.ffn_output(inputs=ffn_output) + ffn_output = self.dropout(inputs=ffn_output, training=training) + hidden_states = self.full_layer_layer_norm(inputs=ffn_output + attention_outputs[0]) + + # add attentions if we output them + outputs = (hidden_states,) + attention_outputs[1:] + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "attention", None) is not None: + with tf.name_scope(self.attention.name): + self.attention.build(None) + if getattr(self, "ffn", None) is not None: + with tf.name_scope(self.ffn.name): + self.ffn.build([None, None, self.config.hidden_size]) + if getattr(self, "ffn_output", None) is not None: + with tf.name_scope(self.ffn_output.name): + self.ffn_output.build([None, None, self.config.intermediate_size]) + if getattr(self, "full_layer_layer_norm", None) is not None: + with tf.name_scope(self.full_layer_layer_norm.name): + self.full_layer_layer_norm.build([None, None, self.config.hidden_size]) + + +class TFAlbertLayerGroup(keras.layers.Layer): + def __init__(self, config: AlbertConfig, **kwargs): + super().__init__(**kwargs) + + self.albert_layers = [ + TFAlbertLayer(config, name=f"albert_layers_._{i}") for i in range(config.inner_group_num) + ] + + def call( + self, + hidden_states: tf.Tensor, + attention_mask: tf.Tensor, + head_mask: tf.Tensor, + output_attentions: bool, + output_hidden_states: bool, + training: bool = False, + ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]: + layer_hidden_states = () if output_hidden_states else None + layer_attentions = () if output_attentions else None + + for layer_index, albert_layer in enumerate(self.albert_layers): + if output_hidden_states: + layer_hidden_states = layer_hidden_states + (hidden_states,) + + layer_output = albert_layer( + hidden_states=hidden_states, + attention_mask=attention_mask, + head_mask=head_mask[layer_index], + output_attentions=output_attentions, + training=training, + ) + hidden_states = layer_output[0] + + if output_attentions: + layer_attentions = layer_attentions + (layer_output[1],) + + # Add last layer + if output_hidden_states: + layer_hidden_states = layer_hidden_states + (hidden_states,) + + return tuple(v for v in [hidden_states, layer_hidden_states, layer_attentions] if v is not None) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "albert_layers", None) is not None: + for layer in self.albert_layers: + with tf.name_scope(layer.name): + layer.build(None) + + +class TFAlbertTransformer(keras.layers.Layer): + def __init__(self, config: AlbertConfig, **kwargs): + super().__init__(**kwargs) + + self.num_hidden_layers = config.num_hidden_layers + self.num_hidden_groups = config.num_hidden_groups + # Number of layers in a hidden group + self.layers_per_group = int(config.num_hidden_layers / config.num_hidden_groups) + self.embedding_hidden_mapping_in = keras.layers.Dense( + units=config.hidden_size, + kernel_initializer=get_initializer(config.initializer_range), + name="embedding_hidden_mapping_in", + ) + self.albert_layer_groups = [ + TFAlbertLayerGroup(config, name=f"albert_layer_groups_._{i}") for i in range(config.num_hidden_groups) + ] + self.config = config + + def call( + self, + hidden_states: tf.Tensor, + attention_mask: tf.Tensor, + head_mask: tf.Tensor, + output_attentions: bool, + output_hidden_states: bool, + return_dict: bool, + training: bool = False, + ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]: + hidden_states = self.embedding_hidden_mapping_in(inputs=hidden_states) + all_attentions = () if output_attentions else None + all_hidden_states = (hidden_states,) if output_hidden_states else None + + for i in range(self.num_hidden_layers): + # Index of the hidden group + group_idx = int(i / (self.num_hidden_layers / self.num_hidden_groups)) + layer_group_output = self.albert_layer_groups[group_idx]( + hidden_states=hidden_states, + attention_mask=attention_mask, + head_mask=head_mask[group_idx * self.layers_per_group : (group_idx + 1) * self.layers_per_group], + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + training=training, + ) + hidden_states = layer_group_output[0] + + if output_attentions: + all_attentions = all_attentions + layer_group_output[-1] + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_attentions] if v is not None) + + return TFBaseModelOutput( + last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embedding_hidden_mapping_in", None) is not None: + with tf.name_scope(self.embedding_hidden_mapping_in.name): + self.embedding_hidden_mapping_in.build([None, None, self.config.embedding_size]) + if getattr(self, "albert_layer_groups", None) is not None: + for layer in self.albert_layer_groups: + with tf.name_scope(layer.name): + layer.build(None) + + +class TFAlbertPreTrainedModel(TFPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = AlbertConfig + base_model_prefix = "albert" + + +class TFAlbertMLMHead(keras.layers.Layer): + def __init__(self, config: AlbertConfig, input_embeddings: keras.layers.Layer, **kwargs): + super().__init__(**kwargs) + + self.config = config + self.embedding_size = config.embedding_size + self.dense = keras.layers.Dense( + config.embedding_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + if isinstance(config.hidden_act, str): + self.activation = get_tf_activation(config.hidden_act) + else: + self.activation = config.hidden_act + + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.decoder = input_embeddings + + def build(self, input_shape=None): + self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias") + self.decoder_bias = self.add_weight( + shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="decoder/bias" + ) + + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.hidden_size]) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.embedding_size]) + + def get_output_embeddings(self) -> keras.layers.Layer: + return self.decoder + + def set_output_embeddings(self, value: tf.Variable): + self.decoder.weight = value + self.decoder.vocab_size = shape_list(value)[0] + + def get_bias(self) -> Dict[str, tf.Variable]: + return {"bias": self.bias, "decoder_bias": self.decoder_bias} + + def set_bias(self, value: tf.Variable): + self.bias = value["bias"] + self.decoder_bias = value["decoder_bias"] + self.config.vocab_size = shape_list(value["bias"])[0] + + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.activation(hidden_states) + hidden_states = self.LayerNorm(inputs=hidden_states) + seq_length = shape_list(tensor=hidden_states)[1] + hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.embedding_size]) + hidden_states = tf.matmul(a=hidden_states, b=self.decoder.weight, transpose_b=True) + hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.config.vocab_size]) + hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.decoder_bias) + + return hidden_states + + +@keras_serializable +class TFAlbertMainLayer(keras.layers.Layer): + config_class = AlbertConfig + + def __init__(self, config: AlbertConfig, add_pooling_layer: bool = True, **kwargs): + super().__init__(**kwargs) + + self.config = config + + self.embeddings = TFAlbertEmbeddings(config, name="embeddings") + self.encoder = TFAlbertTransformer(config, name="encoder") + self.pooler = ( + keras.layers.Dense( + units=config.hidden_size, + kernel_initializer=get_initializer(config.initializer_range), + activation="tanh", + name="pooler", + ) + if add_pooling_layer + else None + ) + + def get_input_embeddings(self) -> keras.layers.Layer: + return self.embeddings + + def set_input_embeddings(self, value: tf.Variable): + self.embeddings.weight = value + self.embeddings.vocab_size = shape_list(value)[0] + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + raise NotImplementedError + + @unpack_inputs + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: bool = False, + ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]: + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = shape_list(input_ids) + elif inputs_embeds is not None: + input_shape = shape_list(inputs_embeds)[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if attention_mask is None: + attention_mask = tf.fill(dims=input_shape, value=1) + + if token_type_ids is None: + token_type_ids = tf.fill(dims=input_shape, value=0) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + training=training, + ) + + # We create a 3D attention mask from a 2D tensor mask. + # Sizes are [batch_size, 1, 1, to_seq_length] + # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] + # this attention mask is more simple than the triangular masking of causal attention + # used in OpenAI GPT, we just need to prepare the broadcast dimension here. + extended_attention_mask = tf.reshape(attention_mask, (input_shape[0], 1, 1, input_shape[1])) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + extended_attention_mask = tf.cast(extended_attention_mask, dtype=embedding_output.dtype) + one_cst = tf.constant(1.0, dtype=embedding_output.dtype) + ten_thousand_cst = tf.constant(-10000.0, dtype=embedding_output.dtype) + extended_attention_mask = tf.multiply(tf.subtract(one_cst, extended_attention_mask), ten_thousand_cst) + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + if head_mask is not None: + raise NotImplementedError + else: + head_mask = [None] * self.config.num_hidden_layers + + encoder_outputs = self.encoder( + hidden_states=embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(inputs=sequence_output[:, 0]) if self.pooler is not None else None + + if not return_dict: + return ( + sequence_output, + pooled_output, + ) + encoder_outputs[1:] + + return TFBaseModelOutputWithPooling( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embeddings", None) is not None: + with tf.name_scope(self.embeddings.name): + self.embeddings.build(None) + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "pooler", None) is not None: + with tf.name_scope(self.pooler.name): + self.pooler.build([None, None, self.config.hidden_size]) + + +@dataclass +class TFAlbertForPreTrainingOutput(ModelOutput): + """ + Output type of [`TFAlbertForPreTraining`]. + + Args: + prediction_logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + sop_logits (`tf.Tensor` of shape `(batch_size, 2)`): + Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation + before SoftMax). + hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[tf.Tensor] = None + prediction_logits: Optional[tf.Tensor] = None + sop_logits: Optional[tf.Tensor] = None + hidden_states: Tuple[tf.Tensor] | None = None + attentions: Tuple[tf.Tensor] | None = None + + +ALBERT_START_DOCSTRING = r""" + + This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and + behavior. + + + + TensorFlow models and layers in `transformers` accept two formats as input: + + - having all inputs as keyword arguments (like PyTorch models), or + - having all inputs as a list, tuple or dict in the first positional argument. + + The reason the second format is supported is that Keras methods prefer this format when passing inputs to models + and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just + pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second + format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with + the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first + positional argument: + + - a single Tensor with `input_ids` only and nothing else: `model(input_ids)` + - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: + `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])` + - a dictionary with one or several input Tensors associated to the input names given in the docstring: + `model({"input_ids": input_ids, "token_type_ids": token_type_ids})` + + Note that when creating models and layers with + [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry + about any of this, as you can just pass inputs like you would to any other Python function! + + + + Args: + config ([`AlbertConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +ALBERT_INPUTS_DOCSTRING = r""" + Args: + input_ids (`Numpy array` or `tf.Tensor` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and + [`PreTrainedTokenizer.encode`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + token_type_ids (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + position_ids (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + head_mask (`Numpy array` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`tf.Tensor` of shape `({0}, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the + config will be used instead. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. This argument can be used only in eager mode, in graph mode the value in the config will be + used instead. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in + eager mode, in graph mode the value will always be set to True. + training (`bool`, *optional*, defaults to `False`): + Whether or not to use the model in training mode (some modules like dropout modules have different + behaviors between training and evaluation). +""" + + +@add_start_docstrings( + "The bare Albert Model transformer outputting raw hidden-states without any specific head on top.", + ALBERT_START_DOCSTRING, +) +class TFAlbertModel(TFAlbertPreTrainedModel): + def __init__(self, config: AlbertConfig, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.albert = TFAlbertMainLayer(config, name="albert") + + @unpack_inputs + @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFBaseModelOutputWithPooling, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: Optional[bool] = False, + ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]: + outputs = self.albert( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "albert", None) is not None: + with tf.name_scope(self.albert.name): + self.albert.build(None) + + +@add_start_docstrings( + """ + Albert Model with two heads on top for pretraining: a `masked language modeling` head and a `sentence order + prediction` (classification) head. + """, + ALBERT_START_DOCSTRING, +) +class TFAlbertForPreTraining(TFAlbertPreTrainedModel, TFAlbertPreTrainingLoss): + # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model + _keys_to_ignore_on_load_unexpected = [r"predictions.decoder.weight"] + + def __init__(self, config: AlbertConfig, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.num_labels = config.num_labels + + self.albert = TFAlbertMainLayer(config, name="albert") + self.predictions = TFAlbertMLMHead(config, input_embeddings=self.albert.embeddings, name="predictions") + self.sop_classifier = TFAlbertSOPHead(config, name="sop_classifier") + + def get_lm_head(self) -> keras.layers.Layer: + return self.predictions + + @unpack_inputs + @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=TFAlbertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: np.ndarray | tf.Tensor | None = None, + sentence_order_label: np.ndarray | tf.Tensor | None = None, + training: Optional[bool] = False, + ) -> Union[TFAlbertForPreTrainingOutput, Tuple[tf.Tensor]]: + r""" + Return: + + Example: + + ```python + >>> import tensorflow as tf + >>> from transformers import AutoTokenizer, TFAlbertForPreTraining + + >>> tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2") + >>> model = TFAlbertForPreTraining.from_pretrained("albert/albert-base-v2") + + >>> input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :] + >>> # Batch size 1 + >>> outputs = model(input_ids) + + >>> prediction_logits = outputs.prediction_logits + >>> sop_logits = outputs.sop_logits + ```""" + + outputs = self.albert( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + sequence_output, pooled_output = outputs[:2] + prediction_scores = self.predictions(hidden_states=sequence_output) + sop_scores = self.sop_classifier(pooled_output=pooled_output, training=training) + total_loss = None + + if labels is not None and sentence_order_label is not None: + d_labels = {"labels": labels} + d_labels["sentence_order_label"] = sentence_order_label + total_loss = self.hf_compute_loss(labels=d_labels, logits=(prediction_scores, sop_scores)) + + if not return_dict: + output = (prediction_scores, sop_scores) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return TFAlbertForPreTrainingOutput( + loss=total_loss, + prediction_logits=prediction_scores, + sop_logits=sop_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "albert", None) is not None: + with tf.name_scope(self.albert.name): + self.albert.build(None) + if getattr(self, "predictions", None) is not None: + with tf.name_scope(self.predictions.name): + self.predictions.build(None) + if getattr(self, "sop_classifier", None) is not None: + with tf.name_scope(self.sop_classifier.name): + self.sop_classifier.build(None) + + +class TFAlbertSOPHead(keras.layers.Layer): + def __init__(self, config: AlbertConfig, **kwargs): + super().__init__(**kwargs) + + self.dropout = keras.layers.Dropout(rate=config.classifier_dropout_prob) + self.classifier = keras.layers.Dense( + units=config.num_labels, + kernel_initializer=get_initializer(config.initializer_range), + name="classifier", + ) + self.config = config + + def call(self, pooled_output: tf.Tensor, training: bool) -> tf.Tensor: + dropout_pooled_output = self.dropout(inputs=pooled_output, training=training) + logits = self.classifier(inputs=dropout_pooled_output) + + return logits + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build([None, None, self.config.hidden_size]) + + +@add_start_docstrings("""Albert Model with a `language modeling` head on top.""", ALBERT_START_DOCSTRING) +class TFAlbertForMaskedLM(TFAlbertPreTrainedModel, TFMaskedLanguageModelingLoss): + # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model + _keys_to_ignore_on_load_unexpected = [r"pooler", r"predictions.decoder.weight"] + + def __init__(self, config: AlbertConfig, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.albert = TFAlbertMainLayer(config, add_pooling_layer=False, name="albert") + self.predictions = TFAlbertMLMHead(config, input_embeddings=self.albert.embeddings, name="predictions") + + def get_lm_head(self) -> keras.layers.Layer: + return self.predictions + + @unpack_inputs + @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=TFMaskedLMOutput, config_class=_CONFIG_FOR_DOC) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: np.ndarray | tf.Tensor | None = None, + training: Optional[bool] = False, + ) -> Union[TFMaskedLMOutput, Tuple[tf.Tensor]]: + r""" + labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + + Returns: + + Example: + + ```python + >>> import tensorflow as tf + >>> from transformers import AutoTokenizer, TFAlbertForMaskedLM + + >>> tokenizer = AutoTokenizer.from_pretrained("albert/albert-base-v2") + >>> model = TFAlbertForMaskedLM.from_pretrained("albert/albert-base-v2") + + >>> # add mask_token + >>> inputs = tokenizer(f"The capital of [MASK] is Paris.", return_tensors="tf") + >>> logits = model(**inputs).logits + + >>> # retrieve index of [MASK] + >>> mask_token_index = tf.where(inputs.input_ids == tokenizer.mask_token_id)[0][1] + >>> predicted_token_id = tf.math.argmax(logits[0, mask_token_index], axis=-1) + >>> tokenizer.decode(predicted_token_id) + 'france' + ``` + + ```python + >>> labels = tokenizer("The capital of France is Paris.", return_tensors="tf")["input_ids"] + >>> labels = tf.where(inputs.input_ids == tokenizer.mask_token_id, labels, -100) + >>> outputs = model(**inputs, labels=labels) + >>> round(float(outputs.loss), 2) + 0.81 + ``` + """ + outputs = self.albert( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + sequence_output = outputs[0] + prediction_scores = self.predictions(hidden_states=sequence_output, training=training) + loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=prediction_scores) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + + return ((loss,) + output) if loss is not None else output + + return TFMaskedLMOutput( + loss=loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "albert", None) is not None: + with tf.name_scope(self.albert.name): + self.albert.build(None) + if getattr(self, "predictions", None) is not None: + with tf.name_scope(self.predictions.name): + self.predictions.build(None) + + +@add_start_docstrings( + """ + Albert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled + output) e.g. for GLUE tasks. + """, + ALBERT_START_DOCSTRING, +) +class TFAlbertForSequenceClassification(TFAlbertPreTrainedModel, TFSequenceClassificationLoss): + # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model + _keys_to_ignore_on_load_unexpected = [r"predictions"] + _keys_to_ignore_on_load_missing = [r"dropout"] + + def __init__(self, config: AlbertConfig, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.num_labels = config.num_labels + + self.albert = TFAlbertMainLayer(config, name="albert") + self.dropout = keras.layers.Dropout(rate=config.classifier_dropout_prob) + self.classifier = keras.layers.Dense( + units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + ) + self.config = config + + @unpack_inputs + @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint="vumichien/albert-base-v2-imdb", + output_type=TFSequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + expected_output="'LABEL_1'", + expected_loss=0.12, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: np.ndarray | tf.Tensor | None = None, + training: Optional[bool] = False, + ) -> Union[TFSequenceClassifierOutput, Tuple[tf.Tensor]]: + r""" + labels (`tf.Tensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + outputs = self.albert( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + pooled_output = outputs[1] + pooled_output = self.dropout(inputs=pooled_output, training=training) + logits = self.classifier(inputs=pooled_output) + loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits) + + if not return_dict: + output = (logits,) + outputs[2:] + + return ((loss,) + output) if loss is not None else output + + return TFSequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "albert", None) is not None: + with tf.name_scope(self.albert.name): + self.albert.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build([None, None, self.config.hidden_size]) + + +@add_start_docstrings( + """ + Albert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for + Named-Entity-Recognition (NER) tasks. + """, + ALBERT_START_DOCSTRING, +) +class TFAlbertForTokenClassification(TFAlbertPreTrainedModel, TFTokenClassificationLoss): + # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model + _keys_to_ignore_on_load_unexpected = [r"pooler", r"predictions"] + _keys_to_ignore_on_load_missing = [r"dropout"] + + def __init__(self, config: AlbertConfig, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.num_labels = config.num_labels + + self.albert = TFAlbertMainLayer(config, add_pooling_layer=False, name="albert") + classifier_dropout_prob = ( + config.classifier_dropout_prob + if config.classifier_dropout_prob is not None + else config.hidden_dropout_prob + ) + self.dropout = keras.layers.Dropout(rate=classifier_dropout_prob) + self.classifier = keras.layers.Dense( + units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + ) + self.config = config + + @unpack_inputs + @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFTokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: np.ndarray | tf.Tensor | None = None, + training: Optional[bool] = False, + ) -> Union[TFTokenClassifierOutput, Tuple[tf.Tensor]]: + r""" + labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. + """ + outputs = self.albert( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + sequence_output = outputs[0] + sequence_output = self.dropout(inputs=sequence_output, training=training) + logits = self.classifier(inputs=sequence_output) + loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits) + + if not return_dict: + output = (logits,) + outputs[2:] + + return ((loss,) + output) if loss is not None else output + + return TFTokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "albert", None) is not None: + with tf.name_scope(self.albert.name): + self.albert.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build([None, None, self.config.hidden_size]) + + +@add_start_docstrings( + """ + Albert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layer on top of the hidden-states output to compute `span start logits` and `span end logits`). + """, + ALBERT_START_DOCSTRING, +) +class TFAlbertForQuestionAnswering(TFAlbertPreTrainedModel, TFQuestionAnsweringLoss): + # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model + _keys_to_ignore_on_load_unexpected = [r"pooler", r"predictions"] + + def __init__(self, config: AlbertConfig, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.num_labels = config.num_labels + + self.albert = TFAlbertMainLayer(config, add_pooling_layer=False, name="albert") + self.qa_outputs = keras.layers.Dense( + units=config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" + ) + self.config = config + + @unpack_inputs + @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint="vumichien/albert-base-v2-squad2", + output_type=TFQuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + qa_target_start_index=12, + qa_target_end_index=13, + expected_output="'a nice puppet'", + expected_loss=7.36, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + start_positions: np.ndarray | tf.Tensor | None = None, + end_positions: np.ndarray | tf.Tensor | None = None, + training: Optional[bool] = False, + ) -> Union[TFQuestionAnsweringModelOutput, Tuple[tf.Tensor]]: + r""" + start_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + """ + outputs = self.albert( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + sequence_output = outputs[0] + logits = self.qa_outputs(inputs=sequence_output) + start_logits, end_logits = tf.split(value=logits, num_or_size_splits=2, axis=-1) + start_logits = tf.squeeze(input=start_logits, axis=-1) + end_logits = tf.squeeze(input=end_logits, axis=-1) + loss = None + + if start_positions is not None and end_positions is not None: + labels = {"start_position": start_positions} + labels["end_position"] = end_positions + loss = self.hf_compute_loss(labels=labels, logits=(start_logits, end_logits)) + + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + + return ((loss,) + output) if loss is not None else output + + return TFQuestionAnsweringModelOutput( + loss=loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "albert", None) is not None: + with tf.name_scope(self.albert.name): + self.albert.build(None) + if getattr(self, "qa_outputs", None) is not None: + with tf.name_scope(self.qa_outputs.name): + self.qa_outputs.build([None, None, self.config.hidden_size]) + + +@add_start_docstrings( + """ + Albert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a + softmax) e.g. for RocStories/SWAG tasks. + """, + ALBERT_START_DOCSTRING, +) +class TFAlbertForMultipleChoice(TFAlbertPreTrainedModel, TFMultipleChoiceLoss): + # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model + _keys_to_ignore_on_load_unexpected = [r"pooler", r"predictions"] + _keys_to_ignore_on_load_missing = [r"dropout"] + + def __init__(self, config: AlbertConfig, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.albert = TFAlbertMainLayer(config, name="albert") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.classifier = keras.layers.Dense( + units=1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + ) + self.config = config + + @unpack_inputs + @add_start_docstrings_to_model_forward(ALBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFMultipleChoiceModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: np.ndarray | tf.Tensor | None = None, + training: Optional[bool] = False, + ) -> Union[TFMultipleChoiceModelOutput, Tuple[tf.Tensor]]: + r""" + labels (`tf.Tensor` of shape `(batch_size,)`, *optional*): + Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices]` + where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above) + """ + + if input_ids is not None: + num_choices = shape_list(input_ids)[1] + seq_length = shape_list(input_ids)[2] + else: + num_choices = shape_list(inputs_embeds)[1] + seq_length = shape_list(inputs_embeds)[2] + + flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None + flat_attention_mask = ( + tf.reshape(tensor=attention_mask, shape=(-1, seq_length)) if attention_mask is not None else None + ) + flat_token_type_ids = ( + tf.reshape(tensor=token_type_ids, shape=(-1, seq_length)) if token_type_ids is not None else None + ) + flat_position_ids = ( + tf.reshape(tensor=position_ids, shape=(-1, seq_length)) if position_ids is not None else None + ) + flat_inputs_embeds = ( + tf.reshape(tensor=inputs_embeds, shape=(-1, seq_length, shape_list(inputs_embeds)[3])) + if inputs_embeds is not None + else None + ) + outputs = self.albert( + input_ids=flat_input_ids, + attention_mask=flat_attention_mask, + token_type_ids=flat_token_type_ids, + position_ids=flat_position_ids, + head_mask=head_mask, + inputs_embeds=flat_inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + pooled_output = outputs[1] + pooled_output = self.dropout(inputs=pooled_output, training=training) + logits = self.classifier(inputs=pooled_output) + reshaped_logits = tf.reshape(tensor=logits, shape=(-1, num_choices)) + loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=reshaped_logits) + + if not return_dict: + output = (reshaped_logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TFMultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "albert", None) is not None: + with tf.name_scope(self.albert.name): + self.albert.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build([None, None, self.config.hidden_size]) + + +__all__ = [ + "TFAlbertPreTrainedModel", + "TFAlbertModel", + "TFAlbertForPreTraining", + "TFAlbertForMaskedLM", + "TFAlbertForSequenceClassification", + "TFAlbertForTokenClassification", + "TFAlbertForQuestionAnswering", + "TFAlbertForMultipleChoice", + "TFAlbertMainLayer", +] diff --git a/docs/transformers/build/lib/transformers/models/albert/tokenization_albert.py b/docs/transformers/build/lib/transformers/models/albert/tokenization_albert.py new file mode 100644 index 0000000000000000000000000000000000000000..7ecd9f907eb7b694e748094faf6ad145c388133d --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/albert/tokenization_albert.py @@ -0,0 +1,350 @@ +# coding=utf-8 +# Copyright 2018 Google AI, Google Brain and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for ALBERT model.""" + +import os +import unicodedata +from shutil import copyfile +from typing import Any, Dict, List, Optional, Tuple + +import sentencepiece as spm + +from ...tokenization_utils import AddedToken, PreTrainedTokenizer +from ...utils import logging +from ...utils.import_utils import requires + + +logger = logging.get_logger(__name__) +VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"} + + +SPIECE_UNDERLINE = "▁" + + +@requires(backends=("sentencepiece",)) +class AlbertTokenizer(PreTrainedTokenizer): + """ + Construct an ALBERT tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece). + + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that + contains the vocabulary necessary to instantiate a tokenizer. + do_lower_case (`bool`, *optional*, defaults to `True`): + Whether or not to lowercase the input when tokenizing. + remove_space (`bool`, *optional*, defaults to `True`): + Whether or not to strip the text when tokenizing (removing excess spaces before and after the string). + keep_accents (`bool`, *optional*, defaults to `False`): + Whether or not to keep accents when tokenizing. + bos_token (`str`, *optional*, defaults to `"[CLS]"`): + The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. + + + + When building a sequence using special tokens, this is not the token that is used for the beginning of + sequence. The token used is the `cls_token`. + + + + eos_token (`str`, *optional*, defaults to `"[SEP]"`): + The end of sequence token. + + + + When building a sequence using special tokens, this is not the token that is used for the end of sequence. + The token used is the `sep_token`. + + + + unk_token (`str`, *optional*, defaults to `""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + sep_token (`str`, *optional*, defaults to `"[SEP]"`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + pad_token (`str`, *optional*, defaults to `""`): + The token used for padding, for example when batching sequences of different lengths. + cls_token (`str`, *optional*, defaults to `"[CLS]"`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + mask_token (`str`, *optional*, defaults to `"[MASK]"`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + sp_model_kwargs (`dict`, *optional*): + Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for + SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, + to set: + + - `enable_sampling`: Enable subword regularization. + - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - `nbest_size = {0,1}`: No sampling is performed. + - `nbest_size > 1`: samples from the nbest_size results. + - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. + + - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. + + Attributes: + sp_model (`SentencePieceProcessor`): + The *SentencePiece* processor that is used for every conversion (string, tokens and IDs). + """ + + vocab_files_names = VOCAB_FILES_NAMES + + def __init__( + self, + vocab_file, + do_lower_case=True, + remove_space=True, + keep_accents=False, + bos_token="[CLS]", + eos_token="[SEP]", + unk_token="", + sep_token="[SEP]", + pad_token="", + cls_token="[CLS]", + mask_token="[MASK]", + sp_model_kwargs: Optional[Dict[str, Any]] = None, + **kwargs, + ) -> None: + # Mask token behave like a normal word, i.e. include the space before it and + # is included in the raw text, there should be a match in a non-normalized sentence. + mask_token = ( + AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False) + if isinstance(mask_token, str) + else mask_token + ) + + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + + self.do_lower_case = do_lower_case + self.remove_space = remove_space + self.keep_accents = keep_accents + self.vocab_file = vocab_file + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(vocab_file) + + super().__init__( + do_lower_case=do_lower_case, + remove_space=remove_space, + keep_accents=keep_accents, + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + sp_model_kwargs=self.sp_model_kwargs, + **kwargs, + ) + + @property + def vocab_size(self) -> int: + return len(self.sp_model) + + def get_vocab(self) -> Dict[str, int]: + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def __getstate__(self): + state = self.__dict__.copy() + state["sp_model"] = None + return state + + def __setstate__(self, d): + self.__dict__ = d + + # for backward compatibility + if not hasattr(self, "sp_model_kwargs"): + self.sp_model_kwargs = {} + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(self.vocab_file) + + def preprocess_text(self, inputs): + if self.remove_space: + outputs = " ".join(inputs.strip().split()) + else: + outputs = inputs + outputs = outputs.replace("``", '"').replace("''", '"') + + if not self.keep_accents: + outputs = unicodedata.normalize("NFKD", outputs) + outputs = "".join([c for c in outputs if not unicodedata.combining(c)]) + if self.do_lower_case: + outputs = outputs.lower() + + return outputs + + def _tokenize(self, text: str) -> List[str]: + """Tokenize a string.""" + text = self.preprocess_text(text) + pieces = self.sp_model.encode(text, out_type=str) + new_pieces = [] + for piece in pieces: + if len(piece) > 1 and piece[-1] == str(",") and piece[-2].isdigit(): + # Logic to handle special cases see https://github.com/google-research/bert/blob/master/README.md#tokenization + # `9,9` -> ['▁9', ',', '9'] instead of [`_9,`, '9'] + cur_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(SPIECE_UNDERLINE, "")) + if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE: + if len(cur_pieces[0]) == 1: + cur_pieces = cur_pieces[1:] + else: + cur_pieces[0] = cur_pieces[0][1:] + cur_pieces.append(piece[-1]) + new_pieces.extend(cur_pieces) + else: + new_pieces.append(piece) + + return new_pieces + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.sp_model.PieceToId(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.sp_model.IdToPiece(index) + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + current_sub_tokens = [] + out_string = "" + prev_is_special = False + for token in tokens: + # make sure that special tokens are not decoded using sentencepiece model + if token in self.all_special_tokens: + if not prev_is_special: + out_string += " " + out_string += self.sp_model.decode(current_sub_tokens) + token + prev_is_special = True + current_sub_tokens = [] + else: + current_sub_tokens.append(token) + prev_is_special = False + out_string += self.sp_model.decode(current_sub_tokens) + return out_string.strip() + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. An ALBERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + if token_ids_1 is None: + return cls + token_ids_0 + sep + return cls + token_ids_0 + sep + token_ids_1 + sep + + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT + sequence pair mask has the following format: + + ``` + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + ``` + + If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s). + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s). + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file): + copyfile(self.vocab_file, out_vocab_file) + elif not os.path.isfile(self.vocab_file): + with open(out_vocab_file, "wb") as fi: + content_spiece_model = self.sp_model.serialized_model_proto() + fi.write(content_spiece_model) + + return (out_vocab_file,) + + +__all__ = ["AlbertTokenizer"] diff --git a/docs/transformers/build/lib/transformers/models/albert/tokenization_albert_fast.py b/docs/transformers/build/lib/transformers/models/albert/tokenization_albert_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..6e7b110b0afad7e65fba7be2d951ee7a7fba4788 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/albert/tokenization_albert_fast.py @@ -0,0 +1,212 @@ +# coding=utf-8 +# Copyright 2018 Google AI, Google Brain and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for ALBERT model.""" + +import os +from shutil import copyfile +from typing import List, Optional, Tuple + +from ...tokenization_utils import AddedToken +from ...tokenization_utils_fast import PreTrainedTokenizerFast +from ...utils import is_sentencepiece_available, logging + + +if is_sentencepiece_available(): + from .tokenization_albert import AlbertTokenizer +else: + AlbertTokenizer = None + +logger = logging.get_logger(__name__) +VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.json"} + + +SPIECE_UNDERLINE = "▁" + + +class AlbertTokenizerFast(PreTrainedTokenizerFast): + """ + Construct a "fast" ALBERT tokenizer (backed by HuggingFace's *tokenizers* library). Based on + [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models). This + tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods + + Args: + vocab_file (`str`): + [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that + contains the vocabulary necessary to instantiate a tokenizer. + do_lower_case (`bool`, *optional*, defaults to `True`): + Whether or not to lowercase the input when tokenizing. + remove_space (`bool`, *optional*, defaults to `True`): + Whether or not to strip the text when tokenizing (removing excess spaces before and after the string). + keep_accents (`bool`, *optional*, defaults to `False`): + Whether or not to keep accents when tokenizing. + bos_token (`str`, *optional*, defaults to `"[CLS]"`): + The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. + + + + When building a sequence using special tokens, this is not the token that is used for the beginning of + sequence. The token used is the `cls_token`. + + + + eos_token (`str`, *optional*, defaults to `"[SEP]"`): + The end of sequence token. .. note:: When building a sequence using special tokens, this is not the token + that is used for the end of sequence. The token used is the `sep_token`. + unk_token (`str`, *optional*, defaults to `""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + sep_token (`str`, *optional*, defaults to `"[SEP]"`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + pad_token (`str`, *optional*, defaults to `""`): + The token used for padding, for example when batching sequences of different lengths. + cls_token (`str`, *optional*, defaults to `"[CLS]"`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + mask_token (`str`, *optional*, defaults to `"[MASK]"`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + """ + + vocab_files_names = VOCAB_FILES_NAMES + slow_tokenizer_class = AlbertTokenizer + + def __init__( + self, + vocab_file=None, + tokenizer_file=None, + do_lower_case=True, + remove_space=True, + keep_accents=False, + bos_token="[CLS]", + eos_token="[SEP]", + unk_token="", + sep_token="[SEP]", + pad_token="", + cls_token="[CLS]", + mask_token="[MASK]", + **kwargs, + ): + # Mask token behave like a normal word, i.e. include the space before it and + # is included in the raw text, there should be a match in a non-normalized sentence. + mask_token = ( + AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False) + if isinstance(mask_token, str) + else mask_token + ) + + super().__init__( + vocab_file, + tokenizer_file=tokenizer_file, + do_lower_case=do_lower_case, + remove_space=remove_space, + keep_accents=keep_accents, + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + **kwargs, + ) + + self.do_lower_case = do_lower_case + self.remove_space = remove_space + self.keep_accents = keep_accents + self.vocab_file = vocab_file + + @property + def can_save_slow_tokenizer(self) -> bool: + return os.path.isfile(self.vocab_file) if self.vocab_file else False + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. An ALBERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + if token_ids_1 is None: + return cls + token_ids_0 + sep + return cls + token_ids_0 + sep + token_ids_1 + sep + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT + sequence pair mask has the following format: + + ``` + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + ``` + + if token_ids_1 is None, only returns the first portion of the mask (0s). + + Args: + token_ids_0 (`List[int]`): + List of ids. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s). + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + if not self.can_save_slow_tokenizer: + raise ValueError( + "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow " + "tokenizer." + ) + + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): + copyfile(self.vocab_file, out_vocab_file) + + return (out_vocab_file,) + + +__all__ = ["AlbertTokenizerFast"] diff --git a/docs/transformers/build/lib/transformers/models/align/__init__.py b/docs/transformers/build/lib/transformers/models/align/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..aaa64dfb6064b10e820fca01e9e632aa7ecd68c6 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/align/__init__.py @@ -0,0 +1,28 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_align import * + from .modeling_align import * + from .processing_align import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/docs/transformers/build/lib/transformers/models/align/configuration_align.py b/docs/transformers/build/lib/transformers/models/align/configuration_align.py new file mode 100644 index 0000000000000000000000000000000000000000..a22ab1dc40f8d0d147e5f98f122a64e53bf18fcc --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/align/configuration_align.py @@ -0,0 +1,349 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""ALIGN model configuration""" + +from typing import TYPE_CHECKING, List + + +if TYPE_CHECKING: + pass + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class AlignTextConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`AlignTextModel`]. It is used to instantiate a + ALIGN text encoder according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the text encoder of the ALIGN + [kakaobrain/align-base](https://huggingface.co/kakaobrain/align-base) architecture. The default values here are + copied from BERT. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the Align Text model. Defines the number of different tokens that can be represented by + the `inputs_ids` passed when calling [`AlignTextModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the `token_type_ids` passed when calling [`AlignTextModel`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + pad_token_id (`int`, *optional*, defaults to 0): + Padding token id. + position_embedding_type (`str`, *optional*, defaults to `"absolute"`): + Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For + positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to + [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). + For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models + with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658). + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + + Example: + + ```python + >>> from transformers import AlignTextConfig, AlignTextModel + + >>> # Initializing a AlignTextConfig with kakaobrain/align-base style configuration + >>> configuration = AlignTextConfig() + + >>> # Initializing a AlignTextModel (with random weights) from the kakaobrain/align-base style configuration + >>> model = AlignTextModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "align_text_model" + base_config_key = "text_config" + + def __init__( + self, + vocab_size=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=0, + position_embedding_type="absolute", + use_cache=True, + **kwargs, + ): + super().__init__(**kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.position_embedding_type = position_embedding_type + self.use_cache = use_cache + self.pad_token_id = pad_token_id + + +class AlignVisionConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`AlignVisionModel`]. It is used to instantiate a + ALIGN vision encoder according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the vision encoder of the ALIGN + [kakaobrain/align-base](https://huggingface.co/kakaobrain/align-base) architecture. The default values are copied + from EfficientNet (efficientnet-b7) + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + image_size (`int`, *optional*, defaults to 600): + The input image size. + width_coefficient (`float`, *optional*, defaults to 2.0): + Scaling coefficient for network width at each stage. + depth_coefficient (`float`, *optional*, defaults to 3.1): + Scaling coefficient for network depth at each stage. + depth_divisor `int`, *optional*, defaults to 8): + A unit of network width. + kernel_sizes (`List[int]`, *optional*, defaults to `[3, 3, 5, 3, 5, 5, 3]`): + List of kernel sizes to be used in each block. + in_channels (`List[int]`, *optional*, defaults to `[32, 16, 24, 40, 80, 112, 192]`): + List of input channel sizes to be used in each block for convolutional layers. + out_channels (`List[int]`, *optional*, defaults to `[16, 24, 40, 80, 112, 192, 320]`): + List of output channel sizes to be used in each block for convolutional layers. + depthwise_padding (`List[int]`, *optional*, defaults to `[]`): + List of block indices with square padding. + strides (`List[int]`, *optional*, defaults to `[1, 2, 2, 2, 1, 2, 1]`): + List of stride sizes to be used in each block for convolutional layers. + num_block_repeats (`List[int]`, *optional*, defaults to `[1, 2, 2, 3, 3, 4, 1]`): + List of the number of times each block is to repeated. + expand_ratios (`List[int]`, *optional*, defaults to `[1, 6, 6, 6, 6, 6, 6]`): + List of scaling coefficient of each block. + squeeze_expansion_ratio (`float`, *optional*, defaults to 0.25): + Squeeze expansion ratio. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in each block. If string, `"gelu"`, `"relu"`, + `"selu", `"gelu_new"`, `"silu"` and `"mish"` are supported. + hidden_dim (`int`, *optional*, defaults to 1280): + The hidden dimension of the layer before the classification head. + pooling_type (`str` or `function`, *optional*, defaults to `"mean"`): + Type of final pooling to be applied before the dense classification head. Available options are [`"mean"`, + `"max"`] + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + batch_norm_eps (`float`, *optional*, defaults to 1e-3): + The epsilon used by the batch normalization layers. + batch_norm_momentum (`float`, *optional*, defaults to 0.99): + The momentum used by the batch normalization layers. + drop_connect_rate (`float`, *optional*, defaults to 0.2): + The drop rate for skip connections. + + Example: + + ```python + >>> from transformers import AlignVisionConfig, AlignVisionModel + + >>> # Initializing a AlignVisionConfig with kakaobrain/align-base style configuration + >>> configuration = AlignVisionConfig() + + >>> # Initializing a AlignVisionModel (with random weights) from the kakaobrain/align-base style configuration + >>> model = AlignVisionModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "align_vision_model" + base_config_key = "vision_config" + + def __init__( + self, + num_channels: int = 3, + image_size: int = 600, + width_coefficient: float = 2.0, + depth_coefficient: float = 3.1, + depth_divisor: int = 8, + kernel_sizes: List[int] = [3, 3, 5, 3, 5, 5, 3], + in_channels: List[int] = [32, 16, 24, 40, 80, 112, 192], + out_channels: List[int] = [16, 24, 40, 80, 112, 192, 320], + depthwise_padding: List[int] = [], + strides: List[int] = [1, 2, 2, 2, 1, 2, 1], + num_block_repeats: List[int] = [1, 2, 2, 3, 3, 4, 1], + expand_ratios: List[int] = [1, 6, 6, 6, 6, 6, 6], + squeeze_expansion_ratio: float = 0.25, + hidden_act: str = "swish", + hidden_dim: int = 2560, + pooling_type: str = "mean", + initializer_range: float = 0.02, + batch_norm_eps: float = 0.001, + batch_norm_momentum: float = 0.99, + drop_connect_rate: float = 0.2, + **kwargs, + ): + super().__init__(**kwargs) + + self.num_channels = num_channels + self.image_size = image_size + self.width_coefficient = width_coefficient + self.depth_coefficient = depth_coefficient + self.depth_divisor = depth_divisor + self.kernel_sizes = kernel_sizes + self.in_channels = in_channels + self.out_channels = out_channels + self.depthwise_padding = depthwise_padding + self.strides = strides + self.num_block_repeats = num_block_repeats + self.expand_ratios = expand_ratios + self.squeeze_expansion_ratio = squeeze_expansion_ratio + self.hidden_act = hidden_act + self.hidden_dim = hidden_dim + self.pooling_type = pooling_type + self.initializer_range = initializer_range + self.batch_norm_eps = batch_norm_eps + self.batch_norm_momentum = batch_norm_momentum + self.drop_connect_rate = drop_connect_rate + self.num_hidden_layers = sum(num_block_repeats) * 4 + + +class AlignConfig(PretrainedConfig): + r""" + [`AlignConfig`] is the configuration class to store the configuration of a [`AlignModel`]. It is used to + instantiate a ALIGN model according to the specified arguments, defining the text model and vision model configs. + Instantiating a configuration with the defaults will yield a similar configuration to that of the ALIGN + [kakaobrain/align-base](https://huggingface.co/kakaobrain/align-base) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + text_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`AlignTextConfig`]. + vision_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`AlignVisionConfig`]. + projection_dim (`int`, *optional*, defaults to 640): + Dimensionality of text and vision projection layers. + temperature_init_value (`float`, *optional*, defaults to 1.0): + The initial value of the *temperature* parameter. Default is used as per the original ALIGN implementation. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + kwargs (*optional*): + Dictionary of keyword arguments. + + Example: + + ```python + >>> from transformers import AlignConfig, AlignModel + + >>> # Initializing a AlignConfig with kakaobrain/align-base style configuration + >>> configuration = AlignConfig() + + >>> # Initializing a AlignModel (with random weights) from the kakaobrain/align-base style configuration + >>> model = AlignModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + + >>> # We can also initialize a AlignConfig from a AlignTextConfig and a AlignVisionConfig + >>> from transformers import AlignTextConfig, AlignVisionConfig + + >>> # Initializing ALIGN Text and Vision configurations + >>> config_text = AlignTextConfig() + >>> config_vision = AlignVisionConfig() + + >>> config = AlignConfig.from_text_vision_configs(config_text, config_vision) + ```""" + + model_type = "align" + sub_configs = {"text_config": AlignTextConfig, "vision_config": AlignVisionConfig} + + def __init__( + self, + text_config=None, + vision_config=None, + projection_dim=640, + temperature_init_value=1.0, + initializer_range=0.02, + **kwargs, + ): + super().__init__(**kwargs) + + if text_config is None: + text_config = {} + logger.info("text_config is None. Initializing the AlignTextConfig with default values.") + + if vision_config is None: + vision_config = {} + logger.info("vision_config is None. Initializing the AlignVisionConfig with default values.") + + self.text_config = AlignTextConfig(**text_config) + self.vision_config = AlignVisionConfig(**vision_config) + + self.projection_dim = projection_dim + self.temperature_init_value = temperature_init_value + self.initializer_range = initializer_range + + @classmethod + def from_text_vision_configs(cls, text_config: AlignTextConfig, vision_config: AlignVisionConfig, **kwargs): + r""" + Instantiate a [`AlignConfig`] (or a derived class) from align text model configuration and align vision model + configuration. + + Returns: + [`AlignConfig`]: An instance of a configuration object + """ + + return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs) + + +__all__ = ["AlignTextConfig", "AlignVisionConfig", "AlignConfig"] diff --git a/docs/transformers/build/lib/transformers/models/align/convert_align_tf_to_hf.py b/docs/transformers/build/lib/transformers/models/align/convert_align_tf_to_hf.py new file mode 100644 index 0000000000000000000000000000000000000000..610db8482f91628164f2f48ea948ed357ac5ea93 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/align/convert_align_tf_to_hf.py @@ -0,0 +1,389 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert ALIGN checkpoints from the original repository.""" + +import argparse +import os + +import align +import numpy as np +import requests +import tensorflow as tf +import torch +from PIL import Image +from tokenizer import Tokenizer + +from transformers import ( + AlignConfig, + AlignModel, + AlignProcessor, + BertConfig, + BertTokenizer, + EfficientNetConfig, + EfficientNetImageProcessor, +) +from transformers.utils import logging + + +logging.set_verbosity_info() +logger = logging.get_logger(__name__) + + +def preprocess(image): + image = tf.image.resize(image, (346, 346)) + image = tf.image.crop_to_bounding_box(image, (346 - 289) // 2, (346 - 289) // 2, 289, 289) + return image + + +def get_align_config(): + vision_config = EfficientNetConfig.from_pretrained("google/efficientnet-b7") + vision_config.image_size = 289 + vision_config.hidden_dim = 640 + vision_config.id2label = {"0": "LABEL_0", "1": "LABEL_1"} + vision_config.label2id = {"LABEL_0": 0, "LABEL_1": 1} + vision_config.depthwise_padding = [] + + text_config = BertConfig() + config = AlignConfig.from_text_vision_configs( + text_config=text_config, vision_config=vision_config, projection_dim=640 + ) + return config + + +# We will verify our results on an image of cute cats +def prepare_img(): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + im = Image.open(requests.get(url, stream=True).raw) + return im + + +def get_processor(): + image_processor = EfficientNetImageProcessor( + do_center_crop=True, + rescale_factor=1 / 127.5, + rescale_offset=True, + do_normalize=False, + include_top=False, + resample=Image.BILINEAR, + ) + tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased") + tokenizer.model_max_length = 64 + processor = AlignProcessor(image_processor=image_processor, tokenizer=tokenizer) + return processor + + +# here we list all keys to be renamed (original name on the left, our name on the right) +def rename_keys(original_param_names): + # EfficientNet image encoder + block_names = [v.split("_")[0].split("block")[1] for v in original_param_names if v.startswith("block")] + block_names = list(set(block_names)) + block_names = sorted(block_names) + num_blocks = len(block_names) + block_name_mapping = {b: str(i) for b, i in zip(block_names, range(num_blocks))} + + rename_keys = [] + rename_keys.append(("stem_conv/kernel:0", "embeddings.convolution.weight")) + rename_keys.append(("stem_bn/gamma:0", "embeddings.batchnorm.weight")) + rename_keys.append(("stem_bn/beta:0", "embeddings.batchnorm.bias")) + rename_keys.append(("stem_bn/moving_mean:0", "embeddings.batchnorm.running_mean")) + rename_keys.append(("stem_bn/moving_variance:0", "embeddings.batchnorm.running_var")) + + for b in block_names: + hf_b = block_name_mapping[b] + rename_keys.append((f"block{b}_expand_conv/kernel:0", f"encoder.blocks.{hf_b}.expansion.expand_conv.weight")) + rename_keys.append((f"block{b}_expand_bn/gamma:0", f"encoder.blocks.{hf_b}.expansion.expand_bn.weight")) + rename_keys.append((f"block{b}_expand_bn/beta:0", f"encoder.blocks.{hf_b}.expansion.expand_bn.bias")) + rename_keys.append( + (f"block{b}_expand_bn/moving_mean:0", f"encoder.blocks.{hf_b}.expansion.expand_bn.running_mean") + ) + rename_keys.append( + (f"block{b}_expand_bn/moving_variance:0", f"encoder.blocks.{hf_b}.expansion.expand_bn.running_var") + ) + rename_keys.append( + (f"block{b}_dwconv/depthwise_kernel:0", f"encoder.blocks.{hf_b}.depthwise_conv.depthwise_conv.weight") + ) + rename_keys.append((f"block{b}_bn/gamma:0", f"encoder.blocks.{hf_b}.depthwise_conv.depthwise_norm.weight")) + rename_keys.append((f"block{b}_bn/beta:0", f"encoder.blocks.{hf_b}.depthwise_conv.depthwise_norm.bias")) + rename_keys.append( + (f"block{b}_bn/moving_mean:0", f"encoder.blocks.{hf_b}.depthwise_conv.depthwise_norm.running_mean") + ) + rename_keys.append( + (f"block{b}_bn/moving_variance:0", f"encoder.blocks.{hf_b}.depthwise_conv.depthwise_norm.running_var") + ) + + rename_keys.append((f"block{b}_se_reduce/kernel:0", f"encoder.blocks.{hf_b}.squeeze_excite.reduce.weight")) + rename_keys.append((f"block{b}_se_reduce/bias:0", f"encoder.blocks.{hf_b}.squeeze_excite.reduce.bias")) + rename_keys.append((f"block{b}_se_expand/kernel:0", f"encoder.blocks.{hf_b}.squeeze_excite.expand.weight")) + rename_keys.append((f"block{b}_se_expand/bias:0", f"encoder.blocks.{hf_b}.squeeze_excite.expand.bias")) + rename_keys.append( + (f"block{b}_project_conv/kernel:0", f"encoder.blocks.{hf_b}.projection.project_conv.weight") + ) + rename_keys.append((f"block{b}_project_bn/gamma:0", f"encoder.blocks.{hf_b}.projection.project_bn.weight")) + rename_keys.append((f"block{b}_project_bn/beta:0", f"encoder.blocks.{hf_b}.projection.project_bn.bias")) + rename_keys.append( + (f"block{b}_project_bn/moving_mean:0", f"encoder.blocks.{hf_b}.projection.project_bn.running_mean") + ) + rename_keys.append( + (f"block{b}_project_bn/moving_variance:0", f"encoder.blocks.{hf_b}.projection.project_bn.running_var") + ) + + key_mapping = {} + for item in rename_keys: + if item[0] in original_param_names: + key_mapping[item[0]] = "vision_model." + item[1] + + # BERT text encoder + rename_keys = [] + old = "tf_bert_model/bert" + new = "text_model" + for i in range(12): + rename_keys.append( + ( + f"{old}/encoder/layer_._{i}/attention/self/query/kernel:0", + f"{new}.encoder.layer.{i}.attention.self.query.weight", + ) + ) + rename_keys.append( + ( + f"{old}/encoder/layer_._{i}/attention/self/query/bias:0", + f"{new}.encoder.layer.{i}.attention.self.query.bias", + ) + ) + rename_keys.append( + ( + f"{old}/encoder/layer_._{i}/attention/self/key/kernel:0", + f"{new}.encoder.layer.{i}.attention.self.key.weight", + ) + ) + rename_keys.append( + ( + f"{old}/encoder/layer_._{i}/attention/self/key/bias:0", + f"{new}.encoder.layer.{i}.attention.self.key.bias", + ) + ) + rename_keys.append( + ( + f"{old}/encoder/layer_._{i}/attention/self/value/kernel:0", + f"{new}.encoder.layer.{i}.attention.self.value.weight", + ) + ) + rename_keys.append( + ( + f"{old}/encoder/layer_._{i}/attention/self/value/bias:0", + f"{new}.encoder.layer.{i}.attention.self.value.bias", + ) + ) + rename_keys.append( + ( + f"{old}/encoder/layer_._{i}/attention/output/dense/kernel:0", + f"{new}.encoder.layer.{i}.attention.output.dense.weight", + ) + ) + rename_keys.append( + ( + f"{old}/encoder/layer_._{i}/attention/output/dense/bias:0", + f"{new}.encoder.layer.{i}.attention.output.dense.bias", + ) + ) + rename_keys.append( + ( + f"{old}/encoder/layer_._{i}/attention/output/LayerNorm/gamma:0", + f"{new}.encoder.layer.{i}.attention.output.LayerNorm.weight", + ) + ) + rename_keys.append( + ( + f"{old}/encoder/layer_._{i}/attention/output/LayerNorm/beta:0", + f"{new}.encoder.layer.{i}.attention.output.LayerNorm.bias", + ) + ) + rename_keys.append( + ( + f"{old}/encoder/layer_._{i}/intermediate/dense/kernel:0", + f"{new}.encoder.layer.{i}.intermediate.dense.weight", + ) + ) + rename_keys.append( + ( + f"{old}/encoder/layer_._{i}/intermediate/dense/bias:0", + f"{new}.encoder.layer.{i}.intermediate.dense.bias", + ) + ) + rename_keys.append( + (f"{old}/encoder/layer_._{i}/output/dense/kernel:0", f"{new}.encoder.layer.{i}.output.dense.weight") + ) + rename_keys.append( + (f"{old}/encoder/layer_._{i}/output/dense/bias:0", f"{new}.encoder.layer.{i}.output.dense.bias") + ) + rename_keys.append( + (f"{old}/encoder/layer_._{i}/output/LayerNorm/gamma:0", f"{new}.encoder.layer.{i}.output.LayerNorm.weight") + ) + rename_keys.append( + (f"{old}/encoder/layer_._{i}/output/LayerNorm/beta:0", f"{new}.encoder.layer.{i}.output.LayerNorm.bias") + ) + + rename_keys.append((f"{old}/embeddings/word_embeddings/weight:0", f"{new}.embeddings.word_embeddings.weight")) + rename_keys.append( + (f"{old}/embeddings/position_embeddings/embeddings:0", f"{new}.embeddings.position_embeddings.weight") + ) + rename_keys.append( + (f"{old}/embeddings/token_type_embeddings/embeddings:0", f"{new}.embeddings.token_type_embeddings.weight") + ) + rename_keys.append((f"{old}/embeddings/LayerNorm/gamma:0", f"{new}.embeddings.LayerNorm.weight")) + rename_keys.append((f"{old}/embeddings/LayerNorm/beta:0", f"{new}.embeddings.LayerNorm.bias")) + + rename_keys.append((f"{old}/pooler/dense/kernel:0", f"{new}.pooler.dense.weight")) + rename_keys.append((f"{old}/pooler/dense/bias:0", f"{new}.pooler.dense.bias")) + rename_keys.append(("dense/kernel:0", "text_projection.weight")) + rename_keys.append(("dense/bias:0", "text_projection.bias")) + rename_keys.append(("dense/bias:0", "text_projection.bias")) + rename_keys.append(("temperature:0", "temperature")) + + for item in rename_keys: + if item[0] in original_param_names: + key_mapping[item[0]] = item[1] + return key_mapping + + +def replace_params(hf_params, tf_params, key_mapping): + list(hf_params.keys()) + + for key, value in tf_params.items(): + if key not in key_mapping: + continue + + hf_key = key_mapping[key] + if "_conv" in key and "kernel" in key: + new_hf_value = torch.from_numpy(value).permute(3, 2, 0, 1) + elif "embeddings" in key: + new_hf_value = torch.from_numpy(value) + elif "depthwise_kernel" in key: + new_hf_value = torch.from_numpy(value).permute(2, 3, 0, 1) + elif "kernel" in key: + new_hf_value = torch.from_numpy(np.transpose(value)) + elif "temperature" in key: + new_hf_value = value + elif "bn/gamma" or "bn/beta" in key: + new_hf_value = torch.from_numpy(np.transpose(value)).squeeze() + else: + new_hf_value = torch.from_numpy(value) + + # Replace HF parameters with original TF model parameters + hf_params[hf_key].copy_(new_hf_value) + + +@torch.no_grad() +def convert_align_checkpoint(checkpoint_path, pytorch_dump_folder_path, save_model, push_to_hub): + """ + Copy/paste/tweak model's weights to our ALIGN structure. + """ + # Load original model + seq_length = 64 + tok = Tokenizer(seq_length) + original_model = align.Align("efficientnet-b7", "bert-base", 640, seq_length, tok.get_vocab_size()) + original_model.compile() + original_model.load_weights(checkpoint_path) + + tf_params = original_model.trainable_variables + tf_non_train_params = original_model.non_trainable_variables + tf_params = {param.name: param.numpy() for param in tf_params} + for param in tf_non_train_params: + tf_params[param.name] = param.numpy() + tf_param_names = list(tf_params.keys()) + + # Load HuggingFace model + config = get_align_config() + hf_model = AlignModel(config).eval() + hf_params = hf_model.state_dict() + + # Create src-to-dst parameter name mapping dictionary + print("Converting parameters...") + key_mapping = rename_keys(tf_param_names) + replace_params(hf_params, tf_params, key_mapping) + + # Initialize processor + processor = get_processor() + inputs = processor( + images=prepare_img(), text="A picture of a cat", padding="max_length", max_length=64, return_tensors="pt" + ) + + # HF model inference + hf_model.eval() + with torch.no_grad(): + outputs = hf_model(**inputs) + + hf_image_features = outputs.image_embeds.detach().numpy() + hf_text_features = outputs.text_embeds.detach().numpy() + + # Original model inference + original_model.trainable = False + tf_image_processor = EfficientNetImageProcessor( + do_center_crop=True, + do_rescale=False, + do_normalize=False, + include_top=False, + resample=Image.BILINEAR, + ) + image = tf_image_processor(images=prepare_img(), return_tensors="tf", data_format="channels_last")["pixel_values"] + text = tok(tf.constant(["A picture of a cat"])) + + image_features = original_model.image_encoder(image, training=False) + text_features = original_model.text_encoder(text, training=False) + + image_features = tf.nn.l2_normalize(image_features, axis=-1) + text_features = tf.nn.l2_normalize(text_features, axis=-1) + + # Check whether original and HF model outputs match -> np.allclose + if not np.allclose(image_features, hf_image_features, atol=1e-3): + raise ValueError("The predicted image features are not the same.") + if not np.allclose(text_features, hf_text_features, atol=1e-3): + raise ValueError("The predicted text features are not the same.") + print("Model outputs match!") + + if save_model: + # Create folder to save model + if not os.path.isdir(pytorch_dump_folder_path): + os.mkdir(pytorch_dump_folder_path) + # Save converted model and image processor + hf_model.save_pretrained(pytorch_dump_folder_path) + processor.save_pretrained(pytorch_dump_folder_path) + + if push_to_hub: + # Push model and image processor to hub + print("Pushing converted ALIGN to the hub...") + processor.push_to_hub("align-base") + hf_model.push_to_hub("align-base") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument( + "--checkpoint_path", + default="./weights/model-weights", + type=str, + help="Path to the pretrained TF ALIGN checkpoint.", + ) + parser.add_argument( + "--pytorch_dump_folder_path", + default="hf_model", + type=str, + help="Path to the output PyTorch model directory.", + ) + parser.add_argument("--save_model", action="store_true", help="Save model to local") + parser.add_argument("--push_to_hub", action="store_true", help="Push model and image processor to the hub") + + args = parser.parse_args() + convert_align_checkpoint(args.checkpoint_path, args.pytorch_dump_folder_path, args.save_model, args.push_to_hub) diff --git a/docs/transformers/build/lib/transformers/models/align/modeling_align.py b/docs/transformers/build/lib/transformers/models/align/modeling_align.py new file mode 100644 index 0000000000000000000000000000000000000000..a007b7a7c6d698977369255162c732502b966269 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/align/modeling_align.py @@ -0,0 +1,1641 @@ +# coding=utf-8 +# Copyright 2023 The Google Research Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch ALIGN model.""" + +import math +from dataclasses import dataclass +from typing import Any, Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn + +from ...activations import ACT2FN +from ...modeling_outputs import ( + BaseModelOutputWithNoAttention, + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, + BaseModelOutputWithPoolingAndNoAttention, +) +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer +from ...utils import ( + ModelOutput, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from .configuration_align import AlignConfig, AlignTextConfig, AlignVisionConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "kakaobrain/align-base" +_CONFIG_FOR_DOC = "AlignConfig" + + +ALIGN_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`AlignConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +ALIGN_TEXT_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + +ALIGN_VISION_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using + [`AutoImageProcessor`]. See [`EfficientNetImageProcessor.__call__`] for details. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + +ALIGN_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using + [`AutoImageProcessor`]. See [`EfficientNetImageProcessor.__call__`] for details. + return_loss (`bool`, *optional*): + Whether or not to return the contrastive loss. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@dataclass +class AlignVisionModelOutput(ModelOutput): + """ + Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states. + + Args: + image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`): + The image embeddings obtained by applying the projection layer to the pooler_output. + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + """ + + image_embeds: Optional[torch.FloatTensor] = None + last_hidden_state: Optional[torch.FloatTensor] = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +class AlignTextModelOutput(ModelOutput): + """ + Base class for text model's outputs that also contains a pooling of the last hidden states. + + Args: + text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`): + The text embeddings obtained by applying the projection layer to the pooler_output. + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + text_embeds: Optional[torch.FloatTensor] = None + last_hidden_state: Optional[torch.FloatTensor] = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +class AlignOutput(ModelOutput): + """ + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`): + Contrastive loss for image-text similarity. + logits_per_image:(`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`): + The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text + similarity scores. + logits_per_text:(`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`): + The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image + similarity scores. + text_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`): + The text embeddings obtained by applying the projection layer to the pooled output of [`AlignTextModel`]. + image_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`): + The output of [`AlignVisionModel`]. + text_model_output(`BaseModelOutputWithPoolingAndCrossAttentions`): + The output of the [`AlignTextModel`]. + vision_model_output(`BaseModelOutputWithPoolingAndNoAttention`): + The output of the [`AlignVisionModel`]. + """ + + loss: Optional[torch.FloatTensor] = None + logits_per_image: Optional[torch.FloatTensor] = None + logits_per_text: Optional[torch.FloatTensor] = None + text_embeds: Optional[torch.FloatTensor] = None + image_embeds: Optional[torch.FloatTensor] = None + text_model_output: BaseModelOutputWithPoolingAndCrossAttentions = None + vision_model_output: BaseModelOutputWithPoolingAndNoAttention = None + + def to_tuple(self) -> Tuple[Any]: + return tuple( + self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple() + for k in self.keys() + ) + + +# contrastive loss function, adapted from +# https://sachinruk.github.io/blog/pytorch/pytorch%20lightning/loss%20function/gpu/2021/03/07/CLIP.html +def contrastive_loss(logits: torch.Tensor) -> torch.Tensor: + return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device), label_smoothing=0.1) + + +def align_loss(similarity: torch.Tensor) -> torch.Tensor: + caption_loss = contrastive_loss(similarity) + image_loss = contrastive_loss(similarity.t()) + return (caption_loss + image_loss) / 2.0 + + +# Copied from transformers.models.efficientnet.modeling_efficientnet.round_filters with EfficientNet->AlignVision +def round_filters(config: AlignVisionConfig, num_channels: int): + r""" + Round number of filters based on depth multiplier. + """ + divisor = config.depth_divisor + num_channels *= config.width_coefficient + new_dim = max(divisor, int(num_channels + divisor / 2) // divisor * divisor) + + # Make sure that round down does not go down by more than 10%. + if new_dim < 0.9 * num_channels: + new_dim += divisor + + return int(new_dim) + + +# Copied from transformers.models.efficientnet.modeling_efficientnet.correct_pad +def correct_pad(kernel_size: Union[int, Tuple], adjust: bool = True): + r""" + Utility function to get the tuple padding value for the depthwise convolution. + + Args: + kernel_size (`int` or `tuple`): + Kernel size of the convolution layers. + adjust (`bool`, *optional*, defaults to `True`): + Adjusts padding value to apply to right and bottom sides of the input. + """ + if isinstance(kernel_size, int): + kernel_size = (kernel_size, kernel_size) + + correct = (kernel_size[0] // 2, kernel_size[1] // 2) + if adjust: + return (correct[1] - 1, correct[1], correct[0] - 1, correct[0]) + else: + return (correct[1], correct[1], correct[0], correct[0]) + + +# Copied from transformers.models.efficientnet.modeling_efficientnet.EfficientNetEmbeddings with EfficientNet->AlignVision +class AlignVisionEmbeddings(nn.Module): + r""" + A module that corresponds to the stem module of the original work. + """ + + def __init__(self, config: AlignVisionConfig): + super().__init__() + + self.out_dim = round_filters(config, 32) + self.padding = nn.ZeroPad2d(padding=(0, 1, 0, 1)) + self.convolution = nn.Conv2d( + config.num_channels, self.out_dim, kernel_size=3, stride=2, padding="valid", bias=False + ) + self.batchnorm = nn.BatchNorm2d(self.out_dim, eps=config.batch_norm_eps, momentum=config.batch_norm_momentum) + self.activation = ACT2FN[config.hidden_act] + + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: + features = self.padding(pixel_values) + features = self.convolution(features) + features = self.batchnorm(features) + features = self.activation(features) + + return features + + +# Copied from transformers.models.efficientnet.modeling_efficientnet.EfficientNetDepthwiseConv2d with EfficientNet->AlignVision +class AlignVisionDepthwiseConv2d(nn.Conv2d): + def __init__( + self, + in_channels, + depth_multiplier=1, + kernel_size=3, + stride=1, + padding=0, + dilation=1, + bias=True, + padding_mode="zeros", + ): + out_channels = in_channels * depth_multiplier + super().__init__( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=in_channels, + bias=bias, + padding_mode=padding_mode, + ) + + +# Copied from transformers.models.efficientnet.modeling_efficientnet.EfficientNetExpansionLayer with EfficientNet->AlignVision +class AlignVisionExpansionLayer(nn.Module): + r""" + This corresponds to the expansion phase of each block in the original implementation. + """ + + def __init__(self, config: AlignVisionConfig, in_dim: int, out_dim: int, stride: int): + super().__init__() + self.expand_conv = nn.Conv2d( + in_channels=in_dim, + out_channels=out_dim, + kernel_size=1, + padding="same", + bias=False, + ) + self.expand_bn = nn.BatchNorm2d(num_features=out_dim, eps=config.batch_norm_eps) + self.expand_act = ACT2FN[config.hidden_act] + + def forward(self, hidden_states: torch.FloatTensor) -> torch.Tensor: + # Expand phase + hidden_states = self.expand_conv(hidden_states) + hidden_states = self.expand_bn(hidden_states) + hidden_states = self.expand_act(hidden_states) + + return hidden_states + + +# Copied from transformers.models.efficientnet.modeling_efficientnet.EfficientNetDepthwiseLayer with EfficientNet->AlignVision +class AlignVisionDepthwiseLayer(nn.Module): + r""" + This corresponds to the depthwise convolution phase of each block in the original implementation. + """ + + def __init__( + self, + config: AlignVisionConfig, + in_dim: int, + stride: int, + kernel_size: int, + adjust_padding: bool, + ): + super().__init__() + self.stride = stride + conv_pad = "valid" if self.stride == 2 else "same" + padding = correct_pad(kernel_size, adjust=adjust_padding) + + self.depthwise_conv_pad = nn.ZeroPad2d(padding=padding) + self.depthwise_conv = AlignVisionDepthwiseConv2d( + in_dim, kernel_size=kernel_size, stride=stride, padding=conv_pad, bias=False + ) + self.depthwise_norm = nn.BatchNorm2d( + num_features=in_dim, eps=config.batch_norm_eps, momentum=config.batch_norm_momentum + ) + self.depthwise_act = ACT2FN[config.hidden_act] + + def forward(self, hidden_states: torch.FloatTensor) -> torch.Tensor: + # Depthwise convolution + if self.stride == 2: + hidden_states = self.depthwise_conv_pad(hidden_states) + + hidden_states = self.depthwise_conv(hidden_states) + hidden_states = self.depthwise_norm(hidden_states) + hidden_states = self.depthwise_act(hidden_states) + + return hidden_states + + +# Copied from transformers.models.efficientnet.modeling_efficientnet.EfficientNetSqueezeExciteLayer with EfficientNet->AlignVision +class AlignVisionSqueezeExciteLayer(nn.Module): + r""" + This corresponds to the Squeeze and Excitement phase of each block in the original implementation. + """ + + def __init__(self, config: AlignVisionConfig, in_dim: int, expand_dim: int, expand: bool = False): + super().__init__() + self.dim = expand_dim if expand else in_dim + self.dim_se = max(1, int(in_dim * config.squeeze_expansion_ratio)) + + self.squeeze = nn.AdaptiveAvgPool2d(output_size=1) + self.reduce = nn.Conv2d( + in_channels=self.dim, + out_channels=self.dim_se, + kernel_size=1, + padding="same", + ) + self.expand = nn.Conv2d( + in_channels=self.dim_se, + out_channels=self.dim, + kernel_size=1, + padding="same", + ) + self.act_reduce = ACT2FN[config.hidden_act] + self.act_expand = nn.Sigmoid() + + def forward(self, hidden_states: torch.FloatTensor) -> torch.Tensor: + inputs = hidden_states + hidden_states = self.squeeze(hidden_states) + hidden_states = self.reduce(hidden_states) + hidden_states = self.act_reduce(hidden_states) + + hidden_states = self.expand(hidden_states) + hidden_states = self.act_expand(hidden_states) + hidden_states = torch.mul(inputs, hidden_states) + + return hidden_states + + +class AlignVisionFinalBlockLayer(nn.Module): + r""" + This corresponds to the final phase of each block in the original implementation. + """ + + def __init__( + self, config: AlignVisionConfig, in_dim: int, out_dim: int, stride: int, drop_rate: float, id_skip: bool + ): + super().__init__() + self.apply_dropout = stride == 1 and not id_skip + self.project_conv = nn.Conv2d( + in_channels=in_dim, + out_channels=out_dim, + kernel_size=1, + padding="same", + bias=False, + ) + self.project_bn = nn.BatchNorm2d( + num_features=out_dim, eps=config.batch_norm_eps, momentum=config.batch_norm_momentum + ) + self.dropout = nn.Dropout(p=drop_rate) + + def forward(self, embeddings: torch.FloatTensor, hidden_states: torch.FloatTensor) -> torch.Tensor: + hidden_states = self.project_conv(hidden_states) + hidden_states = self.project_bn(hidden_states) + + if self.apply_dropout: + hidden_states = self.dropout(hidden_states) + hidden_states = hidden_states + embeddings + + return hidden_states + + +class AlignVisionBlock(nn.Module): + r""" + This corresponds to the block module of original the EfficientNet vision encoder implementation. + + Args: + config ([`AlignVisionConfig`]): + Model configuration class. + in_dim (`int`): + Number of input channels. + out_dim (`int`): + Number of output channels. + stride (`int`): + Stride size to be used in convolution layers. + expand_ratio (`int`): + Expand ratio to set the output dimensions for the expansion and squeeze-excite layers. + kernel_size (`int`): + Kernel size for the depthwise convolution layer. + drop_rate (`float`): + Dropout rate to be used in the final phase of each block. + id_skip (`bool`): + Whether to apply dropout and sum the final hidden states with the input embeddings during the final phase + of each block. Set to `True` for the first block of each stage. + adjust_padding (`bool`): + Whether to apply padding to only right and bottom side of the input kernel before the depthwise convolution + operation, set to `True` for inputs with odd input sizes. + """ + + def __init__( + self, + config: AlignVisionConfig, + in_dim: int, + out_dim: int, + stride: int, + expand_ratio: int, + kernel_size: int, + drop_rate: float, + id_skip: bool, + adjust_padding: bool, + ): + super().__init__() + self.expand_ratio = expand_ratio + self.expand = True if self.expand_ratio != 1 else False + expand_in_dim = in_dim * expand_ratio + + if self.expand: + self.expansion = AlignVisionExpansionLayer( + config=config, in_dim=in_dim, out_dim=expand_in_dim, stride=stride + ) + + self.depthwise_conv = AlignVisionDepthwiseLayer( + config=config, + in_dim=expand_in_dim if self.expand else in_dim, + stride=stride, + kernel_size=kernel_size, + adjust_padding=adjust_padding, + ) + self.squeeze_excite = AlignVisionSqueezeExciteLayer( + config=config, in_dim=in_dim, expand_dim=expand_in_dim, expand=self.expand + ) + self.projection = AlignVisionFinalBlockLayer( + config=config, + in_dim=expand_in_dim if self.expand else in_dim, + out_dim=out_dim, + stride=stride, + drop_rate=drop_rate, + id_skip=id_skip, + ) + + def forward(self, hidden_states: torch.FloatTensor) -> torch.Tensor: + embeddings = hidden_states + # Expansion and depthwise convolution phase + if self.expand_ratio != 1: + hidden_states = self.expansion(hidden_states) + hidden_states = self.depthwise_conv(hidden_states) + + # Squeeze and excite phase + hidden_states = self.squeeze_excite(hidden_states) + hidden_states = self.projection(embeddings, hidden_states) + return hidden_states + + +class AlignVisionEncoder(nn.Module): + r""" + Forward propogates the embeddings through each vision encoder (EfficientNet) block. + + Args: + config ([`AlignVisionConfig`]): + Model configuration class. + """ + + def __init__(self, config: AlignVisionConfig): + super().__init__() + self.depth_coefficient = config.depth_coefficient + + def round_repeats(repeats): + # Round number of block repeats based on depth multiplier. + return int(math.ceil(self.depth_coefficient * repeats)) + + num_base_blocks = len(config.in_channels) + num_blocks = sum(round_repeats(n) for n in config.num_block_repeats) + + curr_block_num = 0 + blocks = [] + for i in range(num_base_blocks): + in_dim = round_filters(config, config.in_channels[i]) + out_dim = round_filters(config, config.out_channels[i]) + stride = config.strides[i] + kernel_size = config.kernel_sizes[i] + expand_ratio = config.expand_ratios[i] + + for j in range(round_repeats(config.num_block_repeats[i])): + id_skip = True if j == 0 else False + stride = 1 if j > 0 else stride + in_dim = out_dim if j > 0 else in_dim + adjust_padding = False if curr_block_num in config.depthwise_padding else True + drop_rate = config.drop_connect_rate * curr_block_num / num_blocks + + block = AlignVisionBlock( + config=config, + in_dim=in_dim, + out_dim=out_dim, + stride=stride, + kernel_size=kernel_size, + expand_ratio=expand_ratio, + drop_rate=drop_rate, + id_skip=id_skip, + adjust_padding=adjust_padding, + ) + blocks.append(block) + curr_block_num += 1 + + self.blocks = nn.ModuleList(blocks) + + def forward( + self, + hidden_states: torch.FloatTensor, + output_hidden_states: Optional[bool] = False, + return_dict: Optional[bool] = True, + ) -> BaseModelOutputWithPoolingAndNoAttention: + all_hidden_states = (hidden_states,) if output_hidden_states else None + + for block in self.blocks: + hidden_states = block(hidden_states) + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states] if v is not None) + + return BaseModelOutputWithNoAttention( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + ) + + +# Copied from transformers.models.bert.modeling_bert.BertEmbeddings with Bert->AlignText +class AlignTextEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) + self.register_buffer( + "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False + ) + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + past_key_values_length: int = 0, + ) -> torch.Tensor: + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length] + + # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs + # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves + # issue #5664 + if token_type_ids is None: + if hasattr(self, "token_type_ids"): + buffered_token_type_ids = self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +# Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->AlignText +class AlignTextSelfAttention(nn.Module): + def __init__(self, config, position_embedding_type=None): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = position_embedding_type or getattr( + config, "position_embedding_type", "absolute" + ) + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + + self.is_decoder = config.is_decoder + + def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + use_cache = past_key_value is not None + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + query_length, key_length = query_layer.shape[2], key_layer.shape[2] + if use_cache: + position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view( + -1, 1 + ) + else: + position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in AlignTextModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + if self.is_decoder: + outputs = outputs + (past_key_value,) + return outputs + + +# Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert->AlignText +class AlignTextSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +ALIGN_TEXT_SELF_ATTENTION_CLASSES = { + "eager": AlignTextSelfAttention, +} + + +# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->AlignText,BERT->ALIGN_TEXT +class AlignTextAttention(nn.Module): + def __init__(self, config, position_embedding_type=None): + super().__init__() + self.self = ALIGN_TEXT_SELF_ATTENTION_CLASSES[config._attn_implementation]( + config, position_embedding_type=position_embedding_type + ) + self.output = AlignTextSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->AlignText +class AlignTextIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->AlignText +class AlignTextOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertLayer with Bert->AlignText +class AlignTextLayer(nn.Module): + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = AlignTextAttention(config) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + if not self.is_decoder: + raise ValueError(f"{self} should be used as a decoder model if cross attention is added") + self.crossattention = AlignTextAttention(config, position_embedding_type="absolute") + self.intermediate = AlignTextIntermediate(config) + self.output = AlignTextOutput(config) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + + # if decoder, the last output is tuple of self-attn cache + if self.is_decoder: + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + else: + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + cross_attn_present_key_value = None + if self.is_decoder and encoder_hidden_states is not None: + if not hasattr(self, "crossattention"): + raise ValueError( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers" + " by setting `config.add_cross_attention=True`" + ) + + # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + cross_attn_past_key_value, + output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights + + # add cross-attn cache to positions 3,4 of present_key_value tuple + cross_attn_present_key_value = cross_attention_outputs[-1] + present_key_value = present_key_value + cross_attn_present_key_value + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output + ) + outputs = (layer_output,) + outputs + + # if decoder, return the attn key/values as the last output + if self.is_decoder: + outputs = outputs + (present_key_value,) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +# Copied from transformers.models.bert.modeling_bert.BertEncoder with Bert->AlignText +class AlignTextEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList([AlignTextLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + return_dict: Optional[bool] = True, + ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + next_decoder_cache = () if use_cache else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + layer_module.__call__, + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +# Copied from transformers.models.bert.modeling_bert.BertPooler with Bert -> AlignText +class AlignTextPooler(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class AlignPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = AlignConfig + base_model_prefix = "align" + supports_gradient_checkpointing = True + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d)): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, AlignModel): + nn.init.xavier_uniform_(module.text_projection.weight) + module.text_projection.bias.data.zero_() + module.text_projection._is_hf_initialized = True + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + if isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +@add_start_docstrings( + """The text model from ALIGN without any head or projection on top.""", + ALIGN_START_DOCSTRING, +) +class AlignTextModel(AlignPreTrainedModel): + config_class = AlignTextConfig + _no_split_modules = ["AlignTextEmbeddings"] + + def __init__(self, config: AlignTextConfig, add_pooling_layer: bool = True): + super().__init__(config) + self.config = config + + self.embeddings = AlignTextEmbeddings(config) + self.encoder = AlignTextEncoder(config) + + self.pooler = AlignTextPooler(config) if add_pooling_layer else None + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + @add_start_docstrings_to_model_forward(ALIGN_TEXT_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=BaseModelOutputWithPoolingAndCrossAttentions, config_class=AlignTextConfig) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPoolingAndCrossAttentions]: + r""" + Returns: + + Examples: + + ```python + >>> from transformers import AutoTokenizer, AlignTextModel + + >>> model = AlignTextModel.from_pretrained("kakaobrain/align-base") + >>> tokenizer = AutoTokenizer.from_pretrained("kakaobrain/align-base") + + >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> last_hidden_state = outputs.last_hidden_state + >>> pooled_output = outputs.pooler_output # pooled (EOS token) states + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + batch_size, seq_length = input_shape + device = input_ids.device if input_ids is not None else inputs_embeds.device + + if attention_mask is None: + attention_mask = torch.ones(((batch_size, seq_length)), device=device) + + if token_type_ids is None: + if hasattr(self.embeddings, "token_type_ids"): + buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape) + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + ) + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + +@add_start_docstrings( + """The vision model from ALIGN without any head or projection on top.""", + ALIGN_START_DOCSTRING, +) +class AlignVisionModel(AlignPreTrainedModel): + config_class = AlignVisionConfig + main_input_name = "pixel_values" + supports_gradient_checkpointing = False + + def __init__(self, config: AlignVisionConfig): + super().__init__(config) + self.config = config + self.embeddings = AlignVisionEmbeddings(config) + self.encoder = AlignVisionEncoder(config) + + # Final pooling layer + if config.pooling_type == "mean": + self.pooler = nn.AvgPool2d(config.hidden_dim, ceil_mode=True) + elif config.pooling_type == "max": + self.pooler = nn.MaxPool2d(config.hidden_dim, ceil_mode=True) + else: + raise ValueError(f"config.pooling must be one of ['mean', 'max'] got {config.pooling}") + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> nn.Module: + return self.vision_model.embeddings.convolution + + @add_start_docstrings_to_model_forward(ALIGN_VISION_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=BaseModelOutputWithPoolingAndNoAttention, config_class=AlignVisionConfig) + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPoolingAndNoAttention]: + r""" + Returns: + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, AlignVisionModel + + >>> model = AlignVisionModel.from_pretrained("kakaobrain/align-base") + >>> processor = AutoProcessor.from_pretrained("kakaobrain/align-base") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> last_hidden_state = outputs.last_hidden_state + >>> pooled_output = outputs.pooler_output # pooled CLS states + ```""" + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + embedding_output = self.embeddings(pixel_values) + encoder_outputs = self.encoder( + embedding_output, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # Apply pooling + last_hidden_state = encoder_outputs[0] + pooled_output = self.pooler(last_hidden_state) + # Reshape (batch_size, projection_dim, 1 , 1) -> (batch_size, projection_dim) + pooled_output = pooled_output.reshape(pooled_output.shape[:2]) + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndNoAttention( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + ) + + +@add_start_docstrings(ALIGN_START_DOCSTRING) +class AlignModel(AlignPreTrainedModel): + config_class = AlignConfig + + def __init__(self, config: AlignConfig): + super().__init__(config) + + if not isinstance(config.text_config, AlignTextConfig): + raise TypeError( + "config.text_config is expected to be of type AlignTextConfig but is of type" + f" {type(config.text_config)}." + ) + + if not isinstance(config.vision_config, AlignVisionConfig): + raise TypeError( + "config.vision_config is expected to be of type AlignVisionConfig but is of type" + f" {type(config.vision_config)}." + ) + + text_config = config.text_config + vision_config = config.vision_config + + self.projection_dim = config.projection_dim + self.text_embed_dim = text_config.hidden_size + + self.text_model = AlignTextModel(text_config) + self.vision_model = AlignVisionModel(vision_config) + + self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim) + self.temperature = nn.Parameter(torch.tensor(self.config.temperature_init_value)) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(ALIGN_TEXT_INPUTS_DOCSTRING) + def get_text_features( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> torch.FloatTensor: + r""" + Returns: + text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by + applying the projection layer to the pooled output of [`AlignTextModel`]. + + Examples: + + ```python + >>> from transformers import AutoTokenizer, AlignModel + + >>> model = AlignModel.from_pretrained("kakaobrain/align-base") + >>> tokenizer = AutoTokenizer.from_pretrained("kakaobrain/align-base") + + >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") + >>> text_features = model.get_text_features(**inputs) + ```""" + # Use ALIGN model's config for some fields (if specified) instead of those of vision & text components. + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + text_outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = text_outputs[0][:, 0, :] + text_features = self.text_projection(last_hidden_state) + + return text_features + + @add_start_docstrings_to_model_forward(ALIGN_VISION_INPUTS_DOCSTRING) + def get_image_features( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> torch.FloatTensor: + r""" + Returns: + image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by + applying the projection layer to the pooled output of [`AlignVisionModel`]. + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, AlignModel + + >>> model = AlignModel.from_pretrained("kakaobrain/align-base") + >>> processor = AutoProcessor.from_pretrained("kakaobrain/align-base") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="pt") + + >>> image_features = model.get_image_features(**inputs) + ```""" + # Use ALIGN model's config for some fields (if specified) instead of those of vision & text components. + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + image_features = vision_outputs[1] # pooled_output + + return image_features + + @add_start_docstrings_to_model_forward(ALIGN_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=AlignOutput, config_class=AlignConfig) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + return_loss: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, AlignOutput]: + r""" + Returns: + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, AlignModel + + >>> model = AlignModel.from_pretrained("kakaobrain/align-base") + >>> processor = AutoProcessor.from_pretrained("kakaobrain/align-base") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor( + ... images=image, text=["a photo of a cat", "a photo of a dog"], return_tensors="pt", padding=True + ... ) + + >>> outputs = model(**inputs) + >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score + >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities + ```""" + # Use ALIGN model's config for some fields (if specified) instead of those of vision & text components. + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + text_outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + image_embeds = vision_outputs[1] + text_embeds = text_outputs[0][:, 0, :] + text_embeds = self.text_projection(text_embeds) + + # normalized features + image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True) + text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True) + + # cosine similarity as logits + logits_per_text = torch.matmul(text_embeds, image_embeds.t()) / self.temperature + logits_per_image = logits_per_text.t() + + loss = None + if return_loss: + loss = align_loss(logits_per_text) + + if not return_dict: + output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs) + return ((loss,) + output) if loss is not None else output + + return AlignOutput( + loss=loss, + logits_per_image=logits_per_image, + logits_per_text=logits_per_text, + text_embeds=text_embeds, + image_embeds=image_embeds, + text_model_output=text_outputs, + vision_model_output=vision_outputs, + ) + + +__all__ = ["AlignPreTrainedModel", "AlignTextModel", "AlignVisionModel", "AlignModel"] diff --git a/docs/transformers/build/lib/transformers/models/align/processing_align.py b/docs/transformers/build/lib/transformers/models/align/processing_align.py new file mode 100644 index 0000000000000000000000000000000000000000..628f05b2e01bd6e12f0904c54f6c43b10ece33c0 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/align/processing_align.py @@ -0,0 +1,161 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Image/Text processor class for ALIGN +""" + +from typing import List, Union + +from ...image_utils import ImageInput +from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack, _validate_images_text_input_order +from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput + + +class AlignProcessorKwargs(ProcessingKwargs, total=False): + # see processing_utils.ProcessingKwargs documentation for usage. + _defaults = { + "text_kwargs": { + "padding": "max_length", + "max_length": 64, + }, + } + + +class AlignProcessor(ProcessorMixin): + r""" + Constructs an ALIGN processor which wraps [`EfficientNetImageProcessor`] and + [`BertTokenizer`]/[`BertTokenizerFast`] into a single processor that interits both the image processor and + tokenizer functionalities. See the [`~AlignProcessor.__call__`] and [`~OwlViTProcessor.decode`] for more + information. + The preferred way of passing kwargs is as a dictionary per modality, see usage example below. + ```python + from transformers import AlignProcessor + from PIL import Image + model_id = "kakaobrain/align-base" + processor = AlignProcessor.from_pretrained(model_id) + + processor( + images=your_pil_image, + text=["What is that?"], + images_kwargs = {"crop_size": {"height": 224, "width": 224}}, + text_kwargs = {"padding": "do_not_pad"}, + common_kwargs = {"return_tensors": "pt"}, + ) + ``` + + Args: + image_processor ([`EfficientNetImageProcessor`]): + The image processor is a required input. + tokenizer ([`BertTokenizer`, `BertTokenizerFast`]): + The tokenizer is a required input. + + """ + + attributes = ["image_processor", "tokenizer"] + image_processor_class = "EfficientNetImageProcessor" + tokenizer_class = ("BertTokenizer", "BertTokenizerFast") + + def __init__(self, image_processor, tokenizer): + super().__init__(image_processor, tokenizer) + + def __call__( + self, + images: ImageInput = None, + text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, + audio=None, + videos=None, + **kwargs: Unpack[AlignProcessorKwargs], + ) -> BatchEncoding: + """ + Main method to prepare text(s) and image(s) to be fed as input to the model. This method forwards the `text` + arguments to BertTokenizerFast's [`~BertTokenizerFast.__call__`] if `text` is not `None` to encode + the text. To prepare the image(s), this method forwards the `images` arguments to + EfficientNetImageProcessor's [`~EfficientNetImageProcessor.__call__`] if `images` is not `None`. Please refer + to the docstring of the above two methods for more information. + + Args: + images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. Both channels-first and channels-last formats are supported. + text (`str`, `List[str]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Acceptable values are: + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. + Returns: + [`BatchEncoding`]: A [`BatchEncoding`] with the following fields: + + - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). + - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. + """ + if text is None and images is None: + raise ValueError("You must specify either text or images.") + # check if images and text inputs are reversed for BC + images, text = _validate_images_text_input_order(images, text) + + output_kwargs = self._merge_kwargs( + AlignProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + # then, we can pass correct kwargs to each processor + if text is not None: + encoding = self.tokenizer(text, **output_kwargs["text_kwargs"]) + + if images is not None: + image_features = self.image_processor(images, **output_kwargs["images_kwargs"]) + + # BC for explicit return_tensors + if "return_tensors" in output_kwargs["common_kwargs"]: + return_tensors = output_kwargs["common_kwargs"].pop("return_tensors", None) + + if text is not None and images is not None: + encoding["pixel_values"] = image_features.pixel_values + return encoding + elif text is not None: + return encoding + else: + return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors) + + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to BertTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. + """ + return self.tokenizer.batch_decode(*args, **kwargs) + + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to BertTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to + the docstring of this method for more information. + """ + return self.tokenizer.decode(*args, **kwargs) + + @property + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + image_processor_input_names = self.image_processor.model_input_names + return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) + + +__all__ = ["AlignProcessor"] diff --git a/docs/transformers/build/lib/transformers/models/altclip/__init__.py b/docs/transformers/build/lib/transformers/models/altclip/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..a30de8a2527567b521be3e9b60e99d025571cb18 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/altclip/__init__.py @@ -0,0 +1,28 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_altclip import * + from .modeling_altclip import * + from .processing_altclip import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/docs/transformers/build/lib/transformers/models/altclip/configuration_altclip.py b/docs/transformers/build/lib/transformers/models/altclip/configuration_altclip.py new file mode 100644 index 0000000000000000000000000000000000000000..3c8e91bd473533c37f37a6dda58f550a0f9cfeda --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/altclip/configuration_altclip.py @@ -0,0 +1,384 @@ +# coding=utf-8 +# Copyright 2022 WenXiang ZhongzhiCheng LedellWu LiuGuang BoWenZhang and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""AltCLIP model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class AltCLIPTextConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`AltCLIPTextModel`]. It is used to instantiate a + AltCLIP text model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the AltCLIP + [BAAI/AltCLIP](https://huggingface.co/BAAI/AltCLIP) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 250002): + Vocabulary size of the AltCLIP model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`AltCLIPTextModel`]. + hidden_size (`int`, *optional*, defaults to 1024): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 24): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 4096): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 514): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (`int`, *optional*, defaults to 1): + The vocabulary size of the `token_type_ids` passed when calling [`AltCLIPTextModel`] + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + initializer_factor (`float`, *optional*, defaults to 0.02): + A factor for initializing all weight matrices (should be kept to 1, used internally for initialization + testing). + layer_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the layer normalization layers. + pad_token_id (`int`, *optional*, defaults to 1): The id of the *padding* token. + bos_token_id (`int`, *optional*, defaults to 0): The id of the *beginning-of-sequence* token. + eos_token_id (`Union[int, List[int]]`, *optional*, defaults to 2): + The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens. + position_embedding_type (`str`, *optional*, defaults to `"absolute"`): + Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For + positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to + [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). + For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models + with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658). + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + project_dim (`int`, *optional*, defaults to 768): + The dimensions of the teacher model before the mapping layer. + + Examples: + + ```python + >>> from transformers import AltCLIPTextModel, AltCLIPTextConfig + + >>> # Initializing a AltCLIPTextConfig with BAAI/AltCLIP style configuration + >>> configuration = AltCLIPTextConfig() + + >>> # Initializing a AltCLIPTextModel (with random weights) from the BAAI/AltCLIP style configuration + >>> model = AltCLIPTextModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "altclip_text_model" + + def __init__( + self, + vocab_size=250002, + hidden_size=1024, + num_hidden_layers=24, + num_attention_heads=16, + intermediate_size=4096, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=514, + type_vocab_size=1, + initializer_range=0.02, + initializer_factor=0.02, + layer_norm_eps=1e-05, + pad_token_id=1, + bos_token_id=0, + eos_token_id=2, + position_embedding_type="absolute", + use_cache=True, + project_dim=768, + **kwargs, + ): + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.initializer_factor = initializer_factor + self.layer_norm_eps = layer_norm_eps + self.position_embedding_type = position_embedding_type + self.use_cache = use_cache + self.project_dim = project_dim + + +class AltCLIPVisionConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`AltCLIPModel`]. It is used to instantiate an + AltCLIP model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the AltCLIP + [BAAI/AltCLIP](https://huggingface.co/BAAI/AltCLIP) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + projection_dim (`int`, *optional*, defaults to 512): + Dimensionality of text and vision projection layers. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 32): + The size (resolution) of each patch. + hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported. + layer_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the layer normalization layers. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + initializer_factor (`float`, *optional*, defaults to 1.0): + A factor for initializing all weight matrices (should be kept to 1, used internally for initialization + testing). + + Example: + + ```python + >>> from transformers import AltCLIPVisionConfig, AltCLIPVisionModel + + >>> # Initializing a AltCLIPVisionConfig with BAAI/AltCLIP style configuration + >>> configuration = AltCLIPVisionConfig() + + >>> # Initializing a AltCLIPVisionModel (with random weights) from the BAAI/AltCLIP style configuration + >>> model = AltCLIPVisionModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "altclip_vision_model" + base_config_key = "vision_config" + + def __init__( + self, + hidden_size=768, + intermediate_size=3072, + projection_dim=512, + num_hidden_layers=12, + num_attention_heads=12, + num_channels=3, + image_size=224, + patch_size=32, + hidden_act="quick_gelu", + layer_norm_eps=1e-5, + attention_dropout=0.0, + initializer_range=0.02, + initializer_factor=1.0, + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.projection_dim = projection_dim + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_channels = num_channels + self.patch_size = patch_size + self.image_size = image_size + self.initializer_range = initializer_range + self.initializer_factor = initializer_factor + self.attention_dropout = attention_dropout + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + + +class AltCLIPConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`AltCLIPModel`]. It is used to instantiate an + AltCLIP model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the AltCLIP + [BAAI/AltCLIP](https://huggingface.co/BAAI/AltCLIP) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + text_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`AltCLIPTextConfig`]. + vision_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`AltCLIPVisionConfig`]. + projection_dim (`int`, *optional*, defaults to 768): + Dimensionality of text and vision projection layers. + logit_scale_init_value (`float`, *optional*, defaults to 2.6592): + The initial value of the *logit_scale* parameter. Default is used as per the original CLIP implementation. + kwargs (*optional*): + Dictionary of keyword arguments. + + Example: + + ```python + >>> from transformers import AltCLIPConfig, AltCLIPModel + + >>> # Initializing a AltCLIPConfig with BAAI/AltCLIP style configuration + >>> configuration = AltCLIPConfig() + + >>> # Initializing a AltCLIPModel (with random weights) from the BAAI/AltCLIP style configuration + >>> model = AltCLIPModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + + >>> # We can also initialize a AltCLIPConfig from a AltCLIPTextConfig and a AltCLIPVisionConfig + + >>> # Initializing a AltCLIPText and AltCLIPVision configuration + >>> config_text = AltCLIPTextConfig() + >>> config_vision = AltCLIPVisionConfig() + + >>> config = AltCLIPConfig.from_text_vision_configs(config_text, config_vision) + ```""" + + model_type = "altclip" + sub_configs = {"text_config": AltCLIPTextConfig, "vision_config": AltCLIPVisionConfig} + + def __init__( + self, text_config=None, vision_config=None, projection_dim=768, logit_scale_init_value=2.6592, **kwargs + ): + # If `_config_dict` exist, we use them for the backward compatibility. + # We pop out these 2 attributes before calling `super().__init__` to avoid them being saved (which causes a lot + # of confusion!). + text_config_dict = kwargs.pop("text_config_dict", None) + vision_config_dict = kwargs.pop("vision_config_dict", None) + + super().__init__(**kwargs) + + # Instead of simply assigning `[text|vision]_config_dict` to `[text|vision]_config`, we use the values in + # `[text|vision]_config_dict` to update the values in `[text|vision]_config`. The values should be same in most + # cases, but we don't want to break anything regarding `_config_dict` that existed before commit `8827e1b2`. + if text_config_dict is not None: + if text_config is None: + text_config = {} + + # This is the complete result when using `text_config_dict`. + _text_config_dict = AltCLIPTextConfig(**text_config_dict).to_dict() + + # Give a warning if the values exist in both `_text_config_dict` and `text_config` but being different. + for key, value in _text_config_dict.items(): + if key in text_config and value != text_config[key] and key not in ["transformers_version"]: + # If specified in `text_config_dict` + if key in text_config_dict: + message = ( + f"`{key}` is found in both `text_config_dict` and `text_config` but with different values. " + f'The value `text_config_dict["{key}"]` will be used instead.' + ) + # If inferred from default argument values (just to be super careful) + else: + message = ( + f"`text_config_dict` is provided which will be used to initialize `AltCLIPTextConfig`. The " + f'value `text_config["{key}"]` will be overridden.' + ) + logger.info(message) + + # Update all values in `text_config` with the ones in `_text_config_dict`. + text_config.update(_text_config_dict) + + if vision_config_dict is not None: + if vision_config is None: + vision_config = {} + + # This is the complete result when using `vision_config_dict`. + _vision_config_dict = AltCLIPVisionConfig(**vision_config_dict).to_dict() + # convert keys to string instead of integer + if "id2label" in _vision_config_dict: + _vision_config_dict["id2label"] = { + str(key): value for key, value in _vision_config_dict["id2label"].items() + } + + # Give a warning if the values exist in both `_vision_config_dict` and `vision_config` but being different. + for key, value in _vision_config_dict.items(): + if key in vision_config and value != vision_config[key] and key not in ["transformers_version"]: + # If specified in `vision_config_dict` + if key in vision_config_dict: + message = ( + f"`{key}` is found in both `vision_config_dict` and `vision_config` but with different " + f'values. The value `vision_config_dict["{key}"]` will be used instead.' + ) + # If inferred from default argument values (just to be super careful) + else: + message = ( + f"`vision_config_dict` is provided which will be used to initialize `AltCLIPVisionConfig`. " + f'The value `vision_config["{key}"]` will be overridden.' + ) + logger.info(message) + + # Update all values in `vision_config` with the ones in `_vision_config_dict`. + vision_config.update(_vision_config_dict) + + if text_config is None: + text_config = {} + logger.info("`text_config` is `None`. Initializing the `AltCLIPTextConfig` with default values.") + + if vision_config is None: + vision_config = {} + logger.info("`vision_config` is `None`. initializing the `AltCLIPVisionConfig` with default values.") + + self.text_config = AltCLIPTextConfig(**text_config) + self.vision_config = AltCLIPVisionConfig(**vision_config) + + self.projection_dim = projection_dim + self.logit_scale_init_value = logit_scale_init_value + self.initializer_factor = 1.0 + + @classmethod + def from_text_vision_configs(cls, text_config: AltCLIPTextConfig, vision_config: AltCLIPVisionConfig, **kwargs): + r""" + Instantiate a [`AltCLIPConfig`] (or a derived class) from altclip text model configuration and altclip vision + model configuration. + + Returns: + [`AltCLIPConfig`]: An instance of a configuration object + """ + + return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs) + + +__all__ = ["AltCLIPTextConfig", "AltCLIPVisionConfig", "AltCLIPConfig"] diff --git a/docs/transformers/build/lib/transformers/models/altclip/modeling_altclip.py b/docs/transformers/build/lib/transformers/models/altclip/modeling_altclip.py new file mode 100644 index 0000000000000000000000000000000000000000..9c8be7ea8079a34276a9513ca1ee4eaa6287e2fb --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/altclip/modeling_altclip.py @@ -0,0 +1,1761 @@ +# coding=utf-8 +# Copyright 2022 The BAAI Teams Authors and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch AltCLIP model.""" + +import math +from dataclasses import dataclass +from typing import Any, Callable, List, Optional, Tuple, Union + +import torch +import torch.nn as nn +import torch.utils.checkpoint + +from ...activations import ACT2FN +from ...modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPooling, + BaseModelOutputWithPoolingAndCrossAttentions, + BaseModelOutputWithPoolingAndProjection, +) +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer +from ...utils import ( + ModelOutput, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, + torch_int, +) +from .configuration_altclip import AltCLIPConfig, AltCLIPTextConfig, AltCLIPVisionConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "BAAI/AltCLIP" +_CONFIG_FOR_DOC = "AltCLIPConfig" + + +ALTCLIP_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`CLIPConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +ALTCLIP_TEXT_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + +ALTCLIP_VISION_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using + [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + interpolate_pos_encoding (`bool`, *optional*, defaults `False`): + Whether to interpolate the pre-trained position encodings. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + +ALTCLIP_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using + [`AutoImageProcessor`]. See [`CLIPImageProcessor.__call__`] for details. + return_loss (`bool`, *optional*): + Whether or not to return the contrastive loss. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + interpolate_pos_encoding (`bool`, *optional*, defaults `False`): + Whether to interpolate the pre-trained position encodings. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +# contrastive loss function, adapted from +# https://sachinruk.github.io/blog/pytorch/pytorch%20lightning/loss%20function/gpu/2021/03/07/CLIP.html +def contrastive_loss(logits: torch.Tensor) -> torch.Tensor: + return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device)) + + +def clip_loss(similarity: torch.Tensor) -> torch.Tensor: + caption_loss = contrastive_loss(similarity) + image_loss = contrastive_loss(similarity.t()) + return (caption_loss + image_loss) / 2.0 + + +@dataclass +# Copied from transformers.models.clip.modeling_clip.CLIPOutput with CLIP->AltCLIP +class AltCLIPOutput(ModelOutput): + """ + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`): + Contrastive loss for image-text similarity. + logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`): + The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text + similarity scores. + logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`): + The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image + similarity scores. + text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`): + The text embeddings obtained by applying the projection layer to the pooled output of [`AltCLIPTextModel`]. + image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`): + The image embeddings obtained by applying the projection layer to the pooled output of [`AltCLIPVisionModel`]. + text_model_output (`BaseModelOutputWithPooling`): + The output of the [`AltCLIPTextModel`]. + vision_model_output (`BaseModelOutputWithPooling`): + The output of the [`AltCLIPVisionModel`]. + """ + + loss: Optional[torch.FloatTensor] = None + logits_per_image: Optional[torch.FloatTensor] = None + logits_per_text: Optional[torch.FloatTensor] = None + text_embeds: Optional[torch.FloatTensor] = None + image_embeds: Optional[torch.FloatTensor] = None + text_model_output: BaseModelOutputWithPooling = None + vision_model_output: BaseModelOutputWithPooling = None + + def to_tuple(self) -> Tuple[Any]: + return tuple( + self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple() + for k in self.keys() + ) + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaEmbeddings with Roberta->AltRoberta +class AltRobertaEmbeddings(nn.Module): + """ + Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. + """ + + # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.__init__ + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) + self.register_buffer( + "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False + ) + + # End copy + self.padding_idx = config.pad_token_id + self.position_embeddings = nn.Embedding( + config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx + ) + + def forward( + self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0 + ): + if position_ids is None: + if input_ids is not None: + # Create the position ids from the input token ids. Any padded tokens remain padded. + position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length) + else: + position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds) + + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs + # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves + # issue #5664 + if token_type_ids is None: + if hasattr(self, "token_type_ids"): + buffered_token_type_ids = self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + def create_position_ids_from_inputs_embeds(self, inputs_embeds): + """ + We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids. + + Args: + inputs_embeds: torch.Tensor + + Returns: torch.Tensor + """ + input_shape = inputs_embeds.size()[:-1] + sequence_length = input_shape[1] + + position_ids = torch.arange( + self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device + ) + return position_ids.unsqueeze(0).expand(input_shape) + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaSelfAttention with Roberta->AltRoberta +class AltRobertaSelfAttention(nn.Module): + def __init__(self, config, position_embedding_type=None): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = position_embedding_type or getattr( + config, "position_embedding_type", "absolute" + ) + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + + self.is_decoder = config.is_decoder + + def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + use_cache = past_key_value is not None + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + query_length, key_length = query_layer.shape[2], key_layer.shape[2] + if use_cache: + position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view( + -1, 1 + ) + else: + position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in AltRobertaModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + if self.is_decoder: + outputs = outputs + (past_key_value,) + return outputs + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaSelfOutput +class AltRobertaSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +ALT_ROBERTA_SELF_ATTENTION_CLASSES = { + "eager": AltRobertaSelfAttention, +} + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaAttention with Roberta->AltRoberta,ROBERTA->ALT_ROBERTA +class AltRobertaAttention(nn.Module): + def __init__(self, config, position_embedding_type=None): + super().__init__() + self.self = ALT_ROBERTA_SELF_ATTENTION_CLASSES[config._attn_implementation]( + config, position_embedding_type=position_embedding_type + ) + self.output = AltRobertaSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaIntermediate with Roberta->AltRoberta +class AltRobertaIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaOutput +class AltRobertaOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaLayer with Roberta->AltRoberta +class AltRobertaLayer(nn.Module): + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = AltRobertaAttention(config) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + if not self.is_decoder: + raise ValueError(f"{self} should be used as a decoder model if cross attention is added") + self.crossattention = AltRobertaAttention(config, position_embedding_type="absolute") + self.intermediate = AltRobertaIntermediate(config) + self.output = AltRobertaOutput(config) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + + # if decoder, the last output is tuple of self-attn cache + if self.is_decoder: + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + else: + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + cross_attn_present_key_value = None + if self.is_decoder and encoder_hidden_states is not None: + if not hasattr(self, "crossattention"): + raise ValueError( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers" + " by setting `config.add_cross_attention=True`" + ) + + # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + cross_attn_past_key_value, + output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights + + # add cross-attn cache to positions 3,4 of present_key_value tuple + cross_attn_present_key_value = cross_attention_outputs[-1] + present_key_value = present_key_value + cross_attn_present_key_value + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output + ) + outputs = (layer_output,) + outputs + + # if decoder, return the attn key/values as the last output + if self.is_decoder: + outputs = outputs + (present_key_value,) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaEncoder with Roberta->AltRoberta +class AltRobertaEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList([AltRobertaLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + return_dict: Optional[bool] = True, + ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + next_decoder_cache = () if use_cache else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + layer_module.__call__, + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaPooler +class AltRobertaPooler(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +# Copied from transformers.models.siglip.modeling_siglip.eager_attention_forward +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: float, + dropout: float = 0.0, + **kwargs, +): + attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling + if attention_mask is not None: + attn_weights = attn_weights + attention_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + + attn_output = torch.matmul(attn_weights, value) + attn_output = attn_output.transpose(1, 2).contiguous() + return attn_output, attn_weights + + +class AltCLIPAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." + ) + self.scale = self.head_dim**-0.5 + self.dropout = config.attention_dropout + self.is_causal = False + + self.k_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.v_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.q_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.out_proj = nn.Linear(self.embed_dim, self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + causal_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + """Input shape: Batch x Time x Channel""" + + batch_size, seq_length, embed_dim = hidden_states.shape + + queries = self.q_proj(hidden_states) + keys = self.k_proj(hidden_states) + values = self.v_proj(hidden_states) + + queries = queries.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2) + keys = keys.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2) + values = values.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2) + # CLIP text model uses both `causal_attention_mask` and `attention_mask` + # in case FA2 kernel is called, `is_causal` should be inferred from `causal_attention_mask` + if self.config._attn_implementation != "flash_attention_2": + if attention_mask is not None and causal_attention_mask is not None: + attention_mask = attention_mask + causal_attention_mask + elif causal_attention_mask is not None: + attention_mask = causal_attention_mask + else: + self.is_causal = causal_attention_mask is not None + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + if self.config._attn_implementation == "sdpa" and output_attentions: + logger.warning_once( + "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to " + 'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' + ) + else: + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + queries, + keys, + values, + attention_mask, + is_causal=self.is_causal, + scaling=self.scale, + dropout=0.0 if not self.training else self.dropout, + ) + + attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous() + attn_output = self.out_proj(attn_output) + if not output_attentions: + attn_weights = None + return attn_output, attn_weights + + +# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->AltCLIP +class AltCLIPMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.activation_fn = ACT2FN[config.hidden_act] + self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) + self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +class AltCLIPEncoderLayer(nn.Module): + def __init__(self, config: AltCLIPConfig): + super().__init__() + self.embed_dim = config.hidden_size + self.self_attn = AltCLIPAttention(config) + self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) + self.mlp = AltCLIPMLP(config) + self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + causal_attention_mask: torch.Tensor, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.FloatTensor]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + `(config.encoder_attention_heads,)`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + hidden_states = self.layer_norm1(hidden_states) + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + causal_attention_mask=causal_attention_mask, + output_attentions=output_attentions, + ) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class AltCLIPEncoder(nn.Module): + """ + Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a + [`AltCLIPEncoderLayer`]. + + Args: + config: AltCLIPConfig + """ + + def __init__(self, config: AltCLIPConfig): + super().__init__() + self.config = config + self.layers = nn.ModuleList([AltCLIPEncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + inputs_embeds, + attention_mask: Optional[torch.Tensor] = None, + causal_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Causal mask for the text model. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + hidden_states = inputs_embeds + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + encoder_layer.__call__, + hidden_states, + attention_mask, + causal_attention_mask, + output_attentions, + ) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + causal_attention_mask, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +# Copied from transformers.models.clip.modeling_clip.CLIPVisionEmbeddings with CLIP->AltCLIP +class AltCLIPVisionEmbeddings(nn.Module): + def __init__(self, config: AltCLIPVisionConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.class_embedding = nn.Parameter(torch.randn(self.embed_dim)) + + self.patch_embedding = nn.Conv2d( + in_channels=config.num_channels, + out_channels=self.embed_dim, + kernel_size=self.patch_size, + stride=self.patch_size, + bias=False, + ) + + self.num_patches = (self.image_size // self.patch_size) ** 2 + self.num_positions = self.num_patches + 1 + self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) + self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False) + + def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: + """ + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution + images. This method is also adapted to support torch.jit tracing. + + Adapted from: + - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and + - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211 + """ + + num_patches = embeddings.shape[1] - 1 + position_embedding = self.position_embedding.weight.unsqueeze(0) + num_positions = position_embedding.shape[1] - 1 + + # always interpolate when tracing to ensure the exported model works for dynamic input shapes + if not torch.jit.is_tracing() and num_patches == num_positions and height == width: + return self.position_embedding(self.position_ids) + + class_pos_embed = position_embedding[:, :1] + patch_pos_embed = position_embedding[:, 1:] + + dim = embeddings.shape[-1] + + new_height = height // self.patch_size + new_width = width // self.patch_size + + sqrt_num_positions = torch_int(num_positions**0.5) + patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) + patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed, + size=(new_height, new_width), + mode="bicubic", + align_corners=False, + ) + + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + + return torch.cat((class_pos_embed, patch_pos_embed), dim=1) + + def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor: + batch_size, _, height, width = pixel_values.shape + if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size): + raise ValueError( + f"Input image size ({height}*{width}) doesn't match model ({self.image_size}*{self.image_size})." + ) + target_dtype = self.patch_embedding.weight.dtype + patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid] + patch_embeds = patch_embeds.flatten(2).transpose(1, 2) + + class_embeds = self.class_embedding.expand(batch_size, 1, -1) + embeddings = torch.cat([class_embeds, patch_embeds], dim=1) + if interpolate_pos_encoding: + embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) + else: + embeddings = embeddings + self.position_embedding(self.position_ids) + return embeddings + + +class AltCLIPPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = AltCLIPConfig + base_model_prefix = "altclip" + supports_gradient_checkpointing = True + _no_split_module = [] + + def _init_weights(self, module): + """Initialize the weights""" + factor = self.config.initializer_factor + if isinstance(module, AltCLIPVisionEmbeddings): + factor = self.config.initializer_factor + nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor) + nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor) + nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor) + elif isinstance(module, AltCLIPAttention): + factor = self.config.initializer_factor + in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor + out_proj_std = (module.embed_dim**-0.5) * factor + nn.init.normal_(module.q_proj.weight, std=in_proj_std) + nn.init.normal_(module.k_proj.weight, std=in_proj_std) + nn.init.normal_(module.v_proj.weight, std=in_proj_std) + nn.init.normal_(module.out_proj.weight, std=out_proj_std) + elif isinstance(module, AltCLIPMLP): + factor = self.config.initializer_factor + in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor + fc_std = (2 * module.config.hidden_size) ** -0.5 * factor + nn.init.normal_(module.fc1.weight, std=fc_std) + nn.init.normal_(module.fc2.weight, std=in_proj_std) + elif isinstance(module, AltCLIPModel): + nn.init.normal_( + module.text_projection.weight, + std=module.text_embed_dim**-0.5 * self.config.initializer_factor, + ) + module.text_projection._is_hf_initialized = True + nn.init.normal_( + module.visual_projection.weight, + std=module.vision_embed_dim**-0.5 * self.config.initializer_factor, + ) + module.visual_projection._is_hf_initialized = True + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_factor) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_factor) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +class AltCLIPVisionTransformer(nn.Module): + def __init__(self, config: AltCLIPVisionConfig): + super().__init__() + self.config = config + embed_dim = config.hidden_size + + self.embeddings = AltCLIPVisionEmbeddings(config) + self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) + self.encoder = AltCLIPEncoder(config) + self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) + + @add_start_docstrings_to_model_forward(ALTCLIP_VISION_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=AltCLIPVisionConfig) + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + interpolate_pos_encoding: Optional[bool] = False, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + r""" + Returns: + + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) + hidden_states = self.pre_layrnorm(hidden_states) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = encoder_outputs[0] + pooled_output = last_hidden_state[:, 0, :] + pooled_output = self.post_layernorm(pooled_output) + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +class AltCLIPVisionModel(AltCLIPPreTrainedModel): + config_class = AltCLIPVisionConfig + main_input_name = "pixel_values" + + def __init__(self, config: AltCLIPVisionConfig): + super().__init__(config) + self.vision_model = AltCLIPVisionTransformer(config) + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> nn.Module: + return self.vision_model.embeddings.patch_embedding + + @add_start_docstrings_to_model_forward(ALTCLIP_VISION_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=AltCLIPVisionConfig) + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + r""" + Returns: + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, AltCLIPVisionModel + + >>> model = AltCLIPVisionModel.from_pretrained("BAAI/AltCLIP") + >>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> last_hidden_state = outputs.last_hidden_state + >>> pooled_output = outputs.pooler_output # pooled CLS states + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + return self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, + return_dict=return_dict, + ) + + +class AltRobertaModel(AltCLIPPreTrainedModel): + """ + + The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of + cross-attention is added between the self-attention layers, following the architecture described in *Attention is + all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz + Kaiser and Illia Polosukhin. + + To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set + to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and + `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass. + + .. _*Attention is all you need*: https://arxiv.org/abs/1706.03762 + + """ + + config_class = AltCLIPTextConfig + + # Copied from transformers.models.clap.modeling_clap.ClapTextModel.__init__ with ClapText->AltRoberta + def __init__(self, config, add_pooling_layer=True): + super().__init__(config) + self.config = config + + self.embeddings = AltRobertaEmbeddings(config) + self.encoder = AltRobertaEncoder(config) + + self.pooler = AltRobertaPooler(config) if add_pooling_layer else None + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + # Copied from transformers.models.clap.modeling_clap.ClapTextModel.forward + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: + r""" + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.config.is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + batch_size, seq_length = input_shape + device = input_ids.device if input_ids is not None else inputs_embeds.device + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + + if attention_mask is None: + attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device) + + if token_type_ids is None: + if hasattr(self.embeddings, "token_type_ids"): + buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + +class AltCLIPTextModel(AltCLIPPreTrainedModel): + config_class = AltCLIPTextConfig + + def __init__(self, config): + super().__init__(config) + self.roberta = AltRobertaModel(config, add_pooling_layer=False) + self.transformation = nn.Linear(config.hidden_size, config.project_dim) + self.pre_LN = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.post_init() + + def get_input_embeddings(self) -> nn.Module: + return self.roberta.embeddings.word_embeddings + + def set_input_embeddings(self, value: nn.Embedding) -> None: + self.roberta.embeddings.word_embeddings = value + + def resize_token_embeddings(self, new_num_tokens: Optional[int] = None) -> nn.Embedding: + return super().resize_token_embeddings(new_num_tokens) + + @add_start_docstrings_to_model_forward(ALTCLIP_TEXT_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=BaseModelOutputWithPoolingAndProjection, config_class=AltCLIPTextConfig) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPoolingAndProjection]: + r""" + Returns: + + Examples: + + ```python + >>> from transformers import AutoProcessor, AltCLIPTextModel + + >>> model = AltCLIPTextModel.from_pretrained("BAAI/AltCLIP") + >>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP") + + >>> texts = ["it's a cat", "it's a dog"] + + >>> inputs = processor(text=texts, padding=True, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> last_hidden_state = outputs.last_hidden_state + >>> pooled_output = outputs.pooler_output # pooled CLS states + ```""" + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.roberta( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + # last module outputs + sequence_output = outputs[0] + + # project every module + sequence_output = self.pre_LN(sequence_output) + + # pooler + projection_state = self.transformation(sequence_output) + pooler_output = projection_state[:, 0] + + if not return_dict: + return (projection_state, pooler_output) + outputs[2:4] + + return BaseModelOutputWithPoolingAndProjection( + last_hidden_state=projection_state, + pooler_output=pooler_output, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class AltCLIPModel(AltCLIPPreTrainedModel): + config_class = AltCLIPConfig + + def __init__(self, config: AltCLIPConfig): + super().__init__(config) + + if not isinstance(config.vision_config, AltCLIPVisionConfig): + raise TypeError( + "config.vision_config is expected to be of type AltCLIPVisionConfig but is of type" + f" {type(config.vision_config)}." + ) + if not isinstance(config.text_config, AltCLIPTextConfig): + raise TypeError( + "config.text_config is expected to be of type AltCLIPTextConfig but is of type" + f" {type(config.text_config)}." + ) + + text_config = config.text_config + vision_config = config.vision_config + + self.projection_dim = config.projection_dim + self.text_embed_dim = text_config.project_dim + self.vision_embed_dim = vision_config.hidden_size + + self.text_model = AltCLIPTextModel(text_config) + self.vision_model = AltCLIPVisionTransformer(vision_config) + + self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False) + self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False) + self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value)) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(ALTCLIP_TEXT_INPUTS_DOCSTRING) + def get_text_features( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + token_type_ids=None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> torch.FloatTensor: + r""" + Returns: + text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by + applying the projection layer to the pooled output of [`AltCLIPTextModel`]. + + Examples: + + ```python + >>> from transformers import AutoProcessor, AltCLIPModel + + >>> model = AltCLIPModel.from_pretrained("BAAI/AltCLIP") + >>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP") + >>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") + >>> text_features = model.get_text_features(**inputs) + ```""" + # Use AltCLIP model's config for some fields (if specified) instead of those of vision & text components. + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + text_outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + token_type_ids=token_type_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + pooled_output = text_outputs[1] + text_features = self.text_projection(pooled_output) + + return text_features + + @add_start_docstrings_to_model_forward(ALTCLIP_VISION_INPUTS_DOCSTRING) + def get_image_features( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + return_dict: Optional[bool] = None, + ) -> torch.FloatTensor: + r""" + Returns: + image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by + applying the projection layer to the pooled output of [`AltCLIPVisionModel`]. + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, AltCLIPModel + + >>> model = AltCLIPModel.from_pretrained("BAAI/AltCLIP") + >>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP") + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> inputs = processor(images=image, return_tensors="pt") + >>> image_features = model.get_image_features(**inputs) + ```""" + # Use AltCLIP model's config for some fields (if specified) instead of those of vision & text components. + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, + return_dict=return_dict, + ) + + pooled_output = vision_outputs[1] # pooled_output + image_features = self.visual_projection(pooled_output) + + return image_features + + @add_start_docstrings_to_model_forward(ALTCLIP_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=AltCLIPOutput, config_class=AltCLIPConfig) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + return_loss: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, AltCLIPOutput]: + r""" + Returns: + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, AltCLIPModel + + >>> model = AltCLIPModel.from_pretrained("BAAI/AltCLIP") + >>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP") + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> inputs = processor( + ... text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True + ... ) + >>> outputs = model(**inputs) + >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score + >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities + ```""" + # Use AltCLIP model's config for some fields (if specified) instead of those of vision & text components. + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + text_outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, + return_dict=return_dict, + ) + + image_embeds = vision_outputs[1] + image_embeds = self.visual_projection(image_embeds) + + text_embeds = text_outputs[1] + text_embeds = self.text_projection(text_embeds) + + # normalized features + image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True) + text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True) + + # cosine similarity as logits + logit_scale = self.logit_scale.exp() + logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale + logits_per_image = logits_per_text.T + + loss = None + if return_loss: + loss = clip_loss(logits_per_text) + + if not return_dict: + output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs) + return ((loss,) + output) if loss is not None else output + + return AltCLIPOutput( + loss=loss, + logits_per_image=logits_per_image, + logits_per_text=logits_per_text, + text_embeds=text_embeds, + image_embeds=image_embeds, + text_model_output=text_outputs, + vision_model_output=vision_outputs, + ) + + +# Copied from transformers.models.roberta.modeling_roberta.create_position_ids_from_input_ids +def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0): + """ + Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols + are ignored. This is modified from fairseq's `utils.make_positions`. + + Args: + x: torch.Tensor x: + + Returns: torch.Tensor + """ + # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA. + mask = input_ids.ne(padding_idx).int() + incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask + return incremental_indices.long() + padding_idx + + +__all__ = ["AltCLIPPreTrainedModel", "AltCLIPVisionModel", "AltCLIPTextModel", "AltCLIPModel"] diff --git a/docs/transformers/build/lib/transformers/models/altclip/processing_altclip.py b/docs/transformers/build/lib/transformers/models/altclip/processing_altclip.py new file mode 100644 index 0000000000000000000000000000000000000000..3ce4f2481d4905c651d1abcbe7fc020e92d2c001 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/altclip/processing_altclip.py @@ -0,0 +1,148 @@ +# coding=utf-8 +# Copyright 2022 WenXiang ZhongzhiCheng LedellWu LiuGuang BoWenZhang The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Image/Text processor class for AltCLIP +""" + +from typing import List, Union + +from ...image_utils import ImageInput +from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack +from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput +from ...utils.deprecation import deprecate_kwarg + + +class AltClipProcessorKwargs(ProcessingKwargs, total=False): + _defaults = {} + + +class AltCLIPProcessor(ProcessorMixin): + r""" + Constructs a AltCLIP processor which wraps a CLIP image processor and a XLM-Roberta tokenizer into a single + processor. + + [`AltCLIPProcessor`] offers all the functionalities of [`CLIPImageProcessor`] and [`XLMRobertaTokenizerFast`]. See + the [`~AltCLIPProcessor.__call__`] and [`~AltCLIPProcessor.decode`] for more information. + + Args: + image_processor ([`CLIPImageProcessor`], *optional*): + The image processor is a required input. + tokenizer ([`XLMRobertaTokenizerFast`], *optional*): + The tokenizer is a required input. + """ + + attributes = ["image_processor", "tokenizer"] + image_processor_class = ("CLIPImageProcessor", "CLIPImageProcessorFast") + tokenizer_class = ("XLMRobertaTokenizer", "XLMRobertaTokenizerFast") + + @deprecate_kwarg(old_name="feature_extractor", version="5.0.0", new_name="image_processor") + def __init__(self, image_processor=None, tokenizer=None): + if image_processor is None: + raise ValueError("You need to specify an `image_processor`.") + if tokenizer is None: + raise ValueError("You need to specify a `tokenizer`.") + + super().__init__(image_processor, tokenizer) + + def __call__( + self, + images: ImageInput = None, + text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, + audio=None, + videos=None, + **kwargs: Unpack[AltClipProcessorKwargs], + ) -> BatchEncoding: + """ + Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` + and `kwargs` arguments to XLMRobertaTokenizerFast's [`~XLMRobertaTokenizerFast.__call__`] if `text` is not + `None` to encode the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to + CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring + of the above two methods for more information. + + Args: + + images (`ImageInput`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. Both channels-first and channels-last formats are supported. + text (`TextInput`, `PreTokenizedInput`, `List[TextInput]`, `List[PreTokenizedInput]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Acceptable values are: + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. + Returns: + [`BatchEncoding`]: A [`BatchEncoding`] with the following fields: + + - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). + - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. + """ + + if text is None and images is None: + raise ValueError("You must specify either text or images.") + + if text is None and images is None: + raise ValueError("You must specify either text or images.") + output_kwargs = self._merge_kwargs( + AltClipProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + + if text is not None: + encoding = self.tokenizer(text, **output_kwargs["text_kwargs"]) + if images is not None: + image_features = self.image_processor(images, **output_kwargs["images_kwargs"]) + + # BC for explicit return_tensors + if "return_tensors" in output_kwargs["common_kwargs"]: + return_tensors = output_kwargs["common_kwargs"].pop("return_tensors", None) + + if text is not None and images is not None: + encoding["pixel_values"] = image_features.pixel_values + return encoding + elif text is not None: + return encoding + else: + return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors) + + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to XLMRobertaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. + Please refer to the docstring of this method for more information. + """ + return self.tokenizer.batch_decode(*args, **kwargs) + + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to XLMRobertaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please + refer to the docstring of this method for more information. + """ + return self.tokenizer.decode(*args, **kwargs) + + @property + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + image_processor_input_names = self.image_processor.model_input_names + return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) + + +__all__ = ["AltCLIPProcessor"] diff --git a/docs/transformers/build/lib/transformers/models/aria/__init__.py b/docs/transformers/build/lib/transformers/models/aria/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f73301321527c185cfab149b171a38f5fd4f7852 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/aria/__init__.py @@ -0,0 +1,30 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_aria import * + from .image_processing_aria import * + from .modeling_aria import * + from .processing_aria import * + +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/docs/transformers/build/lib/transformers/models/aria/configuration_aria.py b/docs/transformers/build/lib/transformers/models/aria/configuration_aria.py new file mode 100644 index 0000000000000000000000000000000000000000..f3faa60ca3d5920ad9f43596719c928c54d48b7b --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/aria/configuration_aria.py @@ -0,0 +1,307 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/aria/modular_aria.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_aria.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2024 The Rhymes-AI Teams Authors and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Dict + +from ...configuration_utils import PretrainedConfig +from ...modeling_rope_utils import rope_config_validation +from ..auto import CONFIG_MAPPING, AutoConfig + + +class AriaTextConfig(PretrainedConfig): + r""" + This class handles the configuration for the text component of the Aria model. + Instantiating a configuration with the defaults will yield a similar configuration to that of the model of the Aria + [rhymes-ai/Aria](https://huggingface.co/rhymes-ai/Aria) architecture. + This class extends the LlamaConfig to include additional parameters specific to the Mixture of Experts (MoE) architecture. + + Args: + vocab_size (`int`, *optional*, defaults to 32000): + Vocabulary size of the LLaMA model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`LlamaModel`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 4096): + The size of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer decoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer decoder. + num_key_value_heads (`int`, *optional*): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to + `num_attention_heads`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 2048): + The maximum sequence length that this model might ever be used with. Llama 1 supports up to 2048 tokens, + Llama 2 up to 4096, CodeLlama up to 16384. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + pad_token_id (`int`, *optional*, defaults to 2): + Padding token id. + bos_token_id (`int`, *optional*, defaults to 1): + Beginning of stream token id. + eos_token_id (`int`, *optional*, defaults to 2): + End of stream token id. + pretraining_tp (`int`, *optional*, defaults to 1): + Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this + document](https://huggingface.co/docs/transformers/main/perf_train_gpu_many#tensor-parallelism) to + understand more about it. This value is necessary to ensure exact reproducibility of the pretraining + results. Please refer to [this issue](https://github.com/pytorch/pytorch/issues/76232). + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + rope_scaling (`Dict`, *optional*): + Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type + and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value + accordingly. + Expected contents: + `rope_type` (`str`): + The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', + 'llama3'], with 'default' being the original RoPE implementation. + `factor` (`float`, *optional*): + Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In + most scaling types, a `factor` of x will enable the model to handle sequences of length x * + original maximum pre-trained length. + `original_max_position_embeddings` (`int`, *optional*): + Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during + pretraining. + `attention_factor` (`float`, *optional*): + Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention + computation. If unspecified, it defaults to value recommended by the implementation, using the + `factor` field to infer the suggested value. + `beta_fast` (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear + ramp function. If unspecified, it defaults to 32. + `beta_slow` (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear + ramp function. If unspecified, it defaults to 1. + `short_factor` (`List[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to short contexts (< + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + `long_factor` (`List[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to long contexts (< + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + `low_freq_factor` (`float`, *optional*): + Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE + `high_freq_factor` (`float`, *optional*): + Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + attention_bias (`bool`, *optional*, defaults to `False`): + Whether to use a bias in the query, key, value and output projection layers during self-attention. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + mlp_bias (`bool`, *optional*, defaults to `False`): + Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers. + head_dim (`int`, *optional*): + The attention head dimension. If None, it will default to hidden_size // num_heads + moe_num_experts (`int`, *optional*, defaults to 8): + The number of experts in the MoE layer. + moe_topk (`int`, *optional*, defaults to 2): + The number of top experts to route to for each token. + moe_num_shared_experts (`int`, *optional*, defaults to 2): + The number of shared experts. + """ + + model_type = "aria_text" + keys_to_ignore_at_inference = ["past_key_values"] + # Default tensor parallel plan for base model `AriaTextModel` + base_model_tp_plan = { + "layers.*.self_attn.q_proj": "colwise", + "layers.*.self_attn.k_proj": "colwise", + "layers.*.self_attn.v_proj": "colwise", + "layers.*.self_attn.o_proj": "rowwise", + "layers.*.mlp.gate_proj": "colwise", + "layers.*.mlp.up_proj": "colwise", + "layers.*.mlp.down_proj": "rowwise", + } + base_model_pp_plan = { + "embed_tokens": (["input_ids"], ["inputs_embeds"]), + "layers": (["hidden_states", "attention_mask"], ["hidden_states"]), + "norm": (["hidden_states"], ["hidden_states"]), + } + base_config_key = "text_config" + + def __init__( + self, + vocab_size=32000, + hidden_size=4096, + intermediate_size: int = 4096, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=None, + hidden_act="silu", + max_position_embeddings=2048, + initializer_range=0.02, + rms_norm_eps=1e-6, + use_cache=True, + pad_token_id=2, + bos_token_id=1, + eos_token_id=2, + pretraining_tp=1, + tie_word_embeddings=False, + rope_theta=10000.0, + rope_scaling=None, + attention_bias=False, + attention_dropout=0.0, + mlp_bias=False, + head_dim=None, + moe_num_experts: int = 8, + moe_topk: int = 2, + moe_num_shared_experts: int = 2, + **kwargs, + ): + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.pretraining_tp = pretraining_tp + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + self.mlp_bias = mlp_bias + self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads + # Validate the correctness of rotary position embeddings parameters + # BC: if there is a 'type' field, copy it it to 'rope_type'. + if self.rope_scaling is not None and "type" in self.rope_scaling: + self.rope_scaling["rope_type"] = self.rope_scaling["type"] + rope_config_validation(self) + self.moe_num_experts = moe_num_experts + self.moe_topk = moe_topk + self.moe_num_shared_experts = moe_num_shared_experts + + +class AriaConfig(PretrainedConfig): + r""" + This class handles the configuration for both vision and text components of the Aria model, + as well as additional parameters for image token handling and projector mapping. + Instantiating a configuration with the defaults will yield a similar configuration to that of the model of the Aria + [rhymes-ai/Aria](https://huggingface.co/rhymes-ai/Aria) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vision_config (`AriaVisionConfig` or `dict`, *optional*): + Configuration for the vision component. + vision_feature_layer (`int`, *optional*, defaults to -1): + The index of the layer to select the vision feature. + text_config (`AriaTextConfig` or `dict`, *optional*): + Configuration for the text component. + projector_patch_to_query_dict (`dict`, *optional*): + Mapping of patch sizes to query dimensions. + image_token_index (`int`, *optional*, defaults to 9): + Index used to represent image tokens. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated normal initializer for initializing all weight matrices. + + Attributes: + model_type (`str`): + Type of the model, set to `"aria"`. + image_token_index (`int`): + Index used to represent image tokens. + projector_patch_to_query_dict (`dict`): + Mapping of patch sizes to query dimensions. + vision_config (`AriaVisionConfig`): + Configuration for the vision component. + text_config (`AriaTextConfig`): + Configuration for the text component. + """ + + model_type = "aria" + attribute_map = { + "image_token_id": "image_token_index", + } + sub_configs = {"text_config": AriaTextConfig, "vision_config": AutoConfig} + + def __init__( + self, + vision_config=None, + vision_feature_layer: int = -1, + text_config: AriaTextConfig = None, + projector_patch_to_query_dict: Dict = None, + image_token_index: int = 9, + initializer_range: float = 0.02, + **kwargs, + ): + self.image_token_index = image_token_index + + # Convert the keys and values of projector_patch_to_query_dict to integers + # This ensures consistency even if they were provided as strings + if projector_patch_to_query_dict is None: + projector_patch_to_query_dict = { + 1225: 128, + 4900: 256, + } + self.projector_patch_to_query_dict = {int(k): int(v) for k, v in projector_patch_to_query_dict.items()} + self.max_value_projector_patch_to_query_dict = max(self.projector_patch_to_query_dict.values()) + self.vision_feature_layer = vision_feature_layer + if isinstance(vision_config, dict): + vision_config["model_type"] = "idefics3_vision" + vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config) + elif vision_config is None: + vision_config = CONFIG_MAPPING["idefics3_vision"]() + + self.vision_config = vision_config + self.initializer_range = initializer_range + + if isinstance(text_config, dict) and "model_type" in text_config: + text_config = AriaTextConfig(**text_config) + elif text_config is None: + text_config = AriaTextConfig() + + self.text_config = text_config + + super().__init__(**kwargs) + + +__all__ = ["AriaConfig", "AriaTextConfig"] diff --git a/docs/transformers/build/lib/transformers/models/aria/convert_aria_weights_to_hf.py b/docs/transformers/build/lib/transformers/models/aria/convert_aria_weights_to_hf.py new file mode 100644 index 0000000000000000000000000000000000000000..a95f3cda8349c45da1870720063e381f43d34438 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/aria/convert_aria_weights_to_hf.py @@ -0,0 +1,162 @@ +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import glob + +import torch +from huggingface_hub import snapshot_download +from safetensors import safe_open + +from transformers import ( + AddedToken, + AriaForConditionalGeneration, + AriaProcessor, + AutoConfig, + AutoTokenizer, +) + + +EPILOG_TXT = """Example: + python transformers/src/transformers/models/aria/convert_aria_weights_to_hf.py --text_model_id rhymes-ai/Aria --vision_model_id rhymes-ai/Aria --output_hub_path m-ric/Aria_hf_2 --old_state_dict_id rhymes-ai/Aria + +Example for creating the old state dict file with Python: + + import torch + from aria.model.language_model.aria_llama import AriaTextForCausalLM + + # load model + kwargs = {"device_map": "auto", "torch_dtype": torch.float16} + model = AriaTextForCausalLM.from_pretrained("rhymes-ai/Aria", low_cpu_mem_usage=True, **kwargs) + + # load vision tower + model.get_vision_tower().load_model() + + # Save state dict + torch.save(model.state_dict(), "tmp/hf_models/aria/model_state_dict.bin") +""" + +KEYS_TO_MODIFY_MAPPING = { + "vision_tower.vision_model": "vision_tower", + "ln_ffn": "layer_norm", + "ffn": "feed_forward", + "ln_kv": "layer_norm_kv", +} + + +def load_original_state_dict(model_id): + directory_path = snapshot_download(repo_id=model_id, allow_patterns=["*.safetensors"]) + + original_state_dict = {} + for path in glob.glob(f"{directory_path}/*"): + if path.endswith(".safetensors"): + with safe_open(path, framework="pt", device="cpu") as f: + for key in f.keys(): + original_state_dict[key] = f.get_tensor(key) + + return original_state_dict + + +def convert_state_dict_to_hf(state_dict): + new_state_dict = {} + for key, value in state_dict.items(): + if key.endswith(".inv_freq"): + continue + for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items(): + if key_to_modify in key: + key = key.replace(key_to_modify, new_key) + + new_state_dict[key] = value + new_state_dict["vision_tower.post_layernorm.weight"] = torch.zeros((1152,)) + new_state_dict["vision_tower.post_layernorm.bias"] = torch.zeros((1152,)) + + return new_state_dict + + +def convert_aria_llama_to_hf(text_model_id, vision_model_id, output_hub_path, old_state_dict_id): + torch.set_default_dtype(torch.float16) + + tokenizer = AutoTokenizer.from_pretrained( + text_model_id, + extra_special_tokens={ + "image_token": "<|img|>", + "pad_token": "", + }, + ) + tokenizer.add_tokens(AddedToken("<|img|>", special=True, normalized=False), special_tokens=True) + tokenizer.add_special_tokens({"pad_token": ""}) + tokenizer.chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}{% elif message['content'] is iterable %}{% for item in message['content'] %}{% if item['type'] == 'text' %}{{ item['text'] }}{% elif item['type'] == 'image' %}<|img|>{% endif %}{% endfor %}{% endif %}<|im_end|>\n{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}" + + processor = AriaProcessor.from_pretrained( + text_model_id, + tokenizer=tokenizer, + ) + + config = AutoConfig.from_pretrained(text_model_id) + config.vision_config.hidden_size = 1152 + config.vision_config.attention_heads = 16 + config.pad_token_id = 2 + config.image_token_id = 9 + config.intermediate_size = config.moe_intermediate_size + config.auto_map = { + "AutoConfig": "modeling_aria.AriaConfig", + "AutoModelForCausalLM": "modeling_aria.AriaForConditionalGeneration", + } + + with torch.device("meta"): + model = AriaForConditionalGeneration(config) + + state_dict = load_original_state_dict(old_state_dict_id) + + state_dict = convert_state_dict_to_hf(state_dict) + model.load_state_dict(state_dict, strict=False, assign=True) + + # print("Saving models") + # model.save_pretrained("local_aria", safe_serialization=False) + # processor.save_pretrained("local_aria") + print("Pushing to hub") + model.push_to_hub(output_hub_path, create_pr=True) + processor.push_to_hub(output_hub_path, create_pr=True) + + +def main(): + parser = argparse.ArgumentParser( + epilog=EPILOG_TXT, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument( + "--text_model_id", + default="rhymes-ai/Aria", + help="Hub location of the text model", + ) + parser.add_argument( + "--vision_model_id", + default="rhymes-ai/Aria", + help="Hub location of the vision model", + ) + parser.add_argument( + "--output_hub_path", + default="rhymes-ai/Aria", + help="Location on the hub of the converted model", + ) + parser.add_argument( + "--old_state_dict_id", + default="rhymes-ai/Aria", + help="Location on the hub of the raw state dict of the original model. The filename needs to be `model_state_dict.bin`", + ) + args = parser.parse_args() + convert_aria_llama_to_hf(args.text_model_id, args.vision_model_id, args.output_hub_path, args.old_state_dict_id) + + +if __name__ == "__main__": + main() diff --git a/docs/transformers/build/lib/transformers/models/aria/image_processing_aria.py b/docs/transformers/build/lib/transformers/models/aria/image_processing_aria.py new file mode 100644 index 0000000000000000000000000000000000000000..364f8f70df1f01d1200e0455083ea128d1befa50 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/aria/image_processing_aria.py @@ -0,0 +1,518 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/aria/modular_aria.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_aria.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2024 The Rhymes-AI Teams Authors and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import math +from typing import Iterable, List, Optional, Tuple, Union + +import numpy as np + +from ...image_processing_utils import BaseImageProcessor, BatchFeature, select_best_resolution +from ...image_transforms import PaddingMode, convert_to_rgb, pad, resize, to_channel_dimension_format +from ...image_utils import ( + ChannelDimension, + ImageInput, + PILImageResampling, + get_image_size, + infer_channel_dimension_format, + is_scaled_image, + make_flat_list_of_images, + to_numpy_array, + valid_images, + validate_preprocess_arguments, +) +from ...utils import TensorType, logging + + +logger = logging.get_logger(__name__) + + +def divide_to_patches(image: np.array, patch_size: int, input_data_format) -> List[np.array]: + """ + Divides an image into patches of a specified size. + + Args: + image (`np.array`): + The input image. + patch_size (`int`): + The size of each patch. + input_data_format (`ChannelDimension` or `str`): + The channel dimension format of the input image. + + Returns: + list: A list of np.array representing the patches. + """ + patches = [] + height, width = get_image_size(image, channel_dim=input_data_format) + for i in range(0, height, patch_size): + for j in range(0, width, patch_size): + if input_data_format == ChannelDimension.LAST: + patch = image[i : i + patch_size, j : j + patch_size] + else: + patch = image[:, i : i + patch_size, j : j + patch_size] + patches.append(patch) + + return patches + + +def _get_patch_output_size(image, target_resolution, input_data_format): + original_height, original_width = get_image_size(image, channel_dim=input_data_format) + target_height, target_width = target_resolution + + scale_w = target_width / original_width + scale_h = target_height / original_height + + if scale_w < scale_h: + new_width = target_width + new_height = min(math.ceil(original_height * scale_w), target_height) + else: + new_height = target_height + new_width = min(math.ceil(original_width * scale_h), target_width) + + return new_height, new_width + + +class AriaImageProcessor(BaseImageProcessor): + """ + A vision processor for the Aria model that handles image preprocessing. + Initialize the AriaImageProcessor. + + Args: + image_mean (`list`, *optional*, defaults to [0.5, 0.5, 0.5]): + Mean values for normalization. + image_std (`list`, *optional*, defaults to [0.5, 0.5, 0.5]): + Standard deviation values for normalization. + max_image_size (`int`, *optional*, defaults to 980): + Maximum image size. + min_image_size (`int`, *optional*, defaults to 336): + Minimum image size. + split_resolutions (`list`, *optional*, defaults to a list of optimal,resolutions as tuples): + The optimal resolutions for splitting the image. + split_image (`bool`, *optional*, defaults to `False`): + Whether to split the image. + do_convert_rgb (`bool`, *optional*, defaults to `True`): + Whether to convert the image to RGB. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in + the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess` + method. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. + resample (PILImageResampling, *optional*, defaults to `BICUBIC`): + The resampling filter to use if resizing the image. + """ + + model_input_names = ["pixel_values", "pixel_mask", "num_crops"] + + def __init__( + self, + image_mean: List[float] = None, + image_std: List[float] = None, + max_image_size: int = 980, + min_image_size: int = 336, + split_resolutions: Optional[List[Tuple[int, int]]] = None, + split_image: Optional[bool] = False, + do_convert_rgb: Optional[bool] = True, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: Optional[bool] = True, + resample: PILImageResampling = PILImageResampling.BICUBIC, + **kwargs, + ): + super().__init__(**kwargs) + + if image_mean is None: + image_mean = [0.5, 0.5, 0.5] + if image_std is None: + image_std = [0.5, 0.5, 0.5] + self.max_image_size = max_image_size + self.min_image_size = min_image_size + self.image_mean = image_mean + self.image_std = image_std + self.split_image = split_image + if split_resolutions is None: + split_resolutions = [(1, 2), (1, 3), (1, 4), (1, 5), (1, 6), (1, 7), (1, 8), (2, 4), (2, 3), (2, 2), (2, 1), (3, 1), (3, 2), (4, 1), (4, 2), (5, 1), (6, 1), (7, 1), (8, 1)] # fmt: skip + split_resolutions = [(el[0] * 490, el[1] * 490) for el in split_resolutions] + self.split_resolutions = split_resolutions + self.do_convert_rgb = do_convert_rgb + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.resample = resample + + def preprocess( + self, + images: Union[ImageInput, List[ImageInput]], + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + max_image_size: Optional[int] = None, + min_image_size: Optional[int] = None, + split_image: Optional[bool] = None, + do_convert_rgb: Optional[bool] = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + resample: PILImageResampling = None, + return_tensors: Optional[Union[str, TensorType]] = "pt", + data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ): + """ + Process a list of images. + + Args: + images (ImageInput or list of ImageInput): + The input image or a list of images. + image_mean (`list`, *optional*, defaults to [0.5, 0.5, 0.5]): + Mean values for normalization. + image_std (`list`, *optional*, defaults to [0.5, 0.5, 0.5]): + Standard deviation values for normalization. + max_image_size (`int`, *optional*, defaults to `self.max_image_size` (980)): + Maximum image size. + min_image_size (`int`, *optional*, defaults to `self.min_image_size` (336)): + Minimum image size. + split_image (`bool`, *optional*, defaults to `self.split_image` (False)): + Whether to split the image. + do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb` (True)): + Whether to convert the image to RGB. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize` (True)): + Whether to normalize the image. + resample (PILImageResampling, *optional*, defaults to `self.resample` (BICUBIC)): + The resampling filter to use if resizing the image. + return_tensors (`str` or `TensorType`, *optional*, defaults to "pt"): + The type of tensor to return. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: + image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: + image in (height, width, num_channels) format. + If unset, will use same as the input image. + input_data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: + image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: + image in (height, width, num_channels) format. + If unset, will use the inferred format of the input image. + + Returns: + BatchFeature: + A BatchFeature object containing: + - 'pixel_values': + Tensor of processed image pixel values. + - 'pixel_mask': + Boolean pixel mask. This mask is a 2D tensor of shape (max_image_size, max_image_size) where: + - True (1) values indicate pixels that belong to the original resized image. + - False (0) values indicate pixels that are part of the padding. + The mask helps distinguish between actual image content and padded areas in subsequent processing steps. + - 'num_crops': + The maximum number of crops across all images. + """ + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + max_image_size = max_image_size if max_image_size is not None else self.max_image_size + min_image_size = min_image_size if min_image_size is not None else self.min_image_size + split_image = split_image if split_image is not None else self.split_image + do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + resample = resample if resample is not None else self.resample + + if max_image_size not in [490, 980]: + raise ValueError("max_image_size must be either 490 or 980") + + images = make_flat_list_of_images(images) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + + validate_preprocess_arguments( + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + resample=resample, + do_rescale=do_rescale, + rescale_factor=rescale_factor, + ) + + if do_convert_rgb: + images = [convert_to_rgb(image) for image in images] + + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + + if do_rescale and is_scaled_image(images[0]): + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + + if input_data_format is None: + # We assume that all images have the same channel dimension format. + input_data_format = infer_channel_dimension_format(images[0]) + + pixel_values = [] + pixel_masks = [] + num_crops = None + + for image in images: + if split_image: + crop_images = self.get_image_patches( + image, + self.split_resolutions, + max_image_size, + resample, + data_format=input_data_format, + input_data_format=input_data_format, + ) + else: + crop_images = [image] + if num_crops is None or len(crop_images) > num_crops: + num_crops = len(crop_images) + + for crop_image in crop_images: + # At this point the scale is the rescaling factor that would bring the image to max_size in its larger dimension + h, w = get_image_size(crop_image) + scale = max_image_size / max(h, w) + if w >= h: + new_size = (max(int(h * scale), min_image_size), max_image_size) # h, w + else: + new_size = (max_image_size, max(int(w * scale), min_image_size)) # h, w + + crop_image_resized = resize( + crop_image, + new_size, + resample=resample, + data_format=input_data_format, + input_data_format=input_data_format, + ) + + padding_bottom, padding_right = max_image_size - new_size[0], max_image_size - new_size[1] + crop_image_padded = pad( + crop_image_resized, + ((0, padding_bottom), (0, padding_right)), + data_format=input_data_format, + input_data_format=input_data_format, + ) + + # Create a pixel mask + pixel_mask = np.zeros((max_image_size, max_image_size), dtype=bool) + pixel_mask[: new_size[0], : new_size[1]] = 1 + pixel_masks.append(pixel_mask) + + if do_rescale: + crop_image_padded = self.rescale( + image=crop_image_padded, scale=rescale_factor, input_data_format=input_data_format + ) + + if do_normalize: + crop_image_padded = self.normalize( + crop_image_padded, + self.image_mean, + self.image_std, + data_format=input_data_format, + input_data_format=input_data_format, + ) + crop_image_padded = ( + to_channel_dimension_format(crop_image_padded, data_format, input_data_format) + if data_format is not None + else crop_image_padded + ) + + pixel_values.append(crop_image_padded) + return BatchFeature( + data={ + "pixel_values": np.stack(pixel_values, axis=0), + "pixel_mask": np.stack(pixel_masks, axis=0), + "num_crops": num_crops, + }, + tensor_type=return_tensors, + ) + + def _resize_for_patching( + self, image: np.array, target_resolution: tuple, resample, input_data_format: ChannelDimension + ) -> np.array: + """ + Resizes an image to a target resolution while maintaining aspect ratio. + + Args: + image (np.array): + The input image. + target_resolution (tuple): + The target resolution (height, width) of the image. + resample (`PILImageResampling`): + Resampling filter to use if resizing the image. + input_data_format (`ChannelDimension` or `str`): + The channel dimension format of the input image. + + Returns: + np.array: The resized and padded image. + """ + new_height, new_width = _get_patch_output_size(image, target_resolution, input_data_format) + + # Resize the image + resized_image = resize(image, (new_height, new_width), resample=resample, input_data_format=input_data_format) + + return resized_image + + def _pad_for_patching( + self, image: np.array, target_resolution: tuple, input_data_format: ChannelDimension + ) -> np.array: + """ + Pad an image to a target resolution while maintaining aspect ratio. + """ + target_height, target_width = target_resolution + new_height, new_width = _get_patch_output_size(image, target_resolution, input_data_format) + + paste_x = (target_width - new_width) // 2 + paste_y = (target_height - new_height) // 2 + + padded_image = self.pad(image, padding=((paste_y, paste_y), (paste_x, paste_x))) + + return padded_image + + def pad( + self, + image: np.ndarray, + padding: Union[int, Tuple[int, int], Iterable[Tuple[int, int]]], + mode: PaddingMode = PaddingMode.CONSTANT, + constant_values: Union[float, Iterable[float]] = 0.0, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> np.ndarray: + """ + Pads the `image` with the specified `padding` and `mode`. Padding can be in the (`height`, `width`) + dimension of in the (`num_patches`) dimension. In the second case an iterable if tuples is expected + as input. + + Args: + image (`np.ndarray`): + The image to pad. + padding (`int` or `Tuple[int, int]` or `Iterable[Tuple[int, int]]`): + Padding to apply to the edges of the height, width axes. Can be one of three formats: + - `((before_height, after_height), (before_width, after_width))` unique pad widths for each axis. + - `((before, after),)` yields same before and after pad for height and width. + - `(pad,)` or int is a shortcut for before = after = pad width for all axes. + mode (`PaddingMode`): + The padding mode to use. Can be one of: + - `"constant"`: pads with a constant value. + - `"reflect"`: pads with the reflection of the vector mirrored on the first and last values of the + vector along each axis. + - `"replicate"`: pads with the replication of the last value on the edge of the array along each axis. + - `"symmetric"`: pads with the reflection of the vector mirrored along the edge of the array. + constant_values (`float` or `Iterable[float]`, *optional*): + The value to use for the padding if `mode` is `"constant"`. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + If unset, will use same as the input image. + input_data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + If unset, will use the inferred format of the input image. + + Returns: + `np.ndarray`: The padded image. + + """ + + # call the general `pad` if padding on `height/width`, otherwise it's the `num_patched` dim + if isinstance(padding, int) or len(padding) != 4: + return pad(image, padding, mode, constant_values, data_format, input_data_format) + + if input_data_format is None: + input_data_format = infer_channel_dimension_format(image) + + padding_mode_mapping = { + PaddingMode.CONSTANT: "constant", + PaddingMode.REFLECT: "reflect", + PaddingMode.REPLICATE: "edge", + PaddingMode.SYMMETRIC: "symmetric", + } + image = np.pad(image, padding, mode=padding_mode_mapping[mode], constant_values=constant_values) + image = ( + to_channel_dimension_format(image, data_format, input_data_format) if data_format is not None else image + ) + return image + + def get_image_patches( + self, + image: np.array, + grid_pinpoints: List[Tuple[int, int]], + patch_size: int, + resample: PILImageResampling, + data_format: ChannelDimension, + input_data_format: ChannelDimension, + ) -> List[np.array]: + """ + Process an image with variable resolutions by dividing it into patches. + + Args: + image (`np.array`): + The input image to be processed. + grid_pinpoints (List[Tuple[int, int]]): + A list of possible resolutions as tuples. + patch_size (`int`): + Size of the patches to divide the image into. + resample (`PILImageResampling`): + Resampling filter to use if resizing the image. + data_format (`ChannelDimension` or `str`): + The channel dimension format for the output image. + input_data_format (`ChannelDimension` or `str`): + The channel dimension format of the input image. + + Returns: + `List[np.array]`: A list of NumPy arrays containing the processed image patches. + """ + if not isinstance(grid_pinpoints, list): + raise TypeError("grid_pinpoints must be a list of possible resolutions.") + + possible_resolutions = grid_pinpoints + + image_size = get_image_size(image, channel_dim=input_data_format) + best_resolution = select_best_resolution(image_size, possible_resolutions) + resized_image = self._resize_for_patching( + image, best_resolution, resample=resample, input_data_format=input_data_format + ) + padded_image = self._pad_for_patching(resized_image, best_resolution, input_data_format=input_data_format) + + patches = divide_to_patches(padded_image, patch_size=patch_size, input_data_format=input_data_format) + + # make sure that all patches are in the input data format + patches = [ + to_channel_dimension_format(patch, channel_dim=data_format, input_channel_dim=input_data_format) + for patch in patches + ] + return patches + + +__all__ = ["AriaImageProcessor"] diff --git a/docs/transformers/build/lib/transformers/models/aria/modeling_aria.py b/docs/transformers/build/lib/transformers/models/aria/modeling_aria.py new file mode 100644 index 0000000000000000000000000000000000000000..1b9892c94f85b4b1eed091f02fb9e299fb24e548 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/aria/modeling_aria.py @@ -0,0 +1,1584 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/aria/modular_aria.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_aria.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2024 The Rhymes-AI Teams Authors and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from dataclasses import dataclass +from typing import Callable, List, Optional, Tuple, Union + +from ...activations import ACT2FN +from ...cache_utils import Cache, DynamicCache, StaticCache +from ...generation import GenerationMixin +from ...integrations import use_kernel_forward_from_hub +from ...modeling_attn_mask_utils import AttentionMaskConverter +from ...modeling_flash_attention_utils import FlashAttentionKwargs +from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, ModelOutput +from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...processing_utils import Unpack +from ...utils import ( + LossKwargs, + add_start_docstrings, + add_start_docstrings_to_model_forward, + can_return_tuple, + is_torch_flex_attn_available, + logging, + replace_return_docstrings, +) +from ...utils.import_utils import is_torch_available +from ..auto import AutoModel, AutoModelForCausalLM +from .configuration_aria import AriaConfig, AriaTextConfig + + +if is_torch_available(): + import torch + from torch import nn + + +if is_torch_flex_attn_available(): + from torch.nn.attention.flex_attention import BlockMask + + from ...integrations.flex_attention import make_flex_block_causal_mask + + +logger = logging.get_logger(__name__) +_CONFIG_FOR_DOC = "AriaTextConfig" + + +@use_kernel_forward_from_hub("RMSNorm") +class AriaTextRMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + AriaTextRMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + def extra_repr(self): + return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" + + +class AriaProjectorMLP(nn.Module): + """ + Feed-Forward Network module for the Aria Projector. + + Args: + in_features (`int`): + Input embedding dimension. + hidden_features (`int`): + Hidden dimension of the feed-forward network. + output_dim (`int`): + Output dimension. + """ + + def __init__(self, in_features, hidden_features, output_dim): + super().__init__() + self.linear_in = nn.Linear(in_features, hidden_features, bias=False) + self.linear_out = nn.Linear(hidden_features, output_dim, bias=False) + self.act = ACT2FN["gelu_new"] + + def forward(self, hidden_states): + hidden_states = self.act(self.linear_in(hidden_states)) + hidden_states = self.linear_out(hidden_states) + return hidden_states + + +class AriaCrossAttention(nn.Module): + """ + Aria Cross-Attention module. + + Args: + config (`AriaConfig`): + The configuration to use. + """ + + def __init__(self, config: AriaConfig, dropout_rate: float = 0): + super().__init__() + hidden_size = config.vision_config.hidden_size + num_heads = config.vision_config.num_attention_heads + self.num_heads = num_heads + self.q_proj = nn.Linear(hidden_size, hidden_size, bias=False) + self.k_proj = nn.Linear(hidden_size, hidden_size, bias=False) + self.v_proj = nn.Linear(hidden_size, hidden_size, bias=False) + + # Original code here: https://github.com/rhymes-ai/Aria/blob/719ff4e52b727443cba3793b0e27fe64e0244fe1/aria/model/projector.py#L48 + self.multihead_attn = nn.MultiheadAttention(hidden_size, num_heads, batch_first=True) + self.linear = nn.Linear(hidden_size, hidden_size) + self.dropout = nn.Dropout(dropout_rate) + + self.layer_norm = nn.LayerNorm(hidden_size) + self.layer_norm_kv = nn.LayerNorm(hidden_size) + + def forward(self, key_value_states, hidden_states, attn_mask=None): + """ + Forward pass of the AriaCrossAttention module. + + Args: + key_value_states (`torch.Tensor`): + Input tensor for key and value. + hidden_states (`torch.Tensor`): + Input tensor for query. + attn_mask (`torch.Tensor`, *optional*, defaults to None): + Attention mask. + + Returns: + torch.Tensor: + Output tensor after cross-attention. + """ + query = self.q_proj(self.layer_norm(hidden_states)) + + key_value_states = self.layer_norm_kv(key_value_states) + key = self.k_proj(key_value_states) + value = self.v_proj(key_value_states) + + attn_output, _ = self.multihead_attn(query, key, value, attn_mask=attn_mask) + + attn_output = self.dropout(self.linear(attn_output)) + + return attn_output + + +class AriaProjector(nn.Module): + """ + Aria Projector module. + + This module projects vision features into the language model's embedding space, enabling interaction between vision and language components. + + Args: + config (`AriaConfig`): + Configuration object for the model. + """ + + def __init__( + self, + config: AriaConfig, + ): + super().__init__() + + self.patch_to_query_dict = config.projector_patch_to_query_dict + self.in_features = config.vision_config.hidden_size + self.num_heads = config.vision_config.num_attention_heads + self.kv_dim = config.vision_config.hidden_size + self.hidden_features = config.text_config.hidden_size + self.output_dim = config.text_config.hidden_size + + self.query = nn.Parameter(torch.zeros(config.max_value_projector_patch_to_query_dict, self.in_features)) + + self.cross_attn = AriaCrossAttention(config) + + self.layer_norm = nn.LayerNorm(self.in_features) + self.feed_forward = AriaProjectorMLP(self.in_features, self.hidden_features, self.output_dim) + + def forward(self, key_value_states: torch.Tensor, attn_mask: Optional[torch.Tensor] = None): + """ + Forward pass of the Projector module. + + Args: + key_value_states (`torch.Tensor`): + Input tensor of shape (batch_size, num_patches, kv_dim). + attn_mask (`torch.Tensor`, *optional*, default is None): + Attention mask. + + Returns: + `torch.Tensor`: Output tensor of shape (batch_size, query_number, output_dim). + """ + batch_size, num_patches = key_value_states.shape[0], key_value_states.shape[1] + + if num_patches not in self.patch_to_query_dict.keys(): + raise KeyError( + f"Number of patches {num_patches} not found in patch_to_query_dict amongst possible values {self.patch_to_query_dict.keys()}." + ) + query_num = self.patch_to_query_dict[num_patches] + + queries = self.query[:query_num].unsqueeze(0).repeat(batch_size, 1, 1) + + if attn_mask is not None: + attn_mask = attn_mask.repeat_interleave(self.num_heads, 0) + attn_mask = attn_mask.unsqueeze(1).expand(-1, queries.size(1), -1) + + attention_out = self.cross_attn(key_value_states, queries, attn_mask=attn_mask) + + out = self.feed_forward(self.layer_norm(attention_out)) + + return out + + +class AriaSharedExpertsMLP(nn.Module): + """ + Shared Expert MLP for shared experts. + + Unlike routed experts, shared experts process all tokens without routing. + This class reconfigures the intermediate size in comparison to the LlamaMLP. + + Args: + config (`AriaTextConfig`): Configuration object for the Aria language model. + """ + + def __init__(self, config: AriaTextConfig): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size * config.moe_num_shared_experts + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias) + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + return down_proj + + +def sequential_experts_gemm(token_states, expert_weights, tokens_per_expert): + """ + Compute the matrix multiplication (GEMM) for each expert sequentially. This approach is computationally inefficient, especially when dealing with a large number of experts. + + Args: + token_states (torch.Tensor): Input tensor of shape (num_tokens, in_features). + expert_weights (torch.Tensor): Weight tensor of shape (num_experts, in_features, out_features). + tokens_per_expert (torch.Tensor): Number of tokens assigned to each expert. + + Returns: + torch.Tensor: Output tensor of shape (num_tokens, out_features). + """ + num_tokens = token_states.shape[0] + out_features = expert_weights.shape[-1] + output = torch.zeros(num_tokens, out_features, dtype=token_states.dtype, device=token_states.device) + + cumsum_num_tokens = torch.cumsum(tokens_per_expert, dim=0) + # Insert zero at the beginning for offset index's convenience + zero_tensor = torch.zeros(1, dtype=torch.long, device=cumsum_num_tokens.device) + cumsum_num_tokens = torch.cat((zero_tensor, cumsum_num_tokens)) + + for expert_num in range(expert_weights.shape[0]): + start = cumsum_num_tokens[expert_num] + end = cumsum_num_tokens[expert_num + 1] + tokens = token_states[start:end] + + out = torch.matmul(tokens, expert_weights[expert_num]) + output[start:end] = out + return output + + +class AriaGroupedExpertsGemm(nn.Module): + """ + Grouped GEMM (General Matrix Multiplication) module for efficient expert computation. + This module utilizes the grouped_gemm library (https://github.com/fanshiqing/grouped_gemm) + for optimized performance. If the grouped_gemm library is not installed, it gracefully + falls back to a sequential GEMM implementation, which may be slower but ensures + functionality. + + Args: + in_features (`int`): + Number of input features. + out_features (`int`): + Number of output features. + groups (`int`): + Number of expert groups. + """ + + def __init__(self, in_features, out_features, groups): + super().__init__() + self.in_features = in_features + self.out_features = out_features + self.groups = groups + self.weight = nn.Parameter(torch.empty(groups, in_features, out_features)) + + def forward(self, input, tokens_per_expert): + """ + Perform grouped matrix multiplication. + + Args: + input (`torch.Tensor`): + Input tensor of shape (num_tokens, in_features). + tokens_per_expert (`torch.Tensor`): + Number of tokens assigned to each expert. + + Returns: + torch.Tensor: Output tensor of shape (num_tokens, out_features). + """ + return sequential_experts_gemm( + input, + self.weight, + tokens_per_expert.cpu(), + ) + + +class AriaGroupedExpertsMLP(nn.Module): + """ + Grouped MLP module for Mixture of Experts. + + Args: + config (`AriaTextConfig`): + Configuration object for the model. + """ + + def __init__(self, config: AriaTextConfig) -> None: + super().__init__() + self.config = config + self.fc1 = AriaGroupedExpertsGemm(config.hidden_size, config.intermediate_size * 2, config.moe_num_experts) + self.fc2 = AriaGroupedExpertsGemm(config.intermediate_size, config.hidden_size, config.moe_num_experts) + + def forward(self, permuted_tokens, tokens_per_expert): + """ + Forward pass of the Grouped MLP. + + Args: + permuted_tokens (torch.Tensor): Permuted input tokens. + tokens_per_expert (torch.Tensor): Number of tokens assigned to each expert. + + Returns: + torch.Tensor: Output tensor after passing through the MLP. + """ + fc1_output = self.fc1(permuted_tokens, tokens_per_expert) + projection, gate = torch.chunk(fc1_output, 2, dim=-1) + fc1_output = nn.functional.silu(projection) * gate + fc2_output = self.fc2(fc1_output, tokens_per_expert) + return fc2_output + + +# Token permutation adapted from https://github.com/NVIDIA/Megatron-LM/blob/54f1f78529cbc2b9cddad313e7f9d96ac0420a27/megatron/core/transformer/moe/token_dispatcher.py#L291-L587 +class AriaTextMoELayer(nn.Module): + """ + Aria Text Mixture of Experts (MoE) Layer. + + This layer applies a gating mechanism to route input tokens to different experts. + + Args: + config (`AriaTextConfig`): + Configuration object for the text component of the model. + """ + + def __init__(self, config: AriaTextConfig): + super().__init__() + + self.router = nn.Linear(config.hidden_size, config.moe_num_experts, bias=False) + self.experts = AriaGroupedExpertsMLP(config) + self.shared_experts = AriaSharedExpertsMLP(config) + self.config = config + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + """ + Forward pass of the MoE Layer. + + Args: + hidden_states (`torch.Tensor`): + Input tensor of shape (batch_size, sequence_length, hidden_size). + + Returns: + torch.Tensor: Output tensor after passing through the MoE layer. + + Process: + 1. Route tokens to experts using the router. + 2. Permute tokens based on routing decisions. + 3. Process tokens through experts. + 4. Unpermute and combine expert outputs. + 5. Add shared expert output to the final result. + """ + original_shape = hidden_states.shape + hidden_states = hidden_states.view(-1, hidden_states.size(-1)) + + # Top K Routing + logits = self.router(hidden_states) + top_logits, top_indices = torch.topk(logits, k=self.config.moe_topk, dim=1) + scores = nn.functional.softmax(top_logits, dim=-1) + + original_dtype = top_indices.dtype + + tokens_per_expert = torch.histc( + top_indices.flatten().to(torch.float32), + bins=self.config.moe_num_experts, + min=0, + max=self.config.moe_num_experts - 1, + ).to(original_dtype) + indices = top_indices + + # Token permutation + flatten_indices = indices.view(-1) + sorted_indices = torch.argsort(flatten_indices) + permuted_tokens = hidden_states.index_select(0, sorted_indices // self.config.moe_topk) + + # Process through experts + expert_output = self.experts(permuted_tokens, tokens_per_expert) + + # Token unpermutation + unpermuted_tokens = torch.zeros( + (scores.shape[0] * self.config.moe_topk, expert_output.size(1)), + dtype=expert_output.dtype, + device=expert_output.device, + ) + unpermuted_tokens.index_copy_(0, sorted_indices, expert_output) + unpermuted_tokens = unpermuted_tokens.view(-1, self.config.moe_topk, expert_output.size(1)) + + output = (unpermuted_tokens * scores.unsqueeze(-1)).sum(dim=1).view(original_shape) + + # Add shared expert output + shared_expert_output = self.shared_experts(hidden_states.view(original_shape)) + return output + shared_expert_output + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: float, + dropout: float = 0.0, + **kwargs, +): + key_states = repeat_kv(key, module.num_key_value_groups) + value_states = repeat_kv(value, module.num_key_value_groups) + + attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling + if attention_mask is not None: + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + attn_weights = attn_weights + causal_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + attn_output = torch.matmul(attn_weights, value_states) + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + + +class AriaTextAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: AriaTextConfig, layer_idx: int): + super().__init__() + self.config = config + self.layer_idx = layer_idx + self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) + self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads + self.scaling = self.head_dim**-0.5 + self.attention_dropout = config.attention_dropout + self.is_causal = True + + self.q_proj = nn.Linear( + config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias + ) + self.k_proj = nn.Linear( + config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias + ) + self.v_proj = nn.Linear( + config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias + ) + self.o_proj = nn.Linear( + config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias + ) + + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: Tuple[torch.Tensor, torch.Tensor], + attention_mask: Optional[torch.Tensor], + past_key_value: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[FlashAttentionKwargs], + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + input_shape = hidden_states.shape[:-1] + hidden_shape = (*input_shape, -1, self.head_dim) + + query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2) + key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2) + value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) + + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_value is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + attention_interface: Callable = eager_attention_forward + + if self.config._attn_implementation != "eager": + if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False): + logger.warning_once( + "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to " + 'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' + ) + else: + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + dropout=0.0 if not self.training else self.attention_dropout, + scaling=self.scaling, + **kwargs, + ) + + attn_output = attn_output.reshape(*input_shape, -1).contiguous() + attn_output = self.o_proj(attn_output) + return attn_output, attn_weights + + +class AriaTextDecoderLayer(GradientCheckpointingLayer): + """ + Aria Text Decoder Layer. + + This class defines a single decoder layer in the language model, incorporating self-attention and Mixture of Experts (MoE) feed-forward network. + + Args: + config (`AriaTextConfig`): + Configuration object for the text component of the model. + layer_idx (`int`): + Index of the layer. + """ + + def __init__(self, config: AriaTextConfig, layer_idx: int): + super().__init__() + self.hidden_size = config.hidden_size + + self.self_attn = AriaTextAttention(config=config, layer_idx=layer_idx) + self.mlp = AriaTextMoELayer(config) + self.input_layernorm = AriaTextRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = AriaTextRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Cache] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + **kwargs: Unpack[FlashAttentionKwargs], + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + position_embeddings=position_embeddings, + **kwargs, + ) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + if output_attentions: + outputs += (self_attn_weights,) + + return outputs + + +class AriaTextPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. + """ + + config_class = AriaConfig + base_model_prefix = "model" + _no_split_modules = ["AriaTextDecoderLayer", "AriaGroupedExpertsGemm"] + supports_gradient_checkpointing = True + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn_2 = False + _supports_sdpa = True + _supports_cache_class = True + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, AriaTextRMSNorm): + module.weight.data.fill_(1.0) + elif isinstance(module, AriaGroupedExpertsGemm): + module.weight.data.normal_(mean=0.0, std=std) + + +ARIA_TEXT_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`AriaTextConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +@add_start_docstrings( + "The bare Aria Model outputting raw hidden-states without any specific head on top.", + ARIA_TEXT_START_DOCSTRING, +) +class AriaPreTrainedModel(PreTrainedModel): + config_class = AriaTextConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["AriaDecoderLayer"] + _skip_keys_device_placement = ["past_key_values"] + _supports_flash_attn_2 = True + _supports_sdpa = True + _supports_flex_attn = True + _supports_cache_class = True + _supports_quantized_cache = True + _supports_static_cache = False # MoE models don't work with torch.compile (dynamic slicing) + _supports_attention_backend = False + + def _init_weights(self, module): + std = self.config.initializer_range + + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.MultiheadAttention): + # This uses torch's original init + module._reset_parameters() + elif isinstance(module, nn.LayerNorm): + module.weight.data.fill_(1.0) + module.bias.data.zero_() + elif isinstance(module, AriaProjector): + nn.init.trunc_normal_(module.query, std=std) + + +class AriaTextRotaryEmbedding(nn.Module): + def __init__(self, config: AriaTextConfig, device=None): + super().__init__() + # BC: "rope_type" was originally "type" + if hasattr(config, "rope_scaling") and config.rope_scaling is not None: + self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) + else: + self.rope_type = "default" + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + + inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.original_inv_freq = self.inv_freq + + @torch.no_grad() + @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) + def forward(self, x, position_ids): + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) + position_ids_expanded = position_ids[:, None, :].float() + + device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): # Force float32 + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() * self.attention_scaling + sin = emb.sin() * self.attention_scaling + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + +ARIA_TEXT_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length) or `BlockMask`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + If the model is configured to use flex_attention, it will attempt to convert the mask Tensor into a BlockMask, + but you can also pass a `BlockMask` object directly here. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`Cache`, *optional*): + Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` + returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. + + It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache). + + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't + have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` + of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`, + this tensor is not affected by padding. It is used to update the cache in the correct position and to infer + the complete sequence length. +""" + + +@add_start_docstrings( + "The bare AriaText Model outputting raw hidden-states without any specific head on top.", + ARIA_TEXT_START_DOCSTRING, +) +class AriaTextModel(AriaTextPreTrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`AriaTextDecoderLayer`] + + Args: + config: AriaTextConfig + """ + + def __init__(self, config: AriaTextConfig): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList( + [AriaTextDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.norm = AriaTextRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.rotary_emb = AriaTextRotaryEmbedding(config=config) + self.gradient_checkpointing = False + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + @can_return_tuple + @add_start_docstrings_to_model_forward(ARIA_TEXT_INPUTS_DOCSTRING) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + **flash_attn_kwargs: Unpack[FlashAttentionKwargs], + ) -> BaseModelOutputWithPast: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if self.gradient_checkpointing and self.training and use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`." + ) + use_cache = False + + # TODO (joao): remove this exception in v4.56 -- it exists for users that try to pass a legacy cache + if not isinstance(past_key_values, (type(None), Cache)): + raise ValueError("The `past_key_values` should be either a `Cache` object or `None`.") + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if use_cache and past_key_values is None: + past_key_values = DynamicCache() + + if cache_position is None: + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + cache_position = torch.arange( + past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device + ) + + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + causal_mask = self._update_causal_mask( + attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions + ) + + hidden_states = inputs_embeds + + # create position embeddings to be shared across the decoder layers + position_embeddings = self.rotary_emb(hidden_states, position_ids) + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + + for decoder_layer in self.layers[: self.config.num_hidden_layers]: + if output_hidden_states: + all_hidden_states += (hidden_states,) + + layer_outputs = decoder_layer( + hidden_states, + attention_mask=causal_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + position_embeddings=position_embeddings, + **flash_attn_kwargs, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=past_key_values if use_cache else None, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + def _update_causal_mask( + self, + attention_mask: Union[torch.Tensor, "BlockMask"], + input_tensor: torch.Tensor, + cache_position: torch.Tensor, + past_key_values: Cache, + output_attentions: bool = False, + ): + if self.config._attn_implementation == "flash_attention_2": + if attention_mask is not None and (attention_mask == 0.0).any(): + return attention_mask + return None + if self.config._attn_implementation == "flex_attention": + if isinstance(attention_mask, torch.Tensor): + attention_mask = make_flex_block_causal_mask(attention_mask) + return attention_mask + + # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in + # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail + # to infer the attention mask. + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + using_static_cache = isinstance(past_key_values, StaticCache) + + # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward + if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions: + if AttentionMaskConverter._ignore_causal_mask_sdpa( + attention_mask, + inputs_embeds=input_tensor, + past_key_values_length=past_seen_tokens, + is_training=self.training, + ): + return None + + dtype, device = input_tensor.dtype, input_tensor.device + sequence_length = input_tensor.shape[1] + if using_static_cache: + target_length = past_key_values.get_max_cache_shape() + else: + target_length = ( + attention_mask.shape[-1] + if isinstance(attention_mask, torch.Tensor) + else past_seen_tokens + sequence_length + 1 + ) + + # In case the provided `attention` mask is 2D, we generate a causal mask here (4D). + causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position( + attention_mask, + sequence_length=sequence_length, + target_length=target_length, + dtype=dtype, + device=device, + cache_position=cache_position, + batch_size=input_tensor.shape[0], + ) + + if ( + self.config._attn_implementation == "sdpa" + and attention_mask is not None + and attention_mask.device.type in ["cuda", "xpu", "npu"] + and not output_attentions + ): + # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when + # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. + # Details: https://github.com/pytorch/pytorch/issues/110213 + min_dtype = torch.finfo(dtype).min + causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) + + return causal_mask + + @staticmethod + def _prepare_4d_causal_attention_mask_with_cache_position( + attention_mask: torch.Tensor, + sequence_length: int, + target_length: int, + dtype: torch.dtype, + device: torch.device, + cache_position: torch.Tensor, + batch_size: int, + **kwargs, + ): + """ + Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape + `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing. + + Args: + attention_mask (`torch.Tensor`): + A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape + `(batch_size, 1, query_length, key_value_length)`. + sequence_length (`int`): + The sequence length being processed. + target_length (`int`): + The target length: when generating with static cache, the mask should be as long as the static cache, + to account for the 0 padding, the part of the cache that is not filled yet. + dtype (`torch.dtype`): + The dtype to use for the 4D attention mask. + device (`torch.device`): + The device to place the 4D attention mask on. + cache_position (`torch.Tensor`): + Indices depicting the position of the input sequence tokens in the sequence. + batch_size (`torch.Tensor`): + Batch size. + """ + if attention_mask is not None and attention_mask.dim() == 4: + # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. + causal_mask = attention_mask + else: + min_dtype = torch.finfo(dtype).min + causal_mask = torch.full( + (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device + ) + if sequence_length != 1: + causal_mask = torch.triu(causal_mask, diagonal=1) + causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to( + causal_mask.device + ) + padding_mask = padding_mask == 0 + causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + padding_mask, min_dtype + ) + + return causal_mask + + +class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ... + + +class AriaTextForCausalLM(AriaTextPreTrainedModel, GenerationMixin): + """ + Aria model for causal language modeling tasks. + + This class extends `LlamaForCausalLM` to incorporate the Mixture of Experts (MoE) approach, + allowing for more efficient and scalable language modeling. + + Args: + config (`AriaTextConfig`): + Configuration object for the model. + """ + + _tied_weights_keys = ["lm_head.weight"] + _tp_plan = {"lm_head": "colwise_rep"} + _pp_plan = {"lm_head": (["hidden_states"], ["logits"])} + config_class = AriaTextConfig + + def __init__(self, config: AriaTextConfig): + super().__init__(config) + self.model = AriaTextModel(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + @can_return_tuple + @add_start_docstrings_to_model_forward(ARIA_TEXT_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Cache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + **kwargs: Unpack[KwargsForCausalLM], + ) -> CausalLMOutputWithPast: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + logits_to_keep (`int` or `torch.Tensor`, *optional*): + If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all + `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that + token can save memory, which becomes pretty significant for long sequences or large vocabulary size. + If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension. + This is useful when using packed tensor format (single dimension for batch and sequence length). + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, AriaTextForCausalLM + + >>> model = AriaTextForCausalLM.from_pretrained("meta-aria_text/AriaText-2-7b-hf") + >>> tokenizer = AutoTokenizer.from_pretrained("meta-aria_text/AriaText-2-7b-hf") + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs: BaseModelOutputWithPast = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = outputs.last_hidden_state + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + logits = self.lm_head(hidden_states[:, slice_indices, :]) + + loss = None + if labels is not None: + loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs) + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@dataclass +class AriaCausalLMOutputWithPast(ModelOutput): + """ + Base class for Aria causal language model (or autoregressive) outputs. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss (for next-token prediction). + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + image_hidden_states (`torch.FloatTensor`, *optional*): + A `torch.FloatTensor` of size (batch_size, num_images, sequence_length, hidden_size)`. + image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state. + """ + + loss: Optional[torch.FloatTensor] = None + logits: Optional[torch.FloatTensor] = None + past_key_values: Optional[List[torch.FloatTensor]] = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + image_hidden_states: Optional[torch.FloatTensor] = None + + +ARIA_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor`, *optional*): + Input token IDs. + pixel_values (`torch.FloatTensor`, *optional*): + Pixel values of the images. + pixel_mask (`torch.LongTensor`, *optional*): + Mask for the pixel values. + attention_mask (`torch.Tensor`, *optional*): + Attention mask. + position_ids (`torch.LongTensor`, *optional*): + Position IDs. + past_key_values (`List[torch.FloatTensor]`, *optional*): + Past key values for efficient processing. + inputs_embeds (`torch.FloatTensor`, *optional*): + Input embeddings. + labels (`torch.LongTensor`, *optional*): + Labels for computing the language modeling loss. + use_cache (`bool`, *optional*): + Whether to use the model's cache mechanism. + output_attentions (`bool`, *optional*): + Whether to output attention weights. + output_hidden_states (`bool`, *optional*): + Whether to output hidden states. + return_dict (`bool`, *optional*): + Whether to return a `ModelOutput` object. + logits_to_keep (`int` or `torch.Tensor`, *optional*, defaults to 0): + If an `int`, calculate logits for the last `logits_to_keep` tokens, or all `input_ids` if `0`. + Otherwise, slice according to the 1D tensor in the sequence length dimension + cache_position (`torch.LongTensor`, *optional*): + Cache positions. + **loss_kwargs: + Additional keyword arguments for loss calculation. +""" + +ARIA_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config (`AriaConfig`): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +@add_start_docstrings( + """Aria model for conditional generation tasks. + + This model combines a vision tower, a multi-modal projector, and a language model + to perform tasks that involve both image and text inputs.""", + ARIA_START_DOCSTRING, +) +class AriaForConditionalGeneration(AriaPreTrainedModel, GenerationMixin): + config_class = AriaConfig + _supports_flash_attn_2 = False + _supports_flex_attn = False + _supports_sdpa = False + _tied_weights_keys = ["language_model.lm_head.weight"] + + def __init__(self, config: AriaConfig): + super().__init__(config) + + self.vision_tower = AutoModel.from_config(config.vision_config) + self.multi_modal_projector = AriaProjector(config) + self.vocab_size = config.text_config.vocab_size + self.language_model = AutoModelForCausalLM.from_config(config.text_config) + self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1 + self._use_flash_attention_2 = config.text_config._attn_implementation == "flash_attention_2" + self.post_init() + + def _create_patch_attention_mask(self, pixel_mask): + if pixel_mask is None: + return None + + patches_subgrid = pixel_mask.unfold( + dimension=1, + size=self.vision_tower.config.patch_size, + step=self.vision_tower.config.patch_size, + ) + patches_subgrid = patches_subgrid.unfold( + dimension=2, + size=self.vision_tower.config.patch_size, + step=self.vision_tower.config.patch_size, + ) + return (patches_subgrid.sum(dim=(-1, -2)) > 0).bool() + + def get_input_embeddings(self): + return self.language_model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.language_model.set_input_embeddings(value) + + def get_output_embeddings(self): + return self.language_model.get_output_embeddings() + + def set_output_embeddings(self, new_embeddings): + self.language_model.set_output_embeddings(new_embeddings) + + def set_decoder(self, decoder): + self.language_model.set_decoder(decoder) + + def get_decoder(self): + return self.language_model.get_decoder() + + def get_image_features( + self, + pixel_values: torch.FloatTensor, + pixel_mask: Optional[torch.FloatTensor] = None, + vision_feature_layer: int = -1, + ): + patch_attention_mask = self._create_patch_attention_mask(pixel_mask) + image_outputs = self.vision_tower( + pixel_values, patch_attention_mask=patch_attention_mask, output_hidden_states=True + ) + image_attn_mask = None + if patch_attention_mask is not None: + flattened_mask = patch_attention_mask.flatten(1) + image_attn_mask = torch.logical_not(flattened_mask) + + selected_image_feature = image_outputs.hidden_states[vision_feature_layer] + image_features = self.multi_modal_projector(selected_image_feature, attn_mask=image_attn_mask) + return image_features + + @can_return_tuple + @add_start_docstrings_to_model_forward(ARIA_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=AriaCausalLMOutputWithPast, config_class=AriaConfig) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + pixel_mask: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + cache_position: Optional[torch.LongTensor] = None, + **loss_kwargs, + ) -> AriaCausalLMOutputWithPast: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or `model.image_token_id` (where `model` is your instance of `Idefics3ForConditionalGeneration`). + Tokens with indices set to `model.image_token_id` are ignored (masked), the loss is only + computed for the tokens with labels in `[0, ..., config.vocab_size]`. + Returns: + + Example: + + ```python + >>> import requests + >>> import torch + >>> from PIL import Image + >>> from io import BytesIO + + >>> from transformers import AutoProcessor, AutoModel + >>> from transformers.image_utils import load_image + + >>> # Note that passing the image urls (instead of the actual pil images) to the processor is also possible + >>> image1 = load_image("https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg") + >>> image2 = load_image("https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg") + >>> image3 = load_image("https://cdn.britannica.com/68/170868-050-8DDE8263/Golden-Gate-Bridge-San-Francisco.jpg") + + >>> processor = AutoProcessor.from_pretrained("Rhymes-AI/Aria") + >>> model = AutoModel.from_pretrained("Rhymes-AI/Aria", torch_dtype=torch.bfloat16, device_map="auto") + + >>> # Create inputs + >>> messages = [ + ... { + ... "role": "user", + ... "content": [ + ... {"type": "image"}, + ... {"type": "text", "text": "In this image, we can see the city of New York, and more specifically the Statue of Liberty."}, + ... {"type": "image"}, + ... {"type": "text", "text": "What can we see in this image?"}, + ... ] + ... }, + ... { + ... "role": "user", + ... "content": [ + ... {"type": "image"}, + ... {"type": "text", "text": "In which city is that bridge located?"}, + ... ] + ... } + ... ] + + >>> prompts = [processor.apply_chat_template([message], add_generation_prompt=True) for message in messages] + >>> images = [[image1, image2], [image3]] + >>> inputs = processor(text=prompts, images=images, padding=True, return_tensors="pt").to(model.device) + + >>> # Generate + >>> generated_ids = model.generate(**inputs, max_new_tokens=256) + >>> generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True) + + >>> print(generated_texts[0]) + Assistant: There are buildings, trees, lights, and water visible in this image. + + >>> print(generated_texts[1]) + Assistant: The bridge is in San Francisco. + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings()(input_ids) + + # 2. Merge text and images + if pixel_values is not None and inputs_embeds.shape[1] != 1: + if input_ids is None: + special_image_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + n_image_tokens = (special_image_mask).sum(dim=1).sum(dim=0)[0] + else: + image_embeds = input_ids == self.config.image_token_id + special_image_mask = image_embeds.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + n_image_tokens = (image_embeds).sum(dim=1).sum(dim=0) + image_features = self.get_image_features( + pixel_values=pixel_values, + pixel_mask=pixel_mask, + vision_feature_layer=self.config.vision_feature_layer, + ) + n_images, n_features_per_image = image_features.shape[0], image_features.shape[1] + n_image_features = n_images * n_features_per_image + if n_image_tokens != n_image_features: + raise ValueError( + f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" + ) + + image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) + inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) + + outputs: CausalLMOutputWithPast = self.language_model( + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + logits_to_keep=logits_to_keep, + cache_position=cache_position, + ) + + logits = outputs.logits + + loss = None + if labels is not None: + loss = self.loss_function( + logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **loss_kwargs + ) + + return AriaCausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + inputs_embeds=None, + pixel_values=None, + pixel_mask=None, + attention_mask=None, + cache_position=None, + logits_to_keep=None, + **kwargs, + ): + model_inputs = self.language_model.prepare_inputs_for_generation( + input_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + cache_position=cache_position, + logits_to_keep=logits_to_keep, + **kwargs, + ) + + if cache_position[0] == 0: + # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore + # Otherwise we need pixel values to be passed to model + model_inputs["pixel_values"] = pixel_values + model_inputs["pixel_mask"] = pixel_mask + + return model_inputs + + +__all__ = [ + "AriaForConditionalGeneration", + "AriaPreTrainedModel", + "AriaTextPreTrainedModel", + "AriaTextModel", + "AriaTextForCausalLM", +] diff --git a/docs/transformers/build/lib/transformers/models/aria/modular_aria.py b/docs/transformers/build/lib/transformers/models/aria/modular_aria.py new file mode 100644 index 0000000000000000000000000000000000000000..add5bdc16b764201a73717a3bc6a0981237d1b3f --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/aria/modular_aria.py @@ -0,0 +1,1643 @@ +# coding=utf-8 +# Copyright 2024 The Rhymes-AI Teams Authors and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import math +from typing import Dict, Iterable, List, Optional, Tuple, Union + +import numpy as np + +from ...activations import ACT2FN +from ...configuration_utils import PretrainedConfig +from ...generation import GenerationMixin +from ...image_processing_utils import BaseImageProcessor, BatchFeature, select_best_resolution +from ...image_transforms import PaddingMode, convert_to_rgb, pad, resize, to_channel_dimension_format +from ...image_utils import ( + ChannelDimension, + ImageInput, + PILImageResampling, + get_image_size, + infer_channel_dimension_format, + is_scaled_image, + make_flat_list_of_images, + to_numpy_array, + valid_images, + validate_preprocess_arguments, +) +from ...modeling_outputs import CausalLMOutputWithPast +from ...modeling_utils import PreTrainedModel +from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack +from ...tokenization_utils import ( + PreTokenizedInput, + TextInput, +) +from ...utils import ( + TensorType, + add_start_docstrings, + add_start_docstrings_to_model_forward, + can_return_tuple, + logging, + replace_return_docstrings, +) +from ...utils.import_utils import is_torch_available +from ..auto import CONFIG_MAPPING, AutoConfig, AutoModel, AutoModelForCausalLM, AutoTokenizer +from ..llama.configuration_llama import LlamaConfig +from ..llama.modeling_llama import ( + LlamaDecoderLayer, + LlamaForCausalLM, + LlamaMLP, + LlamaModel, + LlamaPreTrainedModel, + LlamaRMSNorm, +) +from ..llava.modeling_llava import LlavaCausalLMOutputWithPast +from ..llava_next.image_processing_llava_next import divide_to_patches + + +logger = logging.get_logger(__name__) + +if is_torch_available(): + import torch + from torch import nn + + +def sequential_experts_gemm(token_states, expert_weights, tokens_per_expert): + """ + Compute the matrix multiplication (GEMM) for each expert sequentially. This approach is computationally inefficient, especially when dealing with a large number of experts. + + Args: + token_states (torch.Tensor): Input tensor of shape (num_tokens, in_features). + expert_weights (torch.Tensor): Weight tensor of shape (num_experts, in_features, out_features). + tokens_per_expert (torch.Tensor): Number of tokens assigned to each expert. + + Returns: + torch.Tensor: Output tensor of shape (num_tokens, out_features). + """ + num_tokens = token_states.shape[0] + out_features = expert_weights.shape[-1] + output = torch.zeros(num_tokens, out_features, dtype=token_states.dtype, device=token_states.device) + + cumsum_num_tokens = torch.cumsum(tokens_per_expert, dim=0) + # Insert zero at the beginning for offset index's convenience + zero_tensor = torch.zeros(1, dtype=torch.long, device=cumsum_num_tokens.device) + cumsum_num_tokens = torch.cat((zero_tensor, cumsum_num_tokens)) + + for expert_num in range(expert_weights.shape[0]): + start = cumsum_num_tokens[expert_num] + end = cumsum_num_tokens[expert_num + 1] + tokens = token_states[start:end] + + out = torch.matmul(tokens, expert_weights[expert_num]) + output[start:end] = out + return output + + +class AriaTextConfig(LlamaConfig): + r""" + This class handles the configuration for the text component of the Aria model. + Instantiating a configuration with the defaults will yield a similar configuration to that of the model of the Aria + [rhymes-ai/Aria](https://huggingface.co/rhymes-ai/Aria) architecture. + This class extends the LlamaConfig to include additional parameters specific to the Mixture of Experts (MoE) architecture. + + Args: + vocab_size (`int`, *optional*, defaults to 32000): + Vocabulary size of the LLaMA model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`LlamaModel`] + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 4096): + The size of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer decoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer decoder. + num_key_value_heads (`int`, *optional*): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to + `num_attention_heads`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 2048): + The maximum sequence length that this model might ever be used with. Llama 1 supports up to 2048 tokens, + Llama 2 up to 4096, CodeLlama up to 16384. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + pad_token_id (`int`, *optional*, defaults to 2): + Padding token id. + bos_token_id (`int`, *optional*, defaults to 1): + Beginning of stream token id. + eos_token_id (`int`, *optional*, defaults to 2): + End of stream token id. + pretraining_tp (`int`, *optional*, defaults to 1): + Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this + document](https://huggingface.co/docs/transformers/main/perf_train_gpu_many#tensor-parallelism) to + understand more about it. This value is necessary to ensure exact reproducibility of the pretraining + results. Please refer to [this issue](https://github.com/pytorch/pytorch/issues/76232). + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + rope_scaling (`Dict`, *optional*): + Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type + and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value + accordingly. + Expected contents: + `rope_type` (`str`): + The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope', + 'llama3'], with 'default' being the original RoPE implementation. + `factor` (`float`, *optional*): + Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In + most scaling types, a `factor` of x will enable the model to handle sequences of length x * + original maximum pre-trained length. + `original_max_position_embeddings` (`int`, *optional*): + Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during + pretraining. + `attention_factor` (`float`, *optional*): + Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention + computation. If unspecified, it defaults to value recommended by the implementation, using the + `factor` field to infer the suggested value. + `beta_fast` (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear + ramp function. If unspecified, it defaults to 32. + `beta_slow` (`float`, *optional*): + Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear + ramp function. If unspecified, it defaults to 1. + `short_factor` (`List[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to short contexts (< + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + `long_factor` (`List[float]`, *optional*): + Only used with 'longrope'. The scaling factor to be applied to long contexts (< + `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden + size divided by the number of attention heads divided by 2 + `low_freq_factor` (`float`, *optional*): + Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE + `high_freq_factor` (`float`, *optional*): + Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE + attention_bias (`bool`, *optional*, defaults to `False`): + Whether to use a bias in the query, key, value and output projection layers during self-attention. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + mlp_bias (`bool`, *optional*, defaults to `False`): + Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers. + head_dim (`int`, *optional*): + The attention head dimension. If None, it will default to hidden_size // num_heads + moe_num_experts (`int`, *optional*, defaults to 8): + The number of experts in the MoE layer. + moe_topk (`int`, *optional*, defaults to 2): + The number of top experts to route to for each token. + moe_num_shared_experts (`int`, *optional*, defaults to 2): + The number of shared experts. + """ + + model_type = "aria_text" + base_config_key = "text_config" + + def __init__( + self, + intermediate_size: int = 4096, + moe_num_experts: int = 8, + moe_topk: int = 2, + moe_num_shared_experts: int = 2, + pad_token_id=2, + **super_kwargs, + ): + super().__init__(pad_token_id=pad_token_id, **super_kwargs) + self.intermediate_size = intermediate_size + self.moe_num_experts = moe_num_experts + self.moe_topk = moe_topk + self.moe_num_shared_experts = moe_num_shared_experts + + +class AriaConfig(PretrainedConfig): + r""" + This class handles the configuration for both vision and text components of the Aria model, + as well as additional parameters for image token handling and projector mapping. + Instantiating a configuration with the defaults will yield a similar configuration to that of the model of the Aria + [rhymes-ai/Aria](https://huggingface.co/rhymes-ai/Aria) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vision_config (`AriaVisionConfig` or `dict`, *optional*): + Configuration for the vision component. + vision_feature_layer (`int`, *optional*, defaults to -1): + The index of the layer to select the vision feature. + text_config (`AriaTextConfig` or `dict`, *optional*): + Configuration for the text component. + projector_patch_to_query_dict (`dict`, *optional*): + Mapping of patch sizes to query dimensions. + image_token_index (`int`, *optional*, defaults to 9): + Index used to represent image tokens. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated normal initializer for initializing all weight matrices. + + Attributes: + model_type (`str`): + Type of the model, set to `"aria"`. + image_token_index (`int`): + Index used to represent image tokens. + projector_patch_to_query_dict (`dict`): + Mapping of patch sizes to query dimensions. + vision_config (`AriaVisionConfig`): + Configuration for the vision component. + text_config (`AriaTextConfig`): + Configuration for the text component. + """ + + model_type = "aria" + attribute_map = { + "image_token_id": "image_token_index", + } + sub_configs = {"text_config": AriaTextConfig, "vision_config": AutoConfig} + + def __init__( + self, + vision_config=None, + vision_feature_layer: int = -1, + text_config: AriaTextConfig = None, + projector_patch_to_query_dict: Dict = None, + image_token_index: int = 9, + initializer_range: float = 0.02, + **kwargs, + ): + self.image_token_index = image_token_index + + # Convert the keys and values of projector_patch_to_query_dict to integers + # This ensures consistency even if they were provided as strings + if projector_patch_to_query_dict is None: + projector_patch_to_query_dict = { + 1225: 128, + 4900: 256, + } + self.projector_patch_to_query_dict = {int(k): int(v) for k, v in projector_patch_to_query_dict.items()} + self.max_value_projector_patch_to_query_dict = max(self.projector_patch_to_query_dict.values()) + self.vision_feature_layer = vision_feature_layer + if isinstance(vision_config, dict): + vision_config["model_type"] = "idefics3_vision" + vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config) + elif vision_config is None: + vision_config = CONFIG_MAPPING["idefics3_vision"]() + + self.vision_config = vision_config + self.initializer_range = initializer_range + + if isinstance(text_config, dict) and "model_type" in text_config: + text_config = AriaTextConfig(**text_config) + elif text_config is None: + text_config = AriaTextConfig() + + self.text_config = text_config + + super().__init__(**kwargs) + + +class AriaTextRMSNorm(LlamaRMSNorm): + pass + + +class AriaProjectorMLP(nn.Module): + """ + Feed-Forward Network module for the Aria Projector. + + Args: + in_features (`int`): + Input embedding dimension. + hidden_features (`int`): + Hidden dimension of the feed-forward network. + output_dim (`int`): + Output dimension. + """ + + def __init__(self, in_features, hidden_features, output_dim): + super().__init__() + self.linear_in = nn.Linear(in_features, hidden_features, bias=False) + self.linear_out = nn.Linear(hidden_features, output_dim, bias=False) + self.act = ACT2FN["gelu_new"] + + def forward(self, hidden_states): + hidden_states = self.act(self.linear_in(hidden_states)) + hidden_states = self.linear_out(hidden_states) + return hidden_states + + +class AriaCrossAttention(nn.Module): + """ + Aria Cross-Attention module. + + Args: + config (`AriaConfig`): + The configuration to use. + """ + + def __init__(self, config: AriaConfig, dropout_rate: float = 0): + super().__init__() + hidden_size = config.vision_config.hidden_size + num_heads = config.vision_config.num_attention_heads + self.num_heads = num_heads + self.q_proj = nn.Linear(hidden_size, hidden_size, bias=False) + self.k_proj = nn.Linear(hidden_size, hidden_size, bias=False) + self.v_proj = nn.Linear(hidden_size, hidden_size, bias=False) + + # Original code here: https://github.com/rhymes-ai/Aria/blob/719ff4e52b727443cba3793b0e27fe64e0244fe1/aria/model/projector.py#L48 + self.multihead_attn = nn.MultiheadAttention(hidden_size, num_heads, batch_first=True) + self.linear = nn.Linear(hidden_size, hidden_size) + self.dropout = nn.Dropout(dropout_rate) + + self.layer_norm = nn.LayerNorm(hidden_size) + self.layer_norm_kv = nn.LayerNorm(hidden_size) + + def forward(self, key_value_states, hidden_states, attn_mask=None): + """ + Forward pass of the AriaCrossAttention module. + + Args: + key_value_states (`torch.Tensor`): + Input tensor for key and value. + hidden_states (`torch.Tensor`): + Input tensor for query. + attn_mask (`torch.Tensor`, *optional*, defaults to None): + Attention mask. + + Returns: + torch.Tensor: + Output tensor after cross-attention. + """ + query = self.q_proj(self.layer_norm(hidden_states)) + + key_value_states = self.layer_norm_kv(key_value_states) + key = self.k_proj(key_value_states) + value = self.v_proj(key_value_states) + + attn_output, _ = self.multihead_attn(query, key, value, attn_mask=attn_mask) + + attn_output = self.dropout(self.linear(attn_output)) + + return attn_output + + +class AriaProjector(nn.Module): + """ + Aria Projector module. + + This module projects vision features into the language model's embedding space, enabling interaction between vision and language components. + + Args: + config (`AriaConfig`): + Configuration object for the model. + """ + + def __init__( + self, + config: AriaConfig, + ): + super().__init__() + + self.patch_to_query_dict = config.projector_patch_to_query_dict + self.in_features = config.vision_config.hidden_size + self.num_heads = config.vision_config.num_attention_heads + self.kv_dim = config.vision_config.hidden_size + self.hidden_features = config.text_config.hidden_size + self.output_dim = config.text_config.hidden_size + + self.query = nn.Parameter(torch.zeros(config.max_value_projector_patch_to_query_dict, self.in_features)) + + self.cross_attn = AriaCrossAttention(config) + + self.layer_norm = nn.LayerNorm(self.in_features) + self.feed_forward = AriaProjectorMLP(self.in_features, self.hidden_features, self.output_dim) + + def forward(self, key_value_states: torch.Tensor, attn_mask: Optional[torch.Tensor] = None): + """ + Forward pass of the Projector module. + + Args: + key_value_states (`torch.Tensor`): + Input tensor of shape (batch_size, num_patches, kv_dim). + attn_mask (`torch.Tensor`, *optional*, default is None): + Attention mask. + + Returns: + `torch.Tensor`: Output tensor of shape (batch_size, query_number, output_dim). + """ + batch_size, num_patches = key_value_states.shape[0], key_value_states.shape[1] + + if num_patches not in self.patch_to_query_dict.keys(): + raise KeyError( + f"Number of patches {num_patches} not found in patch_to_query_dict amongst possible values {self.patch_to_query_dict.keys()}." + ) + query_num = self.patch_to_query_dict[num_patches] + + queries = self.query[:query_num].unsqueeze(0).repeat(batch_size, 1, 1) + + if attn_mask is not None: + attn_mask = attn_mask.repeat_interleave(self.num_heads, 0) + attn_mask = attn_mask.unsqueeze(1).expand(-1, queries.size(1), -1) + + attention_out = self.cross_attn(key_value_states, queries, attn_mask=attn_mask) + + out = self.feed_forward(self.layer_norm(attention_out)) + + return out + + +def _get_patch_output_size(image, target_resolution, input_data_format): + original_height, original_width = get_image_size(image, channel_dim=input_data_format) + target_height, target_width = target_resolution + + scale_w = target_width / original_width + scale_h = target_height / original_height + + if scale_w < scale_h: + new_width = target_width + new_height = min(math.ceil(original_height * scale_w), target_height) + else: + new_height = target_height + new_width = min(math.ceil(original_width * scale_h), target_width) + + return new_height, new_width + + +class AriaImageProcessor(BaseImageProcessor): + """ + A vision processor for the Aria model that handles image preprocessing. + Initialize the AriaImageProcessor. + + Args: + image_mean (`list`, *optional*, defaults to [0.5, 0.5, 0.5]): + Mean values for normalization. + image_std (`list`, *optional*, defaults to [0.5, 0.5, 0.5]): + Standard deviation values for normalization. + max_image_size (`int`, *optional*, defaults to 980): + Maximum image size. + min_image_size (`int`, *optional*, defaults to 336): + Minimum image size. + split_resolutions (`list`, *optional*, defaults to a list of optimal,resolutions as tuples): + The optimal resolutions for splitting the image. + split_image (`bool`, *optional*, defaults to `False`): + Whether to split the image. + do_convert_rgb (`bool`, *optional*, defaults to `True`): + Whether to convert the image to RGB. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in + the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess` + method. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. + resample (PILImageResampling, *optional*, defaults to `BICUBIC`): + The resampling filter to use if resizing the image. + """ + + model_input_names = ["pixel_values", "pixel_mask", "num_crops"] + + def __init__( + self, + image_mean: List[float] = None, + image_std: List[float] = None, + max_image_size: int = 980, + min_image_size: int = 336, + split_resolutions: Optional[List[Tuple[int, int]]] = None, + split_image: Optional[bool] = False, + do_convert_rgb: Optional[bool] = True, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: Optional[bool] = True, + resample: PILImageResampling = PILImageResampling.BICUBIC, + **kwargs, + ): + super().__init__(**kwargs) + + if image_mean is None: + image_mean = [0.5, 0.5, 0.5] + if image_std is None: + image_std = [0.5, 0.5, 0.5] + self.max_image_size = max_image_size + self.min_image_size = min_image_size + self.image_mean = image_mean + self.image_std = image_std + self.split_image = split_image + if split_resolutions is None: + split_resolutions = [(1, 2), (1, 3), (1, 4), (1, 5), (1, 6), (1, 7), (1, 8), (2, 4), (2, 3), (2, 2), (2, 1), (3, 1), (3, 2), (4, 1), (4, 2), (5, 1), (6, 1), (7, 1), (8, 1)] # fmt: skip + split_resolutions = [(el[0] * 490, el[1] * 490) for el in split_resolutions] + self.split_resolutions = split_resolutions + self.do_convert_rgb = do_convert_rgb + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.resample = resample + + def preprocess( + self, + images: Union[ImageInput, List[ImageInput]], + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + max_image_size: Optional[int] = None, + min_image_size: Optional[int] = None, + split_image: Optional[bool] = None, + do_convert_rgb: Optional[bool] = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + resample: PILImageResampling = None, + return_tensors: Optional[Union[str, TensorType]] = "pt", + data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ): + """ + Process a list of images. + + Args: + images (ImageInput or list of ImageInput): + The input image or a list of images. + image_mean (`list`, *optional*, defaults to [0.5, 0.5, 0.5]): + Mean values for normalization. + image_std (`list`, *optional*, defaults to [0.5, 0.5, 0.5]): + Standard deviation values for normalization. + max_image_size (`int`, *optional*, defaults to `self.max_image_size` (980)): + Maximum image size. + min_image_size (`int`, *optional*, defaults to `self.min_image_size` (336)): + Minimum image size. + split_image (`bool`, *optional*, defaults to `self.split_image` (False)): + Whether to split the image. + do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb` (True)): + Whether to convert the image to RGB. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize` (True)): + Whether to normalize the image. + resample (PILImageResampling, *optional*, defaults to `self.resample` (BICUBIC)): + The resampling filter to use if resizing the image. + return_tensors (`str` or `TensorType`, *optional*, defaults to "pt"): + The type of tensor to return. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: + image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: + image in (height, width, num_channels) format. + If unset, will use same as the input image. + input_data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: + image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: + image in (height, width, num_channels) format. + If unset, will use the inferred format of the input image. + + Returns: + BatchFeature: + A BatchFeature object containing: + - 'pixel_values': + Tensor of processed image pixel values. + - 'pixel_mask': + Boolean pixel mask. This mask is a 2D tensor of shape (max_image_size, max_image_size) where: + - True (1) values indicate pixels that belong to the original resized image. + - False (0) values indicate pixels that are part of the padding. + The mask helps distinguish between actual image content and padded areas in subsequent processing steps. + - 'num_crops': + The maximum number of crops across all images. + """ + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + max_image_size = max_image_size if max_image_size is not None else self.max_image_size + min_image_size = min_image_size if min_image_size is not None else self.min_image_size + split_image = split_image if split_image is not None else self.split_image + do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + resample = resample if resample is not None else self.resample + + if max_image_size not in [490, 980]: + raise ValueError("max_image_size must be either 490 or 980") + + images = make_flat_list_of_images(images) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + + validate_preprocess_arguments( + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + resample=resample, + do_rescale=do_rescale, + rescale_factor=rescale_factor, + ) + + if do_convert_rgb: + images = [convert_to_rgb(image) for image in images] + + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + + if do_rescale and is_scaled_image(images[0]): + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + + if input_data_format is None: + # We assume that all images have the same channel dimension format. + input_data_format = infer_channel_dimension_format(images[0]) + + pixel_values = [] + pixel_masks = [] + num_crops = None + + for image in images: + if split_image: + crop_images = self.get_image_patches( + image, + self.split_resolutions, + max_image_size, + resample, + data_format=input_data_format, + input_data_format=input_data_format, + ) + else: + crop_images = [image] + if num_crops is None or len(crop_images) > num_crops: + num_crops = len(crop_images) + + for crop_image in crop_images: + # At this point the scale is the rescaling factor that would bring the image to max_size in its larger dimension + h, w = get_image_size(crop_image) + scale = max_image_size / max(h, w) + if w >= h: + new_size = (max(int(h * scale), min_image_size), max_image_size) # h, w + else: + new_size = (max_image_size, max(int(w * scale), min_image_size)) # h, w + + crop_image_resized = resize( + crop_image, + new_size, + resample=resample, + data_format=input_data_format, + input_data_format=input_data_format, + ) + + padding_bottom, padding_right = max_image_size - new_size[0], max_image_size - new_size[1] + crop_image_padded = pad( + crop_image_resized, + ((0, padding_bottom), (0, padding_right)), + data_format=input_data_format, + input_data_format=input_data_format, + ) + + # Create a pixel mask + pixel_mask = np.zeros((max_image_size, max_image_size), dtype=bool) + pixel_mask[: new_size[0], : new_size[1]] = 1 + pixel_masks.append(pixel_mask) + + if do_rescale: + crop_image_padded = self.rescale( + image=crop_image_padded, scale=rescale_factor, input_data_format=input_data_format + ) + + if do_normalize: + crop_image_padded = self.normalize( + crop_image_padded, + self.image_mean, + self.image_std, + data_format=input_data_format, + input_data_format=input_data_format, + ) + crop_image_padded = ( + to_channel_dimension_format(crop_image_padded, data_format, input_data_format) + if data_format is not None + else crop_image_padded + ) + + pixel_values.append(crop_image_padded) + return BatchFeature( + data={ + "pixel_values": np.stack(pixel_values, axis=0), + "pixel_mask": np.stack(pixel_masks, axis=0), + "num_crops": num_crops, + }, + tensor_type=return_tensors, + ) + + def _resize_for_patching( + self, image: np.array, target_resolution: tuple, resample, input_data_format: ChannelDimension + ) -> np.array: + """ + Resizes an image to a target resolution while maintaining aspect ratio. + + Args: + image (np.array): + The input image. + target_resolution (tuple): + The target resolution (height, width) of the image. + resample (`PILImageResampling`): + Resampling filter to use if resizing the image. + input_data_format (`ChannelDimension` or `str`): + The channel dimension format of the input image. + + Returns: + np.array: The resized and padded image. + """ + new_height, new_width = _get_patch_output_size(image, target_resolution, input_data_format) + + # Resize the image + resized_image = resize(image, (new_height, new_width), resample=resample, input_data_format=input_data_format) + + return resized_image + + def _pad_for_patching( + self, image: np.array, target_resolution: tuple, input_data_format: ChannelDimension + ) -> np.array: + """ + Pad an image to a target resolution while maintaining aspect ratio. + """ + target_height, target_width = target_resolution + new_height, new_width = _get_patch_output_size(image, target_resolution, input_data_format) + + paste_x = (target_width - new_width) // 2 + paste_y = (target_height - new_height) // 2 + + padded_image = self.pad(image, padding=((paste_y, paste_y), (paste_x, paste_x))) + + return padded_image + + def pad( + self, + image: np.ndarray, + padding: Union[int, Tuple[int, int], Iterable[Tuple[int, int]]], + mode: PaddingMode = PaddingMode.CONSTANT, + constant_values: Union[float, Iterable[float]] = 0.0, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> np.ndarray: + """ + Pads the `image` with the specified `padding` and `mode`. Padding can be in the (`height`, `width`) + dimension of in the (`num_patches`) dimension. In the second case an iterable if tuples is expected + as input. + + Args: + image (`np.ndarray`): + The image to pad. + padding (`int` or `Tuple[int, int]` or `Iterable[Tuple[int, int]]`): + Padding to apply to the edges of the height, width axes. Can be one of three formats: + - `((before_height, after_height), (before_width, after_width))` unique pad widths for each axis. + - `((before, after),)` yields same before and after pad for height and width. + - `(pad,)` or int is a shortcut for before = after = pad width for all axes. + mode (`PaddingMode`): + The padding mode to use. Can be one of: + - `"constant"`: pads with a constant value. + - `"reflect"`: pads with the reflection of the vector mirrored on the first and last values of the + vector along each axis. + - `"replicate"`: pads with the replication of the last value on the edge of the array along each axis. + - `"symmetric"`: pads with the reflection of the vector mirrored along the edge of the array. + constant_values (`float` or `Iterable[float]`, *optional*): + The value to use for the padding if `mode` is `"constant"`. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + If unset, will use same as the input image. + input_data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format for the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + If unset, will use the inferred format of the input image. + + Returns: + `np.ndarray`: The padded image. + + """ + + # call the general `pad` if padding on `height/width`, otherwise it's the `num_patched` dim + if isinstance(padding, int) or len(padding) != 4: + return pad(image, padding, mode, constant_values, data_format, input_data_format) + + if input_data_format is None: + input_data_format = infer_channel_dimension_format(image) + + padding_mode_mapping = { + PaddingMode.CONSTANT: "constant", + PaddingMode.REFLECT: "reflect", + PaddingMode.REPLICATE: "edge", + PaddingMode.SYMMETRIC: "symmetric", + } + image = np.pad(image, padding, mode=padding_mode_mapping[mode], constant_values=constant_values) + image = ( + to_channel_dimension_format(image, data_format, input_data_format) if data_format is not None else image + ) + return image + + def get_image_patches( + self, + image: np.array, + grid_pinpoints: List[Tuple[int, int]], + patch_size: int, + resample: PILImageResampling, + data_format: ChannelDimension, + input_data_format: ChannelDimension, + ) -> List[np.array]: + """ + Process an image with variable resolutions by dividing it into patches. + + Args: + image (`np.array`): + The input image to be processed. + grid_pinpoints (List[Tuple[int, int]]): + A list of possible resolutions as tuples. + patch_size (`int`): + Size of the patches to divide the image into. + resample (`PILImageResampling`): + Resampling filter to use if resizing the image. + data_format (`ChannelDimension` or `str`): + The channel dimension format for the output image. + input_data_format (`ChannelDimension` or `str`): + The channel dimension format of the input image. + + Returns: + `List[np.array]`: A list of NumPy arrays containing the processed image patches. + """ + if not isinstance(grid_pinpoints, list): + raise TypeError("grid_pinpoints must be a list of possible resolutions.") + + possible_resolutions = grid_pinpoints + + image_size = get_image_size(image, channel_dim=input_data_format) + best_resolution = select_best_resolution(image_size, possible_resolutions) + resized_image = self._resize_for_patching( + image, best_resolution, resample=resample, input_data_format=input_data_format + ) + padded_image = self._pad_for_patching(resized_image, best_resolution, input_data_format=input_data_format) + + patches = divide_to_patches(padded_image, patch_size=patch_size, input_data_format=input_data_format) + + # make sure that all patches are in the input data format + patches = [ + to_channel_dimension_format(patch, channel_dim=data_format, input_channel_dim=input_data_format) + for patch in patches + ] + return patches + + +class AriaProcessorKwargs(ProcessingKwargs, total=False): + _defaults = { + "text_kwargs": { + "padding": False, + }, + "images_kwargs": { + "max_image_size": 980, + "split_image": False, + }, + "return_tensors": TensorType.PYTORCH, + } + + +class AriaProcessor(ProcessorMixin): + """ + AriaProcessor is a processor for the Aria model which wraps the Aria image preprocessor and the LLama slow tokenizer. + + Args: + image_processor (`AriaImageProcessor`, *optional*): + The AriaImageProcessor to use for image preprocessing. + tokenizer (`PreTrainedTokenizerBase`, *optional*): + An instance of [`PreTrainedTokenizerBase`]. This should correspond with the model's text model. The tokenizer is a required input. + chat_template (`str`, *optional*): + A Jinja template which will be used to convert lists of messages in a chat into a tokenizable string. + size_conversion (`Dict`, *optional*): + A dictionary indicating size conversions for images. + """ + + attributes = ["image_processor", "tokenizer"] + valid_kwargs = ["chat_template", "size_conversion"] + image_processor_class = "AriaImageProcessor" + tokenizer_class = "AutoTokenizer" + + def __init__( + self, + image_processor=None, + tokenizer: Union[AutoTokenizer, str] = None, + chat_template: Optional[str] = None, + size_conversion: Optional[Dict[Union[float, int], int]] = None, + ): + if size_conversion is None: + size_conversion = {490: 128, 980: 256} + self.size_conversion = {int(k): v for k, v in size_conversion.items()} + + self.image_token = tokenizer.image_token + self.image_token_id = tokenizer.image_token_id + if tokenizer is not None and tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.unk_token + + super().__init__(image_processor, tokenizer, chat_template=chat_template) + + def __call__( + self, + text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]], + images: Optional[ImageInput] = None, + audio=None, + videos=None, + **kwargs: Unpack[AriaProcessorKwargs], + ) -> BatchFeature: + """ + Main method to prepare for the model one or several sequences(s) and image(s). + + Args: + text (`TextInput`, `PreTokenizedInput`, `List[TextInput]`, `List[PreTokenizedInput]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + images (`ImageInput`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. Both channels-first and channels-last formats are supported. + + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). + - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. + - **pixel_mask** -- Pixel mask to be fed to a model. Returned when `images` is not `None`. + """ + output_kwargs = self._merge_kwargs( + AriaProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + + if isinstance(text, str): + text = [text] + elif not isinstance(text, list) and not isinstance(text[0], str): + raise ValueError("Invalid input text. Please provide a string, or a list of strings") + + if images is not None: + image_inputs = self.image_processor( + images, + **output_kwargs["images_kwargs"], + ) + # expand the image_token according to the num_crops and tokens per image + tokens_per_image = self.size_conversion[image_inputs.pixel_values.shape[2]] + prompt_strings = [] + num_crops = image_inputs.pop("num_crops") * tokens_per_image + for sample in text: + sample = sample.replace(self.tokenizer.image_token, self.tokenizer.image_token * num_crops) + prompt_strings.append(sample) + + else: + image_inputs = {} + prompt_strings = text + + return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None) + text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"]) + self._check_special_mm_tokens(prompt_strings, text_inputs, modalities=["image"]) + + return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors) + + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. + """ + return self.tokenizer.batch_decode(*args, **kwargs) + + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to + the docstring of this method for more information. + """ + return self.tokenizer.decode(*args, **kwargs) + + @property + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + image_processor_input_names = self.image_processor.model_input_names + + # Remove `num_crops`, it is popped and used only when processing. Make a copy of list when remocing + # otherwise `self.image_processor.model_input_names` is also modified + image_processor_input_names = [name for name in image_processor_input_names if name != "num_crops"] + return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) + + +class AriaSharedExpertsMLP(LlamaMLP): + """ + Shared Expert MLP for shared experts. + + Unlike routed experts, shared experts process all tokens without routing. + This class reconfigures the intermediate size in comparison to the LlamaMLP. + + Args: + config (`AriaTextConfig`): Configuration object for the Aria language model. + """ + + def __init__(self, config: AriaTextConfig): + super().__init__(self) + self.intermediate_size = config.intermediate_size * config.moe_num_shared_experts + + +class AriaGroupedExpertsGemm(nn.Module): + """ + Grouped GEMM (General Matrix Multiplication) module for efficient expert computation. + This module utilizes the grouped_gemm library (https://github.com/fanshiqing/grouped_gemm) + for optimized performance. If the grouped_gemm library is not installed, it gracefully + falls back to a sequential GEMM implementation, which may be slower but ensures + functionality. + + Args: + in_features (`int`): + Number of input features. + out_features (`int`): + Number of output features. + groups (`int`): + Number of expert groups. + """ + + def __init__(self, in_features, out_features, groups): + super().__init__() + self.in_features = in_features + self.out_features = out_features + self.groups = groups + self.weight = nn.Parameter(torch.empty(groups, in_features, out_features)) + + def forward(self, input, tokens_per_expert): + """ + Perform grouped matrix multiplication. + + Args: + input (`torch.Tensor`): + Input tensor of shape (num_tokens, in_features). + tokens_per_expert (`torch.Tensor`): + Number of tokens assigned to each expert. + + Returns: + torch.Tensor: Output tensor of shape (num_tokens, out_features). + """ + return sequential_experts_gemm( + input, + self.weight, + tokens_per_expert.cpu(), + ) + + +class AriaGroupedExpertsMLP(nn.Module): + """ + Grouped MLP module for Mixture of Experts. + + Args: + config (`AriaTextConfig`): + Configuration object for the model. + """ + + def __init__(self, config: AriaTextConfig) -> None: + super().__init__() + self.config = config + self.fc1 = AriaGroupedExpertsGemm(config.hidden_size, config.intermediate_size * 2, config.moe_num_experts) + self.fc2 = AriaGroupedExpertsGemm(config.intermediate_size, config.hidden_size, config.moe_num_experts) + + def forward(self, permuted_tokens, tokens_per_expert): + """ + Forward pass of the Grouped MLP. + + Args: + permuted_tokens (torch.Tensor): Permuted input tokens. + tokens_per_expert (torch.Tensor): Number of tokens assigned to each expert. + + Returns: + torch.Tensor: Output tensor after passing through the MLP. + """ + fc1_output = self.fc1(permuted_tokens, tokens_per_expert) + projection, gate = torch.chunk(fc1_output, 2, dim=-1) + fc1_output = nn.functional.silu(projection) * gate + fc2_output = self.fc2(fc1_output, tokens_per_expert) + return fc2_output + + +# Token permutation adapted from https://github.com/NVIDIA/Megatron-LM/blob/54f1f78529cbc2b9cddad313e7f9d96ac0420a27/megatron/core/transformer/moe/token_dispatcher.py#L291-L587 +class AriaTextMoELayer(nn.Module): + """ + Aria Text Mixture of Experts (MoE) Layer. + + This layer applies a gating mechanism to route input tokens to different experts. + + Args: + config (`AriaTextConfig`): + Configuration object for the text component of the model. + """ + + def __init__(self, config: AriaTextConfig): + super().__init__() + + self.router = nn.Linear(config.hidden_size, config.moe_num_experts, bias=False) + self.experts = AriaGroupedExpertsMLP(config) + self.shared_experts = AriaSharedExpertsMLP(config) + self.config = config + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + """ + Forward pass of the MoE Layer. + + Args: + hidden_states (`torch.Tensor`): + Input tensor of shape (batch_size, sequence_length, hidden_size). + + Returns: + torch.Tensor: Output tensor after passing through the MoE layer. + + Process: + 1. Route tokens to experts using the router. + 2. Permute tokens based on routing decisions. + 3. Process tokens through experts. + 4. Unpermute and combine expert outputs. + 5. Add shared expert output to the final result. + """ + original_shape = hidden_states.shape + hidden_states = hidden_states.view(-1, hidden_states.size(-1)) + + # Top K Routing + logits = self.router(hidden_states) + top_logits, top_indices = torch.topk(logits, k=self.config.moe_topk, dim=1) + scores = nn.functional.softmax(top_logits, dim=-1) + + original_dtype = top_indices.dtype + + tokens_per_expert = torch.histc( + top_indices.flatten().to(torch.float32), + bins=self.config.moe_num_experts, + min=0, + max=self.config.moe_num_experts - 1, + ).to(original_dtype) + indices = top_indices + + # Token permutation + flatten_indices = indices.view(-1) + sorted_indices = torch.argsort(flatten_indices) + permuted_tokens = hidden_states.index_select(0, sorted_indices // self.config.moe_topk) + + # Process through experts + expert_output = self.experts(permuted_tokens, tokens_per_expert) + + # Token unpermutation + unpermuted_tokens = torch.zeros( + (scores.shape[0] * self.config.moe_topk, expert_output.size(1)), + dtype=expert_output.dtype, + device=expert_output.device, + ) + unpermuted_tokens.index_copy_(0, sorted_indices, expert_output) + unpermuted_tokens = unpermuted_tokens.view(-1, self.config.moe_topk, expert_output.size(1)) + + output = (unpermuted_tokens * scores.unsqueeze(-1)).sum(dim=1).view(original_shape) + + # Add shared expert output + shared_expert_output = self.shared_experts(hidden_states.view(original_shape)) + return output + shared_expert_output + + +class AriaTextDecoderLayer(LlamaDecoderLayer): + """ + Aria Text Decoder Layer. + + This class defines a single decoder layer in the language model, incorporating self-attention and Mixture of Experts (MoE) feed-forward network. + + Args: + config (`AriaTextConfig`): + Configuration object for the text component of the model. + layer_idx (`int`): + Index of the layer. + """ + + def __init__(self, config: AriaTextConfig, layer_idx: int): + super().__init__(self) + self.mlp = AriaTextMoELayer(config) + + +class AriaTextPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. + """ + + config_class = AriaConfig + base_model_prefix = "model" + _no_split_modules = ["AriaTextDecoderLayer", "AriaGroupedExpertsGemm"] + supports_gradient_checkpointing = True + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn_2 = False + _supports_sdpa = True + _supports_cache_class = True + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, AriaTextRMSNorm): + module.weight.data.fill_(1.0) + elif isinstance(module, AriaGroupedExpertsGemm): + module.weight.data.normal_(mean=0.0, std=std) + + +class AriaPreTrainedModel(LlamaPreTrainedModel): + _supports_static_cache = False # MoE models don't work with torch.compile (dynamic slicing) + _supports_attention_backend = False + + def _init_weights(self, module): + std = self.config.initializer_range + + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.MultiheadAttention): + # This uses torch's original init + module._reset_parameters() + elif isinstance(module, nn.LayerNorm): + module.weight.data.fill_(1.0) + module.bias.data.zero_() + elif isinstance(module, AriaProjector): + nn.init.trunc_normal_(module.query, std=std) + + +class AriaTextModel(LlamaModel): + def __init__(self, config: AriaTextConfig): + super().__init__(config) + self.layers = nn.ModuleList( + [AriaTextDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.gradient_checkpointing = False + self.post_init() + + +class AriaTextForCausalLM(AriaTextPreTrainedModel, LlamaForCausalLM): + """ + Aria model for causal language modeling tasks. + + This class extends `LlamaForCausalLM` to incorporate the Mixture of Experts (MoE) approach, + allowing for more efficient and scalable language modeling. + + Args: + config (`AriaTextConfig`): + Configuration object for the model. + """ + + _tied_weights_keys = ["lm_head.weight"] + config_class = AriaTextConfig + + def __init__(self, config: AriaTextConfig): + super().__init__(config) + self.model = AriaTextModel(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + +class AriaCausalLMOutputWithPast(LlavaCausalLMOutputWithPast): + pass + + +ARIA_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor`, *optional*): + Input token IDs. + pixel_values (`torch.FloatTensor`, *optional*): + Pixel values of the images. + pixel_mask (`torch.LongTensor`, *optional*): + Mask for the pixel values. + attention_mask (`torch.Tensor`, *optional*): + Attention mask. + position_ids (`torch.LongTensor`, *optional*): + Position IDs. + past_key_values (`List[torch.FloatTensor]`, *optional*): + Past key values for efficient processing. + inputs_embeds (`torch.FloatTensor`, *optional*): + Input embeddings. + labels (`torch.LongTensor`, *optional*): + Labels for computing the language modeling loss. + use_cache (`bool`, *optional*): + Whether to use the model's cache mechanism. + output_attentions (`bool`, *optional*): + Whether to output attention weights. + output_hidden_states (`bool`, *optional*): + Whether to output hidden states. + return_dict (`bool`, *optional*): + Whether to return a `ModelOutput` object. + logits_to_keep (`int` or `torch.Tensor`, *optional*, defaults to 0): + If an `int`, calculate logits for the last `logits_to_keep` tokens, or all `input_ids` if `0`. + Otherwise, slice according to the 1D tensor in the sequence length dimension + cache_position (`torch.LongTensor`, *optional*): + Cache positions. + **loss_kwargs: + Additional keyword arguments for loss calculation. +""" + +ARIA_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config (`AriaConfig`): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +@add_start_docstrings( + """Aria model for conditional generation tasks. + + This model combines a vision tower, a multi-modal projector, and a language model + to perform tasks that involve both image and text inputs.""", + ARIA_START_DOCSTRING, +) +class AriaForConditionalGeneration(AriaPreTrainedModel, GenerationMixin): + config_class = AriaConfig + _supports_flash_attn_2 = False + _supports_flex_attn = False + _supports_sdpa = False + _tied_weights_keys = ["language_model.lm_head.weight"] + + def __init__(self, config: AriaConfig): + super().__init__(config) + + self.vision_tower = AutoModel.from_config(config.vision_config) + self.multi_modal_projector = AriaProjector(config) + self.vocab_size = config.text_config.vocab_size + self.language_model = AutoModelForCausalLM.from_config(config.text_config) + self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1 + self._use_flash_attention_2 = config.text_config._attn_implementation == "flash_attention_2" + self.post_init() + + def _create_patch_attention_mask(self, pixel_mask): + if pixel_mask is None: + return None + + patches_subgrid = pixel_mask.unfold( + dimension=1, + size=self.vision_tower.config.patch_size, + step=self.vision_tower.config.patch_size, + ) + patches_subgrid = patches_subgrid.unfold( + dimension=2, + size=self.vision_tower.config.patch_size, + step=self.vision_tower.config.patch_size, + ) + return (patches_subgrid.sum(dim=(-1, -2)) > 0).bool() + + def get_input_embeddings(self): + return self.language_model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.language_model.set_input_embeddings(value) + + def get_output_embeddings(self): + return self.language_model.get_output_embeddings() + + def set_output_embeddings(self, new_embeddings): + self.language_model.set_output_embeddings(new_embeddings) + + def set_decoder(self, decoder): + self.language_model.set_decoder(decoder) + + def get_decoder(self): + return self.language_model.get_decoder() + + def get_image_features( + self, + pixel_values: torch.FloatTensor, + pixel_mask: Optional[torch.FloatTensor] = None, + vision_feature_layer: int = -1, + ): + patch_attention_mask = self._create_patch_attention_mask(pixel_mask) + image_outputs = self.vision_tower( + pixel_values, patch_attention_mask=patch_attention_mask, output_hidden_states=True + ) + image_attn_mask = None + if patch_attention_mask is not None: + flattened_mask = patch_attention_mask.flatten(1) + image_attn_mask = torch.logical_not(flattened_mask) + + selected_image_feature = image_outputs.hidden_states[vision_feature_layer] + image_features = self.multi_modal_projector(selected_image_feature, attn_mask=image_attn_mask) + return image_features + + @can_return_tuple + @add_start_docstrings_to_model_forward(ARIA_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=AriaCausalLMOutputWithPast, config_class=AriaConfig) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + pixel_mask: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + cache_position: Optional[torch.LongTensor] = None, + **loss_kwargs, + ) -> AriaCausalLMOutputWithPast: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or `model.image_token_id` (where `model` is your instance of `Idefics3ForConditionalGeneration`). + Tokens with indices set to `model.image_token_id` are ignored (masked), the loss is only + computed for the tokens with labels in `[0, ..., config.vocab_size]`. + Returns: + + Example: + + ```python + >>> import requests + >>> import torch + >>> from PIL import Image + >>> from io import BytesIO + + >>> from transformers import AutoProcessor, AutoModel + >>> from transformers.image_utils import load_image + + >>> # Note that passing the image urls (instead of the actual pil images) to the processor is also possible + >>> image1 = load_image("https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg") + >>> image2 = load_image("https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg") + >>> image3 = load_image("https://cdn.britannica.com/68/170868-050-8DDE8263/Golden-Gate-Bridge-San-Francisco.jpg") + + >>> processor = AutoProcessor.from_pretrained("Rhymes-AI/Aria") + >>> model = AutoModel.from_pretrained("Rhymes-AI/Aria", torch_dtype=torch.bfloat16, device_map="auto") + + >>> # Create inputs + >>> messages = [ + ... { + ... "role": "user", + ... "content": [ + ... {"type": "image"}, + ... {"type": "text", "text": "In this image, we can see the city of New York, and more specifically the Statue of Liberty."}, + ... {"type": "image"}, + ... {"type": "text", "text": "What can we see in this image?"}, + ... ] + ... }, + ... { + ... "role": "user", + ... "content": [ + ... {"type": "image"}, + ... {"type": "text", "text": "In which city is that bridge located?"}, + ... ] + ... } + ... ] + + >>> prompts = [processor.apply_chat_template([message], add_generation_prompt=True) for message in messages] + >>> images = [[image1, image2], [image3]] + >>> inputs = processor(text=prompts, images=images, padding=True, return_tensors="pt").to(model.device) + + >>> # Generate + >>> generated_ids = model.generate(**inputs, max_new_tokens=256) + >>> generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True) + + >>> print(generated_texts[0]) + Assistant: There are buildings, trees, lights, and water visible in this image. + + >>> print(generated_texts[1]) + Assistant: The bridge is in San Francisco. + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings()(input_ids) + + # 2. Merge text and images + if pixel_values is not None and inputs_embeds.shape[1] != 1: + if input_ids is None: + special_image_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + n_image_tokens = (special_image_mask).sum(dim=1).sum(dim=0)[0] + else: + image_embeds = input_ids == self.config.image_token_id + special_image_mask = image_embeds.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + n_image_tokens = (image_embeds).sum(dim=1).sum(dim=0) + image_features = self.get_image_features( + pixel_values=pixel_values, + pixel_mask=pixel_mask, + vision_feature_layer=self.config.vision_feature_layer, + ) + n_images, n_features_per_image = image_features.shape[0], image_features.shape[1] + n_image_features = n_images * n_features_per_image + if n_image_tokens != n_image_features: + raise ValueError( + f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" + ) + + image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) + inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) + + outputs: CausalLMOutputWithPast = self.language_model( + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + logits_to_keep=logits_to_keep, + cache_position=cache_position, + ) + + logits = outputs.logits + + loss = None + if labels is not None: + loss = self.loss_function( + logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **loss_kwargs + ) + + return AriaCausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + inputs_embeds=None, + pixel_values=None, + pixel_mask=None, + attention_mask=None, + cache_position=None, + logits_to_keep=None, + **kwargs, + ): + model_inputs = self.language_model.prepare_inputs_for_generation( + input_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + cache_position=cache_position, + logits_to_keep=logits_to_keep, + **kwargs, + ) + + if cache_position[0] == 0: + # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore + # Otherwise we need pixel values to be passed to model + model_inputs["pixel_values"] = pixel_values + model_inputs["pixel_mask"] = pixel_mask + + return model_inputs + + +__all__ = [ + "AriaConfig", + "AriaTextConfig", + "AriaImageProcessor", + "AriaProcessor", + "AriaForConditionalGeneration", + "AriaPreTrainedModel", + "AriaTextPreTrainedModel", + "AriaTextModel", + "AriaTextForCausalLM", +] diff --git a/docs/transformers/build/lib/transformers/models/aria/processing_aria.py b/docs/transformers/build/lib/transformers/models/aria/processing_aria.py new file mode 100644 index 0000000000000000000000000000000000000000..7d3076247550521b0900b07991ba0d68ad24747a --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/aria/processing_aria.py @@ -0,0 +1,171 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/aria/modular_aria.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_aria.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2024 The Rhymes-AI Teams Authors and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Dict, List, Optional, Union + +from ...image_processing_utils import BatchFeature +from ...image_utils import ImageInput +from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack +from ...tokenization_utils import PreTokenizedInput, TextInput +from ...utils import TensorType +from ..auto import AutoTokenizer + + +class AriaProcessorKwargs(ProcessingKwargs, total=False): + _defaults = { + "text_kwargs": { + "padding": False, + }, + "images_kwargs": { + "max_image_size": 980, + "split_image": False, + }, + "return_tensors": TensorType.PYTORCH, + } + + +class AriaProcessor(ProcessorMixin): + """ + AriaProcessor is a processor for the Aria model which wraps the Aria image preprocessor and the LLama slow tokenizer. + + Args: + image_processor (`AriaImageProcessor`, *optional*): + The AriaImageProcessor to use for image preprocessing. + tokenizer (`PreTrainedTokenizerBase`, *optional*): + An instance of [`PreTrainedTokenizerBase`]. This should correspond with the model's text model. The tokenizer is a required input. + chat_template (`str`, *optional*): + A Jinja template which will be used to convert lists of messages in a chat into a tokenizable string. + size_conversion (`Dict`, *optional*): + A dictionary indicating size conversions for images. + """ + + attributes = ["image_processor", "tokenizer"] + valid_kwargs = ["chat_template", "size_conversion"] + image_processor_class = "AriaImageProcessor" + tokenizer_class = "AutoTokenizer" + + def __init__( + self, + image_processor=None, + tokenizer: Union[AutoTokenizer, str] = None, + chat_template: Optional[str] = None, + size_conversion: Optional[Dict[Union[float, int], int]] = None, + ): + if size_conversion is None: + size_conversion = {490: 128, 980: 256} + self.size_conversion = {int(k): v for k, v in size_conversion.items()} + + self.image_token = tokenizer.image_token + self.image_token_id = tokenizer.image_token_id + if tokenizer is not None and tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.unk_token + + super().__init__(image_processor, tokenizer, chat_template=chat_template) + + def __call__( + self, + text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]], + images: Optional[ImageInput] = None, + audio=None, + videos=None, + **kwargs: Unpack[AriaProcessorKwargs], + ) -> BatchFeature: + """ + Main method to prepare for the model one or several sequences(s) and image(s). + + Args: + text (`TextInput`, `PreTokenizedInput`, `List[TextInput]`, `List[PreTokenizedInput]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + images (`ImageInput`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. Both channels-first and channels-last formats are supported. + + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). + - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. + - **pixel_mask** -- Pixel mask to be fed to a model. Returned when `images` is not `None`. + """ + output_kwargs = self._merge_kwargs( + AriaProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + + if isinstance(text, str): + text = [text] + elif not isinstance(text, list) and not isinstance(text[0], str): + raise ValueError("Invalid input text. Please provide a string, or a list of strings") + + if images is not None: + image_inputs = self.image_processor( + images, + **output_kwargs["images_kwargs"], + ) + # expand the image_token according to the num_crops and tokens per image + tokens_per_image = self.size_conversion[image_inputs.pixel_values.shape[2]] + prompt_strings = [] + num_crops = image_inputs.pop("num_crops") * tokens_per_image + for sample in text: + sample = sample.replace(self.tokenizer.image_token, self.tokenizer.image_token * num_crops) + prompt_strings.append(sample) + + else: + image_inputs = {} + prompt_strings = text + + return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None) + text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"]) + self._check_special_mm_tokens(prompt_strings, text_inputs, modalities=["image"]) + + return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors) + + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. + """ + return self.tokenizer.batch_decode(*args, **kwargs) + + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to + the docstring of this method for more information. + """ + return self.tokenizer.decode(*args, **kwargs) + + @property + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + image_processor_input_names = self.image_processor.model_input_names + + # Remove `num_crops`, it is popped and used only when processing. Make a copy of list when remocing + # otherwise `self.image_processor.model_input_names` is also modified + image_processor_input_names = [name for name in image_processor_input_names if name != "num_crops"] + return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) + + +__all__ = ["AriaProcessor"] diff --git a/docs/transformers/build/lib/transformers/models/audio_spectrogram_transformer/__init__.py b/docs/transformers/build/lib/transformers/models/audio_spectrogram_transformer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..618fceef70d3891f4313958cdadede27d605f12c --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/audio_spectrogram_transformer/__init__.py @@ -0,0 +1,28 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_audio_spectrogram_transformer import * + from .feature_extraction_audio_spectrogram_transformer import * + from .modeling_audio_spectrogram_transformer import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/docs/transformers/build/lib/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py b/docs/transformers/build/lib/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..77bec930236f600ae7ee5117d1d2f86a5a12fefd --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/audio_spectrogram_transformer/configuration_audio_spectrogram_transformer.py @@ -0,0 +1,131 @@ +# coding=utf-8 +# Copyright 2022 Google AI and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Audio Spectogram Transformer (AST) model configuration""" + +from typing import Any, Dict + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class ASTConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`ASTModel`]. It is used to instantiate an AST + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the AST + [MIT/ast-finetuned-audioset-10-10-0.4593](https://huggingface.co/MIT/ast-finetuned-audioset-10-10-0.4593) + architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + patch_size (`int`, *optional*, defaults to 16): + The size (resolution) of each patch. + qkv_bias (`bool`, *optional*, defaults to `True`): + Whether to add a bias to the queries, keys and values. + frequency_stride (`int`, *optional*, defaults to 10): + Frequency stride to use when patchifying the spectrograms. + time_stride (`int`, *optional*, defaults to 10): + Temporal stride to use when patchifying the spectrograms. + max_length (`int`, *optional*, defaults to 1024): + Temporal dimension of the spectrograms. + num_mel_bins (`int`, *optional*, defaults to 128): + Frequency dimension of the spectrograms (number of Mel-frequency bins). + + Example: + + ```python + >>> from transformers import ASTConfig, ASTModel + + >>> # Initializing a AST MIT/ast-finetuned-audioset-10-10-0.4593 style configuration + >>> configuration = ASTConfig() + + >>> # Initializing a model (with random weights) from the MIT/ast-finetuned-audioset-10-10-0.4593 style configuration + >>> model = ASTModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "audio-spectrogram-transformer" + + def __init__( + self, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + initializer_range=0.02, + layer_norm_eps=1e-12, + patch_size=16, + qkv_bias=True, + frequency_stride=10, + time_stride=10, + max_length=1024, + num_mel_bins=128, + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.patch_size = patch_size + self.qkv_bias = qkv_bias + self.frequency_stride = frequency_stride + self.time_stride = time_stride + self.max_length = max_length + self.num_mel_bins = num_mel_bins + + # Overwritten from the parent class: AST is not compatible with `generate`, but has a config parameter sharing the + # same name (`max_length`). Sharing the same name triggers checks regarding the config -> generation_config + # generative parameters deprecation cycle, overwriting this function prevents this from happening. + def _get_non_default_generation_parameters(self) -> Dict[str, Any]: + return {} + + +__all__ = ["ASTConfig"] diff --git a/docs/transformers/build/lib/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py b/docs/transformers/build/lib/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py new file mode 100644 index 0000000000000000000000000000000000000000..d211ef7ab058f0ab23439d727cf5fa4f22dad4cf --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/audio_spectrogram_transformer/convert_audio_spectrogram_transformer_original_to_pytorch.py @@ -0,0 +1,279 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert Audio Spectrogram Transformer checkpoints from the original repository. URL: https://github.com/YuanGongND/ast""" + +import argparse +import json +from pathlib import Path + +import torch +import torchaudio +from datasets import load_dataset +from huggingface_hub import hf_hub_download + +from transformers import ASTConfig, ASTFeatureExtractor, ASTForAudioClassification +from transformers.utils import logging + + +logging.set_verbosity_info() +logger = logging.get_logger(__name__) + + +def get_audio_spectrogram_transformer_config(model_name): + config = ASTConfig() + + if "10-10" in model_name: + pass + elif "speech-commands" in model_name: + config.max_length = 128 + elif "12-12" in model_name: + config.time_stride = 12 + config.frequency_stride = 12 + elif "14-14" in model_name: + config.time_stride = 14 + config.frequency_stride = 14 + elif "16-16" in model_name: + config.time_stride = 16 + config.frequency_stride = 16 + else: + raise ValueError("Model not supported") + + repo_id = "huggingface/label-files" + if "speech-commands" in model_name: + config.num_labels = 35 + filename = "speech-commands-v2-id2label.json" + else: + config.num_labels = 527 + filename = "audioset-id2label.json" + + id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) + id2label = {int(k): v for k, v in id2label.items()} + config.id2label = id2label + config.label2id = {v: k for k, v in id2label.items()} + + return config + + +def rename_key(name): + if "module.v" in name: + name = name.replace("module.v", "audio_spectrogram_transformer") + if "cls_token" in name: + name = name.replace("cls_token", "embeddings.cls_token") + if "dist_token" in name: + name = name.replace("dist_token", "embeddings.distillation_token") + if "pos_embed" in name: + name = name.replace("pos_embed", "embeddings.position_embeddings") + if "patch_embed.proj" in name: + name = name.replace("patch_embed.proj", "embeddings.patch_embeddings.projection") + # transformer blocks + if "blocks" in name: + name = name.replace("blocks", "encoder.layer") + if "attn.proj" in name: + name = name.replace("attn.proj", "attention.output.dense") + if "attn" in name: + name = name.replace("attn", "attention.self") + if "norm1" in name: + name = name.replace("norm1", "layernorm_before") + if "norm2" in name: + name = name.replace("norm2", "layernorm_after") + if "mlp.fc1" in name: + name = name.replace("mlp.fc1", "intermediate.dense") + if "mlp.fc2" in name: + name = name.replace("mlp.fc2", "output.dense") + # final layernorm + if "audio_spectrogram_transformer.norm" in name: + name = name.replace("audio_spectrogram_transformer.norm", "audio_spectrogram_transformer.layernorm") + # classifier head + if "module.mlp_head.0" in name: + name = name.replace("module.mlp_head.0", "classifier.layernorm") + if "module.mlp_head.1" in name: + name = name.replace("module.mlp_head.1", "classifier.dense") + + return name + + +def convert_state_dict(orig_state_dict, config): + for key in orig_state_dict.copy().keys(): + val = orig_state_dict.pop(key) + + if "qkv" in key: + key_split = key.split(".") + layer_num = int(key_split[3]) + dim = config.hidden_size + if "weight" in key: + orig_state_dict[ + f"audio_spectrogram_transformer.encoder.layer.{layer_num}.attention.attention.query.weight" + ] = val[:dim, :] + orig_state_dict[ + f"audio_spectrogram_transformer.encoder.layer.{layer_num}.attention.attention.key.weight" + ] = val[dim : dim * 2, :] + orig_state_dict[ + f"audio_spectrogram_transformer.encoder.layer.{layer_num}.attention.attention.value.weight" + ] = val[-dim:, :] + else: + orig_state_dict[ + f"audio_spectrogram_transformer.encoder.layer.{layer_num}.attention.attention.query.bias" + ] = val[:dim] + orig_state_dict[ + f"audio_spectrogram_transformer.encoder.layer.{layer_num}.attention.attention.key.bias" + ] = val[dim : dim * 2] + orig_state_dict[ + f"audio_spectrogram_transformer.encoder.layer.{layer_num}.attention.attention.value.bias" + ] = val[-dim:] + else: + orig_state_dict[rename_key(key)] = val + + return orig_state_dict + + +def remove_keys(state_dict): + ignore_keys = [ + "module.v.head.weight", + "module.v.head.bias", + "module.v.head_dist.weight", + "module.v.head_dist.bias", + ] + for k in ignore_keys: + state_dict.pop(k, None) + + +@torch.no_grad() +def convert_audio_spectrogram_transformer_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub=False): + """ + Copy/paste/tweak model's weights to our Audio Spectrogram Transformer structure. + """ + config = get_audio_spectrogram_transformer_config(model_name) + + model_name_to_url = { + "ast-finetuned-audioset-10-10-0.4593": ( + "https://www.dropbox.com/s/ca0b1v2nlxzyeb4/audioset_10_10_0.4593.pth?dl=1" + ), + "ast-finetuned-audioset-10-10-0.450": ( + "https://www.dropbox.com/s/1tv0hovue1bxupk/audioset_10_10_0.4495.pth?dl=1" + ), + "ast-finetuned-audioset-10-10-0.448": ( + "https://www.dropbox.com/s/6u5sikl4b9wo4u5/audioset_10_10_0.4483.pth?dl=1" + ), + "ast-finetuned-audioset-10-10-0.448-v2": ( + "https://www.dropbox.com/s/kt6i0v9fvfm1mbq/audioset_10_10_0.4475.pth?dl=1" + ), + "ast-finetuned-audioset-12-12-0.447": ( + "https://www.dropbox.com/s/snfhx3tizr4nuc8/audioset_12_12_0.4467.pth?dl=1" + ), + "ast-finetuned-audioset-14-14-0.443": ( + "https://www.dropbox.com/s/z18s6pemtnxm4k7/audioset_14_14_0.4431.pth?dl=1" + ), + "ast-finetuned-audioset-16-16-0.442": ( + "https://www.dropbox.com/s/mdsa4t1xmcimia6/audioset_16_16_0.4422.pth?dl=1" + ), + "ast-finetuned-speech-commands-v2": ( + "https://www.dropbox.com/s/q0tbqpwv44pquwy/speechcommands_10_10_0.9812.pth?dl=1" + ), + } + + # load original state_dict + checkpoint_url = model_name_to_url[model_name] + state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu") + # remove some keys + remove_keys(state_dict) + # rename some keys + new_state_dict = convert_state_dict(state_dict, config) + + # load 🤗 model + model = ASTForAudioClassification(config) + model.eval() + + model.load_state_dict(new_state_dict) + + # verify outputs on dummy input + # source: https://github.com/YuanGongND/ast/blob/79e873b8a54d0a3b330dd522584ff2b9926cd581/src/run.py#L62 + mean = -4.2677393 if "speech-commands" not in model_name else -6.845978 + std = 4.5689974 if "speech-commands" not in model_name else 5.5654526 + max_length = 1024 if "speech-commands" not in model_name else 128 + feature_extractor = ASTFeatureExtractor(mean=mean, std=std, max_length=max_length) + + if "speech-commands" in model_name: + # TODO: Convert dataset to Parquet + dataset = load_dataset("google/speech_commands", "v0.02", split="validation", trust_remote_code=True) + waveform = dataset[0]["audio"]["array"] + else: + filepath = hf_hub_download( + repo_id="nielsr/audio-spectogram-transformer-checkpoint", + filename="sample_audio.flac", + repo_type="dataset", + ) + + waveform, _ = torchaudio.load(filepath) + waveform = waveform.squeeze().numpy() + + inputs = feature_extractor(waveform, sampling_rate=16000, return_tensors="pt") + + # forward pass + outputs = model(**inputs) + logits = outputs.logits + + if model_name == "ast-finetuned-audioset-10-10-0.4593": + expected_slice = torch.tensor([-0.8760, -7.0042, -8.6602]) + elif model_name == "ast-finetuned-audioset-10-10-0.450": + expected_slice = torch.tensor([-1.1986, -7.0903, -8.2718]) + elif model_name == "ast-finetuned-audioset-10-10-0.448": + expected_slice = torch.tensor([-2.6128, -8.0080, -9.4344]) + elif model_name == "ast-finetuned-audioset-10-10-0.448-v2": + expected_slice = torch.tensor([-1.5080, -7.4534, -8.8917]) + elif model_name == "ast-finetuned-audioset-12-12-0.447": + expected_slice = torch.tensor([-0.5050, -6.5833, -8.0843]) + elif model_name == "ast-finetuned-audioset-14-14-0.443": + expected_slice = torch.tensor([-0.3826, -7.0336, -8.2413]) + elif model_name == "ast-finetuned-audioset-16-16-0.442": + expected_slice = torch.tensor([-1.2113, -6.9101, -8.3470]) + elif model_name == "ast-finetuned-speech-commands-v2": + expected_slice = torch.tensor([6.1589, -8.0566, -8.7984]) + else: + raise ValueError("Unknown model name") + if not torch.allclose(logits[0, :3], expected_slice, atol=1e-4): + raise ValueError("Logits don't match") + print("Looks ok!") + + if pytorch_dump_folder_path is not None: + Path(pytorch_dump_folder_path).mkdir(exist_ok=True) + print(f"Saving model {model_name} to {pytorch_dump_folder_path}") + model.save_pretrained(pytorch_dump_folder_path) + print(f"Saving feature extractor to {pytorch_dump_folder_path}") + feature_extractor.save_pretrained(pytorch_dump_folder_path) + + if push_to_hub: + print("Pushing model and feature extractor to the hub...") + model.push_to_hub(f"MIT/{model_name}") + feature_extractor.push_to_hub(f"MIT/{model_name}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument( + "--model_name", + default="ast-finetuned-audioset-10-10-0.4593", + type=str, + help="Name of the Audio Spectrogram Transformer model you'd like to convert.", + ) + parser.add_argument( + "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." + ) + parser.add_argument( + "--push_to_hub", action="store_true", help="Whether or not to push the converted model to the 🤗 hub." + ) + + args = parser.parse_args() + convert_audio_spectrogram_transformer_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/docs/transformers/build/lib/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py b/docs/transformers/build/lib/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..888d38e1870367526492ecfa38f3a488cc5a0b02 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py @@ -0,0 +1,239 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Feature extractor class for Audio Spectrogram Transformer. +""" + +from typing import List, Optional, Union + +import numpy as np + +from ...audio_utils import mel_filter_bank, spectrogram, window_function +from ...feature_extraction_sequence_utils import SequenceFeatureExtractor +from ...feature_extraction_utils import BatchFeature +from ...utils import TensorType, is_speech_available, is_torch_available, logging + + +if is_speech_available(): + import torchaudio.compliance.kaldi as ta_kaldi + +if is_torch_available(): + import torch + + +logger = logging.get_logger(__name__) + + +class ASTFeatureExtractor(SequenceFeatureExtractor): + r""" + Constructs a Audio Spectrogram Transformer (AST) feature extractor. + + This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains + most of the main methods. Users should refer to this superclass for more information regarding those methods. + + This class extracts mel-filter bank features from raw speech using TorchAudio if installed or using numpy + otherwise, pads/truncates them to a fixed length and normalizes them using a mean and standard deviation. + + Args: + feature_size (`int`, *optional*, defaults to 1): + The feature dimension of the extracted features. + sampling_rate (`int`, *optional*, defaults to 16000): + The sampling rate at which the audio files should be digitalized expressed in hertz (Hz). + num_mel_bins (`int`, *optional*, defaults to 128): + Number of Mel-frequency bins. + max_length (`int`, *optional*, defaults to 1024): + Maximum length to which to pad/truncate the extracted features. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether or not to normalize the log-Mel features using `mean` and `std`. + mean (`float`, *optional*, defaults to -4.2677393): + The mean value used to normalize the log-Mel features. Uses the AudioSet mean by default. + std (`float`, *optional*, defaults to 4.5689974): + The standard deviation value used to normalize the log-Mel features. Uses the AudioSet standard deviation + by default. + return_attention_mask (`bool`, *optional*, defaults to `False`): + Whether or not [`~ASTFeatureExtractor.__call__`] should return `attention_mask`. + """ + + model_input_names = ["input_values", "attention_mask"] + + def __init__( + self, + feature_size=1, + sampling_rate=16000, + num_mel_bins=128, + max_length=1024, + padding_value=0.0, + do_normalize=True, + mean=-4.2677393, + std=4.5689974, + return_attention_mask=False, + **kwargs, + ): + super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs) + self.num_mel_bins = num_mel_bins + self.max_length = max_length + self.do_normalize = do_normalize + self.mean = mean + self.std = std + self.return_attention_mask = return_attention_mask + + if not is_speech_available(): + mel_filters = mel_filter_bank( + num_frequency_bins=257, + num_mel_filters=self.num_mel_bins, + min_frequency=20, + max_frequency=sampling_rate // 2, + sampling_rate=sampling_rate, + norm=None, + mel_scale="kaldi", + triangularize_in_mel_space=True, + ) + + self.mel_filters = mel_filters + self.window = window_function(400, "hann", periodic=False) + + def _extract_fbank_features( + self, + waveform: np.ndarray, + max_length: int, + ) -> np.ndarray: + """ + Get mel-filter bank features using TorchAudio. Note that TorchAudio requires 16-bit signed integers as inputs + and hence the waveform should not be normalized before feature extraction. + """ + # waveform = waveform * (2**15) # Kaldi compliance: 16-bit signed integers + if is_speech_available(): + waveform = torch.from_numpy(waveform).unsqueeze(0) + fbank = ta_kaldi.fbank( + waveform, + sample_frequency=self.sampling_rate, + window_type="hanning", + num_mel_bins=self.num_mel_bins, + ) + else: + waveform = np.squeeze(waveform) + fbank = spectrogram( + waveform, + self.window, + frame_length=400, + hop_length=160, + fft_length=512, + power=2.0, + center=False, + preemphasis=0.97, + mel_filters=self.mel_filters, + log_mel="log", + mel_floor=1.192092955078125e-07, + remove_dc_offset=True, + ).T + + fbank = torch.from_numpy(fbank) + + n_frames = fbank.shape[0] + difference = max_length - n_frames + + # pad or truncate, depending on difference + if difference > 0: + pad_module = torch.nn.ZeroPad2d((0, 0, 0, difference)) + fbank = pad_module(fbank) + elif difference < 0: + fbank = fbank[0:max_length, :] + + fbank = fbank.numpy() + + return fbank + + def normalize(self, input_values: np.ndarray) -> np.ndarray: + return (input_values - (self.mean)) / (self.std * 2) + + def __call__( + self, + raw_speech: Union[np.ndarray, List[float], List[np.ndarray], List[List[float]]], + sampling_rate: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + **kwargs, + ) -> BatchFeature: + """ + Main method to featurize and prepare for the model one or several sequence(s). + + Args: + raw_speech (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`): + The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float + values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not + stereo, i.e. single float per timestep. + sampling_rate (`int`, *optional*): + The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass + `sampling_rate` at the forward call to prevent silent errors. + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors instead of list of python integers. Acceptable values are: + + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return Numpy `np.ndarray` objects. + """ + + if sampling_rate is not None: + if sampling_rate != self.sampling_rate: + raise ValueError( + f"The model corresponding to this feature extractor: {self} was trained using a sampling rate of" + f" {self.sampling_rate}. Please make sure that the provided `raw_speech` input was sampled with" + f" {self.sampling_rate} and not {sampling_rate}." + ) + else: + logger.warning( + f"It is strongly recommended to pass the `sampling_rate` argument to `{self.__class__.__name__}()`. " + "Failing to do so can result in silent errors that might be hard to debug." + ) + + is_batched_numpy = isinstance(raw_speech, np.ndarray) and len(raw_speech.shape) > 1 + if is_batched_numpy and len(raw_speech.shape) > 2: + raise ValueError(f"Only mono-channel audio is supported for input to {self}") + is_batched = is_batched_numpy or ( + isinstance(raw_speech, (list, tuple)) and (isinstance(raw_speech[0], (np.ndarray, tuple, list))) + ) + + if is_batched: + raw_speech = [np.asarray(speech, dtype=np.float32) for speech in raw_speech] + elif not is_batched and not isinstance(raw_speech, np.ndarray): + raw_speech = np.asarray(raw_speech, dtype=np.float32) + elif isinstance(raw_speech, np.ndarray) and raw_speech.dtype is np.dtype(np.float64): + raw_speech = raw_speech.astype(np.float32) + + # always return batch + if not is_batched: + raw_speech = [raw_speech] + + # extract fbank features and pad/truncate to max_length + features = [self._extract_fbank_features(waveform, max_length=self.max_length) for waveform in raw_speech] + + # convert into BatchFeature + padded_inputs = BatchFeature({"input_values": features}) + + # make sure list is in array format + input_values = padded_inputs.get("input_values") + if isinstance(input_values[0], list): + padded_inputs["input_values"] = [np.asarray(feature, dtype=np.float32) for feature in input_values] + + # normalization + if self.do_normalize: + padded_inputs["input_values"] = [self.normalize(feature) for feature in input_values] + + if return_tensors is not None: + padded_inputs = padded_inputs.convert_to_tensors(return_tensors) + + return padded_inputs + + +__all__ = ["ASTFeatureExtractor"] diff --git a/docs/transformers/build/lib/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py b/docs/transformers/build/lib/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..1aa24ca08a5aa17763074379f0db904c9e0f0fb1 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py @@ -0,0 +1,650 @@ +# coding=utf-8 +# Copyright 2022 MIT and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch Audio Spectrogram Transformer (AST) model.""" + +from typing import Callable, Dict, List, Optional, Set, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ...activations import ACT2FN +from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, SequenceClassifierOutput +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer +from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging +from .configuration_audio_spectrogram_transformer import ASTConfig + + +logger = logging.get_logger(__name__) + +# General docstring +_CONFIG_FOR_DOC = "ASTConfig" + +# Base docstring +_CHECKPOINT_FOR_DOC = "MIT/ast-finetuned-audioset-10-10-0.4593" +_EXPECTED_OUTPUT_SHAPE = [1, 1214, 768] + +# Audio classification docstring +_SEQ_CLASS_CHECKPOINT = "MIT/ast-finetuned-audioset-10-10-0.4593" +_SEQ_CLASS_EXPECTED_OUTPUT = "'Speech'" +_SEQ_CLASS_EXPECTED_LOSS = 0.17 + + +class ASTEmbeddings(nn.Module): + """ + Construct the CLS token, position and patch embeddings. + """ + + def __init__(self, config: ASTConfig) -> None: + super().__init__() + + self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) + self.distillation_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) + self.patch_embeddings = ASTPatchEmbeddings(config) + + frequency_out_dimension, time_out_dimension = self.get_shape(config) + num_patches = frequency_out_dimension * time_out_dimension + self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 2, config.hidden_size)) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.config = config + + def get_shape(self, config): + # see Karpathy's cs231n blog on how to calculate the output dimensions + # https://cs231n.github.io/convolutional-networks/#conv + frequency_out_dimension = (config.num_mel_bins - config.patch_size) // config.frequency_stride + 1 + time_out_dimension = (config.max_length - config.patch_size) // config.time_stride + 1 + + return frequency_out_dimension, time_out_dimension + + def forward(self, input_values: torch.Tensor) -> torch.Tensor: + batch_size = input_values.shape[0] + embeddings = self.patch_embeddings(input_values) + + cls_tokens = self.cls_token.expand(batch_size, -1, -1) + distillation_tokens = self.distillation_token.expand(batch_size, -1, -1) + embeddings = torch.cat((cls_tokens, distillation_tokens, embeddings), dim=1) + embeddings = embeddings + self.position_embeddings + embeddings = self.dropout(embeddings) + + return embeddings + + +class ASTPatchEmbeddings(nn.Module): + """ + This class turns `input_values` into the initial `hidden_states` (patch embeddings) of shape `(batch_size, + seq_length, hidden_size)` to be consumed by a Transformer. + """ + + def __init__(self, config): + super().__init__() + + patch_size = config.patch_size + frequency_stride = config.frequency_stride + time_stride = config.time_stride + + self.projection = nn.Conv2d( + 1, config.hidden_size, kernel_size=(patch_size, patch_size), stride=(frequency_stride, time_stride) + ) + + def forward(self, input_values: torch.Tensor) -> torch.Tensor: + input_values = input_values.unsqueeze(1) + input_values = input_values.transpose(2, 3) + embeddings = self.projection(input_values).flatten(2).transpose(1, 2) + return embeddings + + +# Copied from transformers.models.vit.modeling_vit.eager_attention_forward +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: float, + dropout: float = 0.0, + **kwargs, +): + # Take the dot product between "query" and "key" to get the raw attention scores. + attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling + + # Normalize the attention scores to probabilities. + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + + # Mask heads if we want to + if attention_mask is not None: + attn_weights = attn_weights * attention_mask + + attn_output = torch.matmul(attn_weights, value) + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + + +# Copied from transformers.models.vit.modeling_vit.ViTSelfAttention with ViT->AST +class ASTSelfAttention(nn.Module): + def __init__(self, config: ASTConfig) -> None: + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size {config.hidden_size} is not a multiple of the number of attention " + f"heads {config.num_attention_heads}." + ) + + self.config = config + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.dropout_prob = config.attention_probs_dropout_prob + self.scaling = self.attention_head_size**-0.5 + self.is_causal = False + + self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias) + + def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + query_layer = self.transpose_for_scores(self.query(hidden_states)) + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + if self.config._attn_implementation == "sdpa" and output_attentions: + logger.warning_once( + "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to " + 'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' + ) + else: + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + context_layer, attention_probs = attention_interface( + self, + query_layer, + key_layer, + value_layer, + head_mask, + is_causal=self.is_causal, + scaling=self.scaling, + dropout=0.0 if not self.training else self.dropout_prob, + ) + + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.reshape(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + return outputs + + +# Copied from transformers.models.vit.modeling_vit.ViTSelfOutput with ViT->AST +class ASTSelfOutput(nn.Module): + """ + The residual connection is defined in ASTLayer instead of here (as is the case with other models), due to the + layernorm applied before each block. + """ + + def __init__(self, config: ASTConfig) -> None: + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + + return hidden_states + + +# Copied from transformers.models.vit.modeling_vit.ViTAttention with ViT->AST +class ASTAttention(nn.Module): + def __init__(self, config: ASTConfig) -> None: + super().__init__() + self.attention = ASTSelfAttention(config) + self.output = ASTSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads: Set[int]) -> None: + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.attention.query = prune_linear_layer(self.attention.query, index) + self.attention.key = prune_linear_layer(self.attention.key, index) + self.attention.value = prune_linear_layer(self.attention.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads) + self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + self_outputs = self.attention(hidden_states, head_mask, output_attentions) + + attention_output = self.output(self_outputs[0], hidden_states) + + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +# Copied from transformers.models.vit.modeling_vit.ViTIntermediate with ViT->AST +class ASTIntermediate(nn.Module): + def __init__(self, config: ASTConfig) -> None: + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + + return hidden_states + + +# Copied from transformers.models.vit.modeling_vit.ViTOutput with ViT->AST +class ASTOutput(nn.Module): + def __init__(self, config: ASTConfig) -> None: + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + + hidden_states = hidden_states + input_tensor + + return hidden_states + + +# Copied from transformers.models.vit.modeling_vit.ViTLayer with ViT->AST,VIT->AST +class ASTLayer(nn.Module): + """This corresponds to the Block class in the timm implementation.""" + + def __init__(self, config: ASTConfig) -> None: + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = ASTAttention(config) + self.intermediate = ASTIntermediate(config) + self.output = ASTOutput(config) + self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Union[Tuple[torch.Tensor, torch.Tensor], Tuple[torch.Tensor]]: + self_attention_outputs = self.attention( + self.layernorm_before(hidden_states), # in AST, layernorm is applied before self-attention + head_mask, + output_attentions=output_attentions, + ) + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + # first residual connection + hidden_states = attention_output + hidden_states + + # in AST, layernorm is also applied after self-attention + layer_output = self.layernorm_after(hidden_states) + layer_output = self.intermediate(layer_output) + + # second residual connection is done here + layer_output = self.output(layer_output, hidden_states) + + outputs = (layer_output,) + outputs + + return outputs + + +# Copied from transformers.models.vit.modeling_vit.ViTEncoder with ViT->AST +class ASTEncoder(nn.Module): + def __init__(self, config: ASTConfig) -> None: + super().__init__() + self.config = config + self.layer = nn.ModuleList([ASTLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ) -> Union[tuple, BaseModelOutput]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + layer_module.__call__, + hidden_states, + layer_head_mask, + output_attentions, + ) + else: + layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class ASTPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = ASTConfig + base_model_prefix = "audio_spectrogram_transformer" + main_input_name = "input_values" + supports_gradient_checkpointing = True + _supports_sdpa = True + _supports_flash_attn_2 = True + + def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None: + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d)): + # Upcast the input in `fp32` and cast it back to desired `dtype` to avoid + # `trunc_normal_cpu` not implemented in `half` issues + module.weight.data = nn.init.trunc_normal_( + module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range + ).to(module.weight.dtype) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, ASTEmbeddings): + module.cls_token.data.zero_() + module.position_embeddings.data.zero_() + module.distillation_token.data.zero_() + + +AUDIO_SPECTROGRAM_TRANSFORMER_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it + as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`ASTConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +AUDIO_SPECTROGRAM_TRANSFORMER_INPUTS_DOCSTRING = r""" + Args: + input_values (`torch.FloatTensor` of shape `(batch_size, max_length, num_mel_bins)`): + Float values mel features extracted from the raw audio waveform. Raw audio waveform can be obtained by + loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via + the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the + [`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a + tensor of type `torch.FloatTensor`. See [`~ASTFeatureExtractor.__call__`] + + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare AST Model transformer outputting raw hidden-states without any specific head on top.", + AUDIO_SPECTROGRAM_TRANSFORMER_START_DOCSTRING, +) +class ASTModel(ASTPreTrainedModel): + def __init__(self, config: ASTConfig) -> None: + super().__init__(config) + self.config = config + + self.embeddings = ASTEmbeddings(config) + self.encoder = ASTEncoder(config) + + self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> ASTPatchEmbeddings: + return self.embeddings.patch_embeddings + + def _prune_heads(self, heads_to_prune: Dict[int, List[int]]) -> None: + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward(AUDIO_SPECTROGRAM_TRANSFORMER_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutputWithPooling, + config_class=_CONFIG_FOR_DOC, + modality="audio", + expected_output=_EXPECTED_OUTPUT_SHAPE, + ) + def forward( + self, + input_values: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_values is None: + raise ValueError("You have to specify input_values") + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embedding_output = self.embeddings(input_values) + + encoder_outputs = self.encoder( + embedding_output, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + sequence_output = self.layernorm(sequence_output) + + pooled_output = (sequence_output[:, 0] + sequence_output[:, 1]) / 2 + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +class ASTMLPHead(nn.Module): + def __init__(self, config: ASTConfig): + super().__init__() + self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dense = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity() + + def forward(self, hidden_state): + hidden_state = self.layernorm(hidden_state) + hidden_state = self.dense(hidden_state) + return hidden_state + + +@add_start_docstrings( + """ + Audio Spectrogram Transformer model with an audio classification head on top (a linear layer on top of the pooled + output) e.g. for datasets like AudioSet, Speech Commands v2. + """, + AUDIO_SPECTROGRAM_TRANSFORMER_START_DOCSTRING, +) +class ASTForAudioClassification(ASTPreTrainedModel): + def __init__(self, config: ASTConfig) -> None: + super().__init__(config) + + self.num_labels = config.num_labels + self.audio_spectrogram_transformer = ASTModel(config) + + # Classifier head + self.classifier = ASTMLPHead(config) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(AUDIO_SPECTROGRAM_TRANSFORMER_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_SEQ_CLASS_CHECKPOINT, + output_type=SequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + modality="audio", + expected_output=_SEQ_CLASS_EXPECTED_OUTPUT, + expected_loss=_SEQ_CLASS_EXPECTED_LOSS, + ) + def forward( + self, + input_values: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[tuple, SequenceClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the audio classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.audio_spectrogram_transformer( + input_values, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +__all__ = ["ASTForAudioClassification", "ASTModel", "ASTPreTrainedModel"] diff --git a/docs/transformers/build/lib/transformers/models/auto/__init__.py b/docs/transformers/build/lib/transformers/models/auto/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6828030287e9472748f3f7ecf2e5b7fb06625e87 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/auto/__init__.py @@ -0,0 +1,34 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .auto_factory import * + from .configuration_auto import * + from .feature_extraction_auto import * + from .image_processing_auto import * + from .modeling_auto import * + from .modeling_flax_auto import * + from .modeling_tf_auto import * + from .processing_auto import * + from .tokenization_auto import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/docs/transformers/build/lib/transformers/models/auto/auto_factory.py b/docs/transformers/build/lib/transformers/models/auto/auto_factory.py new file mode 100644 index 0000000000000000000000000000000000000000..d458c1b6f266f35510a39030efc5237260a28940 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/auto/auto_factory.py @@ -0,0 +1,849 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Factory function to build auto-model classes.""" + +import copy +import importlib +import json +import warnings +from collections import OrderedDict + +from ...configuration_utils import PretrainedConfig +from ...dynamic_module_utils import get_class_from_dynamic_module, resolve_trust_remote_code +from ...utils import ( + CONFIG_NAME, + cached_file, + copy_func, + extract_commit_hash, + find_adapter_config_file, + is_peft_available, + is_torch_available, + logging, + requires_backends, +) +from .configuration_auto import AutoConfig, model_type_to_module_name, replace_list_option_in_docstrings + + +if is_torch_available(): + from ...generation import GenerationMixin + + +logger = logging.get_logger(__name__) + + +CLASS_DOCSTRING = """ + This is a generic model class that will be instantiated as one of the model classes of the library when created + with the [`~BaseAutoModelClass.from_pretrained`] class method or the [`~BaseAutoModelClass.from_config`] class + method. + + This class cannot be instantiated directly using `__init__()` (throws an error). +""" + +FROM_CONFIG_DOCSTRING = """ + Instantiates one of the model classes of the library from a configuration. + + Note: + Loading a model from its configuration file does **not** load the model weights. It only affects the + model's configuration. Use [`~BaseAutoModelClass.from_pretrained`] to load the model weights. + + Args: + config ([`PretrainedConfig`]): + The model class to instantiate is selected based on the configuration class: + + List options + attn_implementation (`str`, *optional*): + The attention implementation to use in the model (if relevant). Can be any of `"eager"` (manual implementation of the attention), `"sdpa"` (using [`F.scaled_dot_product_attention`](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention.html)), or `"flash_attention_2"` (using [Dao-AILab/flash-attention](https://github.com/Dao-AILab/flash-attention)). By default, if available, SDPA will be used for torch>=2.1.1. The default is otherwise the manual `"eager"` implementation. + + Examples: + + ```python + >>> from transformers import AutoConfig, BaseAutoModelClass + + >>> # Download configuration from huggingface.co and cache. + >>> config = AutoConfig.from_pretrained("checkpoint_placeholder") + >>> model = BaseAutoModelClass.from_config(config) + ``` +""" + +FROM_PRETRAINED_TORCH_DOCSTRING = """ + Instantiate one of the model classes of the library from a pretrained model. + + The model class to instantiate is selected based on the `model_type` property of the config object (either + passed as an argument or loaded from `pretrained_model_name_or_path` if possible), or when it's missing, by + falling back to using pattern matching on `pretrained_model_name_or_path`: + + List options + + The model is set in evaluation mode by default using `model.eval()` (so for instance, dropout modules are + deactivated). To train the model, you should first set it back in training mode with `model.train()` + + Args: + pretrained_model_name_or_path (`str` or `os.PathLike`): + Can be either: + + - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co. + - A path to a *directory* containing model weights saved using + [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`. + - A path or url to a *tensorflow index checkpoint file* (e.g, `./tf_model/model.ckpt.index`). In + this case, `from_tf` should be set to `True` and a configuration object should be provided as + `config` argument. This loading path is slower than converting the TensorFlow checkpoint in a + PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards. + model_args (additional positional arguments, *optional*): + Will be passed along to the underlying model `__init__()` method. + config ([`PretrainedConfig`], *optional*): + Configuration for the model to use instead of an automatically loaded configuration. Configuration can + be automatically loaded when: + + - The model is a model provided by the library (loaded with the *model id* string of a pretrained + model). + - The model was saved using [`~PreTrainedModel.save_pretrained`] and is reloaded by supplying the + save directory. + - The model is loaded by supplying a local directory as `pretrained_model_name_or_path` and a + configuration JSON file named *config.json* is found in the directory. + state_dict (*Dict[str, torch.Tensor]*, *optional*): + A state dictionary to use instead of a state dictionary loaded from saved weights file. + + This option can be used if you want to create a model from a pretrained configuration but load your own + weights. In this case though, you should check if using [`~PreTrainedModel.save_pretrained`] and + [`~PreTrainedModel.from_pretrained`] is not a simpler option. + cache_dir (`str` or `os.PathLike`, *optional*): + Path to a directory in which a downloaded pretrained model configuration should be cached if the + standard cache should not be used. + from_tf (`bool`, *optional*, defaults to `False`): + Load the model weights from a TensorFlow checkpoint save file (see docstring of + `pretrained_model_name_or_path` argument). + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force the (re-)download of the model weights and configuration files, overriding the + cached versions if they exist. + resume_download: + Deprecated and ignored. All downloads are now resumed by default when possible. + Will be removed in v5 of Transformers. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. + output_loading_info(`bool`, *optional*, defaults to `False`): + Whether ot not to also return a dictionary containing missing keys, unexpected keys and error messages. + local_files_only(`bool`, *optional*, defaults to `False`): + Whether or not to only look at local files (e.g., not try downloading the model). + revision (`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a + git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any + identifier allowed by git. + trust_remote_code (`bool`, *optional*, defaults to `False`): + Whether or not to allow for custom models defined on the Hub in their own modeling files. This option + should only be set to `True` for repositories you trust and in which you have read the code, as it will + execute code present on the Hub on your local machine. + code_revision (`str`, *optional*, defaults to `"main"`): + The specific revision to use for the code on the Hub, if the code leaves in a different repository than + the rest of the model. It can be a branch name, a tag name, or a commit id, since we use a git-based + system for storing models and other artifacts on huggingface.co, so `revision` can be any identifier + allowed by git. + kwargs (additional keyword arguments, *optional*): + Can be used to update the configuration object (after it being loaded) and initiate the model (e.g., + `output_attentions=True`). Behaves differently depending on whether a `config` is provided or + automatically loaded: + + - If a configuration is provided with `config`, `**kwargs` will be directly passed to the + underlying model's `__init__` method (we assume all relevant updates to the configuration have + already been done) + - If a configuration is not provided, `kwargs` will be first passed to the configuration class + initialization function ([`~PretrainedConfig.from_pretrained`]). Each key of `kwargs` that + corresponds to a configuration attribute will be used to override said attribute with the + supplied `kwargs` value. Remaining keys that do not correspond to any configuration attribute + will be passed to the underlying model's `__init__` function. + + Examples: + + ```python + >>> from transformers import AutoConfig, BaseAutoModelClass + + >>> # Download model and configuration from huggingface.co and cache. + >>> model = BaseAutoModelClass.from_pretrained("checkpoint_placeholder") + + >>> # Update configuration during loading + >>> model = BaseAutoModelClass.from_pretrained("checkpoint_placeholder", output_attentions=True) + >>> model.config.output_attentions + True + + >>> # Loading from a TF checkpoint file instead of a PyTorch model (slower) + >>> config = AutoConfig.from_pretrained("./tf_model/shortcut_placeholder_tf_model_config.json") + >>> model = BaseAutoModelClass.from_pretrained( + ... "./tf_model/shortcut_placeholder_tf_checkpoint.ckpt.index", from_tf=True, config=config + ... ) + ``` +""" + +FROM_PRETRAINED_TF_DOCSTRING = """ + Instantiate one of the model classes of the library from a pretrained model. + + The model class to instantiate is selected based on the `model_type` property of the config object (either + passed as an argument or loaded from `pretrained_model_name_or_path` if possible), or when it's missing, by + falling back to using pattern matching on `pretrained_model_name_or_path`: + + List options + + Args: + pretrained_model_name_or_path (`str` or `os.PathLike`): + Can be either: + + - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co. + - A path to a *directory* containing model weights saved using + [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`. + - A path or url to a *PyTorch state_dict save file* (e.g, `./pt_model/pytorch_model.bin`). In this + case, `from_pt` should be set to `True` and a configuration object should be provided as `config` + argument. This loading path is slower than converting the PyTorch model in a TensorFlow model + using the provided conversion scripts and loading the TensorFlow model afterwards. + model_args (additional positional arguments, *optional*): + Will be passed along to the underlying model `__init__()` method. + config ([`PretrainedConfig`], *optional*): + Configuration for the model to use instead of an automatically loaded configuration. Configuration can + be automatically loaded when: + + - The model is a model provided by the library (loaded with the *model id* string of a pretrained + model). + - The model was saved using [`~PreTrainedModel.save_pretrained`] and is reloaded by supplying the + save directory. + - The model is loaded by supplying a local directory as `pretrained_model_name_or_path` and a + configuration JSON file named *config.json* is found in the directory. + cache_dir (`str` or `os.PathLike`, *optional*): + Path to a directory in which a downloaded pretrained model configuration should be cached if the + standard cache should not be used. + from_pt (`bool`, *optional*, defaults to `False`): + Load the model weights from a PyTorch checkpoint save file (see docstring of + `pretrained_model_name_or_path` argument). + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force the (re-)download of the model weights and configuration files, overriding the + cached versions if they exist. + resume_download: + Deprecated and ignored. All downloads are now resumed by default when possible. + Will be removed in v5 of Transformers. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. + output_loading_info(`bool`, *optional*, defaults to `False`): + Whether ot not to also return a dictionary containing missing keys, unexpected keys and error messages. + local_files_only(`bool`, *optional*, defaults to `False`): + Whether or not to only look at local files (e.g., not try downloading the model). + revision (`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a + git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any + identifier allowed by git. + trust_remote_code (`bool`, *optional*, defaults to `False`): + Whether or not to allow for custom models defined on the Hub in their own modeling files. This option + should only be set to `True` for repositories you trust and in which you have read the code, as it will + execute code present on the Hub on your local machine. + code_revision (`str`, *optional*, defaults to `"main"`): + The specific revision to use for the code on the Hub, if the code leaves in a different repository than + the rest of the model. It can be a branch name, a tag name, or a commit id, since we use a git-based + system for storing models and other artifacts on huggingface.co, so `revision` can be any identifier + allowed by git. + kwargs (additional keyword arguments, *optional*): + Can be used to update the configuration object (after it being loaded) and initiate the model (e.g., + `output_attentions=True`). Behaves differently depending on whether a `config` is provided or + automatically loaded: + + - If a configuration is provided with `config`, `**kwargs` will be directly passed to the + underlying model's `__init__` method (we assume all relevant updates to the configuration have + already been done) + - If a configuration is not provided, `kwargs` will be first passed to the configuration class + initialization function ([`~PretrainedConfig.from_pretrained`]). Each key of `kwargs` that + corresponds to a configuration attribute will be used to override said attribute with the + supplied `kwargs` value. Remaining keys that do not correspond to any configuration attribute + will be passed to the underlying model's `__init__` function. + + Examples: + + ```python + >>> from transformers import AutoConfig, BaseAutoModelClass + + >>> # Download model and configuration from huggingface.co and cache. + >>> model = BaseAutoModelClass.from_pretrained("checkpoint_placeholder") + + >>> # Update configuration during loading + >>> model = BaseAutoModelClass.from_pretrained("checkpoint_placeholder", output_attentions=True) + >>> model.config.output_attentions + True + + >>> # Loading from a PyTorch checkpoint file instead of a TensorFlow model (slower) + >>> config = AutoConfig.from_pretrained("./pt_model/shortcut_placeholder_pt_model_config.json") + >>> model = BaseAutoModelClass.from_pretrained( + ... "./pt_model/shortcut_placeholder_pytorch_model.bin", from_pt=True, config=config + ... ) + ``` +""" + +FROM_PRETRAINED_FLAX_DOCSTRING = """ + Instantiate one of the model classes of the library from a pretrained model. + + The model class to instantiate is selected based on the `model_type` property of the config object (either + passed as an argument or loaded from `pretrained_model_name_or_path` if possible), or when it's missing, by + falling back to using pattern matching on `pretrained_model_name_or_path`: + + List options + + Args: + pretrained_model_name_or_path (`str` or `os.PathLike`): + Can be either: + + - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co. + - A path to a *directory* containing model weights saved using + [`~PreTrainedModel.save_pretrained`], e.g., `./my_model_directory/`. + - A path or url to a *PyTorch state_dict save file* (e.g, `./pt_model/pytorch_model.bin`). In this + case, `from_pt` should be set to `True` and a configuration object should be provided as `config` + argument. This loading path is slower than converting the PyTorch model in a TensorFlow model + using the provided conversion scripts and loading the TensorFlow model afterwards. + model_args (additional positional arguments, *optional*): + Will be passed along to the underlying model `__init__()` method. + config ([`PretrainedConfig`], *optional*): + Configuration for the model to use instead of an automatically loaded configuration. Configuration can + be automatically loaded when: + + - The model is a model provided by the library (loaded with the *model id* string of a pretrained + model). + - The model was saved using [`~PreTrainedModel.save_pretrained`] and is reloaded by supplying the + save directory. + - The model is loaded by supplying a local directory as `pretrained_model_name_or_path` and a + configuration JSON file named *config.json* is found in the directory. + cache_dir (`str` or `os.PathLike`, *optional*): + Path to a directory in which a downloaded pretrained model configuration should be cached if the + standard cache should not be used. + from_pt (`bool`, *optional*, defaults to `False`): + Load the model weights from a PyTorch checkpoint save file (see docstring of + `pretrained_model_name_or_path` argument). + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force the (re-)download of the model weights and configuration files, overriding the + cached versions if they exist. + resume_download: + Deprecated and ignored. All downloads are now resumed by default when possible. + Will be removed in v5 of Transformers. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. + output_loading_info(`bool`, *optional*, defaults to `False`): + Whether ot not to also return a dictionary containing missing keys, unexpected keys and error messages. + local_files_only(`bool`, *optional*, defaults to `False`): + Whether or not to only look at local files (e.g., not try downloading the model). + revision (`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a + git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any + identifier allowed by git. + trust_remote_code (`bool`, *optional*, defaults to `False`): + Whether or not to allow for custom models defined on the Hub in their own modeling files. This option + should only be set to `True` for repositories you trust and in which you have read the code, as it will + execute code present on the Hub on your local machine. + code_revision (`str`, *optional*, defaults to `"main"`): + The specific revision to use for the code on the Hub, if the code leaves in a different repository than + the rest of the model. It can be a branch name, a tag name, or a commit id, since we use a git-based + system for storing models and other artifacts on huggingface.co, so `revision` can be any identifier + allowed by git. + kwargs (additional keyword arguments, *optional*): + Can be used to update the configuration object (after it being loaded) and initiate the model (e.g., + `output_attentions=True`). Behaves differently depending on whether a `config` is provided or + automatically loaded: + + - If a configuration is provided with `config`, `**kwargs` will be directly passed to the + underlying model's `__init__` method (we assume all relevant updates to the configuration have + already been done) + - If a configuration is not provided, `kwargs` will be first passed to the configuration class + initialization function ([`~PretrainedConfig.from_pretrained`]). Each key of `kwargs` that + corresponds to a configuration attribute will be used to override said attribute with the + supplied `kwargs` value. Remaining keys that do not correspond to any configuration attribute + will be passed to the underlying model's `__init__` function. + + Examples: + + ```python + >>> from transformers import AutoConfig, BaseAutoModelClass + + >>> # Download model and configuration from huggingface.co and cache. + >>> model = BaseAutoModelClass.from_pretrained("checkpoint_placeholder") + + >>> # Update configuration during loading + >>> model = BaseAutoModelClass.from_pretrained("checkpoint_placeholder", output_attentions=True) + >>> model.config.output_attentions + True + + >>> # Loading from a PyTorch checkpoint file instead of a TensorFlow model (slower) + >>> config = AutoConfig.from_pretrained("./pt_model/shortcut_placeholder_pt_model_config.json") + >>> model = BaseAutoModelClass.from_pretrained( + ... "./pt_model/shortcut_placeholder_pytorch_model.bin", from_pt=True, config=config + ... ) + ``` +""" + + +def _get_model_class(config, model_mapping): + supported_models = model_mapping[type(config)] + if not isinstance(supported_models, (list, tuple)): + return supported_models + + name_to_model = {model.__name__: model for model in supported_models} + architectures = getattr(config, "architectures", []) + for arch in architectures: + if arch in name_to_model: + return name_to_model[arch] + elif f"TF{arch}" in name_to_model: + return name_to_model[f"TF{arch}"] + elif f"Flax{arch}" in name_to_model: + return name_to_model[f"Flax{arch}"] + + # If not architecture is set in the config or match the supported models, the first element of the tuple is the + # defaults. + return supported_models[0] + + +class _BaseAutoModelClass: + # Base class for auto models. + _model_mapping = None + + def __init__(self, *args, **kwargs): + raise EnvironmentError( + f"{self.__class__.__name__} is designed to be instantiated " + f"using the `{self.__class__.__name__}.from_pretrained(pretrained_model_name_or_path)` or " + f"`{self.__class__.__name__}.from_config(config)` methods." + ) + + @classmethod + def from_config(cls, config, **kwargs): + trust_remote_code = kwargs.pop("trust_remote_code", None) + has_remote_code = hasattr(config, "auto_map") and cls.__name__ in config.auto_map + has_local_code = type(config) in cls._model_mapping.keys() + trust_remote_code = resolve_trust_remote_code( + trust_remote_code, config._name_or_path, has_local_code, has_remote_code + ) + + if has_remote_code and trust_remote_code: + class_ref = config.auto_map[cls.__name__] + if "--" in class_ref: + repo_id, class_ref = class_ref.split("--") + else: + repo_id = config.name_or_path + model_class = get_class_from_dynamic_module(class_ref, repo_id, **kwargs) + cls.register(config.__class__, model_class, exist_ok=True) + _ = kwargs.pop("code_revision", None) + model_class = add_generation_mixin_to_remote_model(model_class) + return model_class._from_config(config, **kwargs) + elif type(config) in cls._model_mapping.keys(): + model_class = _get_model_class(config, cls._model_mapping) + return model_class._from_config(config, **kwargs) + + raise ValueError( + f"Unrecognized configuration class {config.__class__} for this kind of AutoModel: {cls.__name__}.\n" + f"Model type should be one of {', '.join(c.__name__ for c in cls._model_mapping.keys())}." + ) + + @classmethod + def _prepare_config_for_auto_class(cls, config: PretrainedConfig) -> PretrainedConfig: + """Additional autoclass-specific config post-loading manipulation. May be overridden in subclasses.""" + return config + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + config = kwargs.pop("config", None) + trust_remote_code = kwargs.pop("trust_remote_code", None) + kwargs["_from_auto"] = True + hub_kwargs_names = [ + "cache_dir", + "force_download", + "local_files_only", + "proxies", + "resume_download", + "revision", + "subfolder", + "use_auth_token", + "token", + ] + hub_kwargs = {name: kwargs.pop(name) for name in hub_kwargs_names if name in kwargs} + code_revision = kwargs.pop("code_revision", None) + commit_hash = kwargs.pop("_commit_hash", None) + adapter_kwargs = kwargs.pop("adapter_kwargs", None) + + token = hub_kwargs.pop("token", None) + use_auth_token = hub_kwargs.pop("use_auth_token", None) + if use_auth_token is not None: + warnings.warn( + "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.", + FutureWarning, + ) + if token is not None: + raise ValueError( + "`token` and `use_auth_token` are both specified. Please set only the argument `token`." + ) + token = use_auth_token + + if token is not None: + hub_kwargs["token"] = token + + if commit_hash is None: + if not isinstance(config, PretrainedConfig): + # We make a call to the config file first (which may be absent) to get the commit hash as soon as possible + resolved_config_file = cached_file( + pretrained_model_name_or_path, + CONFIG_NAME, + _raise_exceptions_for_gated_repo=False, + _raise_exceptions_for_missing_entries=False, + _raise_exceptions_for_connection_errors=False, + **hub_kwargs, + ) + commit_hash = extract_commit_hash(resolved_config_file, commit_hash) + else: + commit_hash = getattr(config, "_commit_hash", None) + + if is_peft_available(): + if adapter_kwargs is None: + adapter_kwargs = {} + if token is not None: + adapter_kwargs["token"] = token + + maybe_adapter_path = find_adapter_config_file( + pretrained_model_name_or_path, _commit_hash=commit_hash, **adapter_kwargs + ) + + if maybe_adapter_path is not None: + with open(maybe_adapter_path, "r", encoding="utf-8") as f: + adapter_config = json.load(f) + + adapter_kwargs["_adapter_model_path"] = pretrained_model_name_or_path + pretrained_model_name_or_path = adapter_config["base_model_name_or_path"] + + if not isinstance(config, PretrainedConfig): + kwargs_orig = copy.deepcopy(kwargs) + # ensure not to pollute the config object with torch_dtype="auto" - since it's + # meaningless in the context of the config object - torch.dtype values are acceptable + if kwargs.get("torch_dtype", None) == "auto": + _ = kwargs.pop("torch_dtype") + # to not overwrite the quantization_config if config has a quantization_config + if kwargs.get("quantization_config", None) is not None: + _ = kwargs.pop("quantization_config") + + config, kwargs = AutoConfig.from_pretrained( + pretrained_model_name_or_path, + return_unused_kwargs=True, + trust_remote_code=trust_remote_code, + code_revision=code_revision, + _commit_hash=commit_hash, + **hub_kwargs, + **kwargs, + ) + + # if torch_dtype=auto was passed here, ensure to pass it on + if kwargs_orig.get("torch_dtype", None) == "auto": + kwargs["torch_dtype"] = "auto" + if kwargs_orig.get("quantization_config", None) is not None: + kwargs["quantization_config"] = kwargs_orig["quantization_config"] + + has_remote_code = hasattr(config, "auto_map") and cls.__name__ in config.auto_map + has_local_code = type(config) in cls._model_mapping.keys() + trust_remote_code = resolve_trust_remote_code( + trust_remote_code, pretrained_model_name_or_path, has_local_code, has_remote_code + ) + + # Set the adapter kwargs + kwargs["adapter_kwargs"] = adapter_kwargs + + if has_remote_code and trust_remote_code: + class_ref = config.auto_map[cls.__name__] + model_class = get_class_from_dynamic_module( + class_ref, pretrained_model_name_or_path, code_revision=code_revision, **hub_kwargs, **kwargs + ) + _ = hub_kwargs.pop("code_revision", None) + cls.register(config.__class__, model_class, exist_ok=True) + model_class = add_generation_mixin_to_remote_model(model_class) + return model_class.from_pretrained( + pretrained_model_name_or_path, *model_args, config=config, **hub_kwargs, **kwargs + ) + elif type(config) in cls._model_mapping.keys(): + model_class = _get_model_class(config, cls._model_mapping) + if model_class.config_class == config.sub_configs.get("text_config", None): + config = config.get_text_config() + return model_class.from_pretrained( + pretrained_model_name_or_path, *model_args, config=config, **hub_kwargs, **kwargs + ) + raise ValueError( + f"Unrecognized configuration class {config.__class__} for this kind of AutoModel: {cls.__name__}.\n" + f"Model type should be one of {', '.join(c.__name__ for c in cls._model_mapping.keys())}." + ) + + @classmethod + def register(cls, config_class, model_class, exist_ok=False): + """ + Register a new model for this class. + + Args: + config_class ([`PretrainedConfig`]): + The configuration corresponding to the model to register. + model_class ([`PreTrainedModel`]): + The model to register. + """ + if hasattr(model_class, "config_class") and model_class.config_class.__name__ != config_class.__name__: + raise ValueError( + "The model class you are passing has a `config_class` attribute that is not consistent with the " + f"config class you passed (model has {model_class.config_class} and you passed {config_class}. Fix " + "one of those so they match!" + ) + cls._model_mapping.register(config_class, model_class, exist_ok=exist_ok) + + +class _BaseAutoBackboneClass(_BaseAutoModelClass): + # Base class for auto backbone models. + _model_mapping = None + + @classmethod + def _load_timm_backbone_from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + requires_backends(cls, ["vision", "timm"]) + from ...models.timm_backbone import TimmBackboneConfig + + config = kwargs.pop("config", TimmBackboneConfig()) + + if kwargs.get("out_features", None) is not None: + raise ValueError("Cannot specify `out_features` for timm backbones") + + if kwargs.get("output_loading_info", False): + raise ValueError("Cannot specify `output_loading_info=True` when loading from timm") + + num_channels = kwargs.pop("num_channels", config.num_channels) + features_only = kwargs.pop("features_only", config.features_only) + use_pretrained_backbone = kwargs.pop("use_pretrained_backbone", config.use_pretrained_backbone) + out_indices = kwargs.pop("out_indices", config.out_indices) + config = TimmBackboneConfig( + backbone=pretrained_model_name_or_path, + num_channels=num_channels, + features_only=features_only, + use_pretrained_backbone=use_pretrained_backbone, + out_indices=out_indices, + ) + return super().from_config(config, **kwargs) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + use_timm_backbone = kwargs.pop("use_timm_backbone", False) + if use_timm_backbone: + return cls._load_timm_backbone_from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + + return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + + +def insert_head_doc(docstring, head_doc=""): + if len(head_doc) > 0: + return docstring.replace( + "one of the model classes of the library ", + f"one of the model classes of the library (with a {head_doc} head) ", + ) + return docstring.replace( + "one of the model classes of the library ", "one of the base model classes of the library " + ) + + +def auto_class_update(cls, checkpoint_for_example="google-bert/bert-base-cased", head_doc=""): + # Create a new class with the right name from the base class + model_mapping = cls._model_mapping + name = cls.__name__ + class_docstring = insert_head_doc(CLASS_DOCSTRING, head_doc=head_doc) + cls.__doc__ = class_docstring.replace("BaseAutoModelClass", name) + + # Now we need to copy and re-register `from_config` and `from_pretrained` as class methods otherwise we can't + # have a specific docstrings for them. + from_config = copy_func(_BaseAutoModelClass.from_config) + from_config_docstring = insert_head_doc(FROM_CONFIG_DOCSTRING, head_doc=head_doc) + from_config_docstring = from_config_docstring.replace("BaseAutoModelClass", name) + from_config_docstring = from_config_docstring.replace("checkpoint_placeholder", checkpoint_for_example) + from_config.__doc__ = from_config_docstring + from_config = replace_list_option_in_docstrings(model_mapping._model_mapping, use_model_types=False)(from_config) + cls.from_config = classmethod(from_config) + + if name.startswith("TF"): + from_pretrained_docstring = FROM_PRETRAINED_TF_DOCSTRING + elif name.startswith("Flax"): + from_pretrained_docstring = FROM_PRETRAINED_FLAX_DOCSTRING + else: + from_pretrained_docstring = FROM_PRETRAINED_TORCH_DOCSTRING + from_pretrained = copy_func(_BaseAutoModelClass.from_pretrained) + from_pretrained_docstring = insert_head_doc(from_pretrained_docstring, head_doc=head_doc) + from_pretrained_docstring = from_pretrained_docstring.replace("BaseAutoModelClass", name) + from_pretrained_docstring = from_pretrained_docstring.replace("checkpoint_placeholder", checkpoint_for_example) + shortcut = checkpoint_for_example.split("/")[-1].split("-")[0] + from_pretrained_docstring = from_pretrained_docstring.replace("shortcut_placeholder", shortcut) + from_pretrained.__doc__ = from_pretrained_docstring + from_pretrained = replace_list_option_in_docstrings(model_mapping._model_mapping)(from_pretrained) + cls.from_pretrained = classmethod(from_pretrained) + return cls + + +def get_values(model_mapping): + result = [] + for model in model_mapping.values(): + if isinstance(model, (list, tuple)): + result += list(model) + else: + result.append(model) + + return result + + +def getattribute_from_module(module, attr): + if attr is None: + return None + if isinstance(attr, tuple): + return tuple(getattribute_from_module(module, a) for a in attr) + if hasattr(module, attr): + return getattr(module, attr) + # Some of the mappings have entries model_type -> object of another model type. In that case we try to grab the + # object at the top level. + transformers_module = importlib.import_module("transformers") + + if module != transformers_module: + try: + return getattribute_from_module(transformers_module, attr) + except ValueError: + raise ValueError(f"Could not find {attr} neither in {module} nor in {transformers_module}!") + else: + raise ValueError(f"Could not find {attr} in {transformers_module}!") + + +def add_generation_mixin_to_remote_model(model_class): + """ + Adds `GenerationMixin` to the inheritance of `model_class`, if `model_class` is a PyTorch model. + + This function is used for backwards compatibility purposes: in v4.45, we've started a deprecation cycle to make + `PreTrainedModel` stop inheriting from `GenerationMixin`. Without this function, older models dynamically loaded + from the Hub may not have the `generate` method after we remove the inheritance. + """ + # 1. If it is not a PT model (i.e. doesn't inherit Module), do nothing + if "torch.nn.modules.module.Module" not in str(model_class.__mro__): + return model_class + + # 2. If it already **directly** inherits from GenerationMixin, do nothing + if "GenerationMixin" in str(model_class.__bases__): + return model_class + + # 3. Prior to v4.45, we could detect whether a model was `generate`-compatible if it had its own `generate` and/or + # `prepare_inputs_for_generation` method. + has_custom_generate = hasattr(model_class, "generate") and "GenerationMixin" not in str( + getattr(model_class, "generate") + ) + has_custom_prepare_inputs = hasattr(model_class, "prepare_inputs_for_generation") and "GenerationMixin" not in str( + getattr(model_class, "prepare_inputs_for_generation") + ) + if has_custom_generate or has_custom_prepare_inputs: + model_class_with_generation_mixin = type( + model_class.__name__, (model_class, GenerationMixin), {**model_class.__dict__} + ) + return model_class_with_generation_mixin + return model_class + + +class _LazyAutoMapping(OrderedDict): + """ + " A mapping config to object (model or tokenizer for instance) that will load keys and values when it is accessed. + + Args: + - config_mapping: The map model type to config class + - model_mapping: The map model type to model (or tokenizer) class + """ + + def __init__(self, config_mapping, model_mapping): + self._config_mapping = config_mapping + self._reverse_config_mapping = {v: k for k, v in config_mapping.items()} + self._model_mapping = model_mapping + self._model_mapping._model_mapping = self + self._extra_content = {} + self._modules = {} + + def __len__(self): + common_keys = set(self._config_mapping.keys()).intersection(self._model_mapping.keys()) + return len(common_keys) + len(self._extra_content) + + def __getitem__(self, key): + if key in self._extra_content: + return self._extra_content[key] + model_type = self._reverse_config_mapping[key.__name__] + if model_type in self._model_mapping: + model_name = self._model_mapping[model_type] + return self._load_attr_from_module(model_type, model_name) + + # Maybe there was several model types associated with this config. + model_types = [k for k, v in self._config_mapping.items() if v == key.__name__] + for mtype in model_types: + if mtype in self._model_mapping: + model_name = self._model_mapping[mtype] + return self._load_attr_from_module(mtype, model_name) + raise KeyError(key) + + def _load_attr_from_module(self, model_type, attr): + module_name = model_type_to_module_name(model_type) + if module_name not in self._modules: + self._modules[module_name] = importlib.import_module(f".{module_name}", "transformers.models") + return getattribute_from_module(self._modules[module_name], attr) + + def keys(self): + mapping_keys = [ + self._load_attr_from_module(key, name) + for key, name in self._config_mapping.items() + if key in self._model_mapping.keys() + ] + return mapping_keys + list(self._extra_content.keys()) + + def get(self, key, default): + try: + return self.__getitem__(key) + except KeyError: + return default + + def __bool__(self): + return bool(self.keys()) + + def values(self): + mapping_values = [ + self._load_attr_from_module(key, name) + for key, name in self._model_mapping.items() + if key in self._config_mapping.keys() + ] + return mapping_values + list(self._extra_content.values()) + + def items(self): + mapping_items = [ + ( + self._load_attr_from_module(key, self._config_mapping[key]), + self._load_attr_from_module(key, self._model_mapping[key]), + ) + for key in self._model_mapping.keys() + if key in self._config_mapping.keys() + ] + return mapping_items + list(self._extra_content.items()) + + def __iter__(self): + return iter(self.keys()) + + def __contains__(self, item): + if item in self._extra_content: + return True + if not hasattr(item, "__name__") or item.__name__ not in self._reverse_config_mapping: + return False + model_type = self._reverse_config_mapping[item.__name__] + return model_type in self._model_mapping + + def register(self, key, value, exist_ok=False): + """ + Register a new model in this mapping. + """ + if hasattr(key, "__name__") and key.__name__ in self._reverse_config_mapping: + model_type = self._reverse_config_mapping[key.__name__] + if model_type in self._model_mapping.keys() and not exist_ok: + raise ValueError(f"'{key}' is already used by a Transformers model.") + + self._extra_content[key] = value + + +__all__ = ["get_values"] diff --git a/docs/transformers/build/lib/transformers/models/auto/configuration_auto.py b/docs/transformers/build/lib/transformers/models/auto/configuration_auto.py new file mode 100644 index 0000000000000000000000000000000000000000..7d13bc788d466caa36c0f46338f4159783715618 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/auto/configuration_auto.py @@ -0,0 +1,1199 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Auto Config class.""" + +import importlib +import os +import re +import warnings +from collections import OrderedDict +from typing import List, Union + +from ...configuration_utils import PretrainedConfig +from ...dynamic_module_utils import get_class_from_dynamic_module, resolve_trust_remote_code +from ...utils import CONFIG_NAME, logging + + +logger = logging.get_logger(__name__) + + +CONFIG_MAPPING_NAMES = OrderedDict( + [ + # Add configs here + ("albert", "AlbertConfig"), + ("align", "AlignConfig"), + ("altclip", "AltCLIPConfig"), + ("aria", "AriaConfig"), + ("aria_text", "AriaTextConfig"), + ("audio-spectrogram-transformer", "ASTConfig"), + ("autoformer", "AutoformerConfig"), + ("aya_vision", "AyaVisionConfig"), + ("bamba", "BambaConfig"), + ("bark", "BarkConfig"), + ("bart", "BartConfig"), + ("beit", "BeitConfig"), + ("bert", "BertConfig"), + ("bert-generation", "BertGenerationConfig"), + ("big_bird", "BigBirdConfig"), + ("bigbird_pegasus", "BigBirdPegasusConfig"), + ("biogpt", "BioGptConfig"), + ("bit", "BitConfig"), + ("blenderbot", "BlenderbotConfig"), + ("blenderbot-small", "BlenderbotSmallConfig"), + ("blip", "BlipConfig"), + ("blip-2", "Blip2Config"), + ("blip_2_qformer", "Blip2QFormerConfig"), + ("bloom", "BloomConfig"), + ("bridgetower", "BridgeTowerConfig"), + ("bros", "BrosConfig"), + ("camembert", "CamembertConfig"), + ("canine", "CanineConfig"), + ("chameleon", "ChameleonConfig"), + ("chinese_clip", "ChineseCLIPConfig"), + ("chinese_clip_vision_model", "ChineseCLIPVisionConfig"), + ("clap", "ClapConfig"), + ("clip", "CLIPConfig"), + ("clip_text_model", "CLIPTextConfig"), + ("clip_vision_model", "CLIPVisionConfig"), + ("clipseg", "CLIPSegConfig"), + ("clvp", "ClvpConfig"), + ("code_llama", "LlamaConfig"), + ("codegen", "CodeGenConfig"), + ("cohere", "CohereConfig"), + ("cohere2", "Cohere2Config"), + ("colpali", "ColPaliConfig"), + ("conditional_detr", "ConditionalDetrConfig"), + ("convbert", "ConvBertConfig"), + ("convnext", "ConvNextConfig"), + ("convnextv2", "ConvNextV2Config"), + ("cpmant", "CpmAntConfig"), + ("ctrl", "CTRLConfig"), + ("cvt", "CvtConfig"), + ("dab-detr", "DabDetrConfig"), + ("dac", "DacConfig"), + ("data2vec-audio", "Data2VecAudioConfig"), + ("data2vec-text", "Data2VecTextConfig"), + ("data2vec-vision", "Data2VecVisionConfig"), + ("dbrx", "DbrxConfig"), + ("deberta", "DebertaConfig"), + ("deberta-v2", "DebertaV2Config"), + ("decision_transformer", "DecisionTransformerConfig"), + ("deepseek_v3", "DeepseekV3Config"), + ("deformable_detr", "DeformableDetrConfig"), + ("deit", "DeiTConfig"), + ("depth_anything", "DepthAnythingConfig"), + ("depth_pro", "DepthProConfig"), + ("deta", "DetaConfig"), + ("detr", "DetrConfig"), + ("diffllama", "DiffLlamaConfig"), + ("dinat", "DinatConfig"), + ("dinov2", "Dinov2Config"), + ("dinov2_with_registers", "Dinov2WithRegistersConfig"), + ("distilbert", "DistilBertConfig"), + ("donut-swin", "DonutSwinConfig"), + ("dpr", "DPRConfig"), + ("dpt", "DPTConfig"), + ("efficientformer", "EfficientFormerConfig"), + ("efficientnet", "EfficientNetConfig"), + ("electra", "ElectraConfig"), + ("emu3", "Emu3Config"), + ("encodec", "EncodecConfig"), + ("encoder-decoder", "EncoderDecoderConfig"), + ("ernie", "ErnieConfig"), + ("ernie_m", "ErnieMConfig"), + ("esm", "EsmConfig"), + ("falcon", "FalconConfig"), + ("falcon_mamba", "FalconMambaConfig"), + ("fastspeech2_conformer", "FastSpeech2ConformerConfig"), + ("flaubert", "FlaubertConfig"), + ("flava", "FlavaConfig"), + ("fnet", "FNetConfig"), + ("focalnet", "FocalNetConfig"), + ("fsmt", "FSMTConfig"), + ("funnel", "FunnelConfig"), + ("fuyu", "FuyuConfig"), + ("gemma", "GemmaConfig"), + ("gemma2", "Gemma2Config"), + ("gemma3", "Gemma3Config"), + ("gemma3_text", "Gemma3TextConfig"), + ("git", "GitConfig"), + ("glm", "GlmConfig"), + ("glm4", "Glm4Config"), + ("glpn", "GLPNConfig"), + ("got_ocr2", "GotOcr2Config"), + ("gpt-sw3", "GPT2Config"), + ("gpt2", "GPT2Config"), + ("gpt_bigcode", "GPTBigCodeConfig"), + ("gpt_neo", "GPTNeoConfig"), + ("gpt_neox", "GPTNeoXConfig"), + ("gpt_neox_japanese", "GPTNeoXJapaneseConfig"), + ("gptj", "GPTJConfig"), + ("gptsan-japanese", "GPTSanJapaneseConfig"), + ("granite", "GraniteConfig"), + ("granite_speech", "GraniteSpeechConfig"), + ("granitemoe", "GraniteMoeConfig"), + ("granitemoeshared", "GraniteMoeSharedConfig"), + ("granitevision", "LlavaNextConfig"), + ("graphormer", "GraphormerConfig"), + ("grounding-dino", "GroundingDinoConfig"), + ("groupvit", "GroupViTConfig"), + ("helium", "HeliumConfig"), + ("hiera", "HieraConfig"), + ("hubert", "HubertConfig"), + ("ibert", "IBertConfig"), + ("idefics", "IdeficsConfig"), + ("idefics2", "Idefics2Config"), + ("idefics3", "Idefics3Config"), + ("idefics3_vision", "Idefics3VisionConfig"), + ("ijepa", "IJepaConfig"), + ("imagegpt", "ImageGPTConfig"), + ("informer", "InformerConfig"), + ("instructblip", "InstructBlipConfig"), + ("instructblipvideo", "InstructBlipVideoConfig"), + ("internvl", "InternVLConfig"), + ("internvl_vision", "InternVLVisionConfig"), + ("jamba", "JambaConfig"), + ("janus", "JanusConfig"), + ("jetmoe", "JetMoeConfig"), + ("jukebox", "JukeboxConfig"), + ("kosmos-2", "Kosmos2Config"), + ("layoutlm", "LayoutLMConfig"), + ("layoutlmv2", "LayoutLMv2Config"), + ("layoutlmv3", "LayoutLMv3Config"), + ("led", "LEDConfig"), + ("levit", "LevitConfig"), + ("lilt", "LiltConfig"), + ("llama", "LlamaConfig"), + ("llama4", "Llama4Config"), + ("llama4_text", "Llama4TextConfig"), + ("llava", "LlavaConfig"), + ("llava_next", "LlavaNextConfig"), + ("llava_next_video", "LlavaNextVideoConfig"), + ("llava_onevision", "LlavaOnevisionConfig"), + ("longformer", "LongformerConfig"), + ("longt5", "LongT5Config"), + ("luke", "LukeConfig"), + ("lxmert", "LxmertConfig"), + ("m2m_100", "M2M100Config"), + ("mamba", "MambaConfig"), + ("mamba2", "Mamba2Config"), + ("marian", "MarianConfig"), + ("markuplm", "MarkupLMConfig"), + ("mask2former", "Mask2FormerConfig"), + ("maskformer", "MaskFormerConfig"), + ("maskformer-swin", "MaskFormerSwinConfig"), + ("mbart", "MBartConfig"), + ("mctct", "MCTCTConfig"), + ("mega", "MegaConfig"), + ("megatron-bert", "MegatronBertConfig"), + ("mgp-str", "MgpstrConfig"), + ("mimi", "MimiConfig"), + ("mistral", "MistralConfig"), + ("mistral3", "Mistral3Config"), + ("mixtral", "MixtralConfig"), + ("mlcd", "MLCDVisionConfig"), + ("mllama", "MllamaConfig"), + ("mobilebert", "MobileBertConfig"), + ("mobilenet_v1", "MobileNetV1Config"), + ("mobilenet_v2", "MobileNetV2Config"), + ("mobilevit", "MobileViTConfig"), + ("mobilevitv2", "MobileViTV2Config"), + ("modernbert", "ModernBertConfig"), + ("moonshine", "MoonshineConfig"), + ("moshi", "MoshiConfig"), + ("mpnet", "MPNetConfig"), + ("mpt", "MptConfig"), + ("mra", "MraConfig"), + ("mt5", "MT5Config"), + ("musicgen", "MusicgenConfig"), + ("musicgen_melody", "MusicgenMelodyConfig"), + ("mvp", "MvpConfig"), + ("nat", "NatConfig"), + ("nemotron", "NemotronConfig"), + ("nezha", "NezhaConfig"), + ("nllb-moe", "NllbMoeConfig"), + ("nougat", "VisionEncoderDecoderConfig"), + ("nystromformer", "NystromformerConfig"), + ("olmo", "OlmoConfig"), + ("olmo2", "Olmo2Config"), + ("olmoe", "OlmoeConfig"), + ("omdet-turbo", "OmDetTurboConfig"), + ("oneformer", "OneFormerConfig"), + ("open-llama", "OpenLlamaConfig"), + ("openai-gpt", "OpenAIGPTConfig"), + ("opt", "OPTConfig"), + ("owlv2", "Owlv2Config"), + ("owlvit", "OwlViTConfig"), + ("paligemma", "PaliGemmaConfig"), + ("patchtsmixer", "PatchTSMixerConfig"), + ("patchtst", "PatchTSTConfig"), + ("pegasus", "PegasusConfig"), + ("pegasus_x", "PegasusXConfig"), + ("perceiver", "PerceiverConfig"), + ("persimmon", "PersimmonConfig"), + ("phi", "PhiConfig"), + ("phi3", "Phi3Config"), + ("phi4_multimodal", "Phi4MultimodalConfig"), + ("phimoe", "PhimoeConfig"), + ("pix2struct", "Pix2StructConfig"), + ("pixtral", "PixtralVisionConfig"), + ("plbart", "PLBartConfig"), + ("poolformer", "PoolFormerConfig"), + ("pop2piano", "Pop2PianoConfig"), + ("prompt_depth_anything", "PromptDepthAnythingConfig"), + ("prophetnet", "ProphetNetConfig"), + ("pvt", "PvtConfig"), + ("pvt_v2", "PvtV2Config"), + ("qdqbert", "QDQBertConfig"), + ("qwen2", "Qwen2Config"), + ("qwen2_5_omni", "Qwen2_5OmniConfig"), + ("qwen2_5_vl", "Qwen2_5_VLConfig"), + ("qwen2_5_vl_text", "Qwen2_5_VLTextConfig"), + ("qwen2_audio", "Qwen2AudioConfig"), + ("qwen2_audio_encoder", "Qwen2AudioEncoderConfig"), + ("qwen2_moe", "Qwen2MoeConfig"), + ("qwen2_vl", "Qwen2VLConfig"), + ("qwen2_vl_text", "Qwen2VLTextConfig"), + ("qwen3", "Qwen3Config"), + ("qwen3_moe", "Qwen3MoeConfig"), + ("rag", "RagConfig"), + ("realm", "RealmConfig"), + ("recurrent_gemma", "RecurrentGemmaConfig"), + ("reformer", "ReformerConfig"), + ("regnet", "RegNetConfig"), + ("rembert", "RemBertConfig"), + ("resnet", "ResNetConfig"), + ("retribert", "RetriBertConfig"), + ("roberta", "RobertaConfig"), + ("roberta-prelayernorm", "RobertaPreLayerNormConfig"), + ("roc_bert", "RoCBertConfig"), + ("roformer", "RoFormerConfig"), + ("rt_detr", "RTDetrConfig"), + ("rt_detr_resnet", "RTDetrResNetConfig"), + ("rt_detr_v2", "RTDetrV2Config"), + ("rwkv", "RwkvConfig"), + ("sam", "SamConfig"), + ("sam_vision_model", "SamVisionConfig"), + ("seamless_m4t", "SeamlessM4TConfig"), + ("seamless_m4t_v2", "SeamlessM4Tv2Config"), + ("segformer", "SegformerConfig"), + ("seggpt", "SegGptConfig"), + ("sew", "SEWConfig"), + ("sew-d", "SEWDConfig"), + ("shieldgemma2", "ShieldGemma2Config"), + ("siglip", "SiglipConfig"), + ("siglip2", "Siglip2Config"), + ("siglip_vision_model", "SiglipVisionConfig"), + ("smolvlm", "SmolVLMConfig"), + ("smolvlm_vision", "SmolVLMVisionConfig"), + ("speech-encoder-decoder", "SpeechEncoderDecoderConfig"), + ("speech_to_text", "Speech2TextConfig"), + ("speech_to_text_2", "Speech2Text2Config"), + ("speecht5", "SpeechT5Config"), + ("splinter", "SplinterConfig"), + ("squeezebert", "SqueezeBertConfig"), + ("stablelm", "StableLmConfig"), + ("starcoder2", "Starcoder2Config"), + ("superglue", "SuperGlueConfig"), + ("superpoint", "SuperPointConfig"), + ("swiftformer", "SwiftFormerConfig"), + ("swin", "SwinConfig"), + ("swin2sr", "Swin2SRConfig"), + ("swinv2", "Swinv2Config"), + ("switch_transformers", "SwitchTransformersConfig"), + ("t5", "T5Config"), + ("table-transformer", "TableTransformerConfig"), + ("tapas", "TapasConfig"), + ("textnet", "TextNetConfig"), + ("time_series_transformer", "TimeSeriesTransformerConfig"), + ("timesfm", "TimesFmConfig"), + ("timesformer", "TimesformerConfig"), + ("timm_backbone", "TimmBackboneConfig"), + ("timm_wrapper", "TimmWrapperConfig"), + ("trajectory_transformer", "TrajectoryTransformerConfig"), + ("transfo-xl", "TransfoXLConfig"), + ("trocr", "TrOCRConfig"), + ("tvlt", "TvltConfig"), + ("tvp", "TvpConfig"), + ("udop", "UdopConfig"), + ("umt5", "UMT5Config"), + ("unispeech", "UniSpeechConfig"), + ("unispeech-sat", "UniSpeechSatConfig"), + ("univnet", "UnivNetConfig"), + ("upernet", "UperNetConfig"), + ("van", "VanConfig"), + ("video_llava", "VideoLlavaConfig"), + ("videomae", "VideoMAEConfig"), + ("vilt", "ViltConfig"), + ("vipllava", "VipLlavaConfig"), + ("vision-encoder-decoder", "VisionEncoderDecoderConfig"), + ("vision-text-dual-encoder", "VisionTextDualEncoderConfig"), + ("visual_bert", "VisualBertConfig"), + ("vit", "ViTConfig"), + ("vit_hybrid", "ViTHybridConfig"), + ("vit_mae", "ViTMAEConfig"), + ("vit_msn", "ViTMSNConfig"), + ("vitdet", "VitDetConfig"), + ("vitmatte", "VitMatteConfig"), + ("vitpose", "VitPoseConfig"), + ("vitpose_backbone", "VitPoseBackboneConfig"), + ("vits", "VitsConfig"), + ("vivit", "VivitConfig"), + ("wav2vec2", "Wav2Vec2Config"), + ("wav2vec2-bert", "Wav2Vec2BertConfig"), + ("wav2vec2-conformer", "Wav2Vec2ConformerConfig"), + ("wavlm", "WavLMConfig"), + ("whisper", "WhisperConfig"), + ("xclip", "XCLIPConfig"), + ("xglm", "XGLMConfig"), + ("xlm", "XLMConfig"), + ("xlm-prophetnet", "XLMProphetNetConfig"), + ("xlm-roberta", "XLMRobertaConfig"), + ("xlm-roberta-xl", "XLMRobertaXLConfig"), + ("xlnet", "XLNetConfig"), + ("xmod", "XmodConfig"), + ("yolos", "YolosConfig"), + ("yoso", "YosoConfig"), + ("zamba", "ZambaConfig"), + ("zamba2", "Zamba2Config"), + ("zoedepth", "ZoeDepthConfig"), + ] +) + + +MODEL_NAMES_MAPPING = OrderedDict( + [ + # Add full (and cased) model names here + ("albert", "ALBERT"), + ("align", "ALIGN"), + ("altclip", "AltCLIP"), + ("aria", "Aria"), + ("aria_text", "AriaText"), + ("audio-spectrogram-transformer", "Audio Spectrogram Transformer"), + ("autoformer", "Autoformer"), + ("aya_vision", "AyaVision"), + ("bamba", "Bamba"), + ("bark", "Bark"), + ("bart", "BART"), + ("barthez", "BARThez"), + ("bartpho", "BARTpho"), + ("beit", "BEiT"), + ("bert", "BERT"), + ("bert-generation", "Bert Generation"), + ("bert-japanese", "BertJapanese"), + ("bertweet", "BERTweet"), + ("big_bird", "BigBird"), + ("bigbird_pegasus", "BigBird-Pegasus"), + ("biogpt", "BioGpt"), + ("bit", "BiT"), + ("blenderbot", "Blenderbot"), + ("blenderbot-small", "BlenderbotSmall"), + ("blip", "BLIP"), + ("blip-2", "BLIP-2"), + ("blip_2_qformer", "BLIP-2 QFormer"), + ("bloom", "BLOOM"), + ("bort", "BORT"), + ("bridgetower", "BridgeTower"), + ("bros", "BROS"), + ("byt5", "ByT5"), + ("camembert", "CamemBERT"), + ("canine", "CANINE"), + ("chameleon", "Chameleon"), + ("chinese_clip", "Chinese-CLIP"), + ("chinese_clip_vision_model", "ChineseCLIPVisionModel"), + ("clap", "CLAP"), + ("clip", "CLIP"), + ("clip_text_model", "CLIPTextModel"), + ("clip_vision_model", "CLIPVisionModel"), + ("clipseg", "CLIPSeg"), + ("clvp", "CLVP"), + ("code_llama", "CodeLlama"), + ("codegen", "CodeGen"), + ("cohere", "Cohere"), + ("cohere2", "Cohere2"), + ("colpali", "ColPali"), + ("conditional_detr", "Conditional DETR"), + ("convbert", "ConvBERT"), + ("convnext", "ConvNeXT"), + ("convnextv2", "ConvNeXTV2"), + ("cpm", "CPM"), + ("cpmant", "CPM-Ant"), + ("ctrl", "CTRL"), + ("cvt", "CvT"), + ("dab-detr", "DAB-DETR"), + ("dac", "DAC"), + ("data2vec-audio", "Data2VecAudio"), + ("data2vec-text", "Data2VecText"), + ("data2vec-vision", "Data2VecVision"), + ("dbrx", "DBRX"), + ("deberta", "DeBERTa"), + ("deberta-v2", "DeBERTa-v2"), + ("decision_transformer", "Decision Transformer"), + ("deepseek_v3", "DeepSeek-V3"), + ("deformable_detr", "Deformable DETR"), + ("deit", "DeiT"), + ("deplot", "DePlot"), + ("depth_anything", "Depth Anything"), + ("depth_anything_v2", "Depth Anything V2"), + ("depth_pro", "DepthPro"), + ("deta", "DETA"), + ("detr", "DETR"), + ("dialogpt", "DialoGPT"), + ("diffllama", "DiffLlama"), + ("dinat", "DiNAT"), + ("dinov2", "DINOv2"), + ("dinov2_with_registers", "DINOv2 with Registers"), + ("distilbert", "DistilBERT"), + ("dit", "DiT"), + ("donut-swin", "DonutSwin"), + ("dpr", "DPR"), + ("dpt", "DPT"), + ("efficientformer", "EfficientFormer"), + ("efficientnet", "EfficientNet"), + ("electra", "ELECTRA"), + ("emu3", "Emu3"), + ("encodec", "EnCodec"), + ("encoder-decoder", "Encoder decoder"), + ("ernie", "ERNIE"), + ("ernie_m", "ErnieM"), + ("esm", "ESM"), + ("falcon", "Falcon"), + ("falcon3", "Falcon3"), + ("falcon_mamba", "FalconMamba"), + ("fastspeech2_conformer", "FastSpeech2Conformer"), + ("flan-t5", "FLAN-T5"), + ("flan-ul2", "FLAN-UL2"), + ("flaubert", "FlauBERT"), + ("flava", "FLAVA"), + ("fnet", "FNet"), + ("focalnet", "FocalNet"), + ("fsmt", "FairSeq Machine-Translation"), + ("funnel", "Funnel Transformer"), + ("fuyu", "Fuyu"), + ("gemma", "Gemma"), + ("gemma2", "Gemma2"), + ("gemma3", "Gemma3ForConditionalGeneration"), + ("gemma3_text", "Gemma3ForCausalLM"), + ("git", "GIT"), + ("glm", "GLM"), + ("glm4", "glm4"), + ("glpn", "GLPN"), + ("got_ocr2", "GOT-OCR2"), + ("gpt-sw3", "GPT-Sw3"), + ("gpt2", "OpenAI GPT-2"), + ("gpt_bigcode", "GPTBigCode"), + ("gpt_neo", "GPT Neo"), + ("gpt_neox", "GPT NeoX"), + ("gpt_neox_japanese", "GPT NeoX Japanese"), + ("gptj", "GPT-J"), + ("gptsan-japanese", "GPTSAN-japanese"), + ("granite", "Granite"), + ("granite_speech", "GraniteSpeech"), + ("granitemoe", "GraniteMoeMoe"), + ("granitemoeshared", "GraniteMoeSharedMoe"), + ("granitevision", "LLaVA-NeXT"), + ("graphormer", "Graphormer"), + ("grounding-dino", "Grounding DINO"), + ("groupvit", "GroupViT"), + ("helium", "Helium"), + ("herbert", "HerBERT"), + ("hiera", "Hiera"), + ("hubert", "Hubert"), + ("ibert", "I-BERT"), + ("idefics", "IDEFICS"), + ("idefics2", "Idefics2"), + ("idefics3", "Idefics3"), + ("idefics3_vision", "Idefics3VisionTransformer"), + ("ijepa", "I-JEPA"), + ("imagegpt", "ImageGPT"), + ("informer", "Informer"), + ("instructblip", "InstructBLIP"), + ("instructblipvideo", "InstructBlipVideo"), + ("internvl", "InternVL"), + ("internvl_vision", "InternVLVision"), + ("jamba", "Jamba"), + ("janus", "Janus"), + ("jetmoe", "JetMoe"), + ("jukebox", "Jukebox"), + ("kosmos-2", "KOSMOS-2"), + ("layoutlm", "LayoutLM"), + ("layoutlmv2", "LayoutLMv2"), + ("layoutlmv3", "LayoutLMv3"), + ("layoutxlm", "LayoutXLM"), + ("led", "LED"), + ("levit", "LeViT"), + ("lilt", "LiLT"), + ("llama", "LLaMA"), + ("llama2", "Llama2"), + ("llama3", "Llama3"), + ("llama4", "Llama4"), + ("llama4_text", "Llama4ForCausalLM"), + ("llava", "LLaVa"), + ("llava_next", "LLaVA-NeXT"), + ("llava_next_video", "LLaVa-NeXT-Video"), + ("llava_onevision", "LLaVA-Onevision"), + ("longformer", "Longformer"), + ("longt5", "LongT5"), + ("luke", "LUKE"), + ("lxmert", "LXMERT"), + ("m2m_100", "M2M100"), + ("madlad-400", "MADLAD-400"), + ("mamba", "Mamba"), + ("mamba2", "mamba2"), + ("marian", "Marian"), + ("markuplm", "MarkupLM"), + ("mask2former", "Mask2Former"), + ("maskformer", "MaskFormer"), + ("maskformer-swin", "MaskFormerSwin"), + ("matcha", "MatCha"), + ("mbart", "mBART"), + ("mbart50", "mBART-50"), + ("mctct", "M-CTC-T"), + ("mega", "MEGA"), + ("megatron-bert", "Megatron-BERT"), + ("megatron_gpt2", "Megatron-GPT2"), + ("mgp-str", "MGP-STR"), + ("mimi", "Mimi"), + ("mistral", "Mistral"), + ("mistral3", "Mistral3"), + ("mixtral", "Mixtral"), + ("mlcd", "MLCD"), + ("mllama", "Mllama"), + ("mluke", "mLUKE"), + ("mms", "MMS"), + ("mobilebert", "MobileBERT"), + ("mobilenet_v1", "MobileNetV1"), + ("mobilenet_v2", "MobileNetV2"), + ("mobilevit", "MobileViT"), + ("mobilevitv2", "MobileViTV2"), + ("modernbert", "ModernBERT"), + ("moonshine", "Moonshine"), + ("moshi", "Moshi"), + ("mpnet", "MPNet"), + ("mpt", "MPT"), + ("mra", "MRA"), + ("mt5", "MT5"), + ("musicgen", "MusicGen"), + ("musicgen_melody", "MusicGen Melody"), + ("mvp", "MVP"), + ("myt5", "myt5"), + ("nat", "NAT"), + ("nemotron", "Nemotron"), + ("nezha", "Nezha"), + ("nllb", "NLLB"), + ("nllb-moe", "NLLB-MOE"), + ("nougat", "Nougat"), + ("nystromformer", "Nyströmformer"), + ("olmo", "OLMo"), + ("olmo2", "OLMo2"), + ("olmoe", "OLMoE"), + ("omdet-turbo", "OmDet-Turbo"), + ("oneformer", "OneFormer"), + ("open-llama", "OpenLlama"), + ("openai-gpt", "OpenAI GPT"), + ("opt", "OPT"), + ("owlv2", "OWLv2"), + ("owlvit", "OWL-ViT"), + ("paligemma", "PaliGemma"), + ("patchtsmixer", "PatchTSMixer"), + ("patchtst", "PatchTST"), + ("pegasus", "Pegasus"), + ("pegasus_x", "PEGASUS-X"), + ("perceiver", "Perceiver"), + ("persimmon", "Persimmon"), + ("phi", "Phi"), + ("phi3", "Phi3"), + ("phi4_multimodal", "Phi4Multimodal"), + ("phimoe", "Phimoe"), + ("phobert", "PhoBERT"), + ("pix2struct", "Pix2Struct"), + ("pixtral", "Pixtral"), + ("plbart", "PLBart"), + ("poolformer", "PoolFormer"), + ("pop2piano", "Pop2Piano"), + ("prompt_depth_anything", "PromptDepthAnything"), + ("prophetnet", "ProphetNet"), + ("pvt", "PVT"), + ("pvt_v2", "PVTv2"), + ("qdqbert", "QDQBert"), + ("qwen2", "Qwen2"), + ("qwen2_5_omni", "Qwen2_5Omni"), + ("qwen2_5_vl", "Qwen2_5_VL"), + ("qwen2_5_vl_text", "Qwen2_5_VL"), + ("qwen2_audio", "Qwen2Audio"), + ("qwen2_audio_encoder", "Qwen2AudioEncoder"), + ("qwen2_moe", "Qwen2MoE"), + ("qwen2_vl", "Qwen2VL"), + ("qwen2_vl_text", "Qwen2VL"), + ("qwen3", "Qwen3"), + ("qwen3_moe", "Qwen3MoE"), + ("rag", "RAG"), + ("realm", "REALM"), + ("recurrent_gemma", "RecurrentGemma"), + ("reformer", "Reformer"), + ("regnet", "RegNet"), + ("rembert", "RemBERT"), + ("resnet", "ResNet"), + ("retribert", "RetriBERT"), + ("roberta", "RoBERTa"), + ("roberta-prelayernorm", "RoBERTa-PreLayerNorm"), + ("roc_bert", "RoCBert"), + ("roformer", "RoFormer"), + ("rt_detr", "RT-DETR"), + ("rt_detr_resnet", "RT-DETR-ResNet"), + ("rt_detr_v2", "RT-DETRv2"), + ("rwkv", "RWKV"), + ("sam", "SAM"), + ("sam_vision_model", "SamVisionModel"), + ("seamless_m4t", "SeamlessM4T"), + ("seamless_m4t_v2", "SeamlessM4Tv2"), + ("segformer", "SegFormer"), + ("seggpt", "SegGPT"), + ("sew", "SEW"), + ("sew-d", "SEW-D"), + ("shieldgemma2", "Shieldgemma2"), + ("siglip", "SigLIP"), + ("siglip2", "SigLIP2"), + ("siglip2_vision_model", "Siglip2VisionModel"), + ("siglip_vision_model", "SiglipVisionModel"), + ("smolvlm", "SmolVLM"), + ("smolvlm_vision", "SmolVLMVisionTransformer"), + ("speech-encoder-decoder", "Speech Encoder decoder"), + ("speech_to_text", "Speech2Text"), + ("speech_to_text_2", "Speech2Text2"), + ("speecht5", "SpeechT5"), + ("splinter", "Splinter"), + ("squeezebert", "SqueezeBERT"), + ("stablelm", "StableLm"), + ("starcoder2", "Starcoder2"), + ("superglue", "SuperGlue"), + ("superpoint", "SuperPoint"), + ("swiftformer", "SwiftFormer"), + ("swin", "Swin Transformer"), + ("swin2sr", "Swin2SR"), + ("swinv2", "Swin Transformer V2"), + ("switch_transformers", "SwitchTransformers"), + ("t5", "T5"), + ("t5v1.1", "T5v1.1"), + ("table-transformer", "Table Transformer"), + ("tapas", "TAPAS"), + ("tapex", "TAPEX"), + ("textnet", "TextNet"), + ("time_series_transformer", "Time Series Transformer"), + ("timesfm", "TimesFm"), + ("timesformer", "TimeSformer"), + ("timm_backbone", "TimmBackbone"), + ("timm_wrapper", "TimmWrapperModel"), + ("trajectory_transformer", "Trajectory Transformer"), + ("transfo-xl", "Transformer-XL"), + ("trocr", "TrOCR"), + ("tvlt", "TVLT"), + ("tvp", "TVP"), + ("udop", "UDOP"), + ("ul2", "UL2"), + ("umt5", "UMT5"), + ("unispeech", "UniSpeech"), + ("unispeech-sat", "UniSpeechSat"), + ("univnet", "UnivNet"), + ("upernet", "UPerNet"), + ("van", "VAN"), + ("video_llava", "VideoLlava"), + ("videomae", "VideoMAE"), + ("vilt", "ViLT"), + ("vipllava", "VipLlava"), + ("vision-encoder-decoder", "Vision Encoder decoder"), + ("vision-text-dual-encoder", "VisionTextDualEncoder"), + ("visual_bert", "VisualBERT"), + ("vit", "ViT"), + ("vit_hybrid", "ViT Hybrid"), + ("vit_mae", "ViTMAE"), + ("vit_msn", "ViTMSN"), + ("vitdet", "VitDet"), + ("vitmatte", "ViTMatte"), + ("vitpose", "ViTPose"), + ("vitpose_backbone", "ViTPoseBackbone"), + ("vits", "VITS"), + ("vivit", "ViViT"), + ("wav2vec2", "Wav2Vec2"), + ("wav2vec2-bert", "Wav2Vec2-BERT"), + ("wav2vec2-conformer", "Wav2Vec2-Conformer"), + ("wav2vec2_phoneme", "Wav2Vec2Phoneme"), + ("wavlm", "WavLM"), + ("whisper", "Whisper"), + ("xclip", "X-CLIP"), + ("xglm", "XGLM"), + ("xlm", "XLM"), + ("xlm-prophetnet", "XLM-ProphetNet"), + ("xlm-roberta", "XLM-RoBERTa"), + ("xlm-roberta-xl", "XLM-RoBERTa-XL"), + ("xlm-v", "XLM-V"), + ("xlnet", "XLNet"), + ("xls_r", "XLS-R"), + ("xlsr_wav2vec2", "XLSR-Wav2Vec2"), + ("xmod", "X-MOD"), + ("yolos", "YOLOS"), + ("yoso", "YOSO"), + ("zamba", "Zamba"), + ("zamba2", "Zamba2"), + ("zoedepth", "ZoeDepth"), + ] +) + +# This is tied to the processing `-` -> `_` in `model_type_to_module_name`. For example, instead of putting +# `transfo-xl` (as in `CONFIG_MAPPING_NAMES`), we should use `transfo_xl`. +DEPRECATED_MODELS = [ + "bort", + "deta", + "efficientformer", + "ernie_m", + "gptsan_japanese", + "graphormer", + "jukebox", + "mctct", + "mega", + "mmbt", + "nat", + "nezha", + "open_llama", + "qdqbert", + "realm", + "retribert", + "speech_to_text_2", + "tapex", + "trajectory_transformer", + "transfo_xl", + "tvlt", + "van", + "vit_hybrid", + "xlm_prophetnet", +] + +SPECIAL_MODEL_TYPE_TO_MODULE_NAME = OrderedDict( + [ + ("openai-gpt", "openai"), + ("data2vec-audio", "data2vec"), + ("data2vec-text", "data2vec"), + ("data2vec-vision", "data2vec"), + ("donut-swin", "donut"), + ("kosmos-2", "kosmos2"), + ("maskformer-swin", "maskformer"), + ("xclip", "x_clip"), + ("clip_vision_model", "clip"), + ("qwen2_audio_encoder", "qwen2_audio"), + ("clip_text_model", "clip"), + ("aria_text", "aria"), + ("gemma3_text", "gemma3"), + ("idefics3_vision", "idefics3"), + ("siglip_vision_model", "siglip"), + ("smolvlm_vision", "smolvlm"), + ("chinese_clip_vision_model", "chinese_clip"), + ("rt_detr_resnet", "rt_detr"), + ("granitevision", "llava_next"), + ("internvl_vision", "internvl"), + ("qwen2_5_vl_text", "qwen2_5_vl"), + ("qwen2_vl_text", "qwen2_vl"), + ("sam_vision_model", "sam"), + ("llama4_text", "llama4"), + ("blip_2_qformer", "blip_2"), + ] +) + + +def model_type_to_module_name(key): + """Converts a config key to the corresponding module.""" + # Special treatment + if key in SPECIAL_MODEL_TYPE_TO_MODULE_NAME: + key = SPECIAL_MODEL_TYPE_TO_MODULE_NAME[key] + + if key in DEPRECATED_MODELS: + key = f"deprecated.{key}" + return key + + key = key.replace("-", "_") + if key in DEPRECATED_MODELS: + key = f"deprecated.{key}" + + return key + + +def config_class_to_model_type(config): + """Converts a config class name to the corresponding model type""" + for key, cls in CONFIG_MAPPING_NAMES.items(): + if cls == config: + return key + # if key not found check in extra content + for key, cls in CONFIG_MAPPING._extra_content.items(): + if cls.__name__ == config: + return key + return None + + +class _LazyConfigMapping(OrderedDict): + """ + A dictionary that lazily load its values when they are requested. + """ + + def __init__(self, mapping): + self._mapping = mapping + self._extra_content = {} + self._modules = {} + + def __getitem__(self, key): + if key in self._extra_content: + return self._extra_content[key] + if key not in self._mapping: + raise KeyError(key) + value = self._mapping[key] + module_name = model_type_to_module_name(key) + if module_name not in self._modules: + self._modules[module_name] = importlib.import_module(f".{module_name}", "transformers.models") + if hasattr(self._modules[module_name], value): + return getattr(self._modules[module_name], value) + + # Some of the mappings have entries model_type -> config of another model type. In that case we try to grab the + # object at the top level. + transformers_module = importlib.import_module("transformers") + return getattr(transformers_module, value) + + def keys(self): + return list(self._mapping.keys()) + list(self._extra_content.keys()) + + def values(self): + return [self[k] for k in self._mapping.keys()] + list(self._extra_content.values()) + + def items(self): + return [(k, self[k]) for k in self._mapping.keys()] + list(self._extra_content.items()) + + def __iter__(self): + return iter(list(self._mapping.keys()) + list(self._extra_content.keys())) + + def __contains__(self, item): + return item in self._mapping or item in self._extra_content + + def register(self, key, value, exist_ok=False): + """ + Register a new configuration in this mapping. + """ + if key in self._mapping.keys() and not exist_ok: + raise ValueError(f"'{key}' is already used by a Transformers config, pick another name.") + self._extra_content[key] = value + + +CONFIG_MAPPING = _LazyConfigMapping(CONFIG_MAPPING_NAMES) + + +class _LazyLoadAllMappings(OrderedDict): + """ + A mapping that will load all pairs of key values at the first access (either by indexing, requestions keys, values, + etc.) + + Args: + mapping: The mapping to load. + """ + + def __init__(self, mapping): + self._mapping = mapping + self._initialized = False + self._data = {} + + def _initialize(self): + if self._initialized: + return + + for model_type, map_name in self._mapping.items(): + module_name = model_type_to_module_name(model_type) + module = importlib.import_module(f".{module_name}", "transformers.models") + mapping = getattr(module, map_name) + self._data.update(mapping) + + self._initialized = True + + def __getitem__(self, key): + self._initialize() + return self._data[key] + + def keys(self): + self._initialize() + return self._data.keys() + + def values(self): + self._initialize() + return self._data.values() + + def items(self): + self._initialize() + return self._data.keys() + + def __iter__(self): + self._initialize() + return iter(self._data) + + def __contains__(self, item): + self._initialize() + return item in self._data + + +def _get_class_name(model_class: Union[str, List[str]]): + if isinstance(model_class, (list, tuple)): + return " or ".join([f"[`{c}`]" for c in model_class if c is not None]) + return f"[`{model_class}`]" + + +def _list_model_options(indent, config_to_class=None, use_model_types=True): + if config_to_class is None and not use_model_types: + raise ValueError("Using `use_model_types=False` requires a `config_to_class` dictionary.") + if use_model_types: + if config_to_class is None: + model_type_to_name = {model_type: f"[`{config}`]" for model_type, config in CONFIG_MAPPING_NAMES.items()} + else: + model_type_to_name = { + model_type: _get_class_name(model_class) + for model_type, model_class in config_to_class.items() + if model_type in MODEL_NAMES_MAPPING + } + lines = [ + f"{indent}- **{model_type}** -- {model_type_to_name[model_type]} ({MODEL_NAMES_MAPPING[model_type]} model)" + for model_type in sorted(model_type_to_name.keys()) + ] + else: + config_to_name = { + CONFIG_MAPPING_NAMES[config]: _get_class_name(clas) + for config, clas in config_to_class.items() + if config in CONFIG_MAPPING_NAMES + } + config_to_model_name = { + config: MODEL_NAMES_MAPPING[model_type] for model_type, config in CONFIG_MAPPING_NAMES.items() + } + lines = [ + f"{indent}- [`{config_name}`] configuration class:" + f" {config_to_name[config_name]} ({config_to_model_name[config_name]} model)" + for config_name in sorted(config_to_name.keys()) + ] + return "\n".join(lines) + + +def replace_list_option_in_docstrings(config_to_class=None, use_model_types=True): + def docstring_decorator(fn): + docstrings = fn.__doc__ + if docstrings is None: + # Example: -OO + return fn + lines = docstrings.split("\n") + i = 0 + while i < len(lines) and re.search(r"^(\s*)List options\s*$", lines[i]) is None: + i += 1 + if i < len(lines): + indent = re.search(r"^(\s*)List options\s*$", lines[i]).groups()[0] + if use_model_types: + indent = f"{indent} " + lines[i] = _list_model_options(indent, config_to_class=config_to_class, use_model_types=use_model_types) + docstrings = "\n".join(lines) + else: + raise ValueError( + f"The function {fn} should have an empty 'List options' in its docstring as placeholder, current" + f" docstring is:\n{docstrings}" + ) + fn.__doc__ = docstrings + return fn + + return docstring_decorator + + +class AutoConfig: + r""" + This is a generic configuration class that will be instantiated as one of the configuration classes of the library + when created with the [`~AutoConfig.from_pretrained`] class method. + + This class cannot be instantiated directly using `__init__()` (throws an error). + """ + + def __init__(self): + raise EnvironmentError( + "AutoConfig is designed to be instantiated " + "using the `AutoConfig.from_pretrained(pretrained_model_name_or_path)` method." + ) + + @classmethod + def for_model(cls, model_type: str, *args, **kwargs): + if model_type in CONFIG_MAPPING: + config_class = CONFIG_MAPPING[model_type] + return config_class(*args, **kwargs) + raise ValueError( + f"Unrecognized model identifier: {model_type}. Should contain one of {', '.join(CONFIG_MAPPING.keys())}" + ) + + @classmethod + @replace_list_option_in_docstrings() + def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): + r""" + Instantiate one of the configuration classes of the library from a pretrained model configuration. + + The configuration class to instantiate is selected based on the `model_type` property of the config object that + is loaded, or when it's missing, by falling back to using pattern matching on `pretrained_model_name_or_path`: + + List options + + Args: + pretrained_model_name_or_path (`str` or `os.PathLike`): + Can be either: + + - A string, the *model id* of a pretrained model configuration hosted inside a model repo on + huggingface.co. + - A path to a *directory* containing a configuration file saved using the + [`~PretrainedConfig.save_pretrained`] method, or the [`~PreTrainedModel.save_pretrained`] method, + e.g., `./my_model_directory/`. + - A path or url to a saved configuration JSON *file*, e.g., + `./my_model_directory/configuration.json`. + cache_dir (`str` or `os.PathLike`, *optional*): + Path to a directory in which a downloaded pretrained model configuration should be cached if the + standard cache should not be used. + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force the (re-)download the model weights and configuration files and override the + cached versions if they exist. + resume_download: + Deprecated and ignored. All downloads are now resumed by default when possible. + Will be removed in v5 of Transformers. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. + revision (`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a + git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any + identifier allowed by git. + return_unused_kwargs (`bool`, *optional*, defaults to `False`): + If `False`, then this function returns just the final configuration object. + + If `True`, then this functions returns a `Tuple(config, unused_kwargs)` where *unused_kwargs* is a + dictionary consisting of the key/value pairs whose keys are not configuration attributes: i.e., the + part of `kwargs` which has not been used to update `config` and is otherwise ignored. + trust_remote_code (`bool`, *optional*, defaults to `False`): + Whether or not to allow for custom models defined on the Hub in their own modeling files. This option + should only be set to `True` for repositories you trust and in which you have read the code, as it will + execute code present on the Hub on your local machine. + kwargs(additional keyword arguments, *optional*): + The values in kwargs of any keys which are configuration attributes will be used to override the loaded + values. Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled + by the `return_unused_kwargs` keyword parameter. + + Examples: + + ```python + >>> from transformers import AutoConfig + + >>> # Download configuration from huggingface.co and cache. + >>> config = AutoConfig.from_pretrained("google-bert/bert-base-uncased") + + >>> # Download configuration from huggingface.co (user-uploaded) and cache. + >>> config = AutoConfig.from_pretrained("dbmdz/bert-base-german-cased") + + >>> # If configuration file is in a directory (e.g., was saved using *save_pretrained('./test/saved_model/')*). + >>> config = AutoConfig.from_pretrained("./test/bert_saved_model/") + + >>> # Load a specific configuration file. + >>> config = AutoConfig.from_pretrained("./test/bert_saved_model/my_configuration.json") + + >>> # Change some config attributes when loading a pretrained config. + >>> config = AutoConfig.from_pretrained("google-bert/bert-base-uncased", output_attentions=True, foo=False) + >>> config.output_attentions + True + + >>> config, unused_kwargs = AutoConfig.from_pretrained( + ... "google-bert/bert-base-uncased", output_attentions=True, foo=False, return_unused_kwargs=True + ... ) + >>> config.output_attentions + True + + >>> unused_kwargs + {'foo': False} + ```""" + use_auth_token = kwargs.pop("use_auth_token", None) + if use_auth_token is not None: + warnings.warn( + "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.", + FutureWarning, + ) + if kwargs.get("token", None) is not None: + raise ValueError( + "`token` and `use_auth_token` are both specified. Please set only the argument `token`." + ) + kwargs["token"] = use_auth_token + + kwargs["_from_auto"] = True + kwargs["name_or_path"] = pretrained_model_name_or_path + trust_remote_code = kwargs.pop("trust_remote_code", None) + code_revision = kwargs.pop("code_revision", None) + + config_dict, unused_kwargs = PretrainedConfig.get_config_dict(pretrained_model_name_or_path, **kwargs) + has_remote_code = "auto_map" in config_dict and "AutoConfig" in config_dict["auto_map"] + has_local_code = "model_type" in config_dict and config_dict["model_type"] in CONFIG_MAPPING + trust_remote_code = resolve_trust_remote_code( + trust_remote_code, pretrained_model_name_or_path, has_local_code, has_remote_code + ) + + if has_remote_code and trust_remote_code: + class_ref = config_dict["auto_map"]["AutoConfig"] + config_class = get_class_from_dynamic_module( + class_ref, pretrained_model_name_or_path, code_revision=code_revision, **kwargs + ) + if os.path.isdir(pretrained_model_name_or_path): + config_class.register_for_auto_class() + return config_class.from_pretrained(pretrained_model_name_or_path, **kwargs) + elif "model_type" in config_dict: + try: + config_class = CONFIG_MAPPING[config_dict["model_type"]] + except KeyError: + raise ValueError( + f"The checkpoint you are trying to load has model type `{config_dict['model_type']}` " + "but Transformers does not recognize this architecture. This could be because of an " + "issue with the checkpoint, or because your version of Transformers is out of date.\n\n" + "You can update Transformers with the command `pip install --upgrade transformers`. If this " + "does not work, and the checkpoint is very new, then there may not be a release version " + "that supports this model yet. In this case, you can get the most up-to-date code by installing " + "Transformers from source with the command " + "`pip install git+https://github.com/huggingface/transformers.git`" + ) + return config_class.from_dict(config_dict, **unused_kwargs) + else: + # Fallback: use pattern matching on the string. + # We go from longer names to shorter names to catch roberta before bert (for instance) + for pattern in sorted(CONFIG_MAPPING.keys(), key=len, reverse=True): + if pattern in str(pretrained_model_name_or_path): + return CONFIG_MAPPING[pattern].from_dict(config_dict, **unused_kwargs) + + raise ValueError( + f"Unrecognized model in {pretrained_model_name_or_path}. " + f"Should have a `model_type` key in its {CONFIG_NAME}, or contain one of the following strings " + f"in its name: {', '.join(CONFIG_MAPPING.keys())}" + ) + + @staticmethod + def register(model_type, config, exist_ok=False): + """ + Register a new configuration for this class. + + Args: + model_type (`str`): The model type like "bert" or "gpt". + config ([`PretrainedConfig`]): The config to register. + """ + if issubclass(config, PretrainedConfig) and config.model_type != model_type: + raise ValueError( + "The config you are passing has a `model_type` attribute that is not consistent with the model type " + f"you passed (config has {config.model_type} and you passed {model_type}. Fix one of those so they " + "match!" + ) + CONFIG_MAPPING.register(model_type, config, exist_ok=exist_ok) + + +__all__ = ["CONFIG_MAPPING", "MODEL_NAMES_MAPPING", "AutoConfig"] diff --git a/docs/transformers/build/lib/transformers/models/auto/feature_extraction_auto.py b/docs/transformers/build/lib/transformers/models/auto/feature_extraction_auto.py new file mode 100644 index 0000000000000000000000000000000000000000..86dc8703c426ab9d953bccfa622b45d5d346b9d7 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/auto/feature_extraction_auto.py @@ -0,0 +1,412 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""AutoFeatureExtractor class.""" + +import importlib +import json +import os +import warnings +from collections import OrderedDict +from typing import Dict, Optional, Union + +# Build the list of all feature extractors +from ...configuration_utils import PretrainedConfig +from ...dynamic_module_utils import get_class_from_dynamic_module, resolve_trust_remote_code +from ...feature_extraction_utils import FeatureExtractionMixin +from ...utils import CONFIG_NAME, FEATURE_EXTRACTOR_NAME, cached_file, logging +from .auto_factory import _LazyAutoMapping +from .configuration_auto import ( + CONFIG_MAPPING_NAMES, + AutoConfig, + model_type_to_module_name, + replace_list_option_in_docstrings, +) + + +logger = logging.get_logger(__name__) + +FEATURE_EXTRACTOR_MAPPING_NAMES = OrderedDict( + [ + ("audio-spectrogram-transformer", "ASTFeatureExtractor"), + ("beit", "BeitFeatureExtractor"), + ("chinese_clip", "ChineseCLIPFeatureExtractor"), + ("clap", "ClapFeatureExtractor"), + ("clip", "CLIPFeatureExtractor"), + ("clipseg", "ViTFeatureExtractor"), + ("clvp", "ClvpFeatureExtractor"), + ("conditional_detr", "ConditionalDetrFeatureExtractor"), + ("convnext", "ConvNextFeatureExtractor"), + ("cvt", "ConvNextFeatureExtractor"), + ("dac", "DacFeatureExtractor"), + ("data2vec-audio", "Wav2Vec2FeatureExtractor"), + ("data2vec-vision", "BeitFeatureExtractor"), + ("deformable_detr", "DeformableDetrFeatureExtractor"), + ("deit", "DeiTFeatureExtractor"), + ("detr", "DetrFeatureExtractor"), + ("dinat", "ViTFeatureExtractor"), + ("donut-swin", "DonutFeatureExtractor"), + ("dpt", "DPTFeatureExtractor"), + ("encodec", "EncodecFeatureExtractor"), + ("flava", "FlavaFeatureExtractor"), + ("glpn", "GLPNFeatureExtractor"), + ("granite_speech", "GraniteSpeechFeatureExtractor"), + ("groupvit", "CLIPFeatureExtractor"), + ("hubert", "Wav2Vec2FeatureExtractor"), + ("imagegpt", "ImageGPTFeatureExtractor"), + ("layoutlmv2", "LayoutLMv2FeatureExtractor"), + ("layoutlmv3", "LayoutLMv3FeatureExtractor"), + ("levit", "LevitFeatureExtractor"), + ("maskformer", "MaskFormerFeatureExtractor"), + ("mctct", "MCTCTFeatureExtractor"), + ("mimi", "EncodecFeatureExtractor"), + ("mobilenet_v1", "MobileNetV1FeatureExtractor"), + ("mobilenet_v2", "MobileNetV2FeatureExtractor"), + ("mobilevit", "MobileViTFeatureExtractor"), + ("moonshine", "Wav2Vec2FeatureExtractor"), + ("moshi", "EncodecFeatureExtractor"), + ("nat", "ViTFeatureExtractor"), + ("owlvit", "OwlViTFeatureExtractor"), + ("perceiver", "PerceiverFeatureExtractor"), + ("phi4_multimodal", "Phi4MultimodalFeatureExtractor"), + ("poolformer", "PoolFormerFeatureExtractor"), + ("pop2piano", "Pop2PianoFeatureExtractor"), + ("regnet", "ConvNextFeatureExtractor"), + ("resnet", "ConvNextFeatureExtractor"), + ("seamless_m4t", "SeamlessM4TFeatureExtractor"), + ("seamless_m4t_v2", "SeamlessM4TFeatureExtractor"), + ("segformer", "SegformerFeatureExtractor"), + ("sew", "Wav2Vec2FeatureExtractor"), + ("sew-d", "Wav2Vec2FeatureExtractor"), + ("speech_to_text", "Speech2TextFeatureExtractor"), + ("speecht5", "SpeechT5FeatureExtractor"), + ("swiftformer", "ViTFeatureExtractor"), + ("swin", "ViTFeatureExtractor"), + ("swinv2", "ViTFeatureExtractor"), + ("table-transformer", "DetrFeatureExtractor"), + ("timesformer", "VideoMAEFeatureExtractor"), + ("tvlt", "TvltFeatureExtractor"), + ("unispeech", "Wav2Vec2FeatureExtractor"), + ("unispeech-sat", "Wav2Vec2FeatureExtractor"), + ("univnet", "UnivNetFeatureExtractor"), + ("van", "ConvNextFeatureExtractor"), + ("videomae", "VideoMAEFeatureExtractor"), + ("vilt", "ViltFeatureExtractor"), + ("vit", "ViTFeatureExtractor"), + ("vit_mae", "ViTFeatureExtractor"), + ("vit_msn", "ViTFeatureExtractor"), + ("wav2vec2", "Wav2Vec2FeatureExtractor"), + ("wav2vec2-bert", "Wav2Vec2FeatureExtractor"), + ("wav2vec2-conformer", "Wav2Vec2FeatureExtractor"), + ("wavlm", "Wav2Vec2FeatureExtractor"), + ("whisper", "WhisperFeatureExtractor"), + ("xclip", "CLIPFeatureExtractor"), + ("yolos", "YolosFeatureExtractor"), + ] +) + +FEATURE_EXTRACTOR_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, FEATURE_EXTRACTOR_MAPPING_NAMES) + + +def feature_extractor_class_from_name(class_name: str): + for module_name, extractors in FEATURE_EXTRACTOR_MAPPING_NAMES.items(): + if class_name in extractors: + module_name = model_type_to_module_name(module_name) + + module = importlib.import_module(f".{module_name}", "transformers.models") + try: + return getattr(module, class_name) + except AttributeError: + continue + + for _, extractor in FEATURE_EXTRACTOR_MAPPING._extra_content.items(): + if getattr(extractor, "__name__", None) == class_name: + return extractor + + # We did not fine the class, but maybe it's because a dep is missing. In that case, the class will be in the main + # init and we return the proper dummy to get an appropriate error message. + main_module = importlib.import_module("transformers") + if hasattr(main_module, class_name): + return getattr(main_module, class_name) + + return None + + +def get_feature_extractor_config( + pretrained_model_name_or_path: Union[str, os.PathLike], + cache_dir: Optional[Union[str, os.PathLike]] = None, + force_download: bool = False, + resume_download: Optional[bool] = None, + proxies: Optional[Dict[str, str]] = None, + token: Optional[Union[bool, str]] = None, + revision: Optional[str] = None, + local_files_only: bool = False, + **kwargs, +): + """ + Loads the tokenizer configuration from a pretrained model tokenizer configuration. + + Args: + pretrained_model_name_or_path (`str` or `os.PathLike`): + This can be either: + + - a string, the *model id* of a pretrained model configuration hosted inside a model repo on + huggingface.co. + - a path to a *directory* containing a configuration file saved using the + [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`. + + cache_dir (`str` or `os.PathLike`, *optional*): + Path to a directory in which a downloaded pretrained model configuration should be cached if the standard + cache should not be used. + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force to (re-)download the configuration files and override the cached versions if they + exist. + resume_download: + Deprecated and ignored. All downloads are now resumed by default when possible. + Will be removed in v5 of Transformers. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. + token (`str` or *bool*, *optional*): + The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated + when running `huggingface-cli login` (stored in `~/.huggingface`). + revision (`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a + git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any + identifier allowed by git. + local_files_only (`bool`, *optional*, defaults to `False`): + If `True`, will only try to load the tokenizer configuration from local files. + + + + Passing `token=True` is required when you want to use a private model. + + + + Returns: + `Dict`: The configuration of the tokenizer. + + Examples: + + ```python + # Download configuration from huggingface.co and cache. + tokenizer_config = get_tokenizer_config("google-bert/bert-base-uncased") + # This model does not have a tokenizer config so the result will be an empty dict. + tokenizer_config = get_tokenizer_config("FacebookAI/xlm-roberta-base") + + # Save a pretrained tokenizer locally and you can reload its config + from transformers import AutoTokenizer + + tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased") + tokenizer.save_pretrained("tokenizer-test") + tokenizer_config = get_tokenizer_config("tokenizer-test") + ```""" + use_auth_token = kwargs.pop("use_auth_token", None) + if use_auth_token is not None: + warnings.warn( + "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.", + FutureWarning, + ) + if token is not None: + raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.") + token = use_auth_token + + resolved_config_file = cached_file( + pretrained_model_name_or_path, + FEATURE_EXTRACTOR_NAME, + cache_dir=cache_dir, + force_download=force_download, + resume_download=resume_download, + proxies=proxies, + token=token, + revision=revision, + local_files_only=local_files_only, + _raise_exceptions_for_gated_repo=False, + _raise_exceptions_for_missing_entries=False, + _raise_exceptions_for_connection_errors=False, + ) + if resolved_config_file is None: + logger.info( + "Could not locate the feature extractor configuration file, will try to use the model config instead." + ) + return {} + + with open(resolved_config_file, encoding="utf-8") as reader: + return json.load(reader) + + +class AutoFeatureExtractor: + r""" + This is a generic feature extractor class that will be instantiated as one of the feature extractor classes of the + library when created with the [`AutoFeatureExtractor.from_pretrained`] class method. + + This class cannot be instantiated directly using `__init__()` (throws an error). + """ + + def __init__(self): + raise EnvironmentError( + "AutoFeatureExtractor is designed to be instantiated " + "using the `AutoFeatureExtractor.from_pretrained(pretrained_model_name_or_path)` method." + ) + + @classmethod + @replace_list_option_in_docstrings(FEATURE_EXTRACTOR_MAPPING_NAMES) + def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): + r""" + Instantiate one of the feature extractor classes of the library from a pretrained model vocabulary. + + The feature extractor class to instantiate is selected based on the `model_type` property of the config object + (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible), or when it's + missing, by falling back to using pattern matching on `pretrained_model_name_or_path`: + + List options + + Params: + pretrained_model_name_or_path (`str` or `os.PathLike`): + This can be either: + + - a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on + huggingface.co. + - a path to a *directory* containing a feature extractor file saved using the + [`~feature_extraction_utils.FeatureExtractionMixin.save_pretrained`] method, e.g., + `./my_model_directory/`. + - a path or url to a saved feature extractor JSON *file*, e.g., + `./my_model_directory/preprocessor_config.json`. + cache_dir (`str` or `os.PathLike`, *optional*): + Path to a directory in which a downloaded pretrained model feature extractor should be cached if the + standard cache should not be used. + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force to (re-)download the feature extractor files and override the cached versions + if they exist. + resume_download: + Deprecated and ignored. All downloads are now resumed by default when possible. + Will be removed in v5 of Transformers. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. + token (`str` or *bool*, *optional*): + The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated + when running `huggingface-cli login` (stored in `~/.huggingface`). + revision (`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a + git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any + identifier allowed by git. + return_unused_kwargs (`bool`, *optional*, defaults to `False`): + If `False`, then this function returns just the final feature extractor object. If `True`, then this + functions returns a `Tuple(feature_extractor, unused_kwargs)` where *unused_kwargs* is a dictionary + consisting of the key/value pairs whose keys are not feature extractor attributes: i.e., the part of + `kwargs` which has not been used to update `feature_extractor` and is otherwise ignored. + trust_remote_code (`bool`, *optional*, defaults to `False`): + Whether or not to allow for custom models defined on the Hub in their own modeling files. This option + should only be set to `True` for repositories you trust and in which you have read the code, as it will + execute code present on the Hub on your local machine. + kwargs (`Dict[str, Any]`, *optional*): + The values in kwargs of any keys which are feature extractor attributes will be used to override the + loaded values. Behavior concerning key/value pairs whose keys are *not* feature extractor attributes is + controlled by the `return_unused_kwargs` keyword parameter. + + + + Passing `token=True` is required when you want to use a private model. + + + + Examples: + + ```python + >>> from transformers import AutoFeatureExtractor + + >>> # Download feature extractor from huggingface.co and cache. + >>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h") + + >>> # If feature extractor files are in a directory (e.g. feature extractor was saved using *save_pretrained('./test/saved_model/')*) + >>> # feature_extractor = AutoFeatureExtractor.from_pretrained("./test/saved_model/") + ```""" + use_auth_token = kwargs.pop("use_auth_token", None) + if use_auth_token is not None: + warnings.warn( + "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.", + FutureWarning, + ) + if kwargs.get("token", None) is not None: + raise ValueError( + "`token` and `use_auth_token` are both specified. Please set only the argument `token`." + ) + kwargs["token"] = use_auth_token + + config = kwargs.pop("config", None) + trust_remote_code = kwargs.pop("trust_remote_code", None) + kwargs["_from_auto"] = True + + config_dict, _ = FeatureExtractionMixin.get_feature_extractor_dict(pretrained_model_name_or_path, **kwargs) + feature_extractor_class = config_dict.get("feature_extractor_type", None) + feature_extractor_auto_map = None + if "AutoFeatureExtractor" in config_dict.get("auto_map", {}): + feature_extractor_auto_map = config_dict["auto_map"]["AutoFeatureExtractor"] + + # If we don't find the feature extractor class in the feature extractor config, let's try the model config. + if feature_extractor_class is None and feature_extractor_auto_map is None: + if not isinstance(config, PretrainedConfig): + config = AutoConfig.from_pretrained( + pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs + ) + # It could be in `config.feature_extractor_type`` + feature_extractor_class = getattr(config, "feature_extractor_type", None) + if hasattr(config, "auto_map") and "AutoFeatureExtractor" in config.auto_map: + feature_extractor_auto_map = config.auto_map["AutoFeatureExtractor"] + + if feature_extractor_class is not None: + feature_extractor_class = feature_extractor_class_from_name(feature_extractor_class) + + has_remote_code = feature_extractor_auto_map is not None + has_local_code = feature_extractor_class is not None or type(config) in FEATURE_EXTRACTOR_MAPPING + trust_remote_code = resolve_trust_remote_code( + trust_remote_code, pretrained_model_name_or_path, has_local_code, has_remote_code + ) + + if has_remote_code and trust_remote_code: + feature_extractor_class = get_class_from_dynamic_module( + feature_extractor_auto_map, pretrained_model_name_or_path, **kwargs + ) + _ = kwargs.pop("code_revision", None) + if os.path.isdir(pretrained_model_name_or_path): + feature_extractor_class.register_for_auto_class() + return feature_extractor_class.from_dict(config_dict, **kwargs) + elif feature_extractor_class is not None: + return feature_extractor_class.from_dict(config_dict, **kwargs) + # Last try: we use the FEATURE_EXTRACTOR_MAPPING. + elif type(config) in FEATURE_EXTRACTOR_MAPPING: + feature_extractor_class = FEATURE_EXTRACTOR_MAPPING[type(config)] + return feature_extractor_class.from_dict(config_dict, **kwargs) + + raise ValueError( + f"Unrecognized feature extractor in {pretrained_model_name_or_path}. Should have a " + f"`feature_extractor_type` key in its {FEATURE_EXTRACTOR_NAME} of {CONFIG_NAME}, or one of the following " + f"`model_type` keys in its {CONFIG_NAME}: {', '.join(c for c in FEATURE_EXTRACTOR_MAPPING_NAMES.keys())}" + ) + + @staticmethod + def register(config_class, feature_extractor_class, exist_ok=False): + """ + Register a new feature extractor for this class. + + Args: + config_class ([`PretrainedConfig`]): + The configuration corresponding to the model to register. + feature_extractor_class ([`FeatureExtractorMixin`]): The feature extractor to register. + """ + FEATURE_EXTRACTOR_MAPPING.register(config_class, feature_extractor_class, exist_ok=exist_ok) + + +__all__ = ["FEATURE_EXTRACTOR_MAPPING", "AutoFeatureExtractor"] diff --git a/docs/transformers/build/lib/transformers/models/auto/image_processing_auto.py b/docs/transformers/build/lib/transformers/models/auto/image_processing_auto.py new file mode 100644 index 0000000000000000000000000000000000000000..79b13f1f4a262e44793b4a7891e27f3f8cff420f --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/auto/image_processing_auto.py @@ -0,0 +1,649 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""AutoImageProcessor class.""" + +import importlib +import json +import os +import warnings +from collections import OrderedDict +from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union + +# Build the list of all image processors +from ...configuration_utils import PretrainedConfig +from ...dynamic_module_utils import get_class_from_dynamic_module, resolve_trust_remote_code +from ...image_processing_utils import ImageProcessingMixin +from ...image_processing_utils_fast import BaseImageProcessorFast +from ...utils import ( + CONFIG_NAME, + IMAGE_PROCESSOR_NAME, + cached_file, + is_timm_config_dict, + is_timm_local_checkpoint, + is_torchvision_available, + is_vision_available, + logging, +) +from ...utils.import_utils import requires +from .auto_factory import _LazyAutoMapping +from .configuration_auto import ( + CONFIG_MAPPING_NAMES, + AutoConfig, + model_type_to_module_name, + replace_list_option_in_docstrings, +) + + +logger = logging.get_logger(__name__) + + +if TYPE_CHECKING: + # This significantly improves completion suggestion performance when + # the transformers package is used with Microsoft's Pylance language server. + IMAGE_PROCESSOR_MAPPING_NAMES: OrderedDict[str, Tuple[Optional[str], Optional[str]]] = OrderedDict() +else: + IMAGE_PROCESSOR_MAPPING_NAMES = OrderedDict( + [ + ("align", ("EfficientNetImageProcessor", "EfficientNetImageProcessorFast")), + ("aria", ("AriaImageProcessor",)), + ("beit", ("BeitImageProcessor",)), + ("bit", ("BitImageProcessor", "BitImageProcessorFast")), + ("blip", ("BlipImageProcessor", "BlipImageProcessorFast")), + ("blip-2", ("BlipImageProcessor", "BlipImageProcessorFast")), + ("bridgetower", ("BridgeTowerImageProcessor", "BridgeTowerImageProcessorFast")), + ("chameleon", ("ChameleonImageProcessor",)), + ("chinese_clip", ("ChineseCLIPImageProcessor", "ChineseCLIPImageProcessorFast")), + ("clip", ("CLIPImageProcessor", "CLIPImageProcessorFast")), + ("clipseg", ("ViTImageProcessor", "ViTImageProcessorFast")), + ("conditional_detr", ("ConditionalDetrImageProcessor", "ConditionalDetrImageProcessorFast")), + ("convnext", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")), + ("convnextv2", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")), + ("cvt", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")), + ("data2vec-vision", ("BeitImageProcessor",)), + ("deformable_detr", ("DeformableDetrImageProcessor", "DeformableDetrImageProcessorFast")), + ("deit", ("DeiTImageProcessor", "DeiTImageProcessorFast")), + ("depth_anything", ("DPTImageProcessor",)), + ("depth_pro", ("DepthProImageProcessor", "DepthProImageProcessorFast")), + ("deta", ("DetaImageProcessor",)), + ("detr", ("DetrImageProcessor", "DetrImageProcessorFast")), + ("dinat", ("ViTImageProcessor", "ViTImageProcessorFast")), + ("dinov2", ("BitImageProcessor", "BitImageProcessorFast")), + ("donut-swin", ("DonutImageProcessor", "DonutImageProcessorFast")), + ("dpt", ("DPTImageProcessor",)), + ("efficientformer", ("EfficientFormerImageProcessor",)), + ("efficientnet", ("EfficientNetImageProcessor", "EfficientNetImageProcessorFast")), + ("flava", ("FlavaImageProcessor", "FlavaImageProcessorFast")), + ("focalnet", ("BitImageProcessor", "BitImageProcessorFast")), + ("fuyu", ("FuyuImageProcessor",)), + ("gemma3", ("Gemma3ImageProcessor", "Gemma3ImageProcessorFast")), + ("git", ("CLIPImageProcessor", "CLIPImageProcessorFast")), + ("glpn", ("GLPNImageProcessor",)), + ("got_ocr2", ("GotOcr2ImageProcessor", "GotOcr2ImageProcessorFast")), + ("grounding-dino", ("GroundingDinoImageProcessor", "GroundingDinoImageProcessorFast")), + ("groupvit", ("CLIPImageProcessor", "CLIPImageProcessorFast")), + ("hiera", ("BitImageProcessor", "BitImageProcessorFast")), + ("idefics", ("IdeficsImageProcessor",)), + ("idefics2", ("Idefics2ImageProcessor",)), + ("idefics3", ("Idefics3ImageProcessor",)), + ("ijepa", ("ViTImageProcessor", "ViTImageProcessorFast")), + ("imagegpt", ("ImageGPTImageProcessor",)), + ("instructblip", ("BlipImageProcessor", "BlipImageProcessorFast")), + ("instructblipvideo", ("InstructBlipVideoImageProcessor",)), + ("janus", ("JanusImageProcessor")), + ("kosmos-2", ("CLIPImageProcessor", "CLIPImageProcessorFast")), + ("layoutlmv2", ("LayoutLMv2ImageProcessor", "LayoutLMv2ImageProcessorFast")), + ("layoutlmv3", ("LayoutLMv3ImageProcessor", "LayoutLMv3ImageProcessorFast")), + ("levit", ("LevitImageProcessor", "LevitImageProcessorFast")), + ("llama4", ("Llama4ImageProcessor", "Llama4ImageProcessorFast")), + ("llava", ("LlavaImageProcessor", "LlavaImageProcessorFast")), + ("llava_next", ("LlavaNextImageProcessor", "LlavaNextImageProcessorFast")), + ("llava_next_video", ("LlavaNextVideoImageProcessor",)), + ("llava_onevision", ("LlavaOnevisionImageProcessor", "LlavaOnevisionImageProcessorFast")), + ("mask2former", ("Mask2FormerImageProcessor",)), + ("maskformer", ("MaskFormerImageProcessor",)), + ("mgp-str", ("ViTImageProcessor", "ViTImageProcessorFast")), + ("mistral3", ("PixtralImageProcessor", "PixtralImageProcessorFast")), + ("mlcd", ("CLIPImageProcessor", "CLIPImageProcessorFast")), + ("mllama", ("MllamaImageProcessor",)), + ("mobilenet_v1", ("MobileNetV1ImageProcessor", "MobileNetV1ImageProcessorFast")), + ("mobilenet_v2", ("MobileNetV2ImageProcessor", "MobileNetV2ImageProcessorFast")), + ("mobilevit", ("MobileViTImageProcessor",)), + ("mobilevitv2", ("MobileViTImageProcessor",)), + ("nat", ("ViTImageProcessor", "ViTImageProcessorFast")), + ("nougat", ("NougatImageProcessor",)), + ("oneformer", ("OneFormerImageProcessor",)), + ("owlv2", ("Owlv2ImageProcessor",)), + ("owlvit", ("OwlViTImageProcessor", "OwlViTImageProcessorFast")), + ("paligemma", ("SiglipImageProcessor", "SiglipImageProcessorFast")), + ("perceiver", ("PerceiverImageProcessor", "PerceiverImageProcessorFast")), + ("phi4_multimodal", "Phi4MultimodalImageProcessorFast"), + ("pix2struct", ("Pix2StructImageProcessor",)), + ("pixtral", ("PixtralImageProcessor", "PixtralImageProcessorFast")), + ("poolformer", ("PoolFormerImageProcessor", "PoolFormerImageProcessorFast")), + ("prompt_depth_anything", ("PromptDepthAnythingImageProcessor",)), + ("pvt", ("PvtImageProcessor", "PvtImageProcessorFast")), + ("pvt_v2", ("PvtImageProcessor", "PvtImageProcessorFast")), + ("qwen2_5_vl", ("Qwen2VLImageProcessor", "Qwen2VLImageProcessorFast")), + ("qwen2_vl", ("Qwen2VLImageProcessor", "Qwen2VLImageProcessorFast")), + ("regnet", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")), + ("resnet", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")), + ("rt_detr", ("RTDetrImageProcessor", "RTDetrImageProcessorFast")), + ("sam", ("SamImageProcessor",)), + ("segformer", ("SegformerImageProcessor",)), + ("seggpt", ("SegGptImageProcessor",)), + ("shieldgemma2", ("Gemma3ImageProcessor", "Gemma3ImageProcessorFast")), + ("siglip", ("SiglipImageProcessor", "SiglipImageProcessorFast")), + ("siglip2", ("Siglip2ImageProcessor", "Siglip2ImageProcessorFast")), + ("superglue", ("SuperGlueImageProcessor",)), + ("swiftformer", ("ViTImageProcessor", "ViTImageProcessorFast")), + ("swin", ("ViTImageProcessor", "ViTImageProcessorFast")), + ("swin2sr", ("Swin2SRImageProcessor",)), + ("swinv2", ("ViTImageProcessor", "ViTImageProcessorFast")), + ("table-transformer", ("DetrImageProcessor",)), + ("timesformer", ("VideoMAEImageProcessor",)), + ("timm_wrapper", ("TimmWrapperImageProcessor",)), + ("tvlt", ("TvltImageProcessor",)), + ("tvp", ("TvpImageProcessor",)), + ("udop", ("LayoutLMv3ImageProcessor", "LayoutLMv3ImageProcessorFast")), + ("upernet", ("SegformerImageProcessor",)), + ("van", ("ConvNextImageProcessor", "ConvNextImageProcessorFast")), + ("videomae", ("VideoMAEImageProcessor",)), + ("vilt", ("ViltImageProcessor",)), + ("vipllava", ("CLIPImageProcessor", "CLIPImageProcessorFast")), + ("vit", ("ViTImageProcessor", "ViTImageProcessorFast")), + ("vit_hybrid", ("ViTHybridImageProcessor",)), + ("vit_mae", ("ViTImageProcessor", "ViTImageProcessorFast")), + ("vit_msn", ("ViTImageProcessor", "ViTImageProcessorFast")), + ("vitmatte", ("VitMatteImageProcessor",)), + ("xclip", ("CLIPImageProcessor", "CLIPImageProcessorFast")), + ("yolos", ("YolosImageProcessor", "YolosImageProcessorFast")), + ("zoedepth", ("ZoeDepthImageProcessor",)), + ] + ) + +for model_type, image_processors in IMAGE_PROCESSOR_MAPPING_NAMES.items(): + slow_image_processor_class, *fast_image_processor_class = image_processors + if not is_vision_available(): + slow_image_processor_class = None + + # If the fast image processor is not defined, or torchvision is not available, we set it to None + if not fast_image_processor_class or fast_image_processor_class[0] is None or not is_torchvision_available(): + fast_image_processor_class = None + else: + fast_image_processor_class = fast_image_processor_class[0] + + IMAGE_PROCESSOR_MAPPING_NAMES[model_type] = (slow_image_processor_class, fast_image_processor_class) + +IMAGE_PROCESSOR_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, IMAGE_PROCESSOR_MAPPING_NAMES) + + +def get_image_processor_class_from_name(class_name: str): + if class_name == "BaseImageProcessorFast": + return BaseImageProcessorFast + + for module_name, extractors in IMAGE_PROCESSOR_MAPPING_NAMES.items(): + if class_name in extractors: + module_name = model_type_to_module_name(module_name) + + module = importlib.import_module(f".{module_name}", "transformers.models") + try: + return getattr(module, class_name) + except AttributeError: + continue + + for _, extractors in IMAGE_PROCESSOR_MAPPING._extra_content.items(): + for extractor in extractors: + if getattr(extractor, "__name__", None) == class_name: + return extractor + + # We did not find the class, but maybe it's because a dep is missing. In that case, the class will be in the main + # init and we return the proper dummy to get an appropriate error message. + main_module = importlib.import_module("transformers") + if hasattr(main_module, class_name): + return getattr(main_module, class_name) + + return None + + +def get_image_processor_config( + pretrained_model_name_or_path: Union[str, os.PathLike], + cache_dir: Optional[Union[str, os.PathLike]] = None, + force_download: bool = False, + resume_download: Optional[bool] = None, + proxies: Optional[Dict[str, str]] = None, + token: Optional[Union[bool, str]] = None, + revision: Optional[str] = None, + local_files_only: bool = False, + **kwargs, +): + """ + Loads the image processor configuration from a pretrained model image processor configuration. + + Args: + pretrained_model_name_or_path (`str` or `os.PathLike`): + This can be either: + + - a string, the *model id* of a pretrained model configuration hosted inside a model repo on + huggingface.co. + - a path to a *directory* containing a configuration file saved using the + [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`. + + cache_dir (`str` or `os.PathLike`, *optional*): + Path to a directory in which a downloaded pretrained model configuration should be cached if the standard + cache should not be used. + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force to (re-)download the configuration files and override the cached versions if they + exist. + resume_download: + Deprecated and ignored. All downloads are now resumed by default when possible. + Will be removed in v5 of Transformers. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. + token (`str` or *bool*, *optional*): + The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated + when running `huggingface-cli login` (stored in `~/.huggingface`). + revision (`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a + git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any + identifier allowed by git. + local_files_only (`bool`, *optional*, defaults to `False`): + If `True`, will only try to load the image processor configuration from local files. + + + + Passing `token=True` is required when you want to use a private model. + + + + Returns: + `Dict`: The configuration of the image processor. + + Examples: + + ```python + # Download configuration from huggingface.co and cache. + image_processor_config = get_image_processor_config("google-bert/bert-base-uncased") + # This model does not have a image processor config so the result will be an empty dict. + image_processor_config = get_image_processor_config("FacebookAI/xlm-roberta-base") + + # Save a pretrained image processor locally and you can reload its config + from transformers import AutoTokenizer + + image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k") + image_processor.save_pretrained("image-processor-test") + image_processor_config = get_image_processor_config("image-processor-test") + ```""" + use_auth_token = kwargs.pop("use_auth_token", None) + if use_auth_token is not None: + warnings.warn( + "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.", + FutureWarning, + ) + if token is not None: + raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.") + token = use_auth_token + + resolved_config_file = cached_file( + pretrained_model_name_or_path, + IMAGE_PROCESSOR_NAME, + cache_dir=cache_dir, + force_download=force_download, + resume_download=resume_download, + proxies=proxies, + token=token, + revision=revision, + local_files_only=local_files_only, + _raise_exceptions_for_gated_repo=False, + _raise_exceptions_for_missing_entries=False, + _raise_exceptions_for_connection_errors=False, + ) + if resolved_config_file is None: + logger.info( + "Could not locate the image processor configuration file, will try to use the model config instead." + ) + return {} + + with open(resolved_config_file, encoding="utf-8") as reader: + return json.load(reader) + + +def _warning_fast_image_processor_available(fast_class): + logger.warning( + f"Fast image processor class {fast_class} is available for this model. " + "Using slow image processor class. To use the fast image processor class set `use_fast=True`." + ) + + +@requires(backends=("vision",)) +class AutoImageProcessor: + r""" + This is a generic image processor class that will be instantiated as one of the image processor classes of the + library when created with the [`AutoImageProcessor.from_pretrained`] class method. + + This class cannot be instantiated directly using `__init__()` (throws an error). + """ + + def __init__(self): + raise EnvironmentError( + "AutoImageProcessor is designed to be instantiated " + "using the `AutoImageProcessor.from_pretrained(pretrained_model_name_or_path)` method." + ) + + @classmethod + @replace_list_option_in_docstrings(IMAGE_PROCESSOR_MAPPING_NAMES) + def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): + r""" + Instantiate one of the image processor classes of the library from a pretrained model vocabulary. + + The image processor class to instantiate is selected based on the `model_type` property of the config object + (either passed as an argument or loaded from `pretrained_model_name_or_path` if possible), or when it's + missing, by falling back to using pattern matching on `pretrained_model_name_or_path`: + + List options + + Params: + pretrained_model_name_or_path (`str` or `os.PathLike`): + This can be either: + + - a string, the *model id* of a pretrained image_processor hosted inside a model repo on + huggingface.co. + - a path to a *directory* containing a image processor file saved using the + [`~image_processing_utils.ImageProcessingMixin.save_pretrained`] method, e.g., + `./my_model_directory/`. + - a path or url to a saved image processor JSON *file*, e.g., + `./my_model_directory/preprocessor_config.json`. + cache_dir (`str` or `os.PathLike`, *optional*): + Path to a directory in which a downloaded pretrained model image processor should be cached if the + standard cache should not be used. + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force to (re-)download the image processor files and override the cached versions if + they exist. + resume_download: + Deprecated and ignored. All downloads are now resumed by default when possible. + Will be removed in v5 of Transformers. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. + token (`str` or *bool*, *optional*): + The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated + when running `huggingface-cli login` (stored in `~/.huggingface`). + revision (`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a + git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any + identifier allowed by git. + use_fast (`bool`, *optional*, defaults to `False`): + Use a fast torchvision-base image processor if it is supported for a given model. + If a fast image processor is not available for a given model, a normal numpy-based image processor + is returned instead. + return_unused_kwargs (`bool`, *optional*, defaults to `False`): + If `False`, then this function returns just the final image processor object. If `True`, then this + functions returns a `Tuple(image_processor, unused_kwargs)` where *unused_kwargs* is a dictionary + consisting of the key/value pairs whose keys are not image processor attributes: i.e., the part of + `kwargs` which has not been used to update `image_processor` and is otherwise ignored. + trust_remote_code (`bool`, *optional*, defaults to `False`): + Whether or not to allow for custom models defined on the Hub in their own modeling files. This option + should only be set to `True` for repositories you trust and in which you have read the code, as it will + execute code present on the Hub on your local machine. + image_processor_filename (`str`, *optional*, defaults to `"config.json"`): + The name of the file in the model directory to use for the image processor config. + kwargs (`Dict[str, Any]`, *optional*): + The values in kwargs of any keys which are image processor attributes will be used to override the + loaded values. Behavior concerning key/value pairs whose keys are *not* image processor attributes is + controlled by the `return_unused_kwargs` keyword parameter. + + + + Passing `token=True` is required when you want to use a private model. + + + + Examples: + + ```python + >>> from transformers import AutoImageProcessor + + >>> # Download image processor from huggingface.co and cache. + >>> image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k") + + >>> # If image processor files are in a directory (e.g. image processor was saved using *save_pretrained('./test/saved_model/')*) + >>> # image_processor = AutoImageProcessor.from_pretrained("./test/saved_model/") + ```""" + use_auth_token = kwargs.pop("use_auth_token", None) + if use_auth_token is not None: + warnings.warn( + "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.", + FutureWarning, + ) + if kwargs.get("token", None) is not None: + raise ValueError( + "`token` and `use_auth_token` are both specified. Please set only the argument `token`." + ) + kwargs["token"] = use_auth_token + + config = kwargs.pop("config", None) + # TODO: @yoni, change in v4.48 (use_fast set to True by default) + use_fast = kwargs.pop("use_fast", None) + trust_remote_code = kwargs.pop("trust_remote_code", None) + kwargs["_from_auto"] = True + + # Resolve the image processor config filename + if "image_processor_filename" in kwargs: + image_processor_filename = kwargs.pop("image_processor_filename") + elif is_timm_local_checkpoint(pretrained_model_name_or_path): + image_processor_filename = CONFIG_NAME + else: + image_processor_filename = IMAGE_PROCESSOR_NAME + + # Load the image processor config + try: + # Main path for all transformers models and local TimmWrapper checkpoints + config_dict, _ = ImageProcessingMixin.get_image_processor_dict( + pretrained_model_name_or_path, image_processor_filename=image_processor_filename, **kwargs + ) + except Exception as initial_exception: + # Fallback path for Hub TimmWrapper checkpoints. Timm models' image processing is saved in `config.json` + # instead of `preprocessor_config.json`. Because this is an Auto class and we don't have any information + # except the model name, the only way to check if a remote checkpoint is a timm model is to try to + # load `config.json` and if it fails with some error, we raise the initial exception. + try: + config_dict, _ = ImageProcessingMixin.get_image_processor_dict( + pretrained_model_name_or_path, image_processor_filename=CONFIG_NAME, **kwargs + ) + except Exception: + raise initial_exception + + # In case we have a config_dict, but it's not a timm config dict, we raise the initial exception, + # because only timm models have image processing in `config.json`. + if not is_timm_config_dict(config_dict): + raise initial_exception + + image_processor_type = config_dict.get("image_processor_type", None) + image_processor_auto_map = None + if "AutoImageProcessor" in config_dict.get("auto_map", {}): + image_processor_auto_map = config_dict["auto_map"]["AutoImageProcessor"] + + # If we still don't have the image processor class, check if we're loading from a previous feature extractor config + # and if so, infer the image processor class from there. + if image_processor_type is None and image_processor_auto_map is None: + feature_extractor_class = config_dict.pop("feature_extractor_type", None) + if feature_extractor_class is not None: + image_processor_type = feature_extractor_class.replace("FeatureExtractor", "ImageProcessor") + if "AutoFeatureExtractor" in config_dict.get("auto_map", {}): + feature_extractor_auto_map = config_dict["auto_map"]["AutoFeatureExtractor"] + image_processor_auto_map = feature_extractor_auto_map.replace("FeatureExtractor", "ImageProcessor") + + # If we don't find the image processor class in the image processor config, let's try the model config. + if image_processor_type is None and image_processor_auto_map is None: + if not isinstance(config, PretrainedConfig): + config = AutoConfig.from_pretrained( + pretrained_model_name_or_path, + trust_remote_code=trust_remote_code, + **kwargs, + ) + # It could be in `config.image_processor_type`` + image_processor_type = getattr(config, "image_processor_type", None) + if hasattr(config, "auto_map") and "AutoImageProcessor" in config.auto_map: + image_processor_auto_map = config.auto_map["AutoImageProcessor"] + + image_processor_class = None + # TODO: @yoni, change logic in v4.52 (when use_fast set to True by default) + if image_processor_type is not None: + # if use_fast is not set and the processor was saved with a fast processor, we use it, otherwise we use the slow processor. + if use_fast is None: + use_fast = image_processor_type.endswith("Fast") + if not use_fast: + logger.warning_once( + "Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. " + "`use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. " + "This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`." + ) + # Update class name to reflect the use_fast option. If class is not found, we fall back to the slow version. + if use_fast and not is_torchvision_available(): + logger.warning_once( + "Using `use_fast=True` but `torchvision` is not available. Falling back to the slow image processor." + ) + use_fast = False + if use_fast: + if not image_processor_type.endswith("Fast"): + image_processor_type += "Fast" + for _, image_processors in IMAGE_PROCESSOR_MAPPING_NAMES.items(): + if image_processor_type in image_processors: + break + else: + image_processor_type = image_processor_type[:-4] + use_fast = False + logger.warning_once( + "`use_fast` is set to `True` but the image processor class does not have a fast version. " + " Falling back to the slow version." + ) + image_processor_class = get_image_processor_class_from_name(image_processor_type) + else: + image_processor_type = ( + image_processor_type[:-4] if image_processor_type.endswith("Fast") else image_processor_type + ) + image_processor_class = get_image_processor_class_from_name(image_processor_type) + + has_remote_code = image_processor_auto_map is not None + has_local_code = image_processor_class is not None or type(config) in IMAGE_PROCESSOR_MAPPING + trust_remote_code = resolve_trust_remote_code( + trust_remote_code, pretrained_model_name_or_path, has_local_code, has_remote_code + ) + + if image_processor_auto_map is not None and not isinstance(image_processor_auto_map, tuple): + # In some configs, only the slow image processor class is stored + image_processor_auto_map = (image_processor_auto_map, None) + + if has_remote_code and trust_remote_code: + if not use_fast and image_processor_auto_map[1] is not None: + _warning_fast_image_processor_available(image_processor_auto_map[1]) + + if use_fast and image_processor_auto_map[1] is not None: + class_ref = image_processor_auto_map[1] + else: + class_ref = image_processor_auto_map[0] + image_processor_class = get_class_from_dynamic_module(class_ref, pretrained_model_name_or_path, **kwargs) + _ = kwargs.pop("code_revision", None) + if os.path.isdir(pretrained_model_name_or_path): + image_processor_class.register_for_auto_class() + return image_processor_class.from_dict(config_dict, **kwargs) + elif image_processor_class is not None: + return image_processor_class.from_dict(config_dict, **kwargs) + # Last try: we use the IMAGE_PROCESSOR_MAPPING. + elif type(config) in IMAGE_PROCESSOR_MAPPING: + image_processor_tuple = IMAGE_PROCESSOR_MAPPING[type(config)] + + image_processor_class_py, image_processor_class_fast = image_processor_tuple + + if not use_fast and image_processor_class_fast is not None: + _warning_fast_image_processor_available(image_processor_class_fast) + + if image_processor_class_fast and (use_fast or image_processor_class_py is None): + return image_processor_class_fast.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) + else: + if image_processor_class_py is not None: + return image_processor_class_py.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) + else: + raise ValueError( + "This image processor cannot be instantiated. Please make sure you have `Pillow` installed." + ) + + raise ValueError( + f"Unrecognized image processor in {pretrained_model_name_or_path}. Should have a " + f"`image_processor_type` key in its {IMAGE_PROCESSOR_NAME} of {CONFIG_NAME}, or one of the following " + f"`model_type` keys in its {CONFIG_NAME}: {', '.join(c for c in IMAGE_PROCESSOR_MAPPING_NAMES.keys())}" + ) + + @staticmethod + def register( + config_class, + image_processor_class=None, + slow_image_processor_class=None, + fast_image_processor_class=None, + exist_ok=False, + ): + """ + Register a new image processor for this class. + + Args: + config_class ([`PretrainedConfig`]): + The configuration corresponding to the model to register. + image_processor_class ([`ImageProcessingMixin`]): The image processor to register. + """ + if image_processor_class is not None: + if slow_image_processor_class is not None: + raise ValueError("Cannot specify both image_processor_class and slow_image_processor_class") + warnings.warn( + "The image_processor_class argument is deprecated and will be removed in v4.42. Please use `slow_image_processor_class`, or `fast_image_processor_class` instead", + FutureWarning, + ) + slow_image_processor_class = image_processor_class + + if slow_image_processor_class is None and fast_image_processor_class is None: + raise ValueError("You need to specify either slow_image_processor_class or fast_image_processor_class") + if slow_image_processor_class is not None and issubclass(slow_image_processor_class, BaseImageProcessorFast): + raise ValueError("You passed a fast image processor in as the `slow_image_processor_class`.") + if fast_image_processor_class is not None and not issubclass( + fast_image_processor_class, BaseImageProcessorFast + ): + raise ValueError("The `fast_image_processor_class` should inherit from `BaseImageProcessorFast`.") + + if ( + slow_image_processor_class is not None + and fast_image_processor_class is not None + and issubclass(fast_image_processor_class, BaseImageProcessorFast) + and fast_image_processor_class.slow_image_processor_class != slow_image_processor_class + ): + raise ValueError( + "The fast processor class you are passing has a `slow_image_processor_class` attribute that is not " + "consistent with the slow processor class you passed (fast tokenizer has " + f"{fast_image_processor_class.slow_image_processor_class} and you passed {slow_image_processor_class}. Fix one of those " + "so they match!" + ) + + # Avoid resetting a set slow/fast image processor if we are passing just the other ones. + if config_class in IMAGE_PROCESSOR_MAPPING._extra_content: + existing_slow, existing_fast = IMAGE_PROCESSOR_MAPPING[config_class] + if slow_image_processor_class is None: + slow_image_processor_class = existing_slow + if fast_image_processor_class is None: + fast_image_processor_class = existing_fast + + IMAGE_PROCESSOR_MAPPING.register( + config_class, (slow_image_processor_class, fast_image_processor_class), exist_ok=exist_ok + ) + + +__all__ = ["IMAGE_PROCESSOR_MAPPING", "AutoImageProcessor"] diff --git a/docs/transformers/build/lib/transformers/models/auto/modeling_auto.py b/docs/transformers/build/lib/transformers/models/auto/modeling_auto.py new file mode 100644 index 0000000000000000000000000000000000000000..a7271d04f6071cc2450861fc19655a5e3e1a6a76 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/auto/modeling_auto.py @@ -0,0 +1,2077 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Auto Model class.""" + +import warnings +from collections import OrderedDict + +from ...utils import logging +from .auto_factory import ( + _BaseAutoBackboneClass, + _BaseAutoModelClass, + _LazyAutoMapping, + auto_class_update, +) +from .configuration_auto import CONFIG_MAPPING_NAMES + + +logger = logging.get_logger(__name__) + +MODEL_MAPPING_NAMES = OrderedDict( + [ + # Base model mapping + ("albert", "AlbertModel"), + ("align", "AlignModel"), + ("altclip", "AltCLIPModel"), + ("aria", "AriaForConditionalGeneration"), + ("aria_text", "AriaTextModel"), + ("audio-spectrogram-transformer", "ASTModel"), + ("autoformer", "AutoformerModel"), + ("bamba", "BambaModel"), + ("bark", "BarkModel"), + ("bart", "BartModel"), + ("beit", "BeitModel"), + ("bert", "BertModel"), + ("bert-generation", "BertGenerationEncoder"), + ("big_bird", "BigBirdModel"), + ("bigbird_pegasus", "BigBirdPegasusModel"), + ("biogpt", "BioGptModel"), + ("bit", "BitModel"), + ("blenderbot", "BlenderbotModel"), + ("blenderbot-small", "BlenderbotSmallModel"), + ("blip", "BlipModel"), + ("blip-2", "Blip2Model"), + ("blip_2_qformer", "Blip2QFormerModel"), + ("bloom", "BloomModel"), + ("bridgetower", "BridgeTowerModel"), + ("bros", "BrosModel"), + ("camembert", "CamembertModel"), + ("canine", "CanineModel"), + ("chameleon", "ChameleonModel"), + ("chinese_clip", "ChineseCLIPModel"), + ("chinese_clip_vision_model", "ChineseCLIPVisionModel"), + ("clap", "ClapModel"), + ("clip", "CLIPModel"), + ("clip_text_model", "CLIPTextModel"), + ("clip_vision_model", "CLIPVisionModel"), + ("clipseg", "CLIPSegModel"), + ("clvp", "ClvpModelForConditionalGeneration"), + ("code_llama", "LlamaModel"), + ("codegen", "CodeGenModel"), + ("cohere", "CohereModel"), + ("cohere2", "Cohere2Model"), + ("conditional_detr", "ConditionalDetrModel"), + ("convbert", "ConvBertModel"), + ("convnext", "ConvNextModel"), + ("convnextv2", "ConvNextV2Model"), + ("cpmant", "CpmAntModel"), + ("ctrl", "CTRLModel"), + ("cvt", "CvtModel"), + ("dab-detr", "DabDetrModel"), + ("dac", "DacModel"), + ("data2vec-audio", "Data2VecAudioModel"), + ("data2vec-text", "Data2VecTextModel"), + ("data2vec-vision", "Data2VecVisionModel"), + ("dbrx", "DbrxModel"), + ("deberta", "DebertaModel"), + ("deberta-v2", "DebertaV2Model"), + ("decision_transformer", "DecisionTransformerModel"), + ("deepseek_v3", "DeepseekV3Model"), + ("deformable_detr", "DeformableDetrModel"), + ("deit", "DeiTModel"), + ("depth_pro", "DepthProModel"), + ("deta", "DetaModel"), + ("detr", "DetrModel"), + ("diffllama", "DiffLlamaModel"), + ("dinat", "DinatModel"), + ("dinov2", "Dinov2Model"), + ("dinov2_with_registers", "Dinov2WithRegistersModel"), + ("distilbert", "DistilBertModel"), + ("donut-swin", "DonutSwinModel"), + ("dpr", "DPRQuestionEncoder"), + ("dpt", "DPTModel"), + ("efficientformer", "EfficientFormerModel"), + ("efficientnet", "EfficientNetModel"), + ("electra", "ElectraModel"), + ("encodec", "EncodecModel"), + ("ernie", "ErnieModel"), + ("ernie_m", "ErnieMModel"), + ("esm", "EsmModel"), + ("falcon", "FalconModel"), + ("falcon_mamba", "FalconMambaModel"), + ("fastspeech2_conformer", "FastSpeech2ConformerModel"), + ("flaubert", "FlaubertModel"), + ("flava", "FlavaModel"), + ("fnet", "FNetModel"), + ("focalnet", "FocalNetModel"), + ("fsmt", "FSMTModel"), + ("funnel", ("FunnelModel", "FunnelBaseModel")), + ("gemma", "GemmaModel"), + ("gemma2", "Gemma2Model"), + ("gemma3_text", "Gemma3TextModel"), + ("git", "GitModel"), + ("glm", "GlmModel"), + ("glm4", "Glm4Model"), + ("glpn", "GLPNModel"), + ("got_ocr2", "GotOcr2ForConditionalGeneration"), + ("gpt-sw3", "GPT2Model"), + ("gpt2", "GPT2Model"), + ("gpt_bigcode", "GPTBigCodeModel"), + ("gpt_neo", "GPTNeoModel"), + ("gpt_neox", "GPTNeoXModel"), + ("gpt_neox_japanese", "GPTNeoXJapaneseModel"), + ("gptj", "GPTJModel"), + ("gptsan-japanese", "GPTSanJapaneseForConditionalGeneration"), + ("granite", "GraniteModel"), + ("granitemoe", "GraniteMoeModel"), + ("granitemoeshared", "GraniteMoeSharedModel"), + ("graphormer", "GraphormerModel"), + ("grounding-dino", "GroundingDinoModel"), + ("groupvit", "GroupViTModel"), + ("helium", "HeliumModel"), + ("hiera", "HieraModel"), + ("hubert", "HubertModel"), + ("ibert", "IBertModel"), + ("idefics", "IdeficsModel"), + ("idefics2", "Idefics2Model"), + ("idefics3", "Idefics3Model"), + ("idefics3_vision", "Idefics3VisionTransformer"), + ("ijepa", "IJepaModel"), + ("imagegpt", "ImageGPTModel"), + ("informer", "InformerModel"), + ("internvl_vision", "InternVLVisionModel"), + ("jamba", "JambaModel"), + ("janus", "JanusModel"), + ("jetmoe", "JetMoeModel"), + ("jukebox", "JukeboxModel"), + ("kosmos-2", "Kosmos2Model"), + ("layoutlm", "LayoutLMModel"), + ("layoutlmv2", "LayoutLMv2Model"), + ("layoutlmv3", "LayoutLMv3Model"), + ("led", "LEDModel"), + ("levit", "LevitModel"), + ("lilt", "LiltModel"), + ("llama", "LlamaModel"), + ("llama4", "Llama4ForConditionalGeneration"), + ("longformer", "LongformerModel"), + ("longt5", "LongT5Model"), + ("luke", "LukeModel"), + ("lxmert", "LxmertModel"), + ("m2m_100", "M2M100Model"), + ("mamba", "MambaModel"), + ("mamba2", "Mamba2Model"), + ("marian", "MarianModel"), + ("markuplm", "MarkupLMModel"), + ("mask2former", "Mask2FormerModel"), + ("maskformer", "MaskFormerModel"), + ("maskformer-swin", "MaskFormerSwinModel"), + ("mbart", "MBartModel"), + ("mctct", "MCTCTModel"), + ("mega", "MegaModel"), + ("megatron-bert", "MegatronBertModel"), + ("mgp-str", "MgpstrForSceneTextRecognition"), + ("mimi", "MimiModel"), + ("mistral", "MistralModel"), + ("mixtral", "MixtralModel"), + ("mlcd", "MLCDVisionModel"), + ("mobilebert", "MobileBertModel"), + ("mobilenet_v1", "MobileNetV1Model"), + ("mobilenet_v2", "MobileNetV2Model"), + ("mobilevit", "MobileViTModel"), + ("mobilevitv2", "MobileViTV2Model"), + ("modernbert", "ModernBertModel"), + ("moonshine", "MoonshineModel"), + ("moshi", "MoshiModel"), + ("mpnet", "MPNetModel"), + ("mpt", "MptModel"), + ("mra", "MraModel"), + ("mt5", "MT5Model"), + ("musicgen", "MusicgenModel"), + ("musicgen_melody", "MusicgenMelodyModel"), + ("mvp", "MvpModel"), + ("nat", "NatModel"), + ("nemotron", "NemotronModel"), + ("nezha", "NezhaModel"), + ("nllb-moe", "NllbMoeModel"), + ("nystromformer", "NystromformerModel"), + ("olmo", "OlmoModel"), + ("olmo2", "Olmo2Model"), + ("olmoe", "OlmoeModel"), + ("omdet-turbo", "OmDetTurboForObjectDetection"), + ("oneformer", "OneFormerModel"), + ("open-llama", "OpenLlamaModel"), + ("openai-gpt", "OpenAIGPTModel"), + ("opt", "OPTModel"), + ("owlv2", "Owlv2Model"), + ("owlvit", "OwlViTModel"), + ("patchtsmixer", "PatchTSMixerModel"), + ("patchtst", "PatchTSTModel"), + ("pegasus", "PegasusModel"), + ("pegasus_x", "PegasusXModel"), + ("perceiver", "PerceiverModel"), + ("persimmon", "PersimmonModel"), + ("phi", "PhiModel"), + ("phi3", "Phi3Model"), + ("phi4_multimodal", "Phi4MultimodalModel"), + ("phimoe", "PhimoeModel"), + ("pixtral", "PixtralVisionModel"), + ("plbart", "PLBartModel"), + ("poolformer", "PoolFormerModel"), + ("prophetnet", "ProphetNetModel"), + ("pvt", "PvtModel"), + ("pvt_v2", "PvtV2Model"), + ("qdqbert", "QDQBertModel"), + ("qwen2", "Qwen2Model"), + ("qwen2_5_vl", "Qwen2_5_VLModel"), + ("qwen2_5_vl_text", "Qwen2_5_VLModel"), + ("qwen2_audio_encoder", "Qwen2AudioEncoder"), + ("qwen2_moe", "Qwen2MoeModel"), + ("qwen2_vl", "Qwen2VLModel"), + ("qwen2_vl_text", "Qwen2VLModel"), + ("qwen3", "Qwen3Model"), + ("qwen3_moe", "Qwen3MoeModel"), + ("recurrent_gemma", "RecurrentGemmaModel"), + ("reformer", "ReformerModel"), + ("regnet", "RegNetModel"), + ("rembert", "RemBertModel"), + ("resnet", "ResNetModel"), + ("retribert", "RetriBertModel"), + ("roberta", "RobertaModel"), + ("roberta-prelayernorm", "RobertaPreLayerNormModel"), + ("roc_bert", "RoCBertModel"), + ("roformer", "RoFormerModel"), + ("rt_detr", "RTDetrModel"), + ("rt_detr_v2", "RTDetrV2Model"), + ("rwkv", "RwkvModel"), + ("sam", "SamModel"), + ("sam_vision_model", "SamVisionModel"), + ("seamless_m4t", "SeamlessM4TModel"), + ("seamless_m4t_v2", "SeamlessM4Tv2Model"), + ("segformer", "SegformerModel"), + ("seggpt", "SegGptModel"), + ("sew", "SEWModel"), + ("sew-d", "SEWDModel"), + ("siglip", "SiglipModel"), + ("siglip2", "Siglip2Model"), + ("siglip_vision_model", "SiglipVisionModel"), + ("smolvlm", "SmolVLMModel"), + ("smolvlm_vision", "SmolVLMVisionTransformer"), + ("speech_to_text", "Speech2TextModel"), + ("speecht5", "SpeechT5Model"), + ("splinter", "SplinterModel"), + ("squeezebert", "SqueezeBertModel"), + ("stablelm", "StableLmModel"), + ("starcoder2", "Starcoder2Model"), + ("superglue", "SuperGlueForKeypointMatching"), + ("swiftformer", "SwiftFormerModel"), + ("swin", "SwinModel"), + ("swin2sr", "Swin2SRModel"), + ("swinv2", "Swinv2Model"), + ("switch_transformers", "SwitchTransformersModel"), + ("t5", "T5Model"), + ("table-transformer", "TableTransformerModel"), + ("tapas", "TapasModel"), + ("textnet", "TextNetModel"), + ("time_series_transformer", "TimeSeriesTransformerModel"), + ("timesfm", "TimesFmModel"), + ("timesformer", "TimesformerModel"), + ("timm_backbone", "TimmBackbone"), + ("timm_wrapper", "TimmWrapperModel"), + ("trajectory_transformer", "TrajectoryTransformerModel"), + ("transfo-xl", "TransfoXLModel"), + ("tvlt", "TvltModel"), + ("tvp", "TvpModel"), + ("udop", "UdopModel"), + ("umt5", "UMT5Model"), + ("unispeech", "UniSpeechModel"), + ("unispeech-sat", "UniSpeechSatModel"), + ("univnet", "UnivNetModel"), + ("van", "VanModel"), + ("videomae", "VideoMAEModel"), + ("vilt", "ViltModel"), + ("vision-text-dual-encoder", "VisionTextDualEncoderModel"), + ("visual_bert", "VisualBertModel"), + ("vit", "ViTModel"), + ("vit_hybrid", "ViTHybridModel"), + ("vit_mae", "ViTMAEModel"), + ("vit_msn", "ViTMSNModel"), + ("vitdet", "VitDetModel"), + ("vits", "VitsModel"), + ("vivit", "VivitModel"), + ("wav2vec2", "Wav2Vec2Model"), + ("wav2vec2-bert", "Wav2Vec2BertModel"), + ("wav2vec2-conformer", "Wav2Vec2ConformerModel"), + ("wavlm", "WavLMModel"), + ("whisper", "WhisperModel"), + ("xclip", "XCLIPModel"), + ("xglm", "XGLMModel"), + ("xlm", "XLMModel"), + ("xlm-prophetnet", "XLMProphetNetModel"), + ("xlm-roberta", "XLMRobertaModel"), + ("xlm-roberta-xl", "XLMRobertaXLModel"), + ("xlnet", "XLNetModel"), + ("xmod", "XmodModel"), + ("yolos", "YolosModel"), + ("yoso", "YosoModel"), + ("zamba", "ZambaModel"), + ("zamba2", "Zamba2Model"), + ] +) + +MODEL_FOR_PRETRAINING_MAPPING_NAMES = OrderedDict( + [ + # Model for pre-training mapping + ("albert", "AlbertForPreTraining"), + ("bart", "BartForConditionalGeneration"), + ("bert", "BertForPreTraining"), + ("big_bird", "BigBirdForPreTraining"), + ("bloom", "BloomForCausalLM"), + ("camembert", "CamembertForMaskedLM"), + ("colpali", "ColPaliForRetrieval"), + ("ctrl", "CTRLLMHeadModel"), + ("data2vec-text", "Data2VecTextForMaskedLM"), + ("deberta", "DebertaForMaskedLM"), + ("deberta-v2", "DebertaV2ForMaskedLM"), + ("distilbert", "DistilBertForMaskedLM"), + ("electra", "ElectraForPreTraining"), + ("ernie", "ErnieForPreTraining"), + ("falcon_mamba", "FalconMambaForCausalLM"), + ("flaubert", "FlaubertWithLMHeadModel"), + ("flava", "FlavaForPreTraining"), + ("fnet", "FNetForPreTraining"), + ("fsmt", "FSMTForConditionalGeneration"), + ("funnel", "FunnelForPreTraining"), + ("gemma3", "Gemma3ForConditionalGeneration"), + ("gpt-sw3", "GPT2LMHeadModel"), + ("gpt2", "GPT2LMHeadModel"), + ("gpt_bigcode", "GPTBigCodeForCausalLM"), + ("gptsan-japanese", "GPTSanJapaneseForConditionalGeneration"), + ("hiera", "HieraForPreTraining"), + ("ibert", "IBertForMaskedLM"), + ("idefics", "IdeficsForVisionText2Text"), + ("idefics2", "Idefics2ForConditionalGeneration"), + ("idefics3", "Idefics3ForConditionalGeneration"), + ("janus", "JanusForConditionalGeneration"), + ("layoutlm", "LayoutLMForMaskedLM"), + ("llava", "LlavaForConditionalGeneration"), + ("llava_next", "LlavaNextForConditionalGeneration"), + ("llava_next_video", "LlavaNextVideoForConditionalGeneration"), + ("llava_onevision", "LlavaOnevisionForConditionalGeneration"), + ("longformer", "LongformerForMaskedLM"), + ("luke", "LukeForMaskedLM"), + ("lxmert", "LxmertForPreTraining"), + ("mamba", "MambaForCausalLM"), + ("mamba2", "Mamba2ForCausalLM"), + ("mega", "MegaForMaskedLM"), + ("megatron-bert", "MegatronBertForPreTraining"), + ("mistral3", "Mistral3ForConditionalGeneration"), + ("mllama", "MllamaForConditionalGeneration"), + ("mobilebert", "MobileBertForPreTraining"), + ("mpnet", "MPNetForMaskedLM"), + ("mpt", "MptForCausalLM"), + ("mra", "MraForMaskedLM"), + ("mvp", "MvpForConditionalGeneration"), + ("nezha", "NezhaForPreTraining"), + ("nllb-moe", "NllbMoeForConditionalGeneration"), + ("openai-gpt", "OpenAIGPTLMHeadModel"), + ("paligemma", "PaliGemmaForConditionalGeneration"), + ("qwen2_audio", "Qwen2AudioForConditionalGeneration"), + ("retribert", "RetriBertModel"), + ("roberta", "RobertaForMaskedLM"), + ("roberta-prelayernorm", "RobertaPreLayerNormForMaskedLM"), + ("roc_bert", "RoCBertForPreTraining"), + ("rwkv", "RwkvForCausalLM"), + ("splinter", "SplinterForPreTraining"), + ("squeezebert", "SqueezeBertForMaskedLM"), + ("switch_transformers", "SwitchTransformersForConditionalGeneration"), + ("t5", "T5ForConditionalGeneration"), + ("tapas", "TapasForMaskedLM"), + ("transfo-xl", "TransfoXLLMHeadModel"), + ("tvlt", "TvltForPreTraining"), + ("unispeech", "UniSpeechForPreTraining"), + ("unispeech-sat", "UniSpeechSatForPreTraining"), + ("video_llava", "VideoLlavaForConditionalGeneration"), + ("videomae", "VideoMAEForPreTraining"), + ("vipllava", "VipLlavaForConditionalGeneration"), + ("visual_bert", "VisualBertForPreTraining"), + ("vit_mae", "ViTMAEForPreTraining"), + ("wav2vec2", "Wav2Vec2ForPreTraining"), + ("wav2vec2-conformer", "Wav2Vec2ConformerForPreTraining"), + ("xlm", "XLMWithLMHeadModel"), + ("xlm-roberta", "XLMRobertaForMaskedLM"), + ("xlm-roberta-xl", "XLMRobertaXLForMaskedLM"), + ("xlnet", "XLNetLMHeadModel"), + ("xmod", "XmodForMaskedLM"), + ] +) + +MODEL_WITH_LM_HEAD_MAPPING_NAMES = OrderedDict( + [ + # Model with LM heads mapping + ("albert", "AlbertForMaskedLM"), + ("bart", "BartForConditionalGeneration"), + ("bert", "BertForMaskedLM"), + ("big_bird", "BigBirdForMaskedLM"), + ("bigbird_pegasus", "BigBirdPegasusForConditionalGeneration"), + ("blenderbot-small", "BlenderbotSmallForConditionalGeneration"), + ("bloom", "BloomForCausalLM"), + ("camembert", "CamembertForMaskedLM"), + ("codegen", "CodeGenForCausalLM"), + ("convbert", "ConvBertForMaskedLM"), + ("cpmant", "CpmAntForCausalLM"), + ("ctrl", "CTRLLMHeadModel"), + ("data2vec-text", "Data2VecTextForMaskedLM"), + ("deberta", "DebertaForMaskedLM"), + ("deberta-v2", "DebertaV2ForMaskedLM"), + ("distilbert", "DistilBertForMaskedLM"), + ("electra", "ElectraForMaskedLM"), + ("encoder-decoder", "EncoderDecoderModel"), + ("ernie", "ErnieForMaskedLM"), + ("esm", "EsmForMaskedLM"), + ("falcon_mamba", "FalconMambaForCausalLM"), + ("flaubert", "FlaubertWithLMHeadModel"), + ("fnet", "FNetForMaskedLM"), + ("fsmt", "FSMTForConditionalGeneration"), + ("funnel", "FunnelForMaskedLM"), + ("git", "GitForCausalLM"), + ("gpt-sw3", "GPT2LMHeadModel"), + ("gpt2", "GPT2LMHeadModel"), + ("gpt_bigcode", "GPTBigCodeForCausalLM"), + ("gpt_neo", "GPTNeoForCausalLM"), + ("gpt_neox", "GPTNeoXForCausalLM"), + ("gpt_neox_japanese", "GPTNeoXJapaneseForCausalLM"), + ("gptj", "GPTJForCausalLM"), + ("gptsan-japanese", "GPTSanJapaneseForConditionalGeneration"), + ("ibert", "IBertForMaskedLM"), + ("layoutlm", "LayoutLMForMaskedLM"), + ("led", "LEDForConditionalGeneration"), + ("longformer", "LongformerForMaskedLM"), + ("longt5", "LongT5ForConditionalGeneration"), + ("luke", "LukeForMaskedLM"), + ("m2m_100", "M2M100ForConditionalGeneration"), + ("mamba", "MambaForCausalLM"), + ("mamba2", "Mamba2ForCausalLM"), + ("marian", "MarianMTModel"), + ("mega", "MegaForMaskedLM"), + ("megatron-bert", "MegatronBertForCausalLM"), + ("mobilebert", "MobileBertForMaskedLM"), + ("moonshine", "MoonshineForConditionalGeneration"), + ("mpnet", "MPNetForMaskedLM"), + ("mpt", "MptForCausalLM"), + ("mra", "MraForMaskedLM"), + ("mvp", "MvpForConditionalGeneration"), + ("nezha", "NezhaForMaskedLM"), + ("nllb-moe", "NllbMoeForConditionalGeneration"), + ("nystromformer", "NystromformerForMaskedLM"), + ("openai-gpt", "OpenAIGPTLMHeadModel"), + ("pegasus_x", "PegasusXForConditionalGeneration"), + ("plbart", "PLBartForConditionalGeneration"), + ("pop2piano", "Pop2PianoForConditionalGeneration"), + ("qdqbert", "QDQBertForMaskedLM"), + ("reformer", "ReformerModelWithLMHead"), + ("rembert", "RemBertForMaskedLM"), + ("roberta", "RobertaForMaskedLM"), + ("roberta-prelayernorm", "RobertaPreLayerNormForMaskedLM"), + ("roc_bert", "RoCBertForMaskedLM"), + ("roformer", "RoFormerForMaskedLM"), + ("rwkv", "RwkvForCausalLM"), + ("speech_to_text", "Speech2TextForConditionalGeneration"), + ("squeezebert", "SqueezeBertForMaskedLM"), + ("switch_transformers", "SwitchTransformersForConditionalGeneration"), + ("t5", "T5ForConditionalGeneration"), + ("tapas", "TapasForMaskedLM"), + ("transfo-xl", "TransfoXLLMHeadModel"), + ("wav2vec2", "Wav2Vec2ForMaskedLM"), + ("whisper", "WhisperForConditionalGeneration"), + ("xlm", "XLMWithLMHeadModel"), + ("xlm-roberta", "XLMRobertaForMaskedLM"), + ("xlm-roberta-xl", "XLMRobertaXLForMaskedLM"), + ("xlnet", "XLNetLMHeadModel"), + ("xmod", "XmodForMaskedLM"), + ("yoso", "YosoForMaskedLM"), + ] +) + +MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict( + [ + # Model for Causal LM mapping + ("aria_text", "AriaTextForCausalLM"), + ("bamba", "BambaForCausalLM"), + ("bart", "BartForCausalLM"), + ("bert", "BertLMHeadModel"), + ("bert-generation", "BertGenerationDecoder"), + ("big_bird", "BigBirdForCausalLM"), + ("bigbird_pegasus", "BigBirdPegasusForCausalLM"), + ("biogpt", "BioGptForCausalLM"), + ("blenderbot", "BlenderbotForCausalLM"), + ("blenderbot-small", "BlenderbotSmallForCausalLM"), + ("bloom", "BloomForCausalLM"), + ("camembert", "CamembertForCausalLM"), + ("code_llama", "LlamaForCausalLM"), + ("codegen", "CodeGenForCausalLM"), + ("cohere", "CohereForCausalLM"), + ("cohere2", "Cohere2ForCausalLM"), + ("cpmant", "CpmAntForCausalLM"), + ("ctrl", "CTRLLMHeadModel"), + ("data2vec-text", "Data2VecTextForCausalLM"), + ("dbrx", "DbrxForCausalLM"), + ("deepseek_v3", "DeepseekV3ForCausalLM"), + ("diffllama", "DiffLlamaForCausalLM"), + ("electra", "ElectraForCausalLM"), + ("emu3", "Emu3ForCausalLM"), + ("ernie", "ErnieForCausalLM"), + ("falcon", "FalconForCausalLM"), + ("falcon_mamba", "FalconMambaForCausalLM"), + ("fuyu", "FuyuForCausalLM"), + ("gemma", "GemmaForCausalLM"), + ("gemma2", "Gemma2ForCausalLM"), + ("gemma3", "Gemma3ForConditionalGeneration"), + ("gemma3_text", "Gemma3ForCausalLM"), + ("git", "GitForCausalLM"), + ("glm", "GlmForCausalLM"), + ("glm4", "Glm4ForCausalLM"), + ("got_ocr2", "GotOcr2ForConditionalGeneration"), + ("gpt-sw3", "GPT2LMHeadModel"), + ("gpt2", "GPT2LMHeadModel"), + ("gpt_bigcode", "GPTBigCodeForCausalLM"), + ("gpt_neo", "GPTNeoForCausalLM"), + ("gpt_neox", "GPTNeoXForCausalLM"), + ("gpt_neox_japanese", "GPTNeoXJapaneseForCausalLM"), + ("gptj", "GPTJForCausalLM"), + ("granite", "GraniteForCausalLM"), + ("granitemoe", "GraniteMoeForCausalLM"), + ("granitemoeshared", "GraniteMoeSharedForCausalLM"), + ("helium", "HeliumForCausalLM"), + ("jamba", "JambaForCausalLM"), + ("jetmoe", "JetMoeForCausalLM"), + ("llama", "LlamaForCausalLM"), + ("llama4", "Llama4ForCausalLM"), + ("llama4_text", "Llama4ForCausalLM"), + ("mamba", "MambaForCausalLM"), + ("mamba2", "Mamba2ForCausalLM"), + ("marian", "MarianForCausalLM"), + ("mbart", "MBartForCausalLM"), + ("mega", "MegaForCausalLM"), + ("megatron-bert", "MegatronBertForCausalLM"), + ("mistral", "MistralForCausalLM"), + ("mixtral", "MixtralForCausalLM"), + ("mllama", "MllamaForCausalLM"), + ("moshi", "MoshiForCausalLM"), + ("mpt", "MptForCausalLM"), + ("musicgen", "MusicgenForCausalLM"), + ("musicgen_melody", "MusicgenMelodyForCausalLM"), + ("mvp", "MvpForCausalLM"), + ("nemotron", "NemotronForCausalLM"), + ("olmo", "OlmoForCausalLM"), + ("olmo2", "Olmo2ForCausalLM"), + ("olmoe", "OlmoeForCausalLM"), + ("open-llama", "OpenLlamaForCausalLM"), + ("openai-gpt", "OpenAIGPTLMHeadModel"), + ("opt", "OPTForCausalLM"), + ("pegasus", "PegasusForCausalLM"), + ("persimmon", "PersimmonForCausalLM"), + ("phi", "PhiForCausalLM"), + ("phi3", "Phi3ForCausalLM"), + ("phi4_multimodal", "Phi4MultimodalForCausalLM"), + ("phimoe", "PhimoeForCausalLM"), + ("plbart", "PLBartForCausalLM"), + ("prophetnet", "ProphetNetForCausalLM"), + ("qdqbert", "QDQBertLMHeadModel"), + ("qwen2", "Qwen2ForCausalLM"), + ("qwen2_moe", "Qwen2MoeForCausalLM"), + ("qwen3", "Qwen3ForCausalLM"), + ("qwen3_moe", "Qwen3MoeForCausalLM"), + ("recurrent_gemma", "RecurrentGemmaForCausalLM"), + ("reformer", "ReformerModelWithLMHead"), + ("rembert", "RemBertForCausalLM"), + ("roberta", "RobertaForCausalLM"), + ("roberta-prelayernorm", "RobertaPreLayerNormForCausalLM"), + ("roc_bert", "RoCBertForCausalLM"), + ("roformer", "RoFormerForCausalLM"), + ("rwkv", "RwkvForCausalLM"), + ("speech_to_text_2", "Speech2Text2ForCausalLM"), + ("stablelm", "StableLmForCausalLM"), + ("starcoder2", "Starcoder2ForCausalLM"), + ("transfo-xl", "TransfoXLLMHeadModel"), + ("trocr", "TrOCRForCausalLM"), + ("whisper", "WhisperForCausalLM"), + ("xglm", "XGLMForCausalLM"), + ("xlm", "XLMWithLMHeadModel"), + ("xlm-prophetnet", "XLMProphetNetForCausalLM"), + ("xlm-roberta", "XLMRobertaForCausalLM"), + ("xlm-roberta-xl", "XLMRobertaXLForCausalLM"), + ("xlnet", "XLNetLMHeadModel"), + ("xmod", "XmodForCausalLM"), + ("zamba", "ZambaForCausalLM"), + ("zamba2", "Zamba2ForCausalLM"), + ] +) + +MODEL_FOR_IMAGE_MAPPING_NAMES = OrderedDict( + [ + # Model for Image mapping + ("beit", "BeitModel"), + ("bit", "BitModel"), + ("conditional_detr", "ConditionalDetrModel"), + ("convnext", "ConvNextModel"), + ("convnextv2", "ConvNextV2Model"), + ("dab-detr", "DabDetrModel"), + ("data2vec-vision", "Data2VecVisionModel"), + ("deformable_detr", "DeformableDetrModel"), + ("deit", "DeiTModel"), + ("depth_pro", "DepthProModel"), + ("deta", "DetaModel"), + ("detr", "DetrModel"), + ("dinat", "DinatModel"), + ("dinov2", "Dinov2Model"), + ("dinov2_with_registers", "Dinov2WithRegistersModel"), + ("dpt", "DPTModel"), + ("efficientformer", "EfficientFormerModel"), + ("efficientnet", "EfficientNetModel"), + ("focalnet", "FocalNetModel"), + ("glpn", "GLPNModel"), + ("hiera", "HieraModel"), + ("ijepa", "IJepaModel"), + ("imagegpt", "ImageGPTModel"), + ("levit", "LevitModel"), + ("llama4", "Llama4VisionModel"), + ("mlcd", "MLCDVisionModel"), + ("mllama", "MllamaVisionModel"), + ("mobilenet_v1", "MobileNetV1Model"), + ("mobilenet_v2", "MobileNetV2Model"), + ("mobilevit", "MobileViTModel"), + ("mobilevitv2", "MobileViTV2Model"), + ("nat", "NatModel"), + ("poolformer", "PoolFormerModel"), + ("pvt", "PvtModel"), + ("regnet", "RegNetModel"), + ("resnet", "ResNetModel"), + ("segformer", "SegformerModel"), + ("siglip_vision_model", "SiglipVisionModel"), + ("swiftformer", "SwiftFormerModel"), + ("swin", "SwinModel"), + ("swin2sr", "Swin2SRModel"), + ("swinv2", "Swinv2Model"), + ("table-transformer", "TableTransformerModel"), + ("timesformer", "TimesformerModel"), + ("timm_backbone", "TimmBackbone"), + ("timm_wrapper", "TimmWrapperModel"), + ("van", "VanModel"), + ("videomae", "VideoMAEModel"), + ("vit", "ViTModel"), + ("vit_hybrid", "ViTHybridModel"), + ("vit_mae", "ViTMAEModel"), + ("vit_msn", "ViTMSNModel"), + ("vitdet", "VitDetModel"), + ("vivit", "VivitModel"), + ("yolos", "YolosModel"), + ] +) + +MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING_NAMES = OrderedDict( + [ + ("deit", "DeiTForMaskedImageModeling"), + ("focalnet", "FocalNetForMaskedImageModeling"), + ("swin", "SwinForMaskedImageModeling"), + ("swinv2", "Swinv2ForMaskedImageModeling"), + ("vit", "ViTForMaskedImageModeling"), + ] +) + + +MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING_NAMES = OrderedDict( + # Model for Causal Image Modeling mapping + [ + ("imagegpt", "ImageGPTForCausalImageModeling"), + ] +) + +MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = OrderedDict( + [ + # Model for Image Classification mapping + ("beit", "BeitForImageClassification"), + ("bit", "BitForImageClassification"), + ("clip", "CLIPForImageClassification"), + ("convnext", "ConvNextForImageClassification"), + ("convnextv2", "ConvNextV2ForImageClassification"), + ("cvt", "CvtForImageClassification"), + ("data2vec-vision", "Data2VecVisionForImageClassification"), + ( + "deit", + ("DeiTForImageClassification", "DeiTForImageClassificationWithTeacher"), + ), + ("dinat", "DinatForImageClassification"), + ("dinov2", "Dinov2ForImageClassification"), + ("dinov2_with_registers", "Dinov2WithRegistersForImageClassification"), + ("donut-swin", "DonutSwinForImageClassification"), + ( + "efficientformer", + ( + "EfficientFormerForImageClassification", + "EfficientFormerForImageClassificationWithTeacher", + ), + ), + ("efficientnet", "EfficientNetForImageClassification"), + ("focalnet", "FocalNetForImageClassification"), + ("hiera", "HieraForImageClassification"), + ("ijepa", "IJepaForImageClassification"), + ("imagegpt", "ImageGPTForImageClassification"), + ( + "levit", + ("LevitForImageClassification", "LevitForImageClassificationWithTeacher"), + ), + ("mobilenet_v1", "MobileNetV1ForImageClassification"), + ("mobilenet_v2", "MobileNetV2ForImageClassification"), + ("mobilevit", "MobileViTForImageClassification"), + ("mobilevitv2", "MobileViTV2ForImageClassification"), + ("nat", "NatForImageClassification"), + ( + "perceiver", + ( + "PerceiverForImageClassificationLearned", + "PerceiverForImageClassificationFourier", + "PerceiverForImageClassificationConvProcessing", + ), + ), + ("poolformer", "PoolFormerForImageClassification"), + ("pvt", "PvtForImageClassification"), + ("pvt_v2", "PvtV2ForImageClassification"), + ("regnet", "RegNetForImageClassification"), + ("resnet", "ResNetForImageClassification"), + ("segformer", "SegformerForImageClassification"), + ("shieldgemma2", "ShieldGemma2ForImageClassification"), + ("siglip", "SiglipForImageClassification"), + ("siglip2", "Siglip2ForImageClassification"), + ("swiftformer", "SwiftFormerForImageClassification"), + ("swin", "SwinForImageClassification"), + ("swinv2", "Swinv2ForImageClassification"), + ("textnet", "TextNetForImageClassification"), + ("timm_wrapper", "TimmWrapperForImageClassification"), + ("van", "VanForImageClassification"), + ("vit", "ViTForImageClassification"), + ("vit_hybrid", "ViTHybridForImageClassification"), + ("vit_msn", "ViTMSNForImageClassification"), + ] +) + +MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES = OrderedDict( + [ + # Do not add new models here, this class will be deprecated in the future. + # Model for Image Segmentation mapping + ("detr", "DetrForSegmentation"), + ] +) + +MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES = OrderedDict( + [ + # Model for Semantic Segmentation mapping + ("beit", "BeitForSemanticSegmentation"), + ("data2vec-vision", "Data2VecVisionForSemanticSegmentation"), + ("dpt", "DPTForSemanticSegmentation"), + ("mobilenet_v2", "MobileNetV2ForSemanticSegmentation"), + ("mobilevit", "MobileViTForSemanticSegmentation"), + ("mobilevitv2", "MobileViTV2ForSemanticSegmentation"), + ("segformer", "SegformerForSemanticSegmentation"), + ("upernet", "UperNetForSemanticSegmentation"), + ] +) + +MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING_NAMES = OrderedDict( + [ + # Model for Instance Segmentation mapping + # MaskFormerForInstanceSegmentation can be removed from this mapping in v5 + ("maskformer", "MaskFormerForInstanceSegmentation"), + ] +) + +MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES = OrderedDict( + [ + # Model for Universal Segmentation mapping + ("detr", "DetrForSegmentation"), + ("mask2former", "Mask2FormerForUniversalSegmentation"), + ("maskformer", "MaskFormerForInstanceSegmentation"), + ("oneformer", "OneFormerForUniversalSegmentation"), + ] +) + +MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING_NAMES = OrderedDict( + [ + ("timesformer", "TimesformerForVideoClassification"), + ("videomae", "VideoMAEForVideoClassification"), + ("vivit", "VivitForVideoClassification"), + ] +) + +MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES = OrderedDict( + [ + ("blip", "BlipForConditionalGeneration"), + ("blip-2", "Blip2ForConditionalGeneration"), + ("chameleon", "ChameleonForConditionalGeneration"), + ("git", "GitForCausalLM"), + ("idefics2", "Idefics2ForConditionalGeneration"), + ("idefics3", "Idefics3ForConditionalGeneration"), + ("instructblip", "InstructBlipForConditionalGeneration"), + ("instructblipvideo", "InstructBlipVideoForConditionalGeneration"), + ("kosmos-2", "Kosmos2ForConditionalGeneration"), + ("llava", "LlavaForConditionalGeneration"), + ("llava_next", "LlavaNextForConditionalGeneration"), + ("llava_next_video", "LlavaNextVideoForConditionalGeneration"), + ("llava_onevision", "LlavaOnevisionForConditionalGeneration"), + ("mistral3", "Mistral3ForConditionalGeneration"), + ("mllama", "MllamaForConditionalGeneration"), + ("paligemma", "PaliGemmaForConditionalGeneration"), + ("pix2struct", "Pix2StructForConditionalGeneration"), + ("qwen2_5_vl", "Qwen2_5_VLForConditionalGeneration"), + ("qwen2_vl", "Qwen2VLForConditionalGeneration"), + ("video_llava", "VideoLlavaForConditionalGeneration"), + ("vipllava", "VipLlavaForConditionalGeneration"), + ("vision-encoder-decoder", "VisionEncoderDecoderModel"), + ] +) + +MODEL_FOR_RETRIEVAL_MAPPING_NAMES = OrderedDict( + [ + ("colpali", "ColPaliForRetrieval"), + ] +) + +MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES = OrderedDict( + [ + ("aria", "AriaForConditionalGeneration"), + ("aya_vision", "AyaVisionForConditionalGeneration"), + ("blip", "BlipForConditionalGeneration"), + ("blip-2", "Blip2ForConditionalGeneration"), + ("chameleon", "ChameleonForConditionalGeneration"), + ("emu3", "Emu3ForConditionalGeneration"), + ("fuyu", "FuyuForCausalLM"), + ("gemma3", "Gemma3ForConditionalGeneration"), + ("git", "GitForCausalLM"), + ("got_ocr2", "GotOcr2ForConditionalGeneration"), + ("idefics", "IdeficsForVisionText2Text"), + ("idefics2", "Idefics2ForConditionalGeneration"), + ("idefics3", "Idefics3ForConditionalGeneration"), + ("instructblip", "InstructBlipForConditionalGeneration"), + ("internvl", "InternVLForConditionalGeneration"), + ("janus", "JanusForConditionalGeneration"), + ("kosmos-2", "Kosmos2ForConditionalGeneration"), + ("llama4", "Llama4ForConditionalGeneration"), + ("llava", "LlavaForConditionalGeneration"), + ("llava_next", "LlavaNextForConditionalGeneration"), + ("llava_onevision", "LlavaOnevisionForConditionalGeneration"), + ("mistral3", "Mistral3ForConditionalGeneration"), + ("mllama", "MllamaForConditionalGeneration"), + ("paligemma", "PaliGemmaForConditionalGeneration"), + ("pix2struct", "Pix2StructForConditionalGeneration"), + ("pixtral", "LlavaForConditionalGeneration"), + ("qwen2_5_vl", "Qwen2_5_VLForConditionalGeneration"), + ("qwen2_vl", "Qwen2VLForConditionalGeneration"), + ("shieldgemma2", "Gemma3ForConditionalGeneration"), + ("smolvlm", "SmolVLMForConditionalGeneration"), + ("udop", "UdopForConditionalGeneration"), + ("vipllava", "VipLlavaForConditionalGeneration"), + ("vision-encoder-decoder", "VisionEncoderDecoderModel"), + ] +) + +MODEL_FOR_MASKED_LM_MAPPING_NAMES = OrderedDict( + [ + # Model for Masked LM mapping + ("albert", "AlbertForMaskedLM"), + ("bart", "BartForConditionalGeneration"), + ("bert", "BertForMaskedLM"), + ("big_bird", "BigBirdForMaskedLM"), + ("camembert", "CamembertForMaskedLM"), + ("convbert", "ConvBertForMaskedLM"), + ("data2vec-text", "Data2VecTextForMaskedLM"), + ("deberta", "DebertaForMaskedLM"), + ("deberta-v2", "DebertaV2ForMaskedLM"), + ("distilbert", "DistilBertForMaskedLM"), + ("electra", "ElectraForMaskedLM"), + ("ernie", "ErnieForMaskedLM"), + ("esm", "EsmForMaskedLM"), + ("flaubert", "FlaubertWithLMHeadModel"), + ("fnet", "FNetForMaskedLM"), + ("funnel", "FunnelForMaskedLM"), + ("ibert", "IBertForMaskedLM"), + ("layoutlm", "LayoutLMForMaskedLM"), + ("longformer", "LongformerForMaskedLM"), + ("luke", "LukeForMaskedLM"), + ("mbart", "MBartForConditionalGeneration"), + ("mega", "MegaForMaskedLM"), + ("megatron-bert", "MegatronBertForMaskedLM"), + ("mobilebert", "MobileBertForMaskedLM"), + ("modernbert", "ModernBertForMaskedLM"), + ("mpnet", "MPNetForMaskedLM"), + ("mra", "MraForMaskedLM"), + ("mvp", "MvpForConditionalGeneration"), + ("nezha", "NezhaForMaskedLM"), + ("nystromformer", "NystromformerForMaskedLM"), + ("perceiver", "PerceiverForMaskedLM"), + ("qdqbert", "QDQBertForMaskedLM"), + ("reformer", "ReformerForMaskedLM"), + ("rembert", "RemBertForMaskedLM"), + ("roberta", "RobertaForMaskedLM"), + ("roberta-prelayernorm", "RobertaPreLayerNormForMaskedLM"), + ("roc_bert", "RoCBertForMaskedLM"), + ("roformer", "RoFormerForMaskedLM"), + ("squeezebert", "SqueezeBertForMaskedLM"), + ("tapas", "TapasForMaskedLM"), + ("wav2vec2", "Wav2Vec2ForMaskedLM"), + ("xlm", "XLMWithLMHeadModel"), + ("xlm-roberta", "XLMRobertaForMaskedLM"), + ("xlm-roberta-xl", "XLMRobertaXLForMaskedLM"), + ("xmod", "XmodForMaskedLM"), + ("yoso", "YosoForMaskedLM"), + ] +) + +MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES = OrderedDict( + [ + # Model for Object Detection mapping + ("conditional_detr", "ConditionalDetrForObjectDetection"), + ("dab-detr", "DabDetrForObjectDetection"), + ("deformable_detr", "DeformableDetrForObjectDetection"), + ("deta", "DetaForObjectDetection"), + ("detr", "DetrForObjectDetection"), + ("rt_detr", "RTDetrForObjectDetection"), + ("rt_detr_v2", "RTDetrV2ForObjectDetection"), + ("table-transformer", "TableTransformerForObjectDetection"), + ("yolos", "YolosForObjectDetection"), + ] +) + +MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES = OrderedDict( + [ + # Model for Zero Shot Object Detection mapping + ("grounding-dino", "GroundingDinoForObjectDetection"), + ("omdet-turbo", "OmDetTurboForObjectDetection"), + ("owlv2", "Owlv2ForObjectDetection"), + ("owlvit", "OwlViTForObjectDetection"), + ] +) + +MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES = OrderedDict( + [ + # Model for depth estimation mapping + ("depth_anything", "DepthAnythingForDepthEstimation"), + ("depth_pro", "DepthProForDepthEstimation"), + ("dpt", "DPTForDepthEstimation"), + ("glpn", "GLPNForDepthEstimation"), + ("prompt_depth_anything", "PromptDepthAnythingForDepthEstimation"), + ("zoedepth", "ZoeDepthForDepthEstimation"), + ] +) +MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES = OrderedDict( + [ + # Model for Seq2Seq Causal LM mapping + ("bart", "BartForConditionalGeneration"), + ("bigbird_pegasus", "BigBirdPegasusForConditionalGeneration"), + ("blenderbot", "BlenderbotForConditionalGeneration"), + ("blenderbot-small", "BlenderbotSmallForConditionalGeneration"), + ("encoder-decoder", "EncoderDecoderModel"), + ("fsmt", "FSMTForConditionalGeneration"), + ("gptsan-japanese", "GPTSanJapaneseForConditionalGeneration"), + ("granite_speech", "GraniteSpeechForConditionalGeneration"), + ("led", "LEDForConditionalGeneration"), + ("longt5", "LongT5ForConditionalGeneration"), + ("m2m_100", "M2M100ForConditionalGeneration"), + ("marian", "MarianMTModel"), + ("mbart", "MBartForConditionalGeneration"), + ("mt5", "MT5ForConditionalGeneration"), + ("mvp", "MvpForConditionalGeneration"), + ("nllb-moe", "NllbMoeForConditionalGeneration"), + ("pegasus", "PegasusForConditionalGeneration"), + ("pegasus_x", "PegasusXForConditionalGeneration"), + ("plbart", "PLBartForConditionalGeneration"), + ("prophetnet", "ProphetNetForConditionalGeneration"), + ("qwen2_audio", "Qwen2AudioForConditionalGeneration"), + ("seamless_m4t", "SeamlessM4TForTextToText"), + ("seamless_m4t_v2", "SeamlessM4Tv2ForTextToText"), + ("switch_transformers", "SwitchTransformersForConditionalGeneration"), + ("t5", "T5ForConditionalGeneration"), + ("umt5", "UMT5ForConditionalGeneration"), + ("xlm-prophetnet", "XLMProphetNetForConditionalGeneration"), + ] +) + +MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES = OrderedDict( + [ + ("granite_speech", "GraniteSpeechForConditionalGeneration"), + ("moonshine", "MoonshineForConditionalGeneration"), + ("pop2piano", "Pop2PianoForConditionalGeneration"), + ("seamless_m4t", "SeamlessM4TForSpeechToText"), + ("seamless_m4t_v2", "SeamlessM4Tv2ForSpeechToText"), + ("speech-encoder-decoder", "SpeechEncoderDecoderModel"), + ("speech_to_text", "Speech2TextForConditionalGeneration"), + ("speecht5", "SpeechT5ForSpeechToText"), + ("whisper", "WhisperForConditionalGeneration"), + ] +) + +MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = OrderedDict( + [ + # Model for Sequence Classification mapping + ("albert", "AlbertForSequenceClassification"), + ("bart", "BartForSequenceClassification"), + ("bert", "BertForSequenceClassification"), + ("big_bird", "BigBirdForSequenceClassification"), + ("bigbird_pegasus", "BigBirdPegasusForSequenceClassification"), + ("biogpt", "BioGptForSequenceClassification"), + ("bloom", "BloomForSequenceClassification"), + ("camembert", "CamembertForSequenceClassification"), + ("canine", "CanineForSequenceClassification"), + ("code_llama", "LlamaForSequenceClassification"), + ("convbert", "ConvBertForSequenceClassification"), + ("ctrl", "CTRLForSequenceClassification"), + ("data2vec-text", "Data2VecTextForSequenceClassification"), + ("deberta", "DebertaForSequenceClassification"), + ("deberta-v2", "DebertaV2ForSequenceClassification"), + ("diffllama", "DiffLlamaForSequenceClassification"), + ("distilbert", "DistilBertForSequenceClassification"), + ("electra", "ElectraForSequenceClassification"), + ("ernie", "ErnieForSequenceClassification"), + ("ernie_m", "ErnieMForSequenceClassification"), + ("esm", "EsmForSequenceClassification"), + ("falcon", "FalconForSequenceClassification"), + ("flaubert", "FlaubertForSequenceClassification"), + ("fnet", "FNetForSequenceClassification"), + ("funnel", "FunnelForSequenceClassification"), + ("gemma", "GemmaForSequenceClassification"), + ("gemma2", "Gemma2ForSequenceClassification"), + ("glm", "GlmForSequenceClassification"), + ("glm4", "Glm4ForSequenceClassification"), + ("gpt-sw3", "GPT2ForSequenceClassification"), + ("gpt2", "GPT2ForSequenceClassification"), + ("gpt_bigcode", "GPTBigCodeForSequenceClassification"), + ("gpt_neo", "GPTNeoForSequenceClassification"), + ("gpt_neox", "GPTNeoXForSequenceClassification"), + ("gptj", "GPTJForSequenceClassification"), + ("helium", "HeliumForSequenceClassification"), + ("ibert", "IBertForSequenceClassification"), + ("jamba", "JambaForSequenceClassification"), + ("jetmoe", "JetMoeForSequenceClassification"), + ("layoutlm", "LayoutLMForSequenceClassification"), + ("layoutlmv2", "LayoutLMv2ForSequenceClassification"), + ("layoutlmv3", "LayoutLMv3ForSequenceClassification"), + ("led", "LEDForSequenceClassification"), + ("lilt", "LiltForSequenceClassification"), + ("llama", "LlamaForSequenceClassification"), + ("longformer", "LongformerForSequenceClassification"), + ("luke", "LukeForSequenceClassification"), + ("markuplm", "MarkupLMForSequenceClassification"), + ("mbart", "MBartForSequenceClassification"), + ("mega", "MegaForSequenceClassification"), + ("megatron-bert", "MegatronBertForSequenceClassification"), + ("mistral", "MistralForSequenceClassification"), + ("mixtral", "MixtralForSequenceClassification"), + ("mobilebert", "MobileBertForSequenceClassification"), + ("modernbert", "ModernBertForSequenceClassification"), + ("mpnet", "MPNetForSequenceClassification"), + ("mpt", "MptForSequenceClassification"), + ("mra", "MraForSequenceClassification"), + ("mt5", "MT5ForSequenceClassification"), + ("mvp", "MvpForSequenceClassification"), + ("nemotron", "NemotronForSequenceClassification"), + ("nezha", "NezhaForSequenceClassification"), + ("nystromformer", "NystromformerForSequenceClassification"), + ("open-llama", "OpenLlamaForSequenceClassification"), + ("openai-gpt", "OpenAIGPTForSequenceClassification"), + ("opt", "OPTForSequenceClassification"), + ("perceiver", "PerceiverForSequenceClassification"), + ("persimmon", "PersimmonForSequenceClassification"), + ("phi", "PhiForSequenceClassification"), + ("phi3", "Phi3ForSequenceClassification"), + ("phimoe", "PhimoeForSequenceClassification"), + ("plbart", "PLBartForSequenceClassification"), + ("qdqbert", "QDQBertForSequenceClassification"), + ("qwen2", "Qwen2ForSequenceClassification"), + ("qwen2_moe", "Qwen2MoeForSequenceClassification"), + ("qwen3", "Qwen3ForSequenceClassification"), + ("qwen3_moe", "Qwen3MoeForSequenceClassification"), + ("reformer", "ReformerForSequenceClassification"), + ("rembert", "RemBertForSequenceClassification"), + ("roberta", "RobertaForSequenceClassification"), + ("roberta-prelayernorm", "RobertaPreLayerNormForSequenceClassification"), + ("roc_bert", "RoCBertForSequenceClassification"), + ("roformer", "RoFormerForSequenceClassification"), + ("squeezebert", "SqueezeBertForSequenceClassification"), + ("stablelm", "StableLmForSequenceClassification"), + ("starcoder2", "Starcoder2ForSequenceClassification"), + ("t5", "T5ForSequenceClassification"), + ("tapas", "TapasForSequenceClassification"), + ("transfo-xl", "TransfoXLForSequenceClassification"), + ("umt5", "UMT5ForSequenceClassification"), + ("xlm", "XLMForSequenceClassification"), + ("xlm-roberta", "XLMRobertaForSequenceClassification"), + ("xlm-roberta-xl", "XLMRobertaXLForSequenceClassification"), + ("xlnet", "XLNetForSequenceClassification"), + ("xmod", "XmodForSequenceClassification"), + ("yoso", "YosoForSequenceClassification"), + ("zamba", "ZambaForSequenceClassification"), + ("zamba2", "Zamba2ForSequenceClassification"), + ] +) + +MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict( + [ + # Model for Question Answering mapping + ("albert", "AlbertForQuestionAnswering"), + ("bart", "BartForQuestionAnswering"), + ("bert", "BertForQuestionAnswering"), + ("big_bird", "BigBirdForQuestionAnswering"), + ("bigbird_pegasus", "BigBirdPegasusForQuestionAnswering"), + ("bloom", "BloomForQuestionAnswering"), + ("camembert", "CamembertForQuestionAnswering"), + ("canine", "CanineForQuestionAnswering"), + ("convbert", "ConvBertForQuestionAnswering"), + ("data2vec-text", "Data2VecTextForQuestionAnswering"), + ("deberta", "DebertaForQuestionAnswering"), + ("deberta-v2", "DebertaV2ForQuestionAnswering"), + ("diffllama", "DiffLlamaForQuestionAnswering"), + ("distilbert", "DistilBertForQuestionAnswering"), + ("electra", "ElectraForQuestionAnswering"), + ("ernie", "ErnieForQuestionAnswering"), + ("ernie_m", "ErnieMForQuestionAnswering"), + ("falcon", "FalconForQuestionAnswering"), + ("flaubert", "FlaubertForQuestionAnsweringSimple"), + ("fnet", "FNetForQuestionAnswering"), + ("funnel", "FunnelForQuestionAnswering"), + ("gpt2", "GPT2ForQuestionAnswering"), + ("gpt_neo", "GPTNeoForQuestionAnswering"), + ("gpt_neox", "GPTNeoXForQuestionAnswering"), + ("gptj", "GPTJForQuestionAnswering"), + ("ibert", "IBertForQuestionAnswering"), + ("layoutlmv2", "LayoutLMv2ForQuestionAnswering"), + ("layoutlmv3", "LayoutLMv3ForQuestionAnswering"), + ("led", "LEDForQuestionAnswering"), + ("lilt", "LiltForQuestionAnswering"), + ("llama", "LlamaForQuestionAnswering"), + ("longformer", "LongformerForQuestionAnswering"), + ("luke", "LukeForQuestionAnswering"), + ("lxmert", "LxmertForQuestionAnswering"), + ("markuplm", "MarkupLMForQuestionAnswering"), + ("mbart", "MBartForQuestionAnswering"), + ("mega", "MegaForQuestionAnswering"), + ("megatron-bert", "MegatronBertForQuestionAnswering"), + ("mistral", "MistralForQuestionAnswering"), + ("mixtral", "MixtralForQuestionAnswering"), + ("mobilebert", "MobileBertForQuestionAnswering"), + ("modernbert", "ModernBertForQuestionAnswering"), + ("mpnet", "MPNetForQuestionAnswering"), + ("mpt", "MptForQuestionAnswering"), + ("mra", "MraForQuestionAnswering"), + ("mt5", "MT5ForQuestionAnswering"), + ("mvp", "MvpForQuestionAnswering"), + ("nemotron", "NemotronForQuestionAnswering"), + ("nezha", "NezhaForQuestionAnswering"), + ("nystromformer", "NystromformerForQuestionAnswering"), + ("opt", "OPTForQuestionAnswering"), + ("qdqbert", "QDQBertForQuestionAnswering"), + ("qwen2", "Qwen2ForQuestionAnswering"), + ("qwen2_moe", "Qwen2MoeForQuestionAnswering"), + ("qwen3", "Qwen3ForQuestionAnswering"), + ("qwen3_moe", "Qwen3MoeForQuestionAnswering"), + ("reformer", "ReformerForQuestionAnswering"), + ("rembert", "RemBertForQuestionAnswering"), + ("roberta", "RobertaForQuestionAnswering"), + ("roberta-prelayernorm", "RobertaPreLayerNormForQuestionAnswering"), + ("roc_bert", "RoCBertForQuestionAnswering"), + ("roformer", "RoFormerForQuestionAnswering"), + ("splinter", "SplinterForQuestionAnswering"), + ("squeezebert", "SqueezeBertForQuestionAnswering"), + ("t5", "T5ForQuestionAnswering"), + ("umt5", "UMT5ForQuestionAnswering"), + ("xlm", "XLMForQuestionAnsweringSimple"), + ("xlm-roberta", "XLMRobertaForQuestionAnswering"), + ("xlm-roberta-xl", "XLMRobertaXLForQuestionAnswering"), + ("xlnet", "XLNetForQuestionAnsweringSimple"), + ("xmod", "XmodForQuestionAnswering"), + ("yoso", "YosoForQuestionAnswering"), + ] +) + +MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict( + [ + # Model for Table Question Answering mapping + ("tapas", "TapasForQuestionAnswering"), + ] +) + +MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict( + [ + ("blip", "BlipForQuestionAnswering"), + ("blip-2", "Blip2ForConditionalGeneration"), + ("vilt", "ViltForQuestionAnswering"), + ] +) + +MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict( + [ + ("layoutlm", "LayoutLMForQuestionAnswering"), + ("layoutlmv2", "LayoutLMv2ForQuestionAnswering"), + ("layoutlmv3", "LayoutLMv3ForQuestionAnswering"), + ] +) + +MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = OrderedDict( + [ + # Model for Token Classification mapping + ("albert", "AlbertForTokenClassification"), + ("bert", "BertForTokenClassification"), + ("big_bird", "BigBirdForTokenClassification"), + ("biogpt", "BioGptForTokenClassification"), + ("bloom", "BloomForTokenClassification"), + ("bros", "BrosForTokenClassification"), + ("camembert", "CamembertForTokenClassification"), + ("canine", "CanineForTokenClassification"), + ("convbert", "ConvBertForTokenClassification"), + ("data2vec-text", "Data2VecTextForTokenClassification"), + ("deberta", "DebertaForTokenClassification"), + ("deberta-v2", "DebertaV2ForTokenClassification"), + ("diffllama", "DiffLlamaForTokenClassification"), + ("distilbert", "DistilBertForTokenClassification"), + ("electra", "ElectraForTokenClassification"), + ("ernie", "ErnieForTokenClassification"), + ("ernie_m", "ErnieMForTokenClassification"), + ("esm", "EsmForTokenClassification"), + ("falcon", "FalconForTokenClassification"), + ("flaubert", "FlaubertForTokenClassification"), + ("fnet", "FNetForTokenClassification"), + ("funnel", "FunnelForTokenClassification"), + ("gemma", "GemmaForTokenClassification"), + ("gemma2", "Gemma2ForTokenClassification"), + ("glm", "GlmForTokenClassification"), + ("glm4", "Glm4ForTokenClassification"), + ("gpt-sw3", "GPT2ForTokenClassification"), + ("gpt2", "GPT2ForTokenClassification"), + ("gpt_bigcode", "GPTBigCodeForTokenClassification"), + ("gpt_neo", "GPTNeoForTokenClassification"), + ("gpt_neox", "GPTNeoXForTokenClassification"), + ("helium", "HeliumForTokenClassification"), + ("ibert", "IBertForTokenClassification"), + ("layoutlm", "LayoutLMForTokenClassification"), + ("layoutlmv2", "LayoutLMv2ForTokenClassification"), + ("layoutlmv3", "LayoutLMv3ForTokenClassification"), + ("lilt", "LiltForTokenClassification"), + ("llama", "LlamaForTokenClassification"), + ("longformer", "LongformerForTokenClassification"), + ("luke", "LukeForTokenClassification"), + ("markuplm", "MarkupLMForTokenClassification"), + ("mega", "MegaForTokenClassification"), + ("megatron-bert", "MegatronBertForTokenClassification"), + ("mistral", "MistralForTokenClassification"), + ("mixtral", "MixtralForTokenClassification"), + ("mobilebert", "MobileBertForTokenClassification"), + ("modernbert", "ModernBertForTokenClassification"), + ("mpnet", "MPNetForTokenClassification"), + ("mpt", "MptForTokenClassification"), + ("mra", "MraForTokenClassification"), + ("mt5", "MT5ForTokenClassification"), + ("nemotron", "NemotronForTokenClassification"), + ("nezha", "NezhaForTokenClassification"), + ("nystromformer", "NystromformerForTokenClassification"), + ("persimmon", "PersimmonForTokenClassification"), + ("phi", "PhiForTokenClassification"), + ("phi3", "Phi3ForTokenClassification"), + ("qdqbert", "QDQBertForTokenClassification"), + ("qwen2", "Qwen2ForTokenClassification"), + ("qwen2_moe", "Qwen2MoeForTokenClassification"), + ("qwen3", "Qwen3ForTokenClassification"), + ("qwen3_moe", "Qwen3MoeForTokenClassification"), + ("rembert", "RemBertForTokenClassification"), + ("roberta", "RobertaForTokenClassification"), + ("roberta-prelayernorm", "RobertaPreLayerNormForTokenClassification"), + ("roc_bert", "RoCBertForTokenClassification"), + ("roformer", "RoFormerForTokenClassification"), + ("squeezebert", "SqueezeBertForTokenClassification"), + ("stablelm", "StableLmForTokenClassification"), + ("starcoder2", "Starcoder2ForTokenClassification"), + ("t5", "T5ForTokenClassification"), + ("umt5", "UMT5ForTokenClassification"), + ("xlm", "XLMForTokenClassification"), + ("xlm-roberta", "XLMRobertaForTokenClassification"), + ("xlm-roberta-xl", "XLMRobertaXLForTokenClassification"), + ("xlnet", "XLNetForTokenClassification"), + ("xmod", "XmodForTokenClassification"), + ("yoso", "YosoForTokenClassification"), + ] +) + +MODEL_FOR_MULTIPLE_CHOICE_MAPPING_NAMES = OrderedDict( + [ + # Model for Multiple Choice mapping + ("albert", "AlbertForMultipleChoice"), + ("bert", "BertForMultipleChoice"), + ("big_bird", "BigBirdForMultipleChoice"), + ("camembert", "CamembertForMultipleChoice"), + ("canine", "CanineForMultipleChoice"), + ("convbert", "ConvBertForMultipleChoice"), + ("data2vec-text", "Data2VecTextForMultipleChoice"), + ("deberta-v2", "DebertaV2ForMultipleChoice"), + ("distilbert", "DistilBertForMultipleChoice"), + ("electra", "ElectraForMultipleChoice"), + ("ernie", "ErnieForMultipleChoice"), + ("ernie_m", "ErnieMForMultipleChoice"), + ("flaubert", "FlaubertForMultipleChoice"), + ("fnet", "FNetForMultipleChoice"), + ("funnel", "FunnelForMultipleChoice"), + ("ibert", "IBertForMultipleChoice"), + ("longformer", "LongformerForMultipleChoice"), + ("luke", "LukeForMultipleChoice"), + ("mega", "MegaForMultipleChoice"), + ("megatron-bert", "MegatronBertForMultipleChoice"), + ("mobilebert", "MobileBertForMultipleChoice"), + ("mpnet", "MPNetForMultipleChoice"), + ("mra", "MraForMultipleChoice"), + ("nezha", "NezhaForMultipleChoice"), + ("nystromformer", "NystromformerForMultipleChoice"), + ("qdqbert", "QDQBertForMultipleChoice"), + ("rembert", "RemBertForMultipleChoice"), + ("roberta", "RobertaForMultipleChoice"), + ("roberta-prelayernorm", "RobertaPreLayerNormForMultipleChoice"), + ("roc_bert", "RoCBertForMultipleChoice"), + ("roformer", "RoFormerForMultipleChoice"), + ("squeezebert", "SqueezeBertForMultipleChoice"), + ("xlm", "XLMForMultipleChoice"), + ("xlm-roberta", "XLMRobertaForMultipleChoice"), + ("xlm-roberta-xl", "XLMRobertaXLForMultipleChoice"), + ("xlnet", "XLNetForMultipleChoice"), + ("xmod", "XmodForMultipleChoice"), + ("yoso", "YosoForMultipleChoice"), + ] +) + +MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING_NAMES = OrderedDict( + [ + ("bert", "BertForNextSentencePrediction"), + ("ernie", "ErnieForNextSentencePrediction"), + ("fnet", "FNetForNextSentencePrediction"), + ("megatron-bert", "MegatronBertForNextSentencePrediction"), + ("mobilebert", "MobileBertForNextSentencePrediction"), + ("nezha", "NezhaForNextSentencePrediction"), + ("qdqbert", "QDQBertForNextSentencePrediction"), + ] +) + +MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES = OrderedDict( + [ + # Model for Audio Classification mapping + ("audio-spectrogram-transformer", "ASTForAudioClassification"), + ("data2vec-audio", "Data2VecAudioForSequenceClassification"), + ("hubert", "HubertForSequenceClassification"), + ("sew", "SEWForSequenceClassification"), + ("sew-d", "SEWDForSequenceClassification"), + ("unispeech", "UniSpeechForSequenceClassification"), + ("unispeech-sat", "UniSpeechSatForSequenceClassification"), + ("wav2vec2", "Wav2Vec2ForSequenceClassification"), + ("wav2vec2-bert", "Wav2Vec2BertForSequenceClassification"), + ("wav2vec2-conformer", "Wav2Vec2ConformerForSequenceClassification"), + ("wavlm", "WavLMForSequenceClassification"), + ("whisper", "WhisperForAudioClassification"), + ] +) + +MODEL_FOR_CTC_MAPPING_NAMES = OrderedDict( + [ + # Model for Connectionist temporal classification (CTC) mapping + ("data2vec-audio", "Data2VecAudioForCTC"), + ("hubert", "HubertForCTC"), + ("mctct", "MCTCTForCTC"), + ("sew", "SEWForCTC"), + ("sew-d", "SEWDForCTC"), + ("unispeech", "UniSpeechForCTC"), + ("unispeech-sat", "UniSpeechSatForCTC"), + ("wav2vec2", "Wav2Vec2ForCTC"), + ("wav2vec2-bert", "Wav2Vec2BertForCTC"), + ("wav2vec2-conformer", "Wav2Vec2ConformerForCTC"), + ("wavlm", "WavLMForCTC"), + ] +) + +MODEL_FOR_AUDIO_FRAME_CLASSIFICATION_MAPPING_NAMES = OrderedDict( + [ + # Model for Audio Classification mapping + ("data2vec-audio", "Data2VecAudioForAudioFrameClassification"), + ("unispeech-sat", "UniSpeechSatForAudioFrameClassification"), + ("wav2vec2", "Wav2Vec2ForAudioFrameClassification"), + ("wav2vec2-bert", "Wav2Vec2BertForAudioFrameClassification"), + ("wav2vec2-conformer", "Wav2Vec2ConformerForAudioFrameClassification"), + ("wavlm", "WavLMForAudioFrameClassification"), + ] +) + +MODEL_FOR_AUDIO_XVECTOR_MAPPING_NAMES = OrderedDict( + [ + # Model for Audio Classification mapping + ("data2vec-audio", "Data2VecAudioForXVector"), + ("unispeech-sat", "UniSpeechSatForXVector"), + ("wav2vec2", "Wav2Vec2ForXVector"), + ("wav2vec2-bert", "Wav2Vec2BertForXVector"), + ("wav2vec2-conformer", "Wav2Vec2ConformerForXVector"), + ("wavlm", "WavLMForXVector"), + ] +) + +MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES = OrderedDict( + [ + # Model for Text-To-Spectrogram mapping + ("fastspeech2_conformer", "FastSpeech2ConformerModel"), + ("speecht5", "SpeechT5ForTextToSpeech"), + ] +) + +MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES = OrderedDict( + [ + # Model for Text-To-Waveform mapping + ("bark", "BarkModel"), + ("fastspeech2_conformer", "FastSpeech2ConformerWithHifiGan"), + ("musicgen", "MusicgenForConditionalGeneration"), + ("musicgen_melody", "MusicgenMelodyForConditionalGeneration"), + ("qwen2_5_omni", "Qwen2_5OmniForConditionalGeneration"), + ("seamless_m4t", "SeamlessM4TForTextToSpeech"), + ("seamless_m4t_v2", "SeamlessM4Tv2ForTextToSpeech"), + ("vits", "VitsModel"), + ] +) + +MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING_NAMES = OrderedDict( + [ + # Model for Zero Shot Image Classification mapping + ("align", "AlignModel"), + ("altclip", "AltCLIPModel"), + ("blip", "BlipModel"), + ("blip-2", "Blip2ForImageTextRetrieval"), + ("chinese_clip", "ChineseCLIPModel"), + ("clip", "CLIPModel"), + ("clipseg", "CLIPSegModel"), + ("siglip", "SiglipModel"), + ("siglip2", "Siglip2Model"), + ] +) + +MODEL_FOR_BACKBONE_MAPPING_NAMES = OrderedDict( + [ + # Backbone mapping + ("beit", "BeitBackbone"), + ("bit", "BitBackbone"), + ("convnext", "ConvNextBackbone"), + ("convnextv2", "ConvNextV2Backbone"), + ("dinat", "DinatBackbone"), + ("dinov2", "Dinov2Backbone"), + ("dinov2_with_registers", "Dinov2WithRegistersBackbone"), + ("focalnet", "FocalNetBackbone"), + ("hiera", "HieraBackbone"), + ("maskformer-swin", "MaskFormerSwinBackbone"), + ("nat", "NatBackbone"), + ("pvt_v2", "PvtV2Backbone"), + ("resnet", "ResNetBackbone"), + ("rt_detr_resnet", "RTDetrResNetBackbone"), + ("swin", "SwinBackbone"), + ("swinv2", "Swinv2Backbone"), + ("textnet", "TextNetBackbone"), + ("timm_backbone", "TimmBackbone"), + ("vitdet", "VitDetBackbone"), + ("vitpose_backbone", "VitPoseBackbone"), + ] +) + +MODEL_FOR_MASK_GENERATION_MAPPING_NAMES = OrderedDict( + [ + ("sam", "SamModel"), + ] +) + + +MODEL_FOR_KEYPOINT_DETECTION_MAPPING_NAMES = OrderedDict( + [ + ("superpoint", "SuperPointForKeypointDetection"), + ] +) + + +MODEL_FOR_TEXT_ENCODING_MAPPING_NAMES = OrderedDict( + [ + ("albert", "AlbertModel"), + ("bert", "BertModel"), + ("big_bird", "BigBirdModel"), + ("clip_text_model", "CLIPTextModel"), + ("data2vec-text", "Data2VecTextModel"), + ("deberta", "DebertaModel"), + ("deberta-v2", "DebertaV2Model"), + ("distilbert", "DistilBertModel"), + ("electra", "ElectraModel"), + ("emu3", "Emu3TextModel"), + ("flaubert", "FlaubertModel"), + ("ibert", "IBertModel"), + ("llama4", "Llama4TextModel"), + ("longformer", "LongformerModel"), + ("mllama", "MllamaTextModel"), + ("mobilebert", "MobileBertModel"), + ("mt5", "MT5EncoderModel"), + ("nystromformer", "NystromformerModel"), + ("reformer", "ReformerModel"), + ("rembert", "RemBertModel"), + ("roberta", "RobertaModel"), + ("roberta-prelayernorm", "RobertaPreLayerNormModel"), + ("roc_bert", "RoCBertModel"), + ("roformer", "RoFormerModel"), + ("squeezebert", "SqueezeBertModel"), + ("t5", "T5EncoderModel"), + ("umt5", "UMT5EncoderModel"), + ("xlm", "XLMModel"), + ("xlm-roberta", "XLMRobertaModel"), + ("xlm-roberta-xl", "XLMRobertaXLModel"), + ] +) + +MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING_NAMES = OrderedDict( + [ + ("patchtsmixer", "PatchTSMixerForTimeSeriesClassification"), + ("patchtst", "PatchTSTForClassification"), + ] +) + +MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING_NAMES = OrderedDict( + [ + ("patchtsmixer", "PatchTSMixerForRegression"), + ("patchtst", "PatchTSTForRegression"), + ] +) + +MODEL_FOR_TIME_SERIES_PREDICTION_MAPPING_NAMES = OrderedDict( + [ + ("timesfm", "TimesFmModelForPrediction"), + ] +) + +MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES = OrderedDict( + [ + ("swin2sr", "Swin2SRForImageSuperResolution"), + ] +) + +MODEL_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_MAPPING_NAMES) +MODEL_FOR_PRETRAINING_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_PRETRAINING_MAPPING_NAMES) +MODEL_WITH_LM_HEAD_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_WITH_LM_HEAD_MAPPING_NAMES) +MODEL_FOR_CAUSAL_LM_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_CAUSAL_LM_MAPPING_NAMES) +MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING_NAMES +) +MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES +) +MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING_NAMES +) +MODEL_FOR_IMAGE_SEGMENTATION_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES +) +MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES +) +MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING_NAMES +) +MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES +) +MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING_NAMES +) +MODEL_FOR_VISION_2_SEQ_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES) +MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES +) +MODEL_FOR_RETRIEVAL_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_RETRIEVAL_MAPPING_NAMES) +MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING_NAMES +) +MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES +) +MODEL_FOR_MASKED_LM_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_MASKED_LM_MAPPING_NAMES) +MODEL_FOR_IMAGE_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_IMAGE_MAPPING_NAMES) +MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING_NAMES +) +MODEL_FOR_OBJECT_DETECTION_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES) +MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES +) +MODEL_FOR_DEPTH_ESTIMATION_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES) +MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES +) +MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES +) +MODEL_FOR_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES +) +MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING_NAMES +) +MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES +) +MODEL_FOR_MULTIPLE_CHOICE_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_MULTIPLE_CHOICE_MAPPING_NAMES) +MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING_NAMES +) +MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES +) +MODEL_FOR_CTC_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_CTC_MAPPING_NAMES) +MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES) +MODEL_FOR_AUDIO_FRAME_CLASSIFICATION_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, MODEL_FOR_AUDIO_FRAME_CLASSIFICATION_MAPPING_NAMES +) +MODEL_FOR_AUDIO_XVECTOR_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_AUDIO_XVECTOR_MAPPING_NAMES) + +MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING_NAMES +) + +MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING_NAMES) + +MODEL_FOR_BACKBONE_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_BACKBONE_MAPPING_NAMES) + +MODEL_FOR_MASK_GENERATION_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_MASK_GENERATION_MAPPING_NAMES) + +MODEL_FOR_KEYPOINT_DETECTION_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, MODEL_FOR_KEYPOINT_DETECTION_MAPPING_NAMES +) + +MODEL_FOR_TEXT_ENCODING_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_TEXT_ENCODING_MAPPING_NAMES) + +MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING_NAMES +) + +MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING_NAMES +) + +MODEL_FOR_TIME_SERIES_PREDICTION_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, MODEL_FOR_TIME_SERIES_PREDICTION_MAPPING_NAMES +) + +MODEL_FOR_IMAGE_TO_IMAGE_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES) + + +class AutoModelForMaskGeneration(_BaseAutoModelClass): + _model_mapping = MODEL_FOR_MASK_GENERATION_MAPPING + + +class AutoModelForKeypointDetection(_BaseAutoModelClass): + _model_mapping = MODEL_FOR_KEYPOINT_DETECTION_MAPPING + + +class AutoModelForTextEncoding(_BaseAutoModelClass): + _model_mapping = MODEL_FOR_TEXT_ENCODING_MAPPING + + +class AutoModelForImageToImage(_BaseAutoModelClass): + _model_mapping = MODEL_FOR_IMAGE_TO_IMAGE_MAPPING + + +class AutoModel(_BaseAutoModelClass): + _model_mapping = MODEL_MAPPING + + +AutoModel = auto_class_update(AutoModel) + + +class AutoModelForPreTraining(_BaseAutoModelClass): + _model_mapping = MODEL_FOR_PRETRAINING_MAPPING + + +AutoModelForPreTraining = auto_class_update(AutoModelForPreTraining, head_doc="pretraining") + + +# Private on purpose, the public class will add the deprecation warnings. +class _AutoModelWithLMHead(_BaseAutoModelClass): + _model_mapping = MODEL_WITH_LM_HEAD_MAPPING + + +_AutoModelWithLMHead = auto_class_update(_AutoModelWithLMHead, head_doc="language modeling") + + +class AutoModelForCausalLM(_BaseAutoModelClass): + _model_mapping = MODEL_FOR_CAUSAL_LM_MAPPING + + +AutoModelForCausalLM = auto_class_update(AutoModelForCausalLM, head_doc="causal language modeling") + + +class AutoModelForMaskedLM(_BaseAutoModelClass): + _model_mapping = MODEL_FOR_MASKED_LM_MAPPING + + +AutoModelForMaskedLM = auto_class_update(AutoModelForMaskedLM, head_doc="masked language modeling") + + +class AutoModelForSeq2SeqLM(_BaseAutoModelClass): + _model_mapping = MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING + + +AutoModelForSeq2SeqLM = auto_class_update( + AutoModelForSeq2SeqLM, + head_doc="sequence-to-sequence language modeling", + checkpoint_for_example="google-t5/t5-base", +) + + +class AutoModelForSequenceClassification(_BaseAutoModelClass): + _model_mapping = MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING + + +AutoModelForSequenceClassification = auto_class_update( + AutoModelForSequenceClassification, head_doc="sequence classification" +) + + +class AutoModelForQuestionAnswering(_BaseAutoModelClass): + _model_mapping = MODEL_FOR_QUESTION_ANSWERING_MAPPING + + +AutoModelForQuestionAnswering = auto_class_update(AutoModelForQuestionAnswering, head_doc="question answering") + + +class AutoModelForTableQuestionAnswering(_BaseAutoModelClass): + _model_mapping = MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING + + +AutoModelForTableQuestionAnswering = auto_class_update( + AutoModelForTableQuestionAnswering, + head_doc="table question answering", + checkpoint_for_example="google/tapas-base-finetuned-wtq", +) + + +class AutoModelForVisualQuestionAnswering(_BaseAutoModelClass): + _model_mapping = MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING + + +AutoModelForVisualQuestionAnswering = auto_class_update( + AutoModelForVisualQuestionAnswering, + head_doc="visual question answering", + checkpoint_for_example="dandelin/vilt-b32-finetuned-vqa", +) + + +class AutoModelForDocumentQuestionAnswering(_BaseAutoModelClass): + _model_mapping = MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING + + +AutoModelForDocumentQuestionAnswering = auto_class_update( + AutoModelForDocumentQuestionAnswering, + head_doc="document question answering", + checkpoint_for_example='impira/layoutlm-document-qa", revision="52e01b3', +) + + +class AutoModelForTokenClassification(_BaseAutoModelClass): + _model_mapping = MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING + + +AutoModelForTokenClassification = auto_class_update(AutoModelForTokenClassification, head_doc="token classification") + + +class AutoModelForMultipleChoice(_BaseAutoModelClass): + _model_mapping = MODEL_FOR_MULTIPLE_CHOICE_MAPPING + + +AutoModelForMultipleChoice = auto_class_update(AutoModelForMultipleChoice, head_doc="multiple choice") + + +class AutoModelForNextSentencePrediction(_BaseAutoModelClass): + _model_mapping = MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING + + +AutoModelForNextSentencePrediction = auto_class_update( + AutoModelForNextSentencePrediction, head_doc="next sentence prediction" +) + + +class AutoModelForImageClassification(_BaseAutoModelClass): + _model_mapping = MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING + + +AutoModelForImageClassification = auto_class_update(AutoModelForImageClassification, head_doc="image classification") + + +class AutoModelForZeroShotImageClassification(_BaseAutoModelClass): + _model_mapping = MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING + + +AutoModelForZeroShotImageClassification = auto_class_update( + AutoModelForZeroShotImageClassification, head_doc="zero-shot image classification" +) + + +class AutoModelForImageSegmentation(_BaseAutoModelClass): + _model_mapping = MODEL_FOR_IMAGE_SEGMENTATION_MAPPING + + +AutoModelForImageSegmentation = auto_class_update(AutoModelForImageSegmentation, head_doc="image segmentation") + + +class AutoModelForSemanticSegmentation(_BaseAutoModelClass): + _model_mapping = MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING + + +AutoModelForSemanticSegmentation = auto_class_update( + AutoModelForSemanticSegmentation, head_doc="semantic segmentation" +) + + +class AutoModelForTimeSeriesPrediction(_BaseAutoModelClass): + _model_mapping = MODEL_FOR_TIME_SERIES_PREDICTION_MAPPING + + +AutoModelForTimeSeriesPrediction = auto_class_update( + AutoModelForTimeSeriesPrediction, head_doc="time-series prediction" +) + + +class AutoModelForUniversalSegmentation(_BaseAutoModelClass): + _model_mapping = MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING + + +AutoModelForUniversalSegmentation = auto_class_update( + AutoModelForUniversalSegmentation, head_doc="universal image segmentation" +) + + +class AutoModelForInstanceSegmentation(_BaseAutoModelClass): + _model_mapping = MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING + + +AutoModelForInstanceSegmentation = auto_class_update( + AutoModelForInstanceSegmentation, head_doc="instance segmentation" +) + + +class AutoModelForObjectDetection(_BaseAutoModelClass): + _model_mapping = MODEL_FOR_OBJECT_DETECTION_MAPPING + + +AutoModelForObjectDetection = auto_class_update(AutoModelForObjectDetection, head_doc="object detection") + + +class AutoModelForZeroShotObjectDetection(_BaseAutoModelClass): + _model_mapping = MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING + + +AutoModelForZeroShotObjectDetection = auto_class_update( + AutoModelForZeroShotObjectDetection, head_doc="zero-shot object detection" +) + + +class AutoModelForDepthEstimation(_BaseAutoModelClass): + _model_mapping = MODEL_FOR_DEPTH_ESTIMATION_MAPPING + + +AutoModelForDepthEstimation = auto_class_update(AutoModelForDepthEstimation, head_doc="depth estimation") + + +class AutoModelForVideoClassification(_BaseAutoModelClass): + _model_mapping = MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING + + +AutoModelForVideoClassification = auto_class_update(AutoModelForVideoClassification, head_doc="video classification") + + +class AutoModelForVision2Seq(_BaseAutoModelClass): + _model_mapping = MODEL_FOR_VISION_2_SEQ_MAPPING + + +AutoModelForVision2Seq = auto_class_update(AutoModelForVision2Seq, head_doc="vision-to-text modeling") + + +class AutoModelForImageTextToText(_BaseAutoModelClass): + _model_mapping = MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING + + +AutoModelForImageTextToText = auto_class_update(AutoModelForImageTextToText, head_doc="image-text-to-text modeling") + + +class AutoModelForAudioClassification(_BaseAutoModelClass): + _model_mapping = MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING + + +AutoModelForAudioClassification = auto_class_update(AutoModelForAudioClassification, head_doc="audio classification") + + +class AutoModelForCTC(_BaseAutoModelClass): + _model_mapping = MODEL_FOR_CTC_MAPPING + + +AutoModelForCTC = auto_class_update(AutoModelForCTC, head_doc="connectionist temporal classification") + + +class AutoModelForSpeechSeq2Seq(_BaseAutoModelClass): + _model_mapping = MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING + + +AutoModelForSpeechSeq2Seq = auto_class_update( + AutoModelForSpeechSeq2Seq, head_doc="sequence-to-sequence speech-to-text modeling" +) + + +class AutoModelForAudioFrameClassification(_BaseAutoModelClass): + _model_mapping = MODEL_FOR_AUDIO_FRAME_CLASSIFICATION_MAPPING + + +AutoModelForAudioFrameClassification = auto_class_update( + AutoModelForAudioFrameClassification, head_doc="audio frame (token) classification" +) + + +class AutoModelForAudioXVector(_BaseAutoModelClass): + _model_mapping = MODEL_FOR_AUDIO_XVECTOR_MAPPING + + +class AutoModelForTextToSpectrogram(_BaseAutoModelClass): + _model_mapping = MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING + + +class AutoModelForTextToWaveform(_BaseAutoModelClass): + _model_mapping = MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING + + +class AutoBackbone(_BaseAutoBackboneClass): + _model_mapping = MODEL_FOR_BACKBONE_MAPPING + + +AutoModelForAudioXVector = auto_class_update(AutoModelForAudioXVector, head_doc="audio retrieval via x-vector") + + +class AutoModelForMaskedImageModeling(_BaseAutoModelClass): + _model_mapping = MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING + + +AutoModelForMaskedImageModeling = auto_class_update(AutoModelForMaskedImageModeling, head_doc="masked image modeling") + + +class AutoModelWithLMHead(_AutoModelWithLMHead): + @classmethod + def from_config(cls, config): + warnings.warn( + "The class `AutoModelWithLMHead` is deprecated and will be removed in a future version. Please use " + "`AutoModelForCausalLM` for causal language models, `AutoModelForMaskedLM` for masked language models and " + "`AutoModelForSeq2SeqLM` for encoder-decoder models.", + FutureWarning, + ) + return super().from_config(config) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + warnings.warn( + "The class `AutoModelWithLMHead` is deprecated and will be removed in a future version. Please use " + "`AutoModelForCausalLM` for causal language models, `AutoModelForMaskedLM` for masked language models and " + "`AutoModelForSeq2SeqLM` for encoder-decoder models.", + FutureWarning, + ) + return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + + +__all__ = [ + "MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING", + "MODEL_FOR_AUDIO_FRAME_CLASSIFICATION_MAPPING", + "MODEL_FOR_AUDIO_XVECTOR_MAPPING", + "MODEL_FOR_BACKBONE_MAPPING", + "MODEL_FOR_CAUSAL_IMAGE_MODELING_MAPPING", + "MODEL_FOR_CAUSAL_LM_MAPPING", + "MODEL_FOR_CTC_MAPPING", + "MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING", + "MODEL_FOR_DEPTH_ESTIMATION_MAPPING", + "MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING", + "MODEL_FOR_IMAGE_MAPPING", + "MODEL_FOR_IMAGE_SEGMENTATION_MAPPING", + "MODEL_FOR_IMAGE_TO_IMAGE_MAPPING", + "MODEL_FOR_KEYPOINT_DETECTION_MAPPING", + "MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING", + "MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING", + "MODEL_FOR_MASKED_LM_MAPPING", + "MODEL_FOR_MASK_GENERATION_MAPPING", + "MODEL_FOR_MULTIPLE_CHOICE_MAPPING", + "MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING", + "MODEL_FOR_OBJECT_DETECTION_MAPPING", + "MODEL_FOR_PRETRAINING_MAPPING", + "MODEL_FOR_QUESTION_ANSWERING_MAPPING", + "MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING", + "MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING", + "MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING", + "MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING", + "MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING", + "MODEL_FOR_TEXT_ENCODING_MAPPING", + "MODEL_FOR_TEXT_TO_WAVEFORM_MAPPING", + "MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING", + "MODEL_FOR_TIME_SERIES_PREDICTION_MAPPING", + "MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING", + "MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING", + "MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING", + "MODEL_FOR_VISION_2_SEQ_MAPPING", + "MODEL_FOR_RETRIEVAL_MAPPING", + "MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING", + "MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING", + "MODEL_MAPPING", + "MODEL_WITH_LM_HEAD_MAPPING", + "MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING", + "MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING", + "MODEL_FOR_TIME_SERIES_CLASSIFICATION_MAPPING", + "MODEL_FOR_TIME_SERIES_REGRESSION_MAPPING", + "AutoModel", + "AutoBackbone", + "AutoModelForAudioClassification", + "AutoModelForAudioFrameClassification", + "AutoModelForAudioXVector", + "AutoModelForCausalLM", + "AutoModelForCTC", + "AutoModelForDepthEstimation", + "AutoModelForImageClassification", + "AutoModelForImageSegmentation", + "AutoModelForImageToImage", + "AutoModelForInstanceSegmentation", + "AutoModelForKeypointDetection", + "AutoModelForMaskGeneration", + "AutoModelForTextEncoding", + "AutoModelForMaskedImageModeling", + "AutoModelForMaskedLM", + "AutoModelForMultipleChoice", + "AutoModelForNextSentencePrediction", + "AutoModelForObjectDetection", + "AutoModelForPreTraining", + "AutoModelForQuestionAnswering", + "AutoModelForSemanticSegmentation", + "AutoModelForSeq2SeqLM", + "AutoModelForSequenceClassification", + "AutoModelForSpeechSeq2Seq", + "AutoModelForTableQuestionAnswering", + "AutoModelForTextToSpectrogram", + "AutoModelForTextToWaveform", + "AutoModelForTokenClassification", + "AutoModelForUniversalSegmentation", + "AutoModelForVideoClassification", + "AutoModelForVision2Seq", + "AutoModelForVisualQuestionAnswering", + "AutoModelForDocumentQuestionAnswering", + "AutoModelWithLMHead", + "AutoModelForZeroShotImageClassification", + "AutoModelForZeroShotObjectDetection", + "AutoModelForImageTextToText", +] diff --git a/docs/transformers/build/lib/transformers/models/auto/modeling_flax_auto.py b/docs/transformers/build/lib/transformers/models/auto/modeling_flax_auto.py new file mode 100644 index 0000000000000000000000000000000000000000..0588d03cb6cdb43b94cc3fcd73b1791d1a5ee809 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/auto/modeling_flax_auto.py @@ -0,0 +1,413 @@ +# coding=utf-8 +# Copyright 2018 The Google Flax Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Auto Model class.""" + +from collections import OrderedDict + +from ...utils import logging +from .auto_factory import _BaseAutoModelClass, _LazyAutoMapping, auto_class_update +from .configuration_auto import CONFIG_MAPPING_NAMES + + +logger = logging.get_logger(__name__) + + +FLAX_MODEL_MAPPING_NAMES = OrderedDict( + [ + # Base model mapping + ("albert", "FlaxAlbertModel"), + ("bart", "FlaxBartModel"), + ("beit", "FlaxBeitModel"), + ("bert", "FlaxBertModel"), + ("big_bird", "FlaxBigBirdModel"), + ("blenderbot", "FlaxBlenderbotModel"), + ("blenderbot-small", "FlaxBlenderbotSmallModel"), + ("bloom", "FlaxBloomModel"), + ("clip", "FlaxCLIPModel"), + ("dinov2", "FlaxDinov2Model"), + ("distilbert", "FlaxDistilBertModel"), + ("electra", "FlaxElectraModel"), + ("gemma", "FlaxGemmaModel"), + ("gpt-sw3", "FlaxGPT2Model"), + ("gpt2", "FlaxGPT2Model"), + ("gpt_neo", "FlaxGPTNeoModel"), + ("gptj", "FlaxGPTJModel"), + ("llama", "FlaxLlamaModel"), + ("longt5", "FlaxLongT5Model"), + ("marian", "FlaxMarianModel"), + ("mbart", "FlaxMBartModel"), + ("mistral", "FlaxMistralModel"), + ("mt5", "FlaxMT5Model"), + ("opt", "FlaxOPTModel"), + ("pegasus", "FlaxPegasusModel"), + ("regnet", "FlaxRegNetModel"), + ("resnet", "FlaxResNetModel"), + ("roberta", "FlaxRobertaModel"), + ("roberta-prelayernorm", "FlaxRobertaPreLayerNormModel"), + ("roformer", "FlaxRoFormerModel"), + ("t5", "FlaxT5Model"), + ("vision-text-dual-encoder", "FlaxVisionTextDualEncoderModel"), + ("vit", "FlaxViTModel"), + ("wav2vec2", "FlaxWav2Vec2Model"), + ("whisper", "FlaxWhisperModel"), + ("xglm", "FlaxXGLMModel"), + ("xlm-roberta", "FlaxXLMRobertaModel"), + ] +) + +FLAX_MODEL_FOR_PRETRAINING_MAPPING_NAMES = OrderedDict( + [ + # Model for pre-training mapping + ("albert", "FlaxAlbertForPreTraining"), + ("bart", "FlaxBartForConditionalGeneration"), + ("bert", "FlaxBertForPreTraining"), + ("big_bird", "FlaxBigBirdForPreTraining"), + ("electra", "FlaxElectraForPreTraining"), + ("longt5", "FlaxLongT5ForConditionalGeneration"), + ("mbart", "FlaxMBartForConditionalGeneration"), + ("mt5", "FlaxMT5ForConditionalGeneration"), + ("roberta", "FlaxRobertaForMaskedLM"), + ("roberta-prelayernorm", "FlaxRobertaPreLayerNormForMaskedLM"), + ("roformer", "FlaxRoFormerForMaskedLM"), + ("t5", "FlaxT5ForConditionalGeneration"), + ("wav2vec2", "FlaxWav2Vec2ForPreTraining"), + ("whisper", "FlaxWhisperForConditionalGeneration"), + ("xlm-roberta", "FlaxXLMRobertaForMaskedLM"), + ] +) + +FLAX_MODEL_FOR_MASKED_LM_MAPPING_NAMES = OrderedDict( + [ + # Model for Masked LM mapping + ("albert", "FlaxAlbertForMaskedLM"), + ("bart", "FlaxBartForConditionalGeneration"), + ("bert", "FlaxBertForMaskedLM"), + ("big_bird", "FlaxBigBirdForMaskedLM"), + ("distilbert", "FlaxDistilBertForMaskedLM"), + ("electra", "FlaxElectraForMaskedLM"), + ("mbart", "FlaxMBartForConditionalGeneration"), + ("roberta", "FlaxRobertaForMaskedLM"), + ("roberta-prelayernorm", "FlaxRobertaPreLayerNormForMaskedLM"), + ("roformer", "FlaxRoFormerForMaskedLM"), + ("xlm-roberta", "FlaxXLMRobertaForMaskedLM"), + ] +) + +FLAX_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES = OrderedDict( + [ + # Model for Seq2Seq Causal LM mapping + ("bart", "FlaxBartForConditionalGeneration"), + ("blenderbot", "FlaxBlenderbotForConditionalGeneration"), + ("blenderbot-small", "FlaxBlenderbotSmallForConditionalGeneration"), + ("encoder-decoder", "FlaxEncoderDecoderModel"), + ("longt5", "FlaxLongT5ForConditionalGeneration"), + ("marian", "FlaxMarianMTModel"), + ("mbart", "FlaxMBartForConditionalGeneration"), + ("mt5", "FlaxMT5ForConditionalGeneration"), + ("pegasus", "FlaxPegasusForConditionalGeneration"), + ("t5", "FlaxT5ForConditionalGeneration"), + ] +) + +FLAX_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = OrderedDict( + [ + # Model for Image-classification + ("beit", "FlaxBeitForImageClassification"), + ("dinov2", "FlaxDinov2ForImageClassification"), + ("regnet", "FlaxRegNetForImageClassification"), + ("resnet", "FlaxResNetForImageClassification"), + ("vit", "FlaxViTForImageClassification"), + ] +) + +FLAX_MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES = OrderedDict( + [ + ("vision-encoder-decoder", "FlaxVisionEncoderDecoderModel"), + ] +) + +FLAX_MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict( + [ + # Model for Causal LM mapping + ("bart", "FlaxBartForCausalLM"), + ("bert", "FlaxBertForCausalLM"), + ("big_bird", "FlaxBigBirdForCausalLM"), + ("bloom", "FlaxBloomForCausalLM"), + ("electra", "FlaxElectraForCausalLM"), + ("gemma", "FlaxGemmaForCausalLM"), + ("gpt-sw3", "FlaxGPT2LMHeadModel"), + ("gpt2", "FlaxGPT2LMHeadModel"), + ("gpt_neo", "FlaxGPTNeoForCausalLM"), + ("gptj", "FlaxGPTJForCausalLM"), + ("llama", "FlaxLlamaForCausalLM"), + ("mistral", "FlaxMistralForCausalLM"), + ("opt", "FlaxOPTForCausalLM"), + ("roberta", "FlaxRobertaForCausalLM"), + ("roberta-prelayernorm", "FlaxRobertaPreLayerNormForCausalLM"), + ("xglm", "FlaxXGLMForCausalLM"), + ("xlm-roberta", "FlaxXLMRobertaForCausalLM"), + ] +) + +FLAX_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = OrderedDict( + [ + # Model for Sequence Classification mapping + ("albert", "FlaxAlbertForSequenceClassification"), + ("bart", "FlaxBartForSequenceClassification"), + ("bert", "FlaxBertForSequenceClassification"), + ("big_bird", "FlaxBigBirdForSequenceClassification"), + ("distilbert", "FlaxDistilBertForSequenceClassification"), + ("electra", "FlaxElectraForSequenceClassification"), + ("mbart", "FlaxMBartForSequenceClassification"), + ("roberta", "FlaxRobertaForSequenceClassification"), + ("roberta-prelayernorm", "FlaxRobertaPreLayerNormForSequenceClassification"), + ("roformer", "FlaxRoFormerForSequenceClassification"), + ("xlm-roberta", "FlaxXLMRobertaForSequenceClassification"), + ] +) + +FLAX_MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict( + [ + # Model for Question Answering mapping + ("albert", "FlaxAlbertForQuestionAnswering"), + ("bart", "FlaxBartForQuestionAnswering"), + ("bert", "FlaxBertForQuestionAnswering"), + ("big_bird", "FlaxBigBirdForQuestionAnswering"), + ("distilbert", "FlaxDistilBertForQuestionAnswering"), + ("electra", "FlaxElectraForQuestionAnswering"), + ("mbart", "FlaxMBartForQuestionAnswering"), + ("roberta", "FlaxRobertaForQuestionAnswering"), + ("roberta-prelayernorm", "FlaxRobertaPreLayerNormForQuestionAnswering"), + ("roformer", "FlaxRoFormerForQuestionAnswering"), + ("xlm-roberta", "FlaxXLMRobertaForQuestionAnswering"), + ] +) + +FLAX_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = OrderedDict( + [ + # Model for Token Classification mapping + ("albert", "FlaxAlbertForTokenClassification"), + ("bert", "FlaxBertForTokenClassification"), + ("big_bird", "FlaxBigBirdForTokenClassification"), + ("distilbert", "FlaxDistilBertForTokenClassification"), + ("electra", "FlaxElectraForTokenClassification"), + ("roberta", "FlaxRobertaForTokenClassification"), + ("roberta-prelayernorm", "FlaxRobertaPreLayerNormForTokenClassification"), + ("roformer", "FlaxRoFormerForTokenClassification"), + ("xlm-roberta", "FlaxXLMRobertaForTokenClassification"), + ] +) + +FLAX_MODEL_FOR_MULTIPLE_CHOICE_MAPPING_NAMES = OrderedDict( + [ + # Model for Multiple Choice mapping + ("albert", "FlaxAlbertForMultipleChoice"), + ("bert", "FlaxBertForMultipleChoice"), + ("big_bird", "FlaxBigBirdForMultipleChoice"), + ("distilbert", "FlaxDistilBertForMultipleChoice"), + ("electra", "FlaxElectraForMultipleChoice"), + ("roberta", "FlaxRobertaForMultipleChoice"), + ("roberta-prelayernorm", "FlaxRobertaPreLayerNormForMultipleChoice"), + ("roformer", "FlaxRoFormerForMultipleChoice"), + ("xlm-roberta", "FlaxXLMRobertaForMultipleChoice"), + ] +) + +FLAX_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING_NAMES = OrderedDict( + [ + ("bert", "FlaxBertForNextSentencePrediction"), + ] +) + +FLAX_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES = OrderedDict( + [ + ("speech-encoder-decoder", "FlaxSpeechEncoderDecoderModel"), + ("whisper", "FlaxWhisperForConditionalGeneration"), + ] +) + +FLAX_MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES = OrderedDict( + [ + ("whisper", "FlaxWhisperForAudioClassification"), + ] +) + +FLAX_MODEL_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, FLAX_MODEL_MAPPING_NAMES) +FLAX_MODEL_FOR_PRETRAINING_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_PRETRAINING_MAPPING_NAMES) +FLAX_MODEL_FOR_MASKED_LM_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_MASKED_LM_MAPPING_NAMES) +FLAX_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES +) +FLAX_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES +) +FLAX_MODEL_FOR_VISION_2_SEQ_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES) +FLAX_MODEL_FOR_CAUSAL_LM_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_CAUSAL_LM_MAPPING_NAMES) +FLAX_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES +) +FLAX_MODEL_FOR_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES +) +FLAX_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES +) +FLAX_MODEL_FOR_MULTIPLE_CHOICE_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_MULTIPLE_CHOICE_MAPPING_NAMES +) +FLAX_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING_NAMES +) +FLAX_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES +) +FLAX_MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, FLAX_MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES +) + + +class FlaxAutoModel(_BaseAutoModelClass): + _model_mapping = FLAX_MODEL_MAPPING + + +FlaxAutoModel = auto_class_update(FlaxAutoModel) + + +class FlaxAutoModelForPreTraining(_BaseAutoModelClass): + _model_mapping = FLAX_MODEL_FOR_PRETRAINING_MAPPING + + +FlaxAutoModelForPreTraining = auto_class_update(FlaxAutoModelForPreTraining, head_doc="pretraining") + + +class FlaxAutoModelForCausalLM(_BaseAutoModelClass): + _model_mapping = FLAX_MODEL_FOR_CAUSAL_LM_MAPPING + + +FlaxAutoModelForCausalLM = auto_class_update(FlaxAutoModelForCausalLM, head_doc="causal language modeling") + + +class FlaxAutoModelForMaskedLM(_BaseAutoModelClass): + _model_mapping = FLAX_MODEL_FOR_MASKED_LM_MAPPING + + +FlaxAutoModelForMaskedLM = auto_class_update(FlaxAutoModelForMaskedLM, head_doc="masked language modeling") + + +class FlaxAutoModelForSeq2SeqLM(_BaseAutoModelClass): + _model_mapping = FLAX_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING + + +FlaxAutoModelForSeq2SeqLM = auto_class_update( + FlaxAutoModelForSeq2SeqLM, + head_doc="sequence-to-sequence language modeling", + checkpoint_for_example="google-t5/t5-base", +) + + +class FlaxAutoModelForSequenceClassification(_BaseAutoModelClass): + _model_mapping = FLAX_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING + + +FlaxAutoModelForSequenceClassification = auto_class_update( + FlaxAutoModelForSequenceClassification, head_doc="sequence classification" +) + + +class FlaxAutoModelForQuestionAnswering(_BaseAutoModelClass): + _model_mapping = FLAX_MODEL_FOR_QUESTION_ANSWERING_MAPPING + + +FlaxAutoModelForQuestionAnswering = auto_class_update(FlaxAutoModelForQuestionAnswering, head_doc="question answering") + + +class FlaxAutoModelForTokenClassification(_BaseAutoModelClass): + _model_mapping = FLAX_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING + + +FlaxAutoModelForTokenClassification = auto_class_update( + FlaxAutoModelForTokenClassification, head_doc="token classification" +) + + +class FlaxAutoModelForMultipleChoice(_BaseAutoModelClass): + _model_mapping = FLAX_MODEL_FOR_MULTIPLE_CHOICE_MAPPING + + +FlaxAutoModelForMultipleChoice = auto_class_update(FlaxAutoModelForMultipleChoice, head_doc="multiple choice") + + +class FlaxAutoModelForNextSentencePrediction(_BaseAutoModelClass): + _model_mapping = FLAX_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING + + +FlaxAutoModelForNextSentencePrediction = auto_class_update( + FlaxAutoModelForNextSentencePrediction, head_doc="next sentence prediction" +) + + +class FlaxAutoModelForImageClassification(_BaseAutoModelClass): + _model_mapping = FLAX_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING + + +FlaxAutoModelForImageClassification = auto_class_update( + FlaxAutoModelForImageClassification, head_doc="image classification" +) + + +class FlaxAutoModelForVision2Seq(_BaseAutoModelClass): + _model_mapping = FLAX_MODEL_FOR_VISION_2_SEQ_MAPPING + + +FlaxAutoModelForVision2Seq = auto_class_update(FlaxAutoModelForVision2Seq, head_doc="vision-to-text modeling") + + +class FlaxAutoModelForSpeechSeq2Seq(_BaseAutoModelClass): + _model_mapping = FLAX_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING + + +FlaxAutoModelForSpeechSeq2Seq = auto_class_update( + FlaxAutoModelForSpeechSeq2Seq, head_doc="sequence-to-sequence speech-to-text modeling" +) + +__all__ = [ + "FLAX_MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING", + "FLAX_MODEL_FOR_CAUSAL_LM_MAPPING", + "FLAX_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING", + "FLAX_MODEL_FOR_MASKED_LM_MAPPING", + "FLAX_MODEL_FOR_MULTIPLE_CHOICE_MAPPING", + "FLAX_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING", + "FLAX_MODEL_FOR_PRETRAINING_MAPPING", + "FLAX_MODEL_FOR_QUESTION_ANSWERING_MAPPING", + "FLAX_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING", + "FLAX_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING", + "FLAX_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING", + "FLAX_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING", + "FLAX_MODEL_FOR_VISION_2_SEQ_MAPPING", + "FLAX_MODEL_MAPPING", + "FlaxAutoModel", + "FlaxAutoModelForCausalLM", + "FlaxAutoModelForImageClassification", + "FlaxAutoModelForMaskedLM", + "FlaxAutoModelForMultipleChoice", + "FlaxAutoModelForNextSentencePrediction", + "FlaxAutoModelForPreTraining", + "FlaxAutoModelForQuestionAnswering", + "FlaxAutoModelForSeq2SeqLM", + "FlaxAutoModelForSequenceClassification", + "FlaxAutoModelForSpeechSeq2Seq", + "FlaxAutoModelForTokenClassification", + "FlaxAutoModelForVision2Seq", +] diff --git a/docs/transformers/build/lib/transformers/models/auto/modeling_tf_auto.py b/docs/transformers/build/lib/transformers/models/auto/modeling_tf_auto.py new file mode 100644 index 0000000000000000000000000000000000000000..cf39f4d7c9c40bd87a8e4c5e3037e2cbe3574a29 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/auto/modeling_tf_auto.py @@ -0,0 +1,776 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Auto Model class.""" + +import warnings +from collections import OrderedDict + +from ...utils import logging +from .auto_factory import _BaseAutoModelClass, _LazyAutoMapping, auto_class_update +from .configuration_auto import CONFIG_MAPPING_NAMES + + +logger = logging.get_logger(__name__) + + +TF_MODEL_MAPPING_NAMES = OrderedDict( + [ + # Base model mapping + ("albert", "TFAlbertModel"), + ("bart", "TFBartModel"), + ("bert", "TFBertModel"), + ("blenderbot", "TFBlenderbotModel"), + ("blenderbot-small", "TFBlenderbotSmallModel"), + ("blip", "TFBlipModel"), + ("camembert", "TFCamembertModel"), + ("clip", "TFCLIPModel"), + ("convbert", "TFConvBertModel"), + ("convnext", "TFConvNextModel"), + ("convnextv2", "TFConvNextV2Model"), + ("ctrl", "TFCTRLModel"), + ("cvt", "TFCvtModel"), + ("data2vec-vision", "TFData2VecVisionModel"), + ("deberta", "TFDebertaModel"), + ("deberta-v2", "TFDebertaV2Model"), + ("deit", "TFDeiTModel"), + ("distilbert", "TFDistilBertModel"), + ("dpr", "TFDPRQuestionEncoder"), + ("efficientformer", "TFEfficientFormerModel"), + ("electra", "TFElectraModel"), + ("esm", "TFEsmModel"), + ("flaubert", "TFFlaubertModel"), + ("funnel", ("TFFunnelModel", "TFFunnelBaseModel")), + ("gpt-sw3", "TFGPT2Model"), + ("gpt2", "TFGPT2Model"), + ("gptj", "TFGPTJModel"), + ("groupvit", "TFGroupViTModel"), + ("hubert", "TFHubertModel"), + ("idefics", "TFIdeficsModel"), + ("layoutlm", "TFLayoutLMModel"), + ("layoutlmv3", "TFLayoutLMv3Model"), + ("led", "TFLEDModel"), + ("longformer", "TFLongformerModel"), + ("lxmert", "TFLxmertModel"), + ("marian", "TFMarianModel"), + ("mbart", "TFMBartModel"), + ("mistral", "TFMistralModel"), + ("mobilebert", "TFMobileBertModel"), + ("mobilevit", "TFMobileViTModel"), + ("mpnet", "TFMPNetModel"), + ("mt5", "TFMT5Model"), + ("openai-gpt", "TFOpenAIGPTModel"), + ("opt", "TFOPTModel"), + ("pegasus", "TFPegasusModel"), + ("regnet", "TFRegNetModel"), + ("rembert", "TFRemBertModel"), + ("resnet", "TFResNetModel"), + ("roberta", "TFRobertaModel"), + ("roberta-prelayernorm", "TFRobertaPreLayerNormModel"), + ("roformer", "TFRoFormerModel"), + ("sam", "TFSamModel"), + ("sam_vision_model", "TFSamVisionModel"), + ("segformer", "TFSegformerModel"), + ("speech_to_text", "TFSpeech2TextModel"), + ("swiftformer", "TFSwiftFormerModel"), + ("swin", "TFSwinModel"), + ("t5", "TFT5Model"), + ("tapas", "TFTapasModel"), + ("transfo-xl", "TFTransfoXLModel"), + ("vision-text-dual-encoder", "TFVisionTextDualEncoderModel"), + ("vit", "TFViTModel"), + ("vit_mae", "TFViTMAEModel"), + ("wav2vec2", "TFWav2Vec2Model"), + ("whisper", "TFWhisperModel"), + ("xglm", "TFXGLMModel"), + ("xlm", "TFXLMModel"), + ("xlm-roberta", "TFXLMRobertaModel"), + ("xlnet", "TFXLNetModel"), + ] +) + +TF_MODEL_FOR_PRETRAINING_MAPPING_NAMES = OrderedDict( + [ + # Model for pre-training mapping + ("albert", "TFAlbertForPreTraining"), + ("bart", "TFBartForConditionalGeneration"), + ("bert", "TFBertForPreTraining"), + ("camembert", "TFCamembertForMaskedLM"), + ("ctrl", "TFCTRLLMHeadModel"), + ("distilbert", "TFDistilBertForMaskedLM"), + ("electra", "TFElectraForPreTraining"), + ("flaubert", "TFFlaubertWithLMHeadModel"), + ("funnel", "TFFunnelForPreTraining"), + ("gpt-sw3", "TFGPT2LMHeadModel"), + ("gpt2", "TFGPT2LMHeadModel"), + ("idefics", "TFIdeficsForVisionText2Text"), + ("layoutlm", "TFLayoutLMForMaskedLM"), + ("lxmert", "TFLxmertForPreTraining"), + ("mobilebert", "TFMobileBertForPreTraining"), + ("mpnet", "TFMPNetForMaskedLM"), + ("openai-gpt", "TFOpenAIGPTLMHeadModel"), + ("roberta", "TFRobertaForMaskedLM"), + ("roberta-prelayernorm", "TFRobertaPreLayerNormForMaskedLM"), + ("t5", "TFT5ForConditionalGeneration"), + ("tapas", "TFTapasForMaskedLM"), + ("transfo-xl", "TFTransfoXLLMHeadModel"), + ("vit_mae", "TFViTMAEForPreTraining"), + ("xlm", "TFXLMWithLMHeadModel"), + ("xlm-roberta", "TFXLMRobertaForMaskedLM"), + ("xlnet", "TFXLNetLMHeadModel"), + ] +) + +TF_MODEL_WITH_LM_HEAD_MAPPING_NAMES = OrderedDict( + [ + # Model with LM heads mapping + ("albert", "TFAlbertForMaskedLM"), + ("bart", "TFBartForConditionalGeneration"), + ("bert", "TFBertForMaskedLM"), + ("camembert", "TFCamembertForMaskedLM"), + ("convbert", "TFConvBertForMaskedLM"), + ("ctrl", "TFCTRLLMHeadModel"), + ("distilbert", "TFDistilBertForMaskedLM"), + ("electra", "TFElectraForMaskedLM"), + ("esm", "TFEsmForMaskedLM"), + ("flaubert", "TFFlaubertWithLMHeadModel"), + ("funnel", "TFFunnelForMaskedLM"), + ("gpt-sw3", "TFGPT2LMHeadModel"), + ("gpt2", "TFGPT2LMHeadModel"), + ("gptj", "TFGPTJForCausalLM"), + ("layoutlm", "TFLayoutLMForMaskedLM"), + ("led", "TFLEDForConditionalGeneration"), + ("longformer", "TFLongformerForMaskedLM"), + ("marian", "TFMarianMTModel"), + ("mobilebert", "TFMobileBertForMaskedLM"), + ("mpnet", "TFMPNetForMaskedLM"), + ("openai-gpt", "TFOpenAIGPTLMHeadModel"), + ("rembert", "TFRemBertForMaskedLM"), + ("roberta", "TFRobertaForMaskedLM"), + ("roberta-prelayernorm", "TFRobertaPreLayerNormForMaskedLM"), + ("roformer", "TFRoFormerForMaskedLM"), + ("speech_to_text", "TFSpeech2TextForConditionalGeneration"), + ("t5", "TFT5ForConditionalGeneration"), + ("tapas", "TFTapasForMaskedLM"), + ("transfo-xl", "TFTransfoXLLMHeadModel"), + ("whisper", "TFWhisperForConditionalGeneration"), + ("xlm", "TFXLMWithLMHeadModel"), + ("xlm-roberta", "TFXLMRobertaForMaskedLM"), + ("xlnet", "TFXLNetLMHeadModel"), + ] +) + +TF_MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict( + [ + # Model for Causal LM mapping + ("bert", "TFBertLMHeadModel"), + ("camembert", "TFCamembertForCausalLM"), + ("ctrl", "TFCTRLLMHeadModel"), + ("gpt-sw3", "TFGPT2LMHeadModel"), + ("gpt2", "TFGPT2LMHeadModel"), + ("gptj", "TFGPTJForCausalLM"), + ("mistral", "TFMistralForCausalLM"), + ("openai-gpt", "TFOpenAIGPTLMHeadModel"), + ("opt", "TFOPTForCausalLM"), + ("rembert", "TFRemBertForCausalLM"), + ("roberta", "TFRobertaForCausalLM"), + ("roberta-prelayernorm", "TFRobertaPreLayerNormForCausalLM"), + ("roformer", "TFRoFormerForCausalLM"), + ("transfo-xl", "TFTransfoXLLMHeadModel"), + ("xglm", "TFXGLMForCausalLM"), + ("xlm", "TFXLMWithLMHeadModel"), + ("xlm-roberta", "TFXLMRobertaForCausalLM"), + ("xlnet", "TFXLNetLMHeadModel"), + ] +) + +TF_MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING_NAMES = OrderedDict( + [ + ("deit", "TFDeiTForMaskedImageModeling"), + ("swin", "TFSwinForMaskedImageModeling"), + ] +) + +TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES = OrderedDict( + [ + # Model for Image-classsification + ("convnext", "TFConvNextForImageClassification"), + ("convnextv2", "TFConvNextV2ForImageClassification"), + ("cvt", "TFCvtForImageClassification"), + ("data2vec-vision", "TFData2VecVisionForImageClassification"), + ("deit", ("TFDeiTForImageClassification", "TFDeiTForImageClassificationWithTeacher")), + ( + "efficientformer", + ("TFEfficientFormerForImageClassification", "TFEfficientFormerForImageClassificationWithTeacher"), + ), + ("mobilevit", "TFMobileViTForImageClassification"), + ("regnet", "TFRegNetForImageClassification"), + ("resnet", "TFResNetForImageClassification"), + ("segformer", "TFSegformerForImageClassification"), + ("swiftformer", "TFSwiftFormerForImageClassification"), + ("swin", "TFSwinForImageClassification"), + ("vit", "TFViTForImageClassification"), + ] +) + + +TF_MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING_NAMES = OrderedDict( + [ + # Model for Zero Shot Image Classification mapping + ("blip", "TFBlipModel"), + ("clip", "TFCLIPModel"), + ] +) + + +TF_MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES = OrderedDict( + [ + # Model for Semantic Segmentation mapping + ("data2vec-vision", "TFData2VecVisionForSemanticSegmentation"), + ("mobilevit", "TFMobileViTForSemanticSegmentation"), + ("segformer", "TFSegformerForSemanticSegmentation"), + ] +) + +TF_MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES = OrderedDict( + [ + ("blip", "TFBlipForConditionalGeneration"), + ("vision-encoder-decoder", "TFVisionEncoderDecoderModel"), + ] +) + +TF_MODEL_FOR_MASKED_LM_MAPPING_NAMES = OrderedDict( + [ + # Model for Masked LM mapping + ("albert", "TFAlbertForMaskedLM"), + ("bert", "TFBertForMaskedLM"), + ("camembert", "TFCamembertForMaskedLM"), + ("convbert", "TFConvBertForMaskedLM"), + ("deberta", "TFDebertaForMaskedLM"), + ("deberta-v2", "TFDebertaV2ForMaskedLM"), + ("distilbert", "TFDistilBertForMaskedLM"), + ("electra", "TFElectraForMaskedLM"), + ("esm", "TFEsmForMaskedLM"), + ("flaubert", "TFFlaubertWithLMHeadModel"), + ("funnel", "TFFunnelForMaskedLM"), + ("layoutlm", "TFLayoutLMForMaskedLM"), + ("longformer", "TFLongformerForMaskedLM"), + ("mobilebert", "TFMobileBertForMaskedLM"), + ("mpnet", "TFMPNetForMaskedLM"), + ("rembert", "TFRemBertForMaskedLM"), + ("roberta", "TFRobertaForMaskedLM"), + ("roberta-prelayernorm", "TFRobertaPreLayerNormForMaskedLM"), + ("roformer", "TFRoFormerForMaskedLM"), + ("tapas", "TFTapasForMaskedLM"), + ("xlm", "TFXLMWithLMHeadModel"), + ("xlm-roberta", "TFXLMRobertaForMaskedLM"), + ] +) + +TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES = OrderedDict( + [ + # Model for Seq2Seq Causal LM mapping + ("bart", "TFBartForConditionalGeneration"), + ("blenderbot", "TFBlenderbotForConditionalGeneration"), + ("blenderbot-small", "TFBlenderbotSmallForConditionalGeneration"), + ("encoder-decoder", "TFEncoderDecoderModel"), + ("led", "TFLEDForConditionalGeneration"), + ("marian", "TFMarianMTModel"), + ("mbart", "TFMBartForConditionalGeneration"), + ("mt5", "TFMT5ForConditionalGeneration"), + ("pegasus", "TFPegasusForConditionalGeneration"), + ("t5", "TFT5ForConditionalGeneration"), + ] +) + +TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES = OrderedDict( + [ + ("speech_to_text", "TFSpeech2TextForConditionalGeneration"), + ("whisper", "TFWhisperForConditionalGeneration"), + ] +) + +TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = OrderedDict( + [ + # Model for Sequence Classification mapping + ("albert", "TFAlbertForSequenceClassification"), + ("bart", "TFBartForSequenceClassification"), + ("bert", "TFBertForSequenceClassification"), + ("camembert", "TFCamembertForSequenceClassification"), + ("convbert", "TFConvBertForSequenceClassification"), + ("ctrl", "TFCTRLForSequenceClassification"), + ("deberta", "TFDebertaForSequenceClassification"), + ("deberta-v2", "TFDebertaV2ForSequenceClassification"), + ("distilbert", "TFDistilBertForSequenceClassification"), + ("electra", "TFElectraForSequenceClassification"), + ("esm", "TFEsmForSequenceClassification"), + ("flaubert", "TFFlaubertForSequenceClassification"), + ("funnel", "TFFunnelForSequenceClassification"), + ("gpt-sw3", "TFGPT2ForSequenceClassification"), + ("gpt2", "TFGPT2ForSequenceClassification"), + ("gptj", "TFGPTJForSequenceClassification"), + ("layoutlm", "TFLayoutLMForSequenceClassification"), + ("layoutlmv3", "TFLayoutLMv3ForSequenceClassification"), + ("longformer", "TFLongformerForSequenceClassification"), + ("mistral", "TFMistralForSequenceClassification"), + ("mobilebert", "TFMobileBertForSequenceClassification"), + ("mpnet", "TFMPNetForSequenceClassification"), + ("openai-gpt", "TFOpenAIGPTForSequenceClassification"), + ("rembert", "TFRemBertForSequenceClassification"), + ("roberta", "TFRobertaForSequenceClassification"), + ("roberta-prelayernorm", "TFRobertaPreLayerNormForSequenceClassification"), + ("roformer", "TFRoFormerForSequenceClassification"), + ("tapas", "TFTapasForSequenceClassification"), + ("transfo-xl", "TFTransfoXLForSequenceClassification"), + ("xlm", "TFXLMForSequenceClassification"), + ("xlm-roberta", "TFXLMRobertaForSequenceClassification"), + ("xlnet", "TFXLNetForSequenceClassification"), + ] +) + +TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict( + [ + # Model for Question Answering mapping + ("albert", "TFAlbertForQuestionAnswering"), + ("bert", "TFBertForQuestionAnswering"), + ("camembert", "TFCamembertForQuestionAnswering"), + ("convbert", "TFConvBertForQuestionAnswering"), + ("deberta", "TFDebertaForQuestionAnswering"), + ("deberta-v2", "TFDebertaV2ForQuestionAnswering"), + ("distilbert", "TFDistilBertForQuestionAnswering"), + ("electra", "TFElectraForQuestionAnswering"), + ("flaubert", "TFFlaubertForQuestionAnsweringSimple"), + ("funnel", "TFFunnelForQuestionAnswering"), + ("gptj", "TFGPTJForQuestionAnswering"), + ("layoutlmv3", "TFLayoutLMv3ForQuestionAnswering"), + ("longformer", "TFLongformerForQuestionAnswering"), + ("mobilebert", "TFMobileBertForQuestionAnswering"), + ("mpnet", "TFMPNetForQuestionAnswering"), + ("rembert", "TFRemBertForQuestionAnswering"), + ("roberta", "TFRobertaForQuestionAnswering"), + ("roberta-prelayernorm", "TFRobertaPreLayerNormForQuestionAnswering"), + ("roformer", "TFRoFormerForQuestionAnswering"), + ("xlm", "TFXLMForQuestionAnsweringSimple"), + ("xlm-roberta", "TFXLMRobertaForQuestionAnswering"), + ("xlnet", "TFXLNetForQuestionAnsweringSimple"), + ] +) +TF_MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES = OrderedDict([("wav2vec2", "TFWav2Vec2ForSequenceClassification")]) + +TF_MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict( + [ + ("layoutlm", "TFLayoutLMForQuestionAnswering"), + ("layoutlmv3", "TFLayoutLMv3ForQuestionAnswering"), + ] +) + + +TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict( + [ + # Model for Table Question Answering mapping + ("tapas", "TFTapasForQuestionAnswering"), + ] +) + +TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = OrderedDict( + [ + # Model for Token Classification mapping + ("albert", "TFAlbertForTokenClassification"), + ("bert", "TFBertForTokenClassification"), + ("camembert", "TFCamembertForTokenClassification"), + ("convbert", "TFConvBertForTokenClassification"), + ("deberta", "TFDebertaForTokenClassification"), + ("deberta-v2", "TFDebertaV2ForTokenClassification"), + ("distilbert", "TFDistilBertForTokenClassification"), + ("electra", "TFElectraForTokenClassification"), + ("esm", "TFEsmForTokenClassification"), + ("flaubert", "TFFlaubertForTokenClassification"), + ("funnel", "TFFunnelForTokenClassification"), + ("layoutlm", "TFLayoutLMForTokenClassification"), + ("layoutlmv3", "TFLayoutLMv3ForTokenClassification"), + ("longformer", "TFLongformerForTokenClassification"), + ("mobilebert", "TFMobileBertForTokenClassification"), + ("mpnet", "TFMPNetForTokenClassification"), + ("rembert", "TFRemBertForTokenClassification"), + ("roberta", "TFRobertaForTokenClassification"), + ("roberta-prelayernorm", "TFRobertaPreLayerNormForTokenClassification"), + ("roformer", "TFRoFormerForTokenClassification"), + ("xlm", "TFXLMForTokenClassification"), + ("xlm-roberta", "TFXLMRobertaForTokenClassification"), + ("xlnet", "TFXLNetForTokenClassification"), + ] +) + +TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING_NAMES = OrderedDict( + [ + # Model for Multiple Choice mapping + ("albert", "TFAlbertForMultipleChoice"), + ("bert", "TFBertForMultipleChoice"), + ("camembert", "TFCamembertForMultipleChoice"), + ("convbert", "TFConvBertForMultipleChoice"), + ("deberta-v2", "TFDebertaV2ForMultipleChoice"), + ("distilbert", "TFDistilBertForMultipleChoice"), + ("electra", "TFElectraForMultipleChoice"), + ("flaubert", "TFFlaubertForMultipleChoice"), + ("funnel", "TFFunnelForMultipleChoice"), + ("longformer", "TFLongformerForMultipleChoice"), + ("mobilebert", "TFMobileBertForMultipleChoice"), + ("mpnet", "TFMPNetForMultipleChoice"), + ("rembert", "TFRemBertForMultipleChoice"), + ("roberta", "TFRobertaForMultipleChoice"), + ("roberta-prelayernorm", "TFRobertaPreLayerNormForMultipleChoice"), + ("roformer", "TFRoFormerForMultipleChoice"), + ("xlm", "TFXLMForMultipleChoice"), + ("xlm-roberta", "TFXLMRobertaForMultipleChoice"), + ("xlnet", "TFXLNetForMultipleChoice"), + ] +) + +TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING_NAMES = OrderedDict( + [ + ("bert", "TFBertForNextSentencePrediction"), + ("mobilebert", "TFMobileBertForNextSentencePrediction"), + ] +) +TF_MODEL_FOR_MASK_GENERATION_MAPPING_NAMES = OrderedDict( + [ + ("sam", "TFSamModel"), + ] +) +TF_MODEL_FOR_TEXT_ENCODING_MAPPING_NAMES = OrderedDict( + [ + ("albert", "TFAlbertModel"), + ("bert", "TFBertModel"), + ("convbert", "TFConvBertModel"), + ("deberta", "TFDebertaModel"), + ("deberta-v2", "TFDebertaV2Model"), + ("distilbert", "TFDistilBertModel"), + ("electra", "TFElectraModel"), + ("flaubert", "TFFlaubertModel"), + ("longformer", "TFLongformerModel"), + ("mobilebert", "TFMobileBertModel"), + ("mt5", "TFMT5EncoderModel"), + ("rembert", "TFRemBertModel"), + ("roberta", "TFRobertaModel"), + ("roberta-prelayernorm", "TFRobertaPreLayerNormModel"), + ("roformer", "TFRoFormerModel"), + ("t5", "TFT5EncoderModel"), + ("xlm", "TFXLMModel"), + ("xlm-roberta", "TFXLMRobertaModel"), + ] +) + +TF_MODEL_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, TF_MODEL_MAPPING_NAMES) +TF_MODEL_FOR_PRETRAINING_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, TF_MODEL_FOR_PRETRAINING_MAPPING_NAMES) +TF_MODEL_WITH_LM_HEAD_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, TF_MODEL_WITH_LM_HEAD_MAPPING_NAMES) +TF_MODEL_FOR_CAUSAL_LM_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, TF_MODEL_FOR_CAUSAL_LM_MAPPING_NAMES) +TF_MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, TF_MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING_NAMES +) +TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES +) +TF_MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, TF_MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING_NAMES +) +TF_MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, TF_MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES +) +TF_MODEL_FOR_VISION_2_SEQ_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, TF_MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES) +TF_MODEL_FOR_MASKED_LM_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, TF_MODEL_FOR_MASKED_LM_MAPPING_NAMES) +TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES +) +TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES +) +TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES +) +TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES +) +TF_MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, TF_MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES +) +TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING_NAMES +) +TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES +) +TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING_NAMES +) +TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING_NAMES +) +TF_MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, TF_MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES +) + +TF_MODEL_FOR_MASK_GENERATION_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, TF_MODEL_FOR_MASK_GENERATION_MAPPING_NAMES +) + +TF_MODEL_FOR_TEXT_ENCODING_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, TF_MODEL_FOR_TEXT_ENCODING_MAPPING_NAMES) + + +class TFAutoModelForMaskGeneration(_BaseAutoModelClass): + _model_mapping = TF_MODEL_FOR_MASK_GENERATION_MAPPING + + +class TFAutoModelForTextEncoding(_BaseAutoModelClass): + _model_mapping = TF_MODEL_FOR_TEXT_ENCODING_MAPPING + + +class TFAutoModel(_BaseAutoModelClass): + _model_mapping = TF_MODEL_MAPPING + + +TFAutoModel = auto_class_update(TFAutoModel) + + +class TFAutoModelForAudioClassification(_BaseAutoModelClass): + _model_mapping = TF_MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING + + +TFAutoModelForAudioClassification = auto_class_update( + TFAutoModelForAudioClassification, head_doc="audio classification" +) + + +class TFAutoModelForPreTraining(_BaseAutoModelClass): + _model_mapping = TF_MODEL_FOR_PRETRAINING_MAPPING + + +TFAutoModelForPreTraining = auto_class_update(TFAutoModelForPreTraining, head_doc="pretraining") + + +# Private on purpose, the public class will add the deprecation warnings. +class _TFAutoModelWithLMHead(_BaseAutoModelClass): + _model_mapping = TF_MODEL_WITH_LM_HEAD_MAPPING + + +_TFAutoModelWithLMHead = auto_class_update(_TFAutoModelWithLMHead, head_doc="language modeling") + + +class TFAutoModelForCausalLM(_BaseAutoModelClass): + _model_mapping = TF_MODEL_FOR_CAUSAL_LM_MAPPING + + +TFAutoModelForCausalLM = auto_class_update(TFAutoModelForCausalLM, head_doc="causal language modeling") + + +class TFAutoModelForMaskedImageModeling(_BaseAutoModelClass): + _model_mapping = TF_MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING + + +TFAutoModelForMaskedImageModeling = auto_class_update( + TFAutoModelForMaskedImageModeling, head_doc="masked image modeling" +) + + +class TFAutoModelForImageClassification(_BaseAutoModelClass): + _model_mapping = TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING + + +TFAutoModelForImageClassification = auto_class_update( + TFAutoModelForImageClassification, head_doc="image classification" +) + + +class TFAutoModelForZeroShotImageClassification(_BaseAutoModelClass): + _model_mapping = TF_MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING + + +TFAutoModelForZeroShotImageClassification = auto_class_update( + TFAutoModelForZeroShotImageClassification, head_doc="zero-shot image classification" +) + + +class TFAutoModelForSemanticSegmentation(_BaseAutoModelClass): + _model_mapping = TF_MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING + + +TFAutoModelForSemanticSegmentation = auto_class_update( + TFAutoModelForSemanticSegmentation, head_doc="semantic segmentation" +) + + +class TFAutoModelForVision2Seq(_BaseAutoModelClass): + _model_mapping = TF_MODEL_FOR_VISION_2_SEQ_MAPPING + + +TFAutoModelForVision2Seq = auto_class_update(TFAutoModelForVision2Seq, head_doc="vision-to-text modeling") + + +class TFAutoModelForMaskedLM(_BaseAutoModelClass): + _model_mapping = TF_MODEL_FOR_MASKED_LM_MAPPING + + +TFAutoModelForMaskedLM = auto_class_update(TFAutoModelForMaskedLM, head_doc="masked language modeling") + + +class TFAutoModelForSeq2SeqLM(_BaseAutoModelClass): + _model_mapping = TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING + + +TFAutoModelForSeq2SeqLM = auto_class_update( + TFAutoModelForSeq2SeqLM, + head_doc="sequence-to-sequence language modeling", + checkpoint_for_example="google-t5/t5-base", +) + + +class TFAutoModelForSequenceClassification(_BaseAutoModelClass): + _model_mapping = TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING + + +TFAutoModelForSequenceClassification = auto_class_update( + TFAutoModelForSequenceClassification, head_doc="sequence classification" +) + + +class TFAutoModelForQuestionAnswering(_BaseAutoModelClass): + _model_mapping = TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING + + +TFAutoModelForQuestionAnswering = auto_class_update(TFAutoModelForQuestionAnswering, head_doc="question answering") + + +class TFAutoModelForDocumentQuestionAnswering(_BaseAutoModelClass): + _model_mapping = TF_MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING + + +TFAutoModelForDocumentQuestionAnswering = auto_class_update( + TFAutoModelForDocumentQuestionAnswering, + head_doc="document question answering", + checkpoint_for_example='impira/layoutlm-document-qa", revision="52e01b3', +) + + +class TFAutoModelForTableQuestionAnswering(_BaseAutoModelClass): + _model_mapping = TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING + + +TFAutoModelForTableQuestionAnswering = auto_class_update( + TFAutoModelForTableQuestionAnswering, + head_doc="table question answering", + checkpoint_for_example="google/tapas-base-finetuned-wtq", +) + + +class TFAutoModelForTokenClassification(_BaseAutoModelClass): + _model_mapping = TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING + + +TFAutoModelForTokenClassification = auto_class_update( + TFAutoModelForTokenClassification, head_doc="token classification" +) + + +class TFAutoModelForMultipleChoice(_BaseAutoModelClass): + _model_mapping = TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING + + +TFAutoModelForMultipleChoice = auto_class_update(TFAutoModelForMultipleChoice, head_doc="multiple choice") + + +class TFAutoModelForNextSentencePrediction(_BaseAutoModelClass): + _model_mapping = TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING + + +TFAutoModelForNextSentencePrediction = auto_class_update( + TFAutoModelForNextSentencePrediction, head_doc="next sentence prediction" +) + + +class TFAutoModelForSpeechSeq2Seq(_BaseAutoModelClass): + _model_mapping = TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING + + +TFAutoModelForSpeechSeq2Seq = auto_class_update( + TFAutoModelForSpeechSeq2Seq, head_doc="sequence-to-sequence speech-to-text modeling" +) + + +class TFAutoModelWithLMHead(_TFAutoModelWithLMHead): + @classmethod + def from_config(cls, config): + warnings.warn( + "The class `TFAutoModelWithLMHead` is deprecated and will be removed in a future version. Please use" + " `TFAutoModelForCausalLM` for causal language models, `TFAutoModelForMaskedLM` for masked language models" + " and `TFAutoModelForSeq2SeqLM` for encoder-decoder models.", + FutureWarning, + ) + return super().from_config(config) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + warnings.warn( + "The class `TFAutoModelWithLMHead` is deprecated and will be removed in a future version. Please use" + " `TFAutoModelForCausalLM` for causal language models, `TFAutoModelForMaskedLM` for masked language models" + " and `TFAutoModelForSeq2SeqLM` for encoder-decoder models.", + FutureWarning, + ) + return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + + +__all__ = [ + "TF_MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING", + "TF_MODEL_FOR_CAUSAL_LM_MAPPING", + "TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING", + "TF_MODEL_FOR_MASK_GENERATION_MAPPING", + "TF_MODEL_FOR_MASKED_IMAGE_MODELING_MAPPING", + "TF_MODEL_FOR_MASKED_LM_MAPPING", + "TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING", + "TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING", + "TF_MODEL_FOR_PRETRAINING_MAPPING", + "TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING", + "TF_MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING", + "TF_MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING", + "TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING", + "TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING", + "TF_MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING", + "TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING", + "TF_MODEL_FOR_TEXT_ENCODING_MAPPING", + "TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING", + "TF_MODEL_FOR_VISION_2_SEQ_MAPPING", + "TF_MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING", + "TF_MODEL_MAPPING", + "TF_MODEL_WITH_LM_HEAD_MAPPING", + "TFAutoModel", + "TFAutoModelForAudioClassification", + "TFAutoModelForCausalLM", + "TFAutoModelForImageClassification", + "TFAutoModelForMaskedImageModeling", + "TFAutoModelForMaskedLM", + "TFAutoModelForMaskGeneration", + "TFAutoModelForMultipleChoice", + "TFAutoModelForNextSentencePrediction", + "TFAutoModelForPreTraining", + "TFAutoModelForDocumentQuestionAnswering", + "TFAutoModelForQuestionAnswering", + "TFAutoModelForSemanticSegmentation", + "TFAutoModelForSeq2SeqLM", + "TFAutoModelForSequenceClassification", + "TFAutoModelForSpeechSeq2Seq", + "TFAutoModelForTableQuestionAnswering", + "TFAutoModelForTextEncoding", + "TFAutoModelForTokenClassification", + "TFAutoModelForVision2Seq", + "TFAutoModelForZeroShotImageClassification", + "TFAutoModelWithLMHead", +] diff --git a/docs/transformers/build/lib/transformers/models/auto/processing_auto.py b/docs/transformers/build/lib/transformers/models/auto/processing_auto.py new file mode 100644 index 0000000000000000000000000000000000000000..7060b6125dfb49ef0c91193321f717556c10e5da --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/auto/processing_auto.py @@ -0,0 +1,398 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""AutoProcessor class.""" + +import importlib +import inspect +import json +import os +import warnings +from collections import OrderedDict + +# Build the list of all feature extractors +from ...configuration_utils import PretrainedConfig +from ...dynamic_module_utils import get_class_from_dynamic_module, resolve_trust_remote_code +from ...feature_extraction_utils import FeatureExtractionMixin +from ...image_processing_utils import ImageProcessingMixin +from ...processing_utils import ProcessorMixin +from ...tokenization_utils import TOKENIZER_CONFIG_FILE +from ...utils import FEATURE_EXTRACTOR_NAME, PROCESSOR_NAME, cached_file, logging +from .auto_factory import _LazyAutoMapping +from .configuration_auto import ( + CONFIG_MAPPING_NAMES, + AutoConfig, + model_type_to_module_name, + replace_list_option_in_docstrings, +) +from .feature_extraction_auto import AutoFeatureExtractor +from .image_processing_auto import AutoImageProcessor +from .tokenization_auto import AutoTokenizer + + +logger = logging.get_logger(__name__) + +PROCESSOR_MAPPING_NAMES = OrderedDict( + [ + ("align", "AlignProcessor"), + ("altclip", "AltCLIPProcessor"), + ("aria", "AriaProcessor"), + ("aya_vision", "AyaVisionProcessor"), + ("bark", "BarkProcessor"), + ("blip", "BlipProcessor"), + ("blip-2", "Blip2Processor"), + ("bridgetower", "BridgeTowerProcessor"), + ("chameleon", "ChameleonProcessor"), + ("chinese_clip", "ChineseCLIPProcessor"), + ("clap", "ClapProcessor"), + ("clip", "CLIPProcessor"), + ("clipseg", "CLIPSegProcessor"), + ("clvp", "ClvpProcessor"), + ("colpali", "ColPaliProcessor"), + ("emu3", "Emu3Processor"), + ("flava", "FlavaProcessor"), + ("fuyu", "FuyuProcessor"), + ("gemma3", "Gemma3Processor"), + ("git", "GitProcessor"), + ("got_ocr2", "GotOcr2Processor"), + ("granite_speech", "GraniteSpeechProcessor"), + ("grounding-dino", "GroundingDinoProcessor"), + ("groupvit", "CLIPProcessor"), + ("hubert", "Wav2Vec2Processor"), + ("idefics", "IdeficsProcessor"), + ("idefics2", "Idefics2Processor"), + ("idefics3", "Idefics3Processor"), + ("instructblip", "InstructBlipProcessor"), + ("instructblipvideo", "InstructBlipVideoProcessor"), + ("internvl", "InternVLProcessor"), + ("janus", "JanusProcessor"), + ("kosmos-2", "Kosmos2Processor"), + ("layoutlmv2", "LayoutLMv2Processor"), + ("layoutlmv3", "LayoutLMv3Processor"), + ("llama4", "Llama4Processor"), + ("llava", "LlavaProcessor"), + ("llava_next", "LlavaNextProcessor"), + ("llava_next_video", "LlavaNextVideoProcessor"), + ("llava_onevision", "LlavaOnevisionProcessor"), + ("markuplm", "MarkupLMProcessor"), + ("mctct", "MCTCTProcessor"), + ("mgp-str", "MgpstrProcessor"), + ("mistral3", "PixtralProcessor"), + ("mllama", "MllamaProcessor"), + ("moonshine", "Wav2Vec2Processor"), + ("oneformer", "OneFormerProcessor"), + ("owlv2", "Owlv2Processor"), + ("owlvit", "OwlViTProcessor"), + ("paligemma", "PaliGemmaProcessor"), + ("phi4_multimodal", "Phi4MultimodalProcessor"), + ("pix2struct", "Pix2StructProcessor"), + ("pixtral", "PixtralProcessor"), + ("pop2piano", "Pop2PianoProcessor"), + ("qwen2_5_omni", "Qwen2_5OmniProcessor"), + ("qwen2_5_vl", "Qwen2_5_VLProcessor"), + ("qwen2_audio", "Qwen2AudioProcessor"), + ("qwen2_vl", "Qwen2VLProcessor"), + ("sam", "SamProcessor"), + ("seamless_m4t", "SeamlessM4TProcessor"), + ("sew", "Wav2Vec2Processor"), + ("sew-d", "Wav2Vec2Processor"), + ("shieldgemma2", "ShieldGemma2Processor"), + ("siglip", "SiglipProcessor"), + ("siglip2", "Siglip2Processor"), + ("speech_to_text", "Speech2TextProcessor"), + ("speech_to_text_2", "Speech2Text2Processor"), + ("speecht5", "SpeechT5Processor"), + ("trocr", "TrOCRProcessor"), + ("tvlt", "TvltProcessor"), + ("tvp", "TvpProcessor"), + ("udop", "UdopProcessor"), + ("unispeech", "Wav2Vec2Processor"), + ("unispeech-sat", "Wav2Vec2Processor"), + ("video_llava", "VideoLlavaProcessor"), + ("vilt", "ViltProcessor"), + ("vipllava", "LlavaProcessor"), + ("vision-text-dual-encoder", "VisionTextDualEncoderProcessor"), + ("wav2vec2", "Wav2Vec2Processor"), + ("wav2vec2-bert", "Wav2Vec2Processor"), + ("wav2vec2-conformer", "Wav2Vec2Processor"), + ("wavlm", "Wav2Vec2Processor"), + ("whisper", "WhisperProcessor"), + ("xclip", "XCLIPProcessor"), + ] +) + +PROCESSOR_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, PROCESSOR_MAPPING_NAMES) + + +def processor_class_from_name(class_name: str): + for module_name, processors in PROCESSOR_MAPPING_NAMES.items(): + if class_name in processors: + module_name = model_type_to_module_name(module_name) + + module = importlib.import_module(f".{module_name}", "transformers.models") + try: + return getattr(module, class_name) + except AttributeError: + continue + + for processor in PROCESSOR_MAPPING._extra_content.values(): + if getattr(processor, "__name__", None) == class_name: + return processor + + # We did not fine the class, but maybe it's because a dep is missing. In that case, the class will be in the main + # init and we return the proper dummy to get an appropriate error message. + main_module = importlib.import_module("transformers") + if hasattr(main_module, class_name): + return getattr(main_module, class_name) + + return None + + +class AutoProcessor: + r""" + This is a generic processor class that will be instantiated as one of the processor classes of the library when + created with the [`AutoProcessor.from_pretrained`] class method. + + This class cannot be instantiated directly using `__init__()` (throws an error). + """ + + def __init__(self): + raise EnvironmentError( + "AutoProcessor is designed to be instantiated " + "using the `AutoProcessor.from_pretrained(pretrained_model_name_or_path)` method." + ) + + @classmethod + @replace_list_option_in_docstrings(PROCESSOR_MAPPING_NAMES) + def from_pretrained(cls, pretrained_model_name_or_path, **kwargs): + r""" + Instantiate one of the processor classes of the library from a pretrained model vocabulary. + + The processor class to instantiate is selected based on the `model_type` property of the config object (either + passed as an argument or loaded from `pretrained_model_name_or_path` if possible): + + List options + + Params: + pretrained_model_name_or_path (`str` or `os.PathLike`): + This can be either: + + - a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on + huggingface.co. + - a path to a *directory* containing a processor files saved using the `save_pretrained()` method, + e.g., `./my_model_directory/`. + cache_dir (`str` or `os.PathLike`, *optional*): + Path to a directory in which a downloaded pretrained model feature extractor should be cached if the + standard cache should not be used. + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force to (re-)download the feature extractor files and override the cached versions + if they exist. + resume_download: + Deprecated and ignored. All downloads are now resumed by default when possible. + Will be removed in v5 of Transformers. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. + token (`str` or *bool*, *optional*): + The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated + when running `huggingface-cli login` (stored in `~/.huggingface`). + revision (`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a + git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any + identifier allowed by git. + return_unused_kwargs (`bool`, *optional*, defaults to `False`): + If `False`, then this function returns just the final feature extractor object. If `True`, then this + functions returns a `Tuple(feature_extractor, unused_kwargs)` where *unused_kwargs* is a dictionary + consisting of the key/value pairs whose keys are not feature extractor attributes: i.e., the part of + `kwargs` which has not been used to update `feature_extractor` and is otherwise ignored. + trust_remote_code (`bool`, *optional*, defaults to `False`): + Whether or not to allow for custom models defined on the Hub in their own modeling files. This option + should only be set to `True` for repositories you trust and in which you have read the code, as it will + execute code present on the Hub on your local machine. + kwargs (`Dict[str, Any]`, *optional*): + The values in kwargs of any keys which are feature extractor attributes will be used to override the + loaded values. Behavior concerning key/value pairs whose keys are *not* feature extractor attributes is + controlled by the `return_unused_kwargs` keyword parameter. + + + + Passing `token=True` is required when you want to use a private model. + + + + Examples: + + ```python + >>> from transformers import AutoProcessor + + >>> # Download processor from huggingface.co and cache. + >>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h") + + >>> # If processor files are in a directory (e.g. processor was saved using *save_pretrained('./test/saved_model/')*) + >>> # processor = AutoProcessor.from_pretrained("./test/saved_model/") + ```""" + use_auth_token = kwargs.pop("use_auth_token", None) + if use_auth_token is not None: + warnings.warn( + "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.", + FutureWarning, + ) + if kwargs.get("token", None) is not None: + raise ValueError( + "`token` and `use_auth_token` are both specified. Please set only the argument `token`." + ) + kwargs["token"] = use_auth_token + + config = kwargs.pop("config", None) + trust_remote_code = kwargs.pop("trust_remote_code", None) + kwargs["_from_auto"] = True + + processor_class = None + processor_auto_map = None + + # First, let's see if we have a processor or preprocessor config. + # Filter the kwargs for `cached_file`. + cached_file_kwargs = { + key: kwargs[key] for key in inspect.signature(cached_file).parameters.keys() if key in kwargs + } + # We don't want to raise + cached_file_kwargs.update( + { + "_raise_exceptions_for_gated_repo": False, + "_raise_exceptions_for_missing_entries": False, + "_raise_exceptions_for_connection_errors": False, + } + ) + + # Let's start by checking whether the processor class is saved in a processor config + processor_config_file = cached_file(pretrained_model_name_or_path, PROCESSOR_NAME, **cached_file_kwargs) + if processor_config_file is not None: + config_dict, _ = ProcessorMixin.get_processor_dict(pretrained_model_name_or_path, **kwargs) + processor_class = config_dict.get("processor_class", None) + if "AutoProcessor" in config_dict.get("auto_map", {}): + processor_auto_map = config_dict["auto_map"]["AutoProcessor"] + + if processor_class is None: + # If not found, let's check whether the processor class is saved in an image processor config + preprocessor_config_file = cached_file( + pretrained_model_name_or_path, FEATURE_EXTRACTOR_NAME, **cached_file_kwargs + ) + if preprocessor_config_file is not None: + config_dict, _ = ImageProcessingMixin.get_image_processor_dict(pretrained_model_name_or_path, **kwargs) + processor_class = config_dict.get("processor_class", None) + if "AutoProcessor" in config_dict.get("auto_map", {}): + processor_auto_map = config_dict["auto_map"]["AutoProcessor"] + + # If not found, let's check whether the processor class is saved in a feature extractor config + if preprocessor_config_file is not None and processor_class is None: + config_dict, _ = FeatureExtractionMixin.get_feature_extractor_dict( + pretrained_model_name_or_path, **kwargs + ) + processor_class = config_dict.get("processor_class", None) + if "AutoProcessor" in config_dict.get("auto_map", {}): + processor_auto_map = config_dict["auto_map"]["AutoProcessor"] + + if processor_class is None: + # Next, let's check whether the processor class is saved in a tokenizer + tokenizer_config_file = cached_file( + pretrained_model_name_or_path, TOKENIZER_CONFIG_FILE, **cached_file_kwargs + ) + if tokenizer_config_file is not None: + with open(tokenizer_config_file, encoding="utf-8") as reader: + config_dict = json.load(reader) + + processor_class = config_dict.get("processor_class", None) + if "AutoProcessor" in config_dict.get("auto_map", {}): + processor_auto_map = config_dict["auto_map"]["AutoProcessor"] + + if processor_class is None: + # Otherwise, load config, if it can be loaded. + if not isinstance(config, PretrainedConfig): + config = AutoConfig.from_pretrained( + pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs + ) + + # And check if the config contains the processor class. + processor_class = getattr(config, "processor_class", None) + if hasattr(config, "auto_map") and "AutoProcessor" in config.auto_map: + processor_auto_map = config.auto_map["AutoProcessor"] + + if processor_class is not None: + processor_class = processor_class_from_name(processor_class) + + has_remote_code = processor_auto_map is not None + has_local_code = processor_class is not None or type(config) in PROCESSOR_MAPPING + trust_remote_code = resolve_trust_remote_code( + trust_remote_code, pretrained_model_name_or_path, has_local_code, has_remote_code + ) + + if has_remote_code and trust_remote_code: + processor_class = get_class_from_dynamic_module( + processor_auto_map, pretrained_model_name_or_path, **kwargs + ) + _ = kwargs.pop("code_revision", None) + if os.path.isdir(pretrained_model_name_or_path): + processor_class.register_for_auto_class() + return processor_class.from_pretrained( + pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs + ) + elif processor_class is not None: + return processor_class.from_pretrained( + pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs + ) + # Last try: we use the PROCESSOR_MAPPING. + elif type(config) in PROCESSOR_MAPPING: + return PROCESSOR_MAPPING[type(config)].from_pretrained(pretrained_model_name_or_path, **kwargs) + + # At this stage, there doesn't seem to be a `Processor` class available for this model, so let's try a + # tokenizer. + try: + return AutoTokenizer.from_pretrained( + pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs + ) + except Exception: + try: + return AutoImageProcessor.from_pretrained( + pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs + ) + except Exception: + pass + + try: + return AutoFeatureExtractor.from_pretrained( + pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs + ) + except Exception: + pass + + raise ValueError( + f"Unrecognized processing class in {pretrained_model_name_or_path}. Can't instantiate a processor, a " + "tokenizer, an image processor or a feature extractor for this model. Make sure the repository contains " + "the files of at least one of those processing classes." + ) + + @staticmethod + def register(config_class, processor_class, exist_ok=False): + """ + Register a new processor for this class. + + Args: + config_class ([`PretrainedConfig`]): + The configuration corresponding to the model to register. + processor_class ([`ProcessorMixin`]): The processor to register. + """ + PROCESSOR_MAPPING.register(config_class, processor_class, exist_ok=exist_ok) + + +__all__ = ["PROCESSOR_MAPPING", "AutoProcessor"] diff --git a/docs/transformers/build/lib/transformers/models/auto/tokenization_auto.py b/docs/transformers/build/lib/transformers/models/auto/tokenization_auto.py new file mode 100644 index 0000000000000000000000000000000000000000..5da5fb73f4d6a620c7d621fe221ae130b5f7714f --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/auto/tokenization_auto.py @@ -0,0 +1,1091 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Auto Tokenizer class.""" + +import importlib +import json +import os +import warnings +from collections import OrderedDict +from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union + +from ...configuration_utils import PretrainedConfig +from ...dynamic_module_utils import get_class_from_dynamic_module, resolve_trust_remote_code +from ...modeling_gguf_pytorch_utils import load_gguf_checkpoint +from ...tokenization_utils import PreTrainedTokenizer +from ...tokenization_utils_base import TOKENIZER_CONFIG_FILE +from ...utils import ( + cached_file, + extract_commit_hash, + is_g2p_en_available, + is_sentencepiece_available, + is_tokenizers_available, + logging, +) +from ..encoder_decoder import EncoderDecoderConfig +from .auto_factory import _LazyAutoMapping +from .configuration_auto import ( + CONFIG_MAPPING_NAMES, + AutoConfig, + config_class_to_model_type, + model_type_to_module_name, + replace_list_option_in_docstrings, +) + + +if is_tokenizers_available(): + from ...tokenization_utils_fast import PreTrainedTokenizerFast +else: + PreTrainedTokenizerFast = None + + +logger = logging.get_logger(__name__) + +if TYPE_CHECKING: + # This significantly improves completion suggestion performance when + # the transformers package is used with Microsoft's Pylance language server. + TOKENIZER_MAPPING_NAMES: OrderedDict[str, Tuple[Optional[str], Optional[str]]] = OrderedDict() +else: + TOKENIZER_MAPPING_NAMES = OrderedDict( + [ + ( + "albert", + ( + "AlbertTokenizer" if is_sentencepiece_available() else None, + "AlbertTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ("align", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), + ("aria", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), + ("aya_vision", (None, "CohereTokenizerFast" if is_tokenizers_available() else None)), + ("bark", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), + ("bart", ("BartTokenizer", "BartTokenizerFast")), + ( + "barthez", + ( + "BarthezTokenizer" if is_sentencepiece_available() else None, + "BarthezTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ("bartpho", ("BartphoTokenizer", None)), + ("bert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), + ("bert-generation", ("BertGenerationTokenizer" if is_sentencepiece_available() else None, None)), + ("bert-japanese", ("BertJapaneseTokenizer", None)), + ("bertweet", ("BertweetTokenizer", None)), + ( + "big_bird", + ( + "BigBirdTokenizer" if is_sentencepiece_available() else None, + "BigBirdTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ("bigbird_pegasus", ("PegasusTokenizer", "PegasusTokenizerFast" if is_tokenizers_available() else None)), + ("biogpt", ("BioGptTokenizer", None)), + ("blenderbot", ("BlenderbotTokenizer", "BlenderbotTokenizerFast")), + ("blenderbot-small", ("BlenderbotSmallTokenizer", None)), + ("blip", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), + ("blip-2", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), + ("bloom", (None, "BloomTokenizerFast" if is_tokenizers_available() else None)), + ("bridgetower", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)), + ("bros", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), + ("byt5", ("ByT5Tokenizer", None)), + ( + "camembert", + ( + "CamembertTokenizer" if is_sentencepiece_available() else None, + "CamembertTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ("canine", ("CanineTokenizer", None)), + ( + "chameleon", + ( + "LlamaTokenizer" if is_sentencepiece_available() else None, + "LlamaTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ("chinese_clip", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), + ( + "clap", + ( + "RobertaTokenizer", + "RobertaTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "clip", + ( + "CLIPTokenizer", + "CLIPTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "clipseg", + ( + "CLIPTokenizer", + "CLIPTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ("clvp", ("ClvpTokenizer", None)), + ( + "code_llama", + ( + "CodeLlamaTokenizer" if is_sentencepiece_available() else None, + "CodeLlamaTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ("codegen", ("CodeGenTokenizer", "CodeGenTokenizerFast" if is_tokenizers_available() else None)), + ("cohere", (None, "CohereTokenizerFast" if is_tokenizers_available() else None)), + ("cohere2", (None, "CohereTokenizerFast" if is_tokenizers_available() else None)), + ("colpali", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), + ("convbert", ("ConvBertTokenizer", "ConvBertTokenizerFast" if is_tokenizers_available() else None)), + ( + "cpm", + ( + "CpmTokenizer" if is_sentencepiece_available() else None, + "CpmTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ("cpmant", ("CpmAntTokenizer", None)), + ("ctrl", ("CTRLTokenizer", None)), + ("data2vec-audio", ("Wav2Vec2CTCTokenizer", None)), + ("data2vec-text", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)), + ("dbrx", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), + ("deberta", ("DebertaTokenizer", "DebertaTokenizerFast" if is_tokenizers_available() else None)), + ( + "deberta-v2", + ( + "DebertaV2Tokenizer" if is_sentencepiece_available() else None, + "DebertaV2TokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "deepseek_v3", + ( + "LlamaTokenizer" if is_sentencepiece_available() else None, + "LlamaTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "diffllama", + ( + "LlamaTokenizer" if is_sentencepiece_available() else None, + "LlamaTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ("distilbert", ("DistilBertTokenizer", "DistilBertTokenizerFast" if is_tokenizers_available() else None)), + ( + "dpr", + ( + "DPRQuestionEncoderTokenizer", + "DPRQuestionEncoderTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ("electra", ("ElectraTokenizer", "ElectraTokenizerFast" if is_tokenizers_available() else None)), + ("emu3", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), + ("ernie", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), + ("ernie_m", ("ErnieMTokenizer" if is_sentencepiece_available() else None, None)), + ("esm", ("EsmTokenizer", None)), + ("falcon", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), + ("falcon_mamba", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)), + ( + "fastspeech2_conformer", + ("FastSpeech2ConformerTokenizer" if is_g2p_en_available() else None, None), + ), + ("flaubert", ("FlaubertTokenizer", None)), + ("fnet", ("FNetTokenizer", "FNetTokenizerFast" if is_tokenizers_available() else None)), + ("fsmt", ("FSMTTokenizer", None)), + ("funnel", ("FunnelTokenizer", "FunnelTokenizerFast" if is_tokenizers_available() else None)), + ( + "gemma", + ( + "GemmaTokenizer" if is_sentencepiece_available() else None, + "GemmaTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "gemma2", + ( + "GemmaTokenizer" if is_sentencepiece_available() else None, + "GemmaTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "gemma3", + ( + "GemmaTokenizer" if is_sentencepiece_available() else None, + "GemmaTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "gemma3_text", + ( + "GemmaTokenizer" if is_sentencepiece_available() else None, + "GemmaTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ("git", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), + ("glm", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), + ("glm4", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), + ("gpt-sw3", ("GPTSw3Tokenizer" if is_sentencepiece_available() else None, None)), + ("gpt2", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), + ("gpt_bigcode", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), + ("gpt_neo", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), + ("gpt_neox", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)), + ("gpt_neox_japanese", ("GPTNeoXJapaneseTokenizer", None)), + ("gptj", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), + ("gptsan-japanese", ("GPTSanJapaneseTokenizer", None)), + ("grounding-dino", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), + ("groupvit", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)), + ("helium", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), + ("herbert", ("HerbertTokenizer", "HerbertTokenizerFast" if is_tokenizers_available() else None)), + ("hubert", ("Wav2Vec2CTCTokenizer", None)), + ("ibert", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)), + ("idefics", (None, "LlamaTokenizerFast" if is_tokenizers_available() else None)), + ("idefics2", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), + ("idefics3", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), + ("instructblip", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), + ("instructblipvideo", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), + ("internvl", ("Qwen2Tokenizer", "Qwen2TokenizerFast" if is_tokenizers_available() else None)), + ( + "jamba", + ( + "LlamaTokenizer" if is_sentencepiece_available() else None, + "LlamaTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ("janus", (None, "LlamaTokenizerFast" if is_tokenizers_available() else None)), + ( + "jetmoe", + ( + "LlamaTokenizer" if is_sentencepiece_available() else None, + "LlamaTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ("jukebox", ("JukeboxTokenizer", None)), + ( + "kosmos-2", + ( + "XLMRobertaTokenizer" if is_sentencepiece_available() else None, + "XLMRobertaTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ("layoutlm", ("LayoutLMTokenizer", "LayoutLMTokenizerFast" if is_tokenizers_available() else None)), + ("layoutlmv2", ("LayoutLMv2Tokenizer", "LayoutLMv2TokenizerFast" if is_tokenizers_available() else None)), + ("layoutlmv3", ("LayoutLMv3Tokenizer", "LayoutLMv3TokenizerFast" if is_tokenizers_available() else None)), + ("layoutxlm", ("LayoutXLMTokenizer", "LayoutXLMTokenizerFast" if is_tokenizers_available() else None)), + ("led", ("LEDTokenizer", "LEDTokenizerFast" if is_tokenizers_available() else None)), + ("lilt", ("LayoutLMv3Tokenizer", "LayoutLMv3TokenizerFast" if is_tokenizers_available() else None)), + ( + "llama", + ( + "LlamaTokenizer" if is_sentencepiece_available() else None, + "LlamaTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "llama4", + ( + "LlamaTokenizer" if is_sentencepiece_available() else None, + "LlamaTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "llama4_text", + ( + "LlamaTokenizer" if is_sentencepiece_available() else None, + "LlamaTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ("llava", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), + ("llava_next", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), + ("llava_next_video", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), + ("llava_onevision", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), + ("longformer", ("LongformerTokenizer", "LongformerTokenizerFast" if is_tokenizers_available() else None)), + ( + "longt5", + ( + "T5Tokenizer" if is_sentencepiece_available() else None, + "T5TokenizerFast" if is_tokenizers_available() else None, + ), + ), + ("luke", ("LukeTokenizer", None)), + ("lxmert", ("LxmertTokenizer", "LxmertTokenizerFast" if is_tokenizers_available() else None)), + ("m2m_100", ("M2M100Tokenizer" if is_sentencepiece_available() else None, None)), + ("mamba", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)), + ("mamba2", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)), + ("marian", ("MarianTokenizer" if is_sentencepiece_available() else None, None)), + ( + "mbart", + ( + "MBartTokenizer" if is_sentencepiece_available() else None, + "MBartTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "mbart50", + ( + "MBart50Tokenizer" if is_sentencepiece_available() else None, + "MBart50TokenizerFast" if is_tokenizers_available() else None, + ), + ), + ("mega", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)), + ("megatron-bert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), + ("mgp-str", ("MgpstrTokenizer", None)), + ( + "mistral", + ( + "LlamaTokenizer" if is_sentencepiece_available() else None, + "LlamaTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "mixtral", + ( + "LlamaTokenizer" if is_sentencepiece_available() else None, + "LlamaTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ("mllama", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), + ("mluke", ("MLukeTokenizer" if is_sentencepiece_available() else None, None)), + ("mobilebert", ("MobileBertTokenizer", "MobileBertTokenizerFast" if is_tokenizers_available() else None)), + ("modernbert", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), + ("moonshine", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), + ("moshi", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), + ("mpnet", ("MPNetTokenizer", "MPNetTokenizerFast" if is_tokenizers_available() else None)), + ("mpt", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)), + ("mra", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)), + ( + "mt5", + ( + "MT5Tokenizer" if is_sentencepiece_available() else None, + "MT5TokenizerFast" if is_tokenizers_available() else None, + ), + ), + ("musicgen", ("T5Tokenizer", "T5TokenizerFast" if is_tokenizers_available() else None)), + ("musicgen_melody", ("T5Tokenizer", "T5TokenizerFast" if is_tokenizers_available() else None)), + ("mvp", ("MvpTokenizer", "MvpTokenizerFast" if is_tokenizers_available() else None)), + ("myt5", ("MyT5Tokenizer", None)), + ("nemotron", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), + ("nezha", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), + ( + "nllb", + ( + "NllbTokenizer" if is_sentencepiece_available() else None, + "NllbTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "nllb-moe", + ( + "NllbTokenizer" if is_sentencepiece_available() else None, + "NllbTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "nystromformer", + ( + "AlbertTokenizer" if is_sentencepiece_available() else None, + "AlbertTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ("olmo", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)), + ("olmo2", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)), + ("olmoe", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)), + ( + "omdet-turbo", + ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None), + ), + ("oneformer", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)), + ( + "openai-gpt", + ("OpenAIGPTTokenizer", "OpenAIGPTTokenizerFast" if is_tokenizers_available() else None), + ), + ("opt", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), + ("owlv2", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)), + ("owlvit", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)), + ("paligemma", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), + ( + "pegasus", + ( + "PegasusTokenizer" if is_sentencepiece_available() else None, + "PegasusTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "pegasus_x", + ( + "PegasusTokenizer" if is_sentencepiece_available() else None, + "PegasusTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "perceiver", + ( + "PerceiverTokenizer", + None, + ), + ), + ( + "persimmon", + ( + "LlamaTokenizer" if is_sentencepiece_available() else None, + "LlamaTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ("phi", ("CodeGenTokenizer", "CodeGenTokenizerFast" if is_tokenizers_available() else None)), + ("phi3", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), + ("phimoe", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), + ("phobert", ("PhobertTokenizer", None)), + ("pix2struct", ("T5Tokenizer", "T5TokenizerFast" if is_tokenizers_available() else None)), + ("pixtral", (None, "PreTrainedTokenizerFast" if is_tokenizers_available() else None)), + ("plbart", ("PLBartTokenizer" if is_sentencepiece_available() else None, None)), + ("prophetnet", ("ProphetNetTokenizer", None)), + ("qdqbert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), + ( + "qwen2", + ( + "Qwen2Tokenizer", + "Qwen2TokenizerFast" if is_tokenizers_available() else None, + ), + ), + ("qwen2_5_omni", ("Qwen2Tokenizer", "Qwen2TokenizerFast" if is_tokenizers_available() else None)), + ("qwen2_5_vl", ("Qwen2Tokenizer", "Qwen2TokenizerFast" if is_tokenizers_available() else None)), + ("qwen2_audio", ("Qwen2Tokenizer", "Qwen2TokenizerFast" if is_tokenizers_available() else None)), + ( + "qwen2_moe", + ( + "Qwen2Tokenizer", + "Qwen2TokenizerFast" if is_tokenizers_available() else None, + ), + ), + ("qwen2_vl", ("Qwen2Tokenizer", "Qwen2TokenizerFast" if is_tokenizers_available() else None)), + ( + "qwen3", + ( + "Qwen2Tokenizer", + "Qwen2TokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "qwen3_moe", + ( + "Qwen2Tokenizer", + "Qwen2TokenizerFast" if is_tokenizers_available() else None, + ), + ), + ("rag", ("RagTokenizer", None)), + ("realm", ("RealmTokenizer", "RealmTokenizerFast" if is_tokenizers_available() else None)), + ( + "recurrent_gemma", + ( + "GemmaTokenizer" if is_sentencepiece_available() else None, + "GemmaTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "reformer", + ( + "ReformerTokenizer" if is_sentencepiece_available() else None, + "ReformerTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "rembert", + ( + "RemBertTokenizer" if is_sentencepiece_available() else None, + "RemBertTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ("retribert", ("RetriBertTokenizer", "RetriBertTokenizerFast" if is_tokenizers_available() else None)), + ("roberta", ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None)), + ( + "roberta-prelayernorm", + ("RobertaTokenizer", "RobertaTokenizerFast" if is_tokenizers_available() else None), + ), + ("roc_bert", ("RoCBertTokenizer", None)), + ("roformer", ("RoFormerTokenizer", "RoFormerTokenizerFast" if is_tokenizers_available() else None)), + ("rwkv", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)), + ( + "seamless_m4t", + ( + "SeamlessM4TTokenizer" if is_sentencepiece_available() else None, + "SeamlessM4TTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "seamless_m4t_v2", + ( + "SeamlessM4TTokenizer" if is_sentencepiece_available() else None, + "SeamlessM4TTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "shieldgemma2", + ( + "GemmaTokenizer" if is_sentencepiece_available() else None, + "GemmaTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ("siglip", ("SiglipTokenizer" if is_sentencepiece_available() else None, None)), + ( + "siglip2", + ( + "GemmaTokenizer" if is_sentencepiece_available() else None, + "GemmaTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ("speech_to_text", ("Speech2TextTokenizer" if is_sentencepiece_available() else None, None)), + ("speech_to_text_2", ("Speech2Text2Tokenizer", None)), + ("speecht5", ("SpeechT5Tokenizer" if is_sentencepiece_available() else None, None)), + ("splinter", ("SplinterTokenizer", "SplinterTokenizerFast")), + ( + "squeezebert", + ("SqueezeBertTokenizer", "SqueezeBertTokenizerFast" if is_tokenizers_available() else None), + ), + ("stablelm", (None, "GPTNeoXTokenizerFast" if is_tokenizers_available() else None)), + ("starcoder2", ("GPT2Tokenizer", "GPT2TokenizerFast" if is_tokenizers_available() else None)), + ( + "switch_transformers", + ( + "T5Tokenizer" if is_sentencepiece_available() else None, + "T5TokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "t5", + ( + "T5Tokenizer" if is_sentencepiece_available() else None, + "T5TokenizerFast" if is_tokenizers_available() else None, + ), + ), + ("tapas", ("TapasTokenizer", None)), + ("tapex", ("TapexTokenizer", None)), + ("transfo-xl", ("TransfoXLTokenizer", None)), + ("tvp", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), + ( + "udop", + ( + "UdopTokenizer" if is_sentencepiece_available() else None, + "UdopTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "umt5", + ( + "T5Tokenizer" if is_sentencepiece_available() else None, + "T5TokenizerFast" if is_tokenizers_available() else None, + ), + ), + ("video_llava", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), + ("vilt", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), + ("vipllava", ("LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None)), + ("visual_bert", ("BertTokenizer", "BertTokenizerFast" if is_tokenizers_available() else None)), + ("vits", ("VitsTokenizer", None)), + ("wav2vec2", ("Wav2Vec2CTCTokenizer", None)), + ("wav2vec2-bert", ("Wav2Vec2CTCTokenizer", None)), + ("wav2vec2-conformer", ("Wav2Vec2CTCTokenizer", None)), + ("wav2vec2_phoneme", ("Wav2Vec2PhonemeCTCTokenizer", None)), + ("whisper", ("WhisperTokenizer", "WhisperTokenizerFast" if is_tokenizers_available() else None)), + ("xclip", ("CLIPTokenizer", "CLIPTokenizerFast" if is_tokenizers_available() else None)), + ( + "xglm", + ( + "XGLMTokenizer" if is_sentencepiece_available() else None, + "XGLMTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ("xlm", ("XLMTokenizer", None)), + ("xlm-prophetnet", ("XLMProphetNetTokenizer" if is_sentencepiece_available() else None, None)), + ( + "xlm-roberta", + ( + "XLMRobertaTokenizer" if is_sentencepiece_available() else None, + "XLMRobertaTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "xlm-roberta-xl", + ( + "XLMRobertaTokenizer" if is_sentencepiece_available() else None, + "XLMRobertaTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "xlnet", + ( + "XLNetTokenizer" if is_sentencepiece_available() else None, + "XLNetTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "xmod", + ( + "XLMRobertaTokenizer" if is_sentencepiece_available() else None, + "XLMRobertaTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "yoso", + ( + "AlbertTokenizer" if is_sentencepiece_available() else None, + "AlbertTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "zamba", + ( + "LlamaTokenizer" if is_sentencepiece_available() else None, + "LlamaTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "zamba2", + ( + "LlamaTokenizer" if is_sentencepiece_available() else None, + "LlamaTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ] + ) + +TOKENIZER_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, TOKENIZER_MAPPING_NAMES) + +CONFIG_TO_TYPE = {v: k for k, v in CONFIG_MAPPING_NAMES.items()} + + +def tokenizer_class_from_name(class_name: str): + if class_name == "PreTrainedTokenizerFast": + return PreTrainedTokenizerFast + + for module_name, tokenizers in TOKENIZER_MAPPING_NAMES.items(): + if class_name in tokenizers: + module_name = model_type_to_module_name(module_name) + + module = importlib.import_module(f".{module_name}", "transformers.models") + try: + return getattr(module, class_name) + except AttributeError: + continue + + for config, tokenizers in TOKENIZER_MAPPING._extra_content.items(): + for tokenizer in tokenizers: + if getattr(tokenizer, "__name__", None) == class_name: + return tokenizer + + # We did not fine the class, but maybe it's because a dep is missing. In that case, the class will be in the main + # init and we return the proper dummy to get an appropriate error message. + main_module = importlib.import_module("transformers") + if hasattr(main_module, class_name): + return getattr(main_module, class_name) + + return None + + +def get_tokenizer_config( + pretrained_model_name_or_path: Union[str, os.PathLike], + cache_dir: Optional[Union[str, os.PathLike]] = None, + force_download: bool = False, + resume_download: Optional[bool] = None, + proxies: Optional[Dict[str, str]] = None, + token: Optional[Union[bool, str]] = None, + revision: Optional[str] = None, + local_files_only: bool = False, + subfolder: str = "", + **kwargs, +): + """ + Loads the tokenizer configuration from a pretrained model tokenizer configuration. + + Args: + pretrained_model_name_or_path (`str` or `os.PathLike`): + This can be either: + + - a string, the *model id* of a pretrained model configuration hosted inside a model repo on + huggingface.co. + - a path to a *directory* containing a configuration file saved using the + [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`. + + cache_dir (`str` or `os.PathLike`, *optional*): + Path to a directory in which a downloaded pretrained model configuration should be cached if the standard + cache should not be used. + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force to (re-)download the configuration files and override the cached versions if they + exist. + resume_download: + Deprecated and ignored. All downloads are now resumed by default when possible. + Will be removed in v5 of Transformers. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. + token (`str` or *bool*, *optional*): + The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated + when running `huggingface-cli login` (stored in `~/.huggingface`). + revision (`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a + git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any + identifier allowed by git. + local_files_only (`bool`, *optional*, defaults to `False`): + If `True`, will only try to load the tokenizer configuration from local files. + subfolder (`str`, *optional*, defaults to `""`): + In case the tokenizer config is located inside a subfolder of the model repo on huggingface.co, you can + specify the folder name here. + + + + Passing `token=True` is required when you want to use a private model. + + + + Returns: + `Dict`: The configuration of the tokenizer. + + Examples: + + ```python + # Download configuration from huggingface.co and cache. + tokenizer_config = get_tokenizer_config("google-bert/bert-base-uncased") + # This model does not have a tokenizer config so the result will be an empty dict. + tokenizer_config = get_tokenizer_config("FacebookAI/xlm-roberta-base") + + # Save a pretrained tokenizer locally and you can reload its config + from transformers import AutoTokenizer + + tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased") + tokenizer.save_pretrained("tokenizer-test") + tokenizer_config = get_tokenizer_config("tokenizer-test") + ```""" + use_auth_token = kwargs.pop("use_auth_token", None) + if use_auth_token is not None: + warnings.warn( + "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.", + FutureWarning, + ) + if token is not None: + raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.") + token = use_auth_token + + commit_hash = kwargs.get("_commit_hash", None) + resolved_config_file = cached_file( + pretrained_model_name_or_path, + TOKENIZER_CONFIG_FILE, + cache_dir=cache_dir, + force_download=force_download, + resume_download=resume_download, + proxies=proxies, + token=token, + revision=revision, + local_files_only=local_files_only, + subfolder=subfolder, + _raise_exceptions_for_gated_repo=False, + _raise_exceptions_for_missing_entries=False, + _raise_exceptions_for_connection_errors=False, + _commit_hash=commit_hash, + ) + if resolved_config_file is None: + logger.info("Could not locate the tokenizer configuration file, will try to use the model config instead.") + return {} + commit_hash = extract_commit_hash(resolved_config_file, commit_hash) + + with open(resolved_config_file, encoding="utf-8") as reader: + result = json.load(reader) + result["_commit_hash"] = commit_hash + return result + + +class AutoTokenizer: + r""" + This is a generic tokenizer class that will be instantiated as one of the tokenizer classes of the library when + created with the [`AutoTokenizer.from_pretrained`] class method. + + This class cannot be instantiated directly using `__init__()` (throws an error). + """ + + def __init__(self): + raise EnvironmentError( + "AutoTokenizer is designed to be instantiated " + "using the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)` method." + ) + + @classmethod + @replace_list_option_in_docstrings(TOKENIZER_MAPPING_NAMES) + def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): + r""" + Instantiate one of the tokenizer classes of the library from a pretrained model vocabulary. + + The tokenizer class to instantiate is selected based on the `model_type` property of the config object (either + passed as an argument or loaded from `pretrained_model_name_or_path` if possible), or when it's missing, by + falling back to using pattern matching on `pretrained_model_name_or_path`: + + List options + + Params: + pretrained_model_name_or_path (`str` or `os.PathLike`): + Can be either: + + - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co. + - A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved + using the [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`. + - A path or url to a single saved vocabulary file if and only if the tokenizer only requires a + single vocabulary file (like Bert or XLNet), e.g.: `./my_model_directory/vocab.txt`. (Not + applicable to all derived classes) + inputs (additional positional arguments, *optional*): + Will be passed along to the Tokenizer `__init__()` method. + config ([`PretrainedConfig`], *optional*) + The configuration object used to determine the tokenizer class to instantiate. + cache_dir (`str` or `os.PathLike`, *optional*): + Path to a directory in which a downloaded pretrained model configuration should be cached if the + standard cache should not be used. + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force the (re-)download the model weights and configuration files and override the + cached versions if they exist. + resume_download: + Deprecated and ignored. All downloads are now resumed by default when possible. + Will be removed in v5 of Transformers. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. + revision (`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a + git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any + identifier allowed by git. + subfolder (`str`, *optional*): + In case the relevant files are located inside a subfolder of the model repo on huggingface.co (e.g. for + facebook/rag-token-base), specify it here. + use_fast (`bool`, *optional*, defaults to `True`): + Use a [fast Rust-based tokenizer](https://huggingface.co/docs/tokenizers/index) if it is supported for + a given model. If a fast tokenizer is not available for a given model, a normal Python-based tokenizer + is returned instead. + tokenizer_type (`str`, *optional*): + Tokenizer type to be loaded. + trust_remote_code (`bool`, *optional*, defaults to `False`): + Whether or not to allow for custom models defined on the Hub in their own modeling files. This option + should only be set to `True` for repositories you trust and in which you have read the code, as it will + execute code present on the Hub on your local machine. + kwargs (additional keyword arguments, *optional*): + Will be passed to the Tokenizer `__init__()` method. Can be used to set special tokens like + `bos_token`, `eos_token`, `unk_token`, `sep_token`, `pad_token`, `cls_token`, `mask_token`, + `additional_special_tokens`. See parameters in the `__init__()` for more details. + + Examples: + + ```python + >>> from transformers import AutoTokenizer + + >>> # Download vocabulary from huggingface.co and cache. + >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased") + + >>> # Download vocabulary from huggingface.co (user-uploaded) and cache. + >>> tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-german-cased") + + >>> # If vocabulary files are in a directory (e.g. tokenizer was saved using *save_pretrained('./test/saved_model/')*) + >>> # tokenizer = AutoTokenizer.from_pretrained("./test/bert_saved_model/") + + >>> # Download vocabulary from huggingface.co and define model-specific arguments + >>> tokenizer = AutoTokenizer.from_pretrained("FacebookAI/roberta-base", add_prefix_space=True) + ```""" + use_auth_token = kwargs.pop("use_auth_token", None) + if use_auth_token is not None: + warnings.warn( + "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.", + FutureWarning, + ) + if kwargs.get("token", None) is not None: + raise ValueError( + "`token` and `use_auth_token` are both specified. Please set only the argument `token`." + ) + kwargs["token"] = use_auth_token + + config = kwargs.pop("config", None) + kwargs["_from_auto"] = True + + use_fast = kwargs.pop("use_fast", True) + tokenizer_type = kwargs.pop("tokenizer_type", None) + trust_remote_code = kwargs.pop("trust_remote_code", None) + gguf_file = kwargs.get("gguf_file", None) + + # First, let's see whether the tokenizer_type is passed so that we can leverage it + if tokenizer_type is not None: + tokenizer_class = None + tokenizer_class_tuple = TOKENIZER_MAPPING_NAMES.get(tokenizer_type, None) + + if tokenizer_class_tuple is None: + raise ValueError( + f"Passed `tokenizer_type` {tokenizer_type} does not exist. `tokenizer_type` should be one of " + f"{', '.join(c for c in TOKENIZER_MAPPING_NAMES.keys())}." + ) + + tokenizer_class_name, tokenizer_fast_class_name = tokenizer_class_tuple + + if use_fast: + if tokenizer_fast_class_name is not None: + tokenizer_class = tokenizer_class_from_name(tokenizer_fast_class_name) + else: + logger.warning( + "`use_fast` is set to `True` but the tokenizer class does not have a fast version. " + " Falling back to the slow version." + ) + if tokenizer_class is None: + tokenizer_class = tokenizer_class_from_name(tokenizer_class_name) + + if tokenizer_class is None: + raise ValueError(f"Tokenizer class {tokenizer_class_name} is not currently imported.") + + return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) + + # Next, let's try to use the tokenizer_config file to get the tokenizer class. + tokenizer_config = get_tokenizer_config(pretrained_model_name_or_path, **kwargs) + if "_commit_hash" in tokenizer_config: + kwargs["_commit_hash"] = tokenizer_config["_commit_hash"] + config_tokenizer_class = tokenizer_config.get("tokenizer_class") + tokenizer_auto_map = None + if "auto_map" in tokenizer_config: + if isinstance(tokenizer_config["auto_map"], (tuple, list)): + # Legacy format for dynamic tokenizers + tokenizer_auto_map = tokenizer_config["auto_map"] + else: + tokenizer_auto_map = tokenizer_config["auto_map"].get("AutoTokenizer", None) + + # If that did not work, let's try to use the config. + if config_tokenizer_class is None: + if not isinstance(config, PretrainedConfig): + if gguf_file: + gguf_path = cached_file(pretrained_model_name_or_path, gguf_file, **kwargs) + config_dict = load_gguf_checkpoint(gguf_path, return_tensors=False)["config"] + config = AutoConfig.for_model(**config_dict) + else: + config = AutoConfig.from_pretrained( + pretrained_model_name_or_path, trust_remote_code=trust_remote_code, **kwargs + ) + config_tokenizer_class = config.tokenizer_class + if hasattr(config, "auto_map") and "AutoTokenizer" in config.auto_map: + tokenizer_auto_map = config.auto_map["AutoTokenizer"] + + has_remote_code = tokenizer_auto_map is not None + has_local_code = type(config) in TOKENIZER_MAPPING or ( + config_tokenizer_class is not None + and ( + tokenizer_class_from_name(config_tokenizer_class) is not None + or tokenizer_class_from_name(config_tokenizer_class + "Fast") is not None + ) + ) + trust_remote_code = resolve_trust_remote_code( + trust_remote_code, pretrained_model_name_or_path, has_local_code, has_remote_code + ) + + if has_remote_code and trust_remote_code: + if use_fast and tokenizer_auto_map[1] is not None: + class_ref = tokenizer_auto_map[1] + else: + class_ref = tokenizer_auto_map[0] + tokenizer_class = get_class_from_dynamic_module(class_ref, pretrained_model_name_or_path, **kwargs) + _ = kwargs.pop("code_revision", None) + if os.path.isdir(pretrained_model_name_or_path): + tokenizer_class.register_for_auto_class() + return tokenizer_class.from_pretrained( + pretrained_model_name_or_path, *inputs, trust_remote_code=trust_remote_code, **kwargs + ) + elif config_tokenizer_class is not None: + tokenizer_class = None + if use_fast and not config_tokenizer_class.endswith("Fast"): + tokenizer_class_candidate = f"{config_tokenizer_class}Fast" + tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate) + if tokenizer_class is None: + tokenizer_class_candidate = config_tokenizer_class + tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate) + if tokenizer_class is None: + raise ValueError( + f"Tokenizer class {tokenizer_class_candidate} does not exist or is not currently imported." + ) + return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) + + # Otherwise we have to be creative. + # if model is an encoder decoder, the encoder tokenizer class is used by default + if isinstance(config, EncoderDecoderConfig): + if type(config.decoder) is not type(config.encoder): # noqa: E721 + logger.warning( + f"The encoder model config class: {config.encoder.__class__} is different from the decoder model " + f"config class: {config.decoder.__class__}. It is not recommended to use the " + "`AutoTokenizer.from_pretrained()` method in this case. Please use the encoder and decoder " + "specific tokenizer classes." + ) + config = config.encoder + + model_type = config_class_to_model_type(type(config).__name__) + if model_type is not None: + tokenizer_class_py, tokenizer_class_fast = TOKENIZER_MAPPING[type(config)] + + if tokenizer_class_fast and (use_fast or tokenizer_class_py is None): + return tokenizer_class_fast.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) + else: + if tokenizer_class_py is not None: + return tokenizer_class_py.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) + else: + raise ValueError( + "This tokenizer cannot be instantiated. Please make sure you have `sentencepiece` installed " + "in order to use this tokenizer." + ) + + raise ValueError( + f"Unrecognized configuration class {config.__class__} to build an AutoTokenizer.\n" + f"Model type should be one of {', '.join(c.__name__ for c in TOKENIZER_MAPPING.keys())}." + ) + + @staticmethod + def register(config_class, slow_tokenizer_class=None, fast_tokenizer_class=None, exist_ok=False): + """ + Register a new tokenizer in this mapping. + + + Args: + config_class ([`PretrainedConfig`]): + The configuration corresponding to the model to register. + slow_tokenizer_class ([`PretrainedTokenizer`], *optional*): + The slow tokenizer to register. + fast_tokenizer_class ([`PretrainedTokenizerFast`], *optional*): + The fast tokenizer to register. + """ + if slow_tokenizer_class is None and fast_tokenizer_class is None: + raise ValueError("You need to pass either a `slow_tokenizer_class` or a `fast_tokenizer_class") + if slow_tokenizer_class is not None and issubclass(slow_tokenizer_class, PreTrainedTokenizerFast): + raise ValueError("You passed a fast tokenizer in the `slow_tokenizer_class`.") + if fast_tokenizer_class is not None and issubclass(fast_tokenizer_class, PreTrainedTokenizer): + raise ValueError("You passed a slow tokenizer in the `fast_tokenizer_class`.") + + if ( + slow_tokenizer_class is not None + and fast_tokenizer_class is not None + and issubclass(fast_tokenizer_class, PreTrainedTokenizerFast) + and fast_tokenizer_class.slow_tokenizer_class != slow_tokenizer_class + ): + raise ValueError( + "The fast tokenizer class you are passing has a `slow_tokenizer_class` attribute that is not " + "consistent with the slow tokenizer class you passed (fast tokenizer has " + f"{fast_tokenizer_class.slow_tokenizer_class} and you passed {slow_tokenizer_class}. Fix one of those " + "so they match!" + ) + + # Avoid resetting a set slow/fast tokenizer if we are passing just the other ones. + if config_class in TOKENIZER_MAPPING._extra_content: + existing_slow, existing_fast = TOKENIZER_MAPPING[config_class] + if slow_tokenizer_class is None: + slow_tokenizer_class = existing_slow + if fast_tokenizer_class is None: + fast_tokenizer_class = existing_fast + + TOKENIZER_MAPPING.register(config_class, (slow_tokenizer_class, fast_tokenizer_class), exist_ok=exist_ok) + + +__all__ = ["TOKENIZER_MAPPING", "AutoTokenizer"] diff --git a/docs/transformers/build/lib/transformers/models/autoformer/__init__.py b/docs/transformers/build/lib/transformers/models/autoformer/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..48a329608039b1a4a96ac1f99c20ee427f845cd9 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/autoformer/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_autoformer import * + from .modeling_autoformer import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/docs/transformers/build/lib/transformers/models/autoformer/configuration_autoformer.py b/docs/transformers/build/lib/transformers/models/autoformer/configuration_autoformer.py new file mode 100644 index 0000000000000000000000000000000000000000..aba83f19a5d942d528b1e443199e264feb51f83c --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/autoformer/configuration_autoformer.py @@ -0,0 +1,245 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Autoformer model configuration""" + +from typing import List, Optional + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class AutoformerConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of an [`AutoformerModel`]. It is used to instantiate an + Autoformer model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the Autoformer + [huggingface/autoformer-tourism-monthly](https://huggingface.co/huggingface/autoformer-tourism-monthly) + architecture. + + Configuration objects inherit from [`PretrainedConfig`] can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + prediction_length (`int`): + The prediction length for the decoder. In other words, the prediction horizon of the model. + context_length (`int`, *optional*, defaults to `prediction_length`): + The context length for the encoder. If unset, the context length will be the same as the + `prediction_length`. + distribution_output (`string`, *optional*, defaults to `"student_t"`): + The distribution emission head for the model. Could be either "student_t", "normal" or "negative_binomial". + loss (`string`, *optional*, defaults to `"nll"`): + The loss function for the model corresponding to the `distribution_output` head. For parametric + distributions it is the negative log likelihood (nll) - which currently is the only supported one. + input_size (`int`, *optional*, defaults to 1): + The size of the target variable which by default is 1 for univariate targets. Would be > 1 in case of + multivariate targets. + lags_sequence (`list[int]`, *optional*, defaults to `[1, 2, 3, 4, 5, 6, 7]`): + The lags of the input time series as covariates often dictated by the frequency. Default is `[1, 2, 3, 4, + 5, 6, 7]`. + scaling (`bool`, *optional* defaults to `True`): + Whether to scale the input targets. + num_time_features (`int`, *optional*, defaults to 0): + The number of time features in the input time series. + num_dynamic_real_features (`int`, *optional*, defaults to 0): + The number of dynamic real valued features. + num_static_categorical_features (`int`, *optional*, defaults to 0): + The number of static categorical features. + num_static_real_features (`int`, *optional*, defaults to 0): + The number of static real valued features. + cardinality (`list[int]`, *optional*): + The cardinality (number of different values) for each of the static categorical features. Should be a list + of integers, having the same length as `num_static_categorical_features`. Cannot be `None` if + `num_static_categorical_features` is > 0. + embedding_dimension (`list[int]`, *optional*): + The dimension of the embedding for each of the static categorical features. Should be a list of integers, + having the same length as `num_static_categorical_features`. Cannot be `None` if + `num_static_categorical_features` is > 0. + d_model (`int`, *optional*, defaults to 64): + Dimensionality of the transformer layers. + encoder_layers (`int`, *optional*, defaults to 2): + Number of encoder layers. + decoder_layers (`int`, *optional*, defaults to 2): + Number of decoder layers. + encoder_attention_heads (`int`, *optional*, defaults to 2): + Number of attention heads for each attention layer in the Transformer encoder. + decoder_attention_heads (`int`, *optional*, defaults to 2): + Number of attention heads for each attention layer in the Transformer decoder. + encoder_ffn_dim (`int`, *optional*, defaults to 32): + Dimension of the "intermediate" (often named feed-forward) layer in encoder. + decoder_ffn_dim (`int`, *optional*, defaults to 32): + Dimension of the "intermediate" (often named feed-forward) layer in decoder. + activation_function (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and decoder. If string, `"gelu"` and + `"relu"` are supported. + dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the encoder, and decoder. + encoder_layerdrop (`float`, *optional*, defaults to 0.1): + The dropout probability for the attention and fully connected layers for each encoder layer. + decoder_layerdrop (`float`, *optional*, defaults to 0.1): + The dropout probability for the attention and fully connected layers for each decoder layer. + attention_dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for the attention probabilities. + activation_dropout (`float`, *optional*, defaults to 0.1): + The dropout probability used between the two layers of the feed-forward networks. + num_parallel_samples (`int`, *optional*, defaults to 100): + The number of samples to generate in parallel for each time step of inference. + init_std (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated normal weight initialization distribution. + use_cache (`bool`, *optional*, defaults to `True`): + Whether to use the past key/values attentions (if applicable to the model) to speed up decoding. + label_length (`int`, *optional*, defaults to 10): + Start token length of the Autoformer decoder, which is used for direct multi-step prediction (i.e. + non-autoregressive generation). + moving_average (`int`, *optional*, defaults to 25): + The window size of the moving average. In practice, it's the kernel size in AvgPool1d of the Decomposition + Layer. + autocorrelation_factor (`int`, *optional*, defaults to 3): + "Attention" (i.e. AutoCorrelation mechanism) factor which is used to find top k autocorrelations delays. + It's recommended in the paper to set it to a number between 1 and 5. + + + Example: + + ```python + >>> from transformers import AutoformerConfig, AutoformerModel + + >>> # Initializing a default Autoformer configuration + >>> configuration = AutoformerConfig() + + >>> # Randomly initializing a model (with random weights) from the configuration + >>> model = AutoformerModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "autoformer" + attribute_map = { + "hidden_size": "d_model", + "num_attention_heads": "encoder_attention_heads", + "num_hidden_layers": "encoder_layers", + } + + def __init__( + self, + prediction_length: Optional[int] = None, + context_length: Optional[int] = None, + distribution_output: str = "student_t", + loss: str = "nll", + input_size: int = 1, + lags_sequence: List[int] = [1, 2, 3, 4, 5, 6, 7], + scaling: bool = True, + num_time_features: int = 0, + num_dynamic_real_features: int = 0, + num_static_categorical_features: int = 0, + num_static_real_features: int = 0, + cardinality: Optional[List[int]] = None, + embedding_dimension: Optional[List[int]] = None, + d_model: int = 64, + encoder_attention_heads: int = 2, + decoder_attention_heads: int = 2, + encoder_layers: int = 2, + decoder_layers: int = 2, + encoder_ffn_dim: int = 32, + decoder_ffn_dim: int = 32, + activation_function: str = "gelu", + dropout: float = 0.1, + encoder_layerdrop: float = 0.1, + decoder_layerdrop: float = 0.1, + attention_dropout: float = 0.1, + activation_dropout: float = 0.1, + num_parallel_samples: int = 100, + init_std: float = 0.02, + use_cache: bool = True, + is_encoder_decoder=True, + # Autoformer arguments + label_length: int = 10, + moving_average: int = 25, + autocorrelation_factor: int = 3, + **kwargs, + ): + # time series specific configuration + self.prediction_length = prediction_length + self.context_length = context_length if context_length is not None else prediction_length + self.distribution_output = distribution_output + self.loss = loss + self.input_size = input_size + self.num_time_features = num_time_features + self.lags_sequence = lags_sequence + self.scaling = scaling + self.num_dynamic_real_features = num_dynamic_real_features + self.num_static_real_features = num_static_real_features + self.num_static_categorical_features = num_static_categorical_features + if cardinality is not None and num_static_categorical_features > 0: + if len(cardinality) != num_static_categorical_features: + raise ValueError( + "The cardinality should be a list of the same length as `num_static_categorical_features`" + ) + self.cardinality = cardinality + else: + self.cardinality = [0] + if embedding_dimension is not None and num_static_categorical_features > 0: + if len(embedding_dimension) != num_static_categorical_features: + raise ValueError( + "The embedding dimension should be a list of the same length as `num_static_categorical_features`" + ) + self.embedding_dimension = embedding_dimension + else: + self.embedding_dimension = [min(50, (cat + 1) // 2) for cat in self.cardinality] + self.num_parallel_samples = num_parallel_samples + + # Transformer architecture configuration + self.feature_size = input_size * len(self.lags_sequence) + self._number_of_features + self.d_model = d_model + self.encoder_attention_heads = encoder_attention_heads + self.decoder_attention_heads = decoder_attention_heads + self.encoder_ffn_dim = encoder_ffn_dim + self.decoder_ffn_dim = decoder_ffn_dim + self.encoder_layers = encoder_layers + self.decoder_layers = decoder_layers + + self.dropout = dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.encoder_layerdrop = encoder_layerdrop + self.decoder_layerdrop = decoder_layerdrop + + self.activation_function = activation_function + self.init_std = init_std + + self.use_cache = use_cache + + # Autoformer + self.label_length = label_length + self.moving_average = moving_average + self.autocorrelation_factor = autocorrelation_factor + + super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs) + + @property + def _number_of_features(self) -> int: + return ( + sum(self.embedding_dimension) + + self.num_dynamic_real_features + + self.num_time_features + + self.num_static_real_features + + self.input_size * 2 # the log1p(abs(loc)) and log(scale) features + ) + + +__all__ = ["AutoformerConfig"] diff --git a/docs/transformers/build/lib/transformers/models/autoformer/modeling_autoformer.py b/docs/transformers/build/lib/transformers/models/autoformer/modeling_autoformer.py new file mode 100644 index 0000000000000000000000000000000000000000..eb32013d5e7324273b9a978b477b5593982e6fe5 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/autoformer/modeling_autoformer.py @@ -0,0 +1,2152 @@ +# coding=utf-8 +# Copyright (c) 2021 THUML @ Tsinghua University +# Copyright 2023 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch Autoformer model.""" + +import math +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + +import numpy as np +import torch +import torch.utils.checkpoint +from torch import nn + +from ...activations import ACT2FN +from ...modeling_attn_mask_utils import _prepare_4d_attention_mask +from ...modeling_outputs import ( + BaseModelOutput, + ModelOutput, + SampleTSPredictionOutput, + Seq2SeqTSPredictionOutput, +) +from ...modeling_utils import PreTrainedModel +from ...time_series_utils import NegativeBinomialOutput, NormalOutput, StudentTOutput +from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings +from .configuration_autoformer import AutoformerConfig + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "AutoformerConfig" + + +@dataclass +class AutoFormerDecoderOutput(ModelOutput): + """ + Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding). + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + + If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1, + hidden_size)` is output. + trend (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Trend tensor for each time series. + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if + `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads, + encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if + `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values` + input) to speed up sequential decoding. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + """ + + last_hidden_state: Optional[torch.FloatTensor] = None + trend: Optional[torch.FloatTensor] = None + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + cross_attentions: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +class AutoformerModelOutput(ModelOutput): + """ + Autoformer model output that contains the additional trend output. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the decoder of the model. + + If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1, + hidden_size)` is output. + trend (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Trend tensor for each time series. + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the decoder at the output of each layer plus the optional initial embedding outputs. + decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the + weighted average in the cross-attention heads. + encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder of the model. + encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the encoder at the output of each layer plus the optional initial embedding outputs. + encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the + self-attention heads. + loc (`torch.FloatTensor` of shape `(batch_size,)` or `(batch_size, input_size)`, *optional*): + Shift values of each time series' context window which is used to give the model inputs of the same + magnitude and then used to shift back to the original magnitude. + scale (`torch.FloatTensor` of shape `(batch_size,)` or `(batch_size, input_size)`, *optional*): + Scaling values of each time series' context window which is used to give the model inputs of the same + magnitude and then used to rescale back to the original magnitude. + static_features: (`torch.FloatTensor` of shape `(batch_size, feature size)`, *optional*): + Static features of each time series' in a batch which are copied to the covariates at inference time. + """ + + last_hidden_state: Optional[torch.FloatTensor] = None + trend: Optional[torch.FloatTensor] = None + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None + decoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + decoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + cross_attentions: Optional[Tuple[torch.FloatTensor]] = None + encoder_last_hidden_state: Optional[torch.FloatTensor] = None + encoder_hidden_states: Optional[Tuple[torch.FloatTensor]] = None + encoder_attentions: Optional[Tuple[torch.FloatTensor]] = None + loc: Optional[torch.FloatTensor] = None + scale: Optional[torch.FloatTensor] = None + static_features: Optional[torch.FloatTensor] = None + + +# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesFeatureEmbedder with TimeSeries->Autoformer +class AutoformerFeatureEmbedder(nn.Module): + """ + Embed a sequence of categorical features. + + Args: + cardinalities (`list[int]`): + List of cardinalities of the categorical features. + embedding_dims (`list[int]`): + List of embedding dimensions of the categorical features. + """ + + def __init__(self, cardinalities: List[int], embedding_dims: List[int]) -> None: + super().__init__() + + self.num_features = len(cardinalities) + self.embedders = nn.ModuleList([nn.Embedding(c, d) for c, d in zip(cardinalities, embedding_dims)]) + + def forward(self, features: torch.Tensor) -> torch.Tensor: + if self.num_features > 1: + # we slice the last dimension, giving an array of length + # self.num_features with shape (N,T) or (N) + cat_feature_slices = torch.chunk(features, self.num_features, dim=-1) + else: + cat_feature_slices = [features] + + return torch.cat( + [ + embed(cat_feature_slice.squeeze(-1)) + for embed, cat_feature_slice in zip(self.embedders, cat_feature_slices) + ], + dim=-1, + ) + + +# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesStdScaler with TimeSeriesTransformer->Autoformer,TimeSeries->Autoformer +class AutoformerStdScaler(nn.Module): + """ + Standardize features by calculating the mean and scaling along the first dimension, and then normalizes it by + subtracting from the mean and dividing by the standard deviation. + """ + + def __init__(self, config: AutoformerConfig): + super().__init__() + self.dim = config.scaling_dim if hasattr(config, "scaling_dim") else 1 + self.keepdim = config.keepdim if hasattr(config, "keepdim") else True + self.minimum_scale = config.minimum_scale if hasattr(config, "minimum_scale") else 1e-5 + + def forward( + self, data: torch.Tensor, observed_indicator: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Parameters: + data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`): + input for Batch norm calculation + observed_indicator (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`): + Calculating the scale on the observed indicator. + Returns: + tuple of `torch.Tensor` of shapes + (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`, + `(batch_size, 1, num_input_channels)`) + """ + denominator = observed_indicator.sum(self.dim, keepdim=self.keepdim) + denominator = denominator.clamp_min(1.0) + loc = (data * observed_indicator).sum(self.dim, keepdim=self.keepdim) / denominator + + variance = (((data - loc) * observed_indicator) ** 2).sum(self.dim, keepdim=self.keepdim) / denominator + scale = torch.sqrt(variance + self.minimum_scale) + return (data - loc) / scale, loc, scale + + +# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesMeanScaler with TimeSeriesTransformer->Autoformer,TimeSeries->Autoformer +class AutoformerMeanScaler(nn.Module): + """ + Computes a scaling factor as the weighted average absolute value along the first dimension, and scales the data + accordingly. + """ + + def __init__(self, config: AutoformerConfig): + super().__init__() + self.dim = config.scaling_dim if hasattr(config, "scaling_dim") else 1 + self.keepdim = config.keepdim if hasattr(config, "keepdim") else True + self.minimum_scale = config.minimum_scale if hasattr(config, "minimum_scale") else 1e-10 + self.default_scale = config.default_scale if hasattr(config, "default_scale") else None + + def forward( + self, data: torch.Tensor, observed_indicator: torch.Tensor + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Parameters: + data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`): + input for Batch norm calculation + observed_indicator (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`): + Calculating the scale on the observed indicator. + Returns: + tuple of `torch.Tensor` of shapes + (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`, + `(batch_size, 1, num_input_channels)`) + """ + ts_sum = (data * observed_indicator).abs().sum(self.dim, keepdim=True) + num_observed = observed_indicator.sum(self.dim, keepdim=True) + + scale = ts_sum / torch.clamp(num_observed, min=1) + + # If `default_scale` is provided, we use it, otherwise we use the scale + # of the batch. + if self.default_scale is None: + batch_sum = ts_sum.sum(dim=0) + batch_observations = torch.clamp(num_observed.sum(0), min=1) + default_scale = torch.squeeze(batch_sum / batch_observations) + else: + default_scale = self.default_scale * torch.ones_like(scale) + + # apply default scale where there are no observations + scale = torch.where(num_observed > 0, scale, default_scale) + + # ensure the scale is at least `self.minimum_scale` + scale = torch.clamp(scale, min=self.minimum_scale) + scaled_data = data / scale + + if not self.keepdim: + scale = scale.squeeze(dim=self.dim) + + return scaled_data, torch.zeros_like(scale), scale + + +# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesNOPScaler with TimeSeriesTransformer->Autoformer,TimeSeries->Autoformer +class AutoformerNOPScaler(nn.Module): + """ + Assigns a scaling factor equal to 1 along the first dimension, and therefore applies no scaling to the input data. + """ + + def __init__(self, config: AutoformerConfig): + super().__init__() + self.dim = config.scaling_dim if hasattr(config, "scaling_dim") else 1 + self.keepdim = config.keepdim if hasattr(config, "keepdim") else True + + def forward( + self, data: torch.Tensor, observed_indicator: Optional[torch.Tensor] = None + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Parameters: + data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`): + input for Batch norm calculation + Returns: + tuple of `torch.Tensor` of shapes + (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`, + `(batch_size, 1, num_input_channels)`) + """ + scale = torch.ones_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim) + loc = torch.zeros_like(data, requires_grad=False).mean(dim=self.dim, keepdim=self.keepdim) + return data, loc, scale + + +# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.weighted_average +def weighted_average(input_tensor: torch.Tensor, weights: Optional[torch.Tensor] = None, dim=None) -> torch.Tensor: + """ + Computes the weighted average of a given tensor across a given `dim`, masking values associated with weight zero, + meaning instead of `nan * 0 = nan` you will get `0 * 0 = 0`. + + Args: + input_tensor (`torch.FloatTensor`): + Input tensor, of which the average must be computed. + weights (`torch.FloatTensor`, *optional*): + Weights tensor, of the same shape as `input_tensor`. + dim (`int`, *optional*): + The dim along which to average `input_tensor`. + + Returns: + `torch.FloatTensor`: The tensor with values averaged along the specified `dim`. + """ + if weights is not None: + weighted_tensor = torch.where(weights != 0, input_tensor * weights, torch.zeros_like(input_tensor)) + sum_weights = torch.clamp(weights.sum(dim=dim) if dim else weights.sum(), min=1.0) + return (weighted_tensor.sum(dim=dim) if dim else weighted_tensor.sum()) / sum_weights + else: + return input_tensor.mean(dim=dim) + + +# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.nll +def nll(input: torch.distributions.Distribution, target: torch.Tensor) -> torch.Tensor: + """ + Computes the negative log likelihood loss from input distribution with respect to target. + """ + return -input.log_prob(target) + + +# Copied from transformers.models.marian.modeling_marian.MarianSinusoidalPositionalEmbedding with Marian->Autoformer +class AutoformerSinusoidalPositionalEmbedding(nn.Embedding): + """This module produces sinusoidal positional embeddings of any length.""" + + def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None) -> None: + super().__init__(num_positions, embedding_dim) + + def _init_weight(self): + """ + Identical to the XLM create_sinusoidal_embeddings except features are not interleaved. The cos features are in + the 2nd half of the vector. [dim // 2:] + """ + n_pos, dim = self.weight.shape + position_enc = np.array( + [[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)] + ) + out = torch.empty(n_pos, dim, dtype=self.weight.dtype, requires_grad=False) + sentinel = dim // 2 if dim % 2 == 0 else (dim // 2) + 1 + out[:, 0:sentinel] = torch.FloatTensor(np.sin(position_enc[:, 0::2])) + out[:, sentinel:] = torch.FloatTensor(np.cos(position_enc[:, 1::2])) + self.weight = nn.Parameter(out, requires_grad=False) + + @torch.no_grad() + def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0) -> torch.Tensor: + """`input_ids_shape` is expected to be [bsz x seqlen].""" + bsz, seq_len = input_ids_shape[:2] + positions = torch.arange( + past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device + ) + return super().forward(positions) + + +# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesValueEmbedding with TimeSeries->Autoformer +class AutoformerValueEmbedding(nn.Module): + def __init__(self, feature_size, d_model): + super().__init__() + self.value_projection = nn.Linear(in_features=feature_size, out_features=d_model, bias=False) + + def forward(self, x): + return self.value_projection(x) + + +# Class based on +# https://github.com/thuml/Autoformer/blob/c6a0694ff484753f2d986cc0bb1f99ee850fc1a8/layers/Autoformer_EncDec.py#L39 +# where AutoformerSeriesDecompositionLayer is series_decomp + moving_average +class AutoformerSeriesDecompositionLayer(nn.Module): + """ + Returns the trend and the seasonal parts of the time series. Calculated as: + + x_trend = AvgPool(Padding(X)) and x_seasonal = X - x_trend + """ + + def __init__(self, config: AutoformerConfig): + super().__init__() + self.kernel_size = config.moving_average + self.avg = nn.AvgPool1d(kernel_size=self.kernel_size, stride=1, padding=0) + + def forward(self, x): + """Input shape: Batch x Time x EMBED_DIM""" + # padding on the both ends of time series + num_of_pads = (self.kernel_size - 1) // 2 + front = x[:, 0:1, :].repeat(1, num_of_pads, 1) + end = x[:, -1:, :].repeat(1, num_of_pads, 1) + x_padded = torch.cat([front, x, end], dim=1) + + # calculate the trend and seasonal part of the series + x_trend = self.avg(x_padded.permute(0, 2, 1)).permute(0, 2, 1) + x_seasonal = x - x_trend + return x_seasonal, x_trend + + +# Class based on +# https://github.com/thuml/Autoformer/blob/c6a0694ff484753f2d986cc0bb1f99ee850fc1a8/layers/Autoformer_EncDec.py#L6 +# where AutoformerLayernorm is my_Layernorm +class AutoformerLayernorm(nn.Module): + """ + Special designed layer normalization for the seasonal part, calculated as: AutoformerLayernorm(x) = nn.LayerNorm(x) + - torch.mean(nn.LayerNorm(x)) + """ + + def __init__(self, config: AutoformerConfig): + super().__init__() + self.layernorm = nn.LayerNorm(config.d_model) + + def forward(self, x): + x_hat = self.layernorm(x) + bias = torch.mean(x_hat, dim=1).unsqueeze(1).repeat(1, x.shape[1], 1) + return x_hat - bias + + +class AutoformerAttention(nn.Module): + """ + AutoCorrelation Mechanism with the following two phases: + (1) period-based dependencies discovery (2) time delay aggregation + This block replace the canonical self-attention mechanism. + """ + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + is_decoder: bool = False, + bias: bool = True, + autocorrelation_factor: int = 3, + ): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + + if (self.head_dim * num_heads) != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" + f" and `num_heads`: {num_heads})." + ) + self.scaling = self.head_dim**-0.5 + self.is_decoder = is_decoder + + self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + + self.autocorrelation_factor = autocorrelation_factor + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + key_value_states: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + + bsz, tgt_len, _ = hidden_states.size() + + # get query proj + query_states = self.q_proj(hidden_states) + # get key, value proj + # `past_key_value[0].shape[2] == key_value_states.shape[1]` + # is checking that the `sequence_length` of the `past_key_value` is the same as + # the provided `key_value_states` to support prefix tuning + if ( + is_cross_attention + and past_key_value is not None + and past_key_value[0].shape[2] == key_value_states.shape[1] + ): + # reuse k,v, cross_attentions + key_states = past_key_value[0] + value_states = past_key_value[1] + elif is_cross_attention: + # cross_attentions + key_states = self._shape(self.k_proj(key_value_states), -1, bsz) + value_states = self._shape(self.v_proj(key_value_states), -1, bsz) + elif past_key_value is not None: + # reuse k, v, self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + else: + # self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_states, value_states) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.view(*proj_shape) + value_states = value_states.view(*proj_shape) + + # (1) period-based dependencies discovery + # Resize (truncation or zero filling) + queries_time_length = query_states.size(1) + values_time_length = value_states.size(1) + if queries_time_length > values_time_length: + query_states = query_states[:, : (queries_time_length - values_time_length), :] + zeros = torch.zeros_like(query_states).float() + value_states = torch.cat([value_states, zeros], dim=1) + key_states = torch.cat([key_states, zeros], dim=1) + else: + value_states = value_states[:, :queries_time_length, :] + key_states = key_states[:, :queries_time_length, :] + + query_states_fft = torch.fft.rfft(query_states, n=tgt_len, dim=1) + key_states_fft = torch.fft.rfft(key_states, n=tgt_len, dim=1) + attn_weights = query_states_fft * torch.conj(key_states_fft) + attn_weights = torch.fft.irfft(attn_weights, n=tgt_len, dim=1) # Autocorrelation(Q,K) + + src_len = key_states.size(1) + channel = key_states.size(2) + + if attn_weights.size() != (bsz * self.num_heads, tgt_len, channel): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, channel)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + if layer_head_mask is not None: + if layer_head_mask.size() != (self.num_heads,): + raise ValueError( + f"Head mask for a single layer should be of size {(self.num_heads,)}, but is" + f" {layer_head_mask.size()}" + ) + attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, channel) + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, channel) + + if output_attentions: + # this operation is a bit awkward, but it's required to + # make sure that attn_weights keeps its gradient. + # In order to do so, attn_weights have to be reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, channel) + attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, channel) + else: + attn_weights_reshaped = None + + # time delay aggregation + time_length = value_states.size(1) + autocorrelations = attn_weights.view(bsz, self.num_heads, tgt_len, channel) + + # find top k autocorrelations delays + top_k = int(self.autocorrelation_factor * math.log(time_length)) + autocorrelations_mean_on_head_channel = torch.mean(autocorrelations, dim=(1, -1)) # bsz x tgt_len + if self.training: + autocorrelations_mean_on_bsz = torch.mean(autocorrelations_mean_on_head_channel, dim=0) + _, top_k_delays_index = torch.topk(autocorrelations_mean_on_bsz, top_k) + top_k_autocorrelations = torch.stack( + [autocorrelations_mean_on_head_channel[:, top_k_delays_index[i]] for i in range(top_k)], dim=-1 + ) + else: + top_k_autocorrelations, top_k_delays_index = torch.topk( + autocorrelations_mean_on_head_channel, top_k, dim=1 + ) + + top_k_autocorrelations = torch.softmax(top_k_autocorrelations, dim=-1) # bsz x top_k + + # compute aggregation: value_states.roll(delay) * top_k_autocorrelations(delay) + if not self.training: + # used for compute values_states.roll(delay) in inference + tmp_values = value_states.repeat(1, 2, 1) + init_index = ( + torch.arange(time_length) + .view(1, -1, 1) + .repeat(bsz * self.num_heads, 1, channel) + .to(value_states.device) + ) + + delays_agg = torch.zeros_like(value_states).float() # bsz x time_length x channel + for i in range(top_k): + # compute value_states roll delay + if not self.training: + tmp_delay = init_index + top_k_delays_index[:, i].view(-1, 1, 1).repeat( + self.num_heads, tgt_len, channel + ) + value_states_roll_delay = torch.gather(tmp_values, dim=1, index=tmp_delay) + else: + value_states_roll_delay = value_states.roll(shifts=-int(top_k_delays_index[i]), dims=1) + + # aggregation + top_k_autocorrelations_at_delay = ( + top_k_autocorrelations[:, i].view(-1, 1, 1).repeat(self.num_heads, tgt_len, channel) + ) + delays_agg += value_states_roll_delay * top_k_autocorrelations_at_delay + + attn_output = delays_agg.contiguous() + + if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) + attn_output = attn_output.transpose(1, 2) + + # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be + # partitioned across GPUs when using tensor-parallelism. + attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped, past_key_value + + +class AutoformerEncoderLayer(nn.Module): + def __init__(self, config: AutoformerConfig): + super().__init__() + self.embed_dim = config.d_model + self.self_attn = AutoformerAttention( + embed_dim=self.embed_dim, + num_heads=config.encoder_attention_heads, + dropout=config.attention_dropout, + autocorrelation_factor=config.autocorrelation_factor, + ) + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim) + self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim) + self.final_layer_norm = AutoformerLayernorm(config) + self.decomp1 = AutoformerSeriesDecompositionLayer(config) + self.decomp2 = AutoformerSeriesDecompositionLayer(config) + + def forward( + self, + hidden_states: torch.FloatTensor, + attention_mask: torch.FloatTensor, + layer_head_mask: torch.FloatTensor, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size + `(encoder_attention_heads,)`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + hidden_states, attn_weights, _ = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + layer_head_mask=layer_head_mask, + output_attentions=output_attentions, + ) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + # added layer norm here as an improvement + hidden_states = self.self_attn_layer_norm(hidden_states) + hidden_states, _ = self.decomp1(hidden_states) + + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states, _ = self.decomp2(hidden_states) + hidden_states = self.final_layer_norm(hidden_states) + + if hidden_states.dtype == torch.float16 and ( + torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any() + ): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class AutoformerDecoderLayer(nn.Module): + def __init__(self, config: AutoformerConfig): + super().__init__() + self.embed_dim = config.d_model + + self.self_attn = AutoformerAttention( + embed_dim=self.embed_dim, + num_heads=config.decoder_attention_heads, + dropout=config.attention_dropout, + is_decoder=True, + autocorrelation_factor=config.autocorrelation_factor, + ) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.encoder_attn = AutoformerAttention( + self.embed_dim, + config.decoder_attention_heads, + dropout=config.attention_dropout, + is_decoder=True, + autocorrelation_factor=config.autocorrelation_factor, + ) + self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) + self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) + self.final_layer_norm = AutoformerLayernorm(config) + + self.decomp1 = AutoformerSeriesDecompositionLayer(config) + self.decomp2 = AutoformerSeriesDecompositionLayer(config) + self.decomp3 = AutoformerSeriesDecompositionLayer(config) + + # source: https://github.com/thuml/Autoformer/blob/e6371e24f2ae2dd53e472edefdd5814c5176f864/layers/Autoformer_EncDec.py#L128 + self.trend_projection = nn.Conv1d( + in_channels=self.embed_dim, + out_channels=config.feature_size, + kernel_size=3, + stride=1, + padding=1, + padding_mode="circular", + bias=False, + ) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + cross_attn_layer_head_mask: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = True, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + encoder_hidden_states (`torch.FloatTensor`): + cross attention input to the layer of shape `(batch, seq_len, embed_dim)` + encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size + `(encoder_attention_heads,)`. + cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of + size `(decoder_attention_heads,)`. + past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache: (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the `present_key_value` state to be used for subsequent + decoding. + """ + residual = hidden_states + + # Self Attention + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + # add present self-attn cache to positions 1,2 of present_key_value tuple + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + past_key_value=self_attn_past_key_value, + attention_mask=attention_mask, + layer_head_mask=layer_head_mask, + output_attentions=output_attentions, + ) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states, trend1 = self.decomp1(hidden_states) + # added layer norm here as an improvement + hidden_states = self.self_attn_layer_norm(hidden_states) + + # Cross-Attention Block + cross_attn_present_key_value = None + cross_attn_weights = None + if encoder_hidden_states is not None: + residual = hidden_states + + # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn( + hidden_states=hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + layer_head_mask=cross_attn_layer_head_mask, + past_key_value=cross_attn_past_key_value, + output_attentions=output_attentions, + ) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states, trend2 = self.decomp2(hidden_states) + # added layer norm here as an improvement + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + # add cross-attn to positions 3,4 of present_key_value tuple + present_key_value = present_key_value + cross_attn_present_key_value + + # Fully Connected + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states, trend3 = self.decomp3(hidden_states) + hidden_states = self.final_layer_norm(hidden_states) + + if encoder_hidden_states is not None: + residual_trend = trend1 + trend2 + trend3 + else: + residual_trend = trend1 + trend3 + residual_trend = self.trend_projection(residual_trend.permute(0, 2, 1)).transpose(1, 2) + outputs = ((hidden_states, residual_trend),) + + if output_attentions: + outputs += (self_attn_weights, cross_attn_weights) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +class AutoformerPreTrainedModel(PreTrainedModel): + config_class = AutoformerConfig + base_model_prefix = "model" + main_input_name = "past_values" + supports_gradient_checkpointing = True + + def _init_weights(self, module): + std = self.config.init_std + if isinstance(module, (nn.Linear, nn.Conv1d)): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, AutoformerSinusoidalPositionalEmbedding): + module._init_weight() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +AUTOFORMER_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`AutoformerConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +AUTOFORMER_INPUTS_DOCSTRING = r""" + Args: + past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): + Past values of the time series, that serve as context in order to predict the future. These values may + contain lags, i.e. additional values from the past which are added in order to serve as "extra context". + The `past_values` is what the Transformer encoder gets as input (with optional additional features, such as + `static_categorical_features`, `static_real_features`, `past_time_features`). + + The sequence length here is equal to `context_length` + `max(config.lags_sequence)`. + + Missing values need to be replaced with zeros. + + past_time_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_features)`, *optional*): + Optional time features, which the model internally will add to `past_values`. These could be things like + "month of year", "day of the month", etc. encoded as vectors (for instance as Fourier features). These + could also be so-called "age" features, which basically help the model know "at which point in life" a + time-series is. Age features have small values for distant past time steps and increase monotonically the + more we approach the current time step. + + These features serve as the "positional encodings" of the inputs. So contrary to a model like BERT, where + the position encodings are learned from scratch internally as parameters of the model, the Time Series + Transformer requires to provide additional time features. + + The Autoformer only learns additional embeddings for `static_categorical_features`. + + past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*): + Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected in + `[0, 1]`: + + - 1 for values that are **observed**, + - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). + + static_categorical_features (`torch.LongTensor` of shape `(batch_size, number of static categorical features)`, *optional*): + Optional static categorical features for which the model will learn an embedding, which it will add to the + values of the time series. + + Static categorical features are features which have the same value for all time steps (static over time). + + A typical example of a static categorical feature is a time series ID. + + static_real_features (`torch.FloatTensor` of shape `(batch_size, number of static real features)`, *optional*): + Optional static real features which the model will add to the values of the time series. + + Static real features are features which have the same value for all time steps (static over time). + + A typical example of a static real feature is promotion information. + + future_values (`torch.FloatTensor` of shape `(batch_size, prediction_length)`): + Future values of the time series, that serve as labels for the model. The `future_values` is what the + Transformer needs to learn to output, given the `past_values`. + + See the demo notebook and code snippets for details. + + Missing values need to be replaced with zeros. + + future_time_features (`torch.FloatTensor` of shape `(batch_size, prediction_length, num_features)`, *optional*): + Optional time features, which the model internally will add to `future_values`. These could be things like + "month of year", "day of the month", etc. encoded as vectors (for instance as Fourier features). These + could also be so-called "age" features, which basically help the model know "at which point in life" a + time-series is. Age features have small values for distant past time steps and increase monotonically the + more we approach the current time step. + + These features serve as the "positional encodings" of the inputs. So contrary to a model like BERT, where + the position encodings are learned from scratch internally as parameters of the model, the Time Series + Transformer requires to provide additional features. + + The Autoformer only learns additional embeddings for `static_categorical_features`. + + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on certain token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Mask to avoid performing attention on certain token indices. By default, a causal mask will be used, to + make sure the model can only look at previous inputs in order to predict the future. + + head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + decoder_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*): + Tuple consists of `last_hidden_state`, `hidden_states` (*optional*) and `attentions` (*optional*) + `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)` (*optional*) is a sequence of + hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +# Copied from transformers.models.time_series_transformer.modeling_time_series_transformer.TimeSeriesTransformerEncoder with TimeSeriesTransformer->Autoformer,TimeSeries->Autoformer +class AutoformerEncoder(AutoformerPreTrainedModel): + """ + Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a + [`AutoformerEncoderLayer`]. + + Args: + config: AutoformerConfig + """ + + def __init__(self, config: AutoformerConfig): + super().__init__(config) + + self.dropout = config.dropout + self.layerdrop = config.encoder_layerdrop + if config.prediction_length is None: + raise ValueError("The `prediction_length` config needs to be specified.") + + self.value_embedding = AutoformerValueEmbedding(feature_size=config.feature_size, d_model=config.d_model) + self.embed_positions = AutoformerSinusoidalPositionalEmbedding( + config.context_length + config.prediction_length, config.d_model + ) + self.layers = nn.ModuleList([AutoformerEncoderLayer(config) for _ in range(config.encoder_layers)]) + self.layernorm_embedding = nn.LayerNorm(config.d_model) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: + r""" + Args: + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + hidden_states = self.value_embedding(inputs_embeds) + embed_pos = self.embed_positions(inputs_embeds.size()) + + hidden_states = self.layernorm_embedding(hidden_states + embed_pos) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + # expand attention_mask + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype) + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + # check if head_mask has a correct number of layers specified if desired + if head_mask is not None: + if head_mask.size()[0] != (len(self.layers)): + raise ValueError( + f"The head_mask should be specified for {len(self.layers)} layers, but it is for" + f" {head_mask.size()[0]}." + ) + + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + to_drop = False + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: # skip the layer + to_drop = True + + if to_drop: + layer_outputs = (None, None) + else: + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + encoder_layer.__call__, + hidden_states, + attention_mask, + (head_mask[idx] if head_mask is not None else None), + output_attentions, + ) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + layer_head_mask=(head_mask[idx] if head_mask is not None else None), + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +class AutoformerDecoder(AutoformerPreTrainedModel): + """ + Transformer decoder consisting of `config.decoder_layers` layers. Each layer is a [`AutoformerDecoderLayer`] + + Args: + config: AutoformerConfig + """ + + def __init__(self, config: AutoformerConfig): + super().__init__(config) + self.dropout = config.dropout + self.layerdrop = config.decoder_layerdrop + if config.prediction_length is None: + raise ValueError("The `prediction_length` config needs to be specified.") + + self.value_embedding = AutoformerValueEmbedding(feature_size=config.feature_size, d_model=config.d_model) + self.embed_positions = AutoformerSinusoidalPositionalEmbedding( + config.context_length + config.prediction_length, config.d_model + ) + self.layers = nn.ModuleList([AutoformerDecoderLayer(config) for _ in range(config.decoder_layers)]) + self.layernorm_embedding = nn.LayerNorm(config.d_model) + + # https://github.com/thuml/Autoformer/blob/e6371e24f2ae2dd53e472edefdd5814c5176f864/models/Autoformer.py#L74 + self.seasonality_projection = nn.Linear(config.d_model, config.feature_size) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + trend: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, AutoFormerDecoderOutput]: + r""" + Args: + trend (`torch.FloatTensor` of shape `(batch_size, prediction_length, feature_size)`, *optional*): + The trend sequence to be fed to the decoder. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + of the decoder. + encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*): + Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values + selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules in the decoder to avoid performing + cross-attention on hidden heads. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of + shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the + cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those + that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of + all `decoder_input_ids` of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If `use_cache` is True, `past_key_values` key value states are returned and can be used to speed up + decoding (see `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + input_shape = inputs_embeds.size()[:-1] + + # expand encoder attention mask + if encoder_hidden_states is not None and encoder_attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + encoder_attention_mask = _prepare_4d_attention_mask( + encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1] + ) + + hidden_states = self.value_embedding(inputs_embeds) + embed_pos = self.embed_positions( + inputs_embeds.size(), past_key_values_length=self.config.context_length - self.config.label_length + ) + hidden_states = self.layernorm_embedding(hidden_states + embed_pos) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None + next_decoder_cache = () if use_cache else None + + # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired + for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]): + if attn_mask is not None: + if attn_mask.size()[0] != (len(self.layers)): + raise ValueError( + f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for" + f" {head_mask.size()[0]}." + ) + + for idx, decoder_layer in enumerate(self.layers): + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + if output_hidden_states: + all_hidden_states += (hidden_states,) + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: + continue + + past_key_value = past_key_values[idx] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + attention_mask, + encoder_hidden_states, + encoder_attention_mask, + head_mask[idx] if head_mask is not None else None, + cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None, + None, + output_attentions, + use_cache, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + layer_head_mask=(head_mask[idx] if head_mask is not None else None), + cross_attn_layer_head_mask=( + cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None + ), + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + (hidden_states, residual_trend) = layer_outputs[0] + trend = trend + residual_trend + + if use_cache: + next_decoder_cache += (layer_outputs[3 if output_attentions else 1],) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + if encoder_hidden_states is not None: + all_cross_attentions += (layer_outputs[2],) + + # project seasonality representation + hidden_states = self.seasonality_projection(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if not return_dict: + return tuple( + v + for v in [hidden_states, trend, next_cache, all_hidden_states, all_self_attns, all_cross_attentions] + if v is not None + ) + return AutoFormerDecoderOutput( + last_hidden_state=hidden_states, + trend=trend, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attentions, + ) + + +@add_start_docstrings( + "The bare Autoformer Model outputting raw hidden-states without any specific head on top.", + AUTOFORMER_START_DOCSTRING, +) +class AutoformerModel(AutoformerPreTrainedModel): + def __init__(self, config: AutoformerConfig): + super().__init__(config) + + if config.scaling == "mean" or config.scaling is True: + self.scaler = AutoformerMeanScaler(config) + elif config.scaling == "std": + self.scaler = AutoformerStdScaler(config) + else: + self.scaler = AutoformerNOPScaler(config) + + if config.num_static_categorical_features > 0: + self.embedder = AutoformerFeatureEmbedder( + cardinalities=config.cardinality, embedding_dims=config.embedding_dimension + ) + + # transformer encoder-decoder and mask initializer + self.encoder = AutoformerEncoder(config) + self.decoder = AutoformerDecoder(config) + + # used for decoder seasonal and trend initialization + self.decomposition_layer = AutoformerSeriesDecompositionLayer(config) + + # Initialize weights and apply final processing + self.post_init() + + @property + def _past_length(self) -> int: + return self.config.context_length + max(self.config.lags_sequence) + + def get_lagged_subsequences( + self, sequence: torch.Tensor, subsequences_length: int, shift: int = 0 + ) -> torch.Tensor: + """ + Returns lagged subsequences of a given sequence. Returns a tensor of shape (batch_size, subsequences_length, + feature_size, indices_length), containing lagged subsequences. Specifically, lagged[i, j, :, k] = sequence[i, + -indices[k]-subsequences_length+j, :]. + + Args: + sequence (`torch.Tensor` or shape `(batch_size, context_length, + feature_size)`): The sequence from which lagged subsequences should be extracted. + subsequences_length (`int`): + Length of the subsequences to be extracted. + shift (`int`, *optional* defaults to 0): + Shift the lags by this amount back in the time index. + """ + + # calculates the indices of the lags by subtracting the shift value from the given lags_sequence + indices = [lag - shift for lag in self.config.lags_sequence] + + # checks if the maximum lag plus the length of the subsequences exceeds the length of the input sequence + sequence_length = sequence.shape[1] + if max(indices) + subsequences_length > sequence_length: + raise ValueError( + f"lags cannot go further than history length, found lag {max(indices)} " + f"while history length is only {sequence_length}" + ) + + # extracts the lagged subsequences from the input sequence using the calculated indices + lagged_values = [] + for lag_index in indices: + begin_index = -lag_index - subsequences_length + end_index = -lag_index if lag_index > 0 else None + lagged_values.append(sequence[:, begin_index:end_index, ...]) + + # return as stacked tensor in the feature dimension + return torch.stack(lagged_values, dim=-1) + + def create_network_inputs( + self, + past_values: torch.Tensor, + past_time_features: torch.Tensor, + static_categorical_features: Optional[torch.Tensor] = None, + static_real_features: Optional[torch.Tensor] = None, + past_observed_mask: Optional[torch.Tensor] = None, + future_values: Optional[torch.Tensor] = None, + future_time_features: Optional[torch.Tensor] = None, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Creates the inputs for the network given the past and future values, time features, and static features. + + Args: + past_values (`torch.Tensor`): + A tensor of shape `(batch_size, past_length, input_size)` containing the past values. + past_time_features (`torch.Tensor`): + A tensor of shape `(batch_size, past_length, num_features)` containing the past time features. + static_categorical_features (`Optional[torch.Tensor]`): + An optional tensor of shape `(batch_size, num_categorical_features)` containing the static categorical + features. + static_real_features (`Optional[torch.Tensor]`): + An optional tensor of shape `(batch_size, num_real_features)` containing the static real features. + past_observed_mask (`Optional[torch.Tensor]`): + An optional tensor of shape `(batch_size, past_length, input_size)` containing the mask of observed + values in the past. + future_values (`Optional[torch.Tensor]`): + An optional tensor of shape `(batch_size, future_length, input_size)` containing the future values. + + Returns: + A tuple containing the following tensors: + - reshaped_lagged_sequence (`torch.Tensor`): A tensor of shape `(batch_size, sequence_length, num_lags * + input_size)` containing the lagged subsequences of the inputs. + - features (`torch.Tensor`): A tensor of shape `(batch_size, sequence_length, num_features)` containing the + concatenated static and time features. + - loc (`torch.Tensor`): A tensor of shape `(batch_size, input_size)` containing the mean of the input + values. + - scale (`torch.Tensor`): A tensor of shape `(batch_size, input_size)` containing the std of the input + values. + - static_feat (`torch.Tensor`): A tensor of shape `(batch_size, num_static_features)` containing the + concatenated static features. + """ + # time feature + time_feat = ( + torch.cat( + ( + past_time_features[:, self._past_length - self.config.context_length :, ...], + future_time_features, + ), + dim=1, + ) + if future_values is not None + else past_time_features[:, self._past_length - self.config.context_length :, ...] + ) + + # target + if past_observed_mask is None: + past_observed_mask = torch.ones_like(past_values) + + context = past_values[:, -self.config.context_length :] + observed_context = past_observed_mask[:, -self.config.context_length :] + _, loc, scale = self.scaler(context, observed_context) + + inputs = ( + (torch.cat((past_values, future_values), dim=1) - loc) / scale + if future_values is not None + else (past_values - loc) / scale + ) + + # static features + log_abs_loc = loc.abs().log1p() if self.config.input_size == 1 else loc.squeeze(1).abs().log1p() + log_scale = scale.log() if self.config.input_size == 1 else scale.squeeze(1).log() + static_feat = torch.cat((log_abs_loc, log_scale), dim=1) + + if static_real_features is not None: + static_feat = torch.cat((static_real_features, static_feat), dim=1) + if static_categorical_features is not None: + embedded_cat = self.embedder(static_categorical_features) + static_feat = torch.cat((embedded_cat, static_feat), dim=1) + expanded_static_feat = static_feat.unsqueeze(1).expand(-1, time_feat.shape[1], -1) + + # all features + features = torch.cat((expanded_static_feat, time_feat), dim=-1) + + # lagged features + subsequences_length = ( + self.config.context_length + self.config.prediction_length + if future_values is not None + else self.config.context_length + ) + lagged_sequence = self.get_lagged_subsequences(sequence=inputs, subsequences_length=subsequences_length) + lags_shape = lagged_sequence.shape + reshaped_lagged_sequence = lagged_sequence.reshape(lags_shape[0], lags_shape[1], -1) + + if reshaped_lagged_sequence.shape[1] != time_feat.shape[1]: + raise ValueError( + f"input length {reshaped_lagged_sequence.shape[1]} and time feature lengths {time_feat.shape[1]} does not match" + ) + return reshaped_lagged_sequence, features, loc, scale, static_feat + + def get_encoder(self): + return self.encoder + + def get_decoder(self): + return self.decoder + + @add_start_docstrings_to_model_forward(AUTOFORMER_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=AutoformerModelOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + past_values: torch.Tensor, + past_time_features: torch.Tensor, + past_observed_mask: torch.Tensor, + static_categorical_features: Optional[torch.Tensor] = None, + static_real_features: Optional[torch.Tensor] = None, + future_values: Optional[torch.Tensor] = None, + future_time_features: Optional[torch.Tensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.Tensor] = None, + decoder_head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[List[torch.FloatTensor]] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + use_cache: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[AutoformerModelOutput, Tuple]: + r""" + Returns: + + Examples: + + ```python + >>> from huggingface_hub import hf_hub_download + >>> import torch + >>> from transformers import AutoformerModel + + >>> file = hf_hub_download( + ... repo_id="hf-internal-testing/tourism-monthly-batch", filename="train-batch.pt", repo_type="dataset" + ... ) + >>> batch = torch.load(file) + + >>> model = AutoformerModel.from_pretrained("huggingface/autoformer-tourism-monthly") + + >>> # during training, one provides both past and future values + >>> # as well as possible additional features + >>> outputs = model( + ... past_values=batch["past_values"], + ... past_time_features=batch["past_time_features"], + ... past_observed_mask=batch["past_observed_mask"], + ... static_categorical_features=batch["static_categorical_features"], + ... future_values=batch["future_values"], + ... future_time_features=batch["future_time_features"], + ... ) + + >>> last_hidden_state = outputs.last_hidden_state + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_inputs, temporal_features, loc, scale, static_feat = self.create_network_inputs( + past_values=past_values, + past_time_features=past_time_features, + past_observed_mask=past_observed_mask, + static_categorical_features=static_categorical_features, + static_real_features=static_real_features, + future_values=future_values, + future_time_features=future_time_features, + ) + + if encoder_outputs is None: + enc_input = torch.cat( + ( + transformer_inputs[:, : self.config.context_length, ...], + temporal_features[:, : self.config.context_length, ...], + ), + dim=-1, + ) + encoder_outputs = self.encoder( + inputs_embeds=enc_input, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + if future_values is not None: + # Decoder inputs + # seasonality and trend from context length + seasonal_input, trend_input = self.decomposition_layer( + transformer_inputs[:, : self.config.context_length, ...] + ) + mean = ( + torch.mean(transformer_inputs[:, : self.config.context_length, ...], dim=1) + .unsqueeze(1) + .repeat(1, self.config.prediction_length, 1) + ) + zeros = torch.zeros( + [transformer_inputs.shape[0], self.config.prediction_length, transformer_inputs.shape[2]], + device=enc_input.device, + ) + + decoder_input = torch.cat( + ( + torch.cat((seasonal_input[:, -self.config.label_length :, ...], zeros), dim=1), + temporal_features[:, self.config.context_length - self.config.label_length :, ...], + ), + dim=-1, + ) + trend_init = torch.cat( + ( + torch.cat((trend_input[:, -self.config.label_length :, ...], mean), dim=1), + temporal_features[:, self.config.context_length - self.config.label_length :, ...], + ), + dim=-1, + ) + + decoder_outputs = self.decoder( + trend=trend_init, + inputs_embeds=decoder_input, + attention_mask=decoder_attention_mask, + encoder_hidden_states=encoder_outputs[0], + head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + else: + decoder_outputs = AutoFormerDecoderOutput() + + if not return_dict: + return decoder_outputs + encoder_outputs + (loc, scale, static_feat) + + return AutoformerModelOutput( + last_hidden_state=decoder_outputs.last_hidden_state, + trend=decoder_outputs.trend, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + loc=loc, + scale=scale, + static_features=static_feat, + ) + + +@add_start_docstrings( + "The Autoformer Model with a distribution head on top for time-series forecasting.", + AUTOFORMER_START_DOCSTRING, +) +class AutoformerForPrediction(AutoformerPreTrainedModel): + def __init__(self, config: AutoformerConfig): + super().__init__(config) + self.model = AutoformerModel(config) + if config.distribution_output == "student_t": + self.distribution_output = StudentTOutput(dim=config.input_size) + elif config.distribution_output == "normal": + self.distribution_output = NormalOutput(dim=config.input_size) + elif config.distribution_output == "negative_binomial": + self.distribution_output = NegativeBinomialOutput(dim=config.input_size) + else: + raise ValueError(f"Unknown distribution output {config.distribution_output}") + + self.parameter_projection = self.distribution_output.get_parameter_projection(self.model.config.feature_size) + self.target_shape = self.distribution_output.event_shape + + if config.loss == "nll": + self.loss = nll + else: + raise ValueError(f"Unknown loss function {config.loss}") + + # Initialize weights of distribution_output and apply final processing + self.post_init() + + def output_params(self, decoder_output): + return self.parameter_projection(decoder_output[:, -self.config.prediction_length :, :]) + + def get_encoder(self): + return self.model.get_encoder() + + def get_decoder(self): + return self.model.get_decoder() + + @torch.jit.ignore + def output_distribution(self, params, loc=None, scale=None, trailing_n=None) -> torch.distributions.Distribution: + sliced_params = params + if trailing_n is not None: + sliced_params = [p[:, -trailing_n:] for p in params] + return self.distribution_output.distribution(sliced_params, loc=loc, scale=scale) + + @add_start_docstrings_to_model_forward(AUTOFORMER_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=Seq2SeqTSPredictionOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + past_values: torch.Tensor, + past_time_features: torch.Tensor, + past_observed_mask: torch.Tensor, + static_categorical_features: Optional[torch.Tensor] = None, + static_real_features: Optional[torch.Tensor] = None, + future_values: Optional[torch.Tensor] = None, + future_time_features: Optional[torch.Tensor] = None, + future_observed_mask: Optional[torch.Tensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.Tensor] = None, + decoder_head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[List[torch.FloatTensor]] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + use_cache: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Seq2SeqTSPredictionOutput, Tuple]: + r""" + Returns: + + Examples: + + ```python + >>> from huggingface_hub import hf_hub_download + >>> import torch + >>> from transformers import AutoformerForPrediction + + >>> file = hf_hub_download( + ... repo_id="hf-internal-testing/tourism-monthly-batch", filename="train-batch.pt", repo_type="dataset" + ... ) + >>> batch = torch.load(file) + + >>> model = AutoformerForPrediction.from_pretrained("huggingface/autoformer-tourism-monthly") + + >>> # during training, one provides both past and future values + >>> # as well as possible additional features + >>> outputs = model( + ... past_values=batch["past_values"], + ... past_time_features=batch["past_time_features"], + ... past_observed_mask=batch["past_observed_mask"], + ... static_categorical_features=batch["static_categorical_features"], + ... future_values=batch["future_values"], + ... future_time_features=batch["future_time_features"], + ... ) + + >>> loss = outputs.loss + >>> loss.backward() + + >>> # during inference, one only provides past values + >>> # as well as possible additional features + >>> # the model autoregressively generates future values + >>> outputs = model.generate( + ... past_values=batch["past_values"], + ... past_time_features=batch["past_time_features"], + ... past_observed_mask=batch["past_observed_mask"], + ... static_categorical_features=batch["static_categorical_features"], + ... future_time_features=batch["future_time_features"], + ... ) + + >>> mean_prediction = outputs.sequences.mean(dim=1) + ``` + + + + The AutoformerForPrediction can also use static_real_features. To do so, set num_static_real_features in + AutoformerConfig based on number of such features in the dataset (in case of tourism_monthly dataset it + is equal to 1), initialize the model and call as shown below: + + ``` + >>> from huggingface_hub import hf_hub_download + >>> import torch + >>> from transformers import AutoformerConfig, AutoformerForPrediction + + >>> file = hf_hub_download( + ... repo_id="hf-internal-testing/tourism-monthly-batch", filename="train-batch.pt", repo_type="dataset" + ... ) + >>> batch = torch.load(file) + + >>> # check number of static real features + >>> num_static_real_features = batch["static_real_features"].shape[-1] + + >>> # load configuration of pretrained model and override num_static_real_features + >>> configuration = AutoformerConfig.from_pretrained( + ... "huggingface/autoformer-tourism-monthly", + ... num_static_real_features=num_static_real_features, + ... ) + >>> # we also need to update feature_size as it is not recalculated + >>> configuration.feature_size += num_static_real_features + + >>> model = AutoformerForPrediction(configuration) + + >>> outputs = model( + ... past_values=batch["past_values"], + ... past_time_features=batch["past_time_features"], + ... past_observed_mask=batch["past_observed_mask"], + ... static_categorical_features=batch["static_categorical_features"], + ... static_real_features=batch["static_real_features"], + ... future_values=batch["future_values"], + ... future_time_features=batch["future_time_features"], + ... ) + ``` + + + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if future_values is not None: + use_cache = False + + outputs = self.model( + past_values=past_values, + past_time_features=past_time_features, + past_observed_mask=past_observed_mask, + static_categorical_features=static_categorical_features, + static_real_features=static_real_features, + future_values=future_values, + future_time_features=future_time_features, + decoder_attention_mask=decoder_attention_mask, + head_mask=head_mask, + decoder_head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + encoder_outputs=encoder_outputs, + past_key_values=past_key_values, + output_hidden_states=output_hidden_states, + output_attentions=output_attentions, + use_cache=use_cache, + return_dict=return_dict, + ) + + prediction_loss = None + params = None + if future_values is not None: + # outputs.last_hidden_state and trend + # loc is 4rd last and scale is 3rd last output + params = self.output_params(outputs[0] + outputs[1]) + distribution = self.output_distribution(params, loc=outputs[-3], scale=outputs[-2]) + + loss = self.loss(distribution, future_values) + + if future_observed_mask is None: + future_observed_mask = torch.ones_like(future_values) + + if len(self.target_shape) == 0: + loss_weights = future_observed_mask + else: + loss_weights, _ = future_observed_mask.min(dim=-1, keepdim=False) + + prediction_loss = weighted_average(loss, weights=loss_weights) + + if not return_dict: + outputs = ((params,) + outputs[2:]) if params is not None else outputs[2:] + return ((prediction_loss,) + outputs) if prediction_loss is not None else outputs + + return Seq2SeqTSPredictionOutput( + loss=prediction_loss, + params=params, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + loc=outputs.loc, + scale=outputs.scale, + static_features=outputs.static_features, + ) + + @torch.no_grad() + def generate( + self, + past_values: torch.Tensor, + past_time_features: torch.Tensor, + future_time_features: torch.Tensor, + past_observed_mask: Optional[torch.Tensor] = None, + static_categorical_features: Optional[torch.Tensor] = None, + static_real_features: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + ) -> SampleTSPredictionOutput: + r""" + Greedily generate sequences of sample predictions from a model with a probability distribution head. + + Parameters: + past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, input_size)`): + Past values of the time series, that serve as context in order to predict the future. The sequence size + of this tensor must be larger than the `context_length` of the model, since the model will use the + larger size to construct lag features, i.e. additional values from the past which are added in order to + serve as "extra context". + + The `sequence_length` here is equal to `config.context_length` + `max(config.lags_sequence)`, which if + no `lags_sequence` is configured, is equal to `config.context_length` + 7 (as by default, the largest + look-back index in `config.lags_sequence` is 7). The property `_past_length` returns the actual length + of the past. + + The `past_values` is what the Transformer encoder gets as input (with optional additional features, + such as `static_categorical_features`, `static_real_features`, `past_time_features` and lags). + + Optionally, missing values need to be replaced with zeros and indicated via the `past_observed_mask`. + + For multivariate time series, the `input_size` > 1 dimension is required and corresponds to the number + of variates in the time series per time step. + past_time_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_features)`): + Required time features, which the model internally will add to `past_values`. These could be things + like "month of year", "day of the month", etc. encoded as vectors (for instance as Fourier features). + These could also be so-called "age" features, which basically help the model know "at which point in + life" a time-series is. Age features have small values for distant past time steps and increase + monotonically the more we approach the current time step. Holiday features are also a good example of + time features. + + These features serve as the "positional encodings" of the inputs. So contrary to a model like BERT, + where the position encodings are learned from scratch internally as parameters of the model, the Time + Series Transformer requires to provide additional time features. The Time Series Transformer only + learns additional embeddings for `static_categorical_features`. + + Additional dynamic real covariates can be concatenated to this tensor, with the caveat that these + features must but known at prediction time. + + The `num_features` here is equal to `config.`num_time_features` + `config.num_dynamic_real_features`. + future_time_features (`torch.FloatTensor` of shape `(batch_size, prediction_length, num_features)`): + Required time features for the prediction window, which the model internally will add to sampled + predictions. These could be things like "month of year", "day of the month", etc. encoded as vectors + (for instance as Fourier features). These could also be so-called "age" features, which basically help + the model know "at which point in life" a time-series is. Age features have small values for distant + past time steps and increase monotonically the more we approach the current time step. Holiday features + are also a good example of time features. + + These features serve as the "positional encodings" of the inputs. So contrary to a model like BERT, + where the position encodings are learned from scratch internally as parameters of the model, the Time + Series Transformer requires to provide additional time features. The Time Series Transformer only + learns additional embeddings for `static_categorical_features`. + + Additional dynamic real covariates can be concatenated to this tensor, with the caveat that these + features must but known at prediction time. + + The `num_features` here is equal to `config.`num_time_features` + `config.num_dynamic_real_features`. + past_observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, input_size)`, *optional*): + Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected + in `[0, 1]`: + + - 1 for values that are **observed**, + - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros). + + static_categorical_features (`torch.LongTensor` of shape `(batch_size, number of static categorical features)`, *optional*): + Optional static categorical features for which the model will learn an embedding, which it will add to + the values of the time series. + + Static categorical features are features which have the same value for all time steps (static over + time). + + A typical example of a static categorical feature is a time series ID. + static_real_features (`torch.FloatTensor` of shape `(batch_size, number of static real features)`, *optional*): + Optional static real features which the model will add to the values of the time series. + + Static real features are features which have the same value for all time steps (static over time). + + A typical example of a static real feature is promotion information. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. + + Return: + [`SampleTSPredictionOutput`] where the outputs `sequences` tensor will have shape `(batch_size, number of + samples, prediction_length)` or `(batch_size, number of samples, prediction_length, input_size)` for + multivariate predictions. + """ + outputs = self( + static_categorical_features=static_categorical_features, + static_real_features=static_real_features, + past_time_features=past_time_features, + past_values=past_values, + past_observed_mask=past_observed_mask, + future_time_features=None, + future_values=None, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + use_cache=False, + ) + + decoder = self.model.get_decoder() + enc_last_hidden = outputs.encoder_last_hidden_state + loc = outputs.loc + scale = outputs.scale + static_feat = outputs.static_features + + num_parallel_samples = self.config.num_parallel_samples + repeated_loc = loc.repeat_interleave(repeats=num_parallel_samples, dim=0) + repeated_scale = scale.repeat_interleave(repeats=num_parallel_samples, dim=0) + + repeated_past_values = ( + past_values.repeat_interleave(repeats=num_parallel_samples, dim=0) - repeated_loc + ) / repeated_scale + + time_features = torch.cat((past_time_features, future_time_features), dim=1) + + expanded_static_feat = static_feat.unsqueeze(1).expand(-1, time_features.shape[1], -1) + features = torch.cat((expanded_static_feat, time_features), dim=-1) + repeated_features = features.repeat_interleave(repeats=num_parallel_samples, dim=0) + + repeated_enc_last_hidden = enc_last_hidden.repeat_interleave(repeats=num_parallel_samples, dim=0) + + lagged_sequence = self.model.get_lagged_subsequences( + sequence=repeated_past_values, subsequences_length=self.config.context_length + ) + lags_shape = lagged_sequence.shape + reshaped_lagged_sequence = lagged_sequence.reshape(lags_shape[0], lags_shape[1], -1) + seasonal_input, trend_input = self.model.decomposition_layer(reshaped_lagged_sequence) + + mean = torch.mean(reshaped_lagged_sequence, dim=1).unsqueeze(1).repeat(1, self.config.prediction_length, 1) + zeros = torch.zeros( + [reshaped_lagged_sequence.shape[0], self.config.prediction_length, reshaped_lagged_sequence.shape[2]], + device=reshaped_lagged_sequence.device, + ) + + decoder_input = torch.cat( + ( + torch.cat((seasonal_input[:, -self.config.label_length :, ...], zeros), dim=1), + repeated_features[:, -self.config.prediction_length - self.config.label_length :, ...], + ), + dim=-1, + ) + trend_init = torch.cat( + ( + torch.cat((trend_input[:, -self.config.label_length :, ...], mean), dim=1), + repeated_features[:, -self.config.prediction_length - self.config.label_length :, ...], + ), + dim=-1, + ) + decoder_outputs = decoder( + trend=trend_init, inputs_embeds=decoder_input, encoder_hidden_states=repeated_enc_last_hidden + ) + decoder_last_hidden = decoder_outputs.last_hidden_state + trend = decoder_outputs.trend + params = self.output_params(decoder_last_hidden + trend) + distr = self.output_distribution(params, loc=repeated_loc, scale=repeated_scale) + future_samples = distr.sample() + + return SampleTSPredictionOutput( + sequences=future_samples.reshape( + (-1, num_parallel_samples, self.config.prediction_length) + self.target_shape, + ) + ) + + +__all__ = ["AutoformerForPrediction", "AutoformerModel", "AutoformerPreTrainedModel"] diff --git a/docs/transformers/build/lib/transformers/models/aya_vision/__init__.py b/docs/transformers/build/lib/transformers/models/aya_vision/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f8be47cb228b19f02b87d195747e78c2a87de752 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/aya_vision/__init__.py @@ -0,0 +1,28 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_aya_vision import * + from .modeling_aya_vision import * + from .processing_aya_vision import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/docs/transformers/build/lib/transformers/models/aya_vision/configuration_aya_vision.py b/docs/transformers/build/lib/transformers/models/aya_vision/configuration_aya_vision.py new file mode 100644 index 0000000000000000000000000000000000000000..ad7fdfd319d183d454c482a76ec2ab9fa00572c6 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/aya_vision/configuration_aya_vision.py @@ -0,0 +1,112 @@ +# coding=utf-8 +# Copyright 2025 Cohere team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""AyaVision model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging +from ..auto import CONFIG_MAPPING, AutoConfig + + +logger = logging.get_logger(__name__) + + +class AyaVisionConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`AyaVisionForConditionalGeneration`]. It is used to instantiate an + AyaVision model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of AyaVision. + e.g. [CohereForAI/aya-vision-8b](https://huggingface.co/CohereForAI/aya-vision-8b) + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vision_config (`Union[AutoConfig, dict]`, *optional*, defaults to `CLIPVisionConfig`): + The config object or dictionary of the vision backbone. + text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `LlamaConfig`): + The config object or dictionary of the text backbone. + vision_feature_select_strategy (`str`, *optional*, defaults to `"full"`): + The feature selection strategy used to select the vision feature from the vision backbone. + Can be one of `"default"` or `"full"`. If `"default"`, the CLS token is removed from the vision features. + If `"full"`, the full vision features are used. + vision_feature_layer (`int`, *optional*, defaults to -1): + The index of the layer to select the vision feature. + downsample_factor (`int`, *optional*, defaults to 2): + The downsample factor to apply to the vision features. + adapter_layer_norm_eps (`float`, *optional*, defaults to 1e-06): + The epsilon value used for layer normalization in the adapter. + image_token_index (`int`, *optional*, defaults to 255036): + The image token index to encode the image prompt. + """ + + model_type = "aya_vision" + attribute_map = { + "image_token_id": "image_token_index", + } + sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig} + + def __init__( + self, + vision_config=None, + text_config=None, + vision_feature_select_strategy="full", + vision_feature_layer=-1, + downsample_factor=2, + adapter_layer_norm_eps=1e-6, + image_token_index=255036, + **kwargs, + ): + self.image_token_index = image_token_index + self.downsample_factor = downsample_factor + self.adapter_layer_norm_eps = adapter_layer_norm_eps + if vision_feature_select_strategy not in ["default", "full"]: + raise ValueError( + "vision_feature_select_strategy should be one of 'default', 'full'." + f"Got: {vision_feature_select_strategy}" + ) + + self.vision_feature_select_strategy = vision_feature_select_strategy + self.vision_feature_layer = vision_feature_layer + + if isinstance(vision_config, dict): + vision_config["model_type"] = ( + vision_config["model_type"] if "model_type" in vision_config else "clip_vision_model" + ) + vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config) + elif vision_config is None: + vision_config = CONFIG_MAPPING["siglip_vision_model"]( + hidden_size=1152, + intermediate_size=4304, + patch_size=14, + image_size=384, + num_hidden_layers=26, + num_attention_heads=14, + vision_use_head=False, + ) + + self.vision_config = vision_config + + if isinstance(text_config, dict): + text_config["model_type"] = text_config["model_type"] if "model_type" in text_config else "llama" + text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) + elif text_config is None: + text_config = CONFIG_MAPPING["cohere2"]() + + self.text_config = text_config + + super().__init__(**kwargs) + + +__all__ = ["AyaVisionConfig"] diff --git a/docs/transformers/build/lib/transformers/models/aya_vision/modeling_aya_vision.py b/docs/transformers/build/lib/transformers/models/aya_vision/modeling_aya_vision.py new file mode 100644 index 0000000000000000000000000000000000000000..45c2ab66e3e96dcfe2372ee1434bab1eebe52323 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/aya_vision/modeling_aya_vision.py @@ -0,0 +1,546 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/aya_vision/modular_aya_vision.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_aya_vision.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2025 the Cohere Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + +import torch +from torch import nn + +from ...activations import ACT2FN +from ...generation import GenerationMixin +from ...modeling_outputs import ModelOutput +from ...modeling_utils import PreTrainedModel +from ...utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_torchdynamo_compiling, + replace_return_docstrings, +) +from ..auto import AutoModel, AutoModelForCausalLM +from .configuration_aya_vision import AyaVisionConfig + + +_CONFIG_FOR_DOC = "AyaVisionConfig" + + +class AyaVisionMultiModalProjector(nn.Module): + def __init__(self, config: AyaVisionConfig): + super().__init__() + self.config = config + self.downsample_factor = config.downsample_factor + self.alignment_intermediate_size = getattr( + config, "alignment_intermediate_size", config.text_config.hidden_size + ) + self.layernorm = nn.LayerNorm( + config.vision_config.hidden_size * (config.downsample_factor**2), eps=config.adapter_layer_norm_eps + ) + + self.linear_1 = nn.Linear( + config.vision_config.hidden_size * (config.downsample_factor**2), + self.alignment_intermediate_size, + bias=True, + ) + + self.act = ACT2FN["silu"] # SwiGLU uses SiLU activation + # For SwiGLU, project down to half size since we split intermediate dim + self.linear_2 = nn.Linear(self.alignment_intermediate_size // 2, config.text_config.hidden_size, bias=True) + + def forward(self, image_features): + image_features = self.pixel_shuffle(image_features) + image_features = self.layernorm(image_features) + hidden_states = self.linear_1(image_features) + + # Split along last dimension and apply SwiGLU + x, gate = hidden_states.chunk(2, dim=-1) + hidden_states = self.act(gate) * x + + hidden_states = self.linear_2(hidden_states) + return hidden_states + + def pixel_shuffle(self, image_features): # B, S, D + batch_size, seq_length, feature_dim = image_features.shape + height = width = int(seq_length**0.5) + image_features = image_features.reshape(image_features.shape[0], width, height, -1) + channels = image_features.shape[-1] + image_features = image_features.reshape( + batch_size, width, int(height / self.downsample_factor), int(channels * self.downsample_factor) + ) + image_features = image_features.permute(0, 2, 1, 3) + image_features = image_features.reshape( + batch_size, int(height / self.downsample_factor), int(width / self.downsample_factor), -1 + ) + image_features = image_features.permute(0, 2, 1, 3) + return image_features + + +AYA_VISION_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`AyaVisionConfig`] or [`AyaVisionVisionConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +@add_start_docstrings( + "The bare Aya Vision Model outputting raw hidden-states without any specific head on top.", + AYA_VISION_START_DOCSTRING, +) +class AyaVisionPreTrainedModel(PreTrainedModel): + config_class = AyaVisionConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["AyaVisionVisionAttention"] + _skip_keys_device_placement = "past_key_values" + _supports_cache_class = True + _supports_flash_attn_2 = True + _supports_sdpa = True + _supports_quantized_cache = False + _supports_static_cache = False + + def _init_weights(self, module): + std = ( + self.config.initializer_range + if hasattr(self.config, "initializer_range") + else self.config.text_config.initializer_range + ) + + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.weight.data.fill_(1.0) + module.bias.data.zero_() + + +@dataclass +class AyaVisionCausalLMOutputWithPast(ModelOutput): + """ + Base class for AyaVision causal language model (or autoregressive) outputs. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss (for next-token prediction). + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + image_hidden_states (`torch.FloatTensor`, *optional*): + A `torch.FloatTensor` of size (batch_size, num_images, sequence_length, hidden_size)`. + image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state. + """ + + loss: Optional[torch.FloatTensor] = None + logits: Optional[torch.FloatTensor] = None + past_key_values: Optional[List[torch.FloatTensor]] = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + image_hidden_states: Optional[torch.FloatTensor] = None + + +AYA_VISION_INPUTS_DOCSTRING = """ + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)): + The tensors corresponding to the input images. Pixel values can be obtained using + [`AutoImageProcessor`]. See [`GotOcr2ImageProcessor.__call__`] for details. [`AyaVisionProcessor`] uses + [`GotOcr2ImageProcessor`] for processing images. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. [What are position IDs?](../glossary#position-ids) + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + vision_feature_layer (`Union[int, List[int]], *optional*, defaults to -2`): + The index of the layer to select the vision feature. If multiple indices are provided, + the vision feature of the corresponding indices will be concatenated to form the + vision features. + vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`): + The feature selection strategy used to select the vision feature from the vision backbone. + Can be one of `"default"` or `"full"`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`, + this tensor is not affected by padding. It is used to update the cache in the correct position and to infer + the complete sequence length. +""" + + +@add_start_docstrings( + """The AyaVision model which consists of a vision backbone and a language model.""", + AYA_VISION_START_DOCSTRING, +) +class AyaVisionForConditionalGeneration(AyaVisionPreTrainedModel, GenerationMixin): + def __init__(self, config: AyaVisionConfig): + super().__init__(config) + self.vision_tower = AutoModel.from_config(config.vision_config) + + self.multi_modal_projector = AyaVisionMultiModalProjector(config) + self.vocab_size = config.text_config.vocab_size + self.language_model = AutoModelForCausalLM.from_config(config.text_config) + + if self.language_model._tied_weights_keys is not None: + self._tied_weights_keys = [f"language_model.{k}" for k in self.language_model._tied_weights_keys] + + self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1 + + self.post_init() + + def get_input_embeddings(self): + return self.language_model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.language_model.set_input_embeddings(value) + + def get_output_embeddings(self): + return self.language_model.get_output_embeddings() + + def set_output_embeddings(self, new_embeddings): + self.language_model.set_output_embeddings(new_embeddings) + + def set_decoder(self, decoder): + self.language_model.set_decoder(decoder) + + def get_decoder(self): + return self.language_model.get_decoder() + + def get_image_features( + self, + pixel_values: torch.FloatTensor, + vision_feature_layer: Union[int, List[int]], + vision_feature_select_strategy: str, + **kwargs, + ): + """ + Obtains image last hidden states from the vision tower and apply multimodal projection. + + Args: + pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`) + The tensors corresponding to the input images. + vision_feature_layer (`Union[int, List[int]]`): + The index of the layer to select the vision feature. If multiple indices are provided, + the vision feature of the corresponding indices will be concatenated to form the + vision features. + vision_feature_select_strategy (`str`): + The feature selection strategy used to select the vision feature from the vision backbone. + Can be one of `"default"` or `"full"` + Returns: + image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`). + """ + if vision_feature_select_strategy not in ["default", "full"]: + raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}") + + kwargs = {k: v for k, v in kwargs.items() if v is not None} + # this is not memory efficient at all (output_hidden_states=True) will save all the hidden states. + image_outputs = self.vision_tower(pixel_values, output_hidden_states=True, **kwargs) + + # If we have one vision feature layer, return the corresponding hidden states, + # otherwise, select the hidden states of each feature layer and concatenate them + if isinstance(vision_feature_layer, int): + selected_image_feature = image_outputs.hidden_states[vision_feature_layer] + if vision_feature_select_strategy == "default": + selected_image_feature = selected_image_feature[:, 1:] + else: + hs_pool = [image_outputs.hidden_states[layer_idx] for layer_idx in vision_feature_layer] + # For default; crop CLS from each hidden state in the hidden state pool + if vision_feature_select_strategy == "default": + hs_pool = [hs[:, 1:] for hs in hs_pool] + selected_image_feature = torch.cat(hs_pool, dim=-1) + + image_features = self.multi_modal_projector(selected_image_feature) + return image_features + + @add_start_docstrings_to_model_forward(AYA_VISION_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=AyaVisionCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + vision_feature_layer: Optional[Union[int, List[int]]] = None, + vision_feature_select_strategy: Optional[str] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + image_sizes: Optional[torch.Tensor] = None, + **lm_kwargs, + ) -> Union[Tuple, AyaVisionCausalLMOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + logits_to_keep (`int` or `torch.Tensor`, *optional*): + If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all + `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that + token can save memory, which becomes pretty significant for long sequences or large vocabulary size. + If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension. + This is useful when using packed tensor format (single dimension for batch and sequence length). + + + Returns: + + Example: + + ```python + >>> from transformers import AutoProcessor, AyaVisionForConditionalGeneration + >>> import torch + + >>> torch_device = "cuda:0" + >>> processor = AutoProcessor.from_pretrained("CohereForAI/aya-vision-8b", use_fast=True) + >>> model = AyaVisionForConditionalGeneration.from_pretrained("CohereForAI/aya-vision-8b", device_map=torch_device) + + >>> messages = [ + ... { + ... "role": "user", + ... "content": [ + ... { + ... "type": "image", + ... "url": "https://pbs.twimg.com/media/Fx7YvfQWYAIp6rZ?format=jpg&name=medium", + ... }, + ... {"type": "text", "text": "चित्र में लिखा पाठ क्या कहता है?"}, + ... ], + ... } + ... ] + + >>> inputs = processor.apply_chat_template( + ... messages, padding=True, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt", device=torch_device + ... ).to(model.device) + + >>> gen_tokens = model.generate(**inputs, max_new_tokens=300, do_sample=True, temperature=0.3) + >>> processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True) + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + vision_feature_layer = ( + vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer + ) + vision_feature_select_strategy = ( + vision_feature_select_strategy + if vision_feature_select_strategy is not None + else self.config.vision_feature_select_strategy + ) + + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if pixel_values is not None and inputs_embeds is not None: + raise ValueError( + "You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one" + ) + + if inputs_embeds is None: + inputs_embeds = self.get_input_embeddings()(input_ids) + + if pixel_values is not None: + image_features = self.get_image_features( + pixel_values=pixel_values, + vision_feature_layer=vision_feature_layer, + vision_feature_select_strategy=vision_feature_select_strategy, + image_sizes=image_sizes, + ) + + special_image_mask = (input_ids == self.config.image_token_id).unsqueeze(-1) + special_image_mask = special_image_mask.expand_as(inputs_embeds).to(inputs_embeds.device) + if not is_torchdynamo_compiling() and inputs_embeds[special_image_mask].numel() != image_features.numel(): + n_image_tokens = (input_ids == self.config.image_token_id).sum() + n_image_features = image_features.shape[0] * image_features.shape[1] + raise ValueError( + f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}" + ) + image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype) + inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features) + + outputs = self.language_model( + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + logits_to_keep=logits_to_keep, + **lm_kwargs, + ) + + logits = outputs[0] + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + if attention_mask is not None: + # we use the input attention mask to shift the logits and labels, because it is 2D. + # we also crop attn mask in case it is longer, which happens in PrefixTuning with peft + shift_attention_mask = attention_mask[:, -(logits.shape[1] - 1) :].to(logits.device) + shift_logits = logits[..., :-1, :][shift_attention_mask.to(logits.device) != 0].contiguous() + shift_labels = labels[..., 1:][shift_attention_mask.to(labels.device) != 0].contiguous() + else: + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = nn.CrossEntropyLoss() + loss = loss_fct( + shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1).to(shift_logits.device) + ) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return AyaVisionCausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + image_hidden_states=image_features if pixel_values is not None else None, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + inputs_embeds=None, + pixel_values=None, + attention_mask=None, + cache_position=None, + logits_to_keep=None, + **kwargs, + ): + # Overwritten -- in specific circumstances we don't want to forward image inputs to the model + + model_inputs = self.language_model.prepare_inputs_for_generation( + input_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + cache_position=cache_position, + logits_to_keep=logits_to_keep, + **kwargs, + ) + + if cache_position[0] == 0: + # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore + # Otherwise we need pixel values to be passed to model + model_inputs["pixel_values"] = pixel_values + + return model_inputs + + def tie_weights(self): + return self.language_model.tie_weights() + + def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None) -> nn.Embedding: + model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of) + # update vocab size + self.config.text_config.vocab_size = model_embeds.num_embeddings + self.vocab_size = model_embeds.num_embeddings + return model_embeds + + +__all__ = ["AyaVisionForConditionalGeneration", "AyaVisionPreTrainedModel"] diff --git a/docs/transformers/build/lib/transformers/models/aya_vision/modular_aya_vision.py b/docs/transformers/build/lib/transformers/models/aya_vision/modular_aya_vision.py new file mode 100644 index 0000000000000000000000000000000000000000..96f0de888dd793b22c4eaeba6e94684740b61c78 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/aya_vision/modular_aya_vision.py @@ -0,0 +1,315 @@ +# coding=utf-8 +# Copyright 2025 the Cohere Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch AyaVision model.""" + +from typing import List, Optional, Tuple, Union + +import torch +from torch import nn + +from transformers.models.llava.modeling_llava import ( + LlavaCausalLMOutputWithPast, + LlavaForConditionalGeneration, + LlavaPreTrainedModel, +) + +from ...activations import ACT2FN +from ...utils import ( + add_start_docstrings, + logging, +) +from .configuration_aya_vision import AyaVisionConfig + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "AyaVisionConfig" + + +class AyaVisionMultiModalProjector(nn.Module): + def __init__(self, config: AyaVisionConfig): + super().__init__() + self.config = config + self.downsample_factor = config.downsample_factor + self.alignment_intermediate_size = getattr( + config, "alignment_intermediate_size", config.text_config.hidden_size + ) + self.layernorm = nn.LayerNorm( + config.vision_config.hidden_size * (config.downsample_factor**2), eps=config.adapter_layer_norm_eps + ) + + self.linear_1 = nn.Linear( + config.vision_config.hidden_size * (config.downsample_factor**2), + self.alignment_intermediate_size, + bias=True, + ) + + self.act = ACT2FN["silu"] # SwiGLU uses SiLU activation + # For SwiGLU, project down to half size since we split intermediate dim + self.linear_2 = nn.Linear(self.alignment_intermediate_size // 2, config.text_config.hidden_size, bias=True) + + def forward(self, image_features): + image_features = self.pixel_shuffle(image_features) + image_features = self.layernorm(image_features) + hidden_states = self.linear_1(image_features) + + # Split along last dimension and apply SwiGLU + x, gate = hidden_states.chunk(2, dim=-1) + hidden_states = self.act(gate) * x + + hidden_states = self.linear_2(hidden_states) + return hidden_states + + def pixel_shuffle(self, image_features): # B, S, D + batch_size, seq_length, feature_dim = image_features.shape + height = width = int(seq_length**0.5) + image_features = image_features.reshape(image_features.shape[0], width, height, -1) + channels = image_features.shape[-1] + image_features = image_features.reshape( + batch_size, width, int(height / self.downsample_factor), int(channels * self.downsample_factor) + ) + image_features = image_features.permute(0, 2, 1, 3) + image_features = image_features.reshape( + batch_size, int(height / self.downsample_factor), int(width / self.downsample_factor), -1 + ) + image_features = image_features.permute(0, 2, 1, 3) + return image_features + + +AYA_VISION_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`AyaVisionConfig`] or [`AyaVisionVisionConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +@add_start_docstrings( + "The bare Aya Vision Model outputting raw hidden-states without any specific head on top.", + AYA_VISION_START_DOCSTRING, +) +class AyaVisionPreTrainedModel(LlavaPreTrainedModel): + _supports_quantized_cache = False + _supports_static_cache = False + + def _init_weights(self, module): + std = ( + self.config.initializer_range + if hasattr(self.config, "initializer_range") + else self.config.text_config.initializer_range + ) + + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.LayerNorm): + module.weight.data.fill_(1.0) + module.bias.data.zero_() + + +class AyaVisionCausalLMOutputWithPast(LlavaCausalLMOutputWithPast): + pass + + +AYA_VISION_INPUTS_DOCSTRING = """ + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)): + The tensors corresponding to the input images. Pixel values can be obtained using + [`AutoImageProcessor`]. See [`GotOcr2ImageProcessor.__call__`] for details. [`AyaVisionProcessor`] uses + [`GotOcr2ImageProcessor`] for processing images. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. [What are position IDs?](../glossary#position-ids) + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + vision_feature_layer (`Union[int, List[int]], *optional*, defaults to -2`): + The index of the layer to select the vision feature. If multiple indices are provided, + the vision feature of the corresponding indices will be concatenated to form the + vision features. + vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`): + The feature selection strategy used to select the vision feature from the vision backbone. + Can be one of `"default"` or `"full"`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`, + this tensor is not affected by padding. It is used to update the cache in the correct position and to infer + the complete sequence length. +""" + + +@add_start_docstrings( + """The AyaVision model which consists of a vision backbone and a language model.""", + AYA_VISION_START_DOCSTRING, +) +class AyaVisionForConditionalGeneration(LlavaForConditionalGeneration): + def tie_weights(self): + return self.language_model.tie_weights() + + def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None) -> nn.Embedding: + model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of) + # update vocab size + self.config.text_config.vocab_size = model_embeds.num_embeddings + self.vocab_size = model_embeds.num_embeddings + return model_embeds + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + vision_feature_layer: Optional[Union[int, List[int]]] = None, + vision_feature_select_strategy: Optional[str] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + image_sizes: Optional[torch.Tensor] = None, + **lm_kwargs, + ) -> Union[Tuple, AyaVisionCausalLMOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + logits_to_keep (`int` or `torch.Tensor`, *optional*): + If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all + `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that + token can save memory, which becomes pretty significant for long sequences or large vocabulary size. + If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension. + This is useful when using packed tensor format (single dimension for batch and sequence length). + + + Returns: + + Example: + + ```python + >>> from transformers import AutoProcessor, AyaVisionForConditionalGeneration + >>> import torch + + >>> torch_device = "cuda:0" + >>> processor = AutoProcessor.from_pretrained("CohereForAI/aya-vision-8b", use_fast=True) + >>> model = AyaVisionForConditionalGeneration.from_pretrained("CohereForAI/aya-vision-8b", device_map=torch_device) + + >>> messages = [ + ... { + ... "role": "user", + ... "content": [ + ... { + ... "type": "image", + ... "url": "https://pbs.twimg.com/media/Fx7YvfQWYAIp6rZ?format=jpg&name=medium", + ... }, + ... {"type": "text", "text": "चित्र में लिखा पाठ क्या कहता है?"}, + ... ], + ... } + ... ] + + >>> inputs = processor.apply_chat_template( + ... messages, padding=True, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt", device=torch_device + ... ).to(model.device) + + >>> gen_tokens = model.generate(**inputs, max_new_tokens=300, do_sample=True, temperature=0.3) + >>> processor.tokenizer.decode(gen_tokens[0][inputs.input_ids.shape[1]:], skip_special_tokens=True) + ```""" + super().forward( + input_ids=input_ids, + pixel_values=pixel_values, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + vision_feature_layer=vision_feature_layer, + vision_feature_select_strategy=vision_feature_select_strategy, + labels=labels, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + logits_to_keep=logits_to_keep, + image_sizes=image_sizes, + **lm_kwargs, + ) + + +__all__ = ["AyaVisionForConditionalGeneration", "AyaVisionPreTrainedModel"] diff --git a/docs/transformers/build/lib/transformers/models/aya_vision/processing_aya_vision.py b/docs/transformers/build/lib/transformers/models/aya_vision/processing_aya_vision.py new file mode 100644 index 0000000000000000000000000000000000000000..3b9afecda50fd1f34d681922eff43949bf44a519 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/aya_vision/processing_aya_vision.py @@ -0,0 +1,255 @@ +# coding=utf-8 +# Copyright 2025 HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import List, Optional, Union + +from transformers.processing_utils import ( + ImagesKwargs, + ProcessingKwargs, + ProcessorMixin, + Unpack, +) +from transformers.tokenization_utils_base import PreTokenizedInput, TextInput + +from ...image_processing_utils import BatchFeature +from ...image_utils import ( + ImageInput, + make_flat_list_of_images, +) + + +class AyaVisionImagesKwargs(ImagesKwargs, total=False): + crop_to_patches: Optional[bool] + min_patches: Optional[int] + max_patches: Optional[int] + + +class AyaVisionProcessorKwargs(ProcessingKwargs, total=False): + images_kwargs: AyaVisionImagesKwargs + _defaults = { + "text_kwargs": { + "padding_side": "left", + "padding": True, + }, + "images_kwargs": { + "crop_to_patches": True, + }, + } + + +class AyaVisionProcessor(ProcessorMixin): + r""" + Constructs a AyaVision processor which wraps a [`AutoImageProcessor`] and + [`PretrainedTokenizerFast`] tokenizer into a single processor that inherits both the image processor and + tokenizer functionalities. See the [`~AyaVisionProcessor.__call__`] and [`~AyaVisionProcessor.decode`] for more information. + Args: + image_processor ([`AutoImageProcessor`], *optional*): + The image processor is a required input. + tokenizer ([`PreTrainedTokenizer`, `PreTrainedTokenizerFast`], *optional*): + The tokenizer is a required input. + patch_size (`int`, *optional*, defaults to 28): + The size of image patches for tokenization. + img_size (`int`, *optional*, defaults to 364): + The size of the image to be tokenized. This should correspond to the size given to the image processor. + image_token (`str`, *optional*, defaults to `""`): + The token to be used to represent an image in the text. + downsample_factor (`int`, *optional*, defaults to 1): + The factor by which to scale the patch size. + start_of_img_token (`str`, *optional*, defaults to `"<|START_OF_IMG|>"`): + The token to be used to represent the start of an image in the text. + end_of_img_token (`str`, *optional*, defaults to `"<|END_OF_IMG|>"`): + The token to be used to represent the end of an image in the text. + img_patch_token (`str`, *optional*, defaults to `"<|IMG_PATCH|>"`): + The token to be used to represent an image patch in the text. + img_line_break_token (`str`, *optional*, defaults to `"<|IMG_LINE_BREAK|>"`): + The token to be used to represent a line break in the text. + tile_token (`str`, *optional*, defaults to `"TILE"`): + The token to be used to represent an image patch in the text. + tile_global_token (`str`, *optional*, defaults to `"TILE_GLOBAL"`): + The token to be used to represent the cover image in the text. + chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages + in a chat into a tokenizable string. + """ + + attributes = ["image_processor", "tokenizer"] + valid_kwargs = [ + "chat_template", + "image_token", + "patch_size", + "img_size", + "downsample_factor", + "start_of_img_token", + "end_of_img_token", + "img_patch_token", + "img_line_break_token", + "tile_token", + "tile_global_token", + ] + image_processor_class = "AutoImageProcessor" + tokenizer_class = "AutoTokenizer" + + def __init__( + self, + image_processor=None, + tokenizer=None, + patch_size: int = 28, + img_size: int = 364, + image_token="", # set the default and let users change if they have peculiar special tokens in rare cases + downsample_factor: int = 1, + start_of_img_token="<|START_OF_IMG|>", + end_of_img_token="<|END_OF_IMG|>", + img_patch_token="<|IMG_PATCH|>", + img_line_break_token="<|IMG_LINE_BREAK|>", + tile_token="TILE", + tile_global_token="TILE_GLOBAL", + chat_template=None, + **kwargs, + ): + super().__init__(image_processor, tokenizer, chat_template=chat_template) + + self.image_token = image_token + self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token) + self.patch_size = patch_size * downsample_factor + self.img_size = img_size + + self.start_of_img_token = start_of_img_token + self.end_of_img_token = end_of_img_token + self.img_patch_token = img_patch_token + self.img_line_break_token = img_line_break_token + self.tile_token = tile_token + self.tile_global_token = tile_global_token + + def _prompt_split_image(self, num_patches): + """ + Create a structured string representation of image tokens + + Args: + num_patches: Number of patches in the image + + Returns: + String with appropriate image tokens + """ + + img_patches_per_tile = (self.img_size // self.patch_size) ** 2 + img_string = f"{self.start_of_img_token}" + if num_patches > 1: + for idx in range(1, num_patches): + img_string += f"{self.tile_token}_{idx}" + f"{self.img_patch_token}" * img_patches_per_tile + + img_string += f"{self.tile_global_token}" + f"{self.img_patch_token}" * img_patches_per_tile + img_string += f"{self.end_of_img_token}" + return img_string + + def __call__( + self, + images: Optional[ImageInput] = None, + text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, + audio=None, + videos=None, + **kwargs: Unpack[AyaVisionProcessorKwargs], + ) -> BatchFeature: + """ + Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text` + and `kwargs` arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizerFast.__call__`] to encode the text. + To prepare the vision inputs, this method forwards the `images` and `kwargs` arguments to + GotOcr2ImageProcessor's [`~GotOcr2ImageProcessor.__call__`] if `images` is not `None`. + + Args: + images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. Both channels-first and channels-last formats are supported. + text (`str`, `List[str]`, `List[List[str]]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Acceptable values are: + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). + - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. + """ + if text is None: + raise ValueError("You have to specify text.") + + output_kwargs = self._merge_kwargs( + AyaVisionProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + + if not isinstance(text, (list, tuple)): + text = [text] + + # Process images + image_inputs = {} + if images is not None: + images = make_flat_list_of_images(images) + image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"]) + num_patches = image_inputs.pop("num_patches") + image_index = 0 + processed_text = [] + for prompt in text: + new_prompt = prompt + while "" in new_prompt: + # Replace the image placeholder with structured image tokens + image_tokens = self._prompt_split_image(num_patches[image_index]) + new_prompt = new_prompt.replace("", image_tokens, 1) + image_index += 1 + processed_text.append(new_prompt) + + if image_index != len(images): + raise ValueError("Number of image placeholders in the prompt does not match the number of images.") + + text = processed_text + + return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None) + text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"]) + self._check_special_mm_tokens(text, text_inputs, modalities=["image"]) + + return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors) + + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. + """ + return self.tokenizer.batch_decode(*args, **kwargs) + + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to PreTrainedTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to + the docstring of this method for more information. + """ + return self.tokenizer.decode(*args, **kwargs) + + @property + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + image_processor_input_names = self.image_processor.model_input_names + return list(tokenizer_input_names) + list(image_processor_input_names) + + +__all__ = ["AyaVisionProcessor"] diff --git a/docs/transformers/build/lib/transformers/models/bamba/__init__.py b/docs/transformers/build/lib/transformers/models/bamba/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c3920da849a33318e626f6df3fe65d1abc71aac7 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bamba/__init__.py @@ -0,0 +1,28 @@ +# Copyright 2024 IBM and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_bamba import * + from .modeling_bamba import * + from .processing_bamba import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/docs/transformers/build/lib/transformers/models/bamba/configuration_bamba.py b/docs/transformers/build/lib/transformers/models/bamba/configuration_bamba.py new file mode 100644 index 0000000000000000000000000000000000000000..36ac30ccca430c30315006df12341bea2dab1a09 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bamba/configuration_bamba.py @@ -0,0 +1,206 @@ +# coding=utf-8 +# Copyright 2024 IBM and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Bamba model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class BambaConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`BambaModel`]. It is used to instantiate a + BambaModel model according to the specified arguments, defining the model architecture. Instantiating a configuration + with defaults taken from [ibm-fms/Bamba-9.8b-2.2T-hf](https://huggingface.co/ibm-fms/Bamba-9.8b-2.2T-hf). + + The BambaModel is a hybrid [mamba2](https://github.com/state-spaces/mamba) architecture with SwiGLU. + The checkpoints are jointly trained by IBM, Princeton, and UIUC. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vocab_size (`int`, *optional*, defaults to 128000): + Vocabulary size of the Bamba model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`BambaModel`] + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether the model's input and output word embeddings should be tied. Note that this is only relevant if the + model has an output word embedding layer. + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 14336): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer encoder. + num_key_value_heads (`int`, *optional*, defaults to 8): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to `8`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + num_logits_to_keep (`int` or `None`, *optional*, defaults to 1): + Number of prompt logits to calculate during generation. If `None`, all logits will be calculated. If an + integer value, only last `num_logits_to_keep` logits will be calculated. Default is 1 because only the + logits of the last prompt token are needed for generation. For long sequences, the logits for the entire + sequence may use a lot of memory so, setting `num_logits_to_keep=1` will reduce memory footprint + significantly. + pad_token_id (`int`, *optional*, defaults to 0): + The id of the padding token. + bos_token_id (`int`, *optional*, defaults to 1): + The id of the "beginning-of-sequence" token. + eos_token_id (`int`, *optional*, defaults to 2): + The id of the "end-of-sequence" token. + max_position_embeddings (`int`, *optional*, defaults to 262144): + Max cached sequence length for the model + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + attn_layer_indices (`list`, *optional*): + Specifies the layer indices that will have full attention. Must contain values at most num_hidden_layers. + mamba_n_heads (`int`, *optional*, defaults to 128): + The number of mamba heads used in the v2 implementation. + mamba_d_head (`int`, *optional*, defaults to `"auto"`): + Head embedding dimension size + mamba_n_groups (`int`, *optional*, defaults to 1): + The number of the mamba groups used in the v2 implementation. + mamba_d_state (`int`, *optional*, defaults to 256): + The dimension the mamba state space latents + mamba_d_conv (`int`, *optional*, defaults to 4): + The size of the mamba convolution kernel + mamba_expand (`int`, *optional*, defaults to 2): + Expanding factor (relative to hidden_size) used to determine the mamba intermediate size + mamba_chunk_size (`int`, *optional*, defaults to 256): + The chunks in which to break the sequence when doing prefill/training + mamba_conv_bias (`bool`, *optional*, defaults to `True`): + Flag indicating whether or not to use bias in the convolution layer of the mamba mixer block. + mamba_proj_bias (`bool`, *optional*, defaults to `False`): + Flag indicating whether or not to use bias in the input and output projections (["in_proj", "out_proj"]) of the mamba mixer block + + """ + + model_type = "bamba" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=128000, + tie_word_embeddings=False, + hidden_size=4096, + intermediate_size=14336, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=8, + hidden_act="silu", + initializer_range=0.02, + rms_norm_eps=1e-5, + use_cache=True, + num_logits_to_keep=1, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + max_position_embeddings=262144, + attention_dropout=0.0, + attn_layer_indices=None, + mamba_n_heads=128, + mamba_d_head="auto", + mamba_n_groups=1, + mamba_d_state=256, + mamba_d_conv=4, + mamba_expand=2, + mamba_chunk_size=256, + mamba_conv_bias=True, + mamba_proj_bias=False, + **kwargs, + ): + self.vocab_size = vocab_size + self.tie_word_embeddings = tie_word_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.max_position_embeddings = max_position_embeddings + self.attention_dropout = attention_dropout + self.attention_bias = False + self.mlp_bias = False + + # for backward compatibility + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + + self.use_cache = use_cache + self.num_logits_to_keep = num_logits_to_keep + + self.attn_layer_indices = attn_layer_indices + self.rope_theta = 10000.0 + self.rope_scaling = None + self.partial_rotary_factor = 0.5 + + mamba_intermediate = mamba_expand * hidden_size + + if mamba_intermediate % mamba_n_heads != 0: + raise ValueError("mamba_n_heads must divide mamba_expand * hidden_size") + + # for the mamba_v2, must satisfy the following + if mamba_d_head == "auto": + mamba_d_head = mamba_intermediate // mamba_n_heads + + if mamba_d_head * mamba_n_heads != mamba_intermediate: + raise ValueError("The dimensions for the Mamba head state do not match the model intermediate_size") + + self.mamba_n_heads = mamba_n_heads + self.mamba_d_head = mamba_d_head + self.mamba_n_groups = mamba_n_groups + self.mamba_d_state = mamba_d_state + self.mamba_d_conv = mamba_d_conv + self.mamba_expand = mamba_expand + self.mamba_chunk_size = mamba_chunk_size + self.mamba_conv_bias = mamba_conv_bias + self.mamba_proj_bias = mamba_proj_bias + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + @property + def layers_block_type(self): + return [ + "attention" if (self.attn_layer_indices and i in self.attn_layer_indices) else "mamba" + for i in range(self.num_hidden_layers) + ] + + +__all__ = ["BambaConfig"] diff --git a/docs/transformers/build/lib/transformers/models/bamba/convert_mamba_ssm_checkpoint.py b/docs/transformers/build/lib/transformers/models/bamba/convert_mamba_ssm_checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..92ddcc88d4c214e3be0dcec5ef7ece963df6ed37 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bamba/convert_mamba_ssm_checkpoint.py @@ -0,0 +1,273 @@ +# coding=utf-8 +# Copyright 2024 IBM and the HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""This script can be used to convert checkpoints provided in the `mamba_ssm` library into the format provided in HuggingFace `transformers`. It depends on the `mamba2_ssm` package to be installed.""" + +import argparse +import json +import os +import re +from os import path +from typing import Dict, Optional, Union + +import torch +from huggingface_hub import split_torch_state_dict_into_shards +from safetensors.torch import save_file + +from transformers import AutoTokenizer +from transformers.utils import SAFE_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME + +from .configuration_bamba import BambaConfig + + +def convert_state_dict_from_mamba_ssm(original_sd: Dict) -> Dict[str, torch.Tensor]: + state_dict = {} + + for orig_k, param in original_sd.items(): + k = orig_k.replace("backbone", "model") + + # for embeddings + k = k.replace("embedding", "embed_tokens") + + # for mixer + k = k.replace("mixer", "mamba") + + # for final layernorm + k = k.replace("norm_f", "final_layernorm") + + # for block layernorm + k = re.sub(r"(\d+)\.norm\.", r"\1.input_layernorm.", k) + k = re.sub(r"(\d+)\.norm2\.", r"\1.pre_ff_layernorm.", k) + + # for mlp + k = k.replace("mlp.fc2", "feed_forward.down_proj") + + if "mlp.fc1" in k: + param, param2 = torch.chunk(param, 2, dim=0) + k2 = k.replace("mlp.fc1", "feed_forward.gate_proj") + state_dict[k2] = param2 + k = k.replace("mlp.fc1", "feed_forward.up_proj") + + if ("in_proj" in k and orig_k.replace("in_proj", "conv1d") in original_sd) or ( + "out_proj" in k and orig_k.replace("out_proj", "conv1d") in original_sd + ): + # then this must be a mamba + pass + else: + # for attn + # - because mixer was replaced to mamba above + k = k.replace("mamba.out_proj", "self_attn.o_proj") + if "mamba.in_proj" in k: + m, n = param.shape + d = (m - n) // 2 + param, param2, param3 = torch.split(param, [n, d, d], dim=0) + k2 = k.replace("mamba.in_proj", "self_attn.k_proj") + state_dict[k2] = param2 + k2 = k.replace("mamba.in_proj", "self_attn.v_proj") + state_dict[k2] = param3 + k = k.replace("mamba.in_proj", "self_attn.q_proj") + + state_dict[k] = param + + return state_dict + + +# Adapted from transformers.models.mamba.convert_mamba_ssm_checkpoint_to_pytorch.py +def convert_ssm_config_to_hf_config( + config_ssm: Dict, + **kwargs, +) -> BambaConfig: + """Convert a config from mamba_ssm to a BambaConfig from here.""" + hf_config: BambaConfig = BambaConfig(**kwargs) + + hf_config.architectures = ["BambaForCausalLM"] + + # Set important values from config and recalculate other resulting entries + hf_config.hidden_size = config_ssm["d_model"] + hf_config.intermediate_size = config_ssm["d_intermediate"] + hf_config.mamba_n_heads = (hf_config.hidden_size * hf_config.mamba_expand) // hf_config.mamba_d_head + hf_config.num_hidden_layers = config_ssm["n_layer"] + hf_config.tie_word_embeddings = config_ssm["tie_embeddings"] + + # currently this script assumes config_ssm belongs to v2 + if config_ssm["ssm_cfg"].get("layer") != "Mamba2": + raise ValueError("Conversion script only supports Mamba2") + + # Set attention values + attn_cfg = config_ssm.get("attn_cfg") + if attn_cfg: + assert attn_cfg["causal"], "Only support non-causal attention." + assert not attn_cfg["qkv_proj_bias"], "Only support no qkv bias." + assert not attn_cfg["out_proj_bias"], "Only support no out bias." + hf_config.attn_rotary_emb = attn_cfg["rotary_emb_dim"] + hf_config.num_attention_heads = attn_cfg["num_heads"] + hf_config.num_key_value_heads = attn_cfg["num_heads_kv"] + + attention_layer_indices = config_ssm.get("attn_layer_idx") + if attention_layer_indices: + hf_config.attn_layer_indices = attention_layer_indices + + # Padded vocab size, mostly of 16 but 32 is also very common in different models + vocab_size = config_ssm["vocab_size"] + pad_vocab_size_multiple = config_ssm["pad_vocab_size_multiple"] + if (vocab_size % pad_vocab_size_multiple) != 0: + vocab_size += pad_vocab_size_multiple - (vocab_size % pad_vocab_size_multiple) + hf_config.vocab_size = vocab_size + + return hf_config + + +def save_single_safetensor( + state_dict: Dict, + save_directory: str, + metadata: Dict, +): + save_file( + state_dict, + os.path.join(save_directory, SAFE_WEIGHTS_NAME), + metadata, + ) + + +def save_sharded_safetensors( + state_dict: Dict, + save_directory: str, + metadata: Dict, + max_shard_size: Union[int, str] = "5GB", +): + filename_pattern = SAFE_WEIGHTS_NAME.replace(".bin", "{suffix}.bin").replace( + ".safetensors", "{suffix}.safetensors" + ) + state_dict_split = split_torch_state_dict_into_shards( + state_dict, filename_pattern=filename_pattern, max_shard_size=max_shard_size + ) + index = { + "metadata": state_dict_split.metadata, + "weight_map": state_dict_split.tensor_to_filename, + } + # Save the index + with open(os.path.join(save_directory, SAFE_WEIGHTS_INDEX_NAME), "w", encoding="utf-8") as f: + content = json.dumps(index, indent=2, sort_keys=True) + "\n" + f.write(content) + + filename_to_tensors = state_dict_split.filename_to_tensors.items() + for shard_file, tensors in filename_to_tensors: + shard = {tensor: state_dict[tensor].contiguous() for tensor in tensors} + save_file(shard, os.path.join(save_directory, shard_file), metadata=metadata) + + +# Adapted from transformers.models.mamba.convert_mamba_ssm_checkpoint_to_pytorch.py +def convert_mamba_ssm_checkpoint_file_to_huggingface_model_file( + mamba_ssm_checkpoint_path: str, + precision: str, + output_dir: str, + tokenizer_path: Optional[str] = None, + save_model: Union[bool, str] = True, +) -> None: + # load tokenizer if provided, this will be used to set the + # token_ids in the config file + token_ids = {} + if tokenizer_path: + tokenizer = AutoTokenizer.from_pretrained(tokenizer_path) + for key in [ + "bos_token_id", + "eos_token_id", + "pad_token_id", + ]: + id = getattr(tokenizer, key, None) + if id: + token_ids[key] = id + + # there are some configs unsettable by mamba_ssn config, so + # if there are changes from the defaults, have to pass them into + # the function + unsettables = { + "mamba_d_head": 64, + "mamba_d_state": 128, + "mamba_n_groups": 1, + "rms_norm_eps": 1e-5, + } + + # Load and save config based on name + config_path = path.join(mamba_ssm_checkpoint_path, "config.json") + with open(config_path, "r", encoding="utf-8") as json_file: + config = json.load(json_file) + + # convert the config + hf_config = convert_ssm_config_to_hf_config( + config_ssm=config, + **token_ids, + **unsettables, + ) + hf_config.save_pretrained(output_dir) + + # Load state dict of the original model and transfer to hf model + state_dict = torch.load( + path.join(mamba_ssm_checkpoint_path, "pytorch_model.bin"), + map_location="cpu", + weights_only=True, + ) + # FIXME: allow other parameters to pass in + state_dict = convert_state_dict_from_mamba_ssm(state_dict) + + # Save new model to pytorch_dump_path + dtype = torch.float32 if precision == "fp32" else (torch.bfloat16 if precision == "bf16" else torch.float16) + + save_file_fn = None + if isinstance(save_model, bool) and save_model: + save_file_fn = save_single_safetensor + elif isinstance(save_model, str) and save_model == "sharded": + save_file_fn = save_sharded_safetensors + + if save_file_fn: + save_file_fn({k: v.to(dtype) for k, v in state_dict.items()}, output_dir, metadata={"format": "pt"}) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "-i", + "--mamba_ssm_checkpoint_directory", + type=str, + required=True, + help="Path to a directory containing the `pytorch_model.bin` mamba_ssm checkpoint file to be converted.", + ) + parser.add_argument( + "-p", + "--precision", + type=str, + default="fp16", + const="fp16", + required=True, + choices=("fp32", "fp16", "bf16"), + help="The precision the model will be saved in. Select from fp32, fp16 or bf16.", + ) + parser.add_argument( + "-o", "--output_dir", type=str, required=True, help="Path to directory to save the converted output model to." + ) + parser.add_argument( + "-t", + "--tokenizer_model_path", + type=str, + default=None, + required=False, + help="Path to a the tokenizer file.", + ) + args = parser.parse_args() + + convert_mamba_ssm_checkpoint_file_to_huggingface_model_file( + args.mamba2_checkpoint_directory, + args.precision, + args.output_dir, + ) diff --git a/docs/transformers/build/lib/transformers/models/bamba/modeling_bamba.py b/docs/transformers/build/lib/transformers/models/bamba/modeling_bamba.py new file mode 100644 index 0000000000000000000000000000000000000000..0fb696c1f7364fc403d24196a5c872d7cd6376f0 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bamba/modeling_bamba.py @@ -0,0 +1,1596 @@ +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# This file was automatically generated from src/transformers/models/bamba/modular_bamba.py. +# Do NOT edit this file manually as any edits will be overwritten by the generation of +# the file from the modular. If any change should be done, please apply the change to the +# modular_bamba.py file directly. One of our CI enforces this. +# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨 +# coding=utf-8 +# Copyright 2024 IBM and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Callable, Optional, Tuple, Union + +import torch +from torch import nn + +import transformers.models.jamba.modeling_jamba as modeling_jamba +from transformers.activations import ACT2FN + +from ...cache_utils import Cache # we need __iter__ and __len__ of pkv +from ...generation import GenerationMixin +from ...integrations import use_kernel_forward_from_hub +from ...modeling_attn_mask_utils import AttentionMaskConverter +from ...modeling_flash_attention_utils import FlashAttentionKwargs +from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast +from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update +from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel +from ...processing_utils import Unpack +from ...utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + can_return_tuple, + logging, + replace_return_docstrings, +) +from ...utils.import_utils import is_causal_conv1d_available, is_mamba_2_ssm_available +from .configuration_bamba import BambaConfig + + +if is_mamba_2_ssm_available(): + from mamba_ssm.ops.triton.selective_state_update import selective_state_update + from mamba_ssm.ops.triton.ssd_combined import mamba_chunk_scan_combined, mamba_split_conv1d_scan_combined +else: + selective_state_update = None + +if is_causal_conv1d_available(): + from causal_conv1d import causal_conv1d_fn, causal_conv1d_update +else: + causal_conv1d_update, causal_conv1d_fn = None, None + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "BambaConfig" + + +# Adapted from transformers.models.jamba.modeling_jamba.HybridMambaAttentionDynamicCache for the v2 mixer +class HybridMambaAttentionDynamicCache(modeling_jamba.HybridMambaAttentionDynamicCache): + """ + A dynamic cache that can handle both the attention cache (which has a seq_len dimension) and the mamba cache + (which has a constant shape regardless of seq_len). + + This cache has two sets of lists of tensors: `key_cache` and `value_cache` for attention cache and `conv_states` + and `ssm_states` for mamba cache. Each of these lists has `num_layers` tensors. The expected shape for each tensor + For attention layers, `key_cache` and `value_cache` have a shape of `(batch_size, num_heads, seq_len, head_dim)`, + while `conv_states` and `ssm_states` have a shape of `(batch_size, 0)` (empty tensors). + For mamba layers, `key_cache` and `value_cache` have a shape of `(batch_size, 0)` (empty tensors), + while `conv_states` represents the convolution state and has a shape of `(batch_size, d_inner, d_conv)`, + and `ssm_states` represents the ssm state and has a shape of `(batch_size, d_inner, d_state)`. + """ + + def __init__(self, config: BambaConfig, batch_size, dtype=torch.float16, device=None): + super().__init__(config, batch_size, dtype, device) + self.layers_block_type = config.layers_block_type + self.has_previous_state = False # only used by mamba + conv_kernel_size = config.mamba_d_conv + ssm_state_size = config.mamba_d_state + + self.conv_states = [] + self.ssm_states = [] + self.transformer_layers = [] + for i in range(config.num_hidden_layers): + if self.layers_block_type[i] == "mamba": + self.conv_states += [ + torch.zeros( + batch_size, + (config.mamba_expand * config.hidden_size + 2 * config.mamba_n_groups * ssm_state_size), + conv_kernel_size, + device=device, + dtype=dtype, + ) + ] + self.ssm_states += [ + torch.zeros( + batch_size, + config.mamba_n_heads, + config.mamba_d_head, + ssm_state_size, + device=device, + dtype=dtype, + ) + ] + else: + self.conv_states += [torch.tensor([[]] * batch_size, device=device)] + self.ssm_states += [torch.tensor([[]] * batch_size, device=device)] + self.transformer_layers.append(i) + + self.key_cache = [torch.tensor([[]] * batch_size, device=device) for _ in range(config.num_hidden_layers)] + self.value_cache = [torch.tensor([[]] * batch_size, device=device) for _ in range(config.num_hidden_layers)] + + +class BambaRotaryEmbedding(nn.Module): + def __init__(self, config: BambaConfig, device=None): + super().__init__() + # BC: "rope_type" was originally "type" + if hasattr(config, "rope_scaling") and config.rope_scaling is not None: + self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type")) + else: + self.rope_type = "default" + self.max_seq_len_cached = config.max_position_embeddings + self.original_max_seq_len = config.max_position_embeddings + + self.config = config + self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type] + + inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device) + self.register_buffer("inv_freq", inv_freq, persistent=False) + self.original_inv_freq = self.inv_freq + + @torch.no_grad() + @dynamic_rope_update # power user: used with advanced RoPE types (e.g. dynamic rope) + def forward(self, x, position_ids): + inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device) + position_ids_expanded = position_ids[:, None, :].float() + + device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu" + with torch.autocast(device_type=device_type, enabled=False): # Force float32 + freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2) + emb = torch.cat((freqs, freqs), dim=-1) + cos = emb.cos() * self.attention_scaling + sin = emb.sin() * self.attention_scaling + + return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype) + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., : x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2 :] + return torch.cat((-x2, x1), dim=-1) + + +def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: + """ + This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch, + num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim) + """ + batch, num_key_value_heads, slen, head_dim = hidden_states.shape + if n_rep == 1: + return hidden_states + hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim) + return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) + + +def eager_attention_forward( + module: nn.Module, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attention_mask: Optional[torch.Tensor], + scaling: float, + dropout: float = 0.0, + **kwargs, +): + key_states = repeat_kv(key, module.num_key_value_groups) + value_states = repeat_kv(value, module.num_key_value_groups) + + attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling + if attention_mask is not None: + causal_mask = attention_mask[:, :, :, : key_states.shape[-2]] + attn_weights = attn_weights + causal_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training) + attn_output = torch.matmul(attn_weights, value_states) + attn_output = attn_output.transpose(1, 2).contiguous() + + return attn_output, attn_weights + + +# Adapted from transformers.models.glm.modular_glm.apply_rotary_pos_emb +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Removes the interleaving of cos and sin from GLM + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + + # Keep half or full tensor for later concatenation + rotary_dim = cos.shape[-1] + q_rot, q_pass = q[..., :rotary_dim], q[..., rotary_dim:] + k_rot, k_pass = k[..., :rotary_dim], k[..., rotary_dim:] + + # Apply rotary embeddings on the first half or full tensor + q_embed = (q_rot * cos) + (rotate_half(q_rot) * sin) + k_embed = (k_rot * cos) + (rotate_half(k_rot) * sin) + + # Concatenate back to full shape + q_embed = torch.cat([q_embed, q_pass], dim=-1) + k_embed = torch.cat([k_embed, k_pass], dim=-1) + return q_embed, k_embed + + +class BambaAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config: BambaConfig, layer_idx: int): + super().__init__() + self.config = config + self.layer_idx = layer_idx + self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads) + self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads + self.scaling = self.head_dim**-0.5 + self.attention_dropout = config.attention_dropout + self.is_causal = True + + self.q_proj = nn.Linear( + config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias + ) + self.k_proj = nn.Linear( + config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias + ) + self.v_proj = nn.Linear( + config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias + ) + self.o_proj = nn.Linear( + config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias + ) + + def forward( + self, + hidden_states: torch.Tensor, + position_embeddings: Tuple[torch.Tensor, torch.Tensor], + attention_mask: Optional[torch.Tensor], + past_key_value: Optional[Cache] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs: Unpack[FlashAttentionKwargs], + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + input_shape = hidden_states.shape[:-1] + hidden_shape = (*input_shape, -1, self.head_dim) + + query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2) + key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2) + value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) + + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin) + + if past_key_value is not None: + # sin and cos are specific to RoPE models; cache_position needed for the static cache + cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position} + key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs) + + attention_interface: Callable = eager_attention_forward + + if self.config._attn_implementation != "eager": + if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False): + logger.warning_once( + "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to " + 'eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' + ) + else: + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + attn_output, attn_weights = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask, + dropout=0.0 if not self.training else self.attention_dropout, + scaling=self.scaling, + **kwargs, + ) + + attn_output = attn_output.reshape(*input_shape, -1).contiguous() + attn_output = self.o_proj(attn_output) + return attn_output, attn_weights + + +class BambaRMSNormGated(torch.nn.Module): + def __init__(self, hidden_size, eps=1e-6): + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states, gate=None): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + + if gate is not None: + hidden_states = hidden_states * nn.functional.silu(gate.to(torch.float32)) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + + return self.weight * hidden_states.to(input_dtype) + + +# Helper methods for segment sum computation + + +def pad_tensor_by_size(input_tensor: torch.Tensor, pad_size: int): + """ + Padding x tensor with `pad_size` on the seq_len dim (dim=1) + + Assumes that we only have tensors of either size 4 or 3 + """ + pad_shape = (0, 0, 0, 0, 0, pad_size, 0, 0) if len(input_tensor.shape) == 4 else (0, 0, 0, pad_size, 0, 0) + + return torch.nn.functional.pad(input_tensor, pad_shape, mode="constant", value=0) + + +def reshape_into_chunks(input_tensor, pad_size, chunk_size): + """ + Padding input_tensor with `pad_size` on the seq_len dim (dim=1) and + simultaneously splitting it into chunk sequences. + + Assumes that we only have tensors of either size 4 or 3 + """ + # [bsz, seq_len, ...] -> [bsz, seq_len multiple of chunk_size, ...] + input_tensor = pad_tensor_by_size(input_tensor, pad_size) + + if len(input_tensor.shape) == 3: + # [bsz, seq_len multiple of chunk_size, num_heads] -> [bsz, -1, chunk_size, num_heads] + return input_tensor.reshape(input_tensor.shape[0], -1, chunk_size, input_tensor.shape[2]) + else: + # [bsz, seq_len multiple of chunk_size, num_heads, head_dim or state_size] -> [bsz, -1, chunk_size, num_heads, head_dim or state_size] + return input_tensor.reshape( + input_tensor.shape[0], -1, chunk_size, input_tensor.shape[2], input_tensor.shape[3] + ) + + +def segment_sum(input_tensor): + """ + More stable segment sum calculation. Uses cumulative sums and masking instead of direct subtractions. + """ + chunk_size = input_tensor.size(-1) + # 1. expand input tensor to have an additional dimension and repeat along that dimension + # [..., chunk_size] -> [..., chunk_size, chunk_size] + input_tensor = input_tensor[..., None].expand(*input_tensor.size(), chunk_size) + # 2. create a lower triangular mask with the diagonal set to 0 to 0 out elements above diag + mask = torch.tril(torch.ones(chunk_size, chunk_size, device=input_tensor.device, dtype=torch.bool), diagonal=-1) + input_tensor = input_tensor.masked_fill(~mask, 0) + # 3. compute actual cumsum + tensor_segsum = torch.cumsum(input_tensor, dim=-2) + + # 4. apply mask to keep only the lower triangular part of the cumulative sum result (incl diagonal this time) + mask = torch.tril(torch.ones(chunk_size, chunk_size, device=input_tensor.device, dtype=torch.bool), diagonal=0) + tensor_segsum = tensor_segsum.masked_fill(~mask, -torch.inf) + return tensor_segsum + + +is_fast_path_available = all((selective_state_update, causal_conv1d_fn, causal_conv1d_update)) + + +def apply_mask_to_padding_states(hidden_states, attention_mask): + """ + Tunes out the hidden states for padding tokens, see https://github.com/state-spaces/mamba/issues/66 + """ + if attention_mask is not None and attention_mask.shape[1] > 1 and attention_mask.shape[0] > 1: + dtype = hidden_states.dtype + hidden_states = (hidden_states * attention_mask[:, :, None]).to(dtype) + + return hidden_states + + +# Adapted from transformers.models.mamba2.modeling_mamba2.Mamba2Mixer +class BambaMixer(nn.Module): + """ + Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`. + A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective) + ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4, + and is why Mamba is called **selective** state spaces) + + The are a few differences between this and Mamba2Mixer: + - The variable use_precomputed_states is slightly different due to the HybridCache structure + - There's a few non-obvious bugs fixed with batching in the slow path that exist in main + - Some extra variables that our layer doesn't need have been removed + - We ported most of the refactors in https://github.com/huggingface/transformers/pull/35154, which is (as of Dec 18, 2024) unmerged + """ + + def __init__(self, config: BambaConfig, layer_idx: int): + super().__init__() + self.num_heads = config.mamba_n_heads + self.hidden_size = config.hidden_size + self.ssm_state_size = config.mamba_d_state + self.conv_kernel_size = config.mamba_d_conv + self.intermediate_size = int(config.mamba_expand * self.hidden_size) + self.layer_idx = layer_idx + self.use_conv_bias = config.mamba_conv_bias + self.activation = config.hidden_act + self.act = ACT2FN[config.hidden_act] + self.use_bias = config.mamba_proj_bias + + self.layer_norm_epsilon = config.rms_norm_eps + + self.n_groups = config.mamba_n_groups + self.head_dim = config.mamba_d_head + self.chunk_size = config.mamba_chunk_size + + # FIXME: + self.time_step_limit = (0.0, float("inf")) + self.time_step_min = 0.001 + self.time_step_max = 0.1 + + self.conv_dim = self.intermediate_size + 2 * self.n_groups * self.ssm_state_size + self.conv1d = nn.Conv1d( + in_channels=self.conv_dim, + out_channels=self.conv_dim, + bias=config.mamba_conv_bias, + kernel_size=self.conv_kernel_size, + groups=self.conv_dim, + padding=self.conv_kernel_size - 1, + ) + + # projection of the input hidden states + projection_size = self.intermediate_size + self.conv_dim + self.num_heads + self.in_proj = nn.Linear( + self.hidden_size, + projection_size, + bias=self.use_bias, + ) + # selective projection used to make dt, B and C input dependant + + # time step projection (discretization) + # instantiate once and copy inv_dt in init_weights of PretrainedModel + self.dt_bias = nn.Parameter(torch.ones(self.num_heads)) + + # S4D real initialization. These are not discretized! + # The core is to load them, compute the discrete states, then write the updated state. Keeps the memory bounded + A = torch.arange(1, self.num_heads + 1) + self.A_log = nn.Parameter(torch.log(A)) + self.A_log._no_weight_decay = True + self.norm = BambaRMSNormGated(self.intermediate_size, eps=self.layer_norm_epsilon) + self.D = nn.Parameter(torch.ones(self.num_heads)) + self.D._no_weight_decay = True + + self.out_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=self.use_bias) + + if not is_fast_path_available: + logger.warning_once( + "The fast path is not available because on of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)`" + " is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and" + " https://github.com/Dao-AILab/causal-conv1d" + ) + else: + logger.warning_once("The fast path for Bamba will be used when running the model on a GPU") + + def cuda_kernels_forward( + self, + hidden_states: torch.Tensor, + cache_params: Optional[HybridMambaAttentionDynamicCache] = None, + cache_position: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + ): + # 1. Gated MLP's linear projection + hidden_states = apply_mask_to_padding_states(hidden_states, attention_mask) + projected_states = self.in_proj(hidden_states) + + # Set up dimensions for reshapes later + batch_size, seq_len, _ = hidden_states.shape + groups_time_state_size = self.n_groups * self.ssm_state_size + + use_precomputed_states = ( + cache_params is not None + and cache_params.has_previous_state + and seq_len == 1 + and cache_params.conv_states[self.layer_idx].shape[0] + == cache_params.ssm_states[self.layer_idx].shape[0] + == batch_size + and cache_position is not None + and cache_position[0] > 0 + ) + + # getting projected states from cache if it exists + if use_precomputed_states: + gate, hidden_states_B_C, dt = projected_states.squeeze(1).split( + [self.intermediate_size, self.conv_dim, self.num_heads], dim=-1 + ) + + # 2. Convolution sequence transformation + hidden_states_B_C = causal_conv1d_update( + hidden_states_B_C, + cache_params.conv_states[self.layer_idx], + self.conv1d.weight.squeeze(1), + self.conv1d.bias, + self.activation, + ) + + hidden_states, B, C = torch.split( + hidden_states_B_C, + [self.intermediate_size, groups_time_state_size, groups_time_state_size], + dim=-1, + ) + + # 3. SSM transformation + A = -torch.exp(self.A_log.float()) # (nheads,) + A = A[:, None, ...][:, :, None].expand(-1, self.head_dim, self.ssm_state_size).to(dtype=torch.float32) + dt = dt[:, :, None].expand(-1, -1, self.head_dim) + dt_bias = self.dt_bias[:, None, ...].expand(-1, self.head_dim) + D = self.D[:, None, ...].expand(-1, self.head_dim) + B = B.view(batch_size, self.n_groups, B.shape[1] // self.n_groups) + C = C.view(batch_size, self.n_groups, C.shape[1] // self.n_groups) + hidden_states_reshaped = hidden_states.view(batch_size, self.num_heads, self.head_dim) + hidden_states = selective_state_update( + cache_params.ssm_states[self.layer_idx], + hidden_states_reshaped, + dt, + A, + B, + C, + D, + z=None, + dt_bias=dt_bias, + dt_softplus=True, + ) + hidden_states = hidden_states.view(batch_size, self.num_heads * self.head_dim) + hidden_states = self.norm(hidden_states, gate) + + # 4. Final linear projection + out = self.out_proj(hidden_states)[:, None, ...] + # Fused calculations or step by step if no initialized cache is found + else: + A = -torch.exp(self.A_log.float()) # (num_heads) or (intermediate_size, state_size) + dt_limit_kwargs = {} if self.time_step_limit == (0.0, float("inf")) else {"dt_limit": self.time_step_limit} + + # 2-4. Fused kernel for conv1d, SSM, and the final projection + if self.training and cache_params is None: + out = mamba_split_conv1d_scan_combined( + projected_states, + self.conv1d.weight.squeeze(1), + self.conv1d.bias, + self.dt_bias, + A, + D=self.D, + chunk_size=self.chunk_size, + seq_idx=None, # was seq_idx + activation=self.activation, + rmsnorm_weight=self.norm.weight, + rmsnorm_eps=self.norm.variance_epsilon, + outproj_weight=self.out_proj.weight, + outproj_bias=self.out_proj.bias, + headdim=self.head_dim, + ngroups=self.n_groups, + norm_before_gate=False, + return_final_states=False, + **dt_limit_kwargs, + ) + + else: + gate, hidden_states_B_C, dt = projected_states.split( + [self.intermediate_size, self.conv_dim, self.num_heads], dim=-1 + ) + + # 2. Convolution sequence transformation + # Init cache + if cache_params is not None: + # storing the states + # If we just take xBC[:, :, -self.d_conv :], it will error if seqlen < self.d_conv + # Instead F.pad will pad with zeros if seqlen < self.d_conv, and truncate otherwise. + hidden_states_B_C_transposed = hidden_states_B_C.transpose(1, 2) + conv_states = nn.functional.pad( + hidden_states_B_C_transposed, + (self.conv_kernel_size - hidden_states_B_C_transposed.shape[-1], 0), + ) + cache_params.conv_states[self.layer_idx].copy_(conv_states) + + if self.activation not in ["silu", "swish"]: + hidden_states_B_C = self.act( + self.conv1d(hidden_states_B_C.transpose(1, 2))[..., :seq_len].transpose(1, 2) + ) + else: + hidden_states_B_C = causal_conv1d_fn( + x=hidden_states_B_C.transpose(1, 2), + weight=self.conv1d.weight.squeeze(1), + bias=self.conv1d.bias, + activation=self.activation, + ).transpose(1, 2) + + hidden_states_B_C = apply_mask_to_padding_states(hidden_states_B_C, attention_mask) + hidden_states, B, C = torch.split( + hidden_states_B_C, + [self.intermediate_size, groups_time_state_size, groups_time_state_size], + dim=-1, + ) + + # 3. SSM transformation + scan_output, ssm_state = mamba_chunk_scan_combined( + hidden_states.view(batch_size, seq_len, -1, self.head_dim), + dt, + A, + B.view(batch_size, seq_len, self.n_groups, -1), + C.view(batch_size, seq_len, self.n_groups, -1), + chunk_size=self.chunk_size, + D=self.D, + z=None, + seq_idx=None, + return_final_states=True, + dt_bias=self.dt_bias, + dt_softplus=True, + **dt_limit_kwargs, + ) + + # Init cache + if ssm_state is not None and cache_params is not None: + cache_params.ssm_states[self.layer_idx].copy_(ssm_state) + + scan_output = scan_output.view(batch_size, seq_len, -1) + # Multiply "gate" branch and apply extra normalization layer + scan_output = self.norm(scan_output, gate) + + # 4. Final linear projection + out = self.out_proj(scan_output) + return out + + # fmt: off + def torch_forward( + self, + input_states, + cache_params: Optional[HybridMambaAttentionDynamicCache] = None, + cache_position: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + ): + batch_size, seq_len, _ = input_states.shape + dtype = input_states.dtype + + # 1. Gated MLP's linear projection + input_states = apply_mask_to_padding_states(input_states, attention_mask) + projected_states = self.in_proj(input_states) + gate, hidden_states_B_C, dt = projected_states.split( + [self.intermediate_size, self.conv_dim, self.num_heads], dim=-1 + ) + + use_precomputed_states = ( + cache_params is not None + and cache_params.has_previous_state + and seq_len == 1 + and cache_params.conv_states[self.layer_idx].shape[0] + == cache_params.ssm_states[self.layer_idx].shape[0] + == batch_size + and cache_position is not None + and cache_position[0] > 0 + ) + + # 2. Convolution sequence transformation + if use_precomputed_states: + cache_params.conv_states[self.layer_idx] = cache_params.conv_states[self.layer_idx].roll(shifts=-1, dims=-1) + cache_params.conv_states[self.layer_idx][:, :, -1] = hidden_states_B_C[:, 0, :].to(cache_params.conv_states[self.layer_idx].device) + + # We need to guarantee that anything regarding the cache is on the same device + conv_states = cache_params.conv_states[self.layer_idx].to(device=self.conv1d.weight.device) + + hidden_states_B_C = torch.sum( + conv_states * self.conv1d.weight.squeeze(1), dim=-1 + ) + if self.use_conv_bias: + hidden_states_B_C = hidden_states_B_C + self.conv1d.bias + hidden_states_B_C = self.act(hidden_states_B_C) + else: + # Init cache + if cache_params is not None: + hidden_states_B_C_transposed = hidden_states_B_C.transpose(1, 2) + conv_states = nn.functional.pad( + hidden_states_B_C_transposed, (self.conv_kernel_size - hidden_states_B_C_transposed.shape[-1], 0) + ) + cache_params.conv_states[self.layer_idx].copy_(conv_states) + + hidden_states_B_C = self.act(self.conv1d(hidden_states_B_C.transpose(1, 2))[..., :seq_len].transpose(1, 2)) + + hidden_states_B_C = apply_mask_to_padding_states(hidden_states_B_C, attention_mask) + hidden_states, B, C = torch.split( + hidden_states_B_C, + [self.intermediate_size, self.n_groups * self.ssm_state_size, self.n_groups * self.ssm_state_size], + dim=-1 + ) + + # 3. SSM transformation + A = -torch.exp(self.A_log.float()) # [num_heads] + if use_precomputed_states: + # We need to guarantee that anything regarding the cache is on the same device + cache_device = cache_params.ssm_states[self.layer_idx].device + + # Note: there is no need to pad parameter matrices here, as there is just one new token + # for batched generation + dt = dt[:, 0, :][:, None, ...] + dt = dt.transpose(1, 2).expand(batch_size, dt.shape[-1], self.head_dim) + # [num_heads] -> [num_heads, head_dim] + dt_bias = self.dt_bias[..., None].expand(self.dt_bias.shape[0], self.head_dim) + + dt = torch.nn.functional.softplus(dt + dt_bias.to(dt.dtype)) + dt = torch.clamp(dt, self.time_step_limit[0], self.time_step_limit[1]) + A = A[..., None, None].expand(self.num_heads, self.head_dim, self.ssm_state_size).to(dtype=torch.float32) + # [bsz, num_heads, head_dim, state_size] + dA = (torch.exp(dt[..., None] * A)).to(device=cache_device) + + # Discretize B + # [bsz, n_groups * state_size] -> [bsz, n_groups, 1, state_size] -> + # -> [bsz, n_groups, group to head repetition factor, state_size] -> [bsz, num_heads, state_size] + B = B.reshape(batch_size, self.n_groups, -1)[..., None, :] + B = B.expand(batch_size, self.n_groups, self.num_heads // self.n_groups, B.shape[-1]).contiguous() + B = B.reshape(batch_size, -1, B.shape[-1]) + # [bsz, num_heads, head_dim, state_size] + dB = dt[..., None] * B[..., None, :] + + # Discretize x into dB + # [bsz, intermediate_size] -> [bsz, num_heads, head_dim] + hidden_states = hidden_states.reshape(batch_size, -1, self.head_dim) + dBx = (dB * hidden_states[..., None]).to(device=cache_device) + + # State calculation + cache_params.ssm_states[self.layer_idx].copy_( + cache_params.ssm_states[self.layer_idx] * dA + dBx + ) + + # Subsequent output + # [bsz, n_groups * state_size] -> [bsz, num_heads, state_size] + C = C.reshape(batch_size, self.n_groups, -1)[..., None, :] + C = C.expand(batch_size, self.n_groups, self.num_heads // self.n_groups, C.shape[-1]).contiguous() + C = C.reshape(batch_size, -1, C.shape[-1]) + # [bsz, num_heads, head_dim] + + ssm_states = cache_params.ssm_states[self.layer_idx].to(device=C.device, dtype=C.dtype) # Shape: [b, h, d, n] + # Reshape ssm_states to merge the first two dimensions + ssm_states_reshaped = ssm_states.view(batch_size * self.num_heads, self.head_dim, self.ssm_state_size) # Shape: [b*h, d, n] + C_reshaped = C.view(batch_size * self.num_heads, self.ssm_state_size, 1) # Shape: [b*h, n, 1] + y = torch.bmm(ssm_states_reshaped, C_reshaped) + y = y.view(batch_size, self.num_heads, self.head_dim) + + # D skip connection + # [num_heads] -> [num_heads, head_dim] + D = self.D[..., None].expand(self.D.shape[0], self.head_dim) + y = (y + hidden_states * D).to(y.dtype) + + # [bsz, num_heads, head_dim] -> [bsz, 1, intermediate_size] + y = y.reshape(batch_size, -1)[:, None, ...] + else: + # begin ssd naive implementation without einsums + dt = nn.functional.softplus(dt + self.dt_bias) + dt = torch.clamp(dt, self.time_step_limit[0], self.time_step_limit[1]) + hidden_states = hidden_states.reshape(batch_size, seq_len, -1, self.head_dim).float() + B = B.reshape(batch_size, seq_len, -1, self.ssm_state_size).float() + C = C.reshape(batch_size, seq_len, -1, self.ssm_state_size).float() + B = B.repeat_interleave(self.num_heads // self.n_groups, dim=2, output_size=self.num_heads) + C = C.repeat_interleave(self.num_heads // self.n_groups, dim=2, output_size=self.num_heads) + pad_size = (self.chunk_size - seq_len % self.chunk_size) % self.chunk_size + + D_residual = self.D[..., None] * pad_tensor_by_size(hidden_states, pad_size) + + # Discretize x and A + hidden_states = hidden_states * dt[..., None] + A = A.to(hidden_states.dtype) * dt + + # Rearrange into blocks/chunks + hidden_states, A, B, C = [reshape_into_chunks(t, pad_size, self.chunk_size) for t in (hidden_states, A, B, C)] + + # [bsz, -1, chunk_size, num_heads] -> [bsz, num_heads, -1, chunk_size] + A = A.permute(0, 3, 1, 2) + A_cumsum = torch.cumsum(A, dim=-1) + + # 1. Compute the output for each intra-chunk (diagonal blocks) + # This is the analog of a causal mask + L = torch.exp(segment_sum(A)) + + # Contraction of C and B to get G (attention-weights like) + G_intermediate = C[:, :, :, None, :, :] * B[:, :, None, :, :, :] # shape: (b, c, l, s, h, n) + G = G_intermediate.sum(dim=-1) # shape: (b, c, l, s, h) + + # Compute M, equivalent to applying attention mask to weights + M_intermediate = G[..., None] * L.permute(0, 2, 3, 4, 1)[..., None] + M = M_intermediate.sum(dim=-1) + + # Compute Y_diag (apply to values) + Y_diag = (M[..., None] * hidden_states[:, :, None]).sum(dim=3) + + # 2. Compute the state for each intra-chunk + # (right term of low-rank factorization of off-diagonal blocks; B terms) + decay_states = torch.exp((A_cumsum[:, :, :, -1:] - A_cumsum)) + B_decay = B * decay_states.permute(0, -2, -1, 1)[..., None] + states = (B_decay[..., None, :] * hidden_states[..., None]).sum(dim=2) + + # 3. Compute the inter-chunk SSM recurrence; produces correct SSM states at chunk boundaries + # (middle term of factorization of off-diag blocks; A terms) + if use_precomputed_states: + previous_states = cache_params.ssm_states[self.layer_idx][:, None, ...].to(device=states.device) + else: + previous_states = torch.zeros_like(states[:, :1]) + states = torch.cat([previous_states, states], dim=1) + decay_chunk = torch.exp(segment_sum(nn.functional.pad(A_cumsum[:, :, :, -1], (1, 0)))) + decay_chunk = decay_chunk.transpose(1, 3) + new_states = (decay_chunk[..., None, None] * states[:, :, None, ...]).sum(dim=1) + states, ssm_state = new_states[:, :-1], new_states[:, -1] + + # 4. Compute state -> output conversion per chunk + # (left term of low-rank factorization of off-diagonal blocks; C terms) + state_decay_out = torch.exp(A_cumsum) + C_times_states = (C[..., None, :] * states[:, :, None, ...]) + state_decay_out_permuted = state_decay_out.permute(0, 2, 3, 1) + Y_off = (C_times_states.sum(-1) * state_decay_out_permuted[..., None]) + + # Add output of intra-chunk and inter-chunk terms (diagonal and off-diagonal blocks) + y = Y_diag + Y_off + # [bsz, -1, self.chunk_size, num_heads, head_dim] -> [bsz, (padded) seq_len, num_heads, head_dim] + y = y.reshape(batch_size, -1, self.num_heads, self.head_dim) + + y = y + D_residual + # Cutting off padded chunks + if pad_size > 0: + y = y[:, :seq_len, :, :] + y = y.reshape(batch_size, seq_len, -1) + + # Init cache + if ssm_state is not None and cache_params is not None: + cache_params.ssm_states[self.layer_idx].copy_(ssm_state) + + scan_output = self.norm(y, gate) + + # end ssd naive + + # 4. Final linear projection + contextualized_states = self.out_proj(scan_output.to(dtype)) # [batch, seq_len, hidden_size] + return contextualized_states + # fmt: on + + def forward( + self, + hidden_states, + cache_params: Optional[HybridMambaAttentionDynamicCache] = None, + cache_position: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + ): + if is_fast_path_available and "cuda" in self.in_proj.weight.device.type: + return self.cuda_kernels_forward(hidden_states, cache_params, cache_position, attention_mask) + dtype = hidden_states.dtype + if attention_mask is not None and attention_mask.shape[1] > 1 and attention_mask.shape[0] > 1: + # tune out hidden states for pad tokens, see https://github.com/state-spaces/mamba/issues/66 + hidden_states = (hidden_states * attention_mask[:, :, None]).to(dtype) + + return self.torch_forward(hidden_states, cache_params, cache_position, attention_mask) + + +class BambaMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias) + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + return down_proj + + +@use_kernel_forward_from_hub("RMSNorm") +class BambaRMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + """ + BambaRMSNorm is equivalent to T5LayerNorm + """ + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return self.weight * hidden_states.to(input_dtype) + + def extra_repr(self): + return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}" + + +class BambaDecoderLayer(nn.Module): + def __init__(self, config: BambaConfig, layer_idx: int, layer_type: str = "mamba"): + super().__init__() + + num_experts = 1 + ffn_layer_class = BambaMLP if num_experts == 1 else None + self.feed_forward = ffn_layer_class(config) + self.input_layernorm = BambaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.pre_ff_layernorm = BambaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.layer_type = layer_type + if layer_type == "mamba": + self.mamba = BambaMixer(config=config, layer_idx=layer_idx) + elif layer_type == "attention": + self.self_attn = BambaAttention(config, layer_idx) + else: + raise ValueError("Invalid layer_type") + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[HybridMambaAttentionDynamicCache] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + **kwargs, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + `(batch, sequence_length)` where padding elements are indicated by 0. + past_key_value (`HybridMambaAttentionDynamicCache`, *optional*): cached past key and value projection states + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. + position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*): + Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`, + with `head_dim` being the embedding dimension of each attention head. + kwargs (`dict`, *optional*): + Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code + into the model + """ + + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # this is a hybrid decoder layer + if self.layer_type == "mamba": + hidden_states = self.mamba( + hidden_states=hidden_states, + cache_params=past_key_value, + cache_position=cache_position, + attention_mask=attention_mask, + ) + self_attn_weights = None + elif self.layer_type == "attention": + hidden_states, self_attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + position_embeddings=position_embeddings, + **kwargs, + ) + + # residual connection after attention + hidden_states = residual + hidden_states + + # feed-forward + residual = hidden_states + hidden_states = self.pre_ff_layernorm(hidden_states) + hidden_states = self.feed_forward(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + return outputs + + +BAMBA_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`BambaConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +@add_start_docstrings( + "The bare BambaModel outputting raw hidden-states without any specific head on top.", + BAMBA_START_DOCSTRING, +) +class BambaPreTrainedModel(PreTrainedModel): + config_class = BambaConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["BambaDecoderLayer"] + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn_2 = True + _supports_sdpa = True + _supports_cache_class = True # Note: only supports HybridMambaAttentionDynamicCache + _is_stateful = True + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, (nn.Linear, nn.Conv1d)): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, (BambaRMSNormGated, BambaRMSNorm)): + module.weight.data.fill_(1.0) + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, BambaMixer): + module.dt_bias.data.fill_(1.0) + module.A_log.data = torch.log(torch.arange(1, module.num_heads + 1)) + module.D.data.fill_(1.0) + + +BAMBA_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`HybridMambaAttentionDynamicCache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + A HybridMambaAttentionDynamicCache object containing pre-computed hidden-states (keys and values in the + self-attention blocks and convolution and ssm states in the mamba blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + Key and value cache tensors have shape `(batch_size, num_heads, seq_len, head_dim)`. + Convolution and ssm states tensors have shape `(batch_size, d_inner, d_conv)` and + `(batch_size, d_inner, d_state)` respectively. + See the `HybridMambaAttentionDynamicCache` class for more details. + + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `input_ids` of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + output_router_logits (`bool`, *optional*): + Whether or not to return the logits of all the routers. They are useful for computing the router loss, and + should not be returned during inference. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`, + this tensor is not affected by padding. It is used to update the cache in the correct position and to infer + the complete sequence length. +""" + + +@add_start_docstrings( + "The bare Bamba Model outputting raw hidden-states without any specific head on top.", + BAMBA_START_DOCSTRING, +) +# Adapted from transformers.models.jamba.modeling_jamba.JambaModel +class BambaModel(BambaPreTrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`BambaDecoderLayer`] + + Args: + config: BambaConfig + """ + + def __init__(self, config: BambaConfig): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + decoder_layers = [] + for i in range(config.num_hidden_layers): + decoder_layers.append(BambaDecoderLayer(config, layer_idx=i, layer_type=config.layers_block_type[i])) + self.layers = nn.ModuleList(decoder_layers) + + self._attn_implementation = config._attn_implementation + self.final_layernorm = BambaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.rotary_emb = BambaRotaryEmbedding(config=config) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + @can_return_tuple + @add_start_docstrings_to_model_forward(BAMBA_INPUTS_DOCSTRING) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[HybridMambaAttentionDynamicCache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, # NOOP kwargs, for now + ) -> BaseModelOutputWithPast: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if self.gradient_checkpointing and self.training and use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`." + ) + use_cache = False + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + hidden_states = inputs_embeds + + if use_cache and past_key_values is None: + logger.warning_once( + "Bamba requires an initialized `HybridMambaAttentionDynamicCache` to return a cache. None was " + "provided, so no cache will be returned." + ) + + if cache_position is None: + cache_position = torch.arange(hidden_states.shape[1], device=hidden_states.device) + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + causal_mask = self._update_causal_mask( + attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions + ) + mamba_mask = self._update_mamba_mask(attention_mask, cache_position) + + # create position embeddings to be shared across the decoder layers + position_embeddings = self.rotary_emb(hidden_states, position_ids) + + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + + for decoder_layer in self.layers: + # Depending on the layer type we opt for 2D base attention mask (Mamba) or 4D causal mask (Attention) + layer_mask = mamba_mask if decoder_layer.layer_type == "mamba" else causal_mask + + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + layer_mask, + position_ids, + past_key_values, + output_attentions, + use_cache, + cache_position, + position_embeddings, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=layer_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + position_embeddings=position_embeddings, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + if layer_outputs[1] is not None: + # append attentions only of attention layers. Mamba layers return `None` as the attention weights + all_self_attns += (layer_outputs[1],) + + hidden_states = self.final_layernorm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if past_key_values and not past_key_values.has_previous_state: + past_key_values.has_previous_state = True + + next_cache = None if not use_cache else past_key_values + + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + def _update_causal_mask( + self, + attention_mask: torch.Tensor, + input_tensor: torch.Tensor, + cache_position: torch.Tensor, + past_key_values: HybridMambaAttentionDynamicCache, + output_attentions: bool, + ): + if self.config._attn_implementation == "flash_attention_2": + if attention_mask is not None and 0.0 in attention_mask: + return attention_mask + return None + + # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in + # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail + # to infer the attention mask. + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + + # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward + if self.config._attn_implementation == "sdpa" and not output_attentions: + if AttentionMaskConverter._ignore_causal_mask_sdpa( + attention_mask, + inputs_embeds=input_tensor, + past_key_values_length=past_seen_tokens, + is_training=self.training, + ): + return None + + dtype, device = input_tensor.dtype, input_tensor.device + sequence_length = input_tensor.shape[1] + target_length = ( + attention_mask.shape[-1] + if isinstance(attention_mask, torch.Tensor) + else past_seen_tokens + sequence_length + 1 + ) + + # In case the provided `attention` mask is 2D, we generate a causal mask here (4D). + causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position( + attention_mask, + sequence_length=sequence_length, + target_length=target_length, + dtype=dtype, + device=device, + cache_position=cache_position, + batch_size=input_tensor.shape[0], + ) + + if ( + self.config._attn_implementation == "sdpa" + and attention_mask is not None + and attention_mask.device.type in ["cuda", "xpu", "npu"] + and not output_attentions + ): + # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when + # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. + # Details: https://github.com/pytorch/pytorch/issues/110213 + min_dtype = torch.finfo(dtype).min + causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) + + return causal_mask + + @staticmethod + def _prepare_4d_causal_attention_mask_with_cache_position( + attention_mask: torch.Tensor, + sequence_length: int, + target_length: int, + dtype: torch.dtype, + device: torch.device, + cache_position: torch.Tensor, + batch_size: int, + **kwargs, + ): + """ + Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape + `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing. + + Args: + attention_mask (`torch.Tensor`): + A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape + `(batch_size, 1, query_length, key_value_length)`. + sequence_length (`int`): + The sequence length being processed. + target_length (`int`): + The target length: when generating with static cache, the mask should be as long as the static cache, + to account for the 0 padding, the part of the cache that is not filled yet. + dtype (`torch.dtype`): + The dtype to use for the 4D attention mask. + device (`torch.device`): + The device to place the 4D attention mask on. + cache_position (`torch.Tensor`): + Indices depicting the position of the input sequence tokens in the sequence. + batch_size (`torch.Tensor`): + Batch size. + """ + if attention_mask is not None and attention_mask.dim() == 4: + # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. + causal_mask = attention_mask + else: + min_dtype = torch.finfo(dtype).min + causal_mask = torch.full( + (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device + ) + if sequence_length != 1: + causal_mask = torch.triu(causal_mask, diagonal=1) + causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + mask_length = attention_mask.shape[-1] + padding_attention_mask = (attention_mask[:, None, None, :] == attention_mask[:, None, :, None])[ + :, :, -sequence_length:, : + ].to(dtype) + padding_mask = causal_mask[:, :, :, :mask_length] + padding_attention_mask + padding_mask = padding_mask == 0 + causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + padding_mask, min_dtype + ) + + return causal_mask + + def _update_mamba_mask(self, attention_mask, cache_position): + """ + No need for zeroing states when + 1. Cached forward + 2. Attending to all inputs + """ + mamba_mask = attention_mask + if cache_position[0] > 0 or (attention_mask is not None and torch.all(attention_mask == 1)): + mamba_mask = None + return mamba_mask + + +class BambaForCausalLM(BambaPreTrainedModel, GenerationMixin): + _tied_weights_keys = ["lm_head.weight"] + _tp_plan = {"lm_head": "colwise_rep"} + _pp_plan = {"lm_head": (["hidden_states"], ["logits"])} + + def __init__(self, config): + super().__init__(config) + self.model = BambaModel(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + @can_return_tuple + @add_start_docstrings_to_model_forward(BAMBA_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[HybridMambaAttentionDynamicCache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + **kwargs, + ) -> CausalLMOutputWithPast: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + logits_to_keep (`int` or `torch.Tensor`, *optional*): + If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all + `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that + token can save memory, which becomes pretty significant for long sequences or large vocabulary size. + If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension. + This is useful when using packed tensor format (single dimension for batch and sequence length). + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, BambaForCausalLM + + >>> model = BambaForCausalLM.from_pretrained("...") + >>> tokenizer = AutoTokenizer.from_pretrained("...") + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs: BaseModelOutputWithPast = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = outputs.last_hidden_state + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + logits = self.lm_head(hidden_states[:, slice_indices, :]) + + loss = None + if labels is not None: + loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs) + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + attention_mask=None, + inputs_embeds=None, + cache_position=None, + position_ids=None, + use_cache=True, + **kwargs, + ): + # Overwitten -- has a unique cache type, `HybridMambaAttentionDynamicCache` + + empty_past_kv = past_key_values is None + + # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens + # Exception 1: when passing input_embeds, input_ids may be missing entries + # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here + # Exception 3: with synced GPUs cache_position may go out of bounds, but we only want dummy token in that case. + # (we can't check exception 3 while compiling) + if not empty_past_kv: + if ( + inputs_embeds is not None # Exception 1 + or cache_position[-1] >= input_ids.shape[1] # Exception 3 + ): + input_ids = input_ids[:, -cache_position.shape[0] :] + elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2) + input_ids = input_ids[:, cache_position] + else: + past_key_values = HybridMambaAttentionDynamicCache( + self.config, input_ids.shape[0], self.dtype, device=self.device + ) + + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if not empty_past_kv: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and empty_past_kv: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids.contiguous()} # `contiguous()` needed for compilation use cases + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": use_cache, + "attention_mask": attention_mask, + "logits_to_keep": self.config.num_logits_to_keep, + "cache_position": cache_position, + } + ) + return model_inputs + + +__all__ = ["BambaModel", "BambaForCausalLM", "BambaPreTrainedModel"] diff --git a/docs/transformers/build/lib/transformers/models/bamba/modular_bamba.py b/docs/transformers/build/lib/transformers/models/bamba/modular_bamba.py new file mode 100644 index 0000000000000000000000000000000000000000..3e0eb513d5cdec54aefef1fe50e0ecf4a86696b6 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bamba/modular_bamba.py @@ -0,0 +1,1312 @@ +# coding=utf-8 +# Copyright 2024 IBM and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch Bamba model.""" + +from typing import Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn + +import transformers.models.jamba.modeling_jamba as modeling_jamba +from transformers.activations import ACT2FN +from transformers.models.jamba.modeling_jamba import JambaAttentionDecoderLayer +from transformers.models.llama.modeling_llama import ( + LlamaAttention, + LlamaForCausalLM, + LlamaMLP, + LlamaRMSNorm, + LlamaRotaryEmbedding, + rotate_half, +) +from transformers.models.mamba2.modeling_mamba2 import ( + MambaRMSNormGated, + pad_tensor_by_size, + reshape_into_chunks, + segment_sum, +) + +from ...modeling_attn_mask_utils import ( + AttentionMaskConverter, +) +from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast +from ...modeling_utils import PreTrainedModel +from ...utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + can_return_tuple, + logging, + replace_return_docstrings, +) +from ...utils.import_utils import ( + is_causal_conv1d_available, + is_flash_attn_2_available, + is_mamba_2_ssm_available, +) +from .configuration_bamba import BambaConfig + + +if is_flash_attn_2_available(): + pass + +if is_mamba_2_ssm_available(): + from mamba_ssm.ops.triton.selective_state_update import selective_state_update + from mamba_ssm.ops.triton.ssd_combined import mamba_chunk_scan_combined, mamba_split_conv1d_scan_combined +else: + selective_state_update = None + +if is_causal_conv1d_available(): + from causal_conv1d import causal_conv1d_fn, causal_conv1d_update +else: + causal_conv1d_update, causal_conv1d_fn = None, None + +is_fast_path_available = all((selective_state_update, causal_conv1d_fn, causal_conv1d_update)) + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "BambaConfig" + + +# Adapted from transformers.models.jamba.modeling_jamba.HybridMambaAttentionDynamicCache for the v2 mixer +class HybridMambaAttentionDynamicCache(modeling_jamba.HybridMambaAttentionDynamicCache): + """ + A dynamic cache that can handle both the attention cache (which has a seq_len dimension) and the mamba cache + (which has a constant shape regardless of seq_len). + + This cache has two sets of lists of tensors: `key_cache` and `value_cache` for attention cache and `conv_states` + and `ssm_states` for mamba cache. Each of these lists has `num_layers` tensors. The expected shape for each tensor + For attention layers, `key_cache` and `value_cache` have a shape of `(batch_size, num_heads, seq_len, head_dim)`, + while `conv_states` and `ssm_states` have a shape of `(batch_size, 0)` (empty tensors). + For mamba layers, `key_cache` and `value_cache` have a shape of `(batch_size, 0)` (empty tensors), + while `conv_states` represents the convolution state and has a shape of `(batch_size, d_inner, d_conv)`, + and `ssm_states` represents the ssm state and has a shape of `(batch_size, d_inner, d_state)`. + """ + + def __init__(self, config: BambaConfig, batch_size, dtype=torch.float16, device=None): + super().__init__(config, batch_size, dtype, device) + self.layers_block_type = config.layers_block_type + self.has_previous_state = False # only used by mamba + conv_kernel_size = config.mamba_d_conv + ssm_state_size = config.mamba_d_state + + self.conv_states = [] + self.ssm_states = [] + self.transformer_layers = [] + for i in range(config.num_hidden_layers): + if self.layers_block_type[i] == "mamba": + self.conv_states += [ + torch.zeros( + batch_size, + (config.mamba_expand * config.hidden_size + 2 * config.mamba_n_groups * ssm_state_size), + conv_kernel_size, + device=device, + dtype=dtype, + ) + ] + self.ssm_states += [ + torch.zeros( + batch_size, + config.mamba_n_heads, + config.mamba_d_head, + ssm_state_size, + device=device, + dtype=dtype, + ) + ] + else: + self.conv_states += [torch.tensor([[]] * batch_size, device=device)] + self.ssm_states += [torch.tensor([[]] * batch_size, device=device)] + self.transformer_layers.append(i) + + self.key_cache = [torch.tensor([[]] * batch_size, device=device) for _ in range(config.num_hidden_layers)] + self.value_cache = [torch.tensor([[]] * batch_size, device=device) for _ in range(config.num_hidden_layers)] + + +class BambaRotaryEmbedding(LlamaRotaryEmbedding): + pass + + +# Adapted from transformers.models.glm.modular_glm.apply_rotary_pos_emb +def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): + """Applies Rotary Position Embedding to the query and key tensors. + + Removes the interleaving of cos and sin from GLM + + Args: + q (`torch.Tensor`): The query tensor. + k (`torch.Tensor`): The key tensor. + cos (`torch.Tensor`): The cosine part of the rotary embedding. + sin (`torch.Tensor`): The sine part of the rotary embedding. + position_ids (`torch.Tensor`, *optional*): + Deprecated and unused. + unsqueeze_dim (`int`, *optional*, defaults to 1): + The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and + sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note + that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and + k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes + cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have + the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2. + Returns: + `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding. + """ + cos = cos.unsqueeze(unsqueeze_dim) + sin = sin.unsqueeze(unsqueeze_dim) + + # Keep half or full tensor for later concatenation + rotary_dim = cos.shape[-1] + q_rot, q_pass = q[..., :rotary_dim], q[..., rotary_dim:] + k_rot, k_pass = k[..., :rotary_dim], k[..., rotary_dim:] + + # Apply rotary embeddings on the first half or full tensor + q_embed = (q_rot * cos) + (rotate_half(q_rot) * sin) + k_embed = (k_rot * cos) + (rotate_half(k_rot) * sin) + + # Concatenate back to full shape + q_embed = torch.cat([q_embed, q_pass], dim=-1) + k_embed = torch.cat([k_embed, k_pass], dim=-1) + return q_embed, k_embed + + +class BambaAttention(LlamaAttention): + pass + + +class BambaRMSNormGated(MambaRMSNormGated): + pass + + +def apply_mask_to_padding_states(hidden_states, attention_mask): + """ + Tunes out the hidden states for padding tokens, see https://github.com/state-spaces/mamba/issues/66 + """ + if attention_mask is not None and attention_mask.shape[1] > 1 and attention_mask.shape[0] > 1: + dtype = hidden_states.dtype + hidden_states = (hidden_states * attention_mask[:, :, None]).to(dtype) + + return hidden_states + + +# Adapted from transformers.models.mamba2.modeling_mamba2.Mamba2Mixer +class BambaMixer(nn.Module): + """ + Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`. + A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective) + ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4, + and is why Mamba is called **selective** state spaces) + + The are a few differences between this and Mamba2Mixer: + - The variable use_precomputed_states is slightly different due to the HybridCache structure + - There's a few non-obvious bugs fixed with batching in the slow path that exist in main + - Some extra variables that our layer doesn't need have been removed + - We ported most of the refactors in https://github.com/huggingface/transformers/pull/35154, which is (as of Dec 18, 2024) unmerged + """ + + def __init__(self, config: BambaConfig, layer_idx: int): + super().__init__() + self.num_heads = config.mamba_n_heads + self.hidden_size = config.hidden_size + self.ssm_state_size = config.mamba_d_state + self.conv_kernel_size = config.mamba_d_conv + self.intermediate_size = int(config.mamba_expand * self.hidden_size) + self.layer_idx = layer_idx + self.use_conv_bias = config.mamba_conv_bias + self.activation = config.hidden_act + self.act = ACT2FN[config.hidden_act] + self.use_bias = config.mamba_proj_bias + + self.layer_norm_epsilon = config.rms_norm_eps + + self.n_groups = config.mamba_n_groups + self.head_dim = config.mamba_d_head + self.chunk_size = config.mamba_chunk_size + + # FIXME: + self.time_step_limit = (0.0, float("inf")) + self.time_step_min = 0.001 + self.time_step_max = 0.1 + + self.conv_dim = self.intermediate_size + 2 * self.n_groups * self.ssm_state_size + self.conv1d = nn.Conv1d( + in_channels=self.conv_dim, + out_channels=self.conv_dim, + bias=config.mamba_conv_bias, + kernel_size=self.conv_kernel_size, + groups=self.conv_dim, + padding=self.conv_kernel_size - 1, + ) + + # projection of the input hidden states + projection_size = self.intermediate_size + self.conv_dim + self.num_heads + self.in_proj = nn.Linear( + self.hidden_size, + projection_size, + bias=self.use_bias, + ) + # selective projection used to make dt, B and C input dependant + + # time step projection (discretization) + # instantiate once and copy inv_dt in init_weights of PretrainedModel + self.dt_bias = nn.Parameter(torch.ones(self.num_heads)) + + # S4D real initialization. These are not discretized! + # The core is to load them, compute the discrete states, then write the updated state. Keeps the memory bounded + A = torch.arange(1, self.num_heads + 1) + self.A_log = nn.Parameter(torch.log(A)) + self.A_log._no_weight_decay = True + self.norm = BambaRMSNormGated(self.intermediate_size, eps=self.layer_norm_epsilon) + self.D = nn.Parameter(torch.ones(self.num_heads)) + self.D._no_weight_decay = True + + self.out_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=self.use_bias) + + if not is_fast_path_available: + logger.warning_once( + "The fast path is not available because on of `(selective_state_update, causal_conv1d_fn, causal_conv1d_update)`" + " is None. Falling back to the naive implementation. To install follow https://github.com/state-spaces/mamba/#installation and" + " https://github.com/Dao-AILab/causal-conv1d" + ) + else: + logger.warning_once("The fast path for Bamba will be used when running the model on a GPU") + + def cuda_kernels_forward( + self, + hidden_states: torch.Tensor, + cache_params: Optional[HybridMambaAttentionDynamicCache] = None, + cache_position: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + ): + # 1. Gated MLP's linear projection + hidden_states = apply_mask_to_padding_states(hidden_states, attention_mask) + projected_states = self.in_proj(hidden_states) + + # Set up dimensions for reshapes later + batch_size, seq_len, _ = hidden_states.shape + groups_time_state_size = self.n_groups * self.ssm_state_size + + use_precomputed_states = ( + cache_params is not None + and cache_params.has_previous_state + and seq_len == 1 + and cache_params.conv_states[self.layer_idx].shape[0] + == cache_params.ssm_states[self.layer_idx].shape[0] + == batch_size + and cache_position is not None + and cache_position[0] > 0 + ) + + # getting projected states from cache if it exists + if use_precomputed_states: + gate, hidden_states_B_C, dt = projected_states.squeeze(1).split( + [self.intermediate_size, self.conv_dim, self.num_heads], dim=-1 + ) + + # 2. Convolution sequence transformation + hidden_states_B_C = causal_conv1d_update( + hidden_states_B_C, + cache_params.conv_states[self.layer_idx], + self.conv1d.weight.squeeze(1), + self.conv1d.bias, + self.activation, + ) + + hidden_states, B, C = torch.split( + hidden_states_B_C, + [self.intermediate_size, groups_time_state_size, groups_time_state_size], + dim=-1, + ) + + # 3. SSM transformation + A = -torch.exp(self.A_log.float()) # (nheads,) + A = A[:, None, ...][:, :, None].expand(-1, self.head_dim, self.ssm_state_size).to(dtype=torch.float32) + dt = dt[:, :, None].expand(-1, -1, self.head_dim) + dt_bias = self.dt_bias[:, None, ...].expand(-1, self.head_dim) + D = self.D[:, None, ...].expand(-1, self.head_dim) + B = B.view(batch_size, self.n_groups, B.shape[1] // self.n_groups) + C = C.view(batch_size, self.n_groups, C.shape[1] // self.n_groups) + hidden_states_reshaped = hidden_states.view(batch_size, self.num_heads, self.head_dim) + hidden_states = selective_state_update( + cache_params.ssm_states[self.layer_idx], + hidden_states_reshaped, + dt, + A, + B, + C, + D, + z=None, + dt_bias=dt_bias, + dt_softplus=True, + ) + hidden_states = hidden_states.view(batch_size, self.num_heads * self.head_dim) + hidden_states = self.norm(hidden_states, gate) + + # 4. Final linear projection + out = self.out_proj(hidden_states)[:, None, ...] + # Fused calculations or step by step if no initialized cache is found + else: + A = -torch.exp(self.A_log.float()) # (num_heads) or (intermediate_size, state_size) + dt_limit_kwargs = {} if self.time_step_limit == (0.0, float("inf")) else {"dt_limit": self.time_step_limit} + + # 2-4. Fused kernel for conv1d, SSM, and the final projection + if self.training and cache_params is None: + out = mamba_split_conv1d_scan_combined( + projected_states, + self.conv1d.weight.squeeze(1), + self.conv1d.bias, + self.dt_bias, + A, + D=self.D, + chunk_size=self.chunk_size, + seq_idx=None, # was seq_idx + activation=self.activation, + rmsnorm_weight=self.norm.weight, + rmsnorm_eps=self.norm.variance_epsilon, + outproj_weight=self.out_proj.weight, + outproj_bias=self.out_proj.bias, + headdim=self.head_dim, + ngroups=self.n_groups, + norm_before_gate=False, + return_final_states=False, + **dt_limit_kwargs, + ) + + else: + gate, hidden_states_B_C, dt = projected_states.split( + [self.intermediate_size, self.conv_dim, self.num_heads], dim=-1 + ) + + # 2. Convolution sequence transformation + # Init cache + if cache_params is not None: + # storing the states + # If we just take xBC[:, :, -self.d_conv :], it will error if seqlen < self.d_conv + # Instead F.pad will pad with zeros if seqlen < self.d_conv, and truncate otherwise. + hidden_states_B_C_transposed = hidden_states_B_C.transpose(1, 2) + conv_states = nn.functional.pad( + hidden_states_B_C_transposed, + (self.conv_kernel_size - hidden_states_B_C_transposed.shape[-1], 0), + ) + cache_params.conv_states[self.layer_idx].copy_(conv_states) + + if self.activation not in ["silu", "swish"]: + hidden_states_B_C = self.act( + self.conv1d(hidden_states_B_C.transpose(1, 2))[..., :seq_len].transpose(1, 2) + ) + else: + hidden_states_B_C = causal_conv1d_fn( + x=hidden_states_B_C.transpose(1, 2), + weight=self.conv1d.weight.squeeze(1), + bias=self.conv1d.bias, + activation=self.activation, + ).transpose(1, 2) + + hidden_states_B_C = apply_mask_to_padding_states(hidden_states_B_C, attention_mask) + hidden_states, B, C = torch.split( + hidden_states_B_C, + [self.intermediate_size, groups_time_state_size, groups_time_state_size], + dim=-1, + ) + + # 3. SSM transformation + scan_output, ssm_state = mamba_chunk_scan_combined( + hidden_states.view(batch_size, seq_len, -1, self.head_dim), + dt, + A, + B.view(batch_size, seq_len, self.n_groups, -1), + C.view(batch_size, seq_len, self.n_groups, -1), + chunk_size=self.chunk_size, + D=self.D, + z=None, + seq_idx=None, + return_final_states=True, + dt_bias=self.dt_bias, + dt_softplus=True, + **dt_limit_kwargs, + ) + + # Init cache + if ssm_state is not None and cache_params is not None: + cache_params.ssm_states[self.layer_idx].copy_(ssm_state) + + scan_output = scan_output.view(batch_size, seq_len, -1) + # Multiply "gate" branch and apply extra normalization layer + scan_output = self.norm(scan_output, gate) + + # 4. Final linear projection + out = self.out_proj(scan_output) + return out + + # fmt: off + def torch_forward( + self, + input_states, + cache_params: Optional[HybridMambaAttentionDynamicCache] = None, + cache_position: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + ): + batch_size, seq_len, _ = input_states.shape + dtype = input_states.dtype + + # 1. Gated MLP's linear projection + input_states = apply_mask_to_padding_states(input_states, attention_mask) + projected_states = self.in_proj(input_states) + gate, hidden_states_B_C, dt = projected_states.split( + [self.intermediate_size, self.conv_dim, self.num_heads], dim=-1 + ) + + use_precomputed_states = ( + cache_params is not None + and cache_params.has_previous_state + and seq_len == 1 + and cache_params.conv_states[self.layer_idx].shape[0] + == cache_params.ssm_states[self.layer_idx].shape[0] + == batch_size + and cache_position is not None + and cache_position[0] > 0 + ) + + # 2. Convolution sequence transformation + if use_precomputed_states: + cache_params.conv_states[self.layer_idx] = cache_params.conv_states[self.layer_idx].roll(shifts=-1, dims=-1) + cache_params.conv_states[self.layer_idx][:, :, -1] = hidden_states_B_C[:, 0, :].to(cache_params.conv_states[self.layer_idx].device) + + # We need to guarantee that anything regarding the cache is on the same device + conv_states = cache_params.conv_states[self.layer_idx].to(device=self.conv1d.weight.device) + + hidden_states_B_C = torch.sum( + conv_states * self.conv1d.weight.squeeze(1), dim=-1 + ) + if self.use_conv_bias: + hidden_states_B_C = hidden_states_B_C + self.conv1d.bias + hidden_states_B_C = self.act(hidden_states_B_C) + else: + # Init cache + if cache_params is not None: + hidden_states_B_C_transposed = hidden_states_B_C.transpose(1, 2) + conv_states = nn.functional.pad( + hidden_states_B_C_transposed, (self.conv_kernel_size - hidden_states_B_C_transposed.shape[-1], 0) + ) + cache_params.conv_states[self.layer_idx].copy_(conv_states) + + hidden_states_B_C = self.act(self.conv1d(hidden_states_B_C.transpose(1, 2))[..., :seq_len].transpose(1, 2)) + + hidden_states_B_C = apply_mask_to_padding_states(hidden_states_B_C, attention_mask) + hidden_states, B, C = torch.split( + hidden_states_B_C, + [self.intermediate_size, self.n_groups * self.ssm_state_size, self.n_groups * self.ssm_state_size], + dim=-1 + ) + + # 3. SSM transformation + A = -torch.exp(self.A_log.float()) # [num_heads] + if use_precomputed_states: + # We need to guarantee that anything regarding the cache is on the same device + cache_device = cache_params.ssm_states[self.layer_idx].device + + # Note: there is no need to pad parameter matrices here, as there is just one new token + # for batched generation + dt = dt[:, 0, :][:, None, ...] + dt = dt.transpose(1, 2).expand(batch_size, dt.shape[-1], self.head_dim) + # [num_heads] -> [num_heads, head_dim] + dt_bias = self.dt_bias[..., None].expand(self.dt_bias.shape[0], self.head_dim) + + dt = torch.nn.functional.softplus(dt + dt_bias.to(dt.dtype)) + dt = torch.clamp(dt, self.time_step_limit[0], self.time_step_limit[1]) + A = A[..., None, None].expand(self.num_heads, self.head_dim, self.ssm_state_size).to(dtype=torch.float32) + # [bsz, num_heads, head_dim, state_size] + dA = (torch.exp(dt[..., None] * A)).to(device=cache_device) + + # Discretize B + # [bsz, n_groups * state_size] -> [bsz, n_groups, 1, state_size] -> + # -> [bsz, n_groups, group to head repetition factor, state_size] -> [bsz, num_heads, state_size] + B = B.reshape(batch_size, self.n_groups, -1)[..., None, :] + B = B.expand(batch_size, self.n_groups, self.num_heads // self.n_groups, B.shape[-1]).contiguous() + B = B.reshape(batch_size, -1, B.shape[-1]) + # [bsz, num_heads, head_dim, state_size] + dB = dt[..., None] * B[..., None, :] + + # Discretize x into dB + # [bsz, intermediate_size] -> [bsz, num_heads, head_dim] + hidden_states = hidden_states.reshape(batch_size, -1, self.head_dim) + dBx = (dB * hidden_states[..., None]).to(device=cache_device) + + # State calculation + cache_params.ssm_states[self.layer_idx].copy_( + cache_params.ssm_states[self.layer_idx] * dA + dBx + ) + + # Subsequent output + # [bsz, n_groups * state_size] -> [bsz, num_heads, state_size] + C = C.reshape(batch_size, self.n_groups, -1)[..., None, :] + C = C.expand(batch_size, self.n_groups, self.num_heads // self.n_groups, C.shape[-1]).contiguous() + C = C.reshape(batch_size, -1, C.shape[-1]) + # [bsz, num_heads, head_dim] + + ssm_states = cache_params.ssm_states[self.layer_idx].to(device=C.device, dtype=C.dtype) # Shape: [b, h, d, n] + # Reshape ssm_states to merge the first two dimensions + ssm_states_reshaped = ssm_states.view(batch_size * self.num_heads, self.head_dim, self.ssm_state_size) # Shape: [b*h, d, n] + C_reshaped = C.view(batch_size * self.num_heads, self.ssm_state_size, 1) # Shape: [b*h, n, 1] + y = torch.bmm(ssm_states_reshaped, C_reshaped) + y = y.view(batch_size, self.num_heads, self.head_dim) + + # D skip connection + # [num_heads] -> [num_heads, head_dim] + D = self.D[..., None].expand(self.D.shape[0], self.head_dim) + y = (y + hidden_states * D).to(y.dtype) + + # [bsz, num_heads, head_dim] -> [bsz, 1, intermediate_size] + y = y.reshape(batch_size, -1)[:, None, ...] + else: + # begin ssd naive implementation without einsums + dt = nn.functional.softplus(dt + self.dt_bias) + dt = torch.clamp(dt, self.time_step_limit[0], self.time_step_limit[1]) + hidden_states = hidden_states.reshape(batch_size, seq_len, -1, self.head_dim).float() + B = B.reshape(batch_size, seq_len, -1, self.ssm_state_size).float() + C = C.reshape(batch_size, seq_len, -1, self.ssm_state_size).float() + B = B.repeat_interleave(self.num_heads // self.n_groups, dim=2, output_size=self.num_heads) + C = C.repeat_interleave(self.num_heads // self.n_groups, dim=2, output_size=self.num_heads) + pad_size = (self.chunk_size - seq_len % self.chunk_size) % self.chunk_size + + D_residual = self.D[..., None] * pad_tensor_by_size(hidden_states, pad_size) + + # Discretize x and A + hidden_states = hidden_states * dt[..., None] + A = A.to(hidden_states.dtype) * dt + + # Rearrange into blocks/chunks + hidden_states, A, B, C = [reshape_into_chunks(t, pad_size, self.chunk_size) for t in (hidden_states, A, B, C)] + + # [bsz, -1, chunk_size, num_heads] -> [bsz, num_heads, -1, chunk_size] + A = A.permute(0, 3, 1, 2) + A_cumsum = torch.cumsum(A, dim=-1) + + # 1. Compute the output for each intra-chunk (diagonal blocks) + # This is the analog of a causal mask + L = torch.exp(segment_sum(A)) + + # Contraction of C and B to get G (attention-weights like) + G_intermediate = C[:, :, :, None, :, :] * B[:, :, None, :, :, :] # shape: (b, c, l, s, h, n) + G = G_intermediate.sum(dim=-1) # shape: (b, c, l, s, h) + + # Compute M, equivalent to applying attention mask to weights + M_intermediate = G[..., None] * L.permute(0, 2, 3, 4, 1)[..., None] + M = M_intermediate.sum(dim=-1) + + # Compute Y_diag (apply to values) + Y_diag = (M[..., None] * hidden_states[:, :, None]).sum(dim=3) + + # 2. Compute the state for each intra-chunk + # (right term of low-rank factorization of off-diagonal blocks; B terms) + decay_states = torch.exp((A_cumsum[:, :, :, -1:] - A_cumsum)) + B_decay = B * decay_states.permute(0, -2, -1, 1)[..., None] + states = (B_decay[..., None, :] * hidden_states[..., None]).sum(dim=2) + + # 3. Compute the inter-chunk SSM recurrence; produces correct SSM states at chunk boundaries + # (middle term of factorization of off-diag blocks; A terms) + if use_precomputed_states: + previous_states = cache_params.ssm_states[self.layer_idx][:, None, ...].to(device=states.device) + else: + previous_states = torch.zeros_like(states[:, :1]) + states = torch.cat([previous_states, states], dim=1) + decay_chunk = torch.exp(segment_sum(nn.functional.pad(A_cumsum[:, :, :, -1], (1, 0)))) + decay_chunk = decay_chunk.transpose(1, 3) + new_states = (decay_chunk[..., None, None] * states[:, :, None, ...]).sum(dim=1) + states, ssm_state = new_states[:, :-1], new_states[:, -1] + + # 4. Compute state -> output conversion per chunk + # (left term of low-rank factorization of off-diagonal blocks; C terms) + state_decay_out = torch.exp(A_cumsum) + C_times_states = (C[..., None, :] * states[:, :, None, ...]) + state_decay_out_permuted = state_decay_out.permute(0, 2, 3, 1) + Y_off = (C_times_states.sum(-1) * state_decay_out_permuted[..., None]) + + # Add output of intra-chunk and inter-chunk terms (diagonal and off-diagonal blocks) + y = Y_diag + Y_off + # [bsz, -1, self.chunk_size, num_heads, head_dim] -> [bsz, (padded) seq_len, num_heads, head_dim] + y = y.reshape(batch_size, -1, self.num_heads, self.head_dim) + + y = y + D_residual + # Cutting off padded chunks + if pad_size > 0: + y = y[:, :seq_len, :, :] + y = y.reshape(batch_size, seq_len, -1) + + # Init cache + if ssm_state is not None and cache_params is not None: + cache_params.ssm_states[self.layer_idx].copy_(ssm_state) + + scan_output = self.norm(y, gate) + + # end ssd naive + + # 4. Final linear projection + contextualized_states = self.out_proj(scan_output.to(dtype)) # [batch, seq_len, hidden_size] + return contextualized_states + # fmt: on + + def forward( + self, + hidden_states, + cache_params: Optional[HybridMambaAttentionDynamicCache] = None, + cache_position: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + ): + if is_fast_path_available and "cuda" in self.in_proj.weight.device.type: + return self.cuda_kernels_forward(hidden_states, cache_params, cache_position, attention_mask) + dtype = hidden_states.dtype + if attention_mask is not None and attention_mask.shape[1] > 1 and attention_mask.shape[0] > 1: + # tune out hidden states for pad tokens, see https://github.com/state-spaces/mamba/issues/66 + hidden_states = (hidden_states * attention_mask[:, :, None]).to(dtype) + + return self.torch_forward(hidden_states, cache_params, cache_position, attention_mask) + + +class BambaMLP(LlamaMLP): + pass + + +class BambaRMSNorm(LlamaRMSNorm): + pass + + +class BambaDecoderLayer(JambaAttentionDecoderLayer): + def __init__(self, config: BambaConfig, layer_idx: int, layer_type: str = "mamba"): + super().__init__() + + del self.self_attn + + num_experts = 1 + ffn_layer_class = BambaMLP if num_experts == 1 else None + self.feed_forward = ffn_layer_class(config) + + self.layer_type = layer_type + if layer_type == "mamba": + self.mamba = BambaMixer(config=config, layer_idx=layer_idx) + elif layer_type == "attention": + self.self_attn = BambaAttention(config, layer_idx) + else: + raise ValueError("Invalid layer_type") + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[HybridMambaAttentionDynamicCache] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + cache_position: Optional[torch.LongTensor] = None, + position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None, # necessary, but kept here for BC + **kwargs, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`, *optional*): attention mask of size + `(batch, sequence_length)` where padding elements are indicated by 0. + past_key_value (`HybridMambaAttentionDynamicCache`, *optional*): cached past key and value projection states + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. + position_embeddings (`Tuple[torch.FloatTensor, torch.FloatTensor]`, *optional*): + Tuple containing the cosine and sine positional embeddings of shape `(batch_size, seq_len, head_dim)`, + with `head_dim` being the embedding dimension of each attention head. + kwargs (`dict`, *optional*): + Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code + into the model + """ + + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # this is a hybrid decoder layer + if self.layer_type == "mamba": + hidden_states = self.mamba( + hidden_states=hidden_states, + cache_params=past_key_value, + cache_position=cache_position, + attention_mask=attention_mask, + ) + self_attn_weights = None + elif self.layer_type == "attention": + hidden_states, self_attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + position_embeddings=position_embeddings, + **kwargs, + ) + + # residual connection after attention + hidden_states = residual + hidden_states + + # feed-forward + residual = hidden_states + hidden_states = self.pre_ff_layernorm(hidden_states) + hidden_states = self.feed_forward(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + return outputs + + +BAMBA_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`BambaConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +@add_start_docstrings( + "The bare BambaModel outputting raw hidden-states without any specific head on top.", + BAMBA_START_DOCSTRING, +) +class BambaPreTrainedModel(PreTrainedModel): + config_class = BambaConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["BambaDecoderLayer"] + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn_2 = True + _supports_sdpa = True + _supports_cache_class = True # Note: only supports HybridMambaAttentionDynamicCache + _is_stateful = True + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, (nn.Linear, nn.Conv1d)): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, (BambaRMSNormGated, BambaRMSNorm)): + module.weight.data.fill_(1.0) + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, BambaMixer): + module.dt_bias.data.fill_(1.0) + module.A_log.data = torch.log(torch.arange(1, module.num_heads + 1)) + module.D.data.fill_(1.0) + + +BAMBA_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + If `past_key_values` is used, optionally only the last `input_ids` have to be input (see + `past_key_values`). + + If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.n_positions - 1]`. + + [What are position IDs?](../glossary#position-ids) + past_key_values (`HybridMambaAttentionDynamicCache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + A HybridMambaAttentionDynamicCache object containing pre-computed hidden-states (keys and values in the + self-attention blocks and convolution and ssm states in the mamba blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + Key and value cache tensors have shape `(batch_size, num_heads, seq_len, head_dim)`. + Convolution and ssm states tensors have shape `(batch_size, d_inner, d_conv)` and + `(batch_size, d_inner, d_state)` respectively. + See the `HybridMambaAttentionDynamicCache` class for more details. + + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `input_ids` of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + output_router_logits (`bool`, *optional*): + Whether or not to return the logits of all the routers. They are useful for computing the router loss, and + should not be returned during inference. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`, + this tensor is not affected by padding. It is used to update the cache in the correct position and to infer + the complete sequence length. +""" + + +@add_start_docstrings( + "The bare Bamba Model outputting raw hidden-states without any specific head on top.", + BAMBA_START_DOCSTRING, +) +# Adapted from transformers.models.jamba.modeling_jamba.JambaModel +class BambaModel(BambaPreTrainedModel): + """ + Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`BambaDecoderLayer`] + + Args: + config: BambaConfig + """ + + def __init__(self, config: BambaConfig): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + decoder_layers = [] + for i in range(config.num_hidden_layers): + decoder_layers.append(BambaDecoderLayer(config, layer_idx=i, layer_type=config.layers_block_type[i])) + self.layers = nn.ModuleList(decoder_layers) + + self._attn_implementation = config._attn_implementation + self.final_layernorm = BambaRMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.rotary_emb = BambaRotaryEmbedding(config=config) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + @can_return_tuple + @add_start_docstrings_to_model_forward(BAMBA_INPUTS_DOCSTRING) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[HybridMambaAttentionDynamicCache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, # NOOP kwargs, for now + ) -> BaseModelOutputWithPast: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if self.gradient_checkpointing and self.training and use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`." + ) + use_cache = False + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + hidden_states = inputs_embeds + + if use_cache and past_key_values is None: + logger.warning_once( + "Bamba requires an initialized `HybridMambaAttentionDynamicCache` to return a cache. None was " + "provided, so no cache will be returned." + ) + + if cache_position is None: + cache_position = torch.arange(hidden_states.shape[1], device=hidden_states.device) + if position_ids is None: + position_ids = cache_position.unsqueeze(0) + + causal_mask = self._update_causal_mask( + attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions + ) + mamba_mask = self._update_mamba_mask(attention_mask, cache_position) + + # create position embeddings to be shared across the decoder layers + position_embeddings = self.rotary_emb(hidden_states, position_ids) + + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + + for decoder_layer in self.layers: + # Depending on the layer type we opt for 2D base attention mask (Mamba) or 4D causal mask (Attention) + layer_mask = mamba_mask if decoder_layer.layer_type == "mamba" else causal_mask + + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + layer_mask, + position_ids, + past_key_values, + output_attentions, + use_cache, + cache_position, + position_embeddings, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=layer_mask, + position_ids=position_ids, + past_key_value=past_key_values, + output_attentions=output_attentions, + use_cache=use_cache, + cache_position=cache_position, + position_embeddings=position_embeddings, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + if layer_outputs[1] is not None: + # append attentions only of attention layers. Mamba layers return `None` as the attention weights + all_self_attns += (layer_outputs[1],) + + hidden_states = self.final_layernorm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if past_key_values and not past_key_values.has_previous_state: + past_key_values.has_previous_state = True + + next_cache = None if not use_cache else past_key_values + + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + def _update_causal_mask( + self, + attention_mask: torch.Tensor, + input_tensor: torch.Tensor, + cache_position: torch.Tensor, + past_key_values: HybridMambaAttentionDynamicCache, + output_attentions: bool, + ): + if self.config._attn_implementation == "flash_attention_2": + if attention_mask is not None and 0.0 in attention_mask: + return attention_mask + return None + + # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in + # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail + # to infer the attention mask. + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + + # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward + if self.config._attn_implementation == "sdpa" and not output_attentions: + if AttentionMaskConverter._ignore_causal_mask_sdpa( + attention_mask, + inputs_embeds=input_tensor, + past_key_values_length=past_seen_tokens, + is_training=self.training, + ): + return None + + dtype, device = input_tensor.dtype, input_tensor.device + sequence_length = input_tensor.shape[1] + target_length = ( + attention_mask.shape[-1] + if isinstance(attention_mask, torch.Tensor) + else past_seen_tokens + sequence_length + 1 + ) + + # In case the provided `attention` mask is 2D, we generate a causal mask here (4D). + causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position( + attention_mask, + sequence_length=sequence_length, + target_length=target_length, + dtype=dtype, + device=device, + cache_position=cache_position, + batch_size=input_tensor.shape[0], + ) + + if ( + self.config._attn_implementation == "sdpa" + and attention_mask is not None + and attention_mask.device.type in ["cuda", "xpu", "npu"] + and not output_attentions + ): + # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when + # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. + # Details: https://github.com/pytorch/pytorch/issues/110213 + min_dtype = torch.finfo(dtype).min + causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) + + return causal_mask + + @staticmethod + def _prepare_4d_causal_attention_mask_with_cache_position( + attention_mask: torch.Tensor, + sequence_length: int, + target_length: int, + dtype: torch.dtype, + device: torch.device, + cache_position: torch.Tensor, + batch_size: int, + **kwargs, + ): + """ + Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape + `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing. + + Args: + attention_mask (`torch.Tensor`): + A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape + `(batch_size, 1, query_length, key_value_length)`. + sequence_length (`int`): + The sequence length being processed. + target_length (`int`): + The target length: when generating with static cache, the mask should be as long as the static cache, + to account for the 0 padding, the part of the cache that is not filled yet. + dtype (`torch.dtype`): + The dtype to use for the 4D attention mask. + device (`torch.device`): + The device to place the 4D attention mask on. + cache_position (`torch.Tensor`): + Indices depicting the position of the input sequence tokens in the sequence. + batch_size (`torch.Tensor`): + Batch size. + """ + if attention_mask is not None and attention_mask.dim() == 4: + # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. + causal_mask = attention_mask + else: + min_dtype = torch.finfo(dtype).min + causal_mask = torch.full( + (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device + ) + if sequence_length != 1: + causal_mask = torch.triu(causal_mask, diagonal=1) + causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + mask_length = attention_mask.shape[-1] + padding_attention_mask = (attention_mask[:, None, None, :] == attention_mask[:, None, :, None])[ + :, :, -sequence_length:, : + ].to(dtype) + padding_mask = causal_mask[:, :, :, :mask_length] + padding_attention_mask + padding_mask = padding_mask == 0 + causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + padding_mask, min_dtype + ) + + return causal_mask + + def _update_mamba_mask(self, attention_mask, cache_position): + """ + No need for zeroing states when + 1. Cached forward + 2. Attending to all inputs + """ + mamba_mask = attention_mask + if cache_position[0] > 0 or (attention_mask is not None and torch.all(attention_mask == 1)): + mamba_mask = None + return mamba_mask + + +class BambaForCausalLM(LlamaForCausalLM): + @can_return_tuple + @add_start_docstrings_to_model_forward(BAMBA_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[HybridMambaAttentionDynamicCache] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, torch.Tensor] = 0, + **kwargs, + ) -> CausalLMOutputWithPast: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + logits_to_keep (`int` or `torch.Tensor`, *optional*): + If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all + `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that + token can save memory, which becomes pretty significant for long sequences or large vocabulary size. + If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension. + This is useful when using packed tensor format (single dimension for batch and sequence length). + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, BambaForCausalLM + + >>> model = BambaForCausalLM.from_pretrained("...") + >>> tokenizer = AutoTokenizer.from_pretrained("...") + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." + ```""" + return super().forward( + input_ids, + attention_mask, + position_ids, + past_key_values, + inputs_embeds, + labels, + use_cache, + output_attentions, + output_hidden_states, + cache_position, + logits_to_keep, + **kwargs, + ) + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + attention_mask=None, + inputs_embeds=None, + cache_position=None, + position_ids=None, + use_cache=True, + **kwargs, + ): + # Overwitten -- has a unique cache type, `HybridMambaAttentionDynamicCache` + + empty_past_kv = past_key_values is None + + # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens + # Exception 1: when passing input_embeds, input_ids may be missing entries + # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here + # Exception 3: with synced GPUs cache_position may go out of bounds, but we only want dummy token in that case. + # (we can't check exception 3 while compiling) + if not empty_past_kv: + if ( + inputs_embeds is not None # Exception 1 + or cache_position[-1] >= input_ids.shape[1] # Exception 3 + ): + input_ids = input_ids[:, -cache_position.shape[0] :] + elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2) + input_ids = input_ids[:, cache_position] + else: + past_key_values = HybridMambaAttentionDynamicCache( + self.config, input_ids.shape[0], self.dtype, device=self.device + ) + + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if not empty_past_kv: + position_ids = position_ids[:, -input_ids.shape[1] :] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and empty_past_kv: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids.contiguous()} # `contiguous()` needed for compilation use cases + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": use_cache, + "attention_mask": attention_mask, + "logits_to_keep": self.config.num_logits_to_keep, + "cache_position": cache_position, + } + ) + return model_inputs + + +__all__ = ["BambaModel", "BambaForCausalLM", "BambaPreTrainedModel"] diff --git a/docs/transformers/build/lib/transformers/models/bark/__init__.py b/docs/transformers/build/lib/transformers/models/bark/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..c5296fc47fc423c300cf6c43bccb5daafd6d134a --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bark/__init__.py @@ -0,0 +1,28 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_bark import * + from .modeling_bark import * + from .processing_bark import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/docs/transformers/build/lib/transformers/models/bark/configuration_bark.py b/docs/transformers/build/lib/transformers/models/bark/configuration_bark.py new file mode 100644 index 0000000000000000000000000000000000000000..932bad618aa187ce0e55dc772fc803ef9d1d08dd --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bark/configuration_bark.py @@ -0,0 +1,303 @@ +# coding=utf-8 +# Copyright 2023 The Suno AI Authors and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""BARK model configuration""" + +from typing import Dict + +from ...configuration_utils import PretrainedConfig +from ...utils import add_start_docstrings, logging +from ..auto import CONFIG_MAPPING, AutoConfig + + +logger = logging.get_logger(__name__) + + +BARK_SUBMODELCONFIG_START_DOCSTRING = """ + This is the configuration class to store the configuration of a [`{model}`]. It is used to instantiate the model + according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the Bark [suno/bark](https://huggingface.co/suno/bark) + architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + block_size (`int`, *optional*, defaults to 1024): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + input_vocab_size (`int`, *optional*, defaults to 10_048): + Vocabulary size of a Bark sub-model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`{model}`]. Defaults to 10_048 but should be carefully thought with + regards to the chosen sub-model. + output_vocab_size (`int`, *optional*, defaults to 10_048): + Output vocabulary size of a Bark sub-model. Defines the number of different tokens that can be represented + by the: `output_ids` when passing forward a [`{model}`]. Defaults to 10_048 but should be carefully thought + with regards to the chosen sub-model. + num_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the given sub-model. + num_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer architecture. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the "intermediate" (often named feed-forward) layer in the architecture. + dropout (`float`, *optional*, defaults to 0.0): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + bias (`bool`, *optional*, defaults to `True`): + Whether or not to use bias in the linear layers and layer norm layers. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). +""" + + +class BarkSubModelConfig(PretrainedConfig): + keys_to_ignore_at_inference = ["past_key_values"] + + attribute_map = { + "num_attention_heads": "num_heads", + "num_hidden_layers": "num_layers", + "vocab_size": "input_vocab_size", + "window_size": "block_size", + } + + def __init__( + self, + block_size=1024, + input_vocab_size=10_048, + output_vocab_size=10_048, + num_layers=12, + num_heads=12, + hidden_size=768, + dropout=0.0, + bias=True, # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster + initializer_range=0.02, + use_cache=True, + **kwargs, + ): + self.block_size = block_size + self.input_vocab_size = input_vocab_size + self.output_vocab_size = output_vocab_size + self.num_layers = num_layers + self.num_heads = num_heads + self.hidden_size = hidden_size + self.dropout = dropout + self.bias = bias + self.use_cache = use_cache + self.initializer_range = initializer_range + + super().__init__(**kwargs) + + +@add_start_docstrings( + BARK_SUBMODELCONFIG_START_DOCSTRING.format(config="BarkSemanticConfig", model="BarkSemanticModel"), + """ + Example: + + ```python + >>> from transformers import BarkSemanticConfig, BarkSemanticModel + + >>> # Initializing a Bark sub-module style configuration + >>> configuration = BarkSemanticConfig() + + >>> # Initializing a model (with random weights) from the suno/bark style configuration + >>> model = BarkSemanticModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""", +) +class BarkSemanticConfig(BarkSubModelConfig): + model_type = "semantic" + base_config_key = "semantic_config" + + +@add_start_docstrings( + BARK_SUBMODELCONFIG_START_DOCSTRING.format(config="BarkCoarseConfig", model="BarkCoarseModel"), + """ + Example: + + ```python + >>> from transformers import BarkCoarseConfig, BarkCoarseModel + + >>> # Initializing a Bark sub-module style configuration + >>> configuration = BarkCoarseConfig() + + >>> # Initializing a model (with random weights) from the suno/bark style configuration + >>> model = BarkCoarseModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""", +) +class BarkCoarseConfig(BarkSubModelConfig): + model_type = "coarse_acoustics" + base_config_key = "coarse_acoustics_config" + + +@add_start_docstrings( + BARK_SUBMODELCONFIG_START_DOCSTRING.format(config="BarkFineConfig", model="BarkFineModel"), + """ + n_codes_total (`int`, *optional*, defaults to 8): + The total number of audio codebooks predicted. Used in the fine acoustics sub-model. + n_codes_given (`int`, *optional*, defaults to 1): + The number of audio codebooks predicted in the coarse acoustics sub-model. Used in the acoustics + sub-models. + Example: + + ```python + >>> from transformers import BarkFineConfig, BarkFineModel + + >>> # Initializing a Bark sub-module style configuration + >>> configuration = BarkFineConfig() + + >>> # Initializing a model (with random weights) from the suno/bark style configuration + >>> model = BarkFineModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""", +) +class BarkFineConfig(BarkSubModelConfig): + model_type = "fine_acoustics" + base_config_key = "fine_acoustics_config" + + def __init__(self, tie_word_embeddings=True, n_codes_total=8, n_codes_given=1, **kwargs): + self.n_codes_total = n_codes_total + self.n_codes_given = n_codes_given + + super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) + + +class BarkConfig(PretrainedConfig): + """ + This is the configuration class to store the configuration of a [`BarkModel`]. It is used to instantiate a Bark + model according to the specified sub-models configurations, defining the model architecture. + + Instantiating a configuration with the defaults will yield a similar configuration to that of the Bark + [suno/bark](https://huggingface.co/suno/bark) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + semantic_config ([`BarkSemanticConfig`], *optional*): + Configuration of the underlying semantic sub-model. + coarse_acoustics_config ([`BarkCoarseConfig`], *optional*): + Configuration of the underlying coarse acoustics sub-model. + fine_acoustics_config ([`BarkFineConfig`], *optional*): + Configuration of the underlying fine acoustics sub-model. + codec_config ([`AutoConfig`], *optional*): + Configuration of the underlying codec sub-model. + + Example: + + ```python + >>> from transformers import ( + ... BarkSemanticConfig, + ... BarkCoarseConfig, + ... BarkFineConfig, + ... BarkModel, + ... BarkConfig, + ... AutoConfig, + ... ) + + >>> # Initializing Bark sub-modules configurations. + >>> semantic_config = BarkSemanticConfig() + >>> coarse_acoustics_config = BarkCoarseConfig() + >>> fine_acoustics_config = BarkFineConfig() + >>> codec_config = AutoConfig.from_pretrained("facebook/encodec_24khz") + + + >>> # Initializing a Bark module style configuration + >>> configuration = BarkConfig.from_sub_model_configs( + ... semantic_config, coarse_acoustics_config, fine_acoustics_config, codec_config + ... ) + + >>> # Initializing a model (with random weights) + >>> model = BarkModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ``` + """ + + model_type = "bark" + sub_configs = { + "semantic_config": BarkSemanticConfig, + "coarse_acoustics_config": BarkCoarseConfig, + "fine_acoustics_config": BarkFineConfig, + "codec_config": AutoConfig, + } + + def __init__( + self, + semantic_config: Dict = None, + coarse_acoustics_config: Dict = None, + fine_acoustics_config: Dict = None, + codec_config: Dict = None, + initializer_range=0.02, + **kwargs, + ): + if semantic_config is None: + semantic_config = {} + logger.info("semantic_config is None. initializing the semantic model with default values.") + + if coarse_acoustics_config is None: + coarse_acoustics_config = {} + logger.info("coarse_acoustics_config is None. initializing the coarse model with default values.") + + if fine_acoustics_config is None: + fine_acoustics_config = {} + logger.info("fine_acoustics_config is None. initializing the fine model with default values.") + + if codec_config is None: + codec_config = {} + logger.info("codec_config is None. initializing the codec model with default values.") + + self.semantic_config = BarkSemanticConfig(**semantic_config) + self.coarse_acoustics_config = BarkCoarseConfig(**coarse_acoustics_config) + self.fine_acoustics_config = BarkFineConfig(**fine_acoustics_config) + codec_model_type = codec_config["model_type"] if "model_type" in codec_config else "encodec" + self.codec_config = CONFIG_MAPPING[codec_model_type](**codec_config) + + self.initializer_range = initializer_range + + super().__init__(**kwargs) + + @classmethod + def from_sub_model_configs( + cls, + semantic_config: BarkSemanticConfig, + coarse_acoustics_config: BarkCoarseConfig, + fine_acoustics_config: BarkFineConfig, + codec_config: PretrainedConfig, + **kwargs, + ): + r""" + Instantiate a [`BarkConfig`] (or a derived class) from bark sub-models configuration. + + Returns: + [`BarkConfig`]: An instance of a configuration object + """ + return cls( + semantic_config=semantic_config.to_dict(), + coarse_acoustics_config=coarse_acoustics_config.to_dict(), + fine_acoustics_config=fine_acoustics_config.to_dict(), + codec_config=codec_config.to_dict(), + **kwargs, + ) + + +__all__ = ["BarkCoarseConfig", "BarkConfig", "BarkFineConfig", "BarkSemanticConfig"] diff --git a/docs/transformers/build/lib/transformers/models/bark/convert_suno_to_hf.py b/docs/transformers/build/lib/transformers/models/bark/convert_suno_to_hf.py new file mode 100644 index 0000000000000000000000000000000000000000..803656b623edbffbfef01bb2d9644cfc6f983a93 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bark/convert_suno_to_hf.py @@ -0,0 +1,263 @@ +"""Convert Bark checkpoint.""" + +import argparse +import os +from pathlib import Path + +import torch +from bark.generation import _load_model as _bark_load_model +from huggingface_hub import hf_hub_download + +from transformers import EncodecConfig, EncodecModel, set_seed +from transformers.models.bark.configuration_bark import ( + BarkCoarseConfig, + BarkConfig, + BarkFineConfig, + BarkSemanticConfig, +) +from transformers.models.bark.generation_configuration_bark import ( + BarkCoarseGenerationConfig, + BarkFineGenerationConfig, + BarkGenerationConfig, + BarkSemanticGenerationConfig, +) +from transformers.models.bark.modeling_bark import BarkCoarseModel, BarkFineModel, BarkModel, BarkSemanticModel +from transformers.utils import logging + + +logging.set_verbosity_info() +logger = logging.get_logger(__name__) + +set_seed(770) + + +new_layer_name_dict = { + "c_attn": "att_proj", + "c_proj": "out_proj", + "c_fc": "in_proj", + "transformer.": "", + "h.": "layers.", + "ln_1": "layernorm_1", + "ln_2": "layernorm_2", + "ln_f": "layernorm_final", + "wpe": "position_embeds_layer", + "wte": "input_embeds_layer", +} + + +REMOTE_MODEL_PATHS = { + "text_small": { + "repo_id": "suno/bark", + "file_name": "text.pt", + }, + "coarse_small": { + "repo_id": "suno/bark", + "file_name": "coarse.pt", + }, + "fine_small": { + "repo_id": "suno/bark", + "file_name": "fine.pt", + }, + "text": { + "repo_id": "suno/bark", + "file_name": "text_2.pt", + }, + "coarse": { + "repo_id": "suno/bark", + "file_name": "coarse_2.pt", + }, + "fine": { + "repo_id": "suno/bark", + "file_name": "fine_2.pt", + }, +} + +CUR_PATH = os.path.dirname(os.path.abspath(__file__)) +default_cache_dir = os.path.join(os.path.expanduser("~"), ".cache") +CACHE_DIR = os.path.join(os.getenv("XDG_CACHE_HOME", default_cache_dir), "suno", "bark_v0") + + +def _get_ckpt_path(model_type, use_small=False): + key = model_type + if use_small: + key += "_small" + return os.path.join(CACHE_DIR, REMOTE_MODEL_PATHS[key]["file_name"]) + + +def _download(from_hf_path, file_name): + os.makedirs(CACHE_DIR, exist_ok=True) + hf_hub_download(repo_id=from_hf_path, filename=file_name, local_dir=CACHE_DIR) + + +def _load_model(ckpt_path, device, use_small=False, model_type="text"): + if model_type == "text": + ModelClass = BarkSemanticModel + ConfigClass = BarkSemanticConfig + GenerationConfigClass = BarkSemanticGenerationConfig + elif model_type == "coarse": + ModelClass = BarkCoarseModel + ConfigClass = BarkCoarseConfig + GenerationConfigClass = BarkCoarseGenerationConfig + elif model_type == "fine": + ModelClass = BarkFineModel + ConfigClass = BarkFineConfig + GenerationConfigClass = BarkFineGenerationConfig + else: + raise NotImplementedError() + model_key = f"{model_type}_small" if use_small else model_type + model_info = REMOTE_MODEL_PATHS[model_key] + if not os.path.exists(ckpt_path): + logger.info(f"{model_type} model not found, downloading into `{CACHE_DIR}`.") + _download(model_info["repo_id"], model_info["file_name"]) + checkpoint = torch.load(ckpt_path, map_location=device, weights_only=True) + # this is a hack + model_args = checkpoint["model_args"] + if "input_vocab_size" not in model_args: + model_args["input_vocab_size"] = model_args["vocab_size"] + model_args["output_vocab_size"] = model_args["vocab_size"] + del model_args["vocab_size"] + + # convert Bark model arguments to HF Bark model arguments + model_args["num_heads"] = model_args.pop("n_head") + model_args["hidden_size"] = model_args.pop("n_embd") + model_args["num_layers"] = model_args.pop("n_layer") + + model_config = ConfigClass(**checkpoint["model_args"]) + model = ModelClass(config=model_config) + model_generation_config = GenerationConfigClass() + + model.generation_config = model_generation_config + state_dict = checkpoint["model"] + # fixup checkpoint + unwanted_prefix = "_orig_mod." + for k, v in list(state_dict.items()): + if k.startswith(unwanted_prefix): + # replace part of the key with corresponding layer name in HF implementation + new_k = k[len(unwanted_prefix) :] + for old_layer_name in new_layer_name_dict: + new_k = new_k.replace(old_layer_name, new_layer_name_dict[old_layer_name]) + + state_dict[new_k] = state_dict.pop(k) + + extra_keys = set(state_dict.keys()) - set(model.state_dict().keys()) + extra_keys = {k for k in extra_keys if not k.endswith(".attn.bias")} + missing_keys = set(model.state_dict().keys()) - set(state_dict.keys()) + missing_keys = {k for k in missing_keys if not k.endswith(".attn.bias")} + if len(extra_keys) != 0: + raise ValueError(f"extra keys found: {extra_keys}") + if len(missing_keys) != 0: + raise ValueError(f"missing keys: {missing_keys}") + model.load_state_dict(state_dict, strict=False) + n_params = model.num_parameters(exclude_embeddings=True) + val_loss = checkpoint["best_val_loss"].item() + logger.info(f"model loaded: {round(n_params / 1e6, 1)}M params, {round(val_loss, 3)} loss") + model.eval() + model.to(device) + del checkpoint, state_dict + + return model + + +def load_model(pytorch_dump_folder_path, use_small=False, model_type="text"): + if model_type not in ("text", "coarse", "fine"): + raise NotImplementedError() + + device = "cpu" # do conversion on cpu + + ckpt_path = _get_ckpt_path(model_type, use_small=use_small) + model = _load_model(ckpt_path, device, model_type=model_type, use_small=use_small) + + # load bark initial model + bark_model = _bark_load_model(ckpt_path, "cpu", model_type=model_type, use_small=use_small) + + if model_type == "text": + bark_model = bark_model["model"] + + if model.num_parameters(exclude_embeddings=True) != bark_model.get_num_params(): + raise ValueError("initial and new models don't have the same number of parameters") + + # check if same output as the bark model + batch_size = 5 + sequence_length = 10 + + if model_type in ["text", "coarse"]: + vec = torch.randint(256, (batch_size, sequence_length), dtype=torch.int) + output_old_model = bark_model(vec)[0] + + output_new_model_total = model(vec) + + # take last logits + output_new_model = output_new_model_total.logits[:, [-1], :] + + else: + prediction_codebook_channel = 3 + n_codes_total = 8 + vec = torch.randint(256, (batch_size, sequence_length, n_codes_total), dtype=torch.int) + + output_new_model_total = model(prediction_codebook_channel, vec) + output_old_model = bark_model(prediction_codebook_channel, vec) + + output_new_model = output_new_model_total.logits + + # output difference should come from the difference of self-attention implementation design + if output_new_model.shape != output_old_model.shape: + raise ValueError("initial and new outputs don't have the same shape") + if (output_new_model - output_old_model).abs().max().item() > 1e-3: + raise ValueError("initial and new outputs are not equal") + + Path(pytorch_dump_folder_path).mkdir(exist_ok=True) + model.save_pretrained(pytorch_dump_folder_path) + + +def load_whole_bark_model( + semantic_path, + coarse_path, + fine_path, + append_text, + hub_path, + folder_path, +): + pytorch_dump_folder_path = os.path.join(folder_path, append_text) + + semanticConfig = BarkSemanticConfig.from_pretrained(os.path.join(semantic_path, "config.json")) + coarseAcousticConfig = BarkCoarseConfig.from_pretrained(os.path.join(coarse_path, "config.json")) + fineAcousticConfig = BarkFineConfig.from_pretrained(os.path.join(fine_path, "config.json")) + codecConfig = EncodecConfig.from_pretrained("facebook/encodec_24khz") + + semantic = BarkSemanticModel.from_pretrained(semantic_path) + coarseAcoustic = BarkCoarseModel.from_pretrained(coarse_path) + fineAcoustic = BarkFineModel.from_pretrained(fine_path) + codec = EncodecModel.from_pretrained("facebook/encodec_24khz") + + bark_config = BarkConfig.from_sub_model_configs( + semanticConfig, coarseAcousticConfig, fineAcousticConfig, codecConfig + ) + + bark_generation_config = BarkGenerationConfig.from_sub_model_configs( + semantic.generation_config, coarseAcoustic.generation_config, fineAcoustic.generation_config + ) + + bark = BarkModel(bark_config) + + bark.semantic = semantic + bark.coarse_acoustics = coarseAcoustic + bark.fine_acoustics = fineAcoustic + bark.codec_model = codec + + bark.generation_config = bark_generation_config + + Path(pytorch_dump_folder_path).mkdir(exist_ok=True) + bark.save_pretrained(pytorch_dump_folder_path, repo_id=hub_path, push_to_hub=True) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + # Required parameters + + parser.add_argument("model_type", type=str, help="text, coarse or fine.") + parser.add_argument("pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") + parser.add_argument("--is_small", action="store_true", help="convert the small version instead of the large.") + + args = parser.parse_args() + + load_model(args.pytorch_dump_folder_path, model_type=args.model_type, use_small=args.is_small) diff --git a/docs/transformers/build/lib/transformers/models/bark/generation_configuration_bark.py b/docs/transformers/build/lib/transformers/models/bark/generation_configuration_bark.py new file mode 100644 index 0000000000000000000000000000000000000000..00ff22c8b894be11110b5a5ec9f20cece58e3788 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bark/generation_configuration_bark.py @@ -0,0 +1,330 @@ +# coding=utf-8 +# Copyright 2023 The Suno AI Authors and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""BARK model generation configuration""" + +import copy +from typing import Dict + +from ...generation.configuration_utils import GenerationConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class BarkSemanticGenerationConfig(GenerationConfig): + model_type = "semantic" + + def __init__( + self, + eos_token_id=10_000, + renormalize_logits=True, + max_new_tokens=768, + output_scores=False, + return_dict_in_generate=False, + output_hidden_states=False, + output_attentions=False, + temperature=1.0, + do_sample=False, + text_encoding_offset=10_048, + text_pad_token=129_595, + semantic_infer_token=129_599, + semantic_vocab_size=10_000, + max_input_semantic_length=256, + semantic_rate_hz=49.9, + min_eos_p=None, + **kwargs, + ): + """Class that holds a generation configuration for [`BarkSemanticModel`]. + + This configuration inherit from [`GenerationConfig`] and can be used to control the model generation. Read the + documentation from [`GenerationConfig`] for more information. + + Args: + eos_token_id (`int`, *optional*, defaults to 10_000): + The id of the *end-of-sequence* token. + renormalize_logits (`bool`, *optional*, defaults to `True`): + Whether to renormalize the logits after applying all the logits processors (including the + custom ones). It's highly recommended to set this flag to `True` as the search algorithms suppose the + score logits are normalized but some logit processors break the normalization. + max_new_tokens (`int`, *optional*, defaults to 768): + The maximum numbers of tokens to generate, ignoring the number of tokens in the prompt. + output_scores (`bool`, *optional*, defaults to `False`): + Whether or not to return the prediction scores. See `scores` under returned tensors for more details. + return_dict_in_generate (`bool`, *optional*, defaults to `False`): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + output_hidden_states (`bool`, *optional*, defaults to `False`): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more details. + output_attentions (`bool`, *optional*, defaults to `False`): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more details. + temperature (`float`, *optional*, defaults to 1.0): + The value used to modulate the next token probabilities. + do_sample (`bool`, *optional*, defaults to `False`): + Whether or not to use sampling ; use greedy decoding otherwise. + text_encoding_offset (`int`, *optional*, defaults to 10_048): + Text encoding offset. + text_pad_token (`int`, *optional*, defaults to 129_595): + Text pad token. + semantic_infer_token (`int`, *optional*, defaults to 129_599): + Semantic infer token. + semantic_vocab_size (`int`, *optional*, defaults to 10_000): + Semantic vocab size. + max_input_semantic_length (`int`, *optional*, defaults to 256): + Max length of semantic input vector. + semantic_rate_hz (`float`, *optional*, defaults to 49.9): + Semantic rate in Hertz. + min_eos_p (`float`, *optional*): + Minimum threshold of the probability of the EOS token for it to be sampled. This is an early stopping + strategy to mitigate potential unwanted generations at the end of a prompt. The original implementation + suggests a default value of 0.2. + """ + super().__init__( + temperature=temperature, + do_sample=do_sample, + eos_token_id=eos_token_id, + renormalize_logits=renormalize_logits, + max_new_tokens=max_new_tokens, + output_scores=output_scores, + return_dict_in_generate=return_dict_in_generate, + output_hidden_states=output_hidden_states, + output_attentions=output_attentions, + **kwargs, + ) + + self.text_encoding_offset = text_encoding_offset + self.text_pad_token = text_pad_token + self.semantic_pad_token = eos_token_id + self.semantic_infer_token = semantic_infer_token + self.semantic_vocab_size = semantic_vocab_size + self.max_input_semantic_length = max_input_semantic_length + self.semantic_rate_hz = semantic_rate_hz + self.min_eos_p = min_eos_p + + +class BarkCoarseGenerationConfig(GenerationConfig): + model_type = "coarse_acoustics" + + def __init__( + self, + renormalize_logits=True, + output_scores=False, + return_dict_in_generate=False, + output_hidden_states=False, + output_attentions=False, + temperature=1.0, + do_sample=False, + coarse_semantic_pad_token=12_048, + coarse_rate_hz=75, + n_coarse_codebooks=2, + coarse_infer_token=12_050, + max_coarse_input_length=256, + max_coarse_history: int = 630, + sliding_window_len: int = 60, + **kwargs, + ): + """Class that holds a generation configuration for [`BarkCoarseModel`]. + + This configuration inherit from [`GenerationConfig`] and can be used to control the model generation. Read the + documentation from [`GenerationConfig`] for more information. + + Args: + renormalize_logits (`bool`, *optional*, defaults to `True`): + Whether to renormalize the logits after applying all the logits processors (including the + custom ones). It's highly recommended to set this flag to `True` as the search algorithms suppose the + score logits are normalized but some logit processors break the normalization. + output_scores (`bool`, *optional*, defaults to `False`): + Whether or not to return the prediction scores. See `scores` under returned tensors for more details. + return_dict_in_generate (`bool`, *optional*, defaults to `False`): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + output_hidden_states (`bool`, *optional*, defaults to `False`): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more details. + output_attentions (`bool`, *optional*, defaults to `False`): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more details. + temperature (`float`, *optional*, defaults to 1.0): + The value used to modulate the next token probabilities. + do_sample (`bool`, *optional*, defaults to `False`): + Whether or not to use sampling ; use greedy decoding otherwise. + coarse_semantic_pad_token (`int`, *optional*, defaults to 12_048): + Coarse semantic pad token. + coarse_rate_hz (`int`, *optional*, defaults to 75): + Coarse rate in Hertz. + n_coarse_codebooks (`int`, *optional*, defaults to 2): + Number of coarse codebooks. + coarse_infer_token (`int`, *optional*, defaults to 12_050): + Coarse infer token. + max_coarse_input_length (`int`, *optional*, defaults to 256): + Max length of input coarse vector. + max_coarse_history (`int`, *optional*, defaults to 630): + Max length of the output of the coarse acoustics model used in the fine generation step. + sliding_window_len (`int`, *optional*, defaults to 60): + The coarse generation step uses a sliding window to generate raw audio. + """ + super().__init__( + temperature=temperature, + do_sample=do_sample, + renormalize_logits=renormalize_logits, + output_scores=output_scores, + return_dict_in_generate=return_dict_in_generate, + output_hidden_states=output_hidden_states, + output_attentions=output_attentions, + **kwargs, + ) + + self.coarse_semantic_pad_token = coarse_semantic_pad_token + self.coarse_rate_hz = coarse_rate_hz + self.n_coarse_codebooks = n_coarse_codebooks + self.coarse_infer_token = coarse_infer_token + self.max_coarse_input_length = max_coarse_input_length + self.max_coarse_history = max_coarse_history + self.sliding_window_len = sliding_window_len + + +class BarkFineGenerationConfig(GenerationConfig): + model_type = "fine_acoustics" + + def __init__( + self, + temperature=1.0, + max_fine_history_length=512, + max_fine_input_length=1024, + n_fine_codebooks=8, + **kwargs, + ): + """Class that holds a generation configuration for [`BarkFineModel`]. + + [`BarkFineModel`] is an autoencoder model, so should not usually be used for generation. However, under the + hood, it uses `temperature` when used by [`BarkModel`] + + This configuration inherit from [`GenerationConfig`] and can be used to control the model generation. Read the + documentation from [`GenerationConfig`] for more information. + + Args: + temperature (`float`, *optional*): + The value used to modulate the next token probabilities. + max_fine_history_length (`int`, *optional*, defaults to 512): + Max length of the fine history vector. + max_fine_input_length (`int`, *optional*, defaults to 1024): + Max length of fine input vector. + n_fine_codebooks (`int`, *optional*, defaults to 8): + Number of codebooks used. + """ + super().__init__(temperature=temperature) + + self.max_fine_history_length = max_fine_history_length + self.max_fine_input_length = max_fine_input_length + self.n_fine_codebooks = n_fine_codebooks + + def validate(self, **kwargs): + """ + Overrides GenerationConfig.validate because BarkFineGenerationConfig don't use any parameters outside + temperature. + """ + pass + + +class BarkGenerationConfig(GenerationConfig): + model_type = "bark" + + # TODO (joao): nested from_dict + + def __init__( + self, + semantic_config: Dict = None, + coarse_acoustics_config: Dict = None, + fine_acoustics_config: Dict = None, + sample_rate=24_000, + codebook_size=1024, + **kwargs, + ): + """Class that holds a generation configuration for [`BarkModel`]. + + The [`BarkModel`] does not have a `generate` method, but uses this class to generate speeches with a nested + [`BarkGenerationConfig`] which uses [`BarkSemanticGenerationConfig`], [`BarkCoarseGenerationConfig`], + [`BarkFineGenerationConfig`]. + + This configuration inherit from [`GenerationConfig`] and can be used to control the model generation. Read the + documentation from [`GenerationConfig`] for more information. + + Args: + semantic_config (`Dict`, *optional*): + Semantic generation configuration. + coarse_acoustics_config (`Dict`, *optional*): + Coarse generation configuration. + fine_acoustics_config (`Dict`, *optional*): + Fine generation configuration. + sample_rate (`int`, *optional*, defaults to 24_000): + Sample rate. + codebook_size (`int`, *optional*, defaults to 1024): + Vector length for each codebook. + """ + if semantic_config is None: + semantic_config = {} + logger.info("semantic_config is None. initializing the semantic model with default values.") + + if coarse_acoustics_config is None: + coarse_acoustics_config = {} + logger.info("coarse_acoustics_config is None. initializing the coarse model with default values.") + + if fine_acoustics_config is None: + fine_acoustics_config = {} + logger.info("fine_acoustics_config is None. initializing the fine model with default values.") + + self.semantic_config = BarkSemanticGenerationConfig(**semantic_config) + self.coarse_acoustics_config = BarkCoarseGenerationConfig(**coarse_acoustics_config) + self.fine_acoustics_config = BarkFineGenerationConfig(**fine_acoustics_config) + + self.sample_rate = sample_rate + self.codebook_size = codebook_size + + @classmethod + def from_sub_model_configs( + cls, + semantic_config: BarkSemanticGenerationConfig, + coarse_acoustics_config: BarkCoarseGenerationConfig, + fine_acoustics_config: BarkFineGenerationConfig, + **kwargs, + ): + r""" + Instantiate a [`BarkGenerationConfig`] (or a derived class) from bark sub-models generation configuration. + + Returns: + [`BarkGenerationConfig`]: An instance of a configuration object + """ + return cls( + semantic_config=semantic_config.to_dict(), + coarse_acoustics_config=coarse_acoustics_config.to_dict(), + fine_acoustics_config=fine_acoustics_config.to_dict(), + **kwargs, + ) + + def to_dict(self): + """ + Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`]. + + Returns: + `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance, + """ + output = copy.deepcopy(self.__dict__) + + output["semantic_config"] = self.semantic_config.to_dict() + output["coarse_acoustics_config"] = self.coarse_acoustics_config.to_dict() + output["fine_acoustics_config"] = self.fine_acoustics_config.to_dict() + + output["model_type"] = self.__class__.model_type + return output diff --git a/docs/transformers/build/lib/transformers/models/bark/modeling_bark.py b/docs/transformers/build/lib/transformers/models/bark/modeling_bark.py new file mode 100644 index 0000000000000000000000000000000000000000..57a0c4e5a7a53f6af0ae7018e3da5b5f6cf8e121 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bark/modeling_bark.py @@ -0,0 +1,1869 @@ +# coding=utf-8 +# Copyright 2023 The Suno AI Authors and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch BARK model.""" + +import math +import warnings +from typing import Dict, Optional, Tuple, Union + +import numpy as np +import torch +from torch import nn +from torch.nn import functional as F + +from ...generation import GenerationMixin +from ...generation.logits_process import ( + AlternatingCodebooksLogitsProcessor, + BarkEosPrioritizerLogitsProcessor, + SuppressTokensLogitsProcessor, +) +from ...modeling_attn_mask_utils import _prepare_4d_attention_mask +from ...modeling_flash_attention_utils import flash_attn_supports_top_left_mask, is_flash_attn_available +from ...modeling_outputs import CausalLMOutputWithPast, MaskedLMOutput +from ...modeling_utils import PreTrainedModel, get_parameter_device +from ...utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_accelerate_available, + is_torch_accelerator_available, + logging, +) +from ..auto import AutoModel +from .configuration_bark import ( + BarkCoarseConfig, + BarkConfig, + BarkFineConfig, + BarkSemanticConfig, + BarkSubModelConfig, +) +from .generation_configuration_bark import ( + BarkCoarseGenerationConfig, + BarkFineGenerationConfig, + BarkSemanticGenerationConfig, +) + + +if is_flash_attn_available(): + from ...modeling_flash_attention_utils import _flash_attention_forward + + +logger = logging.get_logger(__name__) + + +_CHECKPOINT_FOR_DOC = "suno/bark-small" +_CONFIG_FOR_DOC = "BarkConfig" + + +class BarkSelfAttention(nn.Module): + # adapted from GPTNeoSelfAttention and Bark code + # BarkSelfAttention can have two attention type, i.e full attention or causal attention + + def __init__(self, config, is_causal=False): + super().__init__() + + # regularization + self.dropout = config.dropout + self.attn_dropout = nn.Dropout(config.dropout) + self.resid_dropout = nn.Dropout(config.dropout) + + self.embed_dim = config.hidden_size + self.num_heads = config.num_heads + self.head_dim = self.embed_dim // self.num_heads + + if config.hidden_size % config.num_heads != 0: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." + ) + + # key, query, value projections for all heads, but in a batch + self.att_proj = nn.Linear(config.hidden_size, 3 * config.hidden_size, bias=config.bias) + # output projection + self.out_proj = nn.Linear(config.hidden_size, config.hidden_size, bias=config.bias) + + self.is_causal = is_causal + if is_causal: + block_size = config.block_size + bias = torch.tril(torch.ones((block_size, block_size), dtype=bool)).view(1, 1, block_size, block_size) + self.register_buffer("bias", bias) + + # Copied from transformers.models.gpt_neo.modeling_gpt_neo.GPTNeoSelfAttention._split_heads + def _split_heads(self, tensor, num_heads, attn_head_size): + """ + Splits hidden_size dim into attn_head_size and num_heads + """ + new_shape = tensor.size()[:-1] + (num_heads, attn_head_size) + tensor = tensor.view(new_shape) + return tensor.permute(0, 2, 1, 3) # (batch, head, seq_length, head_features) + + def _merge_heads(self, tensor, num_heads, attn_head_size): + """ + Merges attn_head_size dim and num_attn_heads dim into hidden_size + """ + + # re-assemble all head outputs side by side + # (batch, num_heads, seq_len, attn_head_size) -> (batch, seq_len, num_heads*attn_head_size) + tensor = tensor.transpose(1, 2).contiguous() + tensor = tensor.view(tensor.size()[:-2] + (num_heads * attn_head_size,)) + + return tensor + + def _attn(self, query, key, value, attention_mask=None, head_mask=None): + # unlike GPTNeo's SelfAttention, divide by the square root of the dimension of the query and the key + attn_weights = torch.matmul(query, key.transpose(-1, -2)) * (1.0 / math.sqrt(self.head_dim)) + + if self.is_causal: + query_length, key_length = query.size(-2), key.size(-2) + + # fill the upper left part of the attention weights with inf + attn_weights = attn_weights.masked_fill( + self.bias[:, :, key_length - query_length : key_length, :key_length] == 0, + torch.finfo(attn_weights.dtype).min, + ) + + if attention_mask is not None: + # Apply the attention mask + attn_weights = attn_weights + attention_mask + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + attn_weights = attn_weights.to(value.dtype) + attn_weights = self.attn_dropout(attn_weights) + + # Mask heads if we want to + if head_mask is not None: + attn_weights = attn_weights * head_mask + + # (batch, num_heads, seq_len, seq_len) x (batch, num_heads, seq_len, attn_head_size) + # -> (batch, num_heads, seq_len, attn_head_size) + attn_output = torch.matmul(attn_weights, value) + + return attn_output, attn_weights + + def forward( + self, + hidden_states, + attention_mask=None, + past_key_values=None, + head_mask=None, + use_cache=False, + output_attentions=False, + ): + # calculate query, key, values for all heads in batch and move head forward to be the batch dim + query, key, value = self.att_proj(hidden_states).split(self.embed_dim, dim=2) + + query = self._split_heads(query, self.num_heads, self.head_dim) + key = self._split_heads(key, self.num_heads, self.head_dim) + value = self._split_heads(value, self.num_heads, self.head_dim) + + if past_key_values is not None: + past_key = past_key_values[0] + past_value = past_key_values[1] + key = torch.cat((past_key, key), dim=-2) + value = torch.cat((past_value, value), dim=-2) + + if use_cache is True: + present = (key, value) + else: + present = None + + attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask) + + attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim) + attn_output = self.out_proj(attn_output) + attn_output = self.resid_dropout(attn_output) + + outputs = (attn_output, present) + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class BarkSelfFlashAttention2(BarkSelfAttention): + """ + Bark flash attention module. This module inherits from `BarkSelfAttention` as the weights of the module stays + untouched. The only required change would be on the forward pass where it needs to correctly call the public API of + flash attention and deal with padding tokens in case the input contains any of them. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. + # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). + self._flash_attn_uses_top_left_mask = flash_attn_supports_top_left_mask() + + def _split_heads(self, tensor, num_heads, attn_head_size): + """ + Splits hidden_size dim into attn_head_size and num_heads + """ + new_shape = tensor.size()[:-1] + (num_heads, attn_head_size) + tensor = tensor.view(new_shape) + # Flash attention requires the input to have the shape + # batch_size x seq_length x head_dim x hidden_dim - (batch, seq_length, head, head_features) + return tensor + + def _merge_heads(self, tensor, num_heads, attn_head_size): + """ + Merges attn_head_size dim and num_attn_heads dim into hidden_size + """ + # re-assemble all head outputs side by side + # (batch, seq_len, num_heads, attn_head_size) -> (batch, seq_len, num_heads*attn_head_size) + tensor = tensor.view(tensor.size()[:-2] + (num_heads * attn_head_size,)) + return tensor + + def forward( + self, + hidden_states, + attention_mask=None, + past_key_values=None, + head_mask=None, + use_cache=False, + output_attentions=False, + ): + batch_size, query_len, _ = hidden_states.size() + + # calculate query, key, values for all heads in batch and move head forward to be the batch dim + query, key, value = self.att_proj(hidden_states).split(self.embed_dim, dim=2) + + query = self._split_heads(query, self.num_heads, self.head_dim) + key = self._split_heads(key, self.num_heads, self.head_dim) + value = self._split_heads(value, self.num_heads, self.head_dim) + + if past_key_values is not None: + # (batch, head, seq_length, head_features) -> (batch, seq_length, head, head_features) + past_key = past_key_values[0].transpose(1, 2) + past_value = past_key_values[1].transpose(1, 2) + # and merge on seq_length + key = torch.cat((past_key, key), dim=1) + value = torch.cat((past_value, value), dim=1) + + if use_cache is True: + # (batch, head, seq_length, head_features) + present = (key.transpose(1, 2), value.transpose(1, 2)) + else: + present = None + + attn_output = _flash_attention_forward( + query, + key, + value, + attention_mask, + query_len, + dropout=self.dropout if self.training else 0.0, + use_top_left_mask=self._flash_attn_uses_top_left_mask, + is_causal=self.is_causal, + ) + + attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim) + attn_output = self.out_proj(attn_output) + attn_output = self.resid_dropout(attn_output) + + outputs = (attn_output, present) + if output_attentions: + attn_weights = None + outputs += (attn_weights,) + + return outputs + + +BARK_ATTENTION_CLASSES = { + "eager": BarkSelfAttention, + "flash_attention_2": BarkSelfFlashAttention2, +} + + +class BarkLayerNorm(nn.Module): + """LayerNorm but with an optional bias. PyTorch doesn't support simply bias=False.""" + + def __init__(self, hidden_size, bias=True): + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.bias = nn.Parameter(torch.zeros(hidden_size)) if bias else None + + def forward(self, input): + return F.layer_norm(input, self.weight.shape, self.weight, self.bias, eps=1e-5) + + +class BarkMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.in_proj = nn.Linear(config.hidden_size, 4 * config.hidden_size, bias=config.bias) + self.out_proj = nn.Linear(4 * config.hidden_size, config.hidden_size, bias=config.bias) + self.dropout = nn.Dropout(config.dropout) + self.gelu = nn.GELU() + + def forward(self, hidden_states): + hidden_states = self.in_proj(hidden_states) + hidden_states = self.gelu(hidden_states) + hidden_states = self.out_proj(hidden_states) + hidden_states = self.dropout(hidden_states) + return hidden_states + + +class BarkBlock(nn.Module): + def __init__(self, config, is_causal=False): + super().__init__() + + if is_causal: + # if causal, uses handmade LayerNorm, so that the layerNorm bias is optional + # this handmade layerNorm is used to stick with Bark choice of leaving optional bias in + # AutoRegressive models (corresponding to the "Text" and the "Coarse" modules) + self.layernorm_1 = BarkLayerNorm(config.hidden_size, bias=config.bias) + self.layernorm_2 = BarkLayerNorm(config.hidden_size, bias=config.bias) + else: + self.layernorm_1 = nn.LayerNorm(config.hidden_size) + self.layernorm_2 = nn.LayerNorm(config.hidden_size) + + self.attn = BARK_ATTENTION_CLASSES[config._attn_implementation](config, is_causal=is_causal) + + self.mlp = BarkMLP(config) + + def forward( + self, + hidden_states, + past_key_values=None, + attention_mask=None, + head_mask=None, + use_cache=False, + output_attentions=False, + ): + intermediary_hidden_states = self.layernorm_1(hidden_states) + + attn_outputs = self.attn( + intermediary_hidden_states, + past_key_values=past_key_values, + attention_mask=attention_mask, + head_mask=head_mask, + use_cache=use_cache, + output_attentions=output_attentions, + ) + + attn_output = attn_outputs[0] # output_attn: output, present_key_values, (attn_weights) + outputs = attn_outputs[1:] + + intermediary_hidden_states = hidden_states + attn_output + intermediary_hidden_states = intermediary_hidden_states + self.mlp( + self.layernorm_2(intermediary_hidden_states) + ) + + if use_cache: + outputs = (intermediary_hidden_states,) + outputs + else: + outputs = (intermediary_hidden_states,) + outputs[1:] + + return outputs # hidden_states, ((present), attentions) + + +class BarkPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = BarkConfig + supports_gradient_checkpointing = False + _supports_flash_attn_2 = True + + def _init_weights(self, module): + """Initialize the weights.""" + if isinstance(module, (nn.Linear,)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + def __init__(self, *inputs, **kwargs): + super().__init__(*inputs, **kwargs) + + @property + def device(self) -> torch.device: + """ + `torch.device`: The device on which the module is (assuming that all the module parameters are on the same + device). + """ + + # if has _hf_hook, has been offloaded so the device has to be found in the hook + if not hasattr(self, "_hf_hook"): + return get_parameter_device(self) + for module in self.modules(): + if ( + hasattr(module, "_hf_hook") + and hasattr(module._hf_hook, "execution_device") + and module._hf_hook.execution_device is not None + ): + return torch.device(module._hf_hook.execution_device) + + return get_parameter_device(self) + + +BARK_MODEL_START_DOCSTRING = """ + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`{config}`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +BARK_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`BarkConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +BARK_FINE_INPUTS_DOCSTRING = r""" + Args: + codebook_idx (`int`): + Index of the codebook that will be predicted. + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length, number_of_codebooks)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. Initially, indices of the first two codebooks are obtained from the `coarse` sub-model. The rest is + predicted recursively by attending the previously predicted channels. The model predicts on windows of + length 1024. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): NOT IMPLEMENTED YET. + input_embeds (`torch.FloatTensor` of shape `(batch_size, input_sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. If + `past_key_values` is used, optionally only the last `input_embeds` have to be input (see + `past_key_values`). This is useful if you want more control over how to convert `input_ids` indices into + associated vectors than the model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + +BARK_CAUSAL_MODEL_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. [What are input IDs?](../glossary#input-ids) + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `input_ids` of shape `(batch_size, sequence_length)`. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + input_embeds (`torch.FloatTensor` of shape `(batch_size, input_sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + Here, due to `Bark` particularities, if `past_key_values` is used, `input_embeds` will be ignored and you + have to use `input_ids`. If `past_key_values` is not used and `use_cache` is set to `True`, `input_embeds` + is used in priority instead of `input_ids`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +# GPT2-like autoregressive model +class BarkCausalModel(BarkPreTrainedModel, GenerationMixin): + config_class = BarkSubModelConfig + + def __init__(self, config): + super().__init__(config) + self.config = config + + # initialize as an autoregressive GPT-like model + self.input_embeds_layer = nn.Embedding(config.input_vocab_size, config.hidden_size) + self.position_embeds_layer = nn.Embedding(config.block_size, config.hidden_size) + + self.drop = nn.Dropout(config.dropout) + + self.layers = nn.ModuleList([BarkBlock(config, is_causal=True) for _ in range(config.num_layers)]) + self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2" + + self.layernorm_final = BarkLayerNorm(config.hidden_size, bias=config.bias) + + self.lm_head = nn.Linear(config.hidden_size, config.output_vocab_size, bias=False) + self.gradient_checkpointing = False + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.input_embeds_layer + + def set_input_embeddings(self, new_embeddings): + self.input_embeds_layer = new_embeddings + + def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **kwargs): + # Overwritten -- bark has a model-specific hack + input_embeds = kwargs.get("input_embeds", None) + + attention_mask = kwargs.get("attention_mask", None) + position_ids = kwargs.get("position_ids", None) + + if past_key_values is not None: + # Omit tokens covered by past_key_values + seq_len = input_ids.shape[1] + past_length = past_key_values[0][0].shape[2] + + # Some generation methods already pass only the last input ID + if input_ids.shape[1] > past_length: + remove_prefix_length = past_length + else: + # Default to old behavior: keep only final ID + remove_prefix_length = input_ids.shape[1] - 1 + + input_ids = input_ids[:, remove_prefix_length:] + + # input_embeds have already been used and is not required anymore + input_embeds = None + else: + if input_embeds is not None and kwargs.get("use_cache"): + seq_len = input_embeds.shape[1] + else: + seq_len = input_ids.shape[1] + + # ensure that attention_mask and position_ids shapes are aligned with the weird Bark hack of reducing + # sequence length on the first forward pass + if attention_mask is not None: + attention_mask = attention_mask[:, :seq_len] + if position_ids is not None: + position_ids = position_ids[:, :seq_len] + + if attention_mask is not None and position_ids is None: + # create position_ids on the fly for batch generation + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + if past_key_values: + position_ids = position_ids[:, -input_ids.shape[1] :] + else: + position_ids = None + + if input_embeds is not None and kwargs.get("use_cache"): + return { + "input_ids": None, + "input_embeds": input_embeds, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "position_ids": position_ids, + "attention_mask": attention_mask, + } + return { + "input_ids": input_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "position_ids": position_ids, + "attention_mask": attention_mask, + } + + @add_start_docstrings_to_model_forward(BARK_CAUSAL_MODEL_INPUTS_DOCSTRING) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + past_key_values: Optional[Tuple[torch.FloatTensor]] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.LongTensor] = None, + input_embeds: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + loss = None + if labels is not None: + raise NotImplementedError( + "Training is not implemented yet for Bark - ensure you do not pass `labels` to the model." + ) + + # Verify if input_embeds already exists + # then compute embeddings. + if input_ids is not None and input_embeds is not None: + raise ValueError("You cannot specify both input_ids and input_embeds at the same time") + elif input_embeds is not None and past_key_values is None: + # we want to return the input_embeds in priority so that it is in line with a weird hack + # of Bark which concatenate two bits of the input_embeds on the first forward pass of the semantic model + pass + elif input_ids is not None: + input_embeds = self.input_embeds_layer(input_ids) # token embeddings of shape (b, t, n_embd) + elif input_embeds is not None: + pass + else: + raise ValueError("You have to specify either input_ids or input_embeds") + + input_shape = input_embeds.size()[:-1] + batch_size = input_embeds.shape[0] + seq_length = input_shape[-1] + + device = input_ids.device if input_ids is not None else input_embeds.device + + if past_key_values is None: + past_length = 0 + past_key_values = tuple([None] * len(self.layers)) + else: + past_length = past_key_values[0][0].size(-2) + + if position_ids is None: + position_ids = torch.arange(past_length, seq_length + past_length, dtype=torch.long, device=device) + position_ids = position_ids.unsqueeze(0) # shape (1, seq_length) + + position_embeds = self.position_embeds_layer(position_ids) # position embeddings of shape (1, t, n_embd) + + # Attention mask. + if attention_mask is not None: + if batch_size <= 0: + raise ValueError("batch_size has to be defined and > 0") + if self._use_flash_attention_2: + attention_mask = attention_mask if 0 in attention_mask else None + else: + attention_mask = attention_mask.view(batch_size, -1) + # [bsz, to_seq_length] -> [bsz, 1, 1, to_seq_length] + # from_seq_length is 1 to easily broadcast + attention_mask = _prepare_4d_attention_mask(attention_mask, input_embeds.dtype, tgt_len=1) + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x num_heads x N x N + # head_mask has shape num_layers x batch x num_heads x N x N + head_mask = self.get_head_mask(head_mask, self.config.num_layers) + + hidden_states = self.drop(input_embeds + position_embeds) + output_shape = input_shape + (hidden_states.size(-1),) + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + present_key_values = () if use_cache else None + all_self_attentions = () if output_attentions else None + all_hidden_states = () if output_hidden_states else None + + for i, (block, past_layer_key_values) in enumerate(zip(self.layers, past_key_values)): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if self.gradient_checkpointing and self.training: + outputs = self._gradient_checkpointing_func( + block.__call__, + hidden_states, + None, + attention_mask, + head_mask[i], + use_cache, + output_attentions, + ) + else: + outputs = block( + hidden_states, + past_key_values=past_layer_key_values, + attention_mask=attention_mask, + head_mask=head_mask[i], + use_cache=use_cache, + output_attentions=output_attentions, + ) + + hidden_states = outputs[0] + + if use_cache: + present_key_values = present_key_values + (outputs[1],) + + if output_attentions: + all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],) + + hidden_states = self.layernorm_final(hidden_states) + + hidden_states = hidden_states.view(output_shape) + + # Add last hidden state + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + logits = self.lm_head(hidden_states) + + if not return_dict: + return tuple( + v for v in [None, logits, present_key_values, all_hidden_states, all_self_attentions] if v is not None + ) + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=present_key_values, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + @staticmethod + def _reorder_cache( + past_key_values: Tuple[Tuple[torch.Tensor]], beam_idx: torch.Tensor + ) -> Tuple[Tuple[torch.Tensor]]: + """ + This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or + [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct + beam_idx at every generation step. + """ + # Necessary for beam_search + return tuple( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past) + for layer_past in past_key_values + ) + + +@add_start_docstrings( + """Bark semantic (or text) model. It shares the same architecture as the coarse model. + It is a GPT-2 like autoregressive model with a language modeling head on top.""", + BARK_MODEL_START_DOCSTRING.format(config="BarkSemanticConfig"), +) +class BarkSemanticModel(BarkCausalModel): + base_model_prefix = "semantic" + config_class = BarkSemanticConfig + + def generate( + self, + input_ids: torch.Tensor, + semantic_generation_config: BarkSemanticGenerationConfig = None, + history_prompt: Optional[Dict[str, torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, + **kwargs, + ) -> torch.LongTensor: + """ + Generates text semantic tokens from an input prompt and an additional optional `Bark` speaker prompt. + + Args: + input_ids (`Optional[torch.Tensor]` of shape (batch_size, seq_len), *optional*): + Input ids, i.e tokenized input sentences. Will be truncated up to + semantic_generation_config.max_input_semantic_length tokens. Note that the output audios will be as + long as the longest generation among the batch. + semantic_generation_config (`BarkSemanticGenerationConfig`): + Generation config indicating how to generate the semantic tokens. + history_prompt (`Optional[Dict[str,torch.Tensor]]`, *optional*): + Optional `Bark` speaker prompt. + attention_mask (`Optional[torch.Tensor]`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + Returns: + torch.LongTensor: Output semantic tokens. + """ + if semantic_generation_config is None: + raise ValueError("`semantic_generation_config` has to be provided") + + batch_size = input_ids.shape[0] + + max_input_semantic_length = semantic_generation_config.max_input_semantic_length + + input_ids = input_ids + semantic_generation_config.text_encoding_offset + + if attention_mask is not None: + input_ids = input_ids.masked_fill((1 - attention_mask).bool(), semantic_generation_config.text_pad_token) + + if history_prompt is not None: + semantic_history = history_prompt["semantic_prompt"][-max_input_semantic_length:] + semantic_history = nn.functional.pad( + semantic_history, + (0, max_input_semantic_length - len(semantic_history)), + value=semantic_generation_config.semantic_pad_token, + mode="constant", + ) + else: + semantic_history = torch.tensor( + [semantic_generation_config.semantic_pad_token] * max_input_semantic_length, dtype=torch.int + ).to(self.device) + + semantic_history = torch.repeat_interleave(semantic_history[None], batch_size, dim=0) + + infer_array = torch.tensor( + [[semantic_generation_config.semantic_infer_token]] * batch_size, dtype=torch.int + ).to(self.device) + + input_embeds = torch.cat( + [ + self.input_embeds_layer(input_ids[:, :max_input_semantic_length]) + + self.input_embeds_layer(semantic_history[:, : max_input_semantic_length + 1]), + self.input_embeds_layer(infer_array), + ], + dim=1, + ) + + tokens_to_suppress = list( + range(semantic_generation_config.semantic_vocab_size, semantic_generation_config.semantic_pad_token) + ) + tokens_to_suppress.extend( + list(range(semantic_generation_config.semantic_pad_token + 1, self.config.output_vocab_size)) + ) + + suppress_tokens_logits_processor = SuppressTokensLogitsProcessor(tokens_to_suppress, device=input_ids.device) + + min_eos_p = kwargs.get("min_eos_p", semantic_generation_config.min_eos_p) + early_stopping_logits_processor = BarkEosPrioritizerLogitsProcessor( + eos_token_id=semantic_generation_config.eos_token_id, min_eos_p=min_eos_p, device=input_ids.device + ) + + # pass input_ids in order to stay consistent with the transformers generate method even though it is not used + # (except to get the input seq_len - that's why we keep the first 257 tokens) + semantic_output = super().generate( + torch.ones((batch_size, max_input_semantic_length + 1), dtype=torch.int, device=self.device), + input_embeds=input_embeds, + logits_processor=[suppress_tokens_logits_processor, early_stopping_logits_processor], + generation_config=semantic_generation_config, + **kwargs, + ) # size: 10048 + + # take the generated semantic tokens + semantic_output = semantic_output[:, max_input_semantic_length + 1 :] + + return semantic_output + + +@add_start_docstrings( + """Bark coarse acoustics model. + It shares the same architecture as the semantic (or text) model. It is a GPT-2 like autoregressive model with a + language modeling head on top.""", + BARK_MODEL_START_DOCSTRING.format(config="BarkCoarseConfig"), +) +class BarkCoarseModel(BarkCausalModel): + base_model_prefix = "coarse_acoustics" + config_class = BarkCoarseConfig + + def preprocess_histories( + self, + max_coarse_history: int, + semantic_to_coarse_ratio: int, + batch_size: int, + semantic_generation_config: int, + codebook_size: int, + history_prompt: Optional[Dict[str, torch.Tensor]] = None, + ): + """ + Preprocess the optional `Bark` speaker prompts before `self.generate`. + + Args: + max_coarse_history (`int`): + Maximum size of coarse tokens used. + semantic_to_coarse_ratio (`int`): + Ratio of semantic to coarse frequency + batch_size (`int`): + Batch size, i.e the number of samples. + semantic_generation_config (`BarkSemanticGenerationConfig`): + Generation config indicating how to generate the semantic tokens. + codebook_size (`int`): + Codebook channel size, i.e. the size of the output vocabulary per codebook channel. + history_prompt (`Optional[Dict[str,torch.Tensor]]`): + Optional `Bark` speaker prompt. + Returns: Returns: + `tuple(torch.FloatTensor)`: + - **x_semantic_history** (`torch.FloatTensor` -- Processed semantic speaker prompt. + - **x_coarse_history** (`torch.FloatTensor`) -- Processed coarse speaker prompt. + """ + if history_prompt is not None: + x_semantic_history = torch.repeat_interleave(history_prompt["semantic_prompt"][None], batch_size, dim=0) + # clone to avoid modifying history_prompt.coarse_prompt + x_coarse_history = history_prompt["coarse_prompt"].clone() + + # offset x_coarse_history + if codebook_size is not None: + for n in range(1, x_coarse_history.shape[0]): + # offset + x_coarse_history[n, :] += codebook_size * n + + # flatten x_coarse_history + x_coarse_history = torch.transpose(x_coarse_history, 0, 1).reshape(-1) + + x_coarse_history = x_coarse_history + semantic_generation_config.semantic_vocab_size + + x_coarse_history = torch.repeat_interleave(x_coarse_history[None], batch_size, dim=0) + # e.g: after SEMANTIC_VOCAB_SIZE (10000), 1024 tokens dedicated to first codebook, 1024 next tokens + # dedicated to second codebook. + + max_semantic_history = int(np.floor(max_coarse_history / semantic_to_coarse_ratio)) + # trim histories correctly + n_semantic_hist_provided = min( + [ + max_semantic_history, + x_semantic_history.shape[1] - x_semantic_history.shape[1] % 2, + int(np.floor(x_coarse_history.shape[1] / semantic_to_coarse_ratio)), + ] + ) + + n_coarse_hist_provided = int(round(n_semantic_hist_provided * semantic_to_coarse_ratio)) + + x_semantic_history = x_semantic_history[:, -n_semantic_hist_provided:].int() + x_coarse_history = x_coarse_history[:, -n_coarse_hist_provided:].int() + # bit of a hack for time alignment (sounds better) - from Bark original implementation + x_coarse_history = x_coarse_history[:, :-2] + + else: + # shape: (batch_size, 0) + x_semantic_history = torch.tensor([[]] * batch_size, dtype=torch.int, device=self.device) + x_coarse_history = torch.tensor([[]] * batch_size, dtype=torch.int, device=self.device) + + return x_semantic_history, x_coarse_history + + def generate( + self, + semantic_output: torch.Tensor, + semantic_generation_config: BarkSemanticGenerationConfig = None, + coarse_generation_config: BarkCoarseGenerationConfig = None, + codebook_size: int = 1024, + history_prompt: Optional[Dict[str, torch.Tensor]] = None, + return_output_lengths: Optional[bool] = None, + **kwargs, + ) -> Union[torch.LongTensor, Tuple[torch.LongTensor, torch.LongTensor]]: + """ + Generates coarse acoustics tokens from input text semantic tokens and an additional optional `Bark` speaker + prompt. + + Args: + semantic_output (`torch.Tensor` of shape (batch_size, seq_len), *optional*): + Input text semantic ids, i.e the output of `BarkSemanticModel.generate`. + semantic_generation_config (`BarkSemanticGenerationConfig`): + Generation config indicating how to generate the semantic tokens. + coarse_generation_config (`BarkCoarseGenerationConfig`): + Generation config indicating how to generate the coarse tokens. + codebook_size (`int`, *optional*, defaults to 1024): + Codebook channel size, i.e. the size of the output vocabulary per codebook channel. + history_prompt (`Optional[Dict[str,torch.Tensor]]`, *optional*): + Optional `Bark` speaker prompt. + return_output_lengths (`bool`, *optional*): + Whether or not to return the output lengths. Useful when batching. + Returns: + By default: + torch.LongTensor: Output coarse acoustics tokens. + If `return_output_lengths=True`: + `Tuple(torch.Tensor, torch.Tensor): The output coarse acoustics tokens, and the length of each sample + of the batch. + """ + + if semantic_generation_config is None: + raise ValueError("`semantic_generation_config` has to be provided") + + if coarse_generation_config is None: + raise ValueError("`coarse_generation_config` has to be provided") + + max_coarse_input_length = coarse_generation_config.max_coarse_input_length + max_coarse_history = coarse_generation_config.max_coarse_history + sliding_window_len = coarse_generation_config.sliding_window_len + + # replace semantic_pad_token (eos_tok and pad_tok here) with coarse_semantic_pad_token i.e the pad_token + # used in the next model + semantic_output.masked_fill_( + semantic_output == semantic_generation_config.semantic_pad_token, + coarse_generation_config.coarse_semantic_pad_token, + ) + + semantic_to_coarse_ratio = ( + coarse_generation_config.coarse_rate_hz + / semantic_generation_config.semantic_rate_hz + * coarse_generation_config.n_coarse_codebooks + ) + max_semantic_history = int(np.floor(max_coarse_history / semantic_to_coarse_ratio)) + + output_lengths = (semantic_output != coarse_generation_config.coarse_semantic_pad_token).sum(1) + output_lengths = torch.floor( + output_lengths * semantic_to_coarse_ratio / coarse_generation_config.n_coarse_codebooks + ) + output_lengths = torch.round(output_lengths * coarse_generation_config.n_coarse_codebooks).int() + + max_generated_len = torch.max(output_lengths).item() + + batch_size = semantic_output.shape[0] + + x_semantic_history, x_coarse = self.preprocess_histories( + history_prompt=history_prompt, + max_coarse_history=max_coarse_history, + semantic_to_coarse_ratio=semantic_to_coarse_ratio, + batch_size=batch_size, + semantic_generation_config=semantic_generation_config, + codebook_size=codebook_size, + ) + base_semantic_idx = x_semantic_history.shape[1] + + semantic_output = torch.hstack([x_semantic_history, semantic_output]) + + n_window_steps = int(np.ceil(max_generated_len / sliding_window_len)) + + total_generated_len = 0 + + len_coarse_history = x_coarse.shape[1] + + for _ in range(n_window_steps): + semantic_idx = base_semantic_idx + int(round(total_generated_len / semantic_to_coarse_ratio)) + + # pad from right side + input_coarse = semantic_output[:, np.max([0, semantic_idx - max_semantic_history]) :] + input_coarse = input_coarse[:, :max_coarse_input_length] + input_coarse = F.pad( + input_coarse, + (0, max_coarse_input_length - input_coarse.shape[-1]), + "constant", + coarse_generation_config.coarse_semantic_pad_token, + ) + + input_coarse = torch.hstack( + [ + input_coarse, + torch.tensor([[coarse_generation_config.coarse_infer_token]] * batch_size, device=self.device), + x_coarse[:, -max_coarse_history:], + ] + ) + + alternatingLogitsProcessor = AlternatingCodebooksLogitsProcessor( + input_coarse.shape[1], + semantic_generation_config.semantic_vocab_size, + codebook_size, + ) + + output_coarse = super().generate( + input_coarse, + logits_processor=[alternatingLogitsProcessor], + max_new_tokens=min(sliding_window_len, max_generated_len - total_generated_len), + generation_config=coarse_generation_config, + **kwargs, + ) + + input_coarse_len = input_coarse.shape[1] + + x_coarse = torch.hstack([x_coarse, output_coarse[:, input_coarse_len:]]) + total_generated_len = x_coarse.shape[1] - len_coarse_history + + del output_coarse + + coarse_output = x_coarse[:, len_coarse_history:] + + if return_output_lengths: + return coarse_output, output_lengths + + return coarse_output + + +@add_start_docstrings( + """Bark fine acoustics model. It is a non-causal GPT-like model with `config.n_codes_total` embedding layers and + language modeling heads, one for each codebook.""", + BARK_MODEL_START_DOCSTRING.format(config="BarkFineConfig"), +) +class BarkFineModel(BarkPreTrainedModel): + base_model_prefix = "fine_acoustics" + config_class = BarkFineConfig + main_input_name = "codebook_idx" + + def __init__(self, config): + # non-causal gpt-like model with one embedding layer and one lm_head for each codebook of Encodec + super().__init__(config) + self.config = config + + # initialize a modified non causal GPT-like model + # note that for there is one embedding layer and one lm_head for each codebook of Encodec + self.input_embeds_layers = nn.ModuleList( + [nn.Embedding(config.input_vocab_size, config.hidden_size) for _ in range(config.n_codes_total)] + ) + self.position_embeds_layer = nn.Embedding(config.block_size, config.hidden_size) + + self.drop = nn.Dropout(config.dropout) + + self.layers = nn.ModuleList([BarkBlock(config, is_causal=False) for _ in range(config.num_layers)]) + self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2" + + self.layernorm_final = nn.LayerNorm(config.hidden_size) + + self.lm_heads = nn.ModuleList( + [ + nn.Linear(config.hidden_size, config.output_vocab_size, bias=False) + for _ in range(config.n_codes_given, config.n_codes_total) + ] + ) + self.gradient_checkpointing = False + self.n_codes_total = config.n_codes_total + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + # one embedding layers for each codebook + return self.input_embeds_layers + + def set_input_embeddings(self, new_embeddings): + # one embedding layers for each codebook + self.input_embeds_layers = new_embeddings + + def get_output_embeddings(self): + # one lm_head for each codebook + return self.lm_heads + + def set_output_embeddings(self, new_output_embeddings): + # one lm_head for each codebook + self.lm_heads = new_output_embeddings + + def _resize_token_embeddings(self, new_num_tokens, pad_to_multiple_of=None, mean_resizing=True): + old_embeddings_list = self.get_input_embeddings() + new_embeddings_list = nn.ModuleList( + [ + self._get_resized_embeddings(old_embeddings, new_num_tokens, pad_to_multiple_of, mean_resizing) + for old_embeddings in old_embeddings_list + ] + ) + self.set_input_embeddings(new_embeddings_list) + new_num_tokens = new_embeddings_list[0].weight.shape[0] + + # if word embeddings are not tied, make sure that lm head is resized as well + if self.get_output_embeddings() is not None and not self.config.tie_word_embeddings: + old_lm_head_list = self.get_output_embeddings() + new_lm_head_list = nn.ModuleList( + [self._get_resized_lm_head(old_lm_head, new_num_tokens) for old_lm_head in old_lm_head_list] + ) + self.set_output_embeddings(new_lm_head_list) + + return self.get_input_embeddings() + + def resize_token_embeddings( + self, + new_num_tokens: Optional[int] = None, + pad_to_multiple_of: Optional[int] = None, + mean_resizing: bool = True, + ) -> nn.Embedding: + """ + Resizes input token embeddings matrix of the model if `new_num_tokens != config.vocab_size`. + + Takes care of tying weights embeddings afterwards if the model class has a `tie_weights()` method. + + Arguments: + new_num_tokens (`int`, *optional*): + The number of new tokens in the embedding matrix. Increasing the size will add newly initialized + vectors at the end. Reducing the size will remove vectors from the end. If not provided or `None`, just + returns a pointer to the input tokens `torch.nn.Embedding` module of the model without doing anything. + pad_to_multiple_of (`int`, *optional*): + If set will pad the embedding matrix to a multiple of the provided value. + + This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability + `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128. For more + details about this, or help on choosing the correct value for resizing, refer to this guide: + https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc + mean_resizing (`bool`): + Whether to initialize the added embeddings from a multivariate normal distribution that has old embeddings' mean and + covariance or to initialize them with a normal distribution that has a mean of zero and std equals `config.initializer_range`. + + Setting `mean_resizing` to `True` is useful when increasing the size of the embeddings of causal language models, + where the generated tokens' probabilities won't be affected by the added embeddings because initializing the new embeddings with the + old embeddings' mean will reduce the kl-divergence between the next token probability before and after adding the new embeddings. + Refer to this article for more information: https://nlp.stanford.edu/~johnhew/vocab-expansion.html + + Return: + `torch.nn.Embedding`: Pointer to the input tokens Embeddings Module of the model. + """ + model_embeds = self._resize_token_embeddings(new_num_tokens, pad_to_multiple_of, mean_resizing) + if new_num_tokens is None and pad_to_multiple_of is None: + return model_embeds + + # Update base model and current model config + self.config.output_vocab_size = model_embeds[0].weight.shape[0] + self.config.vocab_size = model_embeds[0].weight.shape[0] + self.output_vocab_size = model_embeds[0].weight.shape[0] + self.vocab_size = model_embeds[0].weight.shape[0] + + # Tie weights again if needed + self.tie_weights() + + return model_embeds + + def _tie_weights(self): + if getattr(self.config, "tie_word_embeddings", True): + self._tied_weights_keys = [] + output_embeddings = self.get_output_embeddings() + input_embeddings = self.get_input_embeddings() + + for i in range(self.config.n_codes_total - self.config.n_codes_given): + # self.input_embeds_layers[i + 1].weight = self.lm_heads[i].weight + self._tie_or_clone_weights(output_embeddings[i], input_embeddings[i + 1]) + self._tied_weights_keys.append(f"lm_heads.{i}.weight") + + def tie_weights(self): + """ + Tie the weights between the input embeddings list and the output embeddings list. + + If the `torchscript` flag is set in the configuration, can't handle parameter sharing so we are cloning the + weights instead. + """ + if getattr(self.config, "tie_word_embeddings", True): + self._tied_weights_keys = [] + output_embeddings = self.get_output_embeddings() + input_embeddings = self.get_input_embeddings() + + for i in range(self.config.n_codes_total - self.config.n_codes_given): + # self.input_embeds_layers[i + 1].weight = self.lm_heads[i].weight + self._tie_or_clone_weights(output_embeddings[i], input_embeddings[i + 1]) + self._tied_weights_keys.append(f"lm_heads.{i}.weight") + + for module in self.modules(): + if hasattr(module, "_tie_weights"): + module._tie_weights() + + @add_start_docstrings_to_model_forward(BARK_FINE_INPUTS_DOCSTRING) + def forward( + self, + codebook_idx: int, # an additionnal idx corresponding to the id of the codebook that will be predicted + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.LongTensor] = None, + input_embeds: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + loss = None + if labels is not None: + raise NotImplementedError("Training is not implemented yet") + + if codebook_idx == 0: + raise ValueError("Cannot predict 0th codebook - 0th codebook should be predicted by the coarse model") + + if input_ids is not None and input_embeds is not None: + raise ValueError("You cannot specify both input_ids and input_embeds at the same time") + + if input_ids is None and input_embeds is None: + raise ValueError("You have to specify either input_ids or input_embeds") + + if input_ids is not None: + # the input_embeddings are the sum of the j previous codebooks embeddings before + # the current codebook_idx codebook + + # forward the GPT model itself + input_embeds = [ + input_embeds_layer(input_ids[:, :, i]).unsqueeze(-1) + for i, input_embeds_layer in enumerate(self.input_embeds_layers) + ] # token embeddings of shape (b, t, n_embd) + input_embeds = torch.cat(input_embeds, dim=-1) + input_embeds = input_embeds[:, :, :, : codebook_idx + 1].sum(dim=-1) + + input_shape = input_embeds.size()[:-1] + batch_size = input_embeds.shape[0] + seq_length = input_shape[1] + + device = input_ids.device if input_ids is not None else input_embeds.device + + if position_ids is None: + position_ids = torch.arange(0, seq_length, dtype=torch.long, device=device) + position_ids = position_ids.unsqueeze(0) # shape (1, seq_length) + + position_embeds = self.position_embeds_layer(position_ids) # position embeddings of shape (1, t, n_embd) + + # Attention mask. + if attention_mask is not None: + if batch_size <= 0: + raise ValueError("batch_size has to be defined and > 0") + if self._use_flash_attention_2: + attention_mask = attention_mask if 0 in attention_mask else None + else: + # [bsz, to_seq_length] -> [bsz, 1, 1, to_seq_length] + # from_seq_length is 1 to easily broadcast + attention_mask = _prepare_4d_attention_mask(attention_mask, input_embeds.dtype, tgt_len=1) + + head_mask = self.get_head_mask(head_mask, self.config.num_layers) + + hidden_states = self.drop(input_embeds + position_embeds) + output_shape = input_shape + (hidden_states.size(-1),) + + all_self_attentions = () if output_attentions else None + all_hidden_states = () if output_hidden_states else None + + for i, block in enumerate(self.layers): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + outputs = block( + hidden_states, + attention_mask=attention_mask, + head_mask=head_mask[i], + output_attentions=output_attentions, + ) + + hidden_states = outputs[0] + + if output_attentions: + all_self_attentions = all_self_attentions + (outputs[1],) + + hidden_states = self.layernorm_final(hidden_states) + hidden_states = hidden_states.view(output_shape) + + # Add last hidden state + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + logits = self.lm_heads[codebook_idx - self.config.n_codes_given](hidden_states) + + if not return_dict: + return tuple(v for v in [None, logits, all_hidden_states, all_self_attentions] if v is not None) + + return MaskedLMOutput( + loss=loss, + logits=logits, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + def generate( + self, + coarse_output: torch.Tensor, + semantic_generation_config: BarkSemanticGenerationConfig = None, + coarse_generation_config: BarkCoarseGenerationConfig = None, + fine_generation_config: BarkFineGenerationConfig = None, + codebook_size: int = 1024, + history_prompt: Optional[Dict[str, torch.Tensor]] = None, + **kwargs, + ) -> torch.LongTensor: + """ + Generates fine acoustics tokens from input coarse acoustics tokens and an additional optional `Bark` speaker + prompt. + + Args: + coarse_output (`torch.Tensor` of shape (batch_size, seq_len)): + Input coarse acoustics ids, i.e the output of `BarkCoarseModel.generate`. + semantic_generation_config (`BarkSemanticGenerationConfig`): + Generation config indicating how to generate the semantic tokens. + coarse_generation_config (`BarkCoarseGenerationConfig`): + Generation config indicating how to generate the coarse tokens. + fine_generation_config (`BarkFineGenerationConfig`): + Generation config indicating how to generate the fine tokens. + codebook_size (`int`, *optional*, defaults to 1024): + Codebook channel size, i.e. the size of the output vocabulary per codebook channel. + history_prompt (`Optional[Dict[str,torch.Tensor]]`, *optional*): + Optional `Bark` speaker prompt. + Returns: + torch.LongTensor: Output fine acoustics tokens. + """ + if semantic_generation_config is None: + raise ValueError("`semantic_generation_config` has to be provided") + + if coarse_generation_config is None: + raise ValueError("`coarse_generation_config` has to be provided") + + if fine_generation_config is None: + raise ValueError("`fine_generation_config` has to be provided") + + # since we don't really use GenerationConfig through the fine model (autoencoder) + # and since only temperature is used from the classic GenerationConfig parameters + # manually impose the kwargs priority over the generation config + temperature = kwargs.get("temperature", fine_generation_config.temperature) + + max_fine_history_length = fine_generation_config.max_fine_history_length + max_fine_input_length = fine_generation_config.max_fine_input_length + + # shape: (batch, n_coarse_codebooks * seq_len) + # new_shape: (batch, seq_len, n_coarse_codebooks) + coarse_output = coarse_output.view(coarse_output.shape[0], -1, coarse_generation_config.n_coarse_codebooks) + + # brings ids into the range [0, codebook_size -1] + coarse_output = torch.remainder(coarse_output - semantic_generation_config.semantic_vocab_size, codebook_size) + batch_size = coarse_output.shape[0] + + if history_prompt is not None: + x_fine_history = torch.repeat_interleave(history_prompt["fine_prompt"].T[None], batch_size, dim=0) + # transpose to get to shape (seq_len, n_fine_codebooks) + else: + x_fine_history = None + + n_coarse = coarse_generation_config.n_coarse_codebooks + + # pad the last 6th codebooks + fine_input = F.pad( + coarse_output, + (0, fine_generation_config.n_fine_codebooks - n_coarse), + "constant", + codebook_size, + ) + + # prepend history if available (max max_fine_history_length) + if x_fine_history is not None: + fine_input = torch.cat([x_fine_history[:, -max_fine_history_length:, :], fine_input], dim=1) + + # len of the fine_history that has been added to fine_input + n_history = x_fine_history[:, -max_fine_history_length:, :].shape[1] + else: + n_history = 0 + + n_remove_from_end = 0 + # need to pad if too short (since non-causal model) + if fine_input.shape[1] < max_fine_input_length: + n_remove_from_end = max_fine_input_length - fine_input.shape[1] + fine_input = F.pad(fine_input, (0, 0, 0, n_remove_from_end), mode="constant", value=codebook_size) + + # we can be lazy about fractional loop and just keep overwriting codebooks. + # seems that coarse_output.shape[1] - (max_fine_input_length - n_history) is equal to minus n_remove_from_end + # So if we needed to pad because too short, n_loops is always 1 (because n_remove_from_end > 0) + # If not, we loop over at least twice. + + n_loops = (coarse_output.shape[1] - (max_fine_input_length - n_history)) / max_fine_history_length + n_loops = int(np.ceil(n_loops)) + n_loops = max(0, n_loops) + 1 + + for n_outer in range(n_loops): + start_idx = min([n_outer * max_fine_history_length, fine_input.shape[1] - max_fine_input_length]) + + start_fill_idx = min( + [n_history + n_outer * max_fine_history_length, fine_input.shape[1] - max_fine_history_length] + ) + rel_start_fill_idx = start_fill_idx - start_idx + input_buffer = fine_input[:, start_idx : start_idx + max_fine_input_length, :] + for n_inner in range(n_coarse, fine_generation_config.n_fine_codebooks): + logits = self.forward(n_inner, input_buffer).logits + if temperature is None or temperature == 1.0: + relevant_logits = logits[:, rel_start_fill_idx:, :codebook_size] + codebook_preds = torch.argmax(relevant_logits, -1) + else: + relevant_logits = logits[:, :, :codebook_size] / temperature + # apply softmax + probs = F.softmax(relevant_logits, dim=-1)[:, rel_start_fill_idx:max_fine_input_length] + # reshape to 2D: (batch_size, seq_len, codebook_size) -> (batch_size*seq_len, codebook_size) + probs = probs.reshape((-1, codebook_size)) + # multinomial then reshape : (batch_size*seq_len)-> (batch_size,seq_len) + codebook_preds = torch.multinomial(probs, num_samples=1).view(batch_size, -1) + codebook_preds = codebook_preds.to(torch.int32) + input_buffer[:, rel_start_fill_idx:, n_inner] = codebook_preds + del logits, codebook_preds + + # transfer into fine_input + for n_inner in range(n_coarse, fine_generation_config.n_fine_codebooks): + fine_input[ + :, start_fill_idx : start_fill_idx + (max_fine_input_length - rel_start_fill_idx), n_inner + ] = input_buffer[:, rel_start_fill_idx:, n_inner] + del input_buffer + + fine_input = fine_input.transpose(1, 2)[:, :, n_history:] + if n_remove_from_end > 0: + fine_input = fine_input[:, :, :-n_remove_from_end] + + if fine_input.shape[-1] != coarse_output.shape[-2]: + raise ValueError("input and output should have the same seq_len") + + return fine_input + + +@add_start_docstrings( + """ + The full Bark model, a text-to-speech model composed of 4 sub-models: + - [`BarkSemanticModel`] (also referred to as the 'text' model): a causal auto-regressive transformer model that + takes + as input tokenized text, and predicts semantic text tokens that capture the meaning of the text. + - [`BarkCoarseModel`] (also refered to as the 'coarse acoustics' model), also a causal autoregressive transformer, + that takes into input the results of the last model. It aims at regressing the first two audio codebooks necessary + to `encodec`. + - [`BarkFineModel`] (the 'fine acoustics' model), this time a non-causal autoencoder transformer, which iteratively + predicts the last codebooks based on the sum of the previous codebooks embeddings. + - having predicted all the codebook channels from the [`EncodecModel`], Bark uses it to decode the output audio + array. + + It should be noted that each of the first three modules can support conditional speaker embeddings to condition the + output sound according to specific predefined voice. + """, + BARK_START_DOCSTRING, +) +class BarkModel(BarkPreTrainedModel): + config_class = BarkConfig + + def __init__(self, config): + super().__init__(config) + + self.semantic = BarkSemanticModel(config.semantic_config) + self.coarse_acoustics = BarkCoarseModel(config.coarse_acoustics_config) + self.fine_acoustics = BarkFineModel(config.fine_acoustics_config) + + self.codec_model = AutoModel.from_config(config.codec_config) + + self.config = config + + @classmethod + def can_generate(cls) -> bool: + # Bark has a unique model structure, where the external class (`BarkModel`) doesn't need to inherit from + # `GenerationMixin` (it has a non-standard generation method), but one of the internal models do + # (`BarkSemanticModel`). This means that the base `can_generate()` will return `False`, but we need to + # override it so as to do `GenerationConfig` handling in multiple parts of the codebase. + return True + + @property + def device(self) -> torch.device: + """ + `torch.device`: The device on which the module is (assuming that all the module parameters are on the same + device). + """ + # for bark_model, device must be verified on its sub-models + # if has _hf_hook, has been offloaded so the device has to be found in the hook + if not hasattr(self.semantic, "_hf_hook"): + return get_parameter_device(self) + for module in self.semantic.modules(): + if ( + hasattr(module, "_hf_hook") + and hasattr(module._hf_hook, "execution_device") + and module._hf_hook.execution_device is not None + ): + return torch.device(module._hf_hook.execution_device) + + def enable_cpu_offload( + self, + accelerator_id: Optional[int] = 0, + **kwargs, + ): + r""" + Offloads all sub-models to CPU using accelerate, reducing memory usage with a low impact on performance. This + method moves one whole sub-model at a time to the accelerator when it is used, and the sub-model remains in accelerator until the next sub-model runs. + + Args: + accelerator_id (`int`, *optional*, defaults to 0): + accelerator id on which the sub-models will be loaded and offloaded. This argument is deprecated. + kwargs (`dict`, *optional*): + additional keyword arguments: + `gpu_id`: accelerator id on which the sub-models will be loaded and offloaded. + """ + if is_accelerate_available(): + from accelerate import cpu_offload_with_hook + else: + raise ImportError("`enable_model_cpu_offload` requires `accelerate`.") + + gpu_id = kwargs.get("gpu_id", 0) + + if gpu_id != 0: + warnings.warn( + "The argument `gpu_id` is deprecated and will be removed in version 4.54.0 of Transformers. Please use `accelerator_id` instead.", + FutureWarning, + ) + accelerator_id = gpu_id + + device_type = "cuda" + if is_torch_accelerator_available(): + device_type = torch.accelerator.current_accelerator().type + device = torch.device(f"{device_type}:{accelerator_id}") + + torch_accelerator_module = getattr(torch, device_type) + if self.device.type != "cpu": + self.to("cpu") + torch_accelerator_module.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + # this layer is used outside the first foward pass of semantic so need to be loaded before semantic + self.semantic.input_embeds_layer, _ = cpu_offload_with_hook(self.semantic.input_embeds_layer, device) + + hook = None + for cpu_offloaded_model in [ + self.semantic, + self.coarse_acoustics, + self.fine_acoustics, + ]: + _, hook = cpu_offload_with_hook(cpu_offloaded_model, device, prev_module_hook=hook) + + self.fine_acoustics_hook = hook + + _, hook = cpu_offload_with_hook(self.codec_model, device, prev_module_hook=hook) + + # We'll offload the last model manually. + self.codec_model_hook = hook + + def codec_decode(self, fine_output, output_lengths=None): + """Turn quantized audio codes into audio array using encodec.""" + + fine_output = fine_output.transpose(0, 1) + emb = self.codec_model.quantizer.decode(fine_output) + + if output_lengths is not None: + # encodec uses LSTMs which behaves differently with appended padding + # decoding with encodec takes around 0.1% of the total generation time + # to keep generation quality, we break batching + out = [sample[:, :l].unsqueeze(0) for (sample, l) in zip(emb, output_lengths)] + audio_arr = [self.codec_model.decoder(sample).squeeze() for sample in out] + else: + out = self.codec_model.decoder(emb) + audio_arr = out.squeeze(1) # squeeze the codebook dimension + + return audio_arr + + @torch.no_grad() + def generate( + self, + input_ids: Optional[torch.Tensor] = None, + history_prompt: Optional[Dict[str, torch.Tensor]] = None, + return_output_lengths: Optional[bool] = None, + **kwargs, + ) -> torch.LongTensor: + """ + Generates audio from an input prompt and an additional optional `Bark` speaker prompt. + + Args: + input_ids (`Optional[torch.Tensor]` of shape (batch_size, seq_len), *optional*): + Input ids. Will be truncated up to 256 tokens. Note that the output audios will be as long as the + longest generation among the batch. + history_prompt (`Optional[Dict[str,torch.Tensor]]`, *optional*): + Optional `Bark` speaker prompt. Note that for now, this model takes only one speaker prompt per batch. + kwargs (*optional*): Remaining dictionary of keyword arguments. Keyword arguments are of two types: + + - Without a prefix, they will be entered as `**kwargs` for the `generate` method of each sub-model. + - With a *semantic_*, *coarse_*, *fine_* prefix, they will be input for the `generate` method of the + semantic, coarse and fine respectively. It has the priority over the keywords without a prefix. + + This means you can, for example, specify a generation strategy for all sub-models except one. + return_output_lengths (`bool`, *optional*): + Whether or not to return the waveform lengths. Useful when batching. + Returns: + By default: + - **audio_waveform** (`torch.Tensor` of shape (batch_size, seq_len)): Generated audio waveform. + When `return_output_lengths=True`: + Returns a tuple made of: + - **audio_waveform** (`torch.Tensor` of shape (batch_size, seq_len)): Generated audio waveform. + - **output_lengths** (`torch.Tensor` of shape (batch_size)): The length of each waveform in the batch + Example: + + ```python + >>> from transformers import AutoProcessor, BarkModel + + >>> processor = AutoProcessor.from_pretrained("suno/bark-small") + >>> model = BarkModel.from_pretrained("suno/bark-small") + + >>> # To add a voice preset, you can pass `voice_preset` to `BarkProcessor.__call__(...)` + >>> voice_preset = "v2/en_speaker_6" + + >>> inputs = processor("Hello, my dog is cute, I need him in my life", voice_preset=voice_preset) + + >>> audio_array = model.generate(**inputs, semantic_max_new_tokens=100) + >>> audio_array = audio_array.cpu().numpy().squeeze() + ``` + """ + # TODO (joao):workaround until nested generation config is compatible with PreTrained Model + # todo: dict + semantic_generation_config = BarkSemanticGenerationConfig(**self.generation_config.semantic_config) + coarse_generation_config = BarkCoarseGenerationConfig(**self.generation_config.coarse_acoustics_config) + fine_generation_config = BarkFineGenerationConfig(**self.generation_config.fine_acoustics_config) + + kwargs_semantic = { + # if "attention_mask" is set, it should not be passed to CoarseModel and FineModel + "attention_mask": kwargs.pop("attention_mask", None), + "min_eos_p": kwargs.pop("min_eos_p", None), + } + kwargs_coarse = {} + kwargs_fine = {} + for key, value in kwargs.items(): + if key.startswith("semantic_"): + key = key[len("semantic_") :] + kwargs_semantic[key] = value + elif key.startswith("coarse_"): + key = key[len("coarse_") :] + kwargs_coarse[key] = value + elif key.startswith("fine_"): + key = key[len("fine_") :] + kwargs_fine[key] = value + else: + # If the key is already in a specific config, then it's been set with a + # submodules specific value and we don't override + if key not in kwargs_semantic: + kwargs_semantic[key] = value + if key not in kwargs_coarse: + kwargs_coarse[key] = value + if key not in kwargs_fine: + kwargs_fine[key] = value + + # 1. Generate from the semantic model + if "generation_config" in kwargs_semantic: + kwargs_semantic.pop("generation_config") + semantic_output = self.semantic.generate( + input_ids, + history_prompt=history_prompt, + semantic_generation_config=semantic_generation_config, + **kwargs_semantic, + ) + + # 2. Generate from the coarse model + if "generation_config" in kwargs_coarse: + kwargs_coarse.pop("generation_config") + coarse_output = self.coarse_acoustics.generate( + semantic_output, + history_prompt=history_prompt, + semantic_generation_config=semantic_generation_config, + coarse_generation_config=coarse_generation_config, + codebook_size=self.generation_config.codebook_size, + return_output_lengths=return_output_lengths, + **kwargs_coarse, + ) + + output_lengths = None + if return_output_lengths: + coarse_output, output_lengths = coarse_output + # (batch_size, seq_len*coarse_codebooks) -> (batch_size, seq_len) + output_lengths = output_lengths // coarse_generation_config.n_coarse_codebooks + + # 3. "generate" from the fine model + if "generation_config" in kwargs_fine: + kwargs_fine.pop("generation_config") + output = self.fine_acoustics.generate( + coarse_output, + history_prompt=history_prompt, + semantic_generation_config=semantic_generation_config, + coarse_generation_config=coarse_generation_config, + fine_generation_config=fine_generation_config, + codebook_size=self.generation_config.codebook_size, + **kwargs_fine, + ) + + if getattr(self, "fine_acoustics_hook", None) is not None: + # Manually offload fine_acoustics to CPU + # and load codec_model to GPU + # since bark doesn't use codec_model forward pass + self.fine_acoustics_hook.offload() + self.codec_model = self.codec_model.to(self.device) + + # 4. Decode the output and generate audio array + audio = self.codec_decode(output, output_lengths) + + if getattr(self, "codec_model_hook", None) is not None: + # Offload codec_model to CPU + self.codec_model_hook.offload() + + if return_output_lengths: + output_lengths = [len(sample) for sample in audio] + audio = nn.utils.rnn.pad_sequence(audio, batch_first=True, padding_value=0) + return audio, output_lengths + + return audio + + @classmethod + def _check_and_enable_flash_attn_2( + cls, + config, + torch_dtype: Optional[torch.dtype] = None, + device_map: Optional[Union[str, Dict[str, int]]] = None, + hard_check_only: bool = False, + check_device_map: bool = False, + ): + """ + `_check_and_enable_flash_attn_2` originally don't expand flash attention enabling to the model + sub-configurations. We override the original method to make sure that Bark sub-models are using Flash Attention + if necessary. + + If you don't know about Flash Attention, check out the official repository of flash attention: + https://github.com/Dao-AILab/flash-attention + + For using Flash Attention 1.0 you can do it directly via the `BetterTransformer` API, have a look at this + specific section of the documentation to learn more about it: + https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#decoder-models + + The method checks if the current setup is compatible with Flash Attention as it requires the model to be in + half precision and not ran on CPU. + + If all checks pass and `hard_check_only` is False, the method will set the config attribute `_attn_implementation` to "flash_attention_2" so that the model + can initialize the correct attention module + """ + config = super()._check_and_enable_flash_attn_2( + config, torch_dtype, device_map, hard_check_only=hard_check_only, check_device_map=check_device_map + ) + + config.semantic_config._attn_implementation = config._attn_implementation + config.coarse_acoustics_config._attn_implementation = config._attn_implementation + config.fine_acoustics_config._attn_implementation = config._attn_implementation + return config + + +__all__ = [ + "BarkFineModel", + "BarkSemanticModel", + "BarkCoarseModel", + "BarkModel", + "BarkPreTrainedModel", + "BarkCausalModel", +] diff --git a/docs/transformers/build/lib/transformers/models/bark/processing_bark.py b/docs/transformers/build/lib/transformers/models/bark/processing_bark.py new file mode 100644 index 0000000000000000000000000000000000000000..d2ec0629e42146f0ce743c3129c32b58a3fa4f47 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bark/processing_bark.py @@ -0,0 +1,296 @@ +# coding=utf-8 +# Copyright 2023 The Suno AI Authors and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Processor class for Bark +""" + +import json +import os +from typing import Optional + +import numpy as np + +from ...feature_extraction_utils import BatchFeature +from ...processing_utils import ProcessorMixin +from ...utils import logging +from ...utils.hub import cached_file +from ..auto import AutoTokenizer + + +logger = logging.get_logger(__name__) + + +class BarkProcessor(ProcessorMixin): + r""" + Constructs a Bark processor which wraps a text tokenizer and optional Bark voice presets into a single processor. + + Args: + tokenizer ([`PreTrainedTokenizer`]): + An instance of [`PreTrainedTokenizer`]. + speaker_embeddings (`Dict[Dict[str]]`, *optional*): + Optional nested speaker embeddings dictionary. The first level contains voice preset names (e.g + `"en_speaker_4"`). The second level contains `"semantic_prompt"`, `"coarse_prompt"` and `"fine_prompt"` + embeddings. The values correspond to the path of the corresponding `np.ndarray`. See + [here](https://suno-ai.notion.site/8b8e8749ed514b0cbf3f699013548683?v=bc67cff786b04b50b3ceb756fd05f68c) for + a list of `voice_preset_names`. + + """ + + tokenizer_class = "AutoTokenizer" + attributes = ["tokenizer"] + + preset_shape = { + "semantic_prompt": 1, + "coarse_prompt": 2, + "fine_prompt": 2, + } + + def __init__(self, tokenizer, speaker_embeddings=None): + super().__init__(tokenizer) + + self.speaker_embeddings = speaker_embeddings + + @classmethod + def from_pretrained( + cls, pretrained_processor_name_or_path, speaker_embeddings_dict_path="speaker_embeddings_path.json", **kwargs + ): + r""" + Instantiate a Bark processor associated with a pretrained model. + + Args: + pretrained_model_name_or_path (`str` or `os.PathLike`): + This can be either: + + - a string, the *model id* of a pretrained [`BarkProcessor`] hosted inside a model repo on + huggingface.co. + - a path to a *directory* containing a processor saved using the [`~BarkProcessor.save_pretrained`] + method, e.g., `./my_model_directory/`. + speaker_embeddings_dict_path (`str`, *optional*, defaults to `"speaker_embeddings_path.json"`): + The name of the `.json` file containing the speaker_embeddings dictionary located in + `pretrained_model_name_or_path`. If `None`, no speaker_embeddings is loaded. + **kwargs + Additional keyword arguments passed along to both + [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`]. + """ + + if speaker_embeddings_dict_path is not None: + speaker_embeddings_path = cached_file( + pretrained_processor_name_or_path, + speaker_embeddings_dict_path, + subfolder=kwargs.pop("subfolder", None), + cache_dir=kwargs.pop("cache_dir", None), + force_download=kwargs.pop("force_download", False), + proxies=kwargs.pop("proxies", None), + resume_download=kwargs.pop("resume_download", None), + local_files_only=kwargs.pop("local_files_only", False), + token=kwargs.pop("use_auth_token", None), + revision=kwargs.pop("revision", None), + _raise_exceptions_for_gated_repo=False, + _raise_exceptions_for_missing_entries=False, + _raise_exceptions_for_connection_errors=False, + ) + if speaker_embeddings_path is None: + logger.warning( + f"""`{os.path.join(pretrained_processor_name_or_path, speaker_embeddings_dict_path)}` does not exists + , no preloaded speaker embeddings will be used - Make sure to provide a correct path to the json + dictionary if wanted, otherwise set `speaker_embeddings_dict_path=None`.""" + ) + speaker_embeddings = None + else: + with open(speaker_embeddings_path) as speaker_embeddings_json: + speaker_embeddings = json.load(speaker_embeddings_json) + else: + speaker_embeddings = None + + tokenizer = AutoTokenizer.from_pretrained(pretrained_processor_name_or_path, **kwargs) + + return cls(tokenizer=tokenizer, speaker_embeddings=speaker_embeddings) + + def save_pretrained( + self, + save_directory, + speaker_embeddings_dict_path="speaker_embeddings_path.json", + speaker_embeddings_directory="speaker_embeddings", + push_to_hub: bool = False, + **kwargs, + ): + """ + Saves the attributes of this processor (tokenizer...) in the specified directory so that it can be reloaded + using the [`~BarkProcessor.from_pretrained`] method. + + Args: + save_directory (`str` or `os.PathLike`): + Directory where the tokenizer files and the speaker embeddings will be saved (directory will be created + if it does not exist). + speaker_embeddings_dict_path (`str`, *optional*, defaults to `"speaker_embeddings_path.json"`): + The name of the `.json` file that will contains the speaker_embeddings nested path dictionary, if it + exists, and that will be located in `pretrained_model_name_or_path/speaker_embeddings_directory`. + speaker_embeddings_directory (`str`, *optional*, defaults to `"speaker_embeddings/"`): + The name of the folder in which the speaker_embeddings arrays will be saved. + push_to_hub (`bool`, *optional*, defaults to `False`): + Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the + repository you want to push to with `repo_id` (will default to the name of `save_directory` in your + namespace). + kwargs: + Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method. + """ + if self.speaker_embeddings is not None: + os.makedirs(os.path.join(save_directory, speaker_embeddings_directory, "v2"), exist_ok=True) + + embeddings_dict = {} + + embeddings_dict["repo_or_path"] = save_directory + + for prompt_key in self.speaker_embeddings: + if prompt_key != "repo_or_path": + voice_preset = self._load_voice_preset(prompt_key) + + tmp_dict = {} + for key in self.speaker_embeddings[prompt_key]: + np.save( + os.path.join( + embeddings_dict["repo_or_path"], speaker_embeddings_directory, f"{prompt_key}_{key}" + ), + voice_preset[key], + allow_pickle=False, + ) + tmp_dict[key] = os.path.join(speaker_embeddings_directory, f"{prompt_key}_{key}.npy") + + embeddings_dict[prompt_key] = tmp_dict + + with open(os.path.join(save_directory, speaker_embeddings_dict_path), "w") as fp: + json.dump(embeddings_dict, fp) + + super().save_pretrained(save_directory, push_to_hub, **kwargs) + + def _load_voice_preset(self, voice_preset: Optional[str] = None, **kwargs): + voice_preset_paths = self.speaker_embeddings[voice_preset] + + voice_preset_dict = {} + for key in ["semantic_prompt", "coarse_prompt", "fine_prompt"]: + if key not in voice_preset_paths: + raise ValueError( + f"Voice preset unrecognized, missing {key} as a key in self.speaker_embeddings[{voice_preset}]." + ) + + path = cached_file( + self.speaker_embeddings.get("repo_or_path", "/"), + voice_preset_paths[key], + subfolder=kwargs.pop("subfolder", None), + cache_dir=kwargs.pop("cache_dir", None), + force_download=kwargs.pop("force_download", False), + proxies=kwargs.pop("proxies", None), + resume_download=kwargs.pop("resume_download", None), + local_files_only=kwargs.pop("local_files_only", False), + token=kwargs.pop("use_auth_token", None), + revision=kwargs.pop("revision", None), + _raise_exceptions_for_gated_repo=False, + _raise_exceptions_for_missing_entries=False, + _raise_exceptions_for_connection_errors=False, + ) + if path is None: + raise ValueError( + f"""`{os.path.join(self.speaker_embeddings.get("repo_or_path", "/"), voice_preset_paths[key])}` does not exists + , no preloaded voice preset will be used - Make sure to provide correct paths to the {voice_preset} + embeddings.""" + ) + + voice_preset_dict[key] = np.load(path) + + return voice_preset_dict + + def _validate_voice_preset_dict(self, voice_preset: Optional[dict] = None): + for key in ["semantic_prompt", "coarse_prompt", "fine_prompt"]: + if key not in voice_preset: + raise ValueError(f"Voice preset unrecognized, missing {key} as a key.") + + if not isinstance(voice_preset[key], np.ndarray): + raise TypeError(f"{key} voice preset must be a {str(self.preset_shape[key])}D ndarray.") + + if len(voice_preset[key].shape) != self.preset_shape[key]: + raise ValueError(f"{key} voice preset must be a {str(self.preset_shape[key])}D ndarray.") + + def __call__( + self, + text=None, + voice_preset=None, + return_tensors="pt", + max_length=256, + add_special_tokens=False, + return_attention_mask=True, + return_token_type_ids=False, + **kwargs, + ): + """ + Main method to prepare for the model one or several sequences(s). This method forwards the `text` and `kwargs` + arguments to the AutoTokenizer's [`~AutoTokenizer.__call__`] to encode the text. The method also proposes a + voice preset which is a dictionary of arrays that conditions `Bark`'s output. `kwargs` arguments are forwarded + to the tokenizer and to `cached_file` method if `voice_preset` is a valid filename. + + Args: + text (`str`, `List[str]`, `List[List[str]]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + voice_preset (`str`, `Dict[np.ndarray]`): + The voice preset, i.e the speaker embeddings. It can either be a valid voice_preset name, e.g + `"en_speaker_1"`, or directly a dictionary of `np.ndarray` embeddings for each submodel of `Bark`. Or + it can be a valid file name of a local `.npz` single voice preset. + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Acceptable values are: + + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + + Returns: + Tuple([`BatchEncoding`], [`BatchFeature`]): A tuple composed of a [`BatchEncoding`], i.e the output of the + `tokenizer` and a [`BatchFeature`], i.e the voice preset with the right tensors type. + """ + if voice_preset is not None and not isinstance(voice_preset, dict): + if ( + isinstance(voice_preset, str) + and self.speaker_embeddings is not None + and voice_preset in self.speaker_embeddings + ): + voice_preset = self._load_voice_preset(voice_preset) + + else: + if isinstance(voice_preset, str) and not voice_preset.endswith(".npz"): + voice_preset = voice_preset + ".npz" + + voice_preset = np.load(voice_preset) + + if voice_preset is not None: + self._validate_voice_preset_dict(voice_preset, **kwargs) + voice_preset = BatchFeature(data=voice_preset, tensor_type=return_tensors) + + encoded_text = self.tokenizer( + text, + return_tensors=return_tensors, + padding="max_length", + max_length=max_length, + return_attention_mask=return_attention_mask, + return_token_type_ids=return_token_type_ids, + add_special_tokens=add_special_tokens, + **kwargs, + ) + + if voice_preset is not None: + encoded_text["history_prompt"] = voice_preset + + return encoded_text + + +__all__ = ["BarkProcessor"] diff --git a/docs/transformers/build/lib/transformers/models/bart/__init__.py b/docs/transformers/build/lib/transformers/models/bart/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8f4c713f4698d7c901ed06738efc8983e317805c --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bart/__init__.py @@ -0,0 +1,31 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_bart import * + from .modeling_bart import * + from .modeling_flax_bart import * + from .modeling_tf_bart import * + from .tokenization_bart import * + from .tokenization_bart_fast import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/docs/transformers/build/lib/transformers/models/bart/configuration_bart.py b/docs/transformers/build/lib/transformers/models/bart/configuration_bart.py new file mode 100644 index 0000000000000000000000000000000000000000..4ce4316e3c03150d1845b7a3f96409ae4b637e55 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bart/configuration_bart.py @@ -0,0 +1,405 @@ +# coding=utf-8 +# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""BART model configuration""" + +import warnings +from collections import OrderedDict +from typing import Any, Mapping, Optional + +from ... import PreTrainedTokenizer +from ...configuration_utils import PretrainedConfig +from ...onnx import OnnxConfig, OnnxConfigWithPast, OnnxSeq2SeqConfigWithPast +from ...onnx.utils import compute_effective_axis_dimension +from ...utils import TensorType, is_torch_available, logging + + +logger = logging.get_logger(__name__) + + +class BartConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`BartModel`]. It is used to instantiate a BART + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the BART + [facebook/bart-large](https://huggingface.co/facebook/bart-large) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 50265): + Vocabulary size of the BART model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`BartModel`] or [`TFBartModel`]. + d_model (`int`, *optional*, defaults to 1024): + Dimensionality of the layers and the pooler layer. + encoder_layers (`int`, *optional*, defaults to 12): + Number of encoder layers. + decoder_layers (`int`, *optional*, defaults to 12): + Number of decoder layers. + encoder_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer encoder. + decoder_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer decoder. + decoder_ffn_dim (`int`, *optional*, defaults to 4096): + Dimensionality of the "intermediate" (often named feed-forward) layer in decoder. + encoder_ffn_dim (`int`, *optional*, defaults to 4096): + Dimensionality of the "intermediate" (often named feed-forward) layer in decoder. + activation_function (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + activation_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for activations inside the fully connected layer. + classifier_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for classifier. + max_position_embeddings (`int`, *optional*, defaults to 1024): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + init_std (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + encoder_layerdrop (`float`, *optional*, defaults to 0.0): + The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556) + for more details. + decoder_layerdrop (`float`, *optional*, defaults to 0.0): + The LayerDrop probability for the decoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556) + for more details. + scale_embedding (`bool`, *optional*, defaults to `False`): + Scale embeddings by diving by sqrt(d_model). + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). + num_labels (`int`, *optional*, defaults to 3): + The number of labels to use in [`BartForSequenceClassification`]. + forced_eos_token_id (`int`, *optional*, defaults to 2): + The id of the token to force as the last generated token when `max_length` is reached. Usually set to + `eos_token_id`. + + Example: + + ```python + >>> from transformers import BartConfig, BartModel + + >>> # Initializing a BART facebook/bart-large style configuration + >>> configuration = BartConfig() + + >>> # Initializing a model (with random weights) from the facebook/bart-large style configuration + >>> model = BartModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "bart" + keys_to_ignore_at_inference = ["past_key_values"] + attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"} + + def __init__( + self, + vocab_size=50265, + max_position_embeddings=1024, + encoder_layers=12, + encoder_ffn_dim=4096, + encoder_attention_heads=16, + decoder_layers=12, + decoder_ffn_dim=4096, + decoder_attention_heads=16, + encoder_layerdrop=0.0, + decoder_layerdrop=0.0, + activation_function="gelu", + d_model=1024, + dropout=0.1, + attention_dropout=0.0, + activation_dropout=0.0, + init_std=0.02, + classifier_dropout=0.0, + scale_embedding=False, + use_cache=True, + num_labels=3, + pad_token_id=1, + bos_token_id=0, + eos_token_id=2, + is_encoder_decoder=True, + decoder_start_token_id=2, + forced_eos_token_id=2, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.d_model = d_model + self.encoder_ffn_dim = encoder_ffn_dim + self.encoder_layers = encoder_layers + self.encoder_attention_heads = encoder_attention_heads + self.decoder_ffn_dim = decoder_ffn_dim + self.decoder_layers = decoder_layers + self.decoder_attention_heads = decoder_attention_heads + self.dropout = dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.activation_function = activation_function + self.init_std = init_std + self.encoder_layerdrop = encoder_layerdrop + self.decoder_layerdrop = decoder_layerdrop + self.classifier_dropout = classifier_dropout + self.use_cache = use_cache + self.num_hidden_layers = encoder_layers + self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True + + super().__init__( + num_labels=num_labels, + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + is_encoder_decoder=is_encoder_decoder, + decoder_start_token_id=decoder_start_token_id, + forced_eos_token_id=forced_eos_token_id, + **kwargs, + ) + + # ensure backward compatibility for BART CNN models + if self.forced_bos_token_id is None and kwargs.get("force_bos_token_to_be_generated", False): + self.forced_bos_token_id = self.bos_token_id + warnings.warn( + f"Please make sure the config includes `forced_bos_token_id={self.bos_token_id}` in future versions. " + "The config can simply be saved and uploaded again to be fixed." + ) + + +class BartOnnxConfig(OnnxSeq2SeqConfigWithPast): + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + if self.task in ["default", "seq2seq-lm"]: + common_inputs = OrderedDict( + [ + ("input_ids", {0: "batch", 1: "encoder_sequence"}), + ("attention_mask", {0: "batch", 1: "encoder_sequence"}), + ] + ) + + if self.use_past: + common_inputs["decoder_input_ids"] = {0: "batch"} + common_inputs["decoder_attention_mask"] = {0: "batch", 1: "past_decoder_sequence + sequence"} + else: + common_inputs["decoder_input_ids"] = {0: "batch", 1: "decoder_sequence"} + common_inputs["decoder_attention_mask"] = {0: "batch", 1: "decoder_sequence"} + + if self.use_past: + self.fill_with_past_key_values_(common_inputs, direction="inputs") + elif self.task == "causal-lm": + # TODO: figure this case out. + common_inputs = OrderedDict( + [ + ("input_ids", {0: "batch", 1: "encoder_sequence"}), + ("attention_mask", {0: "batch", 1: "encoder_sequence"}), + ] + ) + if self.use_past: + num_encoder_layers, _ = self.num_layers + for i in range(num_encoder_layers): + common_inputs[f"past_key_values.{i}.key"] = {0: "batch", 2: "past_sequence + sequence"} + common_inputs[f"past_key_values.{i}.value"] = {0: "batch", 2: "past_sequence + sequence"} + else: + common_inputs = OrderedDict( + [ + ("input_ids", {0: "batch", 1: "encoder_sequence"}), + ("attention_mask", {0: "batch", 1: "encoder_sequence"}), + ("decoder_input_ids", {0: "batch", 1: "decoder_sequence"}), + ("decoder_attention_mask", {0: "batch", 1: "decoder_sequence"}), + ] + ) + + return common_inputs + + @property + def outputs(self) -> Mapping[str, Mapping[int, str]]: + if self.task in ["default", "seq2seq-lm"]: + common_outputs = super().outputs + else: + common_outputs = super(OnnxConfigWithPast, self).outputs + if self.use_past: + num_encoder_layers, _ = self.num_layers + for i in range(num_encoder_layers): + common_outputs[f"present.{i}.key"] = {0: "batch", 2: "past_sequence + sequence"} + common_outputs[f"present.{i}.value"] = {0: "batch", 2: "past_sequence + sequence"} + return common_outputs + + def _generate_dummy_inputs_for_default_and_seq2seq_lm( + self, + tokenizer: PreTrainedTokenizer, + batch_size: int = -1, + seq_length: int = -1, + is_pair: bool = False, + framework: Optional[TensorType] = None, + ) -> Mapping[str, Any]: + encoder_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering( + tokenizer, batch_size, seq_length, is_pair, framework + ) + + # Generate decoder inputs + decoder_seq_length = seq_length if not self.use_past else 1 + decoder_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering( + tokenizer, batch_size, decoder_seq_length, is_pair, framework + ) + decoder_inputs = {f"decoder_{name}": tensor for name, tensor in decoder_inputs.items()} + common_inputs = dict(**encoder_inputs, **decoder_inputs) + + if self.use_past: + if not is_torch_available(): + raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.") + else: + import torch + batch, encoder_seq_length = common_inputs["input_ids"].shape + decoder_seq_length = common_inputs["decoder_input_ids"].shape[1] + num_encoder_attention_heads, num_decoder_attention_heads = self.num_attention_heads + encoder_shape = ( + batch, + num_encoder_attention_heads, + encoder_seq_length, + self._config.hidden_size // num_encoder_attention_heads, + ) + decoder_past_length = decoder_seq_length + 3 + decoder_shape = ( + batch, + num_decoder_attention_heads, + decoder_past_length, + self._config.hidden_size // num_decoder_attention_heads, + ) + + common_inputs["decoder_attention_mask"] = torch.cat( + [common_inputs["decoder_attention_mask"], torch.ones(batch, decoder_past_length)], dim=1 + ) + + common_inputs["past_key_values"] = [] + # If the number of encoder and decoder layers are present in the model configuration, both are considered + num_encoder_layers, num_decoder_layers = self.num_layers + min_num_layers = min(num_encoder_layers, num_decoder_layers) + max_num_layers = max(num_encoder_layers, num_decoder_layers) - min_num_layers + remaining_side_name = "encoder" if num_encoder_layers > num_decoder_layers else "decoder" + + for _ in range(min_num_layers): + common_inputs["past_key_values"].append( + ( + torch.zeros(decoder_shape), + torch.zeros(decoder_shape), + torch.zeros(encoder_shape), + torch.zeros(encoder_shape), + ) + ) + # TODO: test this. + shape = encoder_shape if remaining_side_name == "encoder" else decoder_shape + for _ in range(min_num_layers, max_num_layers): + common_inputs["past_key_values"].append((torch.zeros(shape), torch.zeros(shape))) + return common_inputs + + def _generate_dummy_inputs_for_causal_lm( + self, + tokenizer: PreTrainedTokenizer, + batch_size: int = -1, + seq_length: int = -1, + is_pair: bool = False, + framework: Optional[TensorType] = None, + ) -> Mapping[str, Any]: + common_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering( + tokenizer, batch_size, seq_length, is_pair, framework + ) + + if self.use_past: + if not is_torch_available(): + raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.") + else: + import torch + batch, seqlen = common_inputs["input_ids"].shape + # Not using the same length for past_key_values + past_key_values_length = seqlen + 2 + num_encoder_layers, _ = self.num_layers + num_encoder_attention_heads, _ = self.num_attention_heads + past_shape = ( + batch, + num_encoder_attention_heads, + past_key_values_length, + self._config.hidden_size // num_encoder_attention_heads, + ) + + mask_dtype = common_inputs["attention_mask"].dtype + common_inputs["attention_mask"] = torch.cat( + [common_inputs["attention_mask"], torch.ones(batch, past_key_values_length, dtype=mask_dtype)], dim=1 + ) + common_inputs["past_key_values"] = [ + (torch.zeros(past_shape), torch.zeros(past_shape)) for _ in range(num_encoder_layers) + ] + return common_inputs + + def _generate_dummy_inputs_for_sequence_classification_and_question_answering( + self, + tokenizer: PreTrainedTokenizer, + batch_size: int = -1, + seq_length: int = -1, + is_pair: bool = False, + framework: Optional[TensorType] = None, + ) -> Mapping[str, Any]: + # Copied from OnnxConfig.generate_dummy_inputs + # Did not use super(OnnxConfigWithPast, self).generate_dummy_inputs for code clarity. + # If dynamic axis (-1) we forward with a fixed dimension of 2 samples to avoid optimizations made by ONNX + batch_size = compute_effective_axis_dimension( + batch_size, fixed_dimension=OnnxConfig.default_fixed_batch, num_token_to_add=0 + ) + + # If dynamic axis (-1) we forward with a fixed dimension of 8 tokens to avoid optimizations made by ONNX + token_to_add = tokenizer.num_special_tokens_to_add(is_pair) + seq_length = compute_effective_axis_dimension( + seq_length, fixed_dimension=OnnxConfig.default_fixed_sequence, num_token_to_add=token_to_add + ) + + # Generate dummy inputs according to compute batch and sequence + dummy_input = [" ".join([tokenizer.unk_token]) * seq_length] * batch_size + common_inputs = dict(tokenizer(dummy_input, return_tensors=framework)) + return common_inputs + + def generate_dummy_inputs( + self, + tokenizer: PreTrainedTokenizer, + batch_size: int = -1, + seq_length: int = -1, + is_pair: bool = False, + framework: Optional[TensorType] = None, + ) -> Mapping[str, Any]: + if self.task in ["default", "seq2seq-lm"]: + common_inputs = self._generate_dummy_inputs_for_default_and_seq2seq_lm( + tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework + ) + + elif self.task == "causal-lm": + common_inputs = self._generate_dummy_inputs_for_causal_lm( + tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework + ) + else: + common_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering( + tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework + ) + + return common_inputs + + def _flatten_past_key_values_(self, flattened_output, name, idx, t): + if self.task in ["default", "seq2seq-lm"]: + flattened_output = super()._flatten_past_key_values_(flattened_output, name, idx, t) + else: + flattened_output = super(OnnxSeq2SeqConfigWithPast, self)._flatten_past_key_values_( + flattened_output, name, idx, t + ) + + +__all__ = ["BartConfig", "BartOnnxConfig"] diff --git a/docs/transformers/build/lib/transformers/models/bart/convert_bart_original_pytorch_checkpoint_to_pytorch.py b/docs/transformers/build/lib/transformers/models/bart/convert_bart_original_pytorch_checkpoint_to_pytorch.py new file mode 100644 index 0000000000000000000000000000000000000000..84dc415443f0e81a43a528a9942142639bfae28e --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bart/convert_bart_original_pytorch_checkpoint_to_pytorch.py @@ -0,0 +1,156 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert BART checkpoint.""" + +import argparse +import os +from pathlib import Path + +import fairseq +import torch +from packaging import version +from torch import nn + +from transformers import ( + BartConfig, + BartForConditionalGeneration, + BartForSequenceClassification, + BartModel, + BartTokenizer, +) +from transformers.utils import logging + + +FAIRSEQ_MODELS = ["bart.large", "bart.large.mnli", "bart.large.cnn", "bart_xsum/model.pt"] +extra_arch = {"bart.large": BartModel, "bart.large.mnli": BartForSequenceClassification} +if version.parse(fairseq.__version__) < version.parse("0.9.0"): + raise Exception("requires fairseq >= 0.9.0") + + +logging.set_verbosity_info() +logger = logging.get_logger(__name__) + +SAMPLE_TEXT = " Hello world! cécé herlolip" + +mnli_rename_keys = [ + ("model.classification_heads.mnli.dense.weight", "classification_head.dense.weight"), + ("model.classification_heads.mnli.dense.bias", "classification_head.dense.bias"), + ("model.classification_heads.mnli.out_proj.weight", "classification_head.out_proj.weight"), + ("model.classification_heads.mnli.out_proj.bias", "classification_head.out_proj.bias"), +] + + +def remove_ignore_keys_(state_dict): + ignore_keys = [ + "encoder.version", + "decoder.version", + "model.encoder.version", + "model.decoder.version", + "_float_tensor", + ] + for k in ignore_keys: + state_dict.pop(k, None) + + +def rename_key(dct, old, new): + val = dct.pop(old) + dct[new] = val + + +def load_xsum_checkpoint(checkpoint_path): + """Checkpoint path should end in model.pt""" + sd = torch.load(checkpoint_path, map_location="cpu", weights_only=True) + hub_interface = torch.hub.load("pytorch/fairseq", "bart.large.cnn").eval() + hub_interface.model.load_state_dict(sd["model"]) + return hub_interface + + +def make_linear_from_emb(emb): + vocab_size, emb_size = emb.weight.shape + lin_layer = nn.Linear(vocab_size, emb_size, bias=False) + lin_layer.weight.data = emb.weight.data + return lin_layer + + +@torch.no_grad() +def convert_bart_checkpoint(checkpoint_path, pytorch_dump_folder_path, hf_checkpoint_name=None): + """ + Copy/paste/tweak model's weights to our BERT structure. + """ + if not os.path.exists(checkpoint_path): + bart = torch.hub.load("pytorch/fairseq", checkpoint_path).eval() + else: + bart = load_xsum_checkpoint(checkpoint_path) + + bart.model.upgrade_state_dict(bart.model.state_dict()) + if hf_checkpoint_name is None: + hf_checkpoint_name = checkpoint_path.replace(".", "-") + config = BartConfig.from_pretrained(hf_checkpoint_name) + tokens = bart.encode(SAMPLE_TEXT).unsqueeze(0) + tokens2 = BartTokenizer.from_pretrained(hf_checkpoint_name).encode(SAMPLE_TEXT, return_tensors="pt").unsqueeze(0) + if not torch.eq(tokens, tokens2).all(): + raise ValueError( + f"converted tokenizer and pretrained tokenizer returned different output: {tokens} != {tokens2}" + ) + + if checkpoint_path == "bart.large.mnli": + state_dict = bart.state_dict() + remove_ignore_keys_(state_dict) + state_dict["model.shared.weight"] = state_dict["model.decoder.embed_tokens.weight"] + for src, dest in mnli_rename_keys: + rename_key(state_dict, src, dest) + model = BartForSequenceClassification(config).eval() + model.load_state_dict(state_dict) + fairseq_output = bart.predict("mnli", tokens, return_logits=True) + new_model_outputs = model(tokens)[0] # logits + else: # no classification heads to worry about + state_dict = bart.model.state_dict() + remove_ignore_keys_(state_dict) + state_dict["shared.weight"] = state_dict["decoder.embed_tokens.weight"] + fairseq_output = bart.extract_features(tokens) + if hf_checkpoint_name == "facebook/bart-large": + model = BartModel(config).eval() + model.load_state_dict(state_dict) + new_model_outputs = model(tokens).model[0] + else: + model = BartForConditionalGeneration(config).eval() # an existing summarization ckpt + model.model.load_state_dict(state_dict) + if hasattr(model, "lm_head"): + model.lm_head = make_linear_from_emb(model.model.shared) + new_model_outputs = model.model(tokens)[0] + + # Check results + if fairseq_output.shape != new_model_outputs.shape: + raise ValueError( + f"`fairseq_output` shape and `new_model_output` shape are different: {fairseq_output.shape=}, {new_model_outputs.shape}" + ) + if (fairseq_output != new_model_outputs).any().item(): + raise ValueError("Some values in `fairseq_output` are different from `new_model_outputs`") + Path(pytorch_dump_folder_path).mkdir(exist_ok=True) + model.save_pretrained(pytorch_dump_folder_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument( + "fairseq_path", type=str, help="bart.large, bart.large.cnn or a path to a model.pt on local filesystem." + ) + parser.add_argument("pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") + parser.add_argument( + "--hf_config", default=None, type=str, help="Which huggingface architecture to use: bart-large-xsum" + ) + args = parser.parse_args() + convert_bart_checkpoint(args.fairseq_path, args.pytorch_dump_folder_path, hf_checkpoint_name=args.hf_config) diff --git a/docs/transformers/build/lib/transformers/models/bart/modeling_bart.py b/docs/transformers/build/lib/transformers/models/bart/modeling_bart.py new file mode 100644 index 0000000000000000000000000000000000000000..bc3d4dcd936c1a52b70634c41998f1288f82f50a --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bart/modeling_bart.py @@ -0,0 +1,2184 @@ +# coding=utf-8 +# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch BART model.""" + +import copy +import math +import warnings +from typing import List, Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ...activations import ACT2FN +from ...generation import GenerationMixin +from ...modeling_attn_mask_utils import ( + _prepare_4d_attention_mask, + _prepare_4d_attention_mask_for_sdpa, + _prepare_4d_causal_attention_mask, + _prepare_4d_causal_attention_mask_for_sdpa, +) +from ...modeling_flash_attention_utils import flash_attn_supports_top_left_mask, is_flash_attn_available +from ...modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPastAndCrossAttentions, + CausalLMOutputWithCrossAttentions, + Seq2SeqLMOutput, + Seq2SeqModelOutput, + Seq2SeqQuestionAnsweringModelOutput, + Seq2SeqSequenceClassifierOutput, +) +from ...modeling_utils import PreTrainedModel +from ...utils import ( + add_code_sample_docstrings, + add_end_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from .configuration_bart import BartConfig + + +if is_flash_attn_available(): + from ...modeling_flash_attention_utils import _flash_attention_forward + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "facebook/bart-base" +_CONFIG_FOR_DOC = "BartConfig" + +# Base model docstring +_EXPECTED_OUTPUT_SHAPE = [1, 8, 768] + +# SequenceClassification docstring +_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION = "valhalla/bart-large-sst2" +_SEQ_CLASS_EXPECTED_LOSS = 0.0 +_SEQ_CLASS_EXPECTED_OUTPUT = "'POSITIVE'" + +# QuestionAsnwering docstring +_CHECKPOINT_FOR_QA = "valhalla/bart-large-finetuned-squadv1" +_QA_EXPECTED_LOSS = 0.59 +_QA_EXPECTED_OUTPUT = "' nice puppet'" + + +def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int): + """ + Shift input ids one token to the right. + """ + shifted_input_ids = input_ids.new_zeros(input_ids.shape) + shifted_input_ids[:, 1:] = input_ids[:, :-1].clone() + shifted_input_ids[:, 0] = decoder_start_token_id + + if pad_token_id is None: + raise ValueError("self.model.config.pad_token_id has to be defined.") + # replace possible -100 values in labels by `pad_token_id` + shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id) + + return shifted_input_ids + + +class BartLearnedPositionalEmbedding(nn.Embedding): + """ + This module learns positional embeddings up to a fixed maximum size. + """ + + def __init__(self, num_embeddings: int, embedding_dim: int): + # Bart is set up so that if padding_idx is specified then offset the embedding ids by 2 + # and adjust num_embeddings appropriately. Other models don't have this hack + self.offset = 2 + super().__init__(num_embeddings + self.offset, embedding_dim) + + def forward(self, input_ids: torch.Tensor, past_key_values_length: int = 0): + """`input_ids' shape is expected to be [bsz x seqlen].""" + + bsz, seq_len = input_ids.shape[:2] + positions = torch.arange( + past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device + ).expand(bsz, -1) + + return super().forward(positions + self.offset) + + +class BartScaledWordEmbedding(nn.Embedding): + """ + This module overrides nn.Embeddings' forward by multiplying with embeddings scale. + """ + + def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int, embed_scale: Optional[float] = 1.0): + super().__init__(num_embeddings, embedding_dim, padding_idx) + self.embed_scale = embed_scale + + def forward(self, input_ids: torch.Tensor): + return super().forward(input_ids) * self.embed_scale + + +class BartAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + is_decoder: bool = False, + bias: bool = True, + is_causal: bool = False, + config: Optional[BartConfig] = None, + ): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + self.config = config + + if (self.head_dim * num_heads) != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" + f" and `num_heads`: {num_heads})." + ) + self.scaling = self.head_dim**-0.5 + self.is_decoder = is_decoder + self.is_causal = is_causal + + self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + key_value_states: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + + bsz, tgt_len, _ = hidden_states.size() + + # get query proj + query_states = self.q_proj(hidden_states) * self.scaling + # get key, value proj + # `past_key_value[0].shape[2] == key_value_states.shape[1]` + # is checking that the `sequence_length` of the `past_key_value` is the same as + # the provided `key_value_states` to support prefix tuning + if ( + is_cross_attention + and past_key_value is not None + and past_key_value[0].shape[2] == key_value_states.shape[1] + ): + # reuse k,v, cross_attentions + key_states = past_key_value[0] + value_states = past_key_value[1] + elif is_cross_attention: + # cross_attentions + key_states = self._shape(self.k_proj(key_value_states), -1, bsz) + value_states = self._shape(self.v_proj(key_value_states), -1, bsz) + elif past_key_value is not None: + # reuse k, v, self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + else: + # self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_states, value_states) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.reshape(*proj_shape) + value_states = value_states.reshape(*proj_shape) + + src_len = key_states.size(1) + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if layer_head_mask is not None: + if layer_head_mask.size() != (self.num_heads,): + raise ValueError( + f"Head mask for a single layer should be of size {(self.num_heads,)}, but is" + f" {layer_head_mask.size()}" + ) + attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + if output_attentions: + # this operation is a bit awkward, but it's required to + # make sure that attn_weights keeps its gradient. + # In order to do so, attn_weights have to be reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) + else: + attn_weights_reshaped = None + + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = torch.bmm(attn_probs, value_states) + + if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) + attn_output = attn_output.transpose(1, 2) + + # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be + # partitioned across GPUs when using tensor-parallelism. + attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped, past_key_value + + +class BartFlashAttention2(BartAttention): + """ + Bart flash attention module. This module inherits from `BartAttention` as the weights of the module stays + untouched. The only required change would be on the forward pass where it needs to correctly call the public API of + flash attention and deal with padding tokens in case the input contains any of them. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1. + # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0. + # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left). + self._flash_attn_uses_top_left_mask = flash_attn_supports_top_left_mask() + + def _reshape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim) + + def forward( + self, + hidden_states: torch.Tensor, + key_value_states: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + # BartFlashAttention2 attention does not support output_attentions + if output_attentions: + raise ValueError("BartFlashAttention2 attention does not support output_attentions") + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + + bsz, q_len, _ = hidden_states.size() + + # get query proj + query_states = self._reshape(self.q_proj(hidden_states), -1, bsz) + # get key, value proj + # `past_key_value[0].shape[2] == key_value_states.shape[1]` + # is checking that the `sequence_length` of the `past_key_value` is the same as + # the provided `key_value_states` to support prefix tuning + if ( + is_cross_attention + and past_key_value is not None + and past_key_value[0].shape[2] == key_value_states.shape[1] + ): + # reuse k,v, cross_attentions + key_states = past_key_value[0].transpose(1, 2) + value_states = past_key_value[1].transpose(1, 2) + elif is_cross_attention: + # cross_attentions + key_states = self._reshape(self.k_proj(key_value_states), -1, bsz) + value_states = self._reshape(self.v_proj(key_value_states), -1, bsz) + elif past_key_value is not None: + # reuse k, v, self_attention + key_states = self._reshape(self.k_proj(hidden_states), -1, bsz) + value_states = self._reshape(self.v_proj(hidden_states), -1, bsz) + key_states = torch.cat([past_key_value[0].transpose(1, 2), key_states], dim=1) + value_states = torch.cat([past_key_value[1].transpose(1, 2), value_states], dim=1) + else: + # self_attention + key_states = self._reshape(self.k_proj(hidden_states), -1, bsz) + value_states = self._reshape(self.v_proj(hidden_states), -1, bsz) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_states.transpose(1, 2), value_states.transpose(1, 2)) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-2] + + # In PEFT, usually we cast the layer norms in float32 for training stability reasons + # therefore the input hidden states gets silently casted in float32. Hence, we need + # cast them back in the correct dtype just to be sure everything works as expected. + # This might slowdown training & inference so it is recommended to not cast the LayerNorms + # in fp32. (LlamaRMSNorm handles it correctly) + + input_dtype = query_states.dtype + if input_dtype == torch.float32: + if torch.is_autocast_enabled(): + target_dtype = torch.get_autocast_gpu_dtype() + # Handle the case where the model is quantized + elif hasattr(self.config, "_pre_quantization_dtype"): + target_dtype = self.config._pre_quantization_dtype + else: + target_dtype = self.q_proj.weight.dtype + + logger.warning_once( + f"The input hidden states seems to be silently casted in float32, this might be related to" + f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in" + f" {target_dtype}." + ) + + query_states = query_states.to(target_dtype) + key_states = key_states.to(target_dtype) + value_states = value_states.to(target_dtype) + + attn_output = _flash_attention_forward( + query_states, + key_states, + value_states, + attention_mask, + q_len, + dropout=self.dropout if self.training else 0.0, + is_causal=self.is_causal, + use_top_left_mask=self._flash_attn_uses_top_left_mask, + ) + + attn_output = attn_output.reshape(bsz, q_len, -1) + attn_output = self.out_proj(attn_output) + + if not output_attentions: + attn_weights = None + + return attn_output, attn_weights, past_key_value + + +class BartSdpaAttention(BartAttention): + def forward( + self, + hidden_states: torch.Tensor, + key_value_states: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + if output_attentions or layer_head_mask is not None: + # TODO: Improve this warning with e.g. `model.config._attn_implementation = "manual"` once this is implemented. + logger.warning_once( + "BartModel is using BartSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True` or `layer_head_mask` not None. Falling back to the manual attention" + ' implementation, but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' + ) + return super().forward( + hidden_states, + key_value_states=key_value_states, + past_key_value=past_key_value, + attention_mask=attention_mask, + layer_head_mask=layer_head_mask, + output_attentions=output_attentions, + ) + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + + bsz, tgt_len, _ = hidden_states.size() + + # get query proj + query_states = self.q_proj(hidden_states) + # get key, value proj + # `past_key_value[0].shape[2] == key_value_states.shape[1]` + # is checking that the `sequence_length` of the `past_key_value` is the same as + # the provided `key_value_states` to support prefix tuning + if ( + is_cross_attention + and past_key_value is not None + and past_key_value[0].shape[2] == key_value_states.shape[1] + ): + # reuse k,v, cross_attentions + key_states = past_key_value[0] + value_states = past_key_value[1] + elif is_cross_attention: + # cross_attentions + key_states = self._shape(self.k_proj(key_value_states), -1, bsz) + value_states = self._shape(self.v_proj(key_value_states), -1, bsz) + elif past_key_value is not None: + # reuse k, v, self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + else: + # self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_states, value_states) + + query_states = self._shape(query_states, tgt_len, bsz) + + # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment + # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. + # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case tgt_len == 1. + is_causal = True if self.is_causal and attention_mask is None and tgt_len > 1 else False + + # NOTE: SDPA with memory-efficient backend is currently (torch==2.1.2) bugged when using non-contiguous inputs and a custom attn_mask, + # but we are fine here as `_shape` do call `.contiguous()`. Reference: https://github.com/pytorch/pytorch/issues/112577 + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + dropout_p=self.dropout if self.training else 0.0, + is_causal=is_causal, + ) + + if attn_output.size() != (bsz, self.num_heads, tgt_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2) + + # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be + # partitioned across GPUs when using tensor-parallelism. + attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, None, past_key_value + + +BART_ATTENTION_CLASSES = { + "eager": BartAttention, + "sdpa": BartSdpaAttention, + "flash_attention_2": BartFlashAttention2, +} + + +class BartEncoderLayer(nn.Module): + def __init__(self, config: BartConfig): + super().__init__() + self.embed_dim = config.d_model + + self.self_attn = BART_ATTENTION_CLASSES[config._attn_implementation]( + embed_dim=self.embed_dim, + num_heads=config.encoder_attention_heads, + dropout=config.attention_dropout, + config=config, + ) + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim) + self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.FloatTensor, + attention_mask: torch.FloatTensor, + layer_head_mask: torch.FloatTensor, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size + `(encoder_attention_heads,)`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + hidden_states, attn_weights, _ = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + layer_head_mask=layer_head_mask, + output_attentions=output_attentions, + ) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + if hidden_states.dtype == torch.float16 and ( + torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any() + ): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class BartDecoderLayer(nn.Module): + def __init__(self, config: BartConfig): + super().__init__() + self.embed_dim = config.d_model + + self.self_attn = BART_ATTENTION_CLASSES[config._attn_implementation]( + embed_dim=self.embed_dim, + num_heads=config.decoder_attention_heads, + dropout=config.attention_dropout, + is_decoder=True, + is_causal=True, + config=config, + ) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.encoder_attn = BART_ATTENTION_CLASSES[config._attn_implementation]( + self.embed_dim, + config.decoder_attention_heads, + dropout=config.attention_dropout, + is_decoder=True, + config=config, + ) + self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) + self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + cross_attn_layer_head_mask: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = True, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + encoder_hidden_states (`torch.FloatTensor`): + cross attention input to the layer of shape `(batch, seq_len, embed_dim)` + encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size + `(encoder_attention_heads,)`. + cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of + size `(decoder_attention_heads,)`. + past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + # Self Attention + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + # add present self-attn cache to positions 1,2 of present_key_value tuple + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + past_key_value=self_attn_past_key_value, + attention_mask=attention_mask, + layer_head_mask=layer_head_mask, + output_attentions=output_attentions, + ) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + # Cross-Attention Block + cross_attn_present_key_value = None + cross_attn_weights = None + if encoder_hidden_states is not None: + residual = hidden_states + + # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn( + hidden_states=hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + layer_head_mask=cross_attn_layer_head_mask, + past_key_value=cross_attn_past_key_value, + output_attentions=output_attentions, + ) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + # add cross-attn to positions 3,4 of present_key_value tuple + present_key_value = present_key_value + cross_attn_present_key_value + + # Fully Connected + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights, cross_attn_weights) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +class BartClassificationHead(nn.Module): + """Head for sentence-level classification tasks.""" + + def __init__( + self, + input_dim: int, + inner_dim: int, + num_classes: int, + pooler_dropout: float, + ): + super().__init__() + self.dense = nn.Linear(input_dim, inner_dim) + self.dropout = nn.Dropout(p=pooler_dropout) + self.out_proj = nn.Linear(inner_dim, num_classes) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dropout(hidden_states) + hidden_states = self.dense(hidden_states) + hidden_states = torch.tanh(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.out_proj(hidden_states) + return hidden_states + + +class BartPreTrainedModel(PreTrainedModel): + config_class = BartConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _keys_to_ignore_on_load_unexpected = ["encoder.version", "decoder.version"] + _no_split_modules = [r"BartEncoderLayer", r"BartDecoderLayer"] + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn_2 = True + _supports_sdpa = True + + def _init_weights(self, module): + std = self.config.init_std + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + @property + def dummy_inputs(self): + pad_token = self.config.pad_token_id + input_ids = torch.tensor([[0, 6, 10, 4, 2], [0, 8, 12, 2, pad_token]], device=self.device) + dummy_inputs = { + "attention_mask": input_ids.ne(pad_token), + "input_ids": input_ids, + } + return dummy_inputs + + +class PretrainedBartModel(BartPreTrainedModel): + def __init_subclass__(self): + warnings.warn( + "The class `PretrainedBartModel` has been depreciated, please use `BartPreTrainedModel` instead.", + FutureWarning, + ) + + +class BartPretrainedModel(BartPreTrainedModel): + def __init_subclass__(self): + warnings.warn( + "The class `PretrainedBartModel` has been depreciated, please use `BartPreTrainedModel` instead.", + FutureWarning, + ) + + +BART_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`BartConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +BART_GENERATION_EXAMPLE = r""" + Summarization example: + + ```python + >>> from transformers import AutoTokenizer, BartForConditionalGeneration + + >>> model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn") + >>> tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn") + + >>> ARTICLE_TO_SUMMARIZE = ( + ... "PG&E stated it scheduled the blackouts in response to forecasts for high winds " + ... "amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were " + ... "scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow." + ... ) + >>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors="pt") + + >>> # Generate Summary + >>> summary_ids = model.generate(inputs["input_ids"], num_beams=2, min_length=0, max_length=20) + >>> tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + 'PG&E scheduled the blackouts in response to forecasts for high winds amid dry conditions' + ``` + + Mask filling example: + + ```python + >>> from transformers import AutoTokenizer, BartForConditionalGeneration + + >>> tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base") + >>> model = BartForConditionalGeneration.from_pretrained("facebook/bart-base") + + >>> TXT = "My friends are but they eat too many carbs." + >>> input_ids = tokenizer([TXT], return_tensors="pt")["input_ids"] + >>> logits = model(input_ids).logits + + >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item() + >>> probs = logits[0, masked_index].softmax(dim=0) + >>> values, predictions = probs.topk(5) + + >>> tokenizer.decode(predictions).split() + ['not', 'good', 'healthy', 'great', 'very'] + ``` +""" + +BART_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Indices of decoder input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are decoder input IDs?](../glossary#decoder-input-ids) + + Bart uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values` + is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`). + + For translation and summarization training, `decoder_input_ids` should be provided. If no + `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right + for denoising pre-training following the paper. + decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also + be used by default. + + If you want to change padding behavior, you should read [`modeling_bart._prepare_decoder_attention_mask`] + and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more + information on the default strategy. + head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + decoder_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0, + 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*): + Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`) + `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of + hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded + representation. If `past_key_values` is used, optionally only the last `decoder_inputs_embeds` have to be + input (see `past_key_values`). This is useful if you want more control over how to convert + `decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix. + + If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds` takes the value + of `inputs_embeds`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +class BartEncoder(BartPreTrainedModel): + """ + Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a + [`BartEncoderLayer`]. + + Args: + config: BartConfig + embed_tokens (nn.Embedding): output embedding + """ + + def __init__(self, config: BartConfig, embed_tokens: Optional[nn.Embedding] = None): + super().__init__(config) + + self.dropout = config.dropout + self.layerdrop = config.encoder_layerdrop + + embed_dim = config.d_model + self.padding_idx = config.pad_token_id + self.max_source_positions = config.max_position_embeddings + embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0 + + self.embed_tokens = BartScaledWordEmbedding( + config.vocab_size, embed_dim, self.padding_idx, embed_scale=embed_scale + ) + + if embed_tokens is not None: + self.embed_tokens.weight = embed_tokens.weight + + self.embed_positions = BartLearnedPositionalEmbedding( + config.max_position_embeddings, + embed_dim, + ) + self.layers = nn.ModuleList([BartEncoderLayer(config) for _ in range(config.encoder_layers)]) + self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2" + self._use_sdpa = config._attn_implementation == "sdpa" + self.layernorm_embedding = nn.LayerNorm(embed_dim) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: + r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input = input_ids + input_ids = input_ids.view(-1, input_ids.shape[-1]) + elif inputs_embeds is not None: + input = inputs_embeds[:, :, -1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + embed_pos = self.embed_positions(input) + embed_pos = embed_pos.to(inputs_embeds.device) + + hidden_states = inputs_embeds + embed_pos + hidden_states = self.layernorm_embedding(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + # expand attention_mask + if attention_mask is not None: + if self._use_flash_attention_2: + attention_mask = attention_mask if 0 in attention_mask else None + elif self._use_sdpa and head_mask is None and not output_attentions: + # output_attentions=True & head_mask can not be supported when using SDPA, fall back to + # the manual implementation that requires a 4D causal mask in all cases. + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _prepare_4d_attention_mask_for_sdpa(attention_mask, inputs_embeds.dtype) + else: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype) + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + # check if head_mask has a correct number of layers specified if desired + if head_mask is not None: + if head_mask.size()[0] != (len(self.layers)): + raise ValueError( + f"The head_mask should be specified for {len(self.layers)} layers, but it is for" + f" {head_mask.size()[0]}." + ) + + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + to_drop = False + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: # skip the layer + to_drop = True + + if to_drop: + layer_outputs = (None, None) + else: + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + encoder_layer.__call__, + hidden_states, + attention_mask, + (head_mask[idx] if head_mask is not None else None), + output_attentions, + ) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + layer_head_mask=(head_mask[idx] if head_mask is not None else None), + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +class BartDecoder(BartPreTrainedModel): + """ + Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`BartDecoderLayer`] + + Args: + config: BartConfig + embed_tokens (nn.Embedding): output embedding + """ + + def __init__(self, config: BartConfig, embed_tokens: Optional[nn.Embedding] = None): + super().__init__(config) + self.dropout = config.dropout + self.layerdrop = config.decoder_layerdrop + self.padding_idx = config.pad_token_id + self.max_target_positions = config.max_position_embeddings + embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 + + self.embed_tokens = BartScaledWordEmbedding( + config.vocab_size, config.d_model, self.padding_idx, embed_scale=embed_scale + ) + + if embed_tokens is not None: + self.embed_tokens.weight = embed_tokens.weight + + self.embed_positions = BartLearnedPositionalEmbedding( + config.max_position_embeddings, + config.d_model, + ) + self.layers = nn.ModuleList([BartDecoderLayer(config) for _ in range(config.decoder_layers)]) + self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2" + self._use_sdpa = config._attn_implementation == "sdpa" + + self.layernorm_embedding = nn.LayerNorm(config.d_model) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]: + r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + of the decoder. + encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*): + Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values + selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules in the decoder to avoid performing + cross-attention on hidden heads. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of + shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the + cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those + that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of + all `decoder_input_ids` of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + input = input_ids + input_shape = input.shape + input_ids = input_ids.view(-1, input_shape[-1]) + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + input = inputs_embeds[:, :, -1] + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input) + + if self._use_flash_attention_2: + # 2d mask is passed through the layers + attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None + elif self._use_sdpa and not output_attentions and cross_attn_head_mask is None: + # output_attentions=True & cross_attn_head_mask can not be supported when using SDPA, and we fall back on + # the manual implementation that requires a 4D causal mask in all cases. + attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask, + input_shape, + inputs_embeds, + past_key_values_length, + ) + else: + # 4d mask is passed through the layers + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, input_shape, inputs_embeds, past_key_values_length + ) + + # expand encoder attention mask + if encoder_hidden_states is not None and encoder_attention_mask is not None: + if self._use_flash_attention_2: + encoder_attention_mask = encoder_attention_mask if 0 in encoder_attention_mask else None + elif self._use_sdpa and cross_attn_head_mask is None and not output_attentions: + # output_attentions=True & cross_attn_head_mask can not be supported when using SDPA, and we fall back on + # the manual implementation that requires a 4D causal mask in all cases. + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + encoder_attention_mask = _prepare_4d_attention_mask_for_sdpa( + encoder_attention_mask, + inputs_embeds.dtype, + tgt_len=input_shape[-1], + ) + else: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + encoder_attention_mask = _prepare_4d_attention_mask( + encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1] + ) + + # embed positions + positions = self.embed_positions(input, past_key_values_length) + positions = positions.to(inputs_embeds.device) + + hidden_states = inputs_embeds + positions + hidden_states = self.layernorm_embedding(hidden_states) + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None + next_decoder_cache = () if use_cache else None + + # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired + for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]): + if attn_mask is not None: + if attn_mask.size()[0] != (len(self.layers)): + raise ValueError( + f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for" + f" {head_mask.size()[0]}." + ) + + for idx, decoder_layer in enumerate(self.layers): + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + if output_hidden_states: + all_hidden_states += (hidden_states,) + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: + continue + + past_key_value = past_key_values[idx] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + attention_mask, + encoder_hidden_states, + encoder_attention_mask, + head_mask[idx] if head_mask is not None else None, + cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None, + None, + output_attentions, + use_cache, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + layer_head_mask=(head_mask[idx] if head_mask is not None else None), + cross_attn_layer_head_mask=( + cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None + ), + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache += (layer_outputs[3 if output_attentions else 1],) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + if encoder_hidden_states is not None: + all_cross_attentions += (layer_outputs[2],) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if not return_dict: + return tuple( + v + for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attentions, + ) + + +@add_start_docstrings( + "The bare BART Model outputting raw hidden-states without any specific head on top.", + BART_START_DOCSTRING, +) +class BartModel(BartPreTrainedModel): + _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] + + def __init__(self, config: BartConfig): + super().__init__(config) + + padding_idx, vocab_size = config.pad_token_id, config.vocab_size + embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 + self.shared = BartScaledWordEmbedding(vocab_size, config.d_model, padding_idx, embed_scale=embed_scale) + + self.encoder = BartEncoder(config, self.shared) + self.decoder = BartDecoder(config, self.shared) + + # Initialize weights and apply final processing + self.post_init() + + def _tie_weights(self): + if self.config.tie_word_embeddings: + # Some model checkpoints like "facebook/bart-large-cnn"'s embedding weight is in decoder.embed_tokens, need check here, see issue #36247 + if self.shared.weight.device == torch.device( + "meta" + ) and self.decoder.embed_tokens.weight.device != torch.device("meta"): + self._tie_or_clone_weights(self.encoder.embed_tokens, self.decoder.embed_tokens) + self._tie_or_clone_weights(self.shared, self.decoder.embed_tokens) + else: + self._tie_or_clone_weights(self.encoder.embed_tokens, self.shared) + self._tie_or_clone_weights(self.decoder.embed_tokens, self.shared) + + def get_input_embeddings(self): + return self.shared + + def set_input_embeddings(self, value): + self.shared = value + self.encoder.embed_tokens = self.shared + self.decoder.embed_tokens = self.shared + + def get_encoder(self): + return self.encoder + + def get_decoder(self): + return self.decoder + + @add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=Seq2SeqModelOutput, + config_class=_CONFIG_FOR_DOC, + expected_output=_EXPECTED_OUTPUT_SHAPE, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.Tensor] = None, + decoder_head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[List[torch.FloatTensor]] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, Seq2SeqModelOutput]: + # different to other models, Bart automatically creates decoder_input_ids from + # input_ids if no decoder_input_ids are provided + if decoder_input_ids is None and decoder_inputs_embeds is None: + if input_ids is None: + raise ValueError( + "If no `decoder_input_ids` or `decoder_inputs_embeds` are " + "passed, `input_ids` cannot be `None`. Please pass either " + "`input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`." + ) + + decoder_input_ids = shift_tokens_right( + input_ids, self.config.pad_token_id, self.config.decoder_start_token_id + ) + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if encoder_outputs is None: + encoder_outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn) + decoder_outputs = self.decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + encoder_hidden_states=encoder_outputs[0], + encoder_attention_mask=attention_mask, + head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if not return_dict: + return decoder_outputs + encoder_outputs + + return Seq2SeqModelOutput( + last_hidden_state=decoder_outputs.last_hidden_state, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + +@add_start_docstrings( + "The BART Model with a language modeling head. Can be used for summarization.", BART_START_DOCSTRING +) +class BartForConditionalGeneration(BartPreTrainedModel, GenerationMixin): + base_model_prefix = "model" + _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"] + _keys_to_ignore_on_load_missing = ["final_logits_bias"] + + def __init__(self, config: BartConfig): + super().__init__(config) + self.model = BartModel(config) + self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings))) + self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_encoder(self): + return self.model.get_encoder() + + def get_decoder(self): + return self.model.get_decoder() + + def resize_token_embeddings( + self, new_num_tokens: int, pad_to_multiple_of: Optional[int] = None, mean_resizing: bool = True + ) -> nn.Embedding: + new_embeddings = super().resize_token_embeddings(new_num_tokens, pad_to_multiple_of, mean_resizing) + self._resize_final_logits_bias(new_embeddings.weight.shape[0]) + return new_embeddings + + def _resize_final_logits_bias(self, new_num_tokens: int) -> None: + old_num_tokens = self.final_logits_bias.shape[-1] + if new_num_tokens <= old_num_tokens: + new_bias = self.final_logits_bias[:, :new_num_tokens] + else: + extra_bias = torch.zeros((1, new_num_tokens - old_num_tokens), device=self.final_logits_bias.device) + new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1) + self.register_buffer("final_logits_bias", new_bias) + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def _tie_weights(self): + if self.config.tie_word_embeddings: + self.model._tie_weights() + self._tie_or_clone_weights(self.lm_head, self.model.shared) + + @add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC) + @add_end_docstrings(BART_GENERATION_EXAMPLE) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.Tensor] = None, + decoder_head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[List[torch.FloatTensor]] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, Seq2SeqLMOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Returns: + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if labels is not None: + if use_cache: + logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.") + use_cache = False + if decoder_input_ids is None and decoder_inputs_embeds is None: + decoder_input_ids = shift_tokens_right( + labels, self.config.pad_token_id, self.config.decoder_start_token_id + ) + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + encoder_outputs=encoder_outputs, + decoder_attention_mask=decoder_attention_mask, + head_mask=head_mask, + decoder_head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + lm_logits = self.lm_head(outputs[0]) + lm_logits = lm_logits + self.final_logits_bias.to(lm_logits.device) + + masked_lm_loss = None + if labels is not None: + labels = labels.to(lm_logits.device) + loss_fct = CrossEntropyLoss() + masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (lm_logits,) + outputs[1:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return Seq2SeqLMOutput( + loss=masked_lm_loss, + logits=lm_logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): + return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id) + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + # cached cross_attention states don't have to be reordered -> they are always the same + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past[:2]) + + layer_past[2:], + ) + return reordered_past + + +@add_start_docstrings( + """ + Bart model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE + tasks. + """, + BART_START_DOCSTRING, +) +class BartForSequenceClassification(BartPreTrainedModel): + _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] + + def __init__(self, config: BartConfig, **kwargs): + super().__init__(config, **kwargs) + self.model = BartModel(config) + self.classification_head = BartClassificationHead( + config.d_model, + config.d_model, + config.num_labels, + config.classifier_dropout, + ) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION, + output_type=Seq2SeqSequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + expected_output=_SEQ_CLASS_EXPECTED_OUTPUT, + expected_loss=_SEQ_CLASS_EXPECTED_LOSS, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.Tensor] = None, + decoder_head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, Seq2SeqSequenceClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None: + use_cache = False + + if input_ids is None and inputs_embeds is not None: + raise NotImplementedError( + f"Passing input embeddings is currently not supported for {self.__class__.__name__}" + ) + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + head_mask=head_mask, + decoder_head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + encoder_outputs=encoder_outputs, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = outputs[0] # last hidden state + + eos_mask = input_ids.eq(self.config.eos_token_id).to(hidden_states.device) + + if len(torch.unique_consecutive(eos_mask.sum(1))) > 1: + raise ValueError("All examples must have the same number of tokens.") + sentence_representation = hidden_states[eos_mask, :].view(hidden_states.size(0), -1, hidden_states.size(-1))[ + :, -1, : + ] + logits = self.classification_head(sentence_representation) + + loss = None + if labels is not None: + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.config.num_labels == 1: + self.config.problem_type = "regression" + elif self.config.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.config.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return Seq2SeqSequenceClassifierOutput( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + +@add_start_docstrings( + """ + BART Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layer on top of the hidden-states output to compute `span start logits` and `span end logits`). + """, + BART_START_DOCSTRING, +) +class BartForQuestionAnswering(BartPreTrainedModel): + _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] + + def __init__(self, config): + super().__init__(config) + + config.num_labels = 2 + self.num_labels = config.num_labels + + self.model = BartModel(config) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_QA, + output_type=Seq2SeqQuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + expected_loss=_QA_EXPECTED_LOSS, + expected_output=_QA_EXPECTED_OUTPUT, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.Tensor] = None, + decoder_head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[List[torch.FloatTensor]] = None, + start_positions: Optional[torch.LongTensor] = None, + end_positions: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, Seq2SeqQuestionAnsweringModelOutput]: + r""" + start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (*sequence_length*). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (*sequence_length*). Position outside of the sequence + are not taken into account for computing the loss. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if start_positions is not None and end_positions is not None: + use_cache = False + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + head_mask=head_mask, + decoder_head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + encoder_outputs=encoder_outputs, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = ( + start_logits, + end_logits, + ) + outputs[1:] + return ((total_loss,) + output) if total_loss is not None else output + + return Seq2SeqQuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + +class BartDecoderWrapper(BartPreTrainedModel): + """ + This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is + used in combination with the [`EncoderDecoderModel`] framework. + """ + + def __init__(self, config): + super().__init__(config) + self.decoder = BartDecoder(config) + + def forward(self, *args, **kwargs): + return self.decoder(*args, **kwargs) + + +@add_start_docstrings( + """ + BART decoder with a language modeling head on top (linear layer with weights tied to the input embeddings). + """, + BART_START_DOCSTRING, +) +class BartForCausalLM(BartPreTrainedModel, GenerationMixin): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config): + config = copy.deepcopy(config) + config.is_decoder = True + config.is_encoder_decoder = False + super().__init__(config) + self.model = BartDecoderWrapper(config) + + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.decoder.embed_tokens + + def set_input_embeddings(self, value): + self.model.decoder.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model.decoder = decoder + + def get_decoder(self): + return self.model.decoder + + @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]: + r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + if the model is configured as a decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used + in the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of + shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two additional + tensors are only required when the model is used as a decoder in a Sequence to Sequence model. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the + cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those + that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of + all `decoder_input_ids` of shape `(batch_size, sequence_length)`. + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, BartForCausalLM + + >>> tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base") + >>> model = BartForCausalLM.from_pretrained("facebook/bart-base", add_cross_attention=False) + >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder." + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + + >>> logits = outputs.logits + >>> expected_shape = [1, inputs.input_ids.shape[-1], model.config.vocab_size] + >>> list(logits.shape) == expected_shape + True + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model.decoder( + input_ids=input_ids, + attention_mask=attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + head_mask=head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + logits = self.lm_head(outputs[0]) + + loss = None + if labels is not None: + labels = labels.to(logits.device) + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past + + +__all__ = [ + "BartForCausalLM", + "BartForConditionalGeneration", + "BartForQuestionAnswering", + "BartForSequenceClassification", + "BartModel", + "BartPreTrainedModel", + "BartPretrainedModel", + "PretrainedBartModel", +] diff --git a/docs/transformers/build/lib/transformers/models/bart/modeling_flax_bart.py b/docs/transformers/build/lib/transformers/models/bart/modeling_flax_bart.py new file mode 100644 index 0000000000000000000000000000000000000000..18c8f6b85ccc8da02f8a0aeb78c069d93ebc754d --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bart/modeling_flax_bart.py @@ -0,0 +1,2006 @@ +# coding=utf-8 +# Copyright 2021 The Fairseq Authors and The Google Flax Team Authors And The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Flax Bart model.""" + +import math +import random +from functools import partial +from typing import Callable, Optional, Tuple + +import flax.linen as nn +import jax +import jax.numpy as jnp +from flax.core.frozen_dict import FrozenDict, freeze, unfreeze +from flax.linen import combine_masks, make_causal_mask +from flax.linen.attention import dot_product_attention_weights +from flax.traverse_util import flatten_dict, unflatten_dict +from jax import lax +from jax.random import PRNGKey + +from ...modeling_flax_outputs import ( + FlaxBaseModelOutput, + FlaxBaseModelOutputWithPastAndCrossAttentions, + FlaxCausalLMOutputWithCrossAttentions, + FlaxSeq2SeqLMOutput, + FlaxSeq2SeqModelOutput, + FlaxSeq2SeqQuestionAnsweringModelOutput, + FlaxSeq2SeqSequenceClassifierOutput, +) +from ...modeling_flax_utils import ( + ACT2FN, + FlaxPreTrainedModel, + append_call_sample_docstring, + append_replace_return_docstrings, + overwrite_call_docstring, +) +from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings +from .configuration_bart import BartConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "facebook/bart-base" +_CONFIG_FOR_DOC = "BartConfig" + + +BART_START_DOCSTRING = r""" + This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a Flax Linen + [flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a + regular Flax Module and refer to the Flax documentation for all matter related to general usage and behavior. + + Finally, this model supports inherent JAX features such as: + + - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit) + - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation) + - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap) + - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap) + + Parameters: + config ([`BartConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights. + dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`): + The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and + `jax.numpy.bfloat16` (on TPUs). + + This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If + specified all the computation will be performed with the given `dtype`. + + **Note that this only specifies the dtype of the computation and does not influence the dtype of model + parameters.** + + If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and + [`~FlaxPreTrainedModel.to_bf16`]. +""" + +BART_INPUTS_DOCSTRING = r""" + Args: + input_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + decoder_input_ids (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*): + Indices of decoder input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are decoder input IDs?](../glossary#decoder-input-ids) + + For translation and summarization training, `decoder_input_ids` should be provided. If no + `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right + for denoising pre-training following the paper. + decoder_attention_mask (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*): + Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also + be used by default. + + If you want to change padding behavior, you should modify to your needs. See diagram 1 in [the + paper](https://arxiv.org/abs/1910.13461) for more information on the default strategy. + position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + decoder_position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the + range `[0, config.max_position_embeddings - 1]`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +BART_ENCODE_INPUTS_DOCSTRING = r""" + Args: + input_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + +BART_DECODE_INPUTS_DOCSTRING = r""" + Args: + decoder_input_ids (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`): + Indices of decoder input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are decoder input IDs?](../glossary#decoder-input-ids) + + For translation and summarization training, `decoder_input_ids` should be provided. If no + `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right + for denoising pre-training following the paper. + encoder_outputs (`tuple(tuple(jnp.ndarray)`): + Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`) + `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of + hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. + encoder_attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + decoder_attention_mask (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*): + Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also + be used by default. + + If you want to change padding behavior, you should modify to your needs. See diagram 1 in [the + paper](https://arxiv.org/abs/1910.13461) for more information on the default strategy. + decoder_position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the + range `[0, config.max_position_embeddings - 1]`. + past_key_values (`Dict[str, np.ndarray]`, *optional*, returned by `init_cache` or when passing previous `past_key_values`): + Dictionary of pre-computed hidden-states (key and values in the attention blocks) that can be used for fast + auto-regressive decoding. Pre-computed key and value hidden-states are of shape *[batch_size, max_length]*. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +def shift_tokens_right(input_ids: jnp.ndarray, pad_token_id: int, decoder_start_token_id: int) -> jnp.ndarray: + """ + Shift input ids one token to the right. + """ + shifted_input_ids = jnp.zeros_like(input_ids) + shifted_input_ids = shifted_input_ids.at[:, 1:].set(input_ids[:, :-1]) + shifted_input_ids = shifted_input_ids.at[:, 0].set(decoder_start_token_id) + + shifted_input_ids = jnp.where(shifted_input_ids == -100, pad_token_id, shifted_input_ids) + return shifted_input_ids + + +class FlaxBartAttention(nn.Module): + config: BartConfig + embed_dim: int + num_heads: int + dropout: float = 0.0 + causal: bool = False + bias: bool = True + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self) -> None: + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" + f" and `num_heads`: {self.num_heads})." + ) + + dense = partial( + nn.Dense, + self.embed_dim, + use_bias=self.bias, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(self.config.init_std), + ) + + self.q_proj, self.k_proj, self.v_proj = dense(), dense(), dense() + self.out_proj = dense() + + self.dropout_layer = nn.Dropout(rate=self.dropout) + + if self.causal: + self.causal_mask = make_causal_mask( + jnp.ones((1, self.config.max_position_embeddings), dtype="bool"), dtype="bool" + ) + + def _split_heads(self, hidden_states): + return hidden_states.reshape(hidden_states.shape[:2] + (self.num_heads, self.head_dim)) + + def _merge_heads(self, hidden_states): + return hidden_states.reshape(hidden_states.shape[:2] + (self.embed_dim,)) + + @nn.compact + def _concatenate_to_cache(self, key, value, query, attention_mask): + """ + This function takes projected key, value states from a single input token and concatenates the states to cached + states from previous steps. This function is slightly adapted from the official Flax repository: + https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252 + """ + # detect if we're initializing by absence of existing cache data. + is_initialized = self.has_variable("cache", "cached_key") + cached_key = self.variable("cache", "cached_key", jnp.zeros, key.shape, key.dtype) + cached_value = self.variable("cache", "cached_value", jnp.zeros, value.shape, value.dtype) + cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32)) + + if is_initialized: + *batch_dims, max_length, num_heads, depth_per_head = cached_key.value.shape + # update key, value caches with our new 1d spatial slices + cur_index = cache_index.value + indices = (0,) * len(batch_dims) + (cur_index, 0, 0) + key = lax.dynamic_update_slice(cached_key.value, key, indices) + value = lax.dynamic_update_slice(cached_value.value, value, indices) + cached_key.value = key + cached_value.value = value + num_updated_cache_vectors = query.shape[1] + cache_index.value = cache_index.value + num_updated_cache_vectors + # causal mask for cached decoder self-attention: our single query position should only attend to those key positions that have already been generated and cached, not the remaining zero elements. + pad_mask = jnp.broadcast_to( + jnp.arange(max_length) < cur_index + num_updated_cache_vectors, + tuple(batch_dims) + (1, num_updated_cache_vectors, max_length), + ) + attention_mask = combine_masks(pad_mask, attention_mask) + return key, value, attention_mask + + def __call__( + self, + hidden_states: jnp.ndarray, + key_value_states: Optional[jnp.ndarray] = None, + attention_mask: Optional[jnp.ndarray] = None, + init_cache: bool = False, + deterministic: bool = True, + ) -> Tuple[jnp.ndarray]: + """Input shape: Batch x Time x Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + batch_size = hidden_states.shape[0] + + # get query proj + query_states = self.q_proj(hidden_states) + # get key, value proj + if is_cross_attention: + # cross_attentions + key_states = self.k_proj(key_value_states) + value_states = self.v_proj(key_value_states) + else: + # self_attention + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = self._split_heads(query_states) + key_states = self._split_heads(key_states) + value_states = self._split_heads(value_states) + + # handle cache prepare causal attention mask + if self.causal: + query_length, key_length = query_states.shape[1], key_states.shape[1] + if self.has_variable("cache", "cached_key"): + mask_shift = self.variables["cache"]["cache_index"] + max_decoder_length = self.variables["cache"]["cached_key"].shape[1] + causal_mask = lax.dynamic_slice( + self.causal_mask, (0, 0, mask_shift, 0), (1, 1, query_length, max_decoder_length) + ) + else: + causal_mask = self.causal_mask[:, :, :query_length, :key_length] + causal_mask = jnp.broadcast_to(causal_mask, (batch_size,) + causal_mask.shape[1:]) + + # combine masks if needed + if attention_mask is not None and self.causal: + attention_mask = jnp.broadcast_to(jnp.expand_dims(attention_mask, axis=(-3, -2)), causal_mask.shape) + attention_mask = combine_masks(attention_mask, causal_mask) + elif self.causal: + attention_mask = causal_mask + elif attention_mask is not None: + attention_mask = jnp.expand_dims(attention_mask, axis=(-3, -2)) + + # During fast autoregressive decoding, we feed one position at a time, + # and cache the keys and values step by step. + if self.causal and (self.has_variable("cache", "cached_key") or init_cache): + key_states, value_states, attention_mask = self._concatenate_to_cache( + key_states, value_states, query_states, attention_mask + ) + + # Convert the boolean attention mask to an attention bias. + if attention_mask is not None: + # attention mask in the form of attention bias + attention_bias = lax.select( + attention_mask > 0, + jnp.full(attention_mask.shape, 0.0).astype(self.dtype), + jnp.full(attention_mask.shape, jnp.finfo(self.dtype).min).astype(self.dtype), + ) + else: + attention_bias = None + + dropout_rng = None + if not deterministic and self.dropout > 0.0: + dropout_rng = self.make_rng("dropout") + + attn_weights = dot_product_attention_weights( + query_states, + key_states, + bias=attention_bias, + dropout_rng=dropout_rng, + dropout_rate=self.dropout, + broadcast_dropout=True, + deterministic=deterministic, + dtype=self.dtype, + precision=None, + ) + + attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, value_states) + attn_output = self._merge_heads(attn_output) + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights + + +class FlaxBartEncoderLayer(nn.Module): + config: BartConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self) -> None: + self.embed_dim = self.config.d_model + self.self_attn = FlaxBartAttention( + config=self.config, + embed_dim=self.embed_dim, + num_heads=self.config.encoder_attention_heads, + dropout=self.config.attention_dropout, + dtype=self.dtype, + ) + self.self_attn_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05) + self.dropout_layer = nn.Dropout(rate=self.config.dropout) + self.activation_fn = ACT2FN[self.config.activation_function] + self.activation_dropout_layer = nn.Dropout(rate=self.config.activation_dropout) + self.fc1 = nn.Dense( + self.config.encoder_ffn_dim, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(self.config.init_std), + ) + self.fc2 = nn.Dense( + self.embed_dim, dtype=self.dtype, kernel_init=jax.nn.initializers.normal(self.config.init_std) + ) + self.final_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05) + + def __call__( + self, + hidden_states: jnp.ndarray, + attention_mask: jnp.ndarray, + output_attentions: bool = True, + deterministic: bool = True, + ) -> Tuple[jnp.ndarray]: + residual = hidden_states + hidden_states, attn_weights = self.self_attn(hidden_states=hidden_states, attention_mask=attention_mask) + + hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = self.activation_dropout_layer(hidden_states, deterministic=deterministic) + hidden_states = self.fc2(hidden_states) + hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic) + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class FlaxBartEncoderLayerCollection(nn.Module): + config: BartConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.layers = [ + FlaxBartEncoderLayer(self.config, name=str(i), dtype=self.dtype) for i in range(self.config.encoder_layers) + ] + self.layerdrop = self.config.encoder_layerdrop + + def __call__( + self, + hidden_states, + attention_mask, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + all_attentions = () if output_attentions else None + all_hidden_states = () if output_hidden_states else None + + for encoder_layer in self.layers: + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + dropout_probability = random.uniform(0, 1) + if not deterministic and (dropout_probability < self.layerdrop): # skip the layer + layer_outputs = (None, None) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + output_attentions, + deterministic, + ) + hidden_states = layer_outputs[0] + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states += (hidden_states,) + + outputs = (hidden_states, all_hidden_states, all_attentions) + + if not return_dict: + return tuple(v for v in outputs if v is not None) + + return FlaxBaseModelOutput( + last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions + ) + + +class FlaxBartDecoderLayer(nn.Module): + config: BartConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self) -> None: + self.embed_dim = self.config.d_model + self.self_attn = FlaxBartAttention( + config=self.config, + embed_dim=self.embed_dim, + num_heads=self.config.decoder_attention_heads, + dropout=self.config.attention_dropout, + causal=True, + dtype=self.dtype, + ) + self.dropout_layer = nn.Dropout(rate=self.config.dropout) + self.activation_fn = ACT2FN[self.config.activation_function] + self.activation_dropout_layer = nn.Dropout(rate=self.config.activation_dropout) + + self.self_attn_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05) + self.encoder_attn = FlaxBartAttention( + config=self.config, + embed_dim=self.embed_dim, + num_heads=self.config.decoder_attention_heads, + dropout=self.config.attention_dropout, + dtype=self.dtype, + ) + self.encoder_attn_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05) + self.fc1 = nn.Dense( + self.config.decoder_ffn_dim, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(self.config.init_std), + ) + self.fc2 = nn.Dense( + self.embed_dim, dtype=self.dtype, kernel_init=jax.nn.initializers.normal(self.config.init_std) + ) + self.final_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05) + + def __call__( + self, + hidden_states: jnp.ndarray, + attention_mask: jnp.ndarray, + encoder_hidden_states: Optional[jnp.ndarray] = None, + encoder_attention_mask: Optional[jnp.ndarray] = None, + init_cache: bool = False, + output_attentions: bool = True, + deterministic: bool = True, + ) -> Tuple[jnp.ndarray]: + residual = hidden_states + + # Self Attention + hidden_states, self_attn_weights = self.self_attn( + hidden_states=hidden_states, attention_mask=attention_mask, init_cache=init_cache + ) + hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + # Cross-Attention Block + cross_attn_weights = None + if encoder_hidden_states is not None: + residual = hidden_states + + hidden_states, cross_attn_weights = self.encoder_attn( + hidden_states=hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + ) + hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic) + hidden_states = residual + hidden_states + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + # Fully Connected + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = self.activation_dropout_layer(hidden_states, deterministic=deterministic) + hidden_states = self.fc2(hidden_states) + hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic) + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights, cross_attn_weights) + + return outputs + + +class FlaxBartDecoderLayerCollection(nn.Module): + config: BartConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.layers = [ + FlaxBartDecoderLayer(self.config, name=str(i), dtype=self.dtype) for i in range(self.config.decoder_layers) + ] + self.layerdrop = self.config.decoder_layerdrop + + def __call__( + self, + hidden_states, + attention_mask, + encoder_hidden_states: Optional[jnp.ndarray] = None, + encoder_attention_mask: Optional[jnp.ndarray] = None, + deterministic: bool = True, + init_cache: bool = False, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None + + for decoder_layer in self.layers: + if output_hidden_states: + all_hidden_states += (hidden_states,) + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + dropout_probability = random.uniform(0, 1) + if not deterministic and (dropout_probability < self.layerdrop): + layer_outputs = (None, None, None) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + init_cache=init_cache, + output_attentions=output_attentions, + deterministic=deterministic, + ) + + hidden_states = layer_outputs[0] + if output_attentions: + all_self_attns += (layer_outputs[1],) + + if encoder_hidden_states is not None: + all_cross_attentions += (layer_outputs[2],) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + outputs = [hidden_states, all_hidden_states, all_self_attns, all_cross_attentions] + + if not return_dict: + return tuple(v for v in outputs if v is not None) + + return FlaxBaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attentions, + ) + + +class FlaxBartClassificationHead(nn.Module): + """Head for sentence-level classification tasks.""" + + config: BartConfig + inner_dim: int + num_classes: int + pooler_dropout: float + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.dense = nn.Dense( + self.inner_dim, dtype=self.dtype, kernel_init=jax.nn.initializers.normal(self.config.init_std) + ) + self.dropout = nn.Dropout(rate=self.pooler_dropout) + self.out_proj = nn.Dense( + self.num_classes, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(self.config.init_std), + ) + + def __call__(self, hidden_states: jnp.ndarray, deterministic: bool): + hidden_states = self.dropout(hidden_states, deterministic=deterministic) + hidden_states = self.dense(hidden_states) + hidden_states = jnp.tanh(hidden_states) + hidden_states = self.dropout(hidden_states, deterministic=deterministic) + hidden_states = self.out_proj(hidden_states) + return hidden_states + + +class FlaxBartEncoder(nn.Module): + config: BartConfig + embed_tokens: nn.Embed + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.dropout_layer = nn.Dropout(rate=self.config.dropout) + + embed_dim = self.config.d_model + self.padding_idx = self.config.pad_token_id + self.max_source_positions = self.config.max_position_embeddings + self.embed_scale = math.sqrt(embed_dim) if self.config.scale_embedding else 1.0 + + # Bart is set up so that if padding_idx is specified then offset the embedding ids by 2 + # and adjust num_embeddings appropriately. Other models don't have this hack + self.offset = 2 + self.embed_positions = nn.Embed( + self.config.max_position_embeddings + self.offset, + embed_dim, + embedding_init=jax.nn.initializers.normal(self.config.init_std), + dtype=self.dtype, + ) + self.layers = FlaxBartEncoderLayerCollection(self.config, self.dtype) + self.layernorm_embedding = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05) + + def __call__( + self, + input_ids, + attention_mask, + position_ids, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + deterministic: bool = True, + ): + input_shape = input_ids.shape + input_ids = input_ids.reshape(-1, input_shape[-1]) + + inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale + + embed_pos = self.embed_positions(position_ids + self.offset) + + hidden_states = inputs_embeds + embed_pos + hidden_states = self.layernorm_embedding(hidden_states) + hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic) + + outputs = self.layers( + hidden_states, + attention_mask, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if not return_dict: + return outputs + + return FlaxBaseModelOutput( + last_hidden_state=outputs.last_hidden_state, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class FlaxBartDecoder(nn.Module): + config: BartConfig + embed_tokens: nn.Embed + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.dropout_layer = nn.Dropout(rate=self.config.dropout) + + embed_dim = self.config.d_model + self.padding_idx = self.config.pad_token_id + self.max_target_positions = self.config.max_position_embeddings + self.embed_scale = math.sqrt(self.config.d_model) if self.config.scale_embedding else 1.0 + + # Bart is set up so that if padding_idx is specified then offset the embedding ids by 2 + # and adjust num_embeddings appropriately. Other models don't have this hack + self.offset = 2 + self.embed_positions = nn.Embed( + self.config.max_position_embeddings + self.offset, + embed_dim, + embedding_init=jax.nn.initializers.normal(self.config.init_std), + dtype=self.dtype, + ) + + self.layers = FlaxBartDecoderLayerCollection(self.config, self.dtype) + self.layernorm_embedding = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05) + + def __call__( + self, + input_ids, + attention_mask, + position_ids, + encoder_hidden_states: Optional[jnp.ndarray] = None, + encoder_attention_mask: Optional[jnp.ndarray] = None, + init_cache: bool = False, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + deterministic: bool = True, + ): + input_shape = input_ids.shape + input_ids = input_ids.reshape(-1, input_shape[-1]) + + inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale + + # embed positions + positions = self.embed_positions(position_ids + self.offset) + + hidden_states = inputs_embeds + positions + hidden_states = self.layernorm_embedding(hidden_states) + + hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic) + + outputs = self.layers( + hidden_states, + attention_mask, + encoder_hidden_states, + encoder_attention_mask, + deterministic=deterministic, + init_cache=init_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if not return_dict: + return outputs + + return FlaxBaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=outputs.last_hidden_state, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + +class FlaxBartModule(nn.Module): + config: BartConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.shared = nn.Embed( + self.config.vocab_size, + self.config.d_model, + embedding_init=jax.nn.initializers.normal(self.config.init_std), + dtype=self.dtype, + ) + + self.encoder = FlaxBartEncoder(self.config, dtype=self.dtype, embed_tokens=self.shared) + self.decoder = FlaxBartDecoder(self.config, dtype=self.dtype, embed_tokens=self.shared) + + def _get_encoder_module(self): + return self.encoder + + def _get_decoder_module(self): + return self.decoder + + def __call__( + self, + input_ids, + attention_mask, + decoder_input_ids, + decoder_attention_mask, + position_ids, + decoder_position_ids, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + deterministic: bool = True, + ): + encoder_outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + deterministic=deterministic, + ) + + decoder_outputs = self.decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + position_ids=decoder_position_ids, + encoder_hidden_states=encoder_outputs[0], + encoder_attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + deterministic=deterministic, + ) + + if not return_dict: + return decoder_outputs + encoder_outputs + + return FlaxSeq2SeqModelOutput( + last_hidden_state=decoder_outputs.last_hidden_state, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + +class FlaxBartPreTrainedModel(FlaxPreTrainedModel): + config_class = BartConfig + base_model_prefix: str = "model" + module_class: nn.Module = None + + def __init__( + self, + config: BartConfig, + input_shape: Tuple[int] = (1, 1), + seed: int = 0, + dtype: jnp.dtype = jnp.float32, + _do_init: bool = True, + **kwargs, + ): + module = self.module_class(config=config, dtype=dtype, **kwargs) + super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) + + def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict: + # init input tensors + input_ids = jnp.zeros(input_shape, dtype="i4") + # make sure initialization pass will work for FlaxBartForSequenceClassificationModule + input_ids = input_ids.at[(..., -1)].set(self.config.eos_token_id) + attention_mask = jnp.ones_like(input_ids) + decoder_input_ids = input_ids + decoder_attention_mask = jnp.ones_like(input_ids) + + batch_size, sequence_length = input_ids.shape + position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)) + decoder_position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)) + + params_rng, dropout_rng = jax.random.split(rng) + rngs = {"params": params_rng, "dropout": dropout_rng} + + random_params = self.module.init( + rngs, + input_ids, + attention_mask, + decoder_input_ids, + decoder_attention_mask, + position_ids, + decoder_position_ids, + )["params"] + + if params is not None: + random_params = flatten_dict(unfreeze(random_params)) + params = flatten_dict(unfreeze(params)) + for missing_key in self._missing_keys: + params[missing_key] = random_params[missing_key] + self._missing_keys = set() + return freeze(unflatten_dict(params)) + else: + return random_params + + def init_cache(self, batch_size, max_length, encoder_outputs): + r""" + Args: + batch_size (`int`): + batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache. + max_length (`int`): + maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized + cache. + encoder_outputs (`Union[FlaxBaseModelOutput, tuple(tuple(jnp.ndarray)]`): + `encoder_outputs` consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: + `attentions`). `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) + is a sequence of hidden-states at the output of the last layer of the encoder. Used in the + cross-attention of the decoder. + """ + # init input variables to retrieve cache + decoder_input_ids = jnp.ones((batch_size, max_length), dtype="i4") + decoder_attention_mask = jnp.ones_like(decoder_input_ids) + decoder_position_ids = jnp.broadcast_to( + jnp.arange(jnp.atleast_2d(decoder_input_ids).shape[-1]), decoder_input_ids.shape + ) + + def _decoder_forward(module, decoder_input_ids, decoder_attention_mask, decoder_position_ids, **kwargs): + decoder_module = module._get_decoder_module() + return decoder_module( + decoder_input_ids, + decoder_attention_mask, + decoder_position_ids, + **kwargs, + ) + + init_variables = self.module.init( + jax.random.PRNGKey(0), + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + decoder_position_ids=decoder_position_ids, + encoder_hidden_states=encoder_outputs[0], + init_cache=True, + method=_decoder_forward, # we only need to call the decoder to init the cache + ) + return unfreeze(init_variables["cache"]) + + @add_start_docstrings(BART_ENCODE_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=FlaxBaseModelOutput, config_class=BartConfig) + def encode( + self, + input_ids: jnp.ndarray, + attention_mask: Optional[jnp.ndarray] = None, + position_ids: Optional[jnp.ndarray] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + train: bool = False, + params: dict = None, + dropout_rng: PRNGKey = None, + ): + r""" + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, FlaxBartForConditionalGeneration + + >>> model = FlaxBartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn") + >>> tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn") + + >>> text = "My friends are cool but they eat too many carbs." + >>> inputs = tokenizer(text, max_length=1024, return_tensors="jax") + >>> encoder_outputs = model.encode(**inputs) + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.return_dict + + if attention_mask is None: + attention_mask = jnp.ones_like(input_ids) + if position_ids is None: + batch_size, sequence_length = input_ids.shape + position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)) + + # Handle any PRNG if needed + rngs = {} + if dropout_rng is not None: + rngs["dropout"] = dropout_rng + + def _encoder_forward(module, input_ids, attention_mask, position_ids, **kwargs): + encode_module = module._get_encoder_module() + return encode_module(input_ids, attention_mask, position_ids, **kwargs) + + return self.module.apply( + {"params": params or self.params}, + input_ids=jnp.array(input_ids, dtype="i4"), + attention_mask=jnp.array(attention_mask, dtype="i4"), + position_ids=jnp.array(position_ids, dtype="i4"), + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + deterministic=not train, + rngs=rngs, + method=_encoder_forward, + ) + + @add_start_docstrings(BART_DECODE_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=FlaxBaseModelOutputWithPastAndCrossAttentions, config_class=BartConfig) + def decode( + self, + decoder_input_ids, + encoder_outputs, + encoder_attention_mask: Optional[jnp.ndarray] = None, + decoder_attention_mask: Optional[jnp.ndarray] = None, + decoder_position_ids: Optional[jnp.ndarray] = None, + past_key_values: dict = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + train: bool = False, + params: dict = None, + dropout_rng: PRNGKey = None, + ): + r""" + Returns: + + Example: + + ```python + >>> import jax.numpy as jnp + >>> from transformers import AutoTokenizer, FlaxBartForConditionalGeneration + + >>> model = FlaxBartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn") + >>> tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn") + + >>> text = "My friends are cool but they eat too many carbs." + >>> inputs = tokenizer(text, max_length=1024, return_tensors="jax") + >>> encoder_outputs = model.encode(**inputs) + + >>> decoder_start_token_id = model.config.decoder_start_token_id + >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id + + >>> outputs = model.decode(decoder_input_ids, encoder_outputs) + >>> last_decoder_hidden_states = outputs.last_hidden_state + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.return_dict + + encoder_hidden_states = encoder_outputs[0] + if encoder_attention_mask is None: + batch_size, sequence_length = encoder_hidden_states.shape[:2] + encoder_attention_mask = jnp.ones((batch_size, sequence_length)) + + batch_size, sequence_length = decoder_input_ids.shape + if decoder_attention_mask is None: + decoder_attention_mask = jnp.ones((batch_size, sequence_length)) + + if decoder_position_ids is None: + if past_key_values is not None: + raise ValueError("Make sure to provide `decoder_position_ids` when passing `past_key_values`.") + + decoder_position_ids = jnp.broadcast_to( + jnp.arange(sequence_length)[None, :], (batch_size, sequence_length) + ) + + # Handle any PRNG if needed + rngs = {} + if dropout_rng is not None: + rngs["dropout"] = dropout_rng + + inputs = {"params": params or self.params} + + # if past_key_values are passed then cache is already initialized a private flag init_cache has to be + # passed down to ensure cache is used. It has to be made sure that cache is marked as mutable so that + # it can be changed by FlaxBartAttention module + if past_key_values: + inputs["cache"] = past_key_values + mutable = ["cache"] + else: + mutable = False + + def _decoder_forward(module, decoder_input_ids, decoder_attention_mask, decoder_position_ids, **kwargs): + decoder_module = module._get_decoder_module() + return decoder_module( + decoder_input_ids, + decoder_attention_mask, + decoder_position_ids, + **kwargs, + ) + + outputs = self.module.apply( + inputs, + decoder_input_ids=jnp.array(decoder_input_ids, dtype="i4"), + decoder_attention_mask=jnp.array(decoder_attention_mask, dtype="i4"), + decoder_position_ids=jnp.array(decoder_position_ids, dtype="i4"), + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=jnp.array(encoder_attention_mask, dtype="i4"), + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + deterministic=not train, + rngs=rngs, + mutable=mutable, + method=_decoder_forward, + ) + + # add updated cache to model output + if past_key_values is not None and return_dict: + outputs, past = outputs + outputs["past_key_values"] = unfreeze(past["cache"]) + return outputs + elif past_key_values is not None and not return_dict: + outputs, past = outputs + outputs = outputs[:1] + (unfreeze(past["cache"]),) + outputs[1:] + + return outputs + + @add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING) + def __call__( + self, + input_ids: jnp.ndarray, + attention_mask: Optional[jnp.ndarray] = None, + decoder_input_ids: Optional[jnp.ndarray] = None, + decoder_attention_mask: Optional[jnp.ndarray] = None, + position_ids: Optional[jnp.ndarray] = None, + decoder_position_ids: Optional[jnp.ndarray] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + train: bool = False, + params: dict = None, + dropout_rng: PRNGKey = None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.return_dict + + # prepare encoder inputs + if attention_mask is None: + attention_mask = jnp.ones_like(input_ids) + if position_ids is None: + batch_size, sequence_length = input_ids.shape + position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)) + + # prepare decoder inputs + if decoder_input_ids is None: + decoder_input_ids = shift_tokens_right( + input_ids, self.config.pad_token_id, decoder_start_token_id=self.config.decoder_start_token_id + ) + if decoder_attention_mask is None: + decoder_attention_mask = jnp.ones_like(decoder_input_ids) + if decoder_position_ids is None: + batch_size, sequence_length = decoder_input_ids.shape + decoder_position_ids = jnp.broadcast_to( + jnp.arange(sequence_length)[None, :], (batch_size, sequence_length) + ) + + # Handle any PRNG if needed + rngs = {"dropout": dropout_rng} if dropout_rng is not None else {} + + return self.module.apply( + {"params": params or self.params}, + input_ids=jnp.array(input_ids, dtype="i4"), + attention_mask=jnp.array(attention_mask, dtype="i4"), + position_ids=jnp.array(position_ids, dtype="i4"), + decoder_input_ids=jnp.array(decoder_input_ids, dtype="i4"), + decoder_attention_mask=jnp.array(decoder_attention_mask, dtype="i4"), + decoder_position_ids=jnp.array(decoder_position_ids, dtype="i4"), + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + deterministic=not train, + rngs=rngs, + ) + + +@add_start_docstrings( + "The bare Bart Model transformer outputting raw hidden-states without any specific head on top.", + BART_START_DOCSTRING, +) +class FlaxBartModel(FlaxBartPreTrainedModel): + config: BartConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + module_class = FlaxBartModule + + +append_call_sample_docstring(FlaxBartModel, _CHECKPOINT_FOR_DOC, FlaxSeq2SeqModelOutput, _CONFIG_FOR_DOC) + + +class FlaxBartForConditionalGenerationModule(nn.Module): + config: BartConfig + dtype: jnp.dtype = jnp.float32 + bias_init: Callable[..., jnp.ndarray] = jax.nn.initializers.zeros + + def setup(self): + self.model = FlaxBartModule(config=self.config, dtype=self.dtype) + self.lm_head = nn.Dense( + self.model.shared.num_embeddings, + use_bias=False, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(self.config.init_std), + ) + self.final_logits_bias = self.param("final_logits_bias", self.bias_init, (1, self.model.shared.num_embeddings)) + + def _get_encoder_module(self): + return self.model.encoder + + def _get_decoder_module(self): + return self.model.decoder + + def __call__( + self, + input_ids, + attention_mask, + decoder_input_ids, + decoder_attention_mask, + position_ids, + decoder_position_ids, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + deterministic: bool = True, + ): + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + position_ids=position_ids, + decoder_position_ids=decoder_position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + deterministic=deterministic, + ) + + hidden_states = outputs[0] + + if self.config.tie_word_embeddings: + shared_embedding = self.model.variables["params"]["shared"]["embedding"] + lm_logits = self.lm_head.apply({"params": {"kernel": shared_embedding.T}}, hidden_states) + else: + lm_logits = self.lm_head(hidden_states) + + lm_logits += jax.lax.stop_gradient(self.final_logits_bias.astype(self.dtype)) + + if not return_dict: + output = (lm_logits,) + outputs[1:] + return output + + return FlaxSeq2SeqLMOutput( + logits=lm_logits, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + +@add_start_docstrings( + "The BART Model with a language modeling head. Can be used for summarization.", BART_START_DOCSTRING +) +class FlaxBartForConditionalGeneration(FlaxBartPreTrainedModel): + module_class = FlaxBartForConditionalGenerationModule + dtype: jnp.dtype = jnp.float32 + + @add_start_docstrings(BART_DECODE_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=FlaxCausalLMOutputWithCrossAttentions, config_class=BartConfig) + def decode( + self, + decoder_input_ids, + encoder_outputs, + encoder_attention_mask: Optional[jnp.ndarray] = None, + decoder_attention_mask: Optional[jnp.ndarray] = None, + decoder_position_ids: Optional[jnp.ndarray] = None, + past_key_values: dict = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + train: bool = False, + params: dict = None, + dropout_rng: PRNGKey = None, + ): + r""" + Returns: + + Example: + + ```python + >>> import jax.numpy as jnp + >>> from transformers import AutoTokenizer, FlaxBartForConditionalGeneration + + >>> model = FlaxBartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn") + >>> tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn") + + >>> text = "My friends are cool but they eat too many carbs." + >>> inputs = tokenizer(text, max_length=1024, return_tensors="jax") + >>> encoder_outputs = model.encode(**inputs) + + >>> decoder_start_token_id = model.config.decoder_start_token_id + >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id + + >>> outputs = model.decode(decoder_input_ids, encoder_outputs) + >>> logits = outputs.logits + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.return_dict + + encoder_hidden_states = encoder_outputs[0] + if encoder_attention_mask is None: + batch_size, sequence_length = encoder_hidden_states.shape[:2] + encoder_attention_mask = jnp.ones((batch_size, sequence_length)) + + batch_size, sequence_length = decoder_input_ids.shape + if decoder_attention_mask is None: + decoder_attention_mask = jnp.ones((batch_size, sequence_length)) + + if decoder_position_ids is None: + if past_key_values is not None: + raise ValueError("Make sure to provide `decoder_position_ids` when passing `past_key_values`.") + + decoder_position_ids = jnp.broadcast_to( + jnp.arange(sequence_length)[None, :], (batch_size, sequence_length) + ) + + # Handle any PRNG if needed + rngs = {} + if dropout_rng is not None: + rngs["dropout"] = dropout_rng + + inputs = {"params": params or self.params} + + # if past_key_values are passed then cache is already initialized a private flag init_cache has to be + # passed down to ensure cache is used. It has to be made sure that cache is marked as mutable so that + # it can be changed by FlaxBartAttention module + if past_key_values: + inputs["cache"] = past_key_values + mutable = ["cache"] + else: + mutable = False + + def _decoder_forward(module, decoder_input_ids, decoder_attention_mask, decoder_position_ids, **kwargs): + decoder_module = module._get_decoder_module() + outputs = decoder_module( + decoder_input_ids, + decoder_attention_mask, + decoder_position_ids, + **kwargs, + ) + hidden_states = outputs[0] + + if self.config.tie_word_embeddings: + shared_embedding = module.model.variables["params"]["shared"]["embedding"] + lm_logits = module.lm_head.apply({"params": {"kernel": shared_embedding.T}}, hidden_states) + else: + lm_logits = module.lm_head(hidden_states) + + lm_logits += module.final_logits_bias.astype(self.dtype) + return lm_logits, outputs + + outputs = self.module.apply( + inputs, + decoder_input_ids=jnp.array(decoder_input_ids, dtype="i4"), + decoder_attention_mask=jnp.array(decoder_attention_mask, dtype="i4"), + decoder_position_ids=jnp.array(decoder_position_ids, dtype="i4"), + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=jnp.array(encoder_attention_mask, dtype="i4"), + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + deterministic=not train, + rngs=rngs, + mutable=mutable, + method=_decoder_forward, + ) + + if past_key_values is None: + lm_logits, decoder_outputs = outputs + else: + (lm_logits, decoder_outputs), past = outputs + + if return_dict: + outputs = FlaxCausalLMOutputWithCrossAttentions( + logits=lm_logits, + hidden_states=decoder_outputs.hidden_states, + attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + ) + else: + outputs = (lm_logits,) + decoder_outputs[1:] + + # add updated cache to model output + if past_key_values is not None and return_dict: + outputs["past_key_values"] = unfreeze(past["cache"]) + return outputs + elif past_key_values is not None and not return_dict: + outputs = outputs[:1] + (unfreeze(past["cache"]),) + outputs[1:] + + return outputs + + def prepare_inputs_for_generation( + self, + decoder_input_ids, + max_length, + attention_mask: Optional[jax.Array] = None, + decoder_attention_mask: Optional[jax.Array] = None, + encoder_outputs=None, + **kwargs, + ): + # initializing the cache + batch_size, seq_length = decoder_input_ids.shape + + past_key_values = self.init_cache(batch_size, max_length, encoder_outputs) + # Note that usually one would have to put 0's in the attention_mask for x > input_ids.shape[-1] and x < cache_length. + # But since the decoder uses a causal mask, those positions are masked anyways. + # Thus we can create a single static attention_mask here, which is more efficient for compilation + extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4") + if decoder_attention_mask is not None: + position_ids = decoder_attention_mask.cumsum(axis=-1) - 1 + extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, decoder_attention_mask, (0, 0)) + else: + position_ids = jnp.broadcast_to(jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length)) + + return { + "past_key_values": past_key_values, + "encoder_outputs": encoder_outputs, + "encoder_attention_mask": attention_mask, + "decoder_attention_mask": extended_attention_mask, + "decoder_position_ids": position_ids, + } + + def update_inputs_for_generation(self, model_outputs, model_kwargs): + model_kwargs["past_key_values"] = model_outputs.past_key_values + model_kwargs["decoder_position_ids"] = model_kwargs["decoder_position_ids"][:, -1:] + 1 + return model_kwargs + + +FLAX_BART_CONDITIONAL_GENERATION_DOCSTRING = """ + Returns: + + Summarization example: + + ```python + >>> from transformers import AutoTokenizer, FlaxBartForConditionalGeneration + + >>> model = FlaxBartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn") + >>> tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large-cnn") + + >>> ARTICLE_TO_SUMMARIZE = "My friends are cool but they eat too many carbs." + >>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors="np") + + >>> # Generate Summary + >>> summary_ids = model.generate(inputs["input_ids"]).sequences + >>> print(tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)) + ``` + + Mask filling example: + + ```python + >>> import jax + >>> from transformers import AutoTokenizer, FlaxBartForConditionalGeneration + + >>> model = FlaxBartForConditionalGeneration.from_pretrained("facebook/bart-large") + >>> tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large") + + >>> TXT = "My friends are but they eat too many carbs." + >>> input_ids = tokenizer([TXT], return_tensors="jax")["input_ids"] + + >>> logits = model(input_ids).logits + >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero()[0].item() + >>> probs = jax.nn.softmax(logits[0, masked_index], axis=0) + >>> values, predictions = jax.lax.top_k(probs, k=1) + + >>> tokenizer.decode(predictions).split() + ``` +""" + +overwrite_call_docstring( + FlaxBartForConditionalGeneration, BART_INPUTS_DOCSTRING + FLAX_BART_CONDITIONAL_GENERATION_DOCSTRING +) +append_replace_return_docstrings( + FlaxBartForConditionalGeneration, output_type=FlaxSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC +) + + +class FlaxBartForSequenceClassificationModule(nn.Module): + config: BartConfig + dtype: jnp.dtype = jnp.float32 + num_labels: Optional[int] = None + + def setup(self): + self.model = FlaxBartModule(config=self.config, dtype=self.dtype) + self.classification_head = FlaxBartClassificationHead( + config=self.config, + inner_dim=self.config.d_model, + num_classes=self.num_labels if self.num_labels is not None else self.config.num_labels, + pooler_dropout=self.config.classifier_dropout, + ) + + def _get_encoder_module(self): + return self.model.encoder + + def _get_decoder_module(self): + return self.model.decoder + + def __call__( + self, + input_ids, + attention_mask, + decoder_input_ids, + decoder_attention_mask, + position_ids, + decoder_position_ids, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + deterministic: bool = True, + ): + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + position_ids=position_ids, + decoder_position_ids=decoder_position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + deterministic=deterministic, + ) + + hidden_states = outputs[0] # last hidden state + + eos_mask = jnp.where(input_ids == self.config.eos_token_id, 1, 0) + + # The first condition is necessary to overcome jax._src.errors.ConcretizationTypeError during JIT compilation + if not isinstance(eos_mask, jax.interpreters.partial_eval.DynamicJaxprTracer): + if len(jnp.unique(eos_mask.sum(1))) > 1: + raise ValueError("All examples must have the same number of tokens.") + + if any(eos_mask.sum(1) == 0): + raise ValueError("There are missing tokens in input_ids") + + # Ensure to keep 1 only for the last token for each example + eos_mask_noised = eos_mask + jnp.arange(eos_mask.shape[1]) * 1e-6 + eos_mask = jnp.where(eos_mask_noised == eos_mask_noised.max(1).reshape(-1, 1), 1, 0) + + sentence_representation = jnp.einsum("ijk, ij -> ijk", hidden_states, eos_mask).sum(1) + logits = self.classification_head(sentence_representation, deterministic=deterministic) + + if not return_dict: + output = (logits,) + outputs[1:] + return output + + return FlaxSeq2SeqSequenceClassifierOutput( + logits=logits, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + +@add_start_docstrings( + """ + Bart model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE + tasks. + """, + BART_START_DOCSTRING, +) +class FlaxBartForSequenceClassification(FlaxBartPreTrainedModel): + module_class = FlaxBartForSequenceClassificationModule + dtype = jnp.float32 + + +append_call_sample_docstring( + FlaxBartForSequenceClassification, + _CHECKPOINT_FOR_DOC, + FlaxSeq2SeqSequenceClassifierOutput, + _CONFIG_FOR_DOC, +) + + +class FlaxBartForQuestionAnsweringModule(nn.Module): + config: BartConfig + dtype: jnp.dtype = jnp.float32 + num_labels = 2 + + def setup(self): + self.model = FlaxBartModule(config=self.config, dtype=self.dtype) + self.qa_outputs = nn.Dense( + self.num_labels, dtype=self.dtype, kernel_init=jax.nn.initializers.normal(self.config.init_std) + ) + + def _get_encoder_module(self): + return self.model.encoder + + def _get_decoder_module(self): + return self.model.decoder + + def __call__( + self, + input_ids, + attention_mask, + decoder_input_ids, + decoder_attention_mask, + position_ids, + decoder_position_ids, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + deterministic: bool = True, + ): + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + position_ids=position_ids, + decoder_position_ids=decoder_position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + deterministic=deterministic, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = jnp.split(logits, logits.shape[-1], axis=-1) + start_logits = start_logits.squeeze(-1) + end_logits = end_logits.squeeze(-1) + + if not return_dict: + output = (start_logits, end_logits) + outputs[1:] + return output + + return FlaxSeq2SeqQuestionAnsweringModelOutput( + start_logits=start_logits, + end_logits=end_logits, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + +@add_start_docstrings( + """ + BART Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layer on top of the hidden-states output to compute `span start logits` and `span end logits`). + """, + BART_START_DOCSTRING, +) +class FlaxBartForQuestionAnswering(FlaxBartPreTrainedModel): + module_class = FlaxBartForQuestionAnsweringModule + dtype = jnp.float32 + + +append_call_sample_docstring( + FlaxBartForQuestionAnswering, + _CHECKPOINT_FOR_DOC, + FlaxSeq2SeqQuestionAnsweringModelOutput, + _CONFIG_FOR_DOC, +) + + +class FlaxBartDecoderPreTrainedModel(FlaxPreTrainedModel): + config_class = BartConfig + base_model_prefix: str = "model" + module_class: nn.Module = None + + def __init__( + self, + config: BartConfig, + input_shape: Tuple[int] = (1, 1), + seed: int = 0, + dtype: jnp.dtype = jnp.float32, + _do_init: bool = True, + **kwargs, + ): + config.is_decoder = True + config.is_encoder_decoder = False + module = self.module_class(config=config, dtype=dtype, **kwargs) + super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) + + def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict: + # init input tensors + input_ids = jnp.zeros(input_shape, dtype="i4") + attention_mask = jnp.ones_like(input_ids) + + batch_size, sequence_length = input_ids.shape + position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)) + + params_rng, dropout_rng = jax.random.split(rng) + rngs = {"params": params_rng, "dropout": dropout_rng} + encoder_hidden_states = jnp.zeros(input_shape + (self.config.d_model,)) + encoder_attention_mask = attention_mask + module_init_outputs = self.module.init( + rngs, + input_ids, + attention_mask, + position_ids, + encoder_hidden_states, + encoder_attention_mask, + return_dict=False, + ) + return module_init_outputs["params"] + + def init_cache(self, batch_size, max_length): + r""" + Args: + batch_size (`int`): + batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache. + max_length (`int`): + maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized + cache. + """ + # init input variables to retrieve cache + input_ids = jnp.ones((batch_size, max_length), dtype="i4") + attention_mask = jnp.ones_like(input_ids, dtype="i4") + position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape) + + init_variables = self.module.init( + jax.random.PRNGKey(0), input_ids, attention_mask, position_ids, return_dict=False, init_cache=True + ) + return unfreeze(init_variables["cache"]) + + @add_start_docstrings_to_model_forward(BART_DECODE_INPUTS_DOCSTRING) + def __call__( + self, + input_ids: jnp.ndarray, + attention_mask: Optional[jnp.ndarray] = None, + position_ids: Optional[jnp.ndarray] = None, + encoder_hidden_states: Optional[jnp.ndarray] = None, + encoder_attention_mask: Optional[jnp.ndarray] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + train: bool = False, + params: dict = None, + past_key_values: dict = None, + dropout_rng: PRNGKey = None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.return_dict + + if encoder_hidden_states is not None and encoder_attention_mask is None: + batch_size, sequence_length = encoder_hidden_states.shape[:2] + encoder_attention_mask = jnp.ones((batch_size, sequence_length)) + + # prepare decoder inputs + if attention_mask is None: + attention_mask = jnp.ones_like(input_ids) + if position_ids is None: + batch_size, sequence_length = input_ids.shape + position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)) + + # Handle any PRNG if needed + rngs = {"dropout": dropout_rng} if dropout_rng is not None else {} + + inputs = {"params": params or self.params} + + # if past_key_values are passed then cache is already initialized a private flag init_cache has to be passed + # down to ensure cache is used. It has to be made sure that cache is marked as mutable so that it can be + # changed by FlaxBartAttention module + if past_key_values: + inputs["cache"] = past_key_values + mutable = ["cache"] + else: + mutable = False + + outputs = self.module.apply( + inputs, + input_ids=jnp.array(input_ids, dtype="i4"), + attention_mask=jnp.array(attention_mask, dtype="i4"), + position_ids=jnp.array(position_ids, dtype="i4"), + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + deterministic=not train, + rngs=rngs, + mutable=mutable, + ) + + # add updated cache to model output + if past_key_values is not None and return_dict: + outputs, past_key_values = outputs + outputs["past_key_values"] = unfreeze(past_key_values["cache"]) + return outputs + elif past_key_values is not None and not return_dict: + outputs, past_key_values = outputs + outputs = outputs[:1] + (unfreeze(past_key_values["cache"]),) + outputs[1:] + + return outputs + + +class FlaxBartDecoderWrapper(nn.Module): + """ + This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is + used in combination with the [`EncoderDecoderModel`] framework. + """ + + config: BartConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + embed_dim = self.config.d_model + embed_tokens = nn.Embed( + self.config.vocab_size, + embed_dim, + embedding_init=jax.nn.initializers.normal(self.config.init_std), + dtype=self.dtype, + ) + self.decoder = FlaxBartDecoder(config=self.config, embed_tokens=embed_tokens, dtype=self.dtype) + + def __call__(self, *args, **kwargs): + return self.decoder(*args, **kwargs) + + +class FlaxBartForCausalLMModule(nn.Module): + config: BartConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.model = FlaxBartDecoderWrapper(config=self.config, dtype=self.dtype) + self.lm_head = nn.Dense( + self.config.vocab_size, + use_bias=False, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(self.config.init_std), + ) + + def __call__( + self, + input_ids, + attention_mask, + position_ids, + encoder_hidden_states: Optional[jnp.ndarray] = None, + encoder_attention_mask: Optional[jnp.ndarray] = None, + init_cache: bool = False, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + deterministic: bool = True, + ): + outputs = self.model( + input_ids, + attention_mask, + position_ids, + encoder_hidden_states, + encoder_attention_mask, + deterministic=deterministic, + init_cache=init_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + + if self.config.tie_word_embeddings: + shared_embedding = self.model.variables["params"]["decoder"]["embed_tokens"]["embedding"] + lm_logits = self.lm_head.apply({"params": {"kernel": shared_embedding.T}}, hidden_states) + else: + lm_logits = self.lm_head(hidden_states) + + if not return_dict: + return (lm_logits,) + outputs[1:] + + return FlaxCausalLMOutputWithCrossAttentions( + logits=lm_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + +@add_start_docstrings( + """ + Bart Decoder Model with a language modeling head on top (linear layer with weights tied to the input embeddings) + e.g for autoregressive tasks. + """, + BART_START_DOCSTRING, +) +class FlaxBartForCausalLM(FlaxBartDecoderPreTrainedModel): + module_class = FlaxBartForCausalLMModule + + def prepare_inputs_for_generation(self, input_ids, max_length, attention_mask: Optional[jax.Array] = None): + # initializing the cache + batch_size, seq_length = input_ids.shape + + past_key_values = self.init_cache(batch_size, max_length) + # Note that usually one would have to put 0's in the attention_mask for x > input_ids.shape[-1] and x < cache_length. + # But since the decoder uses a causal mask, those positions are masked anyway. + # Thus, we can create a single static attention_mask here, which is more efficient for compilation + extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4") + if attention_mask is not None: + position_ids = attention_mask.cumsum(axis=-1) - 1 + extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, attention_mask, (0, 0)) + else: + position_ids = jnp.broadcast_to(jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length)) + + return { + "past_key_values": past_key_values, + "attention_mask": extended_attention_mask, + "position_ids": position_ids, + } + + def update_inputs_for_generation(self, model_outputs, model_kwargs): + model_kwargs["past_key_values"] = model_outputs.past_key_values + model_kwargs["position_ids"] = model_kwargs["position_ids"][:, -1:] + 1 + return model_kwargs + + +append_call_sample_docstring( + FlaxBartForCausalLM, + _CHECKPOINT_FOR_DOC, + FlaxCausalLMOutputWithCrossAttentions, + _CONFIG_FOR_DOC, +) + + +__all__ = [ + "FlaxBartDecoderPreTrainedModel", + "FlaxBartForCausalLM", + "FlaxBartForConditionalGeneration", + "FlaxBartForQuestionAnswering", + "FlaxBartForSequenceClassification", + "FlaxBartModel", + "FlaxBartPreTrainedModel", +] diff --git a/docs/transformers/build/lib/transformers/models/bart/modeling_tf_bart.py b/docs/transformers/build/lib/transformers/models/bart/modeling_tf_bart.py new file mode 100644 index 0000000000000000000000000000000000000000..7ab9817986e6adff40a2dde760d0aea89e7f91ad --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bart/modeling_tf_bart.py @@ -0,0 +1,1714 @@ +# coding=utf-8 +# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""TF 2.0 Bart model.""" + +from __future__ import annotations + +import random +from typing import Optional, Tuple, Union + +import numpy as np +import tensorflow as tf + +from ...activations_tf import get_tf_activation +from ...modeling_tf_outputs import ( + TFBaseModelOutput, + TFBaseModelOutputWithPastAndCrossAttentions, + TFSeq2SeqLMOutput, + TFSeq2SeqModelOutput, + TFSeq2SeqSequenceClassifierOutput, +) + +# Public API +from ...modeling_tf_utils import ( + TFCausalLanguageModelingLoss, + TFModelInputType, + TFPreTrainedModel, + TFSequenceClassificationLoss, + keras, + keras_serializable, + unpack_inputs, +) +from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax +from ...utils import ( + add_code_sample_docstrings, + add_end_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from .configuration_bart import BartConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "facebook/bart-large" +_CONFIG_FOR_DOC = "BartConfig" + + +LARGE_NEGATIVE = -1e8 + + +def shift_tokens_right(input_ids: tf.Tensor, pad_token_id: int, decoder_start_token_id: int): + pad_token_id = tf.cast(pad_token_id, input_ids.dtype) + decoder_start_token_id = tf.cast(decoder_start_token_id, input_ids.dtype) + start_tokens = tf.fill( + (shape_list(input_ids)[0], 1), tf.convert_to_tensor(decoder_start_token_id, input_ids.dtype) + ) + shifted_input_ids = tf.concat([start_tokens, input_ids[:, :-1]], -1) + # replace possible -100 values in labels by `pad_token_id` + shifted_input_ids = tf.where( + shifted_input_ids == -100, + tf.fill(shape_list(shifted_input_ids), tf.convert_to_tensor(pad_token_id, input_ids.dtype)), + shifted_input_ids, + ) + + # "Verify that `labels` has only positive values and -100" + assert_gte0 = tf.debugging.assert_greater_equal(shifted_input_ids, tf.constant(0, dtype=input_ids.dtype)) + + # Make sure the assertion op is called by wrapping the result in an identity no-op + with tf.control_dependencies([assert_gte0]): + shifted_input_ids = tf.identity(shifted_input_ids) + + return shifted_input_ids + + +def _make_causal_mask(input_ids_shape: tf.TensorShape, past_key_values_length: int = 0): + """ + Make causal mask used for bi-directional self-attention. + """ + bsz = input_ids_shape[0] + tgt_len = input_ids_shape[1] + mask = tf.ones((tgt_len, tgt_len)) * LARGE_NEGATIVE + mask_cond = tf.range(shape_list(mask)[-1]) + + mask = tf.where(mask_cond < tf.reshape(mask_cond + 1, (shape_list(mask)[-1], 1)), 0.0, mask) + + if past_key_values_length > 0: + mask = tf.concat([tf.zeros((tgt_len, past_key_values_length)), mask], axis=-1) + + return tf.tile(mask[None, None, :, :], (bsz, 1, 1, 1)) + + +def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. + """ + src_len = shape_list(mask)[1] + tgt_len = tgt_len if tgt_len is not None else src_len + one_cst = tf.constant(1.0) + mask = tf.cast(mask, dtype=one_cst.dtype) + expanded_mask = tf.tile(mask[:, None, None, :], (1, 1, tgt_len, 1)) + + return (one_cst - expanded_mask) * LARGE_NEGATIVE + + +class TFBartLearnedPositionalEmbedding(keras.layers.Embedding): + """ + This module learns positional embeddings up to a fixed maximum size. + """ + + def __init__(self, num_embeddings: int, embedding_dim: int, **kwargs): + # Bart is set up so that if padding_idx is specified then offset the embedding ids by 2 + # and adjust num_embeddings appropriately. Other models don't have this hack + self.offset = 2 + super().__init__(num_embeddings + self.offset, embedding_dim, **kwargs) + + def call( + self, + input_shape: Optional[tf.TensorShape] = None, + past_key_values_length: int = 0, + position_ids: tf.Tensor | None = None, + ): + """Input is expected to be of size [bsz x seqlen].""" + if position_ids is None: + seq_len = input_shape[1] + position_ids = tf.range(seq_len, delta=1, name="range") + position_ids += past_key_values_length + + offset_dtype = position_ids.dtype if isinstance(position_ids, tf.Tensor) else tf.int32 + return super().call(position_ids + tf.constant(self.offset, dtype=offset_dtype)) + + +class TFBartAttention(keras.layers.Layer): + """Multi-headed attention from "Attention Is All You Need""" + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + is_decoder: bool = False, + bias: bool = True, + **kwargs, + ): + super().__init__(**kwargs) + self.embed_dim = embed_dim + + self.num_heads = num_heads + self.dropout = keras.layers.Dropout(dropout) + self.head_dim = embed_dim // num_heads + if (self.head_dim * num_heads) != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" + f" and `num_heads`: {num_heads})." + ) + self.scaling = self.head_dim**-0.5 + self.is_decoder = is_decoder + + self.k_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj") + self.q_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj") + self.v_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj") + self.out_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj") + + def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int): + return tf.transpose(tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)), (0, 2, 1, 3)) + + def call( + self, + hidden_states: tf.Tensor, + key_value_states: tf.Tensor | None = None, + past_key_value: Tuple[Tuple[tf.Tensor]] | None = None, + attention_mask: tf.Tensor | None = None, + layer_head_mask: tf.Tensor | None = None, + training: Optional[bool] = False, + ) -> Tuple[tf.Tensor, tf.Tensor | None]: + """Input shape: Batch x Time x Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + bsz, tgt_len, embed_dim = shape_list(hidden_states) + + # get query proj + query_states = self.q_proj(hidden_states) * self.scaling + # get key, value proj + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_states = past_key_value[0] + value_states = past_key_value[1] + elif is_cross_attention: + # cross_attentions + key_states = self._shape(self.k_proj(key_value_states), -1, bsz) + value_states = self._shape(self.v_proj(key_value_states), -1, bsz) + elif past_key_value is not None: + # reuse k, v, self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + key_states = tf.concat([past_key_value[0], key_states], axis=2) + value_states = tf.concat([past_key_value[1], value_states], axis=2) + else: + # self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + if self.is_decoder: + # if cross_attention save Tuple(tf.Tensor, tf.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(tf.Tensor, tf.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_states, value_states) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = tf.reshape(self._shape(query_states, tgt_len, bsz), proj_shape) + key_states = tf.reshape(key_states, proj_shape) + value_states = tf.reshape(value_states, proj_shape) + + src_len = shape_list(key_states)[1] + attn_weights = tf.matmul(query_states, key_states, transpose_b=True) + + tf.debugging.assert_equal( + shape_list(attn_weights), + [bsz * self.num_heads, tgt_len, src_len], + message=( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" + f" {shape_list(attn_weights)}" + ), + ) + + if attention_mask is not None: + tf.debugging.assert_equal( + shape_list(attention_mask), + [bsz, 1, tgt_len, src_len], + message=( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is" + f" {shape_list(attention_mask)}" + ), + ) + + attention_mask = tf.cast(attention_mask, dtype=attn_weights.dtype) + attn_weights = tf.reshape(attn_weights, (bsz, self.num_heads, tgt_len, src_len)) + attention_mask + attn_weights = tf.reshape(attn_weights, (bsz * self.num_heads, tgt_len, src_len)) + + attn_weights = stable_softmax(attn_weights, axis=-1) + + if layer_head_mask is not None: + tf.debugging.assert_equal( + shape_list(layer_head_mask), + [self.num_heads], + message=( + f"Head mask for a single layer should be of size {(self.num_heads)}, but is" + f" {shape_list(layer_head_mask)}" + ), + ) + + attn_weights = tf.reshape(layer_head_mask, (1, -1, 1, 1)) * tf.reshape( + attn_weights, (bsz, self.num_heads, tgt_len, src_len) + ) + attn_weights = tf.reshape(attn_weights, (bsz * self.num_heads, tgt_len, src_len)) + + attn_probs = self.dropout(attn_weights, training=training) + attn_output = tf.matmul(attn_probs, value_states) + + tf.debugging.assert_equal( + shape_list(attn_output), + [bsz * self.num_heads, tgt_len, self.head_dim], + message=( + f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is" + f" {shape_list(attn_output)}" + ), + ) + + attn_output = tf.transpose( + tf.reshape(attn_output, (bsz, self.num_heads, tgt_len, self.head_dim)), (0, 2, 1, 3) + ) + attn_output = tf.reshape(attn_output, (bsz, tgt_len, embed_dim)) + + attn_output = self.out_proj(attn_output) + attn_weights: tf.Tensor = tf.reshape(attn_weights, (bsz, self.num_heads, tgt_len, src_len)) + + return attn_output, attn_weights, past_key_value + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "k_proj", None) is not None: + with tf.name_scope(self.k_proj.name): + self.k_proj.build([None, None, self.embed_dim]) + if getattr(self, "q_proj", None) is not None: + with tf.name_scope(self.q_proj.name): + self.q_proj.build([None, None, self.embed_dim]) + if getattr(self, "v_proj", None) is not None: + with tf.name_scope(self.v_proj.name): + self.v_proj.build([None, None, self.embed_dim]) + if getattr(self, "out_proj", None) is not None: + with tf.name_scope(self.out_proj.name): + self.out_proj.build([None, None, self.embed_dim]) + + +class TFBartEncoderLayer(keras.layers.Layer): + def __init__(self, config: BartConfig, **kwargs): + super().__init__(**kwargs) + self.embed_dim = config.d_model + self.self_attn = TFBartAttention( + self.embed_dim, config.encoder_attention_heads, dropout=config.attention_dropout, name="self_attn" + ) + self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") + self.dropout = keras.layers.Dropout(config.dropout) + self.activation_fn = get_tf_activation(config.activation_function) + self.activation_dropout = keras.layers.Dropout(config.activation_dropout) + self.fc1 = keras.layers.Dense(config.encoder_ffn_dim, name="fc1") + self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2") + self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") + self.config = config + + def call( + self, + hidden_states: tf.Tensor, + attention_mask: np.ndarray | tf.Tensor | None, + layer_head_mask: tf.Tensor | None, + training: Optional[bool] = False, + ) -> tf.Tensor: + """ + Args: + hidden_states (`tf.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`tf.Tensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size + `(encoder_attention_heads,)` + """ + residual = hidden_states + hidden_states, self_attn_weights, _ = self.self_attn( + hidden_states=hidden_states, attention_mask=attention_mask, layer_head_mask=layer_head_mask + ) + + tf.debugging.assert_equal( + shape_list(hidden_states), + shape_list(residual), + message=f"Self attn modified the shape of query {shape_list(residual)} to {shape_list(hidden_states)}", + ) + + hidden_states = self.dropout(hidden_states, training=training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = self.activation_dropout(hidden_states, training=training) + hidden_states = self.fc2(hidden_states) + hidden_states = self.dropout(hidden_states, training=training) + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + return hidden_states, self_attn_weights + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self_attn", None) is not None: + with tf.name_scope(self.self_attn.name): + self.self_attn.build(None) + if getattr(self, "self_attn_layer_norm", None) is not None: + with tf.name_scope(self.self_attn_layer_norm.name): + self.self_attn_layer_norm.build([None, None, self.embed_dim]) + if getattr(self, "fc1", None) is not None: + with tf.name_scope(self.fc1.name): + self.fc1.build([None, None, self.embed_dim]) + if getattr(self, "fc2", None) is not None: + with tf.name_scope(self.fc2.name): + self.fc2.build([None, None, self.config.encoder_ffn_dim]) + if getattr(self, "final_layer_norm", None) is not None: + with tf.name_scope(self.final_layer_norm.name): + self.final_layer_norm.build([None, None, self.embed_dim]) + + +class TFBartDecoderLayer(keras.layers.Layer): + def __init__(self, config: BartConfig, **kwargs): + super().__init__(**kwargs) + self.embed_dim = config.d_model + self.self_attn = TFBartAttention( + embed_dim=self.embed_dim, + num_heads=config.decoder_attention_heads, + dropout=config.attention_dropout, + name="self_attn", + is_decoder=True, + ) + self.dropout = keras.layers.Dropout(config.dropout) + self.activation_fn = get_tf_activation(config.activation_function) + self.activation_dropout = keras.layers.Dropout(config.activation_dropout) + + self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") + self.encoder_attn = TFBartAttention( + self.embed_dim, + config.decoder_attention_heads, + dropout=config.attention_dropout, + name="encoder_attn", + is_decoder=True, + ) + self.encoder_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="encoder_attn_layer_norm") + self.fc1 = keras.layers.Dense(config.decoder_ffn_dim, name="fc1") + self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2") + self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") + self.config = config + + def call( + self, + hidden_states: tf.Tensor, + attention_mask: np.ndarray | tf.Tensor | None = None, + encoder_hidden_states: np.ndarray | tf.Tensor | None = None, + encoder_attention_mask: np.ndarray | tf.Tensor | None = None, + layer_head_mask: tf.Tensor | None = None, + cross_attn_layer_head_mask: tf.Tensor | None = None, + past_key_value: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None, + training: Optional[bool] = False, + ) -> Tuple[tf.Tensor, tf.Tensor, Tuple[Tuple[tf.Tensor]]]: + """ + Args: + hidden_states (`tf.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`tf.Tensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + encoder_hidden_states (`tf.Tensor`): + cross attention input to the layer of shape `(batch, seq_len, embed_dim)` + encoder_attention_mask (`tf.Tensor`): encoder attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size + `(decoder_attention_heads,)` + cross_attn_layer_head_mask (`tf.Tensor`): mask for heads of the cross-attention module. + `(decoder_attention_heads,)` + past_key_value (`Tuple(tf.Tensor)`): cached past key and value projection states + """ + residual = hidden_states + + # Self Attention + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + # add present self-attn cache to positions 1,2 of present_key_value tuple + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + past_key_value=self_attn_past_key_value, + attention_mask=attention_mask, + layer_head_mask=layer_head_mask, + ) + hidden_states = self.dropout(hidden_states, training=training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + # Cross-Attention Block + cross_attn_present_key_value = None + cross_attn_weights = None + if encoder_hidden_states is not None: + residual = hidden_states + + # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn( + hidden_states=hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + layer_head_mask=cross_attn_layer_head_mask, + past_key_value=cross_attn_past_key_value, + ) + hidden_states = self.dropout(hidden_states, training=training) + hidden_states = residual + hidden_states + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + # add cross-attn to positions 3,4 of present_key_value tuple + present_key_value = present_key_value + cross_attn_present_key_value + + # Fully Connected + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = self.activation_dropout(hidden_states, training=training) + hidden_states = self.fc2(hidden_states) + hidden_states = self.dropout(hidden_states, training=training) + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + return ( + hidden_states, + self_attn_weights, + cross_attn_weights, + present_key_value, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self_attn", None) is not None: + with tf.name_scope(self.self_attn.name): + self.self_attn.build(None) + if getattr(self, "self_attn_layer_norm", None) is not None: + with tf.name_scope(self.self_attn_layer_norm.name): + self.self_attn_layer_norm.build([None, None, self.embed_dim]) + if getattr(self, "encoder_attn", None) is not None: + with tf.name_scope(self.encoder_attn.name): + self.encoder_attn.build(None) + if getattr(self, "encoder_attn_layer_norm", None) is not None: + with tf.name_scope(self.encoder_attn_layer_norm.name): + self.encoder_attn_layer_norm.build([None, None, self.embed_dim]) + if getattr(self, "fc1", None) is not None: + with tf.name_scope(self.fc1.name): + self.fc1.build([None, None, self.embed_dim]) + if getattr(self, "fc2", None) is not None: + with tf.name_scope(self.fc2.name): + self.fc2.build([None, None, self.config.decoder_ffn_dim]) + if getattr(self, "final_layer_norm", None) is not None: + with tf.name_scope(self.final_layer_norm.name): + self.final_layer_norm.build([None, None, self.embed_dim]) + + +class TFBartClassificationHead(keras.layers.Layer): + """Head for sentence-level classification tasks.""" + + def __init__(self, inner_dim: int, num_classes: int, pooler_dropout: float, name: str, **kwargs): + super().__init__(name=name, **kwargs) + self.dense = keras.layers.Dense(inner_dim, name="dense") + self.dropout = keras.layers.Dropout(pooler_dropout) + self.out_proj = keras.layers.Dense(num_classes, name="out_proj") + self.input_dim = inner_dim + self.inner_dim = inner_dim + + def call(self, inputs): + hidden_states = self.dropout(inputs) + hidden_states = self.dense(hidden_states) + hidden_states = keras.activations.tanh(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.out_proj(hidden_states) + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.input_dim]) + if getattr(self, "out_proj", None) is not None: + with tf.name_scope(self.out_proj.name): + self.out_proj.build([None, None, self.inner_dim]) + + +class TFBartPretrainedModel(TFPreTrainedModel): + config_class = BartConfig + base_model_prefix = "model" + + @property + def dummy_inputs(self): + dummy_inputs = super().dummy_inputs + # Dummy inputs should not contain the default val of 1 + # as this is the padding token and some assertions check it + dummy_inputs["input_ids"] = dummy_inputs["input_ids"] * 2 + if "decoder_input_ids" in dummy_inputs: + dummy_inputs["decoder_input_ids"] = dummy_inputs["decoder_input_ids"] * 2 + return dummy_inputs + + def tf_to_pt_weight_rename(self, tf_weight): + if tf_weight == "model.shared.weight": + return tf_weight, "model.decoder.embed_tokens.weight" + else: + return (tf_weight,) + + +BART_START_DOCSTRING = r""" + This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and + behavior. + + + + TensorFlow models and layers in `transformers` accept two formats as input: + + - having all inputs as keyword arguments (like PyTorch models), or + - having all inputs as a list, tuple or dict in the first positional argument. + + The reason the second format is supported is that Keras methods prefer this format when passing inputs to models + and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just + pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second + format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with + the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first + positional argument: + + - a single Tensor with `input_ids` only and nothing else: `model(input_ids)` + - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: + `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])` + - a dictionary with one or several input Tensors associated to the input names given in the docstring: + `model({"input_ids": input_ids, "token_type_ids": token_type_ids})` + + Note that when creating models and layers with + [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry + about any of this, as you can just pass inputs like you would to any other Python function! + + + + Args: + config ([`BartConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +BART_GENERATION_EXAMPLE = r""" + Summarization example: + + ```python + >>> from transformers import AutoTokenizer, TFBartForConditionalGeneration + + >>> model = TFBartForConditionalGeneration.from_pretrained("facebook/bart-large") + >>> tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large") + + >>> ARTICLE_TO_SUMMARIZE = "My friends are cool but they eat too many carbs." + >>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors="tf") + + >>> # Generate Summary + >>> summary_ids = model.generate(inputs["input_ids"], num_beams=4, max_length=5) + >>> print(tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)) + ``` + + Mask filling example: + + ```python + >>> from transformers import AutoTokenizer, TFBartForConditionalGeneration + + >>> tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large") + >>> TXT = "My friends are but they eat too many carbs." + + >>> model = TFBartForConditionalGeneration.from_pretrained("facebook/bart-large") + >>> input_ids = tokenizer([TXT], return_tensors="tf")["input_ids"] + >>> logits = model(input_ids).logits + >>> probs = tf.nn.softmax(logits[0]) + >>> # probs[5] is associated with the mask token + ``` +""" + + +BART_INPUTS_DOCSTRING = r""" + Args: + input_ids (`tf.Tensor` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`tf.Tensor` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + decoder_input_ids (`tf.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Indices of decoder input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are decoder input IDs?](../glossary#decoder-input-ids) + + Bart uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values` + is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`). + + For translation and summarization training, `decoder_input_ids` should be provided. If no + `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right + for denoising pre-training following the paper. + decoder_attention_mask (`tf.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*): + will be made by default and ignore pad tokens. It is not recommended to set this for most use cases. + decoder_position_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the + range `[0, config.max_position_embeddings - 1]`. + head_mask (`tf.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + decoder_head_mask (`tf.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + cross_attn_head_mask (`tf.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + encoder_outputs (`tf.FloatTensor`, *optional*): + hidden states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. + of shape `(batch_size, sequence_length, hidden_size)` is a sequence of + past_key_values (`Tuple[Tuple[tf.Tensor]]` of length `config.n_layers`) + contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + use_cache (`bool`, *optional*, defaults to `True`): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). Set to `False` during training, `True` during generation + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the + config will be used instead. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. This argument can be used only in eager mode, in graph mode the value in the config will be + used instead. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in + eager mode, in graph mode the value will always be set to True. + training (`bool`, *optional*, defaults to `False`): + Whether or not to use the model in training mode (some modules like dropout modules have different + behaviors between training and evaluation). +""" + + +@keras_serializable +class TFBartEncoder(keras.layers.Layer): + config_class = BartConfig + """ + Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a + [`TFBartEncoderLayer`]. + + Args: + config: BartConfig + """ + + def __init__(self, config: BartConfig, embed_tokens: Optional[keras.layers.Embedding] = None, **kwargs): + super().__init__(**kwargs) + self.config = config + self.dropout = keras.layers.Dropout(config.dropout) + self.layerdrop = config.encoder_layerdrop + self.padding_idx = config.pad_token_id + self.max_source_positions = config.max_position_embeddings + self.embed_scale = tf.math.sqrt(float(config.d_model)) if config.scale_embedding else 1.0 + + self.embed_tokens = embed_tokens + self.embed_positions = TFBartLearnedPositionalEmbedding( + config.max_position_embeddings, + config.d_model, + name="embed_positions", + ) + self.layers = [TFBartEncoderLayer(config, name=f"layers.{i}") for i in range(config.encoder_layers)] + self.layernorm_embedding = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding") + self.embed_dim = config.d_model + + @unpack_inputs + def call( + self, + input_ids: TFModelInputType | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: Optional[bool] = False, + ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]: + """ + Args: + input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + head_mask (`tf.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, `optional): + Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + """ + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = shape_list(input_ids) + elif inputs_embeds is not None: + input_shape = shape_list(inputs_embeds)[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if inputs_embeds is None: + check_embeddings_within_bounds(input_ids, self.embed_tokens.input_dim) + inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale + + embed_pos = self.embed_positions(input_shape) + hidden_states = inputs_embeds + embed_pos + hidden_states = self.layernorm_embedding(hidden_states) + hidden_states = self.dropout(hidden_states, training=training) + + # check attention mask and invert + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _expand_mask(attention_mask) + else: + attention_mask = None + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + # check if head_mask has a correct number of layers specified if desired + if head_mask is not None: + tf.debugging.assert_equal( + shape_list(head_mask)[0], + len(self.layers), + message=( + f"The head_mask should be specified for {len(self.layers)} layers, but it is for" + f" {shape_list(head_mask)[0]}." + ), + ) + + # encoder layers + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + dropout_probability = random.uniform(0, 1) + if training and (dropout_probability < self.layerdrop): # skip the layer + continue + + hidden_states, attn = encoder_layer( + hidden_states, + attention_mask, + head_mask[idx] if head_mask is not None else None, + ) + + if output_attentions: + all_attentions += (attn,) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return TFBaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embed_positions", None) is not None: + with tf.name_scope(self.embed_positions.name): + self.embed_positions.build(None) + if getattr(self, "layernorm_embedding", None) is not None: + with tf.name_scope(self.layernorm_embedding.name): + self.layernorm_embedding.build([None, None, self.embed_dim]) + if getattr(self, "layers", None) is not None: + for layer in self.layers: + with tf.name_scope(layer.name): + layer.build(None) + + +@keras_serializable +class TFBartDecoder(keras.layers.Layer): + config_class = BartConfig + """ + Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`TFBartDecoderLayer`] + + Args: + config: BartConfig + embed_tokens: output embedding + """ + + def __init__(self, config: BartConfig, embed_tokens: Optional[keras.layers.Embedding] = None, **kwargs): + super().__init__(**kwargs) + self.config = config + self.padding_idx = config.pad_token_id + self.embed_tokens = embed_tokens + self.layerdrop = config.decoder_layerdrop + self.embed_positions = TFBartLearnedPositionalEmbedding( + config.max_position_embeddings, + config.d_model, + name="embed_positions", + ) + self.embed_scale = tf.math.sqrt(float(config.d_model)) if config.scale_embedding else 1.0 + self.layers = [TFBartDecoderLayer(config, name=f"layers.{i}") for i in range(config.decoder_layers)] + self.layernorm_embedding = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding") + + self.dropout = keras.layers.Dropout(config.dropout) + + @unpack_inputs + def call( + self, + input_ids: TFModelInputType | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + encoder_hidden_states: np.ndarray | tf.Tensor | None = None, + encoder_attention_mask: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + cross_attn_head_mask: np.ndarray | tf.Tensor | None = None, + past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: Optional[bool] = False, + ) -> Union[TFBaseModelOutputWithPastAndCrossAttentions, Tuple[tf.Tensor]]: + r""" + Args: + input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + position_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the + range `[0, config.max_position_embeddings - 1]`. + encoder_hidden_states (`tf.Tensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + of the decoder. + encoder_attention_mask (`tf.Tensor` of shape `(batch_size, encoder_sequence_length)`, *optional*): + Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values + selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + head_mask (`tf.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + cross_attn_head_mask (`tf.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + past_key_values (`Tuple[Tuple[tf.Tensor]]` of length `config.n_layers` with each tuple having 2 tuples each of which has 2 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up + decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those + that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of + all `decoder_input_ids` of shape `(batch_size, sequence_length)`. + inputs_embeds (`tf.tTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + """ + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + input_shape = shape_list(input_ids) + elif inputs_embeds is not None: + input_shape = shape_list(inputs_embeds)[:-1] + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + past_key_values_length = shape_list(past_key_values[0][0])[2] if past_key_values is not None else 0 + + # embed positions + if position_ids is None: + positions = self.embed_positions(input_shape, past_key_values_length) + else: + positions = self.embed_positions(input_shape, position_ids=position_ids) + + if inputs_embeds is None: + check_embeddings_within_bounds(input_ids, self.embed_tokens.input_dim) + inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale + + hidden_states = inputs_embeds + + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + if input_shape[-1] > 1: + combined_attention_mask = _make_causal_mask(input_shape, past_key_values_length=past_key_values_length) + else: + combined_attention_mask = _expand_mask( + tf.ones((input_shape[0], input_shape[1] + past_key_values_length)), tgt_len=input_shape[-1] + ) + + if attention_mask is not None: + combined_attention_mask = combined_attention_mask + _expand_mask(attention_mask, tgt_len=input_shape[-1]) + + if encoder_hidden_states is not None and encoder_attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + encoder_attention_mask = _expand_mask(encoder_attention_mask, tgt_len=input_shape[-1]) + + hidden_states = self.layernorm_embedding(hidden_states + positions) + hidden_states = self.dropout(hidden_states, training=training) + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_cross_attns = () if (output_attentions and encoder_hidden_states is not None) else None + present_key_values = () if use_cache else None + + # check if head_mask and cross_attn_head_mask have a correct number of layers specified if desired + for attn_mask_name, attn_mask in [("head_mask", head_mask), ("cross_attn_head_mask", cross_attn_head_mask)]: + if attn_mask is not None: + tf.debugging.assert_equal( + shape_list(attn_mask)[0], + len(self.layers), + message=( + f"The {attn_mask_name} should be specified for {len(self.layers)} layers, but it is for" + f" {shape_list(attn_mask)[0]}." + ), + ) + + for idx, decoder_layer in enumerate(self.layers): + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + if output_hidden_states: + all_hidden_states += (hidden_states,) + + dropout_probability = random.uniform(0, 1) + + if training and (dropout_probability < self.layerdrop): + continue + + past_key_value = past_key_values[idx] if past_key_values is not None else None + + hidden_states, layer_self_attn, layer_cross_attn, present_key_value = decoder_layer( + hidden_states, + attention_mask=combined_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + layer_head_mask=head_mask[idx] if head_mask is not None else None, + cross_attn_layer_head_mask=cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None, + past_key_value=past_key_value, + ) + + if use_cache: + present_key_values += (present_key_value,) + + if output_attentions: + all_self_attns += (layer_self_attn,) + + if encoder_hidden_states is not None: + all_cross_attns += (layer_cross_attn,) + + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if not return_dict: + return hidden_states, present_key_values, all_hidden_states, all_self_attns, all_cross_attns + else: + return TFBaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=present_key_values, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attns, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embed_positions", None) is not None: + with tf.name_scope(self.embed_positions.name): + self.embed_positions.build(None) + if getattr(self, "layernorm_embedding", None) is not None: + with tf.name_scope(self.layernorm_embedding.name): + self.layernorm_embedding.build([None, None, self.config.d_model]) + if getattr(self, "layers", None) is not None: + for layer in self.layers: + with tf.name_scope(layer.name): + layer.build(None) + + +@keras_serializable +class TFBartMainLayer(keras.layers.Layer): + config_class = BartConfig + + def __init__(self, config: BartConfig, load_weight_prefix=None, **kwargs): + super().__init__(**kwargs) + self.config = config + self.shared = keras.layers.Embedding( + input_dim=config.vocab_size, + output_dim=config.d_model, + embeddings_initializer=keras.initializers.TruncatedNormal(stddev=self.config.init_std), + name="model.shared", + ) + # Additional attribute to specify the expected name scope of the layer (for loading/storing weights) + self.shared.load_weight_prefix = "model.shared" if load_weight_prefix is None else load_weight_prefix + + self.encoder = TFBartEncoder(config, self.shared, name="encoder") + self.decoder = TFBartDecoder(config, self.shared, name="decoder") + + def get_input_embeddings(self): + return self.shared + + def set_input_embeddings(self, new_embeddings): + self.shared = new_embeddings + self.encoder.embed_tokens = self.shared + self.decoder.embed_tokens = self.shared + + @unpack_inputs + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + decoder_input_ids: np.ndarray | tf.Tensor | None = None, + decoder_attention_mask: np.ndarray | tf.Tensor | None = None, + decoder_position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + decoder_head_mask: np.ndarray | tf.Tensor | None = None, + cross_attn_head_mask: np.ndarray | tf.Tensor | None = None, + encoder_outputs: Optional[Union[Tuple, TFBaseModelOutput]] = None, + past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + decoder_inputs_embeds: np.ndarray | tf.Tensor | None = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: Optional[bool] = False, + **kwargs, + ) -> Union[TFSeq2SeqModelOutput, Tuple[tf.Tensor]]: + # different to other models, Bart automatically creates decoder_input_ids from + # input_ids if no decoder_input_ids are provided + if decoder_input_ids is None and decoder_inputs_embeds is None: + if input_ids is None: + raise ValueError( + "If no `decoder_input_ids` or `decoder_inputs_embeds` are " + "passed, `input_ids` cannot be `None`. Please pass either " + "`input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`." + ) + + decoder_input_ids = shift_tokens_right( + input_ids, self.config.pad_token_id, self.config.decoder_start_token_id + ) + + if encoder_outputs is None: + encoder_outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a TFBaseModelOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, TFBaseModelOutput): + encoder_outputs = TFBaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + # If the user passed a TFBaseModelOutput for encoder_outputs, we wrap it in a tuple when return_dict=False + elif not return_dict and not isinstance(encoder_outputs, tuple): + encoder_outputs = encoder_outputs.to_tuple() + + decoder_outputs = self.decoder( + decoder_input_ids, + attention_mask=decoder_attention_mask, + position_ids=decoder_position_ids, + encoder_hidden_states=encoder_outputs[0], + encoder_attention_mask=attention_mask, + head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + if not return_dict: + return decoder_outputs + encoder_outputs + + return TFSeq2SeqModelOutput( + last_hidden_state=decoder_outputs.last_hidden_state, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + # The shared/tied weights expect to be in the model base namespace + # Adding "/" to the end (not the start!) of a tf.name_scope puts it in the root namespace rather than + # the current one. + with tf.name_scope(self.shared.load_weight_prefix + "/" + self.shared.name + "/"): + self.shared.build(None) + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "decoder", None) is not None: + with tf.name_scope(self.decoder.name): + self.decoder.build(None) + + +@add_start_docstrings( + "The bare BART Model outputting raw hidden-states without any specific head on top.", + BART_START_DOCSTRING, +) +class TFBartModel(TFBartPretrainedModel): + _requires_load_weight_prefix = True + + def __init__(self, config: BartConfig, load_weight_prefix=None, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.model = TFBartMainLayer(config, load_weight_prefix=load_weight_prefix, name="model") + + def get_encoder(self): + return self.model.encoder + + def get_decoder(self): + return self.model.decoder + + @add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFSeq2SeqModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + @unpack_inputs + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + decoder_input_ids: np.ndarray | tf.Tensor | None = None, + decoder_attention_mask: np.ndarray | tf.Tensor | None = None, + decoder_position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + decoder_head_mask: np.ndarray | tf.Tensor | None = None, + cross_attn_head_mask: np.ndarray | tf.Tensor | None = None, + encoder_outputs: Optional[Union[Tuple, TFBaseModelOutput]] = None, + past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + decoder_inputs_embeds: np.ndarray | tf.Tensor | None = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: Optional[bool] = False, + **kwargs, + ) -> Union[TFBaseModelOutput, Tuple[tf.Tensor]]: + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + decoder_position_ids=decoder_position_ids, + head_mask=head_mask, + decoder_head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + encoder_outputs=encoder_outputs, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + return outputs + + def serving_output(self, output): + pkv = tf.tuple(output.past_key_values)[1] if self.config.use_cache else None + dec_hs = tf.convert_to_tensor(output.decoder_hidden_states) if self.config.output_hidden_states else None + dec_attns = tf.convert_to_tensor(output.decoder_attentions) if self.config.output_attentions else None + cross_attns = tf.convert_to_tensor(output.cross_attentions) if self.config.output_attentions else None + enc_hs = tf.convert_to_tensor(output.encoder_hidden_states) if self.config.output_hidden_states else None + enc_attns = tf.convert_to_tensor(output.encoder_attentions) if self.config.output_attentions else None + + return TFSeq2SeqModelOutput( + last_hidden_state=output.last_hidden_state, + past_key_values=pkv, + decoder_hidden_states=dec_hs, + decoder_attentions=dec_attns, + cross_attentions=cross_attns, + encoder_last_hidden_state=output.encoder_last_hidden_state, + encoder_hidden_states=enc_hs, + encoder_attentions=enc_attns, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "model", None) is not None: + with tf.name_scope(self.model.name): + self.model.build(None) + + +class BiasLayer(keras.layers.Layer): + """ + Bias as a layer. It is used for serialization purposes: `keras.Model.save_weights` stores on a per-layer basis, + so all weights have to be registered in a layer. + """ + + def __init__(self, shape, initializer, trainable, name, **kwargs): + super().__init__(name=name, **kwargs) + # Note: the name of this variable will NOT be scoped when serialized, i.e. it will not be in the format of + # "outer_layer/inner_layer/.../name:0". Instead, it will be "name:0". For further details, see: + # https://github.com/huggingface/transformers/pull/18833#issuecomment-1233090214 + self.bias = self.add_weight(name=name, shape=shape, initializer=initializer, trainable=trainable) + + def call(self, x): + return x + self.bias + + +@add_start_docstrings( + "The BART Model with a language modeling head. Can be used for summarization.", + BART_START_DOCSTRING, +) +class TFBartForConditionalGeneration(TFBartPretrainedModel, TFCausalLanguageModelingLoss): + _keys_to_ignore_on_load_missing = [r"final_logits_bias"] + _requires_load_weight_prefix = True + + def __init__(self, config, load_weight_prefix=None, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.model = TFBartMainLayer(config, load_weight_prefix=load_weight_prefix, name="model") + self.use_cache = config.use_cache + # final_bias_logits is registered as a buffer in pytorch, so not trainable for the sake of consistency. + self.bias_layer = BiasLayer( + name="final_logits_bias", shape=[1, config.vocab_size], initializer="zeros", trainable=False + ) + + def get_decoder(self): + return self.model.decoder + + def get_encoder(self): + return self.model.encoder + + def get_output_embeddings(self): + return self.get_input_embeddings() + + def set_output_embeddings(self, value): + self.set_input_embeddings(value) + + def get_bias(self): + return {"final_logits_bias": self.bias_layer.bias} + + def set_bias(self, value): + # Replaces the existing layers containing bias for correct (de)serialization. + vocab_size = value["final_logits_bias"].shape[-1] + self.bias_layer = BiasLayer( + name="final_logits_bias", shape=[1, vocab_size], initializer="zeros", trainable=False + ) + self.bias_layer.bias.assign(value["final_logits_bias"]) + + @add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=TFSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC) + @add_end_docstrings(BART_GENERATION_EXAMPLE) + @unpack_inputs + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + decoder_input_ids: np.ndarray | tf.Tensor | None = None, + decoder_attention_mask: np.ndarray | tf.Tensor | None = None, + decoder_position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + decoder_head_mask: np.ndarray | tf.Tensor | None = None, + cross_attn_head_mask: np.ndarray | tf.Tensor | None = None, + encoder_outputs: Optional[TFBaseModelOutput] = None, + past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + decoder_inputs_embeds: np.ndarray | tf.Tensor | None = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: tf.Tensor | None = None, + training: Optional[bool] = False, + ) -> Union[TFSeq2SeqLMOutput, Tuple[tf.Tensor]]: + r""" + labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Returns: + + """ + + if labels is not None: + labels = tf.where( + labels == self.config.pad_token_id, + tf.cast(tf.fill(shape_list(labels), -100), labels.dtype), + labels, + ) + use_cache = False + if decoder_input_ids is None and decoder_inputs_embeds is None: + decoder_input_ids = shift_tokens_right( + labels, self.config.pad_token_id, self.config.decoder_start_token_id + ) + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + encoder_outputs=encoder_outputs, + decoder_attention_mask=decoder_attention_mask, + decoder_position_ids=decoder_position_ids, + head_mask=head_mask, + decoder_head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + lm_logits = tf.matmul(outputs[0], self.model.shared.weights, transpose_b=True) + lm_logits = self.bias_layer(lm_logits) + masked_lm_loss = None if labels is None else self.hf_compute_loss(labels, lm_logits) + + if not return_dict: + output = (lm_logits,) + outputs[1:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + return TFSeq2SeqLMOutput( + loss=masked_lm_loss, + logits=lm_logits, + past_key_values=outputs.past_key_values, # index 1 of d outputs + decoder_hidden_states=outputs.decoder_hidden_states, # index 2 of d outputs + decoder_attentions=outputs.decoder_attentions, # index 3 of d outputs + cross_attentions=outputs.cross_attentions, # index 4 of d outputs + encoder_last_hidden_state=outputs.encoder_last_hidden_state, # index 0 of encoder outputs + encoder_hidden_states=outputs.encoder_hidden_states, # 1 of e out + encoder_attentions=outputs.encoder_attentions, # 2 of e out + ) + + def serving_output(self, output): + pkv = tf.tuple(output.past_key_values)[1] if self.config.use_cache else None + dec_hs = tf.convert_to_tensor(output.decoder_hidden_states) if self.config.output_hidden_states else None + dec_attns = tf.convert_to_tensor(output.decoder_attentions) if self.config.output_attentions else None + cross_attns = tf.convert_to_tensor(output.cross_attentions) if self.config.output_attentions else None + enc_hs = tf.convert_to_tensor(output.encoder_hidden_states) if self.config.output_hidden_states else None + enc_attns = tf.convert_to_tensor(output.encoder_attentions) if self.config.output_attentions else None + + return TFSeq2SeqLMOutput( + logits=output.logits, + past_key_values=pkv, + decoder_hidden_states=dec_hs, + decoder_attentions=dec_attns, + cross_attentions=cross_attns, + encoder_last_hidden_state=output.encoder_last_hidden_state, + encoder_hidden_states=enc_hs, + encoder_attentions=enc_attns, + ) + + def prepare_inputs_for_generation( + self, + decoder_input_ids, + past_key_values=None, + attention_mask=None, + decoder_attention_mask=None, + head_mask=None, + decoder_head_mask=None, + cross_attn_head_mask=None, + use_cache=None, + encoder_outputs=None, + **kwargs, + ): + # cut decoder_input_ids if past_key_values is used + if past_key_values is not None: + decoder_input_ids = decoder_input_ids[:, -1:] + + if decoder_attention_mask is not None: # xla + decoder_position_ids = tf.math.cumsum(decoder_attention_mask, axis=-1, exclusive=True)[:, -1:] + elif past_key_values is not None: # no xla + past_key_values + decoder_position_ids = past_key_values[0][0].shape[2] + else: # no xla + no past_key_values + decoder_position_ids = tf.range(decoder_input_ids.shape[1]) + + return { + "input_ids": None, # encoder_outputs is defined. input_ids not needed + "encoder_outputs": encoder_outputs, + "past_key_values": past_key_values, + "decoder_input_ids": decoder_input_ids, + "attention_mask": attention_mask, + "decoder_attention_mask": decoder_attention_mask, + "decoder_position_ids": decoder_position_ids, + "head_mask": head_mask, + "decoder_head_mask": decoder_head_mask, + "cross_attn_head_mask": cross_attn_head_mask, + "use_cache": use_cache, # change this to avoid caching (presumably for debugging) + } + + def prepare_decoder_input_ids_from_labels(self, labels: tf.Tensor): + return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "model", None) is not None: + with tf.name_scope(self.model.name): + self.model.build(None) + if getattr(self, "bias_layer", None) is not None: + with tf.name_scope(self.bias_layer.name): + self.bias_layer.build(None) + + +@add_start_docstrings( + """ + Bart model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. for GLUE + tasks. + """, + BART_START_DOCSTRING, +) +class TFBartForSequenceClassification(TFBartPretrainedModel, TFSequenceClassificationLoss): + def __init__(self, config: BartConfig, load_weight_prefix=None, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.model = TFBartMainLayer(config, load_weight_prefix=load_weight_prefix, name="model") + self.classification_head = TFBartClassificationHead( + config.d_model, config.num_labels, config.classifier_dropout, name="classification_head" + ) + + @add_start_docstrings_to_model_forward(BART_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=TFSeq2SeqSequenceClassifierOutput, config_class=_CONFIG_FOR_DOC) + @unpack_inputs + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + decoder_input_ids: np.ndarray | tf.Tensor | None = None, + decoder_attention_mask: np.ndarray | tf.Tensor | None = None, + decoder_position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + decoder_head_mask: np.ndarray | tf.Tensor | None = None, + cross_attn_head_mask: np.ndarray | tf.Tensor | None = None, + encoder_outputs: Optional[TFBaseModelOutput] = None, + past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + decoder_inputs_embeds: np.ndarray | tf.Tensor | None = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: tf.Tensor | None = None, + training: Optional[bool] = False, + ) -> Union[TFSeq2SeqSequenceClassifierOutput, Tuple[tf.Tensor]]: + r""" + labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + + Returns: + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None: + use_cache = False + + if input_ids is None and inputs_embeds is not None: + raise NotImplementedError( + f"Passing input embeddings is currently not supported for {self.__class__.__name__}" + ) + + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + decoder_position_ids=decoder_position_ids, + head_mask=head_mask, + decoder_head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + encoder_outputs=encoder_outputs, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + last_hidden_state = outputs[0] + eos_mask = tf.equal(input_ids, self.config.eos_token_id) + # out the rows with False where present. Then verify all the final + # entries are True + self_masked = tf.reshape(tf.boolean_mask(eos_mask, eos_mask), (tf.shape(input_ids)[0], -1)) + tf.Assert(tf.reduce_all(self_masked[:, -1]), ["All examples must have the same number of tokens."]) + + masked = tf.reshape( + tf.boolean_mask(last_hidden_state, eos_mask), + (tf.shape(input_ids)[0], tf.shape(self_masked)[1], tf.shape(last_hidden_state)[-1]), + ) + + sentence_representation = masked[:, -1, :] + logits = self.classification_head(sentence_representation) + loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits) + + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return TFSeq2SeqSequenceClassifierOutput( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + def serving_output(self, output): + logits = tf.convert_to_tensor(output.logits) + pkv = tf.tuple(output.past_key_values)[1] if self.config.use_cache else None + dec_hs = tf.convert_to_tensor(output.decoder_hidden_states) if self.config.output_hidden_states else None + dec_attns = tf.convert_to_tensor(output.decoder_attentions) if self.config.output_attentions else None + cross_attns = tf.convert_to_tensor(output.cross_attentions) if self.config.output_attentions else None + enc_hs = tf.convert_to_tensor(output.encoder_hidden_states) if self.config.output_hidden_states else None + enc_attns = tf.convert_to_tensor(output.encoder_attentions) if self.config.output_attentions else None + + return TFSeq2SeqSequenceClassifierOutput( + logits=logits, + past_key_values=pkv, + decoder_hidden_states=dec_hs, + decoder_attentions=dec_attns, + cross_attentions=cross_attns, + encoder_last_hidden_state=output.encoder_last_hidden_state, + encoder_hidden_states=enc_hs, + encoder_attentions=enc_attns, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "model", None) is not None: + with tf.name_scope(self.model.name): + self.model.build(None) + if getattr(self, "classification_head", None) is not None: + with tf.name_scope(self.classification_head.name): + self.classification_head.build(None) + + +__all__ = ["TFBartForConditionalGeneration", "TFBartForSequenceClassification", "TFBartModel", "TFBartPretrainedModel"] diff --git a/docs/transformers/build/lib/transformers/models/bart/tokenization_bart.py b/docs/transformers/build/lib/transformers/models/bart/tokenization_bart.py new file mode 100644 index 0000000000000000000000000000000000000000..4c516cb81be0d260e562d09103691635eba77118 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bart/tokenization_bart.py @@ -0,0 +1,393 @@ +# coding=utf-8 +# Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import os +from functools import lru_cache +from typing import List, Optional, Tuple + +import regex as re + +from ...tokenization_utils import AddedToken, PreTrainedTokenizer +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt"} + +# See all BART models at https://huggingface.co/models?filter=bart + + +@lru_cache() +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control + characters the bpe code barfs on. + + The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab + if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for + decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup + tables between utf-8 bytes and unicode strings. + """ + bs = ( + list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1)) + ) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + + +def get_pairs(word): + """ + Return set of symbol pairs in a word. + + Word is represented as tuple of symbols (symbols being variable-length strings). + """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + + +class BartTokenizer(PreTrainedTokenizer): + """ + Constructs a BART tokenizer, which is smilar to the ROBERTa tokenizer, using byte-level Byte-Pair-Encoding. + + This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will + be encoded differently whether it is at the beginning of the sentence (without space) or not: + + ```python + >>> from transformers import BartTokenizer + + >>> tokenizer = BartTokenizer.from_pretrained("facebook/bart-base") + >>> tokenizer("Hello world")["input_ids"] + [0, 31414, 232, 2] + + >>> tokenizer(" Hello world")["input_ids"] + [0, 20920, 232, 2] + ``` + + You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you + call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance. + + + + When used with `is_split_into_words=True`, this tokenizer will add a space before each word (even the first one). + + + + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + Path to the vocabulary file. + merges_file (`str`): + Path to the merges file. + errors (`str`, *optional*, defaults to `"replace"`): + Paradigm to follow when decoding bytes to UTF-8. See + [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information. + bos_token (`str`, *optional*, defaults to `""`): + The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. + + + + When building a sequence using special tokens, this is not the token that is used for the beginning of + sequence. The token used is the `cls_token`. + + + + eos_token (`str`, *optional*, defaults to `""`): + The end of sequence token. + + + + When building a sequence using special tokens, this is not the token that is used for the end of sequence. + The token used is the `sep_token`. + + + + sep_token (`str`, *optional*, defaults to `""`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + cls_token (`str`, *optional*, defaults to `""`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + unk_token (`str`, *optional*, defaults to `""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + pad_token (`str`, *optional*, defaults to `""`): + The token used for padding, for example when batching sequences of different lengths. + mask_token (`str`, *optional*, defaults to `""`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + add_prefix_space (`bool`, *optional*, defaults to `False`): + Whether or not to add an initial space to the input. This allows to treat the leading word just as any + other word. (BART tokenizer detect beginning of words by the preceding space). + """ + + vocab_files_names = VOCAB_FILES_NAMES + model_input_names = ["input_ids", "attention_mask"] + + def __init__( + self, + vocab_file, + merges_file, + errors="replace", + bos_token="", + eos_token="", + sep_token="", + cls_token="", + unk_token="", + pad_token="", + mask_token="", + add_prefix_space=False, + **kwargs, + ): + bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token + eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token + sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token + cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token + unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token + pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token + + # Mask token behave like a normal word, i.e. include the space before it + mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token + + with open(vocab_file, encoding="utf-8") as vocab_handle: + self.encoder = json.load(vocab_handle) + self.decoder = {v: k for k, v in self.encoder.items()} + self.errors = errors # how to handle errors in decoding + self.byte_encoder = bytes_to_unicode() + self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} + with open(merges_file, encoding="utf-8") as merges_handle: + bpe_merges = merges_handle.read().split("\n")[1:-1] + bpe_merges = [tuple(merge.split()) for merge in bpe_merges] + self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) + self.cache = {} + self.add_prefix_space = add_prefix_space + + # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions + self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") + + super().__init__( + errors=errors, + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + cls_token=cls_token, + pad_token=pad_token, + mask_token=mask_token, + add_prefix_space=add_prefix_space, + **kwargs, + ) + + @property + def vocab_size(self): + return len(self.encoder) + + def get_vocab(self): + return dict(self.encoder, **self.added_tokens_encoder) + + def bpe(self, token): + if token in self.cache: + return self.cache[token] + word = tuple(token) + pairs = get_pairs(word) + + if not pairs: + return token + + while True: + bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + except ValueError: + new_word.extend(word[i:]) + break + else: + new_word.extend(word[i:j]) + i = j + + if word[i] == first and i < len(word) - 1 and word[i + 1] == second: + new_word.append(first + second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = " ".join(word) + self.cache[token] = word + return word + + def _tokenize(self, text): + """Tokenize a string.""" + bpe_tokens = [] + for token in re.findall(self.pat, text): + token = "".join( + self.byte_encoder[b] for b in token.encode("utf-8") + ) # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case) + bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" ")) + return bpe_tokens + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.encoder.get(token, self.encoder.get(self.unk_token)) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.decoder.get(index) + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + text = "".join(tokens) + text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors) + return text + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + merge_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"] + ) + + with open(vocab_file, "w", encoding="utf-8") as f: + f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n") + + index = 0 + with open(merge_file, "w", encoding="utf-8") as writer: + writer.write("#version: 0.2\n") + for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning( + f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive." + " Please check that the tokenizer is not corrupted!" + ) + index = token_index + writer.write(" ".join(bpe_tokens) + "\n") + index += 1 + + return vocab_file, merge_file + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BART sequence has the following format: + + - single sequence: ` X ` + - pair of sequences: ` A B ` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + sep + token_ids_1 + sep + + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is None: + return [1] + ([0] * len(token_ids_0)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. BART does not + make use of token type ids, therefore a list of zeros is returned. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of zeros. + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] + + def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs): + add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space) + if (is_split_into_words or add_prefix_space) and (len(text) > 0 and not text[0].isspace()): + text = " " + text + return (text, kwargs) + + +__all__ = ["BartTokenizer"] diff --git a/docs/transformers/build/lib/transformers/models/bart/tokenization_bart_fast.py b/docs/transformers/build/lib/transformers/models/bart/tokenization_bart_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..f7c92b9d22c99f1e4d69e7c3d36785ad9c6825d9 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bart/tokenization_bart_fast.py @@ -0,0 +1,271 @@ +# coding=utf-8 +# Copyright 2020 The Facebook AI Research Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +from typing import List, Optional, Tuple + +from tokenizers import processors + +from ...tokenization_utils_base import AddedToken, BatchEncoding +from ...tokenization_utils_fast import PreTrainedTokenizerFast +from ...utils import logging +from .tokenization_bart import BartTokenizer + + +logger = logging.get_logger(__name__) + + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"} + +# See all BART models at https://huggingface.co/models?filter=bart + + +class BartTokenizerFast(PreTrainedTokenizerFast): + r""" + Construct a "fast" BART tokenizer (backed by HuggingFace's *tokenizers* library), derived from the GPT-2 tokenizer, + using byte-level Byte-Pair-Encoding. + + This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will + be encoded differently whether it is at the beginning of the sentence (without space) or not: + + ```python + >>> from transformers import BartTokenizerFast + + >>> tokenizer = BartTokenizerFast.from_pretrained("facebook/bart-base") + >>> tokenizer("Hello world")["input_ids"] + [0, 31414, 232, 2] + + >>> tokenizer(" Hello world")["input_ids"] + [0, 20920, 232, 2] + ``` + + You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you + call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance. + + + + When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with `add_prefix_space=True`. + + + + This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should + refer to this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + Path to the vocabulary file. + merges_file (`str`): + Path to the merges file. + errors (`str`, *optional*, defaults to `"replace"`): + Paradigm to follow when decoding bytes to UTF-8. See + [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information. + bos_token (`str`, *optional*, defaults to `""`): + The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. + + + + When building a sequence using special tokens, this is not the token that is used for the beginning of + sequence. The token used is the `cls_token`. + + + + eos_token (`str`, *optional*, defaults to `""`): + The end of sequence token. + + + + When building a sequence using special tokens, this is not the token that is used for the end of sequence. + The token used is the `sep_token`. + + + + sep_token (`str`, *optional*, defaults to `""`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + cls_token (`str`, *optional*, defaults to `""`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + unk_token (`str`, *optional*, defaults to `""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + pad_token (`str`, *optional*, defaults to `""`): + The token used for padding, for example when batching sequences of different lengths. + mask_token (`str`, *optional*, defaults to `""`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + add_prefix_space (`bool`, *optional*, defaults to `False`): + Whether or not to add an initial space to the input. This allows to treat the leading word just as any + other word. (BART tokenizer detect beginning of words by the preceding space). + trim_offsets (`bool`, *optional*, defaults to `True`): + Whether the post processing step should trim offsets to avoid including whitespaces. + """ + + vocab_files_names = VOCAB_FILES_NAMES + model_input_names = ["input_ids", "attention_mask"] + slow_tokenizer_class = BartTokenizer + + def __init__( + self, + vocab_file=None, + merges_file=None, + tokenizer_file=None, + errors="replace", + bos_token="", + eos_token="", + sep_token="", + cls_token="", + unk_token="", + pad_token="", + mask_token="", + add_prefix_space=False, + trim_offsets=True, + **kwargs, + ): + # we have to specify that this tokens is special otherwise adding it will reset the normalized flag to `False` in `add_special_tokens` + mask_token = ( + AddedToken(mask_token, lstrip=True, normalized=True, special=True) + if isinstance(mask_token, str) + else mask_token + ) + super().__init__( + vocab_file, + merges_file, + tokenizer_file=tokenizer_file, + errors=errors, + bos_token=bos_token, + eos_token=eos_token, + sep_token=sep_token, + cls_token=cls_token, + unk_token=unk_token, + pad_token=pad_token, + mask_token=mask_token, + add_prefix_space=add_prefix_space, + trim_offsets=trim_offsets, + **kwargs, + ) + + # the pre_tokenizer is already updated in the GPT2TokenizerFast `__init__` + tokenizer_component = "post_processor" + tokenizer_component_instance = getattr(self.backend_tokenizer, tokenizer_component, None) + if tokenizer_component_instance: + state = json.loads(tokenizer_component_instance.__getstate__()) + + # The lists 'sep' and 'cls' must be cased in tuples for the object `post_processor_class` + if "sep" in state: + state["sep"] = tuple(state["sep"]) + if "cls" in state: + state["cls"] = tuple(state["cls"]) + + changes_to_apply = False + + if state.get("add_prefix_space", add_prefix_space) != add_prefix_space: + state["add_prefix_space"] = add_prefix_space + changes_to_apply = True + + if state.get("trim_offsets", trim_offsets) != trim_offsets: + state["trim_offsets"] = trim_offsets + changes_to_apply = True + + if changes_to_apply: + component_class = getattr(processors, state.pop("type")) + new_value = component_class(**state) + setattr(self.backend_tokenizer, tokenizer_component, new_value) + + @property + def mask_token(self) -> str: + """ + `str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while not + having been set. + + BART tokenizer has a special mask token to be usable in the fill-mask pipeline. The mask token will greedily + comprise the space before the **. + """ + if self._mask_token is None: + if self.verbose: + logger.error("Using mask_token, but it is not set yet.") + return None + return str(self._mask_token) + + @mask_token.setter + def mask_token(self, value): + """ + Overriding the default behavior of the mask token to have it eat the space before it. + + This is needed to preserve backward compatibility with all the previously used models based on Bart. + """ + # Mask token behave like a normal word, i.e. include the space before it + # So we set lstrip to True + value = AddedToken(value, lstrip=True, rstrip=False) if isinstance(value, str) else value + self._mask_token = value + + def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding: + is_split_into_words = kwargs.get("is_split_into_words", False) + + if is_split_into_words and not self.add_prefix_space: + raise ValueError( + f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True " + "to use it with pretokenized inputs." + ) + + return super()._batch_encode_plus(*args, **kwargs) + + def _encode_plus(self, *args, **kwargs) -> BatchEncoding: + is_split_into_words = kwargs.get("is_split_into_words", False) + + if is_split_into_words and not self.add_prefix_space: + raise ValueError( + f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True " + "to use it with pretokenized inputs." + ) + + return super()._encode_plus(*args, **kwargs) + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + files = self._tokenizer.model.save(save_directory, name=filename_prefix) + return tuple(files) + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + output = [self.bos_token_id] + token_ids_0 + [self.eos_token_id] + if token_ids_1 is None: + return output + + return output + [self.eos_token_id] + token_ids_1 + [self.eos_token_id] + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. BART does not + make use of token type ids, therefore a list of zeros is returned. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of zeros. + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] + + +__all__ = ["BartTokenizerFast"] diff --git a/docs/transformers/build/lib/transformers/models/barthez/__init__.py b/docs/transformers/build/lib/transformers/models/barthez/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..323fe2fe8af9823d4478957b2f94b078ec39b7f3 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/barthez/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .tokenization_barthez import * + from .tokenization_barthez_fast import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/docs/transformers/build/lib/transformers/models/barthez/tokenization_barthez.py b/docs/transformers/build/lib/transformers/models/barthez/tokenization_barthez.py new file mode 100644 index 0000000000000000000000000000000000000000..0db6634d986881e5dbb03b726748f6c31167f5da --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/barthez/tokenization_barthez.py @@ -0,0 +1,291 @@ +# coding=utf-8 +# Copyright 2020 Ecole Polytechnique and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License +"""Tokenization classes for the BARThez model.""" + +import os +from shutil import copyfile +from typing import Any, Dict, List, Optional, Tuple + +import sentencepiece as spm + +from ...tokenization_utils import AddedToken, PreTrainedTokenizer +from ...utils import logging +from ...utils.import_utils import requires + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"} + + +SPIECE_UNDERLINE = "▁" + +# TODO this class is useless. This is the most standard sentencpiece model. Let's find which one is closest and nuke this. + + +@requires(backends=("sentencepiece",)) +class BarthezTokenizer(PreTrainedTokenizer): + """ + Adapted from [`CamembertTokenizer`] and [`BartTokenizer`]. Construct a BARThez tokenizer. Based on + [SentencePiece](https://github.com/google/sentencepiece). + + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that + contains the vocabulary necessary to instantiate a tokenizer. + bos_token (`str`, *optional*, defaults to `""`): + The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. + + + + When building a sequence using special tokens, this is not the token that is used for the beginning of + sequence. The token used is the `cls_token`. + + + + eos_token (`str`, *optional*, defaults to `""`): + The end of sequence token. + + + + When building a sequence using special tokens, this is not the token that is used for the end of sequence. + The token used is the `sep_token`. + + + + sep_token (`str`, *optional*, defaults to `""`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + cls_token (`str`, *optional*, defaults to `""`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + unk_token (`str`, *optional*, defaults to `""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + pad_token (`str`, *optional*, defaults to `""`): + The token used for padding, for example when batching sequences of different lengths. + mask_token (`str`, *optional*, defaults to `""`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + sp_model_kwargs (`dict`, *optional*): + Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for + SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, + to set: + + - `enable_sampling`: Enable subword regularization. + - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - `nbest_size = {0,1}`: No sampling is performed. + - `nbest_size > 1`: samples from the nbest_size results. + - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. + + - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. + + Attributes: + sp_model (`SentencePieceProcessor`): + The *SentencePiece* processor that is used for every conversion (string, tokens and IDs). + """ + + vocab_files_names = VOCAB_FILES_NAMES + model_input_names = ["input_ids", "attention_mask"] + + def __init__( + self, + vocab_file, + bos_token="", + eos_token="", + sep_token="", + cls_token="", + unk_token="", + pad_token="", + mask_token="", + sp_model_kwargs: Optional[Dict[str, Any]] = None, + **kwargs, + ) -> None: + # Mask token behave like a normal word, i.e. include the space before it. Will have normalized=False by default this way + mask_token = AddedToken(mask_token, lstrip=True, special=True) if isinstance(mask_token, str) else mask_token + + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + + self.vocab_file = vocab_file + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(str(vocab_file)) + super().__init__( + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + cls_token=cls_token, + pad_token=pad_token, + mask_token=mask_token, + sp_model_kwargs=self.sp_model_kwargs, + **kwargs, + ) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BARThez sequence has the following format: + + - single sequence: ` X ` + - pair of sequences: ` A B ` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + sep + token_ids_1 + sep + + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is None: + return [1] + ([0] * len(token_ids_0)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of zeros. + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] + + @property + def vocab_size(self): + return len(self.sp_model) + + def get_vocab(self): + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text: str) -> List[str]: + return self.sp_model.encode(text, out_type=str) + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.sp_model.PieceToId(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.sp_model.IdToPiece(index) + + # Copied from transformers.models.albert.tokenization_albert.AlbertTokenizer.convert_tokens_to_string + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + current_sub_tokens = [] + out_string = "" + prev_is_special = False + for token in tokens: + # make sure that special tokens are not decoded using sentencepiece model + if token in self.all_special_tokens: + if not prev_is_special: + out_string += " " + out_string += self.sp_model.decode(current_sub_tokens) + token + prev_is_special = True + current_sub_tokens = [] + else: + current_sub_tokens.append(token) + prev_is_special = False + out_string += self.sp_model.decode(current_sub_tokens) + return out_string.strip() + + def __getstate__(self): + state = self.__dict__.copy() + state["sp_model"] = None + return state + + def __setstate__(self, d): + self.__dict__ = d + + # for backward compatibility + if not hasattr(self, "sp_model_kwargs"): + self.sp_model_kwargs = {} + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(self.vocab_file) + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file): + copyfile(self.vocab_file, out_vocab_file) + elif not os.path.isfile(self.vocab_file): + with open(out_vocab_file, "wb") as fi: + content_spiece_model = self.sp_model.serialized_model_proto() + fi.write(content_spiece_model) + + return (out_vocab_file,) + + +__all__ = ["BarthezTokenizer"] diff --git a/docs/transformers/build/lib/transformers/models/barthez/tokenization_barthez_fast.py b/docs/transformers/build/lib/transformers/models/barthez/tokenization_barthez_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..a1d95ef03e48820f39dd91adf117fb6cdc94fe0a --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/barthez/tokenization_barthez_fast.py @@ -0,0 +1,197 @@ +# coding=utf-8 +# Copyright 2020 Ecole Polytechnique and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License +"""Tokenization classes for the BARThez model.""" + +import os +from shutil import copyfile +from typing import List, Optional, Tuple + +from ...tokenization_utils import AddedToken +from ...tokenization_utils_fast import PreTrainedTokenizerFast +from ...utils import is_sentencepiece_available, logging + + +if is_sentencepiece_available(): + from .tokenization_barthez import BarthezTokenizer +else: + BarthezTokenizer = None + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "tokenizer_file": "tokenizer.json"} + + +SPIECE_UNDERLINE = "▁" + + +class BarthezTokenizerFast(PreTrainedTokenizerFast): + """ + Adapted from [`CamembertTokenizer`] and [`BartTokenizer`]. Construct a "fast" BARThez tokenizer. Based on + [SentencePiece](https://github.com/google/sentencepiece). + + This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should + refer to this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that + contains the vocabulary necessary to instantiate a tokenizer. + bos_token (`str`, *optional*, defaults to `""`): + The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. + + + + When building a sequence using special tokens, this is not the token that is used for the beginning of + sequence. The token used is the `cls_token`. + + + + eos_token (`str`, *optional*, defaults to `""`): + The end of sequence token. + + + + When building a sequence using special tokens, this is not the token that is used for the end of sequence. + The token used is the `sep_token`. + + + + sep_token (`str`, *optional*, defaults to `""`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + cls_token (`str`, *optional*, defaults to `""`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + unk_token (`str`, *optional*, defaults to `""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + pad_token (`str`, *optional*, defaults to `""`): + The token used for padding, for example when batching sequences of different lengths. + mask_token (`str`, *optional*, defaults to `""`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + additional_special_tokens (`List[str]`, *optional*, defaults to `["NOTUSED", "NOTUSED"]`): + Additional special tokens used by the tokenizer. + """ + + vocab_files_names = VOCAB_FILES_NAMES + model_input_names = ["input_ids", "attention_mask"] + slow_tokenizer_class = BarthezTokenizer + + def __init__( + self, + vocab_file=None, + tokenizer_file=None, + bos_token="", + eos_token="", + sep_token="", + cls_token="", + unk_token="", + pad_token="", + mask_token="", + **kwargs, + ): + # Mask token behave like a normal word, i.e. include the space before it + mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token + + super().__init__( + vocab_file, + tokenizer_file=tokenizer_file, + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + cls_token=cls_token, + pad_token=pad_token, + mask_token=mask_token, + **kwargs, + ) + + self.vocab_file = vocab_file + + @property + def can_save_slow_tokenizer(self) -> bool: + return os.path.isfile(self.vocab_file) if self.vocab_file else False + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BARThez sequence has the following format: + + - single sequence: ` X ` + - pair of sequences: ` A B ` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + sep + token_ids_1 + sep + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of zeros. + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + if not self.can_save_slow_tokenizer: + raise ValueError( + "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow " + "tokenizer." + ) + + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): + copyfile(self.vocab_file, out_vocab_file) + + return (out_vocab_file,) + + +__all__ = ["BarthezTokenizerFast"] diff --git a/docs/transformers/build/lib/transformers/models/bartpho/__init__.py b/docs/transformers/build/lib/transformers/models/bartpho/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..597be95d8175cac9d48edf3eb4d42a2a91b95833 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bartpho/__init__.py @@ -0,0 +1,26 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .tokenization_bartpho import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/docs/transformers/build/lib/transformers/models/bartpho/tokenization_bartpho.py b/docs/transformers/build/lib/transformers/models/bartpho/tokenization_bartpho.py new file mode 100644 index 0000000000000000000000000000000000000000..c3e121089e62977591a5c6f85a2af1c3a0d208a4 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bartpho/tokenization_bartpho.py @@ -0,0 +1,318 @@ +# coding=utf-8 +# Copyright 2021 VinAI Research and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License +"""Tokenization classes for BARTpho-syllable model.""" + +import os +from shutil import copyfile +from typing import Any, Dict, List, Optional, Tuple + +import sentencepiece as spm + +from ...tokenization_utils import AddedToken, PreTrainedTokenizer +from ...utils import logging +from ...utils.import_utils import requires + + +logger = logging.get_logger(__name__) + +SPIECE_UNDERLINE = "▁" + +VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "monolingual_vocab_file": "dict.txt"} + + +@requires(backends=("sentencepiece",)) +class BartphoTokenizer(PreTrainedTokenizer): + """ + Adapted from [`XLMRobertaTokenizer`]. Based on [SentencePiece](https://github.com/google/sentencepiece). + + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + Path to the vocabulary file. This vocabulary is the pre-trained SentencePiece model available from the + multilingual XLM-RoBERTa, also used in mBART, consisting of 250K types. + monolingual_vocab_file (`str`): + Path to the monolingual vocabulary file. This monolingual vocabulary consists of Vietnamese-specialized + types extracted from the multilingual vocabulary vocab_file of 250K types. + bos_token (`str`, *optional*, defaults to `""`): + The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. + + + + When building a sequence using special tokens, this is not the token that is used for the beginning of + sequence. The token used is the `cls_token`. + + + + eos_token (`str`, *optional*, defaults to `""`): + The end of sequence token. + + + + When building a sequence using special tokens, this is not the token that is used for the end of sequence. + The token used is the `sep_token`. + + + + sep_token (`str`, *optional*, defaults to `""`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + cls_token (`str`, *optional*, defaults to `""`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + unk_token (`str`, *optional*, defaults to `""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + pad_token (`str`, *optional*, defaults to `""`): + The token used for padding, for example when batching sequences of different lengths. + mask_token (`str`, *optional*, defaults to `""`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + sp_model_kwargs (`dict`, *optional*): + Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for + SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, + to set: + + - `enable_sampling`: Enable subword regularization. + - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - `nbest_size = {0,1}`: No sampling is performed. + - `nbest_size > 1`: samples from the nbest_size results. + - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. + + - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. + + Attributes: + sp_model (`SentencePieceProcessor`): + The *SentencePiece* processor that is used for every conversion (string, tokens and IDs). + """ + + vocab_files_names = VOCAB_FILES_NAMES + model_input_names = ["input_ids", "attention_mask"] + + def __init__( + self, + vocab_file, + monolingual_vocab_file, + bos_token="", + eos_token="", + sep_token="", + cls_token="", + unk_token="", + pad_token="", + mask_token="", + sp_model_kwargs: Optional[Dict[str, Any]] = None, + **kwargs, + ) -> None: + # Mask token behave like a normal word, i.e. include the space before it + mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token + + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + + self.vocab_file = vocab_file + self.monolingual_vocab_file = monolingual_vocab_file + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(str(vocab_file)) + + # Load the reduced vocab + + # Keep order of special tokens for backward compatibility + self.fairseq_tokens_to_ids = {} + cnt = 0 + for token in [bos_token, pad_token, eos_token, unk_token, sep_token, cls_token]: + if str(token) not in self.fairseq_tokens_to_ids: + self.fairseq_tokens_to_ids[str(token)] = cnt + cnt += 1 + with open(monolingual_vocab_file, "r", encoding="utf-8") as f: + for line in f.readlines(): + token = line.strip().split()[0] + self.fairseq_tokens_to_ids[token] = len(self.fairseq_tokens_to_ids) + if str(mask_token) not in self.fairseq_tokens_to_ids: + self.fairseq_tokens_to_ids[str(mask_token)] = len(self.fairseq_tokens_to_ids) + + self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()} + + super().__init__( + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + cls_token=cls_token, + pad_token=pad_token, + mask_token=mask_token, + sp_model_kwargs=self.sp_model_kwargs, + **kwargs, + ) + + def __getstate__(self): + state = self.__dict__.copy() + state["sp_model"] = None + state["sp_model_proto"] = self.sp_model.serialized_model_proto() + return state + + def __setstate__(self, d): + self.__dict__ = d + + # for backward compatibility + if not hasattr(self, "sp_model_kwargs"): + self.sp_model_kwargs = {} + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.LoadFromSerializedProto(self.sp_model_proto) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. An BARTPho sequence has the following format: + + - single sequence: ` X ` + - pair of sequences: ` A B ` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + sep + token_ids_1 + sep + + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is None: + return [1] + ([0] * len(token_ids_0)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. BARTPho does not + make use of token type ids, therefore a list of zeros is returned. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of zeros. + + """ + + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] + + @property + def vocab_size(self): + return len(self.fairseq_ids_to_tokens) + + def get_vocab(self): + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text: str) -> List[str]: + return self.sp_model.encode(text, out_type=str) + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + if token in self.fairseq_tokens_to_ids: + return self.fairseq_tokens_to_ids[token] + else: + return self.unk_token_id + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.fairseq_ids_to_tokens[index] + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (strings for sub-words) in a single string.""" + out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip() + return out_string + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + out_monolingual_vocab_file = os.path.join( + save_directory, + (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["monolingual_vocab_file"], + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file): + copyfile(self.vocab_file, out_vocab_file) + elif not os.path.isfile(self.vocab_file): + with open(out_vocab_file, "wb") as fi: + content_spiece_model = self.sp_model.serialized_model_proto() + fi.write(content_spiece_model) + + if os.path.abspath(self.monolingual_vocab_file) != os.path.abspath( + out_monolingual_vocab_file + ) and os.path.isfile(self.monolingual_vocab_file): + copyfile(self.monolingual_vocab_file, out_monolingual_vocab_file) + elif not os.path.isfile(self.monolingual_vocab_file): + with open(out_monolingual_vocab_file, "w", encoding="utf-8") as fp: + for token in self.fairseq_tokens_to_ids: + if token not in self.all_special_tokens: + fp.write(f"{str(token)} \n") + + return out_vocab_file, out_monolingual_vocab_file + + +__all__ = ["BartphoTokenizer"] diff --git a/docs/transformers/build/lib/transformers/models/beit/__init__.py b/docs/transformers/build/lib/transformers/models/beit/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..44f838a9a5d97f5e604dc47c63db4cbef7479849 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/beit/__init__.py @@ -0,0 +1,30 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_beit import * + from .feature_extraction_beit import * + from .image_processing_beit import * + from .modeling_beit import * + from .modeling_flax_beit import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/docs/transformers/build/lib/transformers/models/beit/configuration_beit.py b/docs/transformers/build/lib/transformers/models/beit/configuration_beit.py new file mode 100644 index 0000000000000000000000000000000000000000..834988258c6b755fcca9f275db139ed76b9a8b54 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/beit/configuration_beit.py @@ -0,0 +1,229 @@ +# coding=utf-8 +# Copyright Microsoft Research and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""BEiT model configuration""" + +import warnings +from collections import OrderedDict +from typing import Mapping + +from packaging import version + +from ...configuration_utils import PretrainedConfig +from ...onnx import OnnxConfig +from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices + + +class BeitConfig(BackboneConfigMixin, PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`BeitModel`]. It is used to instantiate an BEiT + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the BEiT + [microsoft/beit-base-patch16-224-pt22k](https://huggingface.co/microsoft/beit-base-patch16-224-pt22k) architecture. + + Args: + vocab_size (`int`, *optional*, defaults to 8192): + Vocabulary size of the BEiT model. Defines the number of different image tokens that can be used during + pre-training. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 16): + The size (resolution) of each patch. + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + use_mask_token (`bool`, *optional*, defaults to `False`): + Whether to use a mask token for masked image modeling. + use_absolute_position_embeddings (`bool`, *optional*, defaults to `False`): + Whether to use BERT-style absolute position embeddings. + use_relative_position_bias (`bool`, *optional*, defaults to `False`): + Whether to use T5-style relative position embeddings in the self-attention layers. + use_shared_relative_position_bias (`bool`, *optional*, defaults to `False`): + Whether to use the same relative position embeddings across all self-attention layers of the Transformer. + layer_scale_init_value (`float`, *optional*, defaults to 0.1): + Scale to use in the self-attention layers. 0.1 for base, 1e-5 for large. Set 0 to disable layer scale. + drop_path_rate (`float`, *optional*, defaults to 0.1): + Stochastic depth rate per sample (when applied in the main path of residual layers). + use_mean_pooling (`bool`, *optional*, defaults to `True`): + Whether to mean pool the final hidden states of the patches instead of using the final hidden state of the + CLS token, before applying the classification head. + pool_scales (`Tuple[int]`, *optional*, defaults to `[1, 2, 3, 6]`): + Pooling scales used in Pooling Pyramid Module applied on the last feature map. + use_auxiliary_head (`bool`, *optional*, defaults to `True`): + Whether to use an auxiliary head during training. + auxiliary_loss_weight (`float`, *optional*, defaults to 0.4): + Weight of the cross-entropy loss of the auxiliary head. + auxiliary_channels (`int`, *optional*, defaults to 256): + Number of channels to use in the auxiliary head. + auxiliary_num_convs (`int`, *optional*, defaults to 1): + Number of convolutional layers to use in the auxiliary head. + auxiliary_concat_input (`bool`, *optional*, defaults to `False`): + Whether to concatenate the output of the auxiliary head with the input before the classification layer. + semantic_loss_ignore_index (`int`, *optional*, defaults to 255): + The index that is ignored by the loss function of the semantic segmentation model. + out_features (`List[str]`, *optional*): + If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc. + (depending on how many stages the model has). If unset and `out_indices` is set, will default to the + corresponding stages. If unset and `out_indices` is unset, will default to the last stage. Must be in the + same order as defined in the `stage_names` attribute. + out_indices (`List[int]`, *optional*): + If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how + many stages the model has). If unset and `out_features` is set, will default to the corresponding stages. + If unset and `out_features` is unset, will default to the last stage. Must be in the + same order as defined in the `stage_names` attribute. + add_fpn (`bool`, *optional*, defaults to `False`): + Whether to add a FPN as part of the backbone. Only relevant for [`BeitBackbone`]. + reshape_hidden_states (`bool`, *optional*, defaults to `True`): + Whether to reshape the feature maps to 4D tensors of shape `(batch_size, hidden_size, height, width)` in + case the model is used as backbone. If `False`, the feature maps will be 3D tensors of shape `(batch_size, + seq_len, hidden_size)`. Only relevant for [`BeitBackbone`]. + + Example: + + ```python + >>> from transformers import BeitConfig, BeitModel + + >>> # Initializing a BEiT beit-base-patch16-224-pt22k style configuration + >>> configuration = BeitConfig() + + >>> # Initializing a model (with random weights) from the beit-base-patch16-224-pt22k style configuration + >>> model = BeitModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "beit" + + def __init__( + self, + vocab_size=8192, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + initializer_range=0.02, + layer_norm_eps=1e-12, + image_size=224, + patch_size=16, + num_channels=3, + use_mask_token=False, + use_absolute_position_embeddings=False, + use_relative_position_bias=False, + use_shared_relative_position_bias=False, + layer_scale_init_value=0.1, + drop_path_rate=0.1, + use_mean_pooling=True, + pool_scales=[1, 2, 3, 6], + use_auxiliary_head=True, + auxiliary_loss_weight=0.4, + auxiliary_channels=256, + auxiliary_num_convs=1, + auxiliary_concat_input=False, + semantic_loss_ignore_index=255, + out_features=None, + out_indices=None, + add_fpn=False, + reshape_hidden_states=True, + **kwargs, + ): + super().__init__(**kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.use_mask_token = use_mask_token + self.use_absolute_position_embeddings = use_absolute_position_embeddings + self.use_relative_position_bias = use_relative_position_bias + self.use_shared_relative_position_bias = use_shared_relative_position_bias + self.layer_scale_init_value = layer_scale_init_value + self.drop_path_rate = drop_path_rate + self.use_mean_pooling = use_mean_pooling + # decode head attributes (semantic segmentation) + self.pool_scales = pool_scales + # auxiliary head attributes (semantic segmentation) + self.use_auxiliary_head = use_auxiliary_head + self.auxiliary_loss_weight = auxiliary_loss_weight + self.auxiliary_channels = auxiliary_channels + self.auxiliary_num_convs = auxiliary_num_convs + self.auxiliary_concat_input = auxiliary_concat_input + self.semantic_loss_ignore_index = semantic_loss_ignore_index + + # handle backwards compatibility + if "segmentation_indices" in kwargs: + warnings.warn( + "The `segmentation_indices` argument is deprecated and will be removed in a future version, use `out_indices` instead.", + FutureWarning, + ) + out_indices = kwargs.pop("segmentation_indices") + + # backbone attributes + self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, self.num_hidden_layers + 1)] + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + out_features=out_features, out_indices=out_indices, stage_names=self.stage_names + ) + self.add_fpn = add_fpn + self.reshape_hidden_states = reshape_hidden_states + + +# Copied from transformers.models.vit.configuration_vit.ViTOnnxConfig +class BeitOnnxConfig(OnnxConfig): + torch_onnx_minimum_version = version.parse("1.11") + + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + return OrderedDict( + [ + ("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}), + ] + ) + + @property + def atol_for_validation(self) -> float: + return 1e-4 + + +__all__ = ["BeitConfig", "BeitOnnxConfig"] diff --git a/docs/transformers/build/lib/transformers/models/beit/convert_beit_unilm_to_pytorch.py b/docs/transformers/build/lib/transformers/models/beit/convert_beit_unilm_to_pytorch.py new file mode 100644 index 0000000000000000000000000000000000000000..46c72a97f4956144400ae2ab2e99b47134610db5 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/beit/convert_beit_unilm_to_pytorch.py @@ -0,0 +1,373 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert BEiT checkpoints from the unilm repository.""" + +import argparse +import json +from pathlib import Path + +import requests +import torch +from datasets import load_dataset +from huggingface_hub import hf_hub_download +from PIL import Image + +from transformers import ( + BeitConfig, + BeitForImageClassification, + BeitForMaskedImageModeling, + BeitForSemanticSegmentation, + BeitImageProcessor, +) +from transformers.image_utils import PILImageResampling +from transformers.utils import logging + + +logging.set_verbosity_info() +logger = logging.get_logger(__name__) + + +# here we list all keys to be renamed (original name on the left, our name on the right) +def create_rename_keys(config, has_lm_head=False, is_semantic=False): + prefix = "backbone." if is_semantic else "" + + rename_keys = [] + for i in range(config.num_hidden_layers): + # encoder layers: output projection, 2 feedforward neural networks and 2 layernorms + rename_keys.append((f"{prefix}blocks.{i}.norm1.weight", f"beit.encoder.layer.{i}.layernorm_before.weight")) + rename_keys.append((f"{prefix}blocks.{i}.norm1.bias", f"beit.encoder.layer.{i}.layernorm_before.bias")) + rename_keys.append( + (f"{prefix}blocks.{i}.attn.proj.weight", f"beit.encoder.layer.{i}.attention.output.dense.weight") + ) + rename_keys.append( + (f"{prefix}blocks.{i}.attn.proj.bias", f"beit.encoder.layer.{i}.attention.output.dense.bias") + ) + rename_keys.append((f"{prefix}blocks.{i}.norm2.weight", f"beit.encoder.layer.{i}.layernorm_after.weight")) + rename_keys.append((f"{prefix}blocks.{i}.norm2.bias", f"beit.encoder.layer.{i}.layernorm_after.bias")) + rename_keys.append((f"{prefix}blocks.{i}.mlp.fc1.weight", f"beit.encoder.layer.{i}.intermediate.dense.weight")) + rename_keys.append((f"{prefix}blocks.{i}.mlp.fc1.bias", f"beit.encoder.layer.{i}.intermediate.dense.bias")) + rename_keys.append((f"{prefix}blocks.{i}.mlp.fc2.weight", f"beit.encoder.layer.{i}.output.dense.weight")) + rename_keys.append((f"{prefix}blocks.{i}.mlp.fc2.bias", f"beit.encoder.layer.{i}.output.dense.bias")) + + # projection layer + position embeddings + rename_keys.extend( + [ + (f"{prefix}cls_token", "beit.embeddings.cls_token"), + (f"{prefix}patch_embed.proj.weight", "beit.embeddings.patch_embeddings.projection.weight"), + (f"{prefix}patch_embed.proj.bias", "beit.embeddings.patch_embeddings.projection.bias"), + ] + ) + + if has_lm_head: + # mask token + shared relative position bias + layernorm + rename_keys.extend( + [ + ("mask_token", "beit.embeddings.mask_token"), + ( + "rel_pos_bias.relative_position_bias_table", + "beit.encoder.relative_position_bias.relative_position_bias_table", + ), + ( + "rel_pos_bias.relative_position_index", + "beit.encoder.relative_position_bias.relative_position_index", + ), + ("norm.weight", "layernorm.weight"), + ("norm.bias", "layernorm.bias"), + ] + ) + elif is_semantic: + # semantic segmentation classification heads + rename_keys.extend( + [ + ("decode_head.conv_seg.weight", "decode_head.classifier.weight"), + ("decode_head.conv_seg.bias", "decode_head.classifier.bias"), + ("auxiliary_head.conv_seg.weight", "auxiliary_head.classifier.weight"), + ("auxiliary_head.conv_seg.bias", "auxiliary_head.classifier.bias"), + ] + ) + else: + # layernorm + classification head + rename_keys.extend( + [ + ("fc_norm.weight", "beit.pooler.layernorm.weight"), + ("fc_norm.bias", "beit.pooler.layernorm.bias"), + ("head.weight", "classifier.weight"), + ("head.bias", "classifier.bias"), + ] + ) + + return rename_keys + + +# we split up the matrix of each encoder layer into queries, keys and values +def read_in_q_k_v(state_dict, config, has_lm_head=False, is_semantic=False): + for i in range(config.num_hidden_layers): + prefix = "backbone." if is_semantic else "" + # queries, keys and values + in_proj_weight = state_dict.pop(f"{prefix}blocks.{i}.attn.qkv.weight") + q_bias = state_dict.pop(f"{prefix}blocks.{i}.attn.q_bias") + v_bias = state_dict.pop(f"{prefix}blocks.{i}.attn.v_bias") + + state_dict[f"beit.encoder.layer.{i}.attention.attention.query.weight"] = in_proj_weight[ + : config.hidden_size, : + ] + state_dict[f"beit.encoder.layer.{i}.attention.attention.query.bias"] = q_bias + state_dict[f"beit.encoder.layer.{i}.attention.attention.key.weight"] = in_proj_weight[ + config.hidden_size : config.hidden_size * 2, : + ] + state_dict[f"beit.encoder.layer.{i}.attention.attention.value.weight"] = in_proj_weight[ + -config.hidden_size :, : + ] + state_dict[f"beit.encoder.layer.{i}.attention.attention.value.bias"] = v_bias + + # gamma_1 and gamma_2 + # we call them lambda because otherwise they are renamed when using .from_pretrained + gamma_1 = state_dict.pop(f"{prefix}blocks.{i}.gamma_1") + gamma_2 = state_dict.pop(f"{prefix}blocks.{i}.gamma_2") + + state_dict[f"beit.encoder.layer.{i}.lambda_1"] = gamma_1 + state_dict[f"beit.encoder.layer.{i}.lambda_2"] = gamma_2 + + # relative_position bias table + index + if not has_lm_head: + # each layer has its own relative position bias + table = state_dict.pop(f"{prefix}blocks.{i}.attn.relative_position_bias_table") + index = state_dict.pop(f"{prefix}blocks.{i}.attn.relative_position_index") + + state_dict[ + f"beit.encoder.layer.{i}.attention.attention.relative_position_bias.relative_position_bias_table" + ] = table + state_dict[ + f"beit.encoder.layer.{i}.attention.attention.relative_position_bias.relative_position_index" + ] = index + + +def rename_key(dct, old, new): + val = dct.pop(old) + dct[new] = val + + +# We will verify our results on an image of cute cats +def prepare_img(): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + im = Image.open(requests.get(url, stream=True).raw) + return im + + +@torch.no_grad() +def convert_beit_checkpoint(checkpoint_url, pytorch_dump_folder_path): + """ + Copy/paste/tweak model's weights to our BEiT structure. + """ + + # define default BEiT configuration + config = BeitConfig() + has_lm_head = False + is_semantic = False + repo_id = "huggingface/label-files" + # set config parameters based on URL + if checkpoint_url[-9:-4] == "pt22k": + # masked image modeling + config.use_shared_relative_position_bias = True + config.use_mask_token = True + has_lm_head = True + elif checkpoint_url[-9:-4] == "ft22k": + # intermediate fine-tuning on ImageNet-22k + config.use_relative_position_bias = True + config.num_labels = 21841 + filename = "imagenet-22k-id2label.json" + id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) + id2label = {int(k): v for k, v in id2label.items()} + # this dataset contains 21843 labels but the model only has 21841 + # we delete the classes as mentioned in https://github.com/google-research/big_transfer/issues/18 + del id2label[9205] + del id2label[15027] + config.id2label = id2label + config.label2id = {v: k for k, v in id2label.items()} + elif checkpoint_url[-8:-4] == "to1k": + # fine-tuning on ImageNet-1k + config.use_relative_position_bias = True + config.num_labels = 1000 + filename = "imagenet-1k-id2label.json" + id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) + id2label = {int(k): v for k, v in id2label.items()} + config.id2label = id2label + config.label2id = {v: k for k, v in id2label.items()} + if "384" in checkpoint_url: + config.image_size = 384 + if "512" in checkpoint_url: + config.image_size = 512 + elif "ade20k" in checkpoint_url: + # fine-tuning + config.use_relative_position_bias = True + config.num_labels = 150 + filename = "ade20k-id2label.json" + id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) + id2label = {int(k): v for k, v in id2label.items()} + config.id2label = id2label + config.label2id = {v: k for k, v in id2label.items()} + config.image_size = 640 + is_semantic = True + else: + raise ValueError("Checkpoint not supported, URL should either end with 'pt22k', 'ft22k', 'to1k' or 'ade20k'") + + # size of the architecture + if "base" in checkpoint_url: + pass + elif "large" in checkpoint_url: + config.hidden_size = 1024 + config.intermediate_size = 4096 + config.num_hidden_layers = 24 + config.num_attention_heads = 16 + if "ade20k" in checkpoint_url: + config.image_size = 640 + config.out_indices = [7, 11, 15, 23] + else: + raise ValueError("Should either find 'base' or 'large' in checkpoint URL") + + # load state_dict of original model, remove and rename some keys + state_dict = torch.hub.load_state_dict_from_url(checkpoint_url, map_location="cpu", check_hash=True) + state_dict = state_dict["model"] if "ade20k" not in checkpoint_url else state_dict["state_dict"] + + rename_keys = create_rename_keys(config, has_lm_head=has_lm_head, is_semantic=is_semantic) + for src, dest in rename_keys: + rename_key(state_dict, src, dest) + read_in_q_k_v(state_dict, config, has_lm_head=has_lm_head, is_semantic=is_semantic) + if is_semantic: + # add prefix to decoder keys + for key, val in state_dict.copy().items(): + val = state_dict.pop(key) + if key.startswith("backbone.fpn"): + key = key.replace("backbone.fpn", "fpn") + state_dict[key] = val + + # load HuggingFace model + if checkpoint_url[-9:-4] == "pt22k": + model = BeitForMaskedImageModeling(config) + elif "ade20k" in checkpoint_url: + model = BeitForSemanticSegmentation(config) + else: + model = BeitForImageClassification(config) + model.eval() + model.load_state_dict(state_dict) + + # Check outputs on an image + if is_semantic: + image_processor = BeitImageProcessor(size=config.image_size, do_center_crop=False) + ds = load_dataset("hf-internal-testing/fixtures_ade20k", split="test", trust_remote_code=True) + image = Image.open(ds[0]["file"]) + else: + image_processor = BeitImageProcessor( + size=config.image_size, resample=PILImageResampling.BILINEAR, do_center_crop=False + ) + image = prepare_img() + + encoding = image_processor(images=image, return_tensors="pt") + pixel_values = encoding["pixel_values"] + + outputs = model(pixel_values) + logits = outputs.logits + + # verify logits + expected_shape = torch.Size([1, 1000]) + if checkpoint_url[:-4].endswith("beit_base_patch16_224_pt22k"): + expected_shape = torch.Size([1, 196, 8192]) + elif checkpoint_url[:-4].endswith("beit_large_patch16_224_pt22k"): + expected_shape = torch.Size([1, 196, 8192]) + elif checkpoint_url[:-4].endswith("beit_base_patch16_224_pt22k_ft22k"): + expected_shape = torch.Size([1, 21841]) + expected_logits = torch.tensor([2.2288, 2.4671, 0.7395]) + expected_class_idx = 2397 + elif checkpoint_url[:-4].endswith("beit_large_patch16_224_pt22k_ft22k"): + expected_shape = torch.Size([1, 21841]) + expected_logits = torch.tensor([1.6881, -0.2787, 0.5901]) + expected_class_idx = 2396 + elif checkpoint_url[:-4].endswith("beit_base_patch16_224_pt22k_ft1k"): + expected_logits = torch.tensor([0.1241, 0.0798, -0.6569]) + expected_class_idx = 285 + elif checkpoint_url[:-4].endswith("beit_base_patch16_224_pt22k_ft22kto1k"): + expected_logits = torch.tensor([-1.2385, -1.0987, -1.0108]) + expected_class_idx = 281 + elif checkpoint_url[:-4].endswith("beit_base_patch16_384_pt22k_ft22kto1k"): + expected_logits = torch.tensor([-1.5303, -0.9484, -0.3147]) + expected_class_idx = 761 + elif checkpoint_url[:-4].endswith("beit_large_patch16_224_pt22k_ft1k"): + expected_logits = torch.tensor([0.4610, -0.0928, 0.2086]) + expected_class_idx = 761 + elif checkpoint_url[:-4].endswith("beit_large_patch16_224_pt22k_ft22kto1k"): + expected_logits = torch.tensor([-0.4804, 0.6257, -0.1837]) + expected_class_idx = 761 + elif checkpoint_url[:-4].endswith("beit_large_patch16_384_pt22k_ft22kto1k"): + expected_logits = torch.tensor([[-0.5122, 0.5117, -0.2113]]) + expected_class_idx = 761 + elif checkpoint_url[:-4].endswith("beit_large_patch16_512_pt22k_ft22kto1k"): + expected_logits = torch.tensor([-0.3062, 0.7261, 0.4852]) + expected_class_idx = 761 + elif checkpoint_url[:-4].endswith("beit_base_patch16_640_pt22k_ft22ktoade20k"): + expected_shape = (1, 150, 160, 160) + expected_logits = torch.tensor( + [ + [[-4.9225, -2.3954, -3.0522], [-2.8822, -1.0046, -1.7561], [-2.9549, -1.3228, -2.1347]], + [[-5.8168, -3.4129, -4.0778], [-3.8651, -2.2214, -3.0277], [-3.8356, -2.4643, -3.3535]], + [[-0.0078, 3.9952, 4.0754], [2.9856, 4.6944, 5.0035], [3.2413, 4.7813, 4.9969]], + ] + ) + elif checkpoint_url[:-4].endswith("beit_large_patch16_640_pt22k_ft22ktoade20k"): + expected_shape = (1, 150, 160, 160) + expected_logits = torch.tensor( + [ + [[-4.3305, -2.3049, -3.0161], [-2.9591, -1.5305, -2.2251], [-3.4198, -1.8004, -2.9062]], + [[-5.8922, -3.7435, -4.3978], [-4.2063, -2.7872, -3.4755], [-4.2791, -3.1874, -4.1681]], + [[0.9895, 4.3467, 4.7663], [4.2476, 5.6830, 6.1518], [4.5550, 6.2495, 6.5154]], + ] + ) + else: + raise ValueError("Can't verify logits as model is not supported") + + if logits.shape != expected_shape: + raise ValueError(f"Shape of logits not as expected. {logits.shape=}, {expected_shape=}") + if not has_lm_head: + if is_semantic: + if not torch.allclose(logits[0, :3, :3, :3], expected_logits, atol=1e-3): + raise ValueError("First elements of logits not as expected") + else: + print("Predicted class idx:", logits.argmax(-1).item()) + + if not torch.allclose(logits[0, :3], expected_logits, atol=1e-3): + raise ValueError("First elements of logits not as expected") + if logits.argmax(-1).item() != expected_class_idx: + raise ValueError("Predicted class index not as expected") + + Path(pytorch_dump_folder_path).mkdir(exist_ok=True) + print(f"Saving model to {pytorch_dump_folder_path}") + model.save_pretrained(pytorch_dump_folder_path) + print(f"Saving image processor to {pytorch_dump_folder_path}") + image_processor.save_pretrained(pytorch_dump_folder_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "--checkpoint_url", + default="https://conversationhub.blob.core.windows.net/beit-share-public/beit/beit_base_patch16_224_pt22k_ft22kto1k.pth", + type=str, + help="URL to the original PyTorch checkpoint (.pth file).", + ) + parser.add_argument( + "--pytorch_dump_folder_path", default=None, type=str, help="Path to the folder to output PyTorch model." + ) + args = parser.parse_args() + convert_beit_checkpoint(args.checkpoint_url, args.pytorch_dump_folder_path) diff --git a/docs/transformers/build/lib/transformers/models/beit/feature_extraction_beit.py b/docs/transformers/build/lib/transformers/models/beit/feature_extraction_beit.py new file mode 100644 index 0000000000000000000000000000000000000000..7886897c6d1018e8b27e19170ef562cebc4462da --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/beit/feature_extraction_beit.py @@ -0,0 +1,38 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Feature extractor class for BEiT.""" + +import warnings + +from ...utils import logging +from ...utils.import_utils import requires +from .image_processing_beit import BeitImageProcessor + + +logger = logging.get_logger(__name__) + + +@requires(backends=("vision",)) +class BeitFeatureExtractor(BeitImageProcessor): + def __init__(self, *args, **kwargs) -> None: + warnings.warn( + "The class BeitFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please" + " use BeitImageProcessor instead.", + FutureWarning, + ) + super().__init__(*args, **kwargs) + + +__all__ = ["BeitFeatureExtractor"] diff --git a/docs/transformers/build/lib/transformers/models/beit/image_processing_beit.py b/docs/transformers/build/lib/transformers/models/beit/image_processing_beit.py new file mode 100644 index 0000000000000000000000000000000000000000..eb2950f0e20e44cdbd1d65b7130c36044f0970ad --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/beit/image_processing_beit.py @@ -0,0 +1,517 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Image processor class for Beit.""" + +from typing import Any, Dict, List, Optional, Tuple, Union + +import numpy as np + +from ...image_processing_utils import INIT_SERVICE_KWARGS, BaseImageProcessor, BatchFeature, get_size_dict +from ...image_transforms import resize, to_channel_dimension_format +from ...image_utils import ( + IMAGENET_STANDARD_MEAN, + IMAGENET_STANDARD_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + infer_channel_dimension_format, + is_scaled_image, + make_list_of_images, + to_numpy_array, + valid_images, + validate_preprocess_arguments, +) +from ...utils import ( + TensorType, + filter_out_non_signature_kwargs, + is_torch_available, + is_torch_tensor, + is_vision_available, + logging, +) +from ...utils.deprecation import deprecate_kwarg +from ...utils.import_utils import requires + + +if is_vision_available(): + import PIL + +if is_torch_available(): + import torch + + +logger = logging.get_logger(__name__) + + +@requires(backends=("vision",)) +class BeitImageProcessor(BaseImageProcessor): + r""" + Constructs a BEiT image processor. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the + `do_resize` parameter in the `preprocess` method. + size (`Dict[str, int]` *optional*, defaults to `{"height": 256, "width": 256}`): + Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess` + method. + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): + Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the + `preprocess` method. + do_center_crop (`bool`, *optional*, defaults to `True`): + Whether to center crop the image. If the input size is smaller than `crop_size` along any edge, the image + is padded with 0's and then center cropped. Can be overridden by the `do_center_crop` parameter in the + `preprocess` method. + crop_size (`Dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`): + Desired output size when applying center-cropping. Only has an effect if `do_center_crop` is set to `True`. + Can be overridden by the `crop_size` parameter in the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the + `preprocess` method. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale` + parameter in the `preprocess` method. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess` + method. + image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`): + The mean to use if normalizing the image. This is a float or list of floats of length of the number of + channels of the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): + The standard deviation to use if normalizing the image. This is a float or list of floats of length of the + number of channels of the image. Can be overridden by the `image_std` parameter in the `preprocess` method. + do_reduce_labels (`bool`, *optional*, defaults to `False`): + Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0 is + used for background, and background itself is not included in all classes of a dataset (e.g. ADE20k). The + background label will be replaced by 255. Can be overridden by the `do_reduce_labels` parameter in the + `preprocess` method. + """ + + model_input_names = ["pixel_values"] + + @deprecate_kwarg("reduce_labels", new_name="do_reduce_labels", version="4.41.0") + @filter_out_non_signature_kwargs(extra=INIT_SERVICE_KWARGS) + def __init__( + self, + do_resize: bool = True, + size: Dict[str, int] = None, + resample: PILImageResampling = PILImageResampling.BICUBIC, + do_center_crop: bool = True, + crop_size: Dict[str, int] = None, + rescale_factor: Union[int, float] = 1 / 255, + do_rescale: bool = True, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_reduce_labels: bool = False, + **kwargs, + ) -> None: + super().__init__(**kwargs) + size = size if size is not None else {"height": 256, "width": 256} + size = get_size_dict(size) + crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224} + crop_size = get_size_dict(crop_size, param_name="crop_size") + self.do_resize = do_resize + self.size = size + self.resample = resample + self.do_center_crop = do_center_crop + self.crop_size = crop_size + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN + self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD + self.do_reduce_labels = do_reduce_labels + + @classmethod + def from_dict(cls, image_processor_dict: Dict[str, Any], **kwargs): + """ + Overrides the `from_dict` method from the base class to save support of deprecated `reduce_labels` in old configs + """ + image_processor_dict = image_processor_dict.copy() + if "reduce_labels" in image_processor_dict: + image_processor_dict["do_reduce_labels"] = image_processor_dict.pop("reduce_labels") + return super().from_dict(image_processor_dict, **kwargs) + + def resize( + self, + image: np.ndarray, + size: Dict[str, int], + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Resize an image to (size["height"], size["width"]). + + Args: + image (`np.ndarray`): + Image to resize. + size (`Dict[str, int]`): + Size of the output image. + resample (`PILImageResampling`, *optional*, defaults to `PIL.Image.BICUBIC`): + Resampling filter to use when resiizing the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + input_data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred. + """ + size = get_size_dict(size, default_to_square=True, param_name="size") + if "height" not in size or "width" not in size: + raise ValueError(f"The `size` argument must contain `height` and `width` keys. Got {size.keys()}") + return resize( + image, + size=(size["height"], size["width"]), + resample=resample, + data_format=data_format, + input_data_format=input_data_format, + **kwargs, + ) + + def reduce_label(self, label: ImageInput) -> np.ndarray: + label = to_numpy_array(label) + # Avoid using underflow conversion + label[label == 0] = 255 + label = label - 1 + label[label == 254] = 255 + return label + + def _preprocess( + self, + image: ImageInput, + do_reduce_labels: Optional[bool] = None, + do_resize: Optional[bool] = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_center_crop: Optional[bool] = None, + crop_size: Dict[str, int] = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ): + if do_reduce_labels: + image = self.reduce_label(image) + + if do_resize: + image = self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format) + + if do_center_crop: + image = self.center_crop(image=image, size=crop_size, input_data_format=input_data_format) + + if do_rescale: + image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) + + if do_normalize: + image = self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format) + + return image + + def _preprocess_image( + self, + image: ImageInput, + do_resize: Optional[bool] = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_center_crop: Optional[bool] = None, + crop_size: Dict[str, int] = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> np.ndarray: + """Preprocesses a single image.""" + # All transformations expect numpy arrays. + image = to_numpy_array(image) + if do_rescale and is_scaled_image(image): + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + if input_data_format is None: + input_data_format = infer_channel_dimension_format(image) + image = self._preprocess( + image, + do_reduce_labels=False, + do_resize=do_resize, + size=size, + resample=resample, + do_center_crop=do_center_crop, + crop_size=crop_size, + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + input_data_format=input_data_format, + ) + if data_format is not None: + image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) + return image + + def _preprocess_segmentation_map( + self, + segmentation_map: ImageInput, + do_resize: Optional[bool] = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_center_crop: Optional[bool] = None, + crop_size: Dict[str, int] = None, + do_reduce_labels: Optional[bool] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ): + """Preprocesses a single segmentation map.""" + # All transformations expect numpy arrays. + segmentation_map = to_numpy_array(segmentation_map) + # Add an axis to the segmentation maps for transformations. + if segmentation_map.ndim == 2: + segmentation_map = segmentation_map[None, ...] + added_dimension = True + input_data_format = ChannelDimension.FIRST + else: + added_dimension = False + if input_data_format is None: + input_data_format = infer_channel_dimension_format(segmentation_map, num_channels=1) + segmentation_map = self._preprocess( + image=segmentation_map, + do_reduce_labels=do_reduce_labels, + do_resize=do_resize, + resample=resample, + size=size, + do_center_crop=do_center_crop, + crop_size=crop_size, + do_normalize=False, + do_rescale=False, + input_data_format=ChannelDimension.FIRST, + ) + # Remove extra axis if added + if added_dimension: + segmentation_map = np.squeeze(segmentation_map, axis=0) + segmentation_map = segmentation_map.astype(np.int64) + return segmentation_map + + def __call__(self, images, segmentation_maps=None, **kwargs): + # Overrides the `__call__` method of the `Preprocessor` class such that the images and segmentation maps can both + # be passed in as positional arguments. + return super().__call__(images, segmentation_maps=segmentation_maps, **kwargs) + + @deprecate_kwarg("reduce_labels", new_name="do_reduce_labels", version="4.41.0") + @filter_out_non_signature_kwargs() + def preprocess( + self, + images: ImageInput, + segmentation_maps: Optional[ImageInput] = None, + do_resize: Optional[bool] = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_center_crop: Optional[bool] = None, + crop_size: Dict[str, int] = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_reduce_labels: Optional[bool] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: ChannelDimension = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> PIL.Image.Image: + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If + passing in images with pixel values between 0 and 1, set `do_rescale=False`. + segmentation_maps (`ImageInput`, *optional*) + Segmentation maps to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If + passing in images with pixel values between 0 and 1, set `do_rescale=False`. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to `self.size`): + Size of the image after resizing. + resample (`int`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`, Only + has an effect if `do_resize` is set to `True`. + do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`): + Whether to center crop the image. + crop_size (`Dict[str, int]`, *optional*, defaults to `self.crop_size`): + Size of the image after center crop. If one edge the image is smaller than `crop_size`, it will be + padded with zeros and then cropped + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image values between [0 - 1]. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): + Image mean. + image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation. + do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`): + Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0 + is used for background, and background itself is not included in all classes of a dataset (e.g. + ADE20k). The background label will be replaced by 255. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + """ + do_resize = do_resize if do_resize is not None else self.do_resize + size = size if size is not None else self.size + size = get_size_dict(size, default_to_square=True, param_name="size") + resample = resample if resample is not None else self.resample + do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop + crop_size = crop_size if crop_size is not None else self.crop_size + crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size") + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + do_reduce_labels = do_reduce_labels if do_reduce_labels is not None else self.do_reduce_labels + + images = make_list_of_images(images) + + if segmentation_maps is not None: + segmentation_maps = make_list_of_images(segmentation_maps, expected_ndims=2) + + if segmentation_maps is not None and not valid_images(segmentation_maps): + raise ValueError( + "Invalid segmentation_maps type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + + validate_preprocess_arguments( + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_center_crop=do_center_crop, + crop_size=crop_size, + do_resize=do_resize, + size=size, + resample=resample, + ) + + images = [ + self._preprocess_image( + image=img, + do_resize=do_resize, + do_center_crop=do_center_crop, + do_rescale=do_rescale, + do_normalize=do_normalize, + resample=resample, + size=size, + rescale_factor=rescale_factor, + crop_size=crop_size, + image_mean=image_mean, + image_std=image_std, + data_format=data_format, + input_data_format=input_data_format, + ) + for img in images + ] + + data = {"pixel_values": images} + + if segmentation_maps is not None: + segmentation_maps = [ + self._preprocess_segmentation_map( + segmentation_map=segmentation_map, + do_reduce_labels=do_reduce_labels, + do_resize=do_resize, + resample=resample, + size=size, + do_center_crop=do_center_crop, + crop_size=crop_size, + ) + for segmentation_map in segmentation_maps + ] + data["labels"] = segmentation_maps + + return BatchFeature(data=data, tensor_type=return_tensors) + + def post_process_semantic_segmentation(self, outputs, target_sizes: List[Tuple] = None): + """ + Converts the output of [`BeitForSemanticSegmentation`] into semantic segmentation maps. Only supports PyTorch. + + Args: + outputs ([`BeitForSemanticSegmentation`]): + Raw outputs of the model. + target_sizes (`List[Tuple]` of length `batch_size`, *optional*): + List of tuples corresponding to the requested final size (height, width) of each prediction. If unset, + predictions will not be resized. + + Returns: + semantic_segmentation: `List[torch.Tensor]` of length `batch_size`, where each item is a semantic + segmentation map of shape (height, width) corresponding to the target_sizes entry (if `target_sizes` is + specified). Each entry of each `torch.Tensor` correspond to a semantic class id. + """ + # TODO: add support for other frameworks + logits = outputs.logits + + # Resize logits and compute semantic segmentation maps + if target_sizes is not None: + if len(logits) != len(target_sizes): + raise ValueError( + "Make sure that you pass in as many target sizes as the batch dimension of the logits" + ) + + if is_torch_tensor(target_sizes): + target_sizes = target_sizes.numpy() + + semantic_segmentation = [] + + for idx in range(len(logits)): + resized_logits = torch.nn.functional.interpolate( + logits[idx].unsqueeze(dim=0), size=target_sizes[idx], mode="bilinear", align_corners=False + ) + semantic_map = resized_logits[0].argmax(dim=0) + semantic_segmentation.append(semantic_map) + else: + semantic_segmentation = logits.argmax(dim=1) + semantic_segmentation = [semantic_segmentation[i] for i in range(semantic_segmentation.shape[0])] + + return semantic_segmentation + + +__all__ = ["BeitImageProcessor"] diff --git a/docs/transformers/build/lib/transformers/models/beit/modeling_beit.py b/docs/transformers/build/lib/transformers/models/beit/modeling_beit.py new file mode 100644 index 0000000000000000000000000000000000000000..94c6f19d4342efae12dd8442bf1fb7e0b0b1815f --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/beit/modeling_beit.py @@ -0,0 +1,1652 @@ +# coding=utf-8 +# Copyright 2021 Microsoft Research and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch BEiT model.""" + +import collections.abc +import math +import warnings +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import Tensor, nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ...activations import ACT2FN +from ...modeling_outputs import ( + BackboneOutput, + BaseModelOutput, + BaseModelOutputWithPooling, + ImageClassifierOutput, + MaskedLMOutput, + SemanticSegmenterOutput, +) +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import compile_compatible_method_lru_cache, find_pruneable_heads_and_indices, prune_linear_layer +from ...utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, + torch_int, +) +from ...utils.backbone_utils import BackboneMixin +from .configuration_beit import BeitConfig + + +logger = logging.get_logger(__name__) + +# General docstring +_CONFIG_FOR_DOC = "BeitConfig" + +# Base docstring +_CHECKPOINT_FOR_DOC = "microsoft/beit-base-patch16-224-pt22k" +_EXPECTED_OUTPUT_SHAPE = [1, 197, 768] + +# Image classification docstring +_IMAGE_CLASS_CHECKPOINT = "microsoft/beit-base-patch16-224" +_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat" + + +@dataclass +class BeitModelOutputWithPooling(BaseModelOutputWithPooling): + """ + Class for outputs of [`BeitModel`]. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`): + Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if + *config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token + will be returned. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + +def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor: + """ + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks, + however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the + layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the + argument. + """ + if drop_prob == 0.0 or not training: + return input + keep_prob = 1 - drop_prob + shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device) + random_tensor.floor_() # binarize + output = input.div(keep_prob) * random_tensor + return output + + +class BeitDropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob: Optional[float] = None) -> None: + super().__init__() + self.drop_prob = drop_prob + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + return drop_path(hidden_states, self.drop_prob, self.training) + + def extra_repr(self) -> str: + return "p={}".format(self.drop_prob) + + +# Based on timm implementation, which can be found here: +# https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py +class BeitEmbeddings(nn.Module): + """ + Construct the CLS token, position and patch embeddings. Optionally, also the mask token. + + """ + + def __init__(self, config: BeitConfig) -> None: + super().__init__() + + self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) + if config.use_mask_token: + self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) + else: + self.mask_token = None + self.patch_embeddings = BeitPatchEmbeddings(config) + self.patch_size = config.patch_size + self.image_size = ( + config.image_size + if isinstance(config.image_size, collections.abc.Iterable) + else (config.image_size, config.image_size) + ) + num_patches = self.patch_embeddings.num_patches + if config.use_absolute_position_embeddings: + self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size)) + else: + self.position_embeddings = None + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + # Copied from transformers.models.vit.modeling_vit.ViTEmbeddings.interpolate_pos_encoding + def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: + """ + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution + images. This method is also adapted to support torch.jit tracing. + + Adapted from: + - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and + - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211 + """ + + num_patches = embeddings.shape[1] - 1 + num_positions = self.position_embeddings.shape[1] - 1 + + # always interpolate when tracing to ensure the exported model works for dynamic input shapes + if not torch.jit.is_tracing() and num_patches == num_positions and height == width: + return self.position_embeddings + + class_pos_embed = self.position_embeddings[:, :1] + patch_pos_embed = self.position_embeddings[:, 1:] + + dim = embeddings.shape[-1] + + new_height = height // self.patch_size + new_width = width // self.patch_size + + sqrt_num_positions = torch_int(num_positions**0.5) + patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) + patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed, + size=(new_height, new_width), + mode="bicubic", + align_corners=False, + ) + + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + + return torch.cat((class_pos_embed, patch_pos_embed), dim=1) + + def forward( + self, + pixel_values: torch.Tensor, + bool_masked_pos: Optional[torch.BoolTensor] = None, + interpolate_pos_encoding: Optional[bool] = None, + ) -> torch.Tensor: + if self.position_embeddings is not None and interpolate_pos_encoding is not None: + warnings.warn( + "`interpolate_pos_encoding` argument has no effect for BEiTEmbeddings, embeddings are always " + "interpolated to the input image size. The argument will be removed in transformers v4.51.0." + ) + + _, _, height, width = pixel_values.shape + embeddings, (patch_height, patch_width) = self.patch_embeddings(pixel_values) + batch_size, seq_len, _ = embeddings.size() + + if bool_masked_pos is not None: + mask_tokens = self.mask_token.expand(batch_size, seq_len, -1) + # replace the masked visual tokens by mask_tokens + w = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens) + embeddings = embeddings * (1 - w) + mask_tokens * w + + cls_tokens = self.cls_token.expand(batch_size, -1, -1) + embeddings = torch.cat((cls_tokens, embeddings), dim=1) + + if self.position_embeddings is not None: + embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) + + embeddings = self.dropout(embeddings) + + return embeddings, (patch_height, patch_width) + + +class BeitPatchEmbeddings(nn.Module): + """ + This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial + `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a + Transformer. + """ + + def __init__(self, config): + super().__init__() + image_size, patch_size = config.image_size, config.patch_size + num_channels, hidden_size = config.num_channels, config.hidden_size + + image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size) + patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size) + num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0]) + patch_shape = (image_size[0] // patch_size[0], image_size[1] // patch_size[1]) + self.image_size = image_size + self.patch_size = patch_size + self.num_channels = num_channels + self.num_patches = num_patches + self.patch_shape = patch_shape + + self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size) + + def forward(self, pixel_values: torch.Tensor) -> torch.Tensor: + batch_size, num_channels, height, width = pixel_values.shape + if num_channels != self.num_channels: + raise ValueError( + "Make sure that the channel dimension of the pixel values match with the one set in the configuration." + ) + + embeddings = self.projection(pixel_values) + patch_height, patch_width = embeddings.shape[2], embeddings.shape[3] + embeddings = embeddings.flatten(2).transpose(1, 2) + + return embeddings, (patch_height, patch_width) + + +class BeitSelfAttention(nn.Module): + def __init__(self, config: BeitConfig, window_size: Optional[tuple] = None) -> None: + super().__init__() + self.config = config + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size {config.hidden_size} is not a multiple of the number of attention " + f"heads {config.num_attention_heads}." + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=False) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + + self.has_relative_position_bias = bool(window_size) + if self.has_relative_position_bias: + self.relative_position_bias = BeitRelativePositionBias(config, window_size=window_size) + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + relative_position_bias: Optional[torch.Tensor] = None, + interpolate_pos_encoding: bool = False, + resolution: Optional[Tuple[int]] = None, + ) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]: + mixed_query_layer = self.query(hidden_states) + + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + query_layer = self.transpose_for_scores(mixed_query_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + + # Add relative position bias if present. + if self.has_relative_position_bias: + height, width = resolution + window_size = (height // self.config.patch_size, width // self.config.patch_size) + attention_scores = attention_scores + self.relative_position_bias( + window_size, interpolate_pos_encoding, dim_size=hidden_states.shape[1] + ) + + # Add shared relative position bias if provided. + if relative_position_bias is not None: + attention_scores = attention_scores + relative_position_bias + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + return outputs + + +class BeitSdpaSelfAttention(BeitSelfAttention): + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + relative_position_bias: Optional[torch.Tensor] = None, + interpolate_pos_encoding: bool = False, + resolution: Optional[Tuple[int]] = None, + ) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]: + if output_attentions or head_mask is not None: + logger.warning_once( + "`BeitSdpaSelfAttention` is used but `torch.nn.functional.scaled_dot_product_attention` does not " + "support `output_attentions=True` or `head_mask`. Falling back to the manual attention implementation, " + "but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. " + 'This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' + ) + return super().forward( + hidden_states=hidden_states, + head_mask=head_mask, + output_attentions=output_attentions, + relative_position_bias=relative_position_bias, + interpolate_pos_encoding=interpolate_pos_encoding, + resolution=resolution, + ) + + mixed_query_layer = self.query(hidden_states) + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + query_layer = self.transpose_for_scores(mixed_query_layer) + + attn_bias = None + if self.has_relative_position_bias: + height, width = resolution + window_size = (height // self.config.patch_size, width // self.config.patch_size) + attn_bias = self.relative_position_bias( + window_size, interpolate_pos_encoding, dim_size=hidden_states.shape[1] + ) + + # Add shared relative position bias if provided. + if relative_position_bias is not None: + if attn_bias is None: + attn_bias = relative_position_bias + else: + attn_bias += relative_position_bias + + scaling = 1 / math.sqrt(self.attention_head_size) + context_layer = torch.nn.functional.scaled_dot_product_attention( + query_layer, + key_layer, + value_layer, + attn_mask=attn_bias, + dropout_p=self.config.attention_probs_dropout_prob if self.training else 0.0, + is_causal=False, + scale=scaling, + ) + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + return context_layer, None + + +class BeitSelfOutput(nn.Module): + """ + The residual connection is defined in BeitLayer instead of here (as is the case with other models), due to the + layernorm applied before each block. + """ + + def __init__(self, config: BeitConfig) -> None: + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor, gamma=None) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + + return hidden_states + + +BEIT_SELF_ATTENTION_CLASSES = { + "eager": BeitSelfAttention, + "sdpa": BeitSdpaSelfAttention, +} + + +class BeitAttention(nn.Module): + def __init__(self, config: BeitConfig, window_size: Optional[tuple] = None) -> None: + super().__init__() + self.attention = BEIT_SELF_ATTENTION_CLASSES[config._attn_implementation](config, window_size=window_size) + self.output = BeitSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.attention.query = prune_linear_layer(self.attention.query, index) + self.attention.key = prune_linear_layer(self.attention.key, index) + self.attention.value = prune_linear_layer(self.attention.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads) + self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + relative_position_bias: Optional[torch.Tensor] = None, + interpolate_pos_encoding: bool = False, + resolution: Optional[Tuple[int]] = None, + ) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]: + self_outputs = self.attention( + hidden_states, head_mask, output_attentions, relative_position_bias, interpolate_pos_encoding, resolution + ) + + attention_output = self.output(self_outputs[0], hidden_states) + + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class BeitIntermediate(nn.Module): + def __init__(self, config: BeitConfig) -> None: + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + + return hidden_states + + +class BeitOutput(nn.Module): + def __init__(self, config: BeitConfig) -> None: + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + + return hidden_states + + +class BeitLayer(nn.Module): + """This corresponds to the Block class in the timm implementation.""" + + def __init__(self, config: BeitConfig, window_size: Optional[tuple] = None, drop_path_rate: float = 0.0) -> None: + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = BeitAttention(config, window_size=window_size) + self.intermediate = BeitIntermediate(config) + self.output = BeitOutput(config) + self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.drop_path = BeitDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity() + self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + init_values = config.layer_scale_init_value + if init_values > 0: + self.lambda_1 = nn.Parameter(init_values * torch.ones((config.hidden_size)), requires_grad=True) + self.lambda_2 = nn.Parameter(init_values * torch.ones((config.hidden_size)), requires_grad=True) + else: + self.lambda_1, self.lambda_2 = None, None + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + relative_position_bias: Optional[torch.Tensor] = None, + interpolate_pos_encoding: bool = False, + resolution: Optional[Tuple[int]] = None, + ) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]: + self_attention_outputs = self.attention( + self.layernorm_before(hidden_states), # in BEiT, layernorm is applied before self-attention + head_mask, + output_attentions=output_attentions, + relative_position_bias=relative_position_bias, + interpolate_pos_encoding=interpolate_pos_encoding, + resolution=resolution, + ) + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + # apply lambda_1 if present + if self.lambda_1 is not None: + attention_output = self.lambda_1 * attention_output + + # first residual connection + hidden_states = self.drop_path(attention_output) + hidden_states + + # in BEiT, layernorm is also applied after self-attention + layer_output = self.layernorm_after(hidden_states) + + layer_output = self.intermediate(layer_output) + layer_output = self.output(layer_output) + + if self.lambda_2 is not None: + layer_output = self.lambda_2 * layer_output + + # second residual connection + layer_output = self.drop_path(layer_output) + hidden_states + + outputs = (layer_output,) + outputs + + return outputs + + +class BeitRelativePositionBias(nn.Module): + def __init__(self, config: BeitConfig, window_size: tuple) -> None: + super().__init__() + self.window_size = window_size + self.num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3 + self.relative_position_bias_table = nn.Parameter( + torch.zeros(self.num_relative_distance, config.num_attention_heads) + ) # 2*Wh-1 * 2*Ww-1, nH + # cls to token & token 2 cls & cls to cls + + @compile_compatible_method_lru_cache(maxsize=10) + def generate_relative_position_index(self, window_size: Tuple[int, int]) -> torch.Tensor: + """ + This method creates the relative position index, modified to support arbitrary window sizes, + as introduced in [MiDaS v3.1](https://arxiv.org/abs/2307.14460). + """ + num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3 + # cls to token & token 2 cls & cls to cls + # get pair-wise relative position index for each token inside the window + window_area = window_size[0] * window_size[1] + grid = torch.meshgrid(torch.arange(window_size[0]), torch.arange(window_size[1]), indexing="ij") + coords = torch.stack(grid) # 2, Wh, Ww + coords_flatten = torch.flatten(coords, 1) # 2, Wh*Ww + relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww + relative_coords = relative_coords.permute(1, 2, 0).contiguous() # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * window_size[1] - 1 + relative_position_index = torch.zeros(size=(window_area + 1,) * 2, dtype=relative_coords.dtype) + relative_position_index[1:, 1:] = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + relative_position_index[0, 0:] = num_relative_distance - 3 + relative_position_index[0:, 0] = num_relative_distance - 2 + relative_position_index[0, 0] = num_relative_distance - 1 + return relative_position_index + + def forward(self, window_size, interpolate_pos_encoding: bool = False, dim_size=None) -> torch.Tensor: + """ + Modification of timm.models.beit.py: Attention._get_rel_pos_bias to support arbitrary window sizes. + """ + old_height = 2 * self.window_size[0] - 1 + old_width = 2 * self.window_size[1] - 1 + + new_height = 2 * window_size[0] - 1 + new_width = 2 * window_size[1] - 1 + + old_relative_position_bias_table = self.relative_position_bias_table + + old_num_relative_distance = self.num_relative_distance + new_num_relative_distance = new_height * new_width + 3 + + old_sub_table = old_relative_position_bias_table[: old_num_relative_distance - 3] + + old_sub_table = old_sub_table.reshape(1, old_width, old_height, -1).permute(0, 3, 1, 2) + new_sub_table = nn.functional.interpolate( + old_sub_table, size=(torch_int(new_height), torch_int(new_width)), mode="bilinear" + ) + new_sub_table = new_sub_table.permute(0, 2, 3, 1).reshape(new_num_relative_distance - 3, -1) + + new_relative_position_bias_table = torch.cat( + [new_sub_table, old_relative_position_bias_table[old_num_relative_distance - 3 :]] + ) + + relative_position_index = self.generate_relative_position_index(window_size) + relative_position_bias = new_relative_position_bias_table[relative_position_index.view(-1)] + + # patch_size*num_patches_height, patch_size*num_patches_width, num_attention_heads + relative_position_bias = relative_position_bias.view( + window_size[0] * window_size[1] + 1, window_size[0] * window_size[1] + 1, -1 + ) + # num_attention_heads, patch_size*num_patches_width, patch_size*num_patches_height + relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous() + + if interpolate_pos_encoding: + relative_position_bias = nn.functional.interpolate( + relative_position_bias.unsqueeze(1), + size=(dim_size, dim_size), + mode="bilinear", + align_corners=False, + ).squeeze(1) + + return relative_position_bias.unsqueeze(0) + + +class BeitEncoder(nn.Module): + def __init__(self, config: BeitConfig, window_size: Optional[tuple] = None) -> None: + super().__init__() + self.config = config + self.has_relative_position_bias = config.use_shared_relative_position_bias + if self.has_relative_position_bias: + self.relative_position_bias = BeitRelativePositionBias(config, window_size=window_size) + + # stochastic depth decay rule + dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, config.num_hidden_layers, device="cpu")] + self.layer = nn.ModuleList( + [ + BeitLayer( + config, + window_size=window_size if config.use_relative_position_bias else None, + drop_path_rate=dpr[i], + ) + for i in range(config.num_hidden_layers) + ] + ) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + output_hidden_states: bool = False, + interpolate_pos_encoding: bool = False, + resolution: Optional[Tuple[int, int]] = None, + return_dict: bool = True, + ) -> Union[tuple, BaseModelOutput]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if self.has_relative_position_bias: + height, width = resolution + window_size = (height // self.config.patch_size, width // self.config.patch_size) + relative_position_bias = self.relative_position_bias( + window_size, interpolate_pos_encoding=interpolate_pos_encoding, dim_size=hidden_states.shape[1] + ) + else: + relative_position_bias = None + + layer_head_mask = head_mask[i] if head_mask is not None else None + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + layer_module.__call__, + hidden_states, + layer_head_mask, + output_attentions, + relative_position_bias, + interpolate_pos_encoding, + resolution, + ) + else: + layer_outputs = layer_module( + hidden_states, + layer_head_mask, + output_attentions, + relative_position_bias, + interpolate_pos_encoding, + resolution, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class BeitPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = BeitConfig + base_model_prefix = "beit" + main_input_name = "pixel_values" + supports_gradient_checkpointing = True + _no_split_modules = ["BeitLayer"] + _keys_to_ignore_on_load_unexpected = [r".*relative_position_index.*"] + _supports_sdpa = True + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv2d, nn.ConvTranspose2d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, BeitEmbeddings): + module.cls_token.data.zero_() + if module.mask_token is not None: + module.mask_token.data.zero_() + if module.position_embeddings is not None: + module.position_embeddings.data.zero_() + elif isinstance(module, BeitRelativePositionBias): + module.relative_position_bias_table.data.zero_() + elif isinstance(module, BeitLayer): + if module.lambda_1 is not None: + module.lambda_1.data.fill_(self.config.layer_scale_init_value) + module.lambda_2.data.fill_(self.config.layer_scale_init_value) + + +BEIT_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it + as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`BeitConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +BEIT_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See + [`BeitImageProcessor.__call__`] for details. + + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + interpolate_pos_encoding (`bool`, *optional*, defaults to `False`): + Whether to interpolate the pre-trained position encodings. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare Beit Model transformer outputting raw hidden-states without any specific head on top.", + BEIT_START_DOCSTRING, +) +class BeitModel(BeitPreTrainedModel): + def __init__(self, config: BeitConfig, add_pooling_layer: bool = True) -> None: + super().__init__(config) + self.config = config + + self.embeddings = BeitEmbeddings(config) + self.encoder = BeitEncoder(config, window_size=self.embeddings.patch_embeddings.patch_shape) + + self.layernorm = ( + nn.Identity() if config.use_mean_pooling else nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + ) + self.pooler = BeitPooler(config) if add_pooling_layer else None + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.patch_embeddings + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward(BEIT_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BeitModelOutputWithPooling, + config_class=_CONFIG_FOR_DOC, + modality="vision", + expected_output=_EXPECTED_OUTPUT_SHAPE, + ) + def forward( + self, + pixel_values: torch.Tensor, + bool_masked_pos: Optional[torch.BoolTensor] = None, + head_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + return_dict: Optional[bool] = None, + ) -> Union[tuple, BeitModelOutputWithPooling]: + r""" + bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*): + Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embedding_output, _ = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos) + resolution = pixel_values.shape[2:] + + encoder_outputs = self.encoder( + embedding_output, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + resolution=resolution, + return_dict=return_dict, + interpolate_pos_encoding=interpolate_pos_encoding, + ) + sequence_output = encoder_outputs[0] + sequence_output = self.layernorm(sequence_output) + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + if not return_dict: + head_outputs = (sequence_output, pooled_output) if pooled_output is not None else (sequence_output,) + return head_outputs + encoder_outputs[1:] + + return BeitModelOutputWithPooling( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +class BeitPooler(nn.Module): + def __init__(self, config: BeitConfig) -> None: + super().__init__() + self.layernorm = ( + nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) if config.use_mean_pooling else None + ) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + if self.layernorm is not None: + # Mean pool the final hidden states of the patch tokens + patch_tokens = hidden_states[:, 1:, :] + pooled_output = self.layernorm(patch_tokens.mean(1)) + else: + # Pool by simply taking the final hidden state of the [CLS] token + pooled_output = hidden_states[:, 0] + + return pooled_output + + +@add_start_docstrings( + """Beit Model transformer with a 'language' modeling head on top. BEiT does masked image modeling by predicting + visual tokens of a Vector-Quantize Variational Autoencoder (VQ-VAE), whereas other vision models like ViT and DeiT + predict RGB pixel values. As a result, this class is incompatible with [`AutoModelForMaskedImageModeling`], so you + will need to use [`BeitForMaskedImageModeling`] directly if you wish to do masked image modeling with BEiT.""", + BEIT_START_DOCSTRING, +) +class BeitForMaskedImageModeling(BeitPreTrainedModel): + def __init__(self, config: BeitConfig) -> None: + super().__init__(config) + + self.num_labels = config.num_labels + self.beit = BeitModel(config, add_pooling_layer=False) + + # Classifier head + self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(BEIT_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + bool_masked_pos: Optional[torch.BoolTensor] = None, + head_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + return_dict: Optional[bool] = None, + ) -> Union[tuple, MaskedLMOutput]: + r""" + bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`): + Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). + + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the image classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + + Returns: + + Examples: + + ```python + >>> from transformers import AutoImageProcessor, BeitForMaskedImageModeling + >>> import torch + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/beit-base-patch16-224-pt22k") + >>> model = BeitForMaskedImageModeling.from_pretrained("microsoft/beit-base-patch16-224-pt22k") + + >>> num_patches = (model.config.image_size // model.config.patch_size) ** 2 + >>> pixel_values = image_processor(images=image, return_tensors="pt").pixel_values + >>> # create random boolean mask of shape (batch_size, num_patches) + >>> bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool() + + >>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos) + >>> loss, logits = outputs.loss, outputs.logits + >>> list(logits.shape) + [1, 196, 8192] + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.beit( + pixel_values, + bool_masked_pos=bool_masked_pos, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + sequence_output = self.layernorm(sequence_output) + prediction_scores = self.lm_head(sequence_output[:, 1:]) + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() # -100 index = padding token + masked_lm_loss = loss_fct(prediction_scores[bool_masked_pos], labels) + + if not return_dict: + output = (prediction_scores,) + outputs[1:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return MaskedLMOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Beit Model transformer with an image classification head on top (a linear layer on top of the average of the final + hidden states of the patch tokens) e.g. for ImageNet. + """, + BEIT_START_DOCSTRING, +) +class BeitForImageClassification(BeitPreTrainedModel): + def __init__(self, config: BeitConfig) -> None: + super().__init__(config) + + self.num_labels = config.num_labels + self.beit = BeitModel(config, add_pooling_layer=True) + + # Classifier head + self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity() + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(BEIT_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_IMAGE_CLASS_CHECKPOINT, + output_type=ImageClassifierOutput, + config_class=_CONFIG_FOR_DOC, + expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT, + ) + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + return_dict: Optional[bool] = None, + ) -> Union[tuple, ImageClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the image classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + outputs = self.beit( + pixel_values, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + interpolate_pos_encoding=interpolate_pos_encoding, + return_dict=return_dict, + ) + + pooled_output = outputs.pooler_output if return_dict else outputs[1] + + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return ImageClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class BeitConvModule(nn.Module): + """ + A convolutional block that bundles conv/norm/activation layers. This block simplifies the usage of convolution + layers, which are commonly used with a norm layer (e.g., BatchNorm) and activation layer (e.g., ReLU). + + Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation. + """ + + def __init__( + self, + in_channels: int, + out_channels: int, + kernel_size: Union[int, Tuple[int, int]], + padding: Union[int, Tuple[int, int], str] = 0, + bias: bool = False, + dilation: Union[int, Tuple[int, int]] = 1, + ) -> None: + super().__init__() + self.conv = nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + padding=padding, + bias=bias, + dilation=dilation, + ) + self.bn = nn.BatchNorm2d(out_channels) + self.activation = nn.ReLU() + + def forward(self, input: torch.Tensor) -> torch.Tensor: + output = self.conv(input) + output = self.bn(output) + output = self.activation(output) + + return output + + +class BeitPyramidPoolingBlock(nn.Module): + def __init__(self, pool_scale: int, in_channels: int, channels: int) -> None: + super().__init__() + self.layers = [ + nn.AdaptiveAvgPool2d(pool_scale), + BeitConvModule(in_channels, channels, kernel_size=1), + ] + for i, layer in enumerate(self.layers): + self.add_module(str(i), layer) + + def forward(self, input: torch.Tensor) -> torch.Tensor: + hidden_state = input + for layer in self.layers: + hidden_state = layer(hidden_state) + return hidden_state + + +class BeitPyramidPoolingModule(nn.Module): + """ + Pyramid Pooling Module (PPM) used in PSPNet. + + Args: + pool_scales (tuple[int]): Pooling scales used in Pooling Pyramid + Module. + in_channels (int): Input channels. + channels (int): Channels after modules, before conv_seg. + align_corners (bool): align_corners argument of F.interpolate. + + Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation. + """ + + def __init__(self, pool_scales: Tuple[int, ...], in_channels: int, channels: int, align_corners: bool) -> None: + super().__init__() + self.pool_scales = pool_scales + self.align_corners = align_corners + self.in_channels = in_channels + self.channels = channels + self.blocks = [] + for i, pool_scale in enumerate(pool_scales): + block = BeitPyramidPoolingBlock(pool_scale=pool_scale, in_channels=in_channels, channels=channels) + self.blocks.append(block) + self.add_module(str(i), block) + + def forward(self, x: torch.Tensor) -> List[torch.Tensor]: + ppm_outs = [] + for ppm in self.blocks: + ppm_out = ppm(x) + upsampled_ppm_out = nn.functional.interpolate( + ppm_out, size=x.size()[2:], mode="bilinear", align_corners=self.align_corners + ) + ppm_outs.append(upsampled_ppm_out) + return ppm_outs + + +class BeitUperHead(nn.Module): + """ + Unified Perceptual Parsing for Scene Understanding. This head is the implementation of + [UPerNet](https://arxiv.org/abs/1807.10221). + + Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation. + """ + + def __init__(self, config: BeitConfig) -> None: + super().__init__() + + self.pool_scales = config.pool_scales # e.g. (1, 2, 3, 6) + self.in_channels = [config.hidden_size] * 4 # e.g. [768, 768, 768, 768] + self.channels = config.hidden_size + self.align_corners = False + self.classifier = nn.Conv2d(self.channels, config.num_labels, kernel_size=1) + + # PSP Module + self.psp_modules = BeitPyramidPoolingModule( + self.pool_scales, + self.in_channels[-1], + self.channels, + align_corners=self.align_corners, + ) + self.bottleneck = BeitConvModule( + self.in_channels[-1] + len(self.pool_scales) * self.channels, + self.channels, + kernel_size=3, + padding=1, + ) + # FPN Module + self.lateral_convs = nn.ModuleList() + self.fpn_convs = nn.ModuleList() + for in_channels in self.in_channels[:-1]: # skip the top layer + l_conv = BeitConvModule(in_channels, self.channels, kernel_size=1) + fpn_conv = BeitConvModule(self.channels, self.channels, kernel_size=3, padding=1) + self.lateral_convs.append(l_conv) + self.fpn_convs.append(fpn_conv) + + self.fpn_bottleneck = BeitConvModule( + len(self.in_channels) * self.channels, + self.channels, + kernel_size=3, + padding=1, + ) + + def psp_forward(self, inputs): + x = inputs[-1] + psp_outs = [x] + psp_outs.extend(self.psp_modules(x)) + psp_outs = torch.cat(psp_outs, dim=1) + output = self.bottleneck(psp_outs) + + return output + + def forward(self, encoder_hidden_states: torch.Tensor) -> torch.Tensor: + # build laterals + laterals = [lateral_conv(encoder_hidden_states[i]) for i, lateral_conv in enumerate(self.lateral_convs)] + + laterals.append(self.psp_forward(encoder_hidden_states)) + + # build top-down path + used_backbone_levels = len(laterals) + for i in range(used_backbone_levels - 1, 0, -1): + prev_shape = laterals[i - 1].shape[2:] + laterals[i - 1] = laterals[i - 1] + nn.functional.interpolate( + laterals[i], size=prev_shape, mode="bilinear", align_corners=self.align_corners + ) + + # build outputs + fpn_outs = [self.fpn_convs[i](laterals[i]) for i in range(used_backbone_levels - 1)] + # append psp feature + fpn_outs.append(laterals[-1]) + + for i in range(used_backbone_levels - 1, 0, -1): + fpn_outs[i] = nn.functional.interpolate( + fpn_outs[i], size=fpn_outs[0].shape[2:], mode="bilinear", align_corners=self.align_corners + ) + fpn_outs = torch.cat(fpn_outs, dim=1) + output = self.fpn_bottleneck(fpn_outs) + output = self.classifier(output) + + return output + + +class BeitFCNHead(nn.Module): + """ + Fully Convolution Networks for Semantic Segmentation. This head is implemented of + [FCNNet](https://arxiv.org/abs/1411.4038>). + + Args: + config (BeitConfig): Configuration. + in_channels + kernel_size (int): The kernel size for convs in the head. Default: 3. + dilation (int): The dilation rate for convs in the head. Default: 1. + + + Based on OpenMMLab's implementation, found in https://github.com/open-mmlab/mmsegmentation. + """ + + def __init__( + self, config: BeitConfig, in_index: int = 2, kernel_size: int = 3, dilation: Union[int, Tuple[int, int]] = 1 + ) -> None: + super().__init__() + self.in_channels = config.hidden_size + self.channels = config.auxiliary_channels + self.num_convs = config.auxiliary_num_convs + self.concat_input = config.auxiliary_concat_input + self.in_index = in_index + + conv_padding = (kernel_size // 2) * dilation + convs = [] + convs.append( + BeitConvModule( + self.in_channels, self.channels, kernel_size=kernel_size, padding=conv_padding, dilation=dilation + ) + ) + for i in range(self.num_convs - 1): + convs.append( + BeitConvModule( + self.channels, self.channels, kernel_size=kernel_size, padding=conv_padding, dilation=dilation + ) + ) + if self.num_convs == 0: + self.convs = nn.Identity() + else: + self.convs = nn.Sequential(*convs) + if self.concat_input: + self.conv_cat = BeitConvModule( + self.in_channels + self.channels, self.channels, kernel_size=kernel_size, padding=kernel_size // 2 + ) + + self.classifier = nn.Conv2d(self.channels, config.num_labels, kernel_size=1) + + def forward(self, encoder_hidden_states: torch.Tensor) -> torch.Tensor: + # just take the relevant feature maps + hidden_states = encoder_hidden_states[self.in_index] + output = self.convs(hidden_states) + if self.concat_input: + output = self.conv_cat(torch.cat([hidden_states, output], dim=1)) + output = self.classifier(output) + return output + + +@add_start_docstrings( + """ + Beit Model transformer with a semantic segmentation head on top e.g. for ADE20k, CityScapes. + """, + BEIT_START_DOCSTRING, +) +class BeitForSemanticSegmentation(BeitPreTrainedModel): + def __init__(self, config: BeitConfig) -> None: + super().__init__(config) + + self.num_labels = config.num_labels + self.beit = BeitModel(config, add_pooling_layer=False) + + # FPNs + if len(self.config.out_indices) != 4: + raise ValueError( + "BeitForSemanticSegmentation requires config.out_indices to be a list of 4 integers, " + "specifying which features to use from the backbone. One can use [3, 5, 7, 11] in case of " + "a base-sized architecture." + ) + self.fpn1 = nn.Sequential( + nn.ConvTranspose2d(config.hidden_size, config.hidden_size, kernel_size=2, stride=2), + nn.BatchNorm2d(config.hidden_size), + nn.GELU(), + nn.ConvTranspose2d(config.hidden_size, config.hidden_size, kernel_size=2, stride=2), + ) + self.fpn2 = nn.Sequential( + nn.ConvTranspose2d(config.hidden_size, config.hidden_size, kernel_size=2, stride=2), + ) + self.fpn3 = nn.Identity() + self.fpn4 = nn.MaxPool2d(kernel_size=2, stride=2) + + # Semantic segmentation head(s) + self.decode_head = BeitUperHead(config) + self.auxiliary_head = BeitFCNHead(config) if config.use_auxiliary_head else None + + # Initialize weights and apply final processing + self.post_init() + + def compute_loss(self, logits, auxiliary_logits, labels): + # upsample logits to the images' original size + upsampled_logits = nn.functional.interpolate( + logits, size=labels.shape[-2:], mode="bilinear", align_corners=False + ) + if auxiliary_logits is not None: + upsampled_auxiliary_logits = nn.functional.interpolate( + auxiliary_logits, size=labels.shape[-2:], mode="bilinear", align_corners=False + ) + # compute weighted loss + loss_fct = CrossEntropyLoss(ignore_index=self.config.semantic_loss_ignore_index) + main_loss = loss_fct(upsampled_logits, labels) + loss = main_loss + if auxiliary_logits is not None: + auxiliary_loss = loss_fct(upsampled_auxiliary_logits, labels) + loss += self.config.auxiliary_loss_weight * auxiliary_loss + + return loss + + @add_start_docstrings_to_model_forward(BEIT_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=SemanticSegmenterOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + return_dict: Optional[bool] = None, + ) -> Union[tuple, SemanticSegmenterOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*): + Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy). + + Returns: + + Examples: + + ```python + >>> from transformers import AutoImageProcessor, BeitForSemanticSegmentation + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/beit-base-finetuned-ade-640-640") + >>> model = BeitForSemanticSegmentation.from_pretrained("microsoft/beit-base-finetuned-ade-640-640") + + >>> inputs = image_processor(images=image, return_tensors="pt") + >>> outputs = model(**inputs) + >>> # logits are of shape (batch_size, num_labels, height, width) + >>> logits = outputs.logits + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + if labels is not None and self.config.num_labels == 1: + raise ValueError("The number of labels should be greater than one") + + outputs = self.beit( + pixel_values, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=True, # we need the intermediate hidden states + interpolate_pos_encoding=interpolate_pos_encoding, + return_dict=return_dict, + ) + + encoder_hidden_states = outputs.hidden_states if return_dict else outputs[1] + + # only keep certain features, and reshape + # note that we do +1 as the encoder_hidden_states also includes the initial embeddings + features = [feature for idx, feature in enumerate(encoder_hidden_states) if idx + 1 in self.config.out_indices] + batch_size = pixel_values.shape[0] + patch_resolution = self.config.image_size // self.config.patch_size + features = [ + x[:, 1:, :].permute(0, 2, 1).reshape(batch_size, -1, patch_resolution, patch_resolution) for x in features + ] + + # apply FPNs + ops = [self.fpn1, self.fpn2, self.fpn3, self.fpn4] + for i in range(len(features)): + features[i] = ops[i](features[i]) + + logits = self.decode_head(features) + + auxiliary_logits = None + if self.auxiliary_head is not None: + auxiliary_logits = self.auxiliary_head(features) + + loss = None + if labels is not None: + loss = self.compute_loss(logits, auxiliary_logits, labels) + + if not return_dict: + if output_hidden_states: + output = (logits,) + outputs[1:] + else: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return SemanticSegmenterOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states if output_hidden_states else None, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + BEiT backbone, to be used with frameworks like DETR and MaskFormer. + """, + BEIT_START_DOCSTRING, +) +class BeitBackbone(BeitPreTrainedModel, BackboneMixin): + def __init__(self, config): + super().__init__(config) + super()._init_backbone(config) + + self.num_features = [config.hidden_size for _ in range(config.num_hidden_layers + 1)] + self.embeddings = BeitEmbeddings(config) + self.encoder = BeitEncoder(config, window_size=self.embeddings.patch_embeddings.patch_shape) + + if config.add_fpn: + if len(self.config.out_indices) != 4: + raise ValueError( + "BeitBackbone requires config.out_indices to be a list of 4 integers, " + "specifying which features to use from the backbone. One can use [3, 5, 7, 11] in case of " + "a base-sized architecture." + ) + hidden_size = config.hidden_size + self.fpn1 = nn.Sequential( + nn.ConvTranspose2d(hidden_size, hidden_size, kernel_size=2, stride=2), + nn.BatchNorm2d(hidden_size, eps=config.batch_norm_eps), + nn.GELU(), + nn.ConvTranspose2d(hidden_size, hidden_size, kernel_size=2, stride=2), + ) + + self.fpn2 = nn.Sequential(nn.ConvTranspose2d(hidden_size, hidden_size, kernel_size=2, stride=2)) + self.fpn3 = nn.Identity() + self.fpn4 = nn.MaxPool2d(kernel_size=2, stride=2) + + # initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.patch_embeddings + + @add_start_docstrings_to_model_forward(BEIT_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=BackboneOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + pixel_values: Tensor, + output_hidden_states: Optional[bool] = None, + output_attentions: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> BackboneOutput: + """ + Returns: + + Examples: + + ```python + >>> from transformers import AutoImageProcessor, AutoBackbone + >>> import torch + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> processor = AutoImageProcessor.from_pretrained("microsoft/beit-base-patch16-224") + >>> model = AutoBackbone.from_pretrained( + ... "microsoft/beit-base-patch16-224", out_features=["stage1", "stage2", "stage3", "stage4"] + ... ) + + >>> inputs = processor(image, return_tensors="pt") + + >>> outputs = model(**inputs) + >>> feature_maps = outputs.feature_maps + >>> list(feature_maps[-1].shape) + [1, 768, 14, 14] + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + + batch_size = pixel_values.shape[0] + embedding_output, (patch_height, patch_width) = self.embeddings(pixel_values) + resolution = pixel_values.shape[2:] + + outputs = self.encoder( + embedding_output, + output_hidden_states=True, + output_attentions=output_attentions, + resolution=resolution, + return_dict=return_dict, + ) + + hidden_states = outputs.hidden_states if return_dict else outputs[1] + + feature_maps = () + for stage, hidden_state in zip(self.stage_names, hidden_states): + if stage in self.out_features: + if self.config.reshape_hidden_states: + hidden_state = hidden_state[:, 1:, :] + hidden_state = hidden_state.permute(0, 2, 1) + hidden_state = hidden_state.reshape(batch_size, -1, patch_height, patch_width) + + feature_maps += (hidden_state,) + + if self.config.add_fpn: + feature_maps = [ + self.fpn1(feature_maps[0]), + self.fpn2(feature_maps[1]), + self.fpn3(feature_maps[2]), + self.fpn4(feature_maps[3]), + ] + feature_maps = tuple(feature_maps) + + if not return_dict: + if output_hidden_states: + output = (feature_maps,) + outputs[1:] + else: + output = (feature_maps,) + outputs[2:] + return output + + return BackboneOutput( + feature_maps=feature_maps, + hidden_states=outputs.hidden_states if output_hidden_states else None, + attentions=outputs.attentions, + ) + + +__all__ = [ + "BeitForImageClassification", + "BeitForMaskedImageModeling", + "BeitForSemanticSegmentation", + "BeitModel", + "BeitPreTrainedModel", + "BeitBackbone", +] diff --git a/docs/transformers/build/lib/transformers/models/beit/modeling_flax_beit.py b/docs/transformers/build/lib/transformers/models/beit/modeling_flax_beit.py new file mode 100644 index 0000000000000000000000000000000000000000..d37eedea3f4e805fdee885d46c9d2b45fef9d4a0 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/beit/modeling_flax_beit.py @@ -0,0 +1,956 @@ +# coding=utf-8 +# Copyright 2021 Microsoft Research and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from typing import Callable, List, Optional, Tuple + +import flax +import flax.linen as nn +import jax +import jax.numpy as jnp +import numpy as np +from flax.core.frozen_dict import FrozenDict, freeze, unfreeze +from flax.linen.attention import dot_product_attention_weights +from flax.traverse_util import flatten_dict, unflatten_dict + +from ...modeling_flax_outputs import ( + FlaxBaseModelOutput, + FlaxBaseModelOutputWithPooling, + FlaxMaskedLMOutput, + FlaxSequenceClassifierOutput, +) +from ...modeling_flax_utils import ( + ACT2FN, + FlaxPreTrainedModel, + append_replace_return_docstrings, + overwrite_call_docstring, +) +from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward +from .configuration_beit import BeitConfig + + +@flax.struct.dataclass +class FlaxBeitModelOutputWithPooling(FlaxBaseModelOutputWithPooling): + """ + Class for outputs of [`FlaxBeitModel`]. + + Args: + last_hidden_state (`jnp.ndarray` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + pooler_output (`jnp.ndarray` of shape `(batch_size, hidden_size)`): + Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if + *config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token + will be returned. + hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus + the initial embedding outputs. + attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in + the self-attention heads. + """ + + +BEIT_START_DOCSTRING = r""" + + This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading, saving and converting weights from PyTorch models) + + This model is also a + [flax.linen.Module](https://flax.readthedocs.io/en/latest/api_reference/flax.linen/module.html) subclass. Use it as + a regular Flax linen Module and refer to the Flax documentation for all matter related to general usage and + behavior. + + Finally, this model supports inherent JAX features such as: + + - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit) + - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation) + - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap) + - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap) + + Parameters: + config ([`BeitConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights. + dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`): + The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and + `jax.numpy.bfloat16` (on TPUs). + + This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If + specified all the computation will be performed with the given `dtype`. + + **Note that this only specifies the dtype of the computation and does not influence the dtype of model + parameters.** + + If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and + [`~FlaxPreTrainedModel.to_bf16`]. +""" + +BEIT_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`numpy.ndarray` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See + [`AutoImageProcessor.__call__`] for details. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +def relative_position_index_init(window_size: Tuple[int, int]) -> jnp.ndarray: + """ + get pair-wise relative position index for each token inside the window + """ + num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3 + + coords_h = np.arange(window_size[0]) + coords_w = np.arange(window_size[1]) + coords = np.stack(np.meshgrid(coords_h, coords_w, indexing="ij")) # 2, Wh, Ww + coords_flatten = np.reshape(coords, (2, -1)) + relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] # 2, Wh*Ww, Wh*Ww + relative_coords = np.transpose(relative_coords, (1, 2, 0)) # Wh*Ww, Wh*Ww, 2 + relative_coords[:, :, 0] += window_size[0] - 1 # shift to start from 0 + relative_coords[:, :, 1] += window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * window_size[1] - 1 + + relative_position_index = np.zeros(shape=(window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype) + relative_position_index[1:, 1:] = relative_coords.sum(-1) # Wh*Ww, Wh*Ww + relative_position_index[0, 0:] = num_relative_distance - 3 + relative_position_index[0:, 0] = num_relative_distance - 2 + relative_position_index[0, 0] = num_relative_distance - 1 + return jnp.array(relative_position_index) + + +def ones_with_scale(key, shape, scale, dtype=jnp.float32): + return jnp.ones(shape, dtype) * scale + + +class FlaxBeitDropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + rate: float + + @nn.module.compact + def __call__(self, inputs, deterministic: Optional[bool] = True): + if self.rate == 0.0: + return inputs + keep_prob = 1.0 - self.rate + if deterministic: + return inputs + else: + shape = (inputs.shape[0],) + (1,) * (inputs.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + rng = self.make_rng("droppath") + random_tensor = keep_prob + jax.random.uniform(rng, shape=shape, dtype=inputs.dtype) + binary_tensor = jnp.floor(random_tensor) + output = inputs / keep_prob * binary_tensor + return output + + +class FlaxBeitPatchEmbeddings(nn.Module): + config: BeitConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.num_channels = self.config.num_channels + image_size = self.config.image_size + patch_size = self.config.patch_size + num_patches = (image_size // patch_size) * (image_size // patch_size) + patch_shape = (image_size // patch_size, image_size // patch_size) + self.num_patches = num_patches + self.patch_shape = patch_shape + self.projection = nn.Conv( + self.config.hidden_size, + kernel_size=(patch_size, patch_size), + strides=(patch_size, patch_size), + padding="VALID", + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + ) + + def __call__(self, pixel_values): + num_channels = pixel_values.shape[-1] + if num_channels != self.num_channels: + raise ValueError( + "Make sure that the channel dimension of the pixel values match with the one set in the configuration." + ) + embeddings = self.projection(pixel_values) + batch_size, _, _, channels = embeddings.shape + return jnp.reshape(embeddings, (batch_size, -1, channels)) + + +class FlaxBeitEmbeddings(nn.Module): + """Construct the CLS token, position and patch embeddings.""" + + config: BeitConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.cls_token = self.param("cls_token", nn.initializers.zeros, (1, 1, self.config.hidden_size)) + if self.config.use_mask_token: + self.mask_token = self.param("mask_token", nn.initializers.zeros, (1, 1, self.config.hidden_size)) + self.patch_embeddings = FlaxBeitPatchEmbeddings(self.config, dtype=self.dtype) + num_patches = self.patch_embeddings.num_patches + if self.config.use_absolute_position_embeddings: + self.position_embeddings = self.param( + "position_embeddings", nn.initializers.zeros, (1, num_patches + 1, self.config.hidden_size) + ) + self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob) + + def __call__(self, pixel_values, bool_masked_pos=None, deterministic=True): + embeddings = self.patch_embeddings(pixel_values) + batch_size, seq_len, _ = embeddings.shape + + cls_tokens = jnp.broadcast_to(self.cls_token, (batch_size, 1, self.config.hidden_size)) + cls_tokens = cls_tokens.astype(embeddings.dtype) + + if bool_masked_pos is not None: + mask_tokens = jnp.broadcast_to(self.mask_token, (batch_size, seq_len, self.config.hidden_size)) + mask_tokens = mask_tokens.astype(embeddings.dtype) + # replace the masked visual tokens by mask_tokens + w = jnp.expand_dims(bool_masked_pos, axis=-1) + embeddings = embeddings * (1 - w) + mask_tokens * w + + embeddings = jnp.concatenate((cls_tokens, embeddings), axis=1) + + if self.config.use_absolute_position_embeddings: + embeddings = embeddings + self.position_embeddings.astype(embeddings.dtype) + + embeddings = self.dropout(embeddings, deterministic=deterministic) + return embeddings + + +class FlaxBeitRelativePositionBias(nn.Module): + config: BeitConfig + window_size: Tuple[int, int] + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + num_relative_distance = (2 * self.window_size[0] - 1) * (2 * self.window_size[1] - 1) + 3 + self.relative_position_bias_table = self.param( + "relative_position_bias_table", + nn.initializers.zeros, + (num_relative_distance, self.config.num_attention_heads), + ) # 2*Wh-1 * 2*Ww-1, nH + # cls to token & token 2 cls & cls to cls + + self.relative_position_index = relative_position_index_init(self.window_size) + + def __call__(self): + index = self.relative_position_index.reshape(-1) + shape = (self.window_size[0] * self.window_size[1] + 1, self.window_size[0] * self.window_size[1] + 1, -1) + relative_position_bias = self.relative_position_bias_table[index].reshape(shape) # Wh*Ww,Wh*Ww,nH + return jnp.transpose(relative_position_bias, (2, 0, 1)) + + +class FlaxBeitSelfAttention(nn.Module): + config: BeitConfig + window_size: Tuple[int, int] + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + if self.config.hidden_size % self.config.num_attention_heads != 0 and not hasattr( + self.config, "embedding_size" + ): + raise ValueError( + f"The hidden size {self.config.hidden_size} is not a multiple of the number of attention " + f"heads {self.config.num_attention_heads}." + ) + + self.query = nn.Dense( + self.config.hidden_size, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + ) + self.key = nn.Dense( + self.config.hidden_size, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + use_bias=False, + ) + self.value = nn.Dense( + self.config.hidden_size, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + ) + + self.relative_position_bias = ( + FlaxBeitRelativePositionBias(self.config, window_size=self.window_size, dtype=self.dtype) + if self.window_size + else None + ) + + def __call__( + self, hidden_states, relative_position_bias=None, deterministic: bool = True, output_attentions: bool = False + ): + head_dim = self.config.hidden_size // self.config.num_attention_heads + + query_states = self.query(hidden_states).reshape( + hidden_states.shape[:2] + (self.config.num_attention_heads, head_dim) + ) + value_states = self.value(hidden_states).reshape( + hidden_states.shape[:2] + (self.config.num_attention_heads, head_dim) + ) + key_states = self.key(hidden_states).reshape( + hidden_states.shape[:2] + (self.config.num_attention_heads, head_dim) + ) + + dropout_rng = None + if not deterministic and self.config.attention_probs_dropout_prob > 0.0: + dropout_rng = self.make_rng("dropout") + + attention_bias = jnp.array(0.0, dtype=self.dtype) + # Add relative position bias if present. + if self.relative_position_bias is not None: + attention_bias = jnp.expand_dims(self.relative_position_bias(), 0) + attention_bias = attention_bias.astype(query_states.dtype) + + # Add shared relative position bias if provided. + if relative_position_bias is not None: + attention_bias = attention_bias + relative_position_bias.astype(attention_bias.dtype) + + attn_weights = dot_product_attention_weights( + query_states, + key_states, + bias=attention_bias, + dropout_rng=dropout_rng, + dropout_rate=self.config.attention_probs_dropout_prob, + broadcast_dropout=True, + deterministic=deterministic, + dtype=self.dtype, + precision=None, + ) + + attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, value_states) + attn_output = attn_output.reshape(attn_output.shape[:2] + (-1,)) + + outputs = (attn_output, attn_weights) if output_attentions else (attn_output,) + return outputs + + +class FlaxBeitSelfOutput(nn.Module): + config: BeitConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.dense = nn.Dense( + self.config.hidden_size, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + dtype=self.dtype, + ) + self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob) + + def __call__(self, hidden_states, deterministic: bool = True): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states, deterministic=deterministic) + return hidden_states + + +class FlaxBeitAttention(nn.Module): + config: BeitConfig + window_size: Tuple[int, int] + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.attention = FlaxBeitSelfAttention(self.config, self.window_size, dtype=self.dtype) + self.output = FlaxBeitSelfOutput(self.config, dtype=self.dtype) + + def __call__( + self, hidden_states, relative_position_bias=None, deterministic=True, output_attentions: bool = False + ): + attn_outputs = self.attention( + hidden_states, relative_position_bias, deterministic=deterministic, output_attentions=output_attentions + ) + attn_output = attn_outputs[0] + attn_output = self.output(attn_output, deterministic=deterministic) + + outputs = (attn_output,) + + if output_attentions: + outputs += (attn_outputs[1],) + + return outputs + + +class FlaxBeitIntermediate(nn.Module): + config: BeitConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.dense = nn.Dense( + self.config.intermediate_size, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + dtype=self.dtype, + ) + self.activation = ACT2FN[self.config.hidden_act] + + def __call__(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.activation(hidden_states) + + return hidden_states + + +class FlaxBeitOutput(nn.Module): + config: BeitConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.dense = nn.Dense( + self.config.hidden_size, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + dtype=self.dtype, + ) + self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob) + + def __call__(self, hidden_states, deterministic: bool = True): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states, deterministic=deterministic) + + return hidden_states + + +class FlaxBeitLayer(nn.Module): + config: BeitConfig + window_size: Tuple[int, int] + drop_path_rate: float + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.attention = FlaxBeitAttention(self.config, self.window_size, dtype=self.dtype) + self.intermediate = FlaxBeitIntermediate(self.config, dtype=self.dtype) + self.output = FlaxBeitOutput(self.config, dtype=self.dtype) + self.layernorm_before = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype) + self.drop_path = FlaxBeitDropPath(rate=self.drop_path_rate) + self.layernorm_after = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype) + + self.init_values = self.config.layer_scale_init_value + if self.init_values > 0: + self.lambda_1 = self.param("lambda_1", ones_with_scale, (self.config.hidden_size), self.init_values) + self.lambda_2 = self.param("lambda_2", ones_with_scale, (self.config.hidden_size), self.init_values) + else: + self.lambda_1 = None + self.lambda_2 = None + + def __call__( + self, hidden_states, relative_position_bias=None, deterministic: bool = True, output_attentions: bool = False + ): + self_attention_outputs = self.attention( + self.layernorm_before(hidden_states), # in BEiT, layernorm is applied before self-attention + relative_position_bias, + deterministic=deterministic, + output_attentions=output_attentions, + ) + attention_output = self_attention_outputs[0] + + # apply lambda_1 if present + if self.lambda_1 is not None: + attention_output = self.lambda_1.astype(attention_output.dtype) * attention_output + + # first residual connection + hidden_states = self.drop_path(attention_output, deterministic=deterministic) + hidden_states + + # in BEiT, layernorm is also applied after self-attention + layer_output = self.layernorm_after(hidden_states) + + layer_output = self.intermediate(layer_output) + layer_output = self.output(layer_output, deterministic=deterministic) + + # apply lambda_2 if present + if self.lambda_2 is not None: + layer_output = self.lambda_2.astype(layer_output.dtype) * layer_output + + # second residual connection + layer_output = self.drop_path(layer_output, deterministic=deterministic) + hidden_states + + outputs = (layer_output,) + + if output_attentions: + outputs += (self_attention_outputs[1],) + + return outputs + + +class FlaxBeitLayerCollection(nn.Module): + config: BeitConfig + window_size: Tuple[int, int] + drop_path_rates: List[float] + relative_position_bias: Callable[[], jnp.ndarray] + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.layers = [ + FlaxBeitLayer( + self.config, + window_size=self.window_size if self.config.use_relative_position_bias else None, + drop_path_rate=self.drop_path_rates[i], + name=str(i), + dtype=self.dtype, + ) + for i in range(self.config.num_hidden_layers) + ] + + def __call__( + self, + hidden_states, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + all_attentions = () if output_attentions else None + all_hidden_states = () if output_hidden_states else None + + for i, layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + relative_position_bias = self.relative_position_bias() if self.relative_position_bias is not None else None + layer_outputs = layer( + hidden_states, relative_position_bias, deterministic=deterministic, output_attentions=output_attentions + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions += (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states += (hidden_states,) + + outputs = (hidden_states,) + if not return_dict: + return tuple(v for v in outputs if v is not None) + + return FlaxBaseModelOutput( + last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions + ) + + +class FlaxBeitEncoder(nn.Module): + config: BeitConfig + window_size: Tuple[int, int] + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + if self.config.use_shared_relative_position_bias: + self.relative_position_bias = FlaxBeitRelativePositionBias( + config=self.config, window_size=self.window_size, dtype=self.dtype + ) + + # stochastic depth decay rule + drop_path_rates = list(np.linspace(0, self.config.drop_path_rate, self.config.num_hidden_layers)) + self.layer = FlaxBeitLayerCollection( + self.config, + window_size=self.window_size, + drop_path_rates=drop_path_rates, + relative_position_bias=self.relative_position_bias + if self.config.use_shared_relative_position_bias + else None, + dtype=self.dtype, + ) + + def __call__( + self, + hidden_states, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + return self.layer( + hidden_states, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + +class FlaxBeitPreTrainedModel(FlaxPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = BeitConfig + base_model_prefix = "beit" + main_input_name = "pixel_values" + module_class: nn.Module = None + + def __init__( + self, + config: BeitConfig, + input_shape=None, + seed: int = 0, + dtype: jnp.dtype = jnp.float32, + _do_init: bool = True, + **kwargs, + ): + module = self.module_class(config=config, dtype=dtype, **kwargs) + if input_shape is None: + input_shape = (1, config.image_size, config.image_size, config.num_channels) + super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) + + def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict: + # init input tensors + pixel_values = jnp.zeros(input_shape, dtype=self.dtype) + + params_rng, dropout_rng = jax.random.split(rng) + dropout_rng, droppath_rng = jax.random.split(dropout_rng) + rngs = {"params": params_rng, "dropout": dropout_rng, "droppath": droppath_rng} + + random_params = self.module.init(rngs, pixel_values, return_dict=False)["params"] + + if params is not None: + random_params = flatten_dict(unfreeze(random_params)) + params = flatten_dict(unfreeze(params)) + for missing_key in self._missing_keys: + params[missing_key] = random_params[missing_key] + self._missing_keys = set() + return freeze(unflatten_dict(params)) + else: + return random_params + + @add_start_docstrings_to_model_forward(BEIT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + def __call__( + self, + pixel_values, + bool_masked_pos=None, + params: dict = None, + dropout_rng: jax.random.PRNGKey = None, + train: bool = False, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.return_dict + + pixel_values = jnp.transpose(pixel_values, (0, 2, 3, 1)) + # Handle any PRNG if needed + rngs = {} + if dropout_rng is not None: + dropout_rng, droppath_rng = jax.random.split(dropout_rng) + rngs["dropout"] = dropout_rng + rngs["droppath"] = droppath_rng + + return self.module.apply( + {"params": params or self.params}, + jnp.array(pixel_values, dtype=jnp.float32), + bool_masked_pos, + not train, + output_attentions, + output_hidden_states, + return_dict, + rngs=rngs, + ) + + +class FlaxBeitPooler(nn.Module): + config: BeitConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + if self.config.use_mean_pooling: + self.layernorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype) + + def __call__(self, hidden_states): + if self.config.use_mean_pooling: + # Mean pool the final hidden states of the patch tokens + patch_tokens = hidden_states[:, 1:, :] + pooled_output = self.layernorm(jnp.mean(patch_tokens, axis=1)) + else: + # Pool by simply taking the final hidden state of the [CLS] token + pooled_output = hidden_states[:, 0] + + return pooled_output + + +class FlaxBeitModule(nn.Module): + config: BeitConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + add_pooling_layer: bool = True + + def setup(self): + self.embeddings = FlaxBeitEmbeddings(self.config, dtype=self.dtype) + self.encoder = FlaxBeitEncoder( + self.config, window_size=self.embeddings.patch_embeddings.patch_shape, dtype=self.dtype + ) + if not self.config.use_mean_pooling: + self.layernorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype) + self.pooler = FlaxBeitPooler(self.config, dtype=self.dtype) if self.add_pooling_layer else None + + def __call__( + self, + pixel_values, + bool_masked_pos=None, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + hidden_states = self.embeddings(pixel_values, bool_masked_pos, deterministic=deterministic) + + outputs = self.encoder( + hidden_states, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = outputs[0] + if not self.config.use_mean_pooling: + hidden_states = self.layernorm(hidden_states) + pooled = self.pooler(hidden_states) if self.add_pooling_layer else None + + if not return_dict: + # if pooled is None, don't return it + if pooled is None: + return (hidden_states,) + outputs[1:] + return (hidden_states, pooled) + outputs[1:] + + return FlaxBeitModelOutputWithPooling( + last_hidden_state=hidden_states, + pooler_output=pooled, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + "The bare Beit Model transformer outputting raw hidden-states without any specific head on top.", + BEIT_START_DOCSTRING, +) +class FlaxBeitModel(FlaxBeitPreTrainedModel): + module_class = FlaxBeitModule + + +FLAX_BEIT_MODEL_DOCSTRING = """ + Returns: + + Examples: + + ```python + >>> from transformers import AutoImageProcessor, FlaxBeitModel + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/beit-base-patch16-224-pt22k-ft22k") + >>> model = FlaxBeitModel.from_pretrained("microsoft/beit-base-patch16-224-pt22k-ft22k") + + >>> inputs = image_processor(images=image, return_tensors="np") + >>> outputs = model(**inputs) + >>> last_hidden_states = outputs.last_hidden_state + ``` +""" + +overwrite_call_docstring(FlaxBeitModel, FLAX_BEIT_MODEL_DOCSTRING) +append_replace_return_docstrings(FlaxBeitModel, output_type=FlaxBeitModelOutputWithPooling, config_class=BeitConfig) + + +class FlaxBeitForMaskedImageModelingModule(nn.Module): + config: BeitConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.beit = FlaxBeitModule(self.config, add_pooling_layer=False, dtype=self.dtype) + + # Classifier head + self.layernorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype) + self.lm_head = nn.Dense( + self.config.vocab_size, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + dtype=self.dtype, + ) + + def __call__( + self, + pixel_values=None, + bool_masked_pos=None, + deterministic: bool = True, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.beit( + pixel_values, + bool_masked_pos, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + sequence_output = self.layernorm(sequence_output) + prediction_scores = self.lm_head(sequence_output[:, 1:]) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return output + + return FlaxMaskedLMOutput( + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + "Beit Model transformer with a 'language' modeling head on top (to predict visual tokens).", + BEIT_START_DOCSTRING, +) +class FlaxBeitForMaskedImageModeling(FlaxBeitPreTrainedModel): + module_class = FlaxBeitForMaskedImageModelingModule + + +FLAX_BEIT_MLM_DOCSTRING = """ + bool_masked_pos (`numpy.ndarray` of shape `(batch_size, num_patches)`): + Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). + + Returns: + + Examples: + + ```python + >>> from transformers import AutoImageProcessor, BeitForMaskedImageModeling + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/beit-base-patch16-224-pt22k") + >>> model = BeitForMaskedImageModeling.from_pretrained("microsoft/beit-base-patch16-224-pt22k") + + >>> inputs = image_processor(images=image, return_tensors="np") + >>> outputs = model(**inputs) + >>> logits = outputs.logits + ``` +""" + +overwrite_call_docstring(FlaxBeitForMaskedImageModeling, FLAX_BEIT_MLM_DOCSTRING) +append_replace_return_docstrings( + FlaxBeitForMaskedImageModeling, output_type=FlaxMaskedLMOutput, config_class=BeitConfig +) + + +class FlaxBeitForImageClassificationModule(nn.Module): + config: BeitConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.beit = FlaxBeitModule(config=self.config, dtype=self.dtype, add_pooling_layer=True) + self.classifier = nn.Dense( + self.config.num_labels, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + dtype=self.dtype, + ) + + def __call__( + self, + pixel_values=None, + bool_masked_pos=None, + deterministic: bool = True, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.beit( + pixel_values, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + logits = self.classifier(pooled_output) + + if not return_dict: + output = (logits,) + outputs[2:] + return output + + return FlaxSequenceClassifierOutput( + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Beit Model transformer with an image classification head on top (a linear layer on top of the average of the final + hidden states of the patch tokens) e.g. for ImageNet. + """, + BEIT_START_DOCSTRING, +) +class FlaxBeitForImageClassification(FlaxBeitPreTrainedModel): + module_class = FlaxBeitForImageClassificationModule + + +FLAX_BEIT_CLASSIF_DOCSTRING = """ + Returns: + + Example: + + ```python + >>> from transformers import AutoImageProcessor, FlaxBeitForImageClassification + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/beit-base-patch16-224") + >>> model = FlaxBeitForImageClassification.from_pretrained("microsoft/beit-base-patch16-224") + + >>> inputs = image_processor(images=image, return_tensors="np") + >>> outputs = model(**inputs) + >>> logits = outputs.logits + >>> # model predicts one of the 1000 ImageNet classes + >>> predicted_class_idx = logits.argmax(-1).item() + >>> print("Predicted class:", model.config.id2label[predicted_class_idx]) + ``` +""" + +overwrite_call_docstring(FlaxBeitForImageClassification, FLAX_BEIT_CLASSIF_DOCSTRING) +append_replace_return_docstrings( + FlaxBeitForImageClassification, output_type=FlaxSequenceClassifierOutput, config_class=BeitConfig +) + + +__all__ = [ + "FlaxBeitForImageClassification", + "FlaxBeitForMaskedImageModeling", + "FlaxBeitModel", + "FlaxBeitPreTrainedModel", +] diff --git a/docs/transformers/build/lib/transformers/models/bert/__init__.py b/docs/transformers/build/lib/transformers/models/bert/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2ef22794dde26e6275ba0ae850f6042ff6a451fd --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bert/__init__.py @@ -0,0 +1,32 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_bert import * + from .modeling_bert import * + from .modeling_flax_bert import * + from .modeling_tf_bert import * + from .tokenization_bert import * + from .tokenization_bert_fast import * + from .tokenization_bert_tf import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/docs/transformers/build/lib/transformers/models/bert/configuration_bert.py b/docs/transformers/build/lib/transformers/models/bert/configuration_bert.py new file mode 100644 index 0000000000000000000000000000000000000000..ea29fb81c435aa710dfb8912bf83c7068ed30ed6 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bert/configuration_bert.py @@ -0,0 +1,154 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""BERT model configuration""" + +from collections import OrderedDict +from typing import Mapping + +from ...configuration_utils import PretrainedConfig +from ...onnx import OnnxConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class BertConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`BertModel`] or a [`TFBertModel`]. It is used to + instantiate a BERT model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the BERT + [google-bert/bert-base-uncased](https://huggingface.co/google-bert/bert-base-uncased) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`BertModel`] or [`TFBertModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the `token_type_ids` passed when calling [`BertModel`] or [`TFBertModel`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + position_embedding_type (`str`, *optional*, defaults to `"absolute"`): + Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For + positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to + [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). + For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models + with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658). + is_decoder (`bool`, *optional*, defaults to `False`): + Whether the model is used as a decoder or not. If `False`, the model is used as an encoder. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + classifier_dropout (`float`, *optional*): + The dropout ratio for the classification head. + + Examples: + + ```python + >>> from transformers import BertConfig, BertModel + + >>> # Initializing a BERT google-bert/bert-base-uncased style configuration + >>> configuration = BertConfig() + + >>> # Initializing a model (with random weights) from the google-bert/bert-base-uncased style configuration + >>> model = BertModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "bert" + + def __init__( + self, + vocab_size=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=0, + position_embedding_type="absolute", + use_cache=True, + classifier_dropout=None, + **kwargs, + ): + super().__init__(pad_token_id=pad_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.position_embedding_type = position_embedding_type + self.use_cache = use_cache + self.classifier_dropout = classifier_dropout + + +class BertOnnxConfig(OnnxConfig): + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + if self.task == "multiple-choice": + dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"} + else: + dynamic_axis = {0: "batch", 1: "sequence"} + return OrderedDict( + [ + ("input_ids", dynamic_axis), + ("attention_mask", dynamic_axis), + ("token_type_ids", dynamic_axis), + ] + ) + + +__all__ = ["BertConfig", "BertOnnxConfig"] diff --git a/docs/transformers/build/lib/transformers/models/bert/convert_bert_original_tf2_checkpoint_to_pytorch.py b/docs/transformers/build/lib/transformers/models/bert/convert_bert_original_tf2_checkpoint_to_pytorch.py new file mode 100644 index 0000000000000000000000000000000000000000..9dfd8da474e37419472fe9b3ccd35d9836c671a1 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bert/convert_bert_original_tf2_checkpoint_to_pytorch.py @@ -0,0 +1,246 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This script can be used to convert a head-less TF2.x Bert model to PyTorch, as published on the official (now +deprecated) GitHub: https://github.com/tensorflow/models/tree/v2.3.0/official/nlp/bert + +TF2.x uses different variable names from the original BERT (TF 1.4) implementation. The script re-maps the TF2.x Bert +weight names to the original names, so the model can be imported with Huggingface/transformer. + +You may adapt this script to include classification/MLM/NSP/etc. heads. + +Note: This script is only working with an older version of the TensorFlow models repository (<= v2.3.0). + Models trained with never versions are not compatible with this script. +""" + +import argparse +import os +import re + +import tensorflow as tf +import torch + +from transformers import BertConfig, BertModel +from transformers.utils import logging + + +logging.set_verbosity_info() +logger = logging.get_logger(__name__) + + +def load_tf2_weights_in_bert(model, tf_checkpoint_path, config): + tf_path = os.path.abspath(tf_checkpoint_path) + logger.info(f"Converting TensorFlow checkpoint from {tf_path}") + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + names = [] + arrays = [] + layer_depth = [] + for full_name, shape in init_vars: + # logger.info(f"Loading TF weight {name} with shape {shape}") + name = full_name.split("/") + if full_name == "_CHECKPOINTABLE_OBJECT_GRAPH" or name[0] in ["global_step", "save_counter"]: + logger.info(f"Skipping non-model layer {full_name}") + continue + if "optimizer" in full_name: + logger.info(f"Skipping optimization layer {full_name}") + continue + if name[0] == "model": + # ignore initial 'model' + name = name[1:] + # figure out how many levels deep the name is + depth = 0 + for _name in name: + if _name.startswith("layer_with_weights"): + depth += 1 + else: + break + layer_depth.append(depth) + # read data + array = tf.train.load_variable(tf_path, full_name) + names.append("/".join(name)) + arrays.append(array) + logger.info(f"Read a total of {len(arrays):,} layers") + + # Sanity check + if len(set(layer_depth)) != 1: + raise ValueError(f"Found layer names with different depths (layer depth {list(set(layer_depth))})") + layer_depth = list(set(layer_depth))[0] + if layer_depth != 1: + raise ValueError( + "The model contains more than just the embedding/encoder layers. This script does not handle MLM/NSP" + " heads." + ) + + # convert layers + logger.info("Converting weights...") + for full_name, array in zip(names, arrays): + name = full_name.split("/") + pointer = model + trace = [] + for i, m_name in enumerate(name): + if m_name == ".ATTRIBUTES": + # variable names end with .ATTRIBUTES/VARIABLE_VALUE + break + if m_name.startswith("layer_with_weights"): + layer_num = int(m_name.split("-")[-1]) + if layer_num <= 2: + # embedding layers + # layer_num 0: word_embeddings + # layer_num 1: position_embeddings + # layer_num 2: token_type_embeddings + continue + elif layer_num == 3: + # embedding LayerNorm + trace.extend(["embeddings", "LayerNorm"]) + pointer = getattr(pointer, "embeddings") + pointer = getattr(pointer, "LayerNorm") + elif layer_num > 3 and layer_num < config.num_hidden_layers + 4: + # encoder layers + trace.extend(["encoder", "layer", str(layer_num - 4)]) + pointer = getattr(pointer, "encoder") + pointer = getattr(pointer, "layer") + pointer = pointer[layer_num - 4] + elif layer_num == config.num_hidden_layers + 4: + # pooler layer + trace.extend(["pooler", "dense"]) + pointer = getattr(pointer, "pooler") + pointer = getattr(pointer, "dense") + elif m_name == "embeddings": + trace.append("embeddings") + pointer = getattr(pointer, "embeddings") + if layer_num == 0: + trace.append("word_embeddings") + pointer = getattr(pointer, "word_embeddings") + elif layer_num == 1: + trace.append("position_embeddings") + pointer = getattr(pointer, "position_embeddings") + elif layer_num == 2: + trace.append("token_type_embeddings") + pointer = getattr(pointer, "token_type_embeddings") + else: + raise ValueError(f"Unknown embedding layer with name {full_name}") + trace.append("weight") + pointer = getattr(pointer, "weight") + elif m_name == "_attention_layer": + # self-attention layer + trace.extend(["attention", "self"]) + pointer = getattr(pointer, "attention") + pointer = getattr(pointer, "self") + elif m_name == "_attention_layer_norm": + # output attention norm + trace.extend(["attention", "output", "LayerNorm"]) + pointer = getattr(pointer, "attention") + pointer = getattr(pointer, "output") + pointer = getattr(pointer, "LayerNorm") + elif m_name == "_attention_output_dense": + # output attention dense + trace.extend(["attention", "output", "dense"]) + pointer = getattr(pointer, "attention") + pointer = getattr(pointer, "output") + pointer = getattr(pointer, "dense") + elif m_name == "_output_dense": + # output dense + trace.extend(["output", "dense"]) + pointer = getattr(pointer, "output") + pointer = getattr(pointer, "dense") + elif m_name == "_output_layer_norm": + # output dense + trace.extend(["output", "LayerNorm"]) + pointer = getattr(pointer, "output") + pointer = getattr(pointer, "LayerNorm") + elif m_name == "_key_dense": + # attention key + trace.append("key") + pointer = getattr(pointer, "key") + elif m_name == "_query_dense": + # attention query + trace.append("query") + pointer = getattr(pointer, "query") + elif m_name == "_value_dense": + # attention value + trace.append("value") + pointer = getattr(pointer, "value") + elif m_name == "_intermediate_dense": + # attention intermediate dense + trace.extend(["intermediate", "dense"]) + pointer = getattr(pointer, "intermediate") + pointer = getattr(pointer, "dense") + elif m_name == "_output_layer_norm": + # output layer norm + trace.append("output") + pointer = getattr(pointer, "output") + # weights & biases + elif m_name in ["bias", "beta"]: + trace.append("bias") + pointer = getattr(pointer, "bias") + elif m_name in ["kernel", "gamma"]: + trace.append("weight") + pointer = getattr(pointer, "weight") + else: + logger.warning(f"Ignored {m_name}") + # for certain layers reshape is necessary + trace = ".".join(trace) + if re.match(r"(\S+)\.attention\.self\.(key|value|query)\.(bias|weight)", trace) or re.match( + r"(\S+)\.attention\.output\.dense\.weight", trace + ): + array = array.reshape(pointer.data.shape) + if "kernel" in full_name: + array = array.transpose() + if pointer.shape == array.shape: + pointer.data = torch.from_numpy(array) + else: + raise ValueError( + f"Shape mismatch in layer {full_name}: Model expects shape {pointer.shape} but layer contains shape:" + f" {array.shape}" + ) + logger.info(f"Successfully set variable {full_name} to PyTorch layer {trace}") + return model + + +def convert_tf2_checkpoint_to_pytorch(tf_checkpoint_path, config_path, pytorch_dump_path): + # Instantiate model + logger.info(f"Loading model based on config from {config_path}...") + config = BertConfig.from_json_file(config_path) + model = BertModel(config) + + # Load weights from checkpoint + logger.info(f"Loading weights from checkpoint {tf_checkpoint_path}...") + load_tf2_weights_in_bert(model, tf_checkpoint_path, config) + + # Save pytorch-model + logger.info(f"Saving PyTorch model to {pytorch_dump_path}...") + torch.save(model.state_dict(), pytorch_dump_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--tf_checkpoint_path", type=str, required=True, help="Path to the TensorFlow 2.x checkpoint path." + ) + parser.add_argument( + "--bert_config_file", + type=str, + required=True, + help="The config json file corresponding to the BERT model. This specifies the model architecture.", + ) + parser.add_argument( + "--pytorch_dump_path", + type=str, + required=True, + help="Path to the output PyTorch model (must include filename).", + ) + args = parser.parse_args() + convert_tf2_checkpoint_to_pytorch(args.tf_checkpoint_path, args.bert_config_file, args.pytorch_dump_path) diff --git a/docs/transformers/build/lib/transformers/models/bert/convert_bert_original_tf_checkpoint_to_pytorch.py b/docs/transformers/build/lib/transformers/models/bert/convert_bert_original_tf_checkpoint_to_pytorch.py new file mode 100644 index 0000000000000000000000000000000000000000..be904ddd7e6c192676aaff97da587a963749c215 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bert/convert_bert_original_tf_checkpoint_to_pytorch.py @@ -0,0 +1,62 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert BERT checkpoint.""" + +import argparse + +import torch + +from transformers import BertConfig, BertForPreTraining, load_tf_weights_in_bert +from transformers.utils import logging + + +logging.set_verbosity_info() + + +def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path): + # Initialise PyTorch model + config = BertConfig.from_json_file(bert_config_file) + print(f"Building PyTorch model from configuration: {config}") + model = BertForPreTraining(config) + + # Load weights from tf checkpoint + load_tf_weights_in_bert(model, config, tf_checkpoint_path) + + # Save pytorch-model + print(f"Save PyTorch model to {pytorch_dump_path}") + torch.save(model.state_dict(), pytorch_dump_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument( + "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." + ) + parser.add_argument( + "--bert_config_file", + default=None, + type=str, + required=True, + help=( + "The config json file corresponding to the pre-trained BERT model. \n" + "This specifies the model architecture." + ), + ) + parser.add_argument( + "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." + ) + args = parser.parse_args() + convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.bert_config_file, args.pytorch_dump_path) diff --git a/docs/transformers/build/lib/transformers/models/bert/convert_bert_pytorch_checkpoint_to_original_tf.py b/docs/transformers/build/lib/transformers/models/bert/convert_bert_pytorch_checkpoint_to_original_tf.py new file mode 100644 index 0000000000000000000000000000000000000000..8e1e85d5c04e51041822bbfc1843e14a820a1f0b --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bert/convert_bert_pytorch_checkpoint_to_original_tf.py @@ -0,0 +1,112 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Convert Huggingface Pytorch checkpoint to Tensorflow checkpoint.""" + +import argparse +import os + +import numpy as np +import tensorflow as tf +import torch + +from transformers import BertModel + + +def convert_pytorch_checkpoint_to_tf(model: BertModel, ckpt_dir: str, model_name: str): + """ + Args: + model: BertModel Pytorch model instance to be converted + ckpt_dir: Tensorflow model directory + model_name: model name + + Currently supported HF models: + + - Y BertModel + - N BertForMaskedLM + - N BertForPreTraining + - N BertForMultipleChoice + - N BertForNextSentencePrediction + - N BertForSequenceClassification + - N BertForQuestionAnswering + """ + + tensors_to_transpose = ("dense.weight", "attention.self.query", "attention.self.key", "attention.self.value") + + var_map = ( + ("layer.", "layer_"), + ("word_embeddings.weight", "word_embeddings"), + ("position_embeddings.weight", "position_embeddings"), + ("token_type_embeddings.weight", "token_type_embeddings"), + (".", "/"), + ("LayerNorm/weight", "LayerNorm/gamma"), + ("LayerNorm/bias", "LayerNorm/beta"), + ("weight", "kernel"), + ) + + if not os.path.isdir(ckpt_dir): + os.makedirs(ckpt_dir) + + state_dict = model.state_dict() + + def to_tf_var_name(name: str): + for patt, repl in iter(var_map): + name = name.replace(patt, repl) + return f"bert/{name}" + + def create_tf_var(tensor: np.ndarray, name: str, session: tf.Session): + tf_dtype = tf.dtypes.as_dtype(tensor.dtype) + tf_var = tf.get_variable(dtype=tf_dtype, shape=tensor.shape, name=name, initializer=tf.zeros_initializer()) + session.run(tf.variables_initializer([tf_var])) + session.run(tf_var) + return tf_var + + tf.reset_default_graph() + with tf.Session() as session: + for var_name in state_dict: + tf_name = to_tf_var_name(var_name) + torch_tensor = state_dict[var_name].numpy() + if any(x in var_name for x in tensors_to_transpose): + torch_tensor = torch_tensor.T + tf_var = create_tf_var(tensor=torch_tensor, name=tf_name, session=session) + tf_var.assign(tf.cast(torch_tensor, tf_var.dtype)) + tf_weight = session.run(tf_var) + print(f"Successfully created {tf_name}: {np.allclose(tf_weight, torch_tensor)}") + + saver = tf.train.Saver(tf.trainable_variables()) + saver.save(session, os.path.join(ckpt_dir, model_name.replace("-", "_") + ".ckpt")) + + +def main(raw_args=None): + parser = argparse.ArgumentParser() + parser.add_argument("--model_name", type=str, required=True, help="model name e.g. google-bert/bert-base-uncased") + parser.add_argument( + "--cache_dir", type=str, default=None, required=False, help="Directory containing pytorch model" + ) + parser.add_argument("--pytorch_model_path", type=str, required=True, help="/path/to/.bin") + parser.add_argument("--tf_cache_dir", type=str, required=True, help="Directory in which to save tensorflow model") + args = parser.parse_args(raw_args) + + model = BertModel.from_pretrained( + pretrained_model_name_or_path=args.model_name, + state_dict=torch.load(args.pytorch_model_path, weights_only=True), + cache_dir=args.cache_dir, + ) + + convert_pytorch_checkpoint_to_tf(model=model, ckpt_dir=args.tf_cache_dir, model_name=args.model_name) + + +if __name__ == "__main__": + main() diff --git a/docs/transformers/build/lib/transformers/models/bert/convert_bert_token_dropping_original_tf2_checkpoint_to_pytorch.py b/docs/transformers/build/lib/transformers/models/bert/convert_bert_token_dropping_original_tf2_checkpoint_to_pytorch.py new file mode 100644 index 0000000000000000000000000000000000000000..cba1e1a2c3f73e8932fd8bce816e21df5818aa60 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bert/convert_bert_token_dropping_original_tf2_checkpoint_to_pytorch.py @@ -0,0 +1,188 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" +This script converts a lm-head checkpoint from the "Token Dropping" implementation into a PyTorch-compatible BERT +model. The official implementation of "Token Dropping" can be found in the TensorFlow Models repository: + +https://github.com/tensorflow/models/tree/master/official/projects/token_dropping +""" + +import argparse + +import tensorflow as tf +import torch + +from transformers import BertConfig, BertForMaskedLM +from transformers.models.bert.modeling_bert import ( + BertIntermediate, + BertLayer, + BertOutput, + BertPooler, + BertSelfAttention, + BertSelfOutput, +) +from transformers.utils import logging + + +logging.set_verbosity_info() + + +def convert_checkpoint_to_pytorch(tf_checkpoint_path: str, config_path: str, pytorch_dump_path: str): + def get_masked_lm_array(name: str): + full_name = f"masked_lm/{name}/.ATTRIBUTES/VARIABLE_VALUE" + array = tf.train.load_variable(tf_checkpoint_path, full_name) + + if "kernel" in name: + array = array.transpose() + + return torch.from_numpy(array) + + def get_encoder_array(name: str): + full_name = f"encoder/{name}/.ATTRIBUTES/VARIABLE_VALUE" + array = tf.train.load_variable(tf_checkpoint_path, full_name) + + if "kernel" in name: + array = array.transpose() + + return torch.from_numpy(array) + + def get_encoder_layer_array(layer_index: int, name: str): + full_name = f"encoder/_transformer_layers/{layer_index}/{name}/.ATTRIBUTES/VARIABLE_VALUE" + array = tf.train.load_variable(tf_checkpoint_path, full_name) + + if "kernel" in name: + array = array.transpose() + + return torch.from_numpy(array) + + def get_encoder_attention_layer_array(layer_index: int, name: str, orginal_shape): + full_name = f"encoder/_transformer_layers/{layer_index}/_attention_layer/{name}/.ATTRIBUTES/VARIABLE_VALUE" + array = tf.train.load_variable(tf_checkpoint_path, full_name) + array = array.reshape(orginal_shape) + + if "kernel" in name: + array = array.transpose() + + return torch.from_numpy(array) + + print(f"Loading model based on config from {config_path}...") + config = BertConfig.from_json_file(config_path) + model = BertForMaskedLM(config) + + # Layers + for layer_index in range(0, config.num_hidden_layers): + layer: BertLayer = model.bert.encoder.layer[layer_index] + + # Self-attention + self_attn: BertSelfAttention = layer.attention.self + + self_attn.query.weight.data = get_encoder_attention_layer_array( + layer_index, "_query_dense/kernel", self_attn.query.weight.data.shape + ) + self_attn.query.bias.data = get_encoder_attention_layer_array( + layer_index, "_query_dense/bias", self_attn.query.bias.data.shape + ) + self_attn.key.weight.data = get_encoder_attention_layer_array( + layer_index, "_key_dense/kernel", self_attn.key.weight.data.shape + ) + self_attn.key.bias.data = get_encoder_attention_layer_array( + layer_index, "_key_dense/bias", self_attn.key.bias.data.shape + ) + self_attn.value.weight.data = get_encoder_attention_layer_array( + layer_index, "_value_dense/kernel", self_attn.value.weight.data.shape + ) + self_attn.value.bias.data = get_encoder_attention_layer_array( + layer_index, "_value_dense/bias", self_attn.value.bias.data.shape + ) + + # Self-attention Output + self_output: BertSelfOutput = layer.attention.output + + self_output.dense.weight.data = get_encoder_attention_layer_array( + layer_index, "_output_dense/kernel", self_output.dense.weight.data.shape + ) + self_output.dense.bias.data = get_encoder_attention_layer_array( + layer_index, "_output_dense/bias", self_output.dense.bias.data.shape + ) + + self_output.LayerNorm.weight.data = get_encoder_layer_array(layer_index, "_attention_layer_norm/gamma") + self_output.LayerNorm.bias.data = get_encoder_layer_array(layer_index, "_attention_layer_norm/beta") + + # Intermediate + intermediate: BertIntermediate = layer.intermediate + + intermediate.dense.weight.data = get_encoder_layer_array(layer_index, "_intermediate_dense/kernel") + intermediate.dense.bias.data = get_encoder_layer_array(layer_index, "_intermediate_dense/bias") + + # Output + bert_output: BertOutput = layer.output + + bert_output.dense.weight.data = get_encoder_layer_array(layer_index, "_output_dense/kernel") + bert_output.dense.bias.data = get_encoder_layer_array(layer_index, "_output_dense/bias") + + bert_output.LayerNorm.weight.data = get_encoder_layer_array(layer_index, "_output_layer_norm/gamma") + bert_output.LayerNorm.bias.data = get_encoder_layer_array(layer_index, "_output_layer_norm/beta") + + # Embeddings + model.bert.embeddings.position_embeddings.weight.data = get_encoder_array("_position_embedding_layer/embeddings") + model.bert.embeddings.token_type_embeddings.weight.data = get_encoder_array("_type_embedding_layer/embeddings") + model.bert.embeddings.LayerNorm.weight.data = get_encoder_array("_embedding_norm_layer/gamma") + model.bert.embeddings.LayerNorm.bias.data = get_encoder_array("_embedding_norm_layer/beta") + + # LM Head + lm_head = model.cls.predictions.transform + + lm_head.dense.weight.data = get_masked_lm_array("dense/kernel") + lm_head.dense.bias.data = get_masked_lm_array("dense/bias") + + lm_head.LayerNorm.weight.data = get_masked_lm_array("layer_norm/gamma") + lm_head.LayerNorm.bias.data = get_masked_lm_array("layer_norm/beta") + + model.bert.embeddings.word_embeddings.weight.data = get_masked_lm_array("embedding_table") + + # Pooling + model.bert.pooler = BertPooler(config=config) + model.bert.pooler.dense.weight.data: BertPooler = get_encoder_array("_pooler_layer/kernel") + model.bert.pooler.dense.bias.data: BertPooler = get_encoder_array("_pooler_layer/bias") + + # Export final model + model.save_pretrained(pytorch_dump_path) + + # Integration test - should load without any errors ;) + new_model = BertForMaskedLM.from_pretrained(pytorch_dump_path) + print(new_model.eval()) + + print("Model conversion was done sucessfully!") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--tf_checkpoint_path", type=str, required=True, help="Path to the TensorFlow Token Dropping checkpoint path." + ) + parser.add_argument( + "--bert_config_file", + type=str, + required=True, + help="The config json file corresponding to the BERT model. This specifies the model architecture.", + ) + parser.add_argument( + "--pytorch_dump_path", + type=str, + required=True, + help="Path to the output PyTorch model.", + ) + args = parser.parse_args() + convert_checkpoint_to_pytorch(args.tf_checkpoint_path, args.bert_config_file, args.pytorch_dump_path) diff --git a/docs/transformers/build/lib/transformers/models/bert/modeling_bert.py b/docs/transformers/build/lib/transformers/models/bert/modeling_bert.py new file mode 100644 index 0000000000000000000000000000000000000000..3a311f4a734c6ef2ab60eb63a8d135b013caa480 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bert/modeling_bert.py @@ -0,0 +1,2019 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch BERT model.""" + +import math +import os +import warnings +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from packaging import version +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ...activations import ACT2FN +from ...generation import GenerationMixin +from ...modeling_attn_mask_utils import ( + _prepare_4d_attention_mask_for_sdpa, + _prepare_4d_causal_attention_mask_for_sdpa, +) +from ...modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, + CausalLMOutputWithCrossAttentions, + MaskedLMOutput, + MultipleChoiceModelOutput, + NextSentencePredictorOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer +from ...utils import ( + ModelOutput, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + get_torch_version, + logging, + replace_return_docstrings, +) +from .configuration_bert import BertConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "google-bert/bert-base-uncased" +_CONFIG_FOR_DOC = "BertConfig" + +# TokenClassification docstring +_CHECKPOINT_FOR_TOKEN_CLASSIFICATION = "dbmdz/bert-large-cased-finetuned-conll03-english" +_TOKEN_CLASS_EXPECTED_OUTPUT = ( + "['O', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'I-LOC', 'O', 'I-LOC', 'I-LOC'] " +) +_TOKEN_CLASS_EXPECTED_LOSS = 0.01 + +# QuestionAnswering docstring +_CHECKPOINT_FOR_QA = "deepset/bert-base-cased-squad2" +_QA_EXPECTED_OUTPUT = "'a nice puppet'" +_QA_EXPECTED_LOSS = 7.41 +_QA_TARGET_START_INDEX = 14 +_QA_TARGET_END_INDEX = 15 + +# SequenceClassification docstring +_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION = "textattack/bert-base-uncased-yelp-polarity" +_SEQ_CLASS_EXPECTED_OUTPUT = "'LABEL_1'" +_SEQ_CLASS_EXPECTED_LOSS = 0.01 + + +def load_tf_weights_in_bert(model, config, tf_checkpoint_path): + """Load tf checkpoints in a pytorch model.""" + try: + import re + + import numpy as np + import tensorflow as tf + except ImportError: + logger.error( + "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions." + ) + raise + tf_path = os.path.abspath(tf_checkpoint_path) + logger.info(f"Converting TensorFlow checkpoint from {tf_path}") + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + names = [] + arrays = [] + for name, shape in init_vars: + logger.info(f"Loading TF weight {name} with shape {shape}") + array = tf.train.load_variable(tf_path, name) + names.append(name) + arrays.append(array) + + for name, array in zip(names, arrays): + name = name.split("/") + # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v + # which are not required for using pretrained model + if any( + n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"] + for n in name + ): + logger.info(f"Skipping {'/'.join(name)}") + continue + pointer = model + for m_name in name: + if re.fullmatch(r"[A-Za-z]+_\d+", m_name): + scope_names = re.split(r"_(\d+)", m_name) + else: + scope_names = [m_name] + if scope_names[0] == "kernel" or scope_names[0] == "gamma": + pointer = getattr(pointer, "weight") + elif scope_names[0] == "output_bias" or scope_names[0] == "beta": + pointer = getattr(pointer, "bias") + elif scope_names[0] == "output_weights": + pointer = getattr(pointer, "weight") + elif scope_names[0] == "squad": + pointer = getattr(pointer, "classifier") + else: + try: + pointer = getattr(pointer, scope_names[0]) + except AttributeError: + logger.info(f"Skipping {'/'.join(name)}") + continue + if len(scope_names) >= 2: + num = int(scope_names[1]) + pointer = pointer[num] + if m_name[-11:] == "_embeddings": + pointer = getattr(pointer, "weight") + elif m_name == "kernel": + array = np.transpose(array) + try: + if pointer.shape != array.shape: + raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched") + except ValueError as e: + e.args += (pointer.shape, array.shape) + raise + logger.info(f"Initialize PyTorch weight {name}") + pointer.data = torch.from_numpy(array) + return model + + +class BertEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) + self.register_buffer( + "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False + ) + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + past_key_values_length: int = 0, + ) -> torch.Tensor: + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length] + + # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs + # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves + # issue #5664 + if token_type_ids is None: + if hasattr(self, "token_type_ids"): + buffered_token_type_ids = self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class BertSelfAttention(nn.Module): + def __init__(self, config, position_embedding_type=None): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = position_embedding_type or getattr( + config, "position_embedding_type", "absolute" + ) + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + + self.is_decoder = config.is_decoder + + def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + use_cache = past_key_value is not None + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + query_length, key_length = query_layer.shape[2], key_layer.shape[2] + if use_cache: + position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view( + -1, 1 + ) + else: + position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + if self.is_decoder: + outputs = outputs + (past_key_value,) + return outputs + + +class BertSdpaSelfAttention(BertSelfAttention): + def __init__(self, config, position_embedding_type=None): + super().__init__(config, position_embedding_type=position_embedding_type) + self.dropout_prob = config.attention_probs_dropout_prob + self.require_contiguous_qkv = version.parse(get_torch_version()) < version.parse("2.2.0") + + # Adapted from BertSelfAttention + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + if self.position_embedding_type != "absolute" or output_attentions or head_mask is not None: + # TODO: Improve this warning with e.g. `model.config._attn_implementation = "manual"` once implemented. + logger.warning_once( + "BertSdpaSelfAttention is used but `torch.nn.functional.scaled_dot_product_attention` does not support " + "non-absolute `position_embedding_type` or `output_attentions=True` or `head_mask`. Falling back to " + "the manual attention implementation, but specifying the manual implementation will be required from " + "Transformers version v5.0.0 onwards. This warning can be removed using the argument " + '`attn_implementation="eager"` when loading the model.' + ) + return super().forward( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + + bsz, tgt_len, _ = hidden_states.size() + + query_layer = self.transpose_for_scores(self.query(hidden_states)) + + # If this is instantiated as a cross-attention module, the keys and values come from an encoder; the attention + # mask needs to be such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + current_states = encoder_hidden_states if is_cross_attention else hidden_states + attention_mask = encoder_attention_mask if is_cross_attention else attention_mask + + # Check `seq_length` of `past_key_value` == `len(current_states)` to support prefix tuning + if is_cross_attention and past_key_value and past_key_value[0].shape[2] == current_states.shape[1]: + key_layer, value_layer = past_key_value + else: + key_layer = self.transpose_for_scores(self.key(current_states)) + value_layer = self.transpose_for_scores(self.value(current_states)) + if past_key_value is not None and not is_cross_attention: + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) + + # SDPA with memory-efficient backend is broken in torch==2.1.2 when using non-contiguous inputs and a custom + # attn_mask, so we need to call `.contiguous()` here. This was fixed in torch==2.2.0. + # Reference: https://github.com/pytorch/pytorch/issues/112577 + if self.require_contiguous_qkv and query_layer.device.type == "cuda" and attention_mask is not None: + query_layer = query_layer.contiguous() + key_layer = key_layer.contiguous() + value_layer = value_layer.contiguous() + + # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment + # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. + # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create + # a causal mask in case tgt_len == 1. + is_causal = ( + True if self.is_decoder and not is_cross_attention and attention_mask is None and tgt_len > 1 else False + ) + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_layer, + key_layer, + value_layer, + attn_mask=attention_mask, + dropout_p=self.dropout_prob if self.training else 0.0, + is_causal=is_causal, + ) + + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(bsz, tgt_len, self.all_head_size) + + outputs = (attn_output,) + if self.is_decoder: + outputs = outputs + (past_key_value,) + return outputs + + +class BertSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +BERT_SELF_ATTENTION_CLASSES = { + "eager": BertSelfAttention, + "sdpa": BertSdpaSelfAttention, +} + + +class BertAttention(nn.Module): + def __init__(self, config, position_embedding_type=None): + super().__init__() + self.self = BERT_SELF_ATTENTION_CLASSES[config._attn_implementation]( + config, position_embedding_type=position_embedding_type + ) + self.output = BertSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class BertIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class BertOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BertLayer(nn.Module): + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = BertAttention(config) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + if not self.is_decoder: + raise ValueError(f"{self} should be used as a decoder model if cross attention is added") + self.crossattention = BertAttention(config, position_embedding_type="absolute") + self.intermediate = BertIntermediate(config) + self.output = BertOutput(config) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + + # if decoder, the last output is tuple of self-attn cache + if self.is_decoder: + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + else: + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + cross_attn_present_key_value = None + if self.is_decoder and encoder_hidden_states is not None: + if not hasattr(self, "crossattention"): + raise ValueError( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers" + " by setting `config.add_cross_attention=True`" + ) + + # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + cross_attn_past_key_value, + output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights + + # add cross-attn cache to positions 3,4 of present_key_value tuple + cross_attn_present_key_value = cross_attention_outputs[-1] + present_key_value = present_key_value + cross_attn_present_key_value + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output + ) + outputs = (layer_output,) + outputs + + # if decoder, return the attn key/values as the last output + if self.is_decoder: + outputs = outputs + (present_key_value,) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +class BertEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + return_dict: Optional[bool] = True, + ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + next_decoder_cache = () if use_cache else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + layer_module.__call__, + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +class BertPooler(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class BertPredictionHeadTransform(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class BertLMPredictionHead(nn.Module): + def __init__(self, config): + super().__init__() + self.transform = BertPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` + self.decoder.bias = self.bias + + def _tie_weights(self): + self.decoder.bias = self.bias + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +class BertOnlyMLMHead(nn.Module): + def __init__(self, config): + super().__init__() + self.predictions = BertLMPredictionHead(config) + + def forward(self, sequence_output: torch.Tensor) -> torch.Tensor: + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +class BertOnlyNSPHead(nn.Module): + def __init__(self, config): + super().__init__() + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, pooled_output): + seq_relationship_score = self.seq_relationship(pooled_output) + return seq_relationship_score + + +class BertPreTrainingHeads(nn.Module): + def __init__(self, config): + super().__init__() + self.predictions = BertLMPredictionHead(config) + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, sequence_output, pooled_output): + prediction_scores = self.predictions(sequence_output) + seq_relationship_score = self.seq_relationship(pooled_output) + return prediction_scores, seq_relationship_score + + +class BertPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = BertConfig + load_tf_weights = load_tf_weights_in_bert + base_model_prefix = "bert" + supports_gradient_checkpointing = True + _supports_sdpa = True + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, BertLMPredictionHead): + module.bias.data.zero_() + + +@dataclass +class BertForPreTrainingOutput(ModelOutput): + """ + Output type of [`BertForPreTraining`]. + + Args: + loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): + Total loss as the sum of the masked language modeling loss and the next sequence prediction + (classification) loss. + prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`): + Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation + before SoftMax). + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[torch.FloatTensor] = None + prediction_logits: Optional[torch.FloatTensor] = None + seq_relationship_logits: Optional[torch.FloatTensor] = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +BERT_START_DOCSTRING = r""" + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`BertConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +BERT_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.FloatTensor` of shape `({0})`or `(batch_size, sequence_length, target_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + position_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare Bert Model transformer outputting raw hidden-states without any specific head on top.", + BERT_START_DOCSTRING, +) +class BertModel(BertPreTrainedModel): + """ + + The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of + cross-attention is added between the self-attention layers, following the architecture described in [Attention is + all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, + Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. + + To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set + to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and + `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass. + """ + + _no_split_modules = ["BertEmbeddings", "BertLayer"] + + def __init__(self, config, add_pooling_layer=True): + super().__init__(config) + self.config = config + + self.embeddings = BertEmbeddings(config) + self.encoder = BertEncoder(config) + + self.pooler = BertPooler(config) if add_pooling_layer else None + + self.attn_implementation = config._attn_implementation + self.position_embedding_type = config.position_embedding_type + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutputWithPoolingAndCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: + r""" + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, target_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.config.is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + batch_size, seq_length = input_shape + device = input_ids.device if input_ids is not None else inputs_embeds.device + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + + if token_type_ids is None: + if hasattr(self.embeddings, "token_type_ids"): + buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + + if attention_mask is None: + attention_mask = torch.ones((batch_size, seq_length + past_key_values_length), device=device) + + use_sdpa_attention_masks = ( + self.attn_implementation == "sdpa" + and self.position_embedding_type == "absolute" + and head_mask is None + and not output_attentions + ) + + # Expand the attention mask + if use_sdpa_attention_masks and attention_mask.dim() == 2: + # Expand the attention mask for SDPA. + # [bsz, seq_len] -> [bsz, 1, seq_len, seq_len] + if self.config.is_decoder: + extended_attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask, + input_shape, + embedding_output, + past_key_values_length, + ) + else: + extended_attention_mask = _prepare_4d_attention_mask_for_sdpa( + attention_mask, embedding_output.dtype, tgt_len=seq_length + ) + else: + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + + if use_sdpa_attention_masks and encoder_attention_mask.dim() == 2: + # Expand the attention mask for SDPA. + # [bsz, seq_len] -> [bsz, 1, seq_len, seq_len] + encoder_extended_attention_mask = _prepare_4d_attention_mask_for_sdpa( + encoder_attention_mask, embedding_output.dtype, tgt_len=seq_length + ) + else: + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + +@add_start_docstrings( + """ + Bert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next + sentence prediction (classification)` head. + """, + BERT_START_DOCSTRING, +) +class BertForPreTraining(BertPreTrainedModel): + _tied_weights_keys = ["predictions.decoder.bias", "cls.predictions.decoder.weight"] + + def __init__(self, config): + super().__init__(config) + + self.bert = BertModel(config) + self.cls = BertPreTrainingHeads(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + self.cls.predictions.bias = new_embeddings.bias + + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=BertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + next_sentence_label: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], BertForPreTrainingOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), + the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + next_sentence_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the next sequence prediction (classification) loss. Input should be a sequence + pair (see `input_ids` docstring) Indices should be in `[0, 1]`: + + - 0 indicates sequence B is a continuation of sequence A, + - 1 indicates sequence B is a random sequence. + kwargs (`Dict[str, any]`, *optional*, defaults to `{}`): + Used to hide legacy arguments that have been deprecated. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, BertForPreTraining + >>> import torch + + >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased") + >>> model = BertForPreTraining.from_pretrained("google-bert/bert-base-uncased") + + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + + >>> prediction_logits = outputs.prediction_logits + >>> seq_relationship_logits = outputs.seq_relationship_logits + ``` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output, pooled_output = outputs[:2] + prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) + + total_loss = None + if labels is not None and next_sentence_label is not None: + loss_fct = CrossEntropyLoss() + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) + total_loss = masked_lm_loss + next_sentence_loss + + if not return_dict: + output = (prediction_scores, seq_relationship_score) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return BertForPreTrainingOutput( + loss=total_loss, + prediction_logits=prediction_scores, + seq_relationship_logits=seq_relationship_score, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """Bert Model with a `language modeling` head on top for CLM fine-tuning.""", BERT_START_DOCSTRING +) +class BertLMHeadModel(BertPreTrainedModel, GenerationMixin): + _tied_weights_keys = ["cls.predictions.decoder.bias", "cls.predictions.decoder.weight"] + + def __init__(self, config): + super().__init__(config) + + if not config.is_decoder: + logger.warning("If you want to use `BertLMHeadModel` as a standalone, add `is_decoder=True.`") + + self.bert = BertModel(config, add_pooling_layer=False) + self.cls = BertOnlyMLMHead(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + self.cls.predictions.bias = new_embeddings.bias + + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=CausalLMOutputWithCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + past_key_values: Optional[List[torch.Tensor]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **loss_kwargs, + ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]: + r""" + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in + `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are + ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]` + past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None: + use_cache = False + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + lm_loss = None + if labels is not None: + lm_loss = self.loss_function(prediction_scores, labels, self.config.vocab_size, **loss_kwargs) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((lm_loss,) + output) if lm_loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=lm_loss, + logits=prediction_scores, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + def _reorder_cache(self, past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past + + +@add_start_docstrings("""Bert Model with a `language modeling` head on top.""", BERT_START_DOCSTRING) +class BertForMaskedLM(BertPreTrainedModel): + _tied_weights_keys = ["predictions.decoder.bias", "cls.predictions.decoder.weight"] + + def __init__(self, config): + super().__init__(config) + + if config.is_decoder: + logger.warning( + "If you want to use `BertForMaskedLM` make sure `config.is_decoder=False` for " + "bi-directional self-attention." + ) + + self.bert = BertModel(config, add_pooling_layer=False) + self.cls = BertOnlyMLMHead(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + self.cls.predictions.bias = new_embeddings.bias + + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=MaskedLMOutput, + config_class=_CONFIG_FOR_DOC, + expected_output="'paris'", + expected_loss=0.88, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + """ + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() # -100 index = padding token + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return MaskedLMOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs): + input_shape = input_ids.shape + effective_batch_size = input_shape[0] + + # add a dummy token + if self.config.pad_token_id is None: + raise ValueError("The PAD token should be defined for generation") + + attention_mask = torch.cat([attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1) + dummy_token = torch.full( + (effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device + ) + input_ids = torch.cat([input_ids, dummy_token], dim=1) + + return {"input_ids": input_ids, "attention_mask": attention_mask} + + @classmethod + def can_generate(cls) -> bool: + """ + Legacy correction: BertForMaskedLM can't call `generate()` from `GenerationMixin`, even though it has a + `prepare_inputs_for_generation` method. + """ + return False + + +@add_start_docstrings( + """Bert Model with a `next sentence prediction (classification)` head on top.""", + BERT_START_DOCSTRING, +) +class BertForNextSentencePrediction(BertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.bert = BertModel(config) + self.cls = BertOnlyNSPHead(config) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=NextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, + ) -> Union[Tuple[torch.Tensor], NextSentencePredictorOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair + (see `input_ids` docstring). Indices should be in `[0, 1]`: + + - 0 indicates sequence B is a continuation of sequence A, + - 1 indicates sequence B is a random sequence. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, BertForNextSentencePrediction + >>> import torch + + >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased") + >>> model = BertForNextSentencePrediction.from_pretrained("google-bert/bert-base-uncased") + + >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." + >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light." + >>> encoding = tokenizer(prompt, next_sentence, return_tensors="pt") + + >>> outputs = model(**encoding, labels=torch.LongTensor([1])) + >>> logits = outputs.logits + >>> assert logits[0, 0] < logits[0, 1] # next sentence was random + ``` + """ + + if "next_sentence_label" in kwargs: + warnings.warn( + "The `next_sentence_label` argument is deprecated and will be removed in a future version, use" + " `labels` instead.", + FutureWarning, + ) + labels = kwargs.pop("next_sentence_label") + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + seq_relationship_scores = self.cls(pooled_output) + + next_sentence_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + next_sentence_loss = loss_fct(seq_relationship_scores.view(-1, 2), labels.view(-1)) + + if not return_dict: + output = (seq_relationship_scores,) + outputs[2:] + return ((next_sentence_loss,) + output) if next_sentence_loss is not None else output + + return NextSentencePredictorOutput( + loss=next_sentence_loss, + logits=seq_relationship_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled + output) e.g. for GLUE tasks. + """, + BERT_START_DOCSTRING, +) +class BertForSequenceClassification(BertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + + self.bert = BertModel(config) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION, + output_type=SequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + expected_output=_SEQ_CLASS_EXPECTED_OUTPUT, + expected_loss=_SEQ_CLASS_EXPECTED_LOSS, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Bert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a + softmax) e.g. for RocStories/SWAG tasks. + """, + BERT_START_DOCSTRING, +) +class BertForMultipleChoice(BertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.bert = BertModel(config) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, 1) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=MultipleChoiceModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., + num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See + `input_ids` above) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] + + input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None + attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None + token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None + position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None + inputs_embeds = ( + inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) + if inputs_embeds is not None + else None + ) + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + reshaped_logits = logits.view(-1, num_choices) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + + if not return_dict: + output = (reshaped_logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return MultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Bert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for + Named-Entity-Recognition (NER) tasks. + """, + BERT_START_DOCSTRING, +) +class BertForTokenClassification(BertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.bert = BertModel(config, add_pooling_layer=False) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_TOKEN_CLASSIFICATION, + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + expected_output=_TOKEN_CLASS_EXPECTED_OUTPUT, + expected_loss=_TOKEN_CLASS_EXPECTED_LOSS, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layers on top of the hidden-states output to compute `span start logits` and `span end logits`). + """, + BERT_START_DOCSTRING, +) +class BertForQuestionAnswering(BertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.bert = BertModel(config, add_pooling_layer=False) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_QA, + output_type=QuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + qa_target_start_index=_QA_TARGET_START_INDEX, + qa_target_end_index=_QA_TARGET_END_INDEX, + expected_output=_QA_EXPECTED_OUTPUT, + expected_loss=_QA_EXPECTED_LOSS, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + start_positions: Optional[torch.Tensor] = None, + end_positions: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]: + r""" + start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +__all__ = [ + "BertForMaskedLM", + "BertForMultipleChoice", + "BertForNextSentencePrediction", + "BertForPreTraining", + "BertForQuestionAnswering", + "BertForSequenceClassification", + "BertForTokenClassification", + "BertLayer", + "BertLMHeadModel", + "BertModel", + "BertPreTrainedModel", + "load_tf_weights_in_bert", +] diff --git a/docs/transformers/build/lib/transformers/models/bert/modeling_flax_bert.py b/docs/transformers/build/lib/transformers/models/bert/modeling_flax_bert.py new file mode 100644 index 0000000000000000000000000000000000000000..61939a53f4a4c1446596b71c71f4d0fb05158b18 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bert/modeling_flax_bert.py @@ -0,0 +1,1727 @@ +# coding=utf-8 +# Copyright 2021 The Google Flax Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Callable, Optional, Tuple + +import flax +import flax.linen as nn +import jax +import jax.numpy as jnp +import numpy as np +from flax.core.frozen_dict import FrozenDict, freeze, unfreeze +from flax.linen import combine_masks, make_causal_mask +from flax.linen import partitioning as nn_partitioning +from flax.linen.attention import dot_product_attention_weights +from flax.traverse_util import flatten_dict, unflatten_dict +from jax import lax + +from ...modeling_flax_outputs import ( + FlaxBaseModelOutputWithPastAndCrossAttentions, + FlaxBaseModelOutputWithPooling, + FlaxBaseModelOutputWithPoolingAndCrossAttentions, + FlaxCausalLMOutputWithCrossAttentions, + FlaxMaskedLMOutput, + FlaxMultipleChoiceModelOutput, + FlaxNextSentencePredictorOutput, + FlaxQuestionAnsweringModelOutput, + FlaxSequenceClassifierOutput, + FlaxTokenClassifierOutput, +) +from ...modeling_flax_utils import ( + ACT2FN, + FlaxPreTrainedModel, + append_call_sample_docstring, + append_replace_return_docstrings, + overwrite_call_docstring, +) +from ...utils import ModelOutput, add_start_docstrings, add_start_docstrings_to_model_forward, logging +from .configuration_bert import BertConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "google-bert/bert-base-uncased" +_CONFIG_FOR_DOC = "BertConfig" + +remat = nn_partitioning.remat + + +@flax.struct.dataclass +class FlaxBertForPreTrainingOutput(ModelOutput): + """ + Output type of [`BertForPreTraining`]. + + Args: + prediction_logits (`jnp.ndarray` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + seq_relationship_logits (`jnp.ndarray` of shape `(batch_size, 2)`): + Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation + before SoftMax). + hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + prediction_logits: jnp.ndarray = None + seq_relationship_logits: jnp.ndarray = None + hidden_states: Optional[Tuple[jnp.ndarray]] = None + attentions: Optional[Tuple[jnp.ndarray]] = None + + +BERT_START_DOCSTRING = r""" + + This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading, saving and converting weights from PyTorch models) + + This model is also a + [flax.linen.Module](https://flax.readthedocs.io/en/latest/api_reference/flax.linen/module.html) subclass. Use it as + a regular Flax linen Module and refer to the Flax documentation for all matter related to general usage and + behavior. + + Finally, this model supports inherent JAX features such as: + + - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit) + - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation) + - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap) + - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap) + + Parameters: + config ([`BertConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights. + dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`): + The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and + `jax.numpy.bfloat16` (on TPUs). + + This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If + specified all the computation will be performed with the given `dtype`. + + **Note that this only specifies the dtype of the computation and does not influence the dtype of model + parameters.** + + If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and + [`~FlaxPreTrainedModel.to_bf16`]. + dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`): + The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and + `jax.numpy.bfloat16` (on TPUs). + + This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If + specified all the computation will be performed with the given `dtype`. + + **Note that this only specifies the dtype of the computation and does not influence the dtype of model + parameters.** + + If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and + [`~FlaxPreTrainedModel.to_bf16`]. + +""" + +BERT_INPUTS_DOCSTRING = r""" + Args: + input_ids (`numpy.ndarray` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`numpy.ndarray` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + token_type_ids (`numpy.ndarray` of shape `({0})`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + position_ids (`numpy.ndarray` of shape `({0})`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + head_mask (`numpy.ndarray` of shape `({0})`, `optional): + Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + +""" + + +class FlaxBertEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings.""" + + config: BertConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.word_embeddings = nn.Embed( + self.config.vocab_size, + self.config.hidden_size, + embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range), + dtype=self.dtype, + ) + self.position_embeddings = nn.Embed( + self.config.max_position_embeddings, + self.config.hidden_size, + embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range), + dtype=self.dtype, + ) + self.token_type_embeddings = nn.Embed( + self.config.type_vocab_size, + self.config.hidden_size, + embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range), + dtype=self.dtype, + ) + self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype) + self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob) + + def __call__(self, input_ids, token_type_ids, position_ids, attention_mask, deterministic: bool = True): + # Embed + inputs_embeds = self.word_embeddings(input_ids.astype("i4")) + position_embeds = self.position_embeddings(position_ids.astype("i4")) + token_type_embeddings = self.token_type_embeddings(token_type_ids.astype("i4")) + + # Sum all embeddings + hidden_states = inputs_embeds + token_type_embeddings + position_embeds + + # Layer Norm + hidden_states = self.LayerNorm(hidden_states) + hidden_states = self.dropout(hidden_states, deterministic=deterministic) + return hidden_states + + +class FlaxBertSelfAttention(nn.Module): + config: BertConfig + causal: bool = False + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.head_dim = self.config.hidden_size // self.config.num_attention_heads + if self.config.hidden_size % self.config.num_attention_heads != 0: + raise ValueError( + "`config.hidden_size`: {self.config.hidden_size} has to be a multiple of `config.num_attention_heads` " + " : {self.config.num_attention_heads}" + ) + + self.query = nn.Dense( + self.config.hidden_size, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + ) + self.key = nn.Dense( + self.config.hidden_size, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + ) + self.value = nn.Dense( + self.config.hidden_size, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + ) + + if self.causal: + self.causal_mask = make_causal_mask( + jnp.ones((1, self.config.max_position_embeddings), dtype="bool"), dtype="bool" + ) + + def _split_heads(self, hidden_states): + return hidden_states.reshape(hidden_states.shape[:2] + (self.config.num_attention_heads, self.head_dim)) + + def _merge_heads(self, hidden_states): + return hidden_states.reshape(hidden_states.shape[:2] + (self.config.hidden_size,)) + + @nn.compact + # Copied from transformers.models.bart.modeling_flax_bart.FlaxBartAttention._concatenate_to_cache + def _concatenate_to_cache(self, key, value, query, attention_mask): + """ + This function takes projected key, value states from a single input token and concatenates the states to cached + states from previous steps. This function is slightly adapted from the official Flax repository: + https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252 + """ + # detect if we're initializing by absence of existing cache data. + is_initialized = self.has_variable("cache", "cached_key") + cached_key = self.variable("cache", "cached_key", jnp.zeros, key.shape, key.dtype) + cached_value = self.variable("cache", "cached_value", jnp.zeros, value.shape, value.dtype) + cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32)) + + if is_initialized: + *batch_dims, max_length, num_heads, depth_per_head = cached_key.value.shape + # update key, value caches with our new 1d spatial slices + cur_index = cache_index.value + indices = (0,) * len(batch_dims) + (cur_index, 0, 0) + key = lax.dynamic_update_slice(cached_key.value, key, indices) + value = lax.dynamic_update_slice(cached_value.value, value, indices) + cached_key.value = key + cached_value.value = value + num_updated_cache_vectors = query.shape[1] + cache_index.value = cache_index.value + num_updated_cache_vectors + # causal mask for cached decoder self-attention: our single query position should only attend to those key positions that have already been generated and cached, not the remaining zero elements. + pad_mask = jnp.broadcast_to( + jnp.arange(max_length) < cur_index + num_updated_cache_vectors, + tuple(batch_dims) + (1, num_updated_cache_vectors, max_length), + ) + attention_mask = combine_masks(pad_mask, attention_mask) + return key, value, attention_mask + + def __call__( + self, + hidden_states, + attention_mask, + layer_head_mask, + key_value_states: Optional[jnp.ndarray] = None, + init_cache: bool = False, + deterministic=True, + output_attentions: bool = False, + ): + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + batch_size = hidden_states.shape[0] + + # get query proj + query_states = self.query(hidden_states) + # get key, value proj + if is_cross_attention: + # cross_attentions + key_states = self.key(key_value_states) + value_states = self.value(key_value_states) + else: + # self_attention + key_states = self.key(hidden_states) + value_states = self.value(hidden_states) + + query_states = self._split_heads(query_states) + key_states = self._split_heads(key_states) + value_states = self._split_heads(value_states) + + # handle cache prepare causal attention mask + if self.causal: + query_length, key_length = query_states.shape[1], key_states.shape[1] + if self.has_variable("cache", "cached_key"): + mask_shift = self.variables["cache"]["cache_index"] + max_decoder_length = self.variables["cache"]["cached_key"].shape[1] + causal_mask = lax.dynamic_slice( + self.causal_mask, (0, 0, mask_shift, 0), (1, 1, query_length, max_decoder_length) + ) + else: + causal_mask = self.causal_mask[:, :, :query_length, :key_length] + causal_mask = jnp.broadcast_to(causal_mask, (batch_size,) + causal_mask.shape[1:]) + + # combine masks if needed + if attention_mask is not None and self.causal: + attention_mask = jnp.broadcast_to(jnp.expand_dims(attention_mask, axis=(-3, -2)), causal_mask.shape) + attention_mask = combine_masks(attention_mask, causal_mask) + elif self.causal: + attention_mask = causal_mask + elif attention_mask is not None: + attention_mask = jnp.expand_dims(attention_mask, axis=(-3, -2)) + + # During fast autoregressive decoding, we feed one position at a time, + # and cache the keys and values step by step. + if self.causal and (self.has_variable("cache", "cached_key") or init_cache): + key_states, value_states, attention_mask = self._concatenate_to_cache( + key_states, value_states, query_states, attention_mask + ) + + # Convert the boolean attention mask to an attention bias. + if attention_mask is not None: + # attention mask in the form of attention bias + attention_bias = lax.select( + attention_mask > 0, + jnp.full(attention_mask.shape, 0.0).astype(self.dtype), + jnp.full(attention_mask.shape, jnp.finfo(self.dtype).min).astype(self.dtype), + ) + else: + attention_bias = None + + dropout_rng = None + if not deterministic and self.config.attention_probs_dropout_prob > 0.0: + dropout_rng = self.make_rng("dropout") + + attn_weights = dot_product_attention_weights( + query_states, + key_states, + bias=attention_bias, + dropout_rng=dropout_rng, + dropout_rate=self.config.attention_probs_dropout_prob, + broadcast_dropout=True, + deterministic=deterministic, + dtype=self.dtype, + precision=None, + ) + + # Mask heads if we want to + if layer_head_mask is not None: + attn_weights = jnp.einsum("...hqk,h->...hqk", attn_weights, layer_head_mask) + + attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, value_states) + attn_output = attn_output.reshape(attn_output.shape[:2] + (-1,)) + + outputs = (attn_output, attn_weights) if output_attentions else (attn_output,) + return outputs + + +class FlaxBertSelfOutput(nn.Module): + config: BertConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.dense = nn.Dense( + self.config.hidden_size, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + dtype=self.dtype, + ) + self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype) + self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob) + + def __call__(self, hidden_states, input_tensor, deterministic: bool = True): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states, deterministic=deterministic) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class FlaxBertAttention(nn.Module): + config: BertConfig + causal: bool = False + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.self = FlaxBertSelfAttention(self.config, causal=self.causal, dtype=self.dtype) + self.output = FlaxBertSelfOutput(self.config, dtype=self.dtype) + + def __call__( + self, + hidden_states, + attention_mask, + layer_head_mask, + key_value_states=None, + init_cache=False, + deterministic=True, + output_attentions: bool = False, + ): + # Attention mask comes in as attention_mask.shape == (*batch_sizes, kv_length) + # FLAX expects: attention_mask.shape == (*batch_sizes, 1, 1, kv_length) such that it is broadcastable + # with attn_weights.shape == (*batch_sizes, num_heads, q_length, kv_length) + attn_outputs = self.self( + hidden_states, + attention_mask, + layer_head_mask=layer_head_mask, + key_value_states=key_value_states, + init_cache=init_cache, + deterministic=deterministic, + output_attentions=output_attentions, + ) + attn_output = attn_outputs[0] + hidden_states = self.output(attn_output, hidden_states, deterministic=deterministic) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_outputs[1],) + + return outputs + + +class FlaxBertIntermediate(nn.Module): + config: BertConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.dense = nn.Dense( + self.config.intermediate_size, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + dtype=self.dtype, + ) + self.activation = ACT2FN[self.config.hidden_act] + + def __call__(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.activation(hidden_states) + return hidden_states + + +class FlaxBertOutput(nn.Module): + config: BertConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.dense = nn.Dense( + self.config.hidden_size, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + dtype=self.dtype, + ) + self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob) + self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype) + + def __call__(self, hidden_states, attention_output, deterministic: bool = True): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states, deterministic=deterministic) + hidden_states = self.LayerNorm(hidden_states + attention_output) + return hidden_states + + +class FlaxBertLayer(nn.Module): + config: BertConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.attention = FlaxBertAttention(self.config, causal=self.config.is_decoder, dtype=self.dtype) + self.intermediate = FlaxBertIntermediate(self.config, dtype=self.dtype) + self.output = FlaxBertOutput(self.config, dtype=self.dtype) + if self.config.add_cross_attention: + self.crossattention = FlaxBertAttention(self.config, causal=False, dtype=self.dtype) + + def __call__( + self, + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states: Optional[jnp.ndarray] = None, + encoder_attention_mask: Optional[jnp.ndarray] = None, + init_cache: bool = False, + deterministic: bool = True, + output_attentions: bool = False, + ): + # Self Attention + attention_outputs = self.attention( + hidden_states, + attention_mask, + layer_head_mask=layer_head_mask, + init_cache=init_cache, + deterministic=deterministic, + output_attentions=output_attentions, + ) + attention_output = attention_outputs[0] + + # Cross-Attention Block + if encoder_hidden_states is not None: + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask=encoder_attention_mask, + layer_head_mask=layer_head_mask, + key_value_states=encoder_hidden_states, + deterministic=deterministic, + output_attentions=output_attentions, + ) + attention_output = cross_attention_outputs[0] + + hidden_states = self.intermediate(attention_output) + hidden_states = self.output(hidden_states, attention_output, deterministic=deterministic) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attention_outputs[1],) + if encoder_hidden_states is not None: + outputs += (cross_attention_outputs[1],) + return outputs + + +class FlaxBertLayerCollection(nn.Module): + config: BertConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + gradient_checkpointing: bool = False + + def setup(self): + if self.gradient_checkpointing: + FlaxBertCheckpointLayer = remat(FlaxBertLayer, static_argnums=(5, 6, 7)) + self.layers = [ + FlaxBertCheckpointLayer(self.config, name=str(i), dtype=self.dtype) + for i in range(self.config.num_hidden_layers) + ] + else: + self.layers = [ + FlaxBertLayer(self.config, name=str(i), dtype=self.dtype) for i in range(self.config.num_hidden_layers) + ] + + def __call__( + self, + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states: Optional[jnp.ndarray] = None, + encoder_attention_mask: Optional[jnp.ndarray] = None, + init_cache: bool = False, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + all_attentions = () if output_attentions else None + all_hidden_states = () if output_hidden_states else None + all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None + + # Check if head_mask has a correct number of layers specified if desired + if head_mask is not None: + if head_mask.shape[0] != (len(self.layers)): + raise ValueError( + f"The head_mask should be specified for {len(self.layers)} layers, but it is for " + f" {head_mask.shape[0]}." + ) + + for i, layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + + layer_outputs = layer( + hidden_states, + attention_mask, + head_mask[i] if head_mask is not None else None, + encoder_hidden_states, + encoder_attention_mask, + init_cache, + deterministic, + output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions += (layer_outputs[1],) + + if encoder_hidden_states is not None: + all_cross_attentions += (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states += (hidden_states,) + + outputs = (hidden_states, all_hidden_states, all_attentions, all_cross_attentions) + + if not return_dict: + return tuple(v for v in outputs if v is not None) + + return FlaxBaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_attentions, + cross_attentions=all_cross_attentions, + ) + + +class FlaxBertEncoder(nn.Module): + config: BertConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + gradient_checkpointing: bool = False + + def setup(self): + self.layer = FlaxBertLayerCollection( + self.config, + dtype=self.dtype, + gradient_checkpointing=self.gradient_checkpointing, + ) + + def __call__( + self, + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states: Optional[jnp.ndarray] = None, + encoder_attention_mask: Optional[jnp.ndarray] = None, + init_cache: bool = False, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + return self.layer( + hidden_states, + attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + init_cache=init_cache, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + +class FlaxBertPooler(nn.Module): + config: BertConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.dense = nn.Dense( + self.config.hidden_size, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + dtype=self.dtype, + ) + + def __call__(self, hidden_states): + cls_hidden_state = hidden_states[:, 0] + cls_hidden_state = self.dense(cls_hidden_state) + return nn.tanh(cls_hidden_state) + + +class FlaxBertPredictionHeadTransform(nn.Module): + config: BertConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.dense = nn.Dense(self.config.hidden_size, dtype=self.dtype) + self.activation = ACT2FN[self.config.hidden_act] + self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype) + + def __call__(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.activation(hidden_states) + return self.LayerNorm(hidden_states) + + +class FlaxBertLMPredictionHead(nn.Module): + config: BertConfig + dtype: jnp.dtype = jnp.float32 + bias_init: Callable[..., np.ndarray] = jax.nn.initializers.zeros + + def setup(self): + self.transform = FlaxBertPredictionHeadTransform(self.config, dtype=self.dtype) + self.decoder = nn.Dense(self.config.vocab_size, dtype=self.dtype, use_bias=False) + self.bias = self.param("bias", self.bias_init, (self.config.vocab_size,)) + + def __call__(self, hidden_states, shared_embedding=None): + hidden_states = self.transform(hidden_states) + + if shared_embedding is not None: + hidden_states = self.decoder.apply({"params": {"kernel": shared_embedding.T}}, hidden_states) + else: + hidden_states = self.decoder(hidden_states) + + bias = jnp.asarray(self.bias, self.dtype) + hidden_states += bias + return hidden_states + + +class FlaxBertOnlyMLMHead(nn.Module): + config: BertConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.predictions = FlaxBertLMPredictionHead(self.config, dtype=self.dtype) + + def __call__(self, hidden_states, shared_embedding=None): + hidden_states = self.predictions(hidden_states, shared_embedding=shared_embedding) + return hidden_states + + +class FlaxBertOnlyNSPHead(nn.Module): + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.seq_relationship = nn.Dense(2, dtype=self.dtype) + + def __call__(self, pooled_output): + return self.seq_relationship(pooled_output) + + +class FlaxBertPreTrainingHeads(nn.Module): + config: BertConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.predictions = FlaxBertLMPredictionHead(self.config, dtype=self.dtype) + self.seq_relationship = nn.Dense(2, dtype=self.dtype) + + def __call__(self, hidden_states, pooled_output, shared_embedding=None): + prediction_scores = self.predictions(hidden_states, shared_embedding=shared_embedding) + seq_relationship_score = self.seq_relationship(pooled_output) + return prediction_scores, seq_relationship_score + + +class FlaxBertPreTrainedModel(FlaxPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = BertConfig + base_model_prefix = "bert" + module_class: nn.Module = None + + def __init__( + self, + config: BertConfig, + input_shape: Tuple = (1, 1), + seed: int = 0, + dtype: jnp.dtype = jnp.float32, + _do_init: bool = True, + gradient_checkpointing: bool = False, + **kwargs, + ): + module = self.module_class( + config=config, + dtype=dtype, + gradient_checkpointing=gradient_checkpointing, + **kwargs, + ) + super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) + + def enable_gradient_checkpointing(self): + self._module = self.module_class( + config=self.config, + dtype=self.dtype, + gradient_checkpointing=True, + ) + + def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict: + # init input tensors + input_ids = jnp.zeros(input_shape, dtype="i4") + token_type_ids = jnp.zeros_like(input_ids) + position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_shape) + attention_mask = jnp.ones_like(input_ids) + head_mask = jnp.ones((self.config.num_hidden_layers, self.config.num_attention_heads)) + + params_rng, dropout_rng = jax.random.split(rng) + rngs = {"params": params_rng, "dropout": dropout_rng} + + if self.config.add_cross_attention: + encoder_hidden_states = jnp.zeros(input_shape + (self.config.hidden_size,)) + encoder_attention_mask = attention_mask + module_init_outputs = self.module.init( + rngs, + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + return_dict=False, + ) + else: + module_init_outputs = self.module.init( + rngs, input_ids, attention_mask, token_type_ids, position_ids, head_mask, return_dict=False + ) + + random_params = module_init_outputs["params"] + + if params is not None: + random_params = flatten_dict(unfreeze(random_params)) + params = flatten_dict(unfreeze(params)) + for missing_key in self._missing_keys: + params[missing_key] = random_params[missing_key] + self._missing_keys = set() + return freeze(unflatten_dict(params)) + else: + return random_params + + # Copied from transformers.models.bart.modeling_flax_bart.FlaxBartDecoderPreTrainedModel.init_cache + def init_cache(self, batch_size, max_length): + r""" + Args: + batch_size (`int`): + batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache. + max_length (`int`): + maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized + cache. + """ + # init input variables to retrieve cache + input_ids = jnp.ones((batch_size, max_length), dtype="i4") + attention_mask = jnp.ones_like(input_ids, dtype="i4") + position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape) + + init_variables = self.module.init( + jax.random.PRNGKey(0), input_ids, attention_mask, position_ids, return_dict=False, init_cache=True + ) + return unfreeze(init_variables["cache"]) + + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + def __call__( + self, + input_ids, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + params: dict = None, + dropout_rng: jax.random.PRNGKey = None, + train: bool = False, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + past_key_values: dict = None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.return_dict + + # init input tensors if not passed + if token_type_ids is None: + token_type_ids = jnp.zeros_like(input_ids) + + if position_ids is None: + position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape) + + if attention_mask is None: + attention_mask = jnp.ones_like(input_ids) + + if head_mask is None: + head_mask = jnp.ones((self.config.num_hidden_layers, self.config.num_attention_heads)) + + # Handle any PRNG if needed + rngs = {} + if dropout_rng is not None: + rngs["dropout"] = dropout_rng + + inputs = {"params": params or self.params} + + if self.config.add_cross_attention: + # if past_key_values are passed then cache is already initialized a private flag init_cache has to be passed + # down to ensure cache is used. It has to be made sure that cache is marked as mutable so that it can be + # changed by FlaxBertAttention module + if past_key_values: + inputs["cache"] = past_key_values + mutable = ["cache"] + else: + mutable = False + + outputs = self.module.apply( + inputs, + jnp.array(input_ids, dtype="i4"), + jnp.array(attention_mask, dtype="i4"), + token_type_ids=jnp.array(token_type_ids, dtype="i4"), + position_ids=jnp.array(position_ids, dtype="i4"), + head_mask=jnp.array(head_mask, dtype="i4"), + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + deterministic=not train, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + rngs=rngs, + mutable=mutable, + ) + + # add updated cache to model output + if past_key_values is not None and return_dict: + outputs, past_key_values = outputs + outputs["past_key_values"] = unfreeze(past_key_values["cache"]) + return outputs + elif past_key_values is not None and not return_dict: + outputs, past_key_values = outputs + outputs = outputs[:1] + (unfreeze(past_key_values["cache"]),) + outputs[1:] + + else: + outputs = self.module.apply( + inputs, + jnp.array(input_ids, dtype="i4"), + jnp.array(attention_mask, dtype="i4"), + token_type_ids=jnp.array(token_type_ids, dtype="i4"), + position_ids=jnp.array(position_ids, dtype="i4"), + head_mask=jnp.array(head_mask, dtype="i4"), + deterministic=not train, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + rngs=rngs, + ) + + return outputs + + +class FlaxBertModule(nn.Module): + config: BertConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + add_pooling_layer: bool = True + gradient_checkpointing: bool = False + + def setup(self): + self.embeddings = FlaxBertEmbeddings(self.config, dtype=self.dtype) + self.encoder = FlaxBertEncoder( + self.config, + dtype=self.dtype, + gradient_checkpointing=self.gradient_checkpointing, + ) + self.pooler = FlaxBertPooler(self.config, dtype=self.dtype) + + def __call__( + self, + input_ids, + attention_mask, + token_type_ids: Optional[jnp.ndarray] = None, + position_ids: Optional[jnp.ndarray] = None, + head_mask: Optional[jnp.ndarray] = None, + encoder_hidden_states: Optional[jnp.ndarray] = None, + encoder_attention_mask: Optional[jnp.ndarray] = None, + init_cache: bool = False, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + # make sure `token_type_ids` is correctly initialized when not passed + if token_type_ids is None: + token_type_ids = jnp.zeros_like(input_ids) + + # make sure `position_ids` is correctly initialized when not passed + if position_ids is None: + position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape) + + hidden_states = self.embeddings( + input_ids, token_type_ids, position_ids, attention_mask, deterministic=deterministic + ) + outputs = self.encoder( + hidden_states, + attention_mask, + head_mask=head_mask, + deterministic=deterministic, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + init_cache=init_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = outputs[0] + pooled = self.pooler(hidden_states) if self.add_pooling_layer else None + + if not return_dict: + # if pooled is None, don't return it + if pooled is None: + return (hidden_states,) + outputs[1:] + return (hidden_states, pooled) + outputs[1:] + + return FlaxBaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=hidden_states, + pooler_output=pooled, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + +@add_start_docstrings( + "The bare Bert Model transformer outputting raw hidden-states without any specific head on top.", + BERT_START_DOCSTRING, +) +class FlaxBertModel(FlaxBertPreTrainedModel): + module_class = FlaxBertModule + + +append_call_sample_docstring(FlaxBertModel, _CHECKPOINT_FOR_DOC, FlaxBaseModelOutputWithPooling, _CONFIG_FOR_DOC) + + +class FlaxBertForPreTrainingModule(nn.Module): + config: BertConfig + dtype: jnp.dtype = jnp.float32 + gradient_checkpointing: bool = False + + def setup(self): + self.bert = FlaxBertModule( + config=self.config, + dtype=self.dtype, + gradient_checkpointing=self.gradient_checkpointing, + ) + self.cls = FlaxBertPreTrainingHeads(config=self.config, dtype=self.dtype) + + def __call__( + self, + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + # Model + outputs = self.bert( + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if self.config.tie_word_embeddings: + shared_embedding = self.bert.variables["params"]["embeddings"]["word_embeddings"]["embedding"] + else: + shared_embedding = None + + hidden_states = outputs[0] + pooled_output = outputs[1] + + prediction_scores, seq_relationship_score = self.cls( + hidden_states, pooled_output, shared_embedding=shared_embedding + ) + + if not return_dict: + return (prediction_scores, seq_relationship_score) + outputs[2:] + + return FlaxBertForPreTrainingOutput( + prediction_logits=prediction_scores, + seq_relationship_logits=seq_relationship_score, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Bert Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next + sentence prediction (classification)` head. + """, + BERT_START_DOCSTRING, +) +class FlaxBertForPreTraining(FlaxBertPreTrainedModel): + module_class = FlaxBertForPreTrainingModule + + +FLAX_BERT_FOR_PRETRAINING_DOCSTRING = """ + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, FlaxBertForPreTraining + + >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased") + >>> model = FlaxBertForPreTraining.from_pretrained("google-bert/bert-base-uncased") + + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="np") + >>> outputs = model(**inputs) + + >>> prediction_logits = outputs.prediction_logits + >>> seq_relationship_logits = outputs.seq_relationship_logits + ``` +""" + +overwrite_call_docstring( + FlaxBertForPreTraining, + BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length") + FLAX_BERT_FOR_PRETRAINING_DOCSTRING, +) +append_replace_return_docstrings( + FlaxBertForPreTraining, output_type=FlaxBertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC +) + + +class FlaxBertForMaskedLMModule(nn.Module): + config: BertConfig + dtype: jnp.dtype = jnp.float32 + gradient_checkpointing: bool = False + + def setup(self): + self.bert = FlaxBertModule( + config=self.config, + add_pooling_layer=False, + dtype=self.dtype, + gradient_checkpointing=self.gradient_checkpointing, + ) + self.cls = FlaxBertOnlyMLMHead(config=self.config, dtype=self.dtype) + + def __call__( + self, + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + # Model + outputs = self.bert( + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + if self.config.tie_word_embeddings: + shared_embedding = self.bert.variables["params"]["embeddings"]["word_embeddings"]["embedding"] + else: + shared_embedding = None + + # Compute the prediction scores + logits = self.cls(hidden_states, shared_embedding=shared_embedding) + + if not return_dict: + return (logits,) + outputs[1:] + + return FlaxMaskedLMOutput( + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings("""Bert Model with a `language modeling` head on top.""", BERT_START_DOCSTRING) +class FlaxBertForMaskedLM(FlaxBertPreTrainedModel): + module_class = FlaxBertForMaskedLMModule + + +append_call_sample_docstring(FlaxBertForMaskedLM, _CHECKPOINT_FOR_DOC, FlaxMaskedLMOutput, _CONFIG_FOR_DOC) + + +class FlaxBertForNextSentencePredictionModule(nn.Module): + config: BertConfig + dtype: jnp.dtype = jnp.float32 + gradient_checkpointing: bool = False + + def setup(self): + self.bert = FlaxBertModule( + config=self.config, + dtype=self.dtype, + gradient_checkpointing=self.gradient_checkpointing, + ) + self.cls = FlaxBertOnlyNSPHead(dtype=self.dtype) + + def __call__( + self, + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + return_dict = return_dict if return_dict is not None else self.config.return_dict + + # Model + outputs = self.bert( + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + seq_relationship_scores = self.cls(pooled_output) + + if not return_dict: + return (seq_relationship_scores,) + outputs[2:] + + return FlaxNextSentencePredictorOutput( + logits=seq_relationship_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """Bert Model with a `next sentence prediction (classification)` head on top.""", + BERT_START_DOCSTRING, +) +class FlaxBertForNextSentencePrediction(FlaxBertPreTrainedModel): + module_class = FlaxBertForNextSentencePredictionModule + + +FLAX_BERT_FOR_NEXT_SENT_PRED_DOCSTRING = """ + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, FlaxBertForNextSentencePrediction + + >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased") + >>> model = FlaxBertForNextSentencePrediction.from_pretrained("google-bert/bert-base-uncased") + + >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." + >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light." + >>> encoding = tokenizer(prompt, next_sentence, return_tensors="jax") + + >>> outputs = model(**encoding) + >>> logits = outputs.logits + >>> assert logits[0, 0] < logits[0, 1] # next sentence was random + ``` +""" + + +overwrite_call_docstring( + FlaxBertForNextSentencePrediction, + BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length") + FLAX_BERT_FOR_NEXT_SENT_PRED_DOCSTRING, +) +append_replace_return_docstrings( + FlaxBertForNextSentencePrediction, output_type=FlaxNextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC +) + + +class FlaxBertForSequenceClassificationModule(nn.Module): + config: BertConfig + dtype: jnp.dtype = jnp.float32 + gradient_checkpointing: bool = False + + def setup(self): + self.bert = FlaxBertModule( + config=self.config, + dtype=self.dtype, + gradient_checkpointing=self.gradient_checkpointing, + ) + classifier_dropout = ( + self.config.classifier_dropout + if self.config.classifier_dropout is not None + else self.config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(rate=classifier_dropout) + self.classifier = nn.Dense( + self.config.num_labels, + dtype=self.dtype, + ) + + def __call__( + self, + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + # Model + outputs = self.bert( + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + pooled_output = self.dropout(pooled_output, deterministic=deterministic) + logits = self.classifier(pooled_output) + + if not return_dict: + return (logits,) + outputs[2:] + + return FlaxSequenceClassifierOutput( + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled + output) e.g. for GLUE tasks. + """, + BERT_START_DOCSTRING, +) +class FlaxBertForSequenceClassification(FlaxBertPreTrainedModel): + module_class = FlaxBertForSequenceClassificationModule + + +append_call_sample_docstring( + FlaxBertForSequenceClassification, + _CHECKPOINT_FOR_DOC, + FlaxSequenceClassifierOutput, + _CONFIG_FOR_DOC, +) + + +class FlaxBertForMultipleChoiceModule(nn.Module): + config: BertConfig + dtype: jnp.dtype = jnp.float32 + gradient_checkpointing: bool = False + + def setup(self): + self.bert = FlaxBertModule( + config=self.config, + dtype=self.dtype, + gradient_checkpointing=self.gradient_checkpointing, + ) + self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob) + self.classifier = nn.Dense(1, dtype=self.dtype) + + def __call__( + self, + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + num_choices = input_ids.shape[1] + input_ids = input_ids.reshape(-1, input_ids.shape[-1]) if input_ids is not None else None + attention_mask = attention_mask.reshape(-1, attention_mask.shape[-1]) if attention_mask is not None else None + token_type_ids = token_type_ids.reshape(-1, token_type_ids.shape[-1]) if token_type_ids is not None else None + position_ids = position_ids.reshape(-1, position_ids.shape[-1]) if position_ids is not None else None + + # Model + outputs = self.bert( + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + pooled_output = self.dropout(pooled_output, deterministic=deterministic) + logits = self.classifier(pooled_output) + + reshaped_logits = logits.reshape(-1, num_choices) + + if not return_dict: + return (reshaped_logits,) + outputs[2:] + + return FlaxMultipleChoiceModelOutput( + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Bert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a + softmax) e.g. for RocStories/SWAG tasks. + """, + BERT_START_DOCSTRING, +) +class FlaxBertForMultipleChoice(FlaxBertPreTrainedModel): + module_class = FlaxBertForMultipleChoiceModule + + +overwrite_call_docstring( + FlaxBertForMultipleChoice, BERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length") +) +append_call_sample_docstring( + FlaxBertForMultipleChoice, _CHECKPOINT_FOR_DOC, FlaxMultipleChoiceModelOutput, _CONFIG_FOR_DOC +) + + +class FlaxBertForTokenClassificationModule(nn.Module): + config: BertConfig + dtype: jnp.dtype = jnp.float32 + gradient_checkpointing: bool = False + + def setup(self): + self.bert = FlaxBertModule( + config=self.config, + dtype=self.dtype, + add_pooling_layer=False, + gradient_checkpointing=self.gradient_checkpointing, + ) + classifier_dropout = ( + self.config.classifier_dropout + if self.config.classifier_dropout is not None + else self.config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(rate=classifier_dropout) + self.classifier = nn.Dense(self.config.num_labels, dtype=self.dtype) + + def __call__( + self, + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + # Model + outputs = self.bert( + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + hidden_states = self.dropout(hidden_states, deterministic=deterministic) + logits = self.classifier(hidden_states) + + if not return_dict: + return (logits,) + outputs[1:] + + return FlaxTokenClassifierOutput( + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Bert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for + Named-Entity-Recognition (NER) tasks. + """, + BERT_START_DOCSTRING, +) +class FlaxBertForTokenClassification(FlaxBertPreTrainedModel): + module_class = FlaxBertForTokenClassificationModule + + +append_call_sample_docstring( + FlaxBertForTokenClassification, _CHECKPOINT_FOR_DOC, FlaxTokenClassifierOutput, _CONFIG_FOR_DOC +) + + +class FlaxBertForQuestionAnsweringModule(nn.Module): + config: BertConfig + dtype: jnp.dtype = jnp.float32 + gradient_checkpointing: bool = False + + def setup(self): + self.bert = FlaxBertModule( + config=self.config, + dtype=self.dtype, + add_pooling_layer=False, + gradient_checkpointing=self.gradient_checkpointing, + ) + self.qa_outputs = nn.Dense(self.config.num_labels, dtype=self.dtype) + + def __call__( + self, + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + # Model + outputs = self.bert( + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + + logits = self.qa_outputs(hidden_states) + start_logits, end_logits = jnp.split(logits, self.config.num_labels, axis=-1) + start_logits = start_logits.squeeze(-1) + end_logits = end_logits.squeeze(-1) + + if not return_dict: + return (start_logits, end_logits) + outputs[1:] + + return FlaxQuestionAnsweringModelOutput( + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layers on top of the hidden-states output to compute `span start logits` and `span end logits`). + """, + BERT_START_DOCSTRING, +) +class FlaxBertForQuestionAnswering(FlaxBertPreTrainedModel): + module_class = FlaxBertForQuestionAnsweringModule + + +append_call_sample_docstring( + FlaxBertForQuestionAnswering, + _CHECKPOINT_FOR_DOC, + FlaxQuestionAnsweringModelOutput, + _CONFIG_FOR_DOC, +) + + +class FlaxBertForCausalLMModule(nn.Module): + config: BertConfig + dtype: jnp.dtype = jnp.float32 + gradient_checkpointing: bool = False + + def setup(self): + self.bert = FlaxBertModule( + config=self.config, + add_pooling_layer=False, + dtype=self.dtype, + gradient_checkpointing=self.gradient_checkpointing, + ) + self.cls = FlaxBertOnlyMLMHead(config=self.config, dtype=self.dtype) + + def __call__( + self, + input_ids, + attention_mask, + position_ids, + token_type_ids: Optional[jnp.ndarray] = None, + head_mask: Optional[jnp.ndarray] = None, + encoder_hidden_states: Optional[jnp.ndarray] = None, + encoder_attention_mask: Optional[jnp.ndarray] = None, + init_cache: bool = False, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + # Model + outputs = self.bert( + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + init_cache=init_cache, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + if self.config.tie_word_embeddings: + shared_embedding = self.bert.variables["params"]["embeddings"]["word_embeddings"]["embedding"] + else: + shared_embedding = None + + # Compute the prediction scores + logits = self.cls(hidden_states, shared_embedding=shared_embedding) + + if not return_dict: + return (logits,) + outputs[1:] + + return FlaxCausalLMOutputWithCrossAttentions( + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + +@add_start_docstrings( + """ + Bert Model with a language modeling head on top (a linear layer on top of the hidden-states output) e.g for + autoregressive tasks. + """, + BERT_START_DOCSTRING, +) +class FlaxBertForCausalLM(FlaxBertPreTrainedModel): + module_class = FlaxBertForCausalLMModule + + def prepare_inputs_for_generation(self, input_ids, max_length, attention_mask: Optional[jax.Array] = None): + # initializing the cache + batch_size, seq_length = input_ids.shape + + past_key_values = self.init_cache(batch_size, max_length) + # Note that usually one would have to put 0's in the attention_mask for x > input_ids.shape[-1] and x < cache_length. + # But since the decoder uses a causal mask, those positions are masked anyway. + # Thus, we can create a single static attention_mask here, which is more efficient for compilation + extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4") + if attention_mask is not None: + position_ids = attention_mask.cumsum(axis=-1) - 1 + extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, attention_mask, (0, 0)) + else: + position_ids = jnp.broadcast_to(jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length)) + + return { + "past_key_values": past_key_values, + "attention_mask": extended_attention_mask, + "position_ids": position_ids, + } + + def update_inputs_for_generation(self, model_outputs, model_kwargs): + model_kwargs["past_key_values"] = model_outputs.past_key_values + model_kwargs["position_ids"] = model_kwargs["position_ids"][:, -1:] + 1 + return model_kwargs + + +append_call_sample_docstring( + FlaxBertForCausalLM, + _CHECKPOINT_FOR_DOC, + FlaxCausalLMOutputWithCrossAttentions, + _CONFIG_FOR_DOC, +) + + +__all__ = [ + "FlaxBertForCausalLM", + "FlaxBertForMaskedLM", + "FlaxBertForMultipleChoice", + "FlaxBertForNextSentencePrediction", + "FlaxBertForPreTraining", + "FlaxBertForQuestionAnswering", + "FlaxBertForSequenceClassification", + "FlaxBertForTokenClassification", + "FlaxBertModel", + "FlaxBertPreTrainedModel", +] diff --git a/docs/transformers/build/lib/transformers/models/bert/modeling_tf_bert.py b/docs/transformers/build/lib/transformers/models/bert/modeling_tf_bert.py new file mode 100644 index 0000000000000000000000000000000000000000..ba73faf23cf96f624a924b88b1df5ae4848e4a36 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bert/modeling_tf_bert.py @@ -0,0 +1,2126 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""TF 2.0 BERT model.""" + +from __future__ import annotations + +import math +import warnings +from dataclasses import dataclass +from typing import Dict, Optional, Tuple, Union + +import numpy as np +import tensorflow as tf + +from ...activations_tf import get_tf_activation +from ...modeling_tf_outputs import ( + TFBaseModelOutputWithPastAndCrossAttentions, + TFBaseModelOutputWithPoolingAndCrossAttentions, + TFCausalLMOutputWithCrossAttentions, + TFMaskedLMOutput, + TFMultipleChoiceModelOutput, + TFNextSentencePredictorOutput, + TFQuestionAnsweringModelOutput, + TFSequenceClassifierOutput, + TFTokenClassifierOutput, +) +from ...modeling_tf_utils import ( + TFCausalLanguageModelingLoss, + TFMaskedLanguageModelingLoss, + TFModelInputType, + TFMultipleChoiceLoss, + TFNextSentencePredictionLoss, + TFPreTrainedModel, + TFQuestionAnsweringLoss, + TFSequenceClassificationLoss, + TFTokenClassificationLoss, + get_initializer, + keras, + keras_serializable, + unpack_inputs, +) +from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax +from ...utils import ( + ModelOutput, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from .configuration_bert import BertConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "google-bert/bert-base-uncased" +_CONFIG_FOR_DOC = "BertConfig" + +# TokenClassification docstring +_CHECKPOINT_FOR_TOKEN_CLASSIFICATION = "dbmdz/bert-large-cased-finetuned-conll03-english" +_TOKEN_CLASS_EXPECTED_OUTPUT = ( + "['O', 'I-ORG', 'I-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'I-LOC', 'O', 'I-LOC', 'I-LOC'] " +) +_TOKEN_CLASS_EXPECTED_LOSS = 0.01 + +# QuestionAnswering docstring +_CHECKPOINT_FOR_QA = "ydshieh/bert-base-cased-squad2" +_QA_EXPECTED_OUTPUT = "'a nice puppet'" +_QA_EXPECTED_LOSS = 7.41 +_QA_TARGET_START_INDEX = 14 +_QA_TARGET_END_INDEX = 15 + +# SequenceClassification docstring +_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION = "ydshieh/bert-base-uncased-yelp-polarity" +_SEQ_CLASS_EXPECTED_OUTPUT = "'LABEL_1'" +_SEQ_CLASS_EXPECTED_LOSS = 0.01 + + +class TFBertPreTrainingLoss: + """ + Loss function suitable for BERT-like pretraining, that is, the task of pretraining a language model by combining + NSP + MLM. .. note:: Any label of -100 will be ignored (along with the corresponding logits) in the loss + computation. + """ + + def hf_compute_loss(self, labels: tf.Tensor, logits: tf.Tensor) -> tf.Tensor: + loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction=keras.losses.Reduction.NONE) + + # Clip negative labels to zero here to avoid NaNs and errors - those positions will get masked later anyway + unmasked_lm_losses = loss_fn(y_true=tf.nn.relu(labels["labels"]), y_pred=logits[0]) + # make sure only labels that are not equal to -100 + # are taken into account for the loss computation + lm_loss_mask = tf.cast(labels["labels"] != -100, dtype=unmasked_lm_losses.dtype) + masked_lm_losses = unmasked_lm_losses * lm_loss_mask + reduced_masked_lm_loss = tf.reduce_sum(masked_lm_losses) / tf.reduce_sum(lm_loss_mask) + + # Clip negative labels to zero here to avoid NaNs and errors - those positions will get masked later anyway + unmasked_ns_loss = loss_fn(y_true=tf.nn.relu(labels["next_sentence_label"]), y_pred=logits[1]) + ns_loss_mask = tf.cast(labels["next_sentence_label"] != -100, dtype=unmasked_ns_loss.dtype) + masked_ns_loss = unmasked_ns_loss * ns_loss_mask + + reduced_masked_ns_loss = tf.reduce_sum(masked_ns_loss) / tf.reduce_sum(ns_loss_mask) + + return tf.reshape(reduced_masked_lm_loss + reduced_masked_ns_loss, (1,)) + + +class TFBertEmbeddings(keras.layers.Layer): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config: BertConfig, **kwargs): + super().__init__(**kwargs) + + self.config = config + self.hidden_size = config.hidden_size + self.max_position_embeddings = config.max_position_embeddings + self.initializer_range = config.initializer_range + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) + + def build(self, input_shape=None): + with tf.name_scope("word_embeddings"): + self.weight = self.add_weight( + name="weight", + shape=[self.config.vocab_size, self.hidden_size], + initializer=get_initializer(self.initializer_range), + ) + + with tf.name_scope("token_type_embeddings"): + self.token_type_embeddings = self.add_weight( + name="embeddings", + shape=[self.config.type_vocab_size, self.hidden_size], + initializer=get_initializer(self.initializer_range), + ) + + with tf.name_scope("position_embeddings"): + self.position_embeddings = self.add_weight( + name="embeddings", + shape=[self.max_position_embeddings, self.hidden_size], + initializer=get_initializer(self.initializer_range), + ) + + if self.built: + return + self.built = True + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) + + def call( + self, + input_ids: Optional[tf.Tensor] = None, + position_ids: Optional[tf.Tensor] = None, + token_type_ids: Optional[tf.Tensor] = None, + inputs_embeds: Optional[tf.Tensor] = None, + past_key_values_length=0, + training: bool = False, + ) -> tf.Tensor: + """ + Applies embedding based on inputs tensor. + + Returns: + final_embeddings (`tf.Tensor`): output embedding tensor. + """ + if input_ids is None and inputs_embeds is None: + raise ValueError("Need to provide either `input_ids` or `input_embeds`.") + + if input_ids is not None: + check_embeddings_within_bounds(input_ids, self.config.vocab_size) + inputs_embeds = tf.gather(params=self.weight, indices=input_ids) + + input_shape = shape_list(inputs_embeds)[:-1] + + if token_type_ids is None: + token_type_ids = tf.fill(dims=input_shape, value=0) + + if position_ids is None: + position_ids = tf.expand_dims( + tf.range(start=past_key_values_length, limit=input_shape[1] + past_key_values_length), axis=0 + ) + + position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids) + token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids) + final_embeddings = inputs_embeds + position_embeds + token_type_embeds + final_embeddings = self.LayerNorm(inputs=final_embeddings) + final_embeddings = self.dropout(inputs=final_embeddings, training=training) + + return final_embeddings + + +class TFBertSelfAttention(keras.layers.Layer): + def __init__(self, config: BertConfig, **kwargs): + super().__init__(**kwargs) + + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number " + f"of attention heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.sqrt_att_head_size = math.sqrt(self.attention_head_size) + + self.query = keras.layers.Dense( + units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query" + ) + self.key = keras.layers.Dense( + units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key" + ) + self.value = keras.layers.Dense( + units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value" + ) + self.dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob) + + self.is_decoder = config.is_decoder + self.config = config + + def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor: + # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size] + tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.num_attention_heads, self.attention_head_size)) + + # Transpose the tensor from [batch_size, seq_length, num_attention_heads, attention_head_size] to [batch_size, num_attention_heads, seq_length, attention_head_size] + return tf.transpose(tensor, perm=[0, 2, 1, 3]) + + def call( + self, + hidden_states: tf.Tensor, + attention_mask: tf.Tensor, + head_mask: tf.Tensor, + encoder_hidden_states: tf.Tensor, + encoder_attention_mask: tf.Tensor, + past_key_value: Tuple[tf.Tensor], + output_attentions: bool, + training: bool = False, + ) -> Tuple[tf.Tensor]: + batch_size = shape_list(hidden_states)[0] + mixed_query_layer = self.query(inputs=hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores(self.key(inputs=encoder_hidden_states), batch_size) + value_layer = self.transpose_for_scores(self.value(inputs=encoder_hidden_states), batch_size) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(inputs=hidden_states), batch_size) + value_layer = self.transpose_for_scores(self.value(inputs=hidden_states), batch_size) + key_layer = tf.concat([past_key_value[0], key_layer], axis=2) + value_layer = tf.concat([past_key_value[1], value_layer], axis=2) + else: + key_layer = self.transpose_for_scores(self.key(inputs=hidden_states), batch_size) + value_layer = self.transpose_for_scores(self.value(inputs=hidden_states), batch_size) + + query_layer = self.transpose_for_scores(mixed_query_layer, batch_size) + + if self.is_decoder: + # if cross_attention save Tuple(tf.Tensor, tf.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(tf.Tensor, tf.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + # (batch size, num_heads, seq_len_q, seq_len_k) + attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True) + dk = tf.cast(self.sqrt_att_head_size, dtype=attention_scores.dtype) + attention_scores = tf.divide(attention_scores, dk) + + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in TFBertModel call() function) + attention_scores = tf.add(attention_scores, attention_mask) + + # Normalize the attention scores to probabilities. + attention_probs = stable_softmax(logits=attention_scores, axis=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(inputs=attention_probs, training=training) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = tf.multiply(attention_probs, head_mask) + + attention_output = tf.matmul(attention_probs, value_layer) + attention_output = tf.transpose(attention_output, perm=[0, 2, 1, 3]) + + # (batch_size, seq_len_q, all_head_size) + attention_output = tf.reshape(tensor=attention_output, shape=(batch_size, -1, self.all_head_size)) + outputs = (attention_output, attention_probs) if output_attentions else (attention_output,) + + if self.is_decoder: + outputs = outputs + (past_key_value,) + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "query", None) is not None: + with tf.name_scope(self.query.name): + self.query.build([None, None, self.config.hidden_size]) + if getattr(self, "key", None) is not None: + with tf.name_scope(self.key.name): + self.key.build([None, None, self.config.hidden_size]) + if getattr(self, "value", None) is not None: + with tf.name_scope(self.value.name): + self.value.build([None, None, self.config.hidden_size]) + + +class TFBertSelfOutput(keras.layers.Layer): + def __init__(self, config: BertConfig, **kwargs): + super().__init__(**kwargs) + + self.dense = keras.layers.Dense( + units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.config = config + + def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.dropout(inputs=hidden_states, training=training) + hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) + + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.hidden_size]) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) + + +class TFBertAttention(keras.layers.Layer): + def __init__(self, config: BertConfig, **kwargs): + super().__init__(**kwargs) + + self.self_attention = TFBertSelfAttention(config, name="self") + self.dense_output = TFBertSelfOutput(config, name="output") + + def prune_heads(self, heads): + raise NotImplementedError + + def call( + self, + input_tensor: tf.Tensor, + attention_mask: tf.Tensor, + head_mask: tf.Tensor, + encoder_hidden_states: tf.Tensor, + encoder_attention_mask: tf.Tensor, + past_key_value: Tuple[tf.Tensor], + output_attentions: bool, + training: bool = False, + ) -> Tuple[tf.Tensor]: + self_outputs = self.self_attention( + hidden_states=input_tensor, + attention_mask=attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_value=past_key_value, + output_attentions=output_attentions, + training=training, + ) + attention_output = self.dense_output( + hidden_states=self_outputs[0], input_tensor=input_tensor, training=training + ) + # add attentions (possibly with past_key_value) if we output them + outputs = (attention_output,) + self_outputs[1:] + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self_attention", None) is not None: + with tf.name_scope(self.self_attention.name): + self.self_attention.build(None) + if getattr(self, "dense_output", None) is not None: + with tf.name_scope(self.dense_output.name): + self.dense_output.build(None) + + +class TFBertIntermediate(keras.layers.Layer): + def __init__(self, config: BertConfig, **kwargs): + super().__init__(**kwargs) + + self.dense = keras.layers.Dense( + units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = get_tf_activation(config.hidden_act) + else: + self.intermediate_act_fn = config.hidden_act + self.config = config + + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.hidden_size]) + + +class TFBertOutput(keras.layers.Layer): + def __init__(self, config: BertConfig, **kwargs): + super().__init__(**kwargs) + + self.dense = keras.layers.Dense( + units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.config = config + + def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.dropout(inputs=hidden_states, training=training) + hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) + + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.intermediate_size]) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) + + +class TFBertLayer(keras.layers.Layer): + def __init__(self, config: BertConfig, **kwargs): + super().__init__(**kwargs) + + self.attention = TFBertAttention(config, name="attention") + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + if not self.is_decoder: + raise ValueError(f"{self} should be used as a decoder model if cross attention is added") + self.crossattention = TFBertAttention(config, name="crossattention") + self.intermediate = TFBertIntermediate(config, name="intermediate") + self.bert_output = TFBertOutput(config, name="output") + + def call( + self, + hidden_states: tf.Tensor, + attention_mask: tf.Tensor, + head_mask: tf.Tensor, + encoder_hidden_states: tf.Tensor | None, + encoder_attention_mask: tf.Tensor | None, + past_key_value: Tuple[tf.Tensor] | None, + output_attentions: bool, + training: bool = False, + ) -> Tuple[tf.Tensor]: + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + input_tensor=hidden_states, + attention_mask=attention_mask, + head_mask=head_mask, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=self_attn_past_key_value, + output_attentions=output_attentions, + training=training, + ) + attention_output = self_attention_outputs[0] + + # if decoder, the last output is tuple of self-attn cache + if self.is_decoder: + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + else: + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + cross_attn_present_key_value = None + if self.is_decoder and encoder_hidden_states is not None: + if not hasattr(self, "crossattention"): + raise ValueError( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers" + " by setting `config.add_cross_attention=True`" + ) + + # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + cross_attention_outputs = self.crossattention( + input_tensor=attention_output, + attention_mask=attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_value=cross_attn_past_key_value, + output_attentions=output_attentions, + training=training, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights + + # add cross-attn cache to positions 3,4 of present_key_value tuple + cross_attn_present_key_value = cross_attention_outputs[-1] + present_key_value = present_key_value + cross_attn_present_key_value + + intermediate_output = self.intermediate(hidden_states=attention_output) + layer_output = self.bert_output( + hidden_states=intermediate_output, input_tensor=attention_output, training=training + ) + outputs = (layer_output,) + outputs # add attentions if we output them + + # if decoder, return the attn key/values as the last output + if self.is_decoder: + outputs = outputs + (present_key_value,) + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "attention", None) is not None: + with tf.name_scope(self.attention.name): + self.attention.build(None) + if getattr(self, "intermediate", None) is not None: + with tf.name_scope(self.intermediate.name): + self.intermediate.build(None) + if getattr(self, "bert_output", None) is not None: + with tf.name_scope(self.bert_output.name): + self.bert_output.build(None) + if getattr(self, "crossattention", None) is not None: + with tf.name_scope(self.crossattention.name): + self.crossattention.build(None) + + +class TFBertEncoder(keras.layers.Layer): + def __init__(self, config: BertConfig, **kwargs): + super().__init__(**kwargs) + self.config = config + self.layer = [TFBertLayer(config, name=f"layer_._{i}") for i in range(config.num_hidden_layers)] + + def call( + self, + hidden_states: tf.Tensor, + attention_mask: tf.Tensor, + head_mask: tf.Tensor, + encoder_hidden_states: tf.Tensor | None, + encoder_attention_mask: tf.Tensor | None, + past_key_values: Tuple[Tuple[tf.Tensor]] | None, + use_cache: Optional[bool], + output_attentions: bool, + output_hidden_states: bool, + return_dict: bool, + training: bool = False, + ) -> Union[TFBaseModelOutputWithPastAndCrossAttentions, Tuple[tf.Tensor]]: + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + + next_decoder_cache = () if use_cache else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + past_key_value = past_key_values[i] if past_key_values is not None else None + + layer_outputs = layer_module( + hidden_states=hidden_states, + attention_mask=attention_mask, + head_mask=head_mask[i], + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_value=past_key_value, + output_attentions=output_attentions, + training=training, + ) + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + if self.config.add_cross_attention and encoder_hidden_states is not None: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + # Add last layer + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v for v in [hidden_states, all_hidden_states, all_attentions, all_cross_attentions] if v is not None + ) + + return TFBaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_attentions, + cross_attentions=all_cross_attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "layer", None) is not None: + for layer in self.layer: + with tf.name_scope(layer.name): + layer.build(None) + + +class TFBertPooler(keras.layers.Layer): + def __init__(self, config: BertConfig, **kwargs): + super().__init__(**kwargs) + + self.dense = keras.layers.Dense( + units=config.hidden_size, + kernel_initializer=get_initializer(config.initializer_range), + activation="tanh", + name="dense", + ) + self.config = config + + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(inputs=first_token_tensor) + + return pooled_output + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.hidden_size]) + + +class TFBertPredictionHeadTransform(keras.layers.Layer): + def __init__(self, config: BertConfig, **kwargs): + super().__init__(**kwargs) + + self.dense = keras.layers.Dense( + units=config.hidden_size, + kernel_initializer=get_initializer(config.initializer_range), + name="dense", + ) + + if isinstance(config.hidden_act, str): + self.transform_act_fn = get_tf_activation(config.hidden_act) + else: + self.transform_act_fn = config.hidden_act + + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.config = config + + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(inputs=hidden_states) + + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.hidden_size]) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) + + +class TFBertLMPredictionHead(keras.layers.Layer): + def __init__(self, config: BertConfig, input_embeddings: keras.layers.Layer, **kwargs): + super().__init__(**kwargs) + + self.config = config + self.hidden_size = config.hidden_size + + self.transform = TFBertPredictionHeadTransform(config, name="transform") + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.input_embeddings = input_embeddings + + def build(self, input_shape=None): + self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias") + + if self.built: + return + self.built = True + if getattr(self, "transform", None) is not None: + with tf.name_scope(self.transform.name): + self.transform.build(None) + + def get_output_embeddings(self) -> keras.layers.Layer: + return self.input_embeddings + + def set_output_embeddings(self, value: tf.Variable): + self.input_embeddings.weight = value + self.input_embeddings.vocab_size = shape_list(value)[0] + + def get_bias(self) -> Dict[str, tf.Variable]: + return {"bias": self.bias} + + def set_bias(self, value: tf.Variable): + self.bias = value["bias"] + self.config.vocab_size = shape_list(value["bias"])[0] + + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: + hidden_states = self.transform(hidden_states=hidden_states) + seq_length = shape_list(hidden_states)[1] + hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size]) + hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True) + hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.config.vocab_size]) + hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias) + + return hidden_states + + +class TFBertMLMHead(keras.layers.Layer): + def __init__(self, config: BertConfig, input_embeddings: keras.layers.Layer, **kwargs): + super().__init__(**kwargs) + + self.predictions = TFBertLMPredictionHead(config, input_embeddings, name="predictions") + + def call(self, sequence_output: tf.Tensor) -> tf.Tensor: + prediction_scores = self.predictions(hidden_states=sequence_output) + + return prediction_scores + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "predictions", None) is not None: + with tf.name_scope(self.predictions.name): + self.predictions.build(None) + + +class TFBertNSPHead(keras.layers.Layer): + def __init__(self, config: BertConfig, **kwargs): + super().__init__(**kwargs) + + self.seq_relationship = keras.layers.Dense( + units=2, + kernel_initializer=get_initializer(config.initializer_range), + name="seq_relationship", + ) + self.config = config + + def call(self, pooled_output: tf.Tensor) -> tf.Tensor: + seq_relationship_score = self.seq_relationship(inputs=pooled_output) + + return seq_relationship_score + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "seq_relationship", None) is not None: + with tf.name_scope(self.seq_relationship.name): + self.seq_relationship.build([None, None, self.config.hidden_size]) + + +@keras_serializable +class TFBertMainLayer(keras.layers.Layer): + config_class = BertConfig + + def __init__(self, config: BertConfig, add_pooling_layer: bool = True, **kwargs): + super().__init__(**kwargs) + + self.config = config + self.is_decoder = config.is_decoder + + self.embeddings = TFBertEmbeddings(config, name="embeddings") + self.encoder = TFBertEncoder(config, name="encoder") + self.pooler = TFBertPooler(config, name="pooler") if add_pooling_layer else None + + def get_input_embeddings(self) -> keras.layers.Layer: + return self.embeddings + + def set_input_embeddings(self, value: tf.Variable): + self.embeddings.weight = value + self.embeddings.vocab_size = shape_list(value)[0] + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + raise NotImplementedError + + @unpack_inputs + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + encoder_hidden_states: np.ndarray | tf.Tensor | None = None, + encoder_attention_mask: np.ndarray | tf.Tensor | None = None, + past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: bool = False, + ) -> Union[TFBaseModelOutputWithPoolingAndCrossAttentions, Tuple[tf.Tensor]]: + if not self.config.is_decoder: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = shape_list(input_ids) + elif inputs_embeds is not None: + input_shape = shape_list(inputs_embeds)[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + batch_size, seq_length = input_shape + + if past_key_values is None: + past_key_values_length = 0 + past_key_values = [None] * len(self.encoder.layer) + else: + past_key_values_length = shape_list(past_key_values[0][0])[-2] + + if attention_mask is None: + attention_mask = tf.fill(dims=(batch_size, seq_length + past_key_values_length), value=1) + + if token_type_ids is None: + token_type_ids = tf.fill(dims=input_shape, value=0) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + training=training, + ) + + # We create a 3D attention mask from a 2D tensor mask. + # Sizes are [batch_size, 1, 1, to_seq_length] + # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] + # this attention mask is more simple than the triangular masking of causal attention + # used in OpenAI GPT, we just need to prepare the broadcast dimension here. + attention_mask_shape = shape_list(attention_mask) + + mask_seq_length = seq_length + past_key_values_length + # Copied from `modeling_tf_t5.py` + # Provided a padding mask of dimensions [batch_size, mask_seq_length] + # - if the model is a decoder, apply a causal mask in addition to the padding mask + # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, mask_seq_length, mask_seq_length] + if self.is_decoder: + seq_ids = tf.range(mask_seq_length) + causal_mask = tf.less_equal( + tf.tile(seq_ids[None, None, :], (batch_size, mask_seq_length, 1)), + seq_ids[None, :, None], + ) + causal_mask = tf.cast(causal_mask, dtype=attention_mask.dtype) + extended_attention_mask = causal_mask * attention_mask[:, None, :] + attention_mask_shape = shape_list(extended_attention_mask) + extended_attention_mask = tf.reshape( + extended_attention_mask, (attention_mask_shape[0], 1, attention_mask_shape[1], attention_mask_shape[2]) + ) + if past_key_values[0] is not None: + # attention_mask needs to be sliced to the shape `[batch_size, 1, from_seq_length - cached_seq_length, to_seq_length] + extended_attention_mask = extended_attention_mask[:, :, -seq_length:, :] + else: + extended_attention_mask = tf.reshape( + attention_mask, (attention_mask_shape[0], 1, 1, attention_mask_shape[1]) + ) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + extended_attention_mask = tf.cast(extended_attention_mask, dtype=embedding_output.dtype) + one_cst = tf.constant(1.0, dtype=embedding_output.dtype) + ten_thousand_cst = tf.constant(-10000.0, dtype=embedding_output.dtype) + extended_attention_mask = tf.multiply(tf.subtract(one_cst, extended_attention_mask), ten_thousand_cst) + + # Copied from `modeling_tf_t5.py` with -1e9 -> -10000 + if self.is_decoder and encoder_attention_mask is not None: + # If a 2D ou 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, mask_seq_length, mask_seq_length] + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + encoder_attention_mask = tf.cast(encoder_attention_mask, dtype=extended_attention_mask.dtype) + num_dims_encoder_attention_mask = len(shape_list(encoder_attention_mask)) + if num_dims_encoder_attention_mask == 3: + encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :] + if num_dims_encoder_attention_mask == 2: + encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :] + + # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition + # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow/transformer/transformer_layers.py#L270 + # encoder_extended_attention_mask = tf.math.equal(encoder_extended_attention_mask, + # tf.transpose(encoder_extended_attention_mask, perm=(-1, -2))) + + encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -10000.0 + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + if head_mask is not None: + raise NotImplementedError + else: + head_mask = [None] * self.config.num_hidden_layers + + encoder_outputs = self.encoder( + hidden_states=embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(hidden_states=sequence_output) if self.pooler is not None else None + + if not return_dict: + return ( + sequence_output, + pooled_output, + ) + encoder_outputs[1:] + + return TFBaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embeddings", None) is not None: + with tf.name_scope(self.embeddings.name): + self.embeddings.build(None) + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "pooler", None) is not None: + with tf.name_scope(self.pooler.name): + self.pooler.build(None) + + +class TFBertPreTrainedModel(TFPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = BertConfig + base_model_prefix = "bert" + + +@dataclass +class TFBertForPreTrainingOutput(ModelOutput): + """ + Output type of [`TFBertForPreTraining`]. + + Args: + prediction_logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + seq_relationship_logits (`tf.Tensor` of shape `(batch_size, 2)`): + Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation + before SoftMax). + hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: tf.Tensor | None = None + prediction_logits: Optional[tf.Tensor] = None + seq_relationship_logits: Optional[tf.Tensor] = None + hidden_states: Optional[Union[Tuple[tf.Tensor], tf.Tensor]] = None + attentions: Optional[Union[Tuple[tf.Tensor], tf.Tensor]] = None + + +BERT_START_DOCSTRING = r""" + + This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and + behavior. + + + + TensorFlow models and layers in `transformers` accept two formats as input: + + - having all inputs as keyword arguments (like PyTorch models), or + - having all inputs as a list, tuple or dict in the first positional argument. + + The reason the second format is supported is that Keras methods prefer this format when passing inputs to models + and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just + pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second + format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with + the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first + positional argument: + + - a single Tensor with `input_ids` only and nothing else: `model(input_ids)` + - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: + `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])` + - a dictionary with one or several input Tensors associated to the input names given in the docstring: + `model({"input_ids": input_ids, "token_type_ids": token_type_ids})` + + Note that when creating models and layers with + [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry + about any of this, as you can just pass inputs like you would to any other Python function! + + + + Args: + config ([`BertConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights. +""" + +BERT_INPUTS_DOCSTRING = r""" + Args: + input_ids (`np.ndarray`, `tf.Tensor`, `List[tf.Tensor]` ``Dict[str, tf.Tensor]` or `Dict[str, np.ndarray]` and each example must have the shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and + [`PreTrainedTokenizer.encode`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + token_type_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + position_ids (`np.ndarray` or `tf.Tensor` of shape `({0})`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + head_mask (`np.ndarray` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`np.ndarray` or `tf.Tensor` of shape `({0}, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the + config will be used instead. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. This argument can be used only in eager mode, in graph mode the value in the config will be + used instead. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in + eager mode, in graph mode the value will always be set to True. + training (`bool`, *optional*, defaults to `False``): + Whether or not to use the model in training mode (some modules like dropout modules have different + behaviors between training and evaluation). +""" + + +@add_start_docstrings( + "The bare Bert Model transformer outputting raw hidden-states without any specific head on top.", + BERT_START_DOCSTRING, +) +class TFBertModel(TFBertPreTrainedModel): + def __init__(self, config: BertConfig, add_pooling_layer: bool = True, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.bert = TFBertMainLayer(config, add_pooling_layer, name="bert") + + @unpack_inputs + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFBaseModelOutputWithPoolingAndCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + encoder_hidden_states: np.ndarray | tf.Tensor | None = None, + encoder_attention_mask: np.ndarray | tf.Tensor | None = None, + past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: Optional[bool] = False, + ) -> Union[TFBaseModelOutputWithPoolingAndCrossAttentions, Tuple[tf.Tensor]]: + r""" + encoder_hidden_states (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + past_key_values (`Tuple[Tuple[tf.Tensor]]` of length `config.n_layers`) + contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*, defaults to `True`): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). Set to `False` during training, `True` during generation + """ + outputs = self.bert( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "bert", None) is not None: + with tf.name_scope(self.bert.name): + self.bert.build(None) + + +@add_start_docstrings( + """ +Bert Model with two heads on top as done during the pretraining: + a `masked language modeling` head and a `next sentence prediction (classification)` head. + """, + BERT_START_DOCSTRING, +) +class TFBertForPreTraining(TFBertPreTrainedModel, TFBertPreTrainingLoss): + # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model + _keys_to_ignore_on_load_unexpected = [ + r"position_ids", + r"cls.predictions.decoder.weight", + r"cls.predictions.decoder.bias", + ] + + def __init__(self, config: BertConfig, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.bert = TFBertMainLayer(config, name="bert") + self.nsp = TFBertNSPHead(config, name="nsp___cls") + self.mlm = TFBertMLMHead(config, input_embeddings=self.bert.embeddings, name="mlm___cls") + + def get_lm_head(self) -> keras.layers.Layer: + return self.mlm.predictions + + def get_prefix_bias_name(self) -> str: + warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning) + return self.name + "/" + self.mlm.name + "/" + self.mlm.predictions.name + + @unpack_inputs + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=TFBertForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: np.ndarray | tf.Tensor | None = None, + next_sentence_label: np.ndarray | tf.Tensor | None = None, + training: Optional[bool] = False, + ) -> Union[TFBertForPreTrainingOutput, Tuple[tf.Tensor]]: + r""" + labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + next_sentence_label (`tf.Tensor` of shape `(batch_size,)`, *optional*): + Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair + (see `input_ids` docstring) Indices should be in `[0, 1]`: + + - 0 indicates sequence B is a continuation of sequence A, + - 1 indicates sequence B is a random sequence. + kwargs (`Dict[str, any]`, *optional*, defaults to `{}`): + Used to hide legacy arguments that have been deprecated. + + Return: + + Examples: + + ```python + >>> import tensorflow as tf + >>> from transformers import AutoTokenizer, TFBertForPreTraining + + >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased") + >>> model = TFBertForPreTraining.from_pretrained("google-bert/bert-base-uncased") + >>> input_ids = tokenizer("Hello, my dog is cute", add_special_tokens=True, return_tensors="tf") + >>> # Batch size 1 + + >>> outputs = model(input_ids) + >>> prediction_logits, seq_relationship_logits = outputs[:2] + ```""" + outputs = self.bert( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + sequence_output, pooled_output = outputs[:2] + prediction_scores = self.mlm(sequence_output=sequence_output, training=training) + seq_relationship_score = self.nsp(pooled_output=pooled_output) + total_loss = None + + if labels is not None and next_sentence_label is not None: + d_labels = {"labels": labels} + d_labels["next_sentence_label"] = next_sentence_label + total_loss = self.hf_compute_loss(labels=d_labels, logits=(prediction_scores, seq_relationship_score)) + + if not return_dict: + output = (prediction_scores, seq_relationship_score) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return TFBertForPreTrainingOutput( + loss=total_loss, + prediction_logits=prediction_scores, + seq_relationship_logits=seq_relationship_score, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "bert", None) is not None: + with tf.name_scope(self.bert.name): + self.bert.build(None) + if getattr(self, "nsp", None) is not None: + with tf.name_scope(self.nsp.name): + self.nsp.build(None) + if getattr(self, "mlm", None) is not None: + with tf.name_scope(self.mlm.name): + self.mlm.build(None) + + +@add_start_docstrings("""Bert Model with a `language modeling` head on top.""", BERT_START_DOCSTRING) +class TFBertForMaskedLM(TFBertPreTrainedModel, TFMaskedLanguageModelingLoss): + # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model + _keys_to_ignore_on_load_unexpected = [ + r"pooler", + r"cls.seq_relationship", + r"cls.predictions.decoder.weight", + r"nsp___cls", + ] + + def __init__(self, config: BertConfig, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + if config.is_decoder: + logger.warning( + "If you want to use `TFBertForMaskedLM` make sure `config.is_decoder=False` for " + "bi-directional self-attention." + ) + + self.bert = TFBertMainLayer(config, add_pooling_layer=False, name="bert") + self.mlm = TFBertMLMHead(config, input_embeddings=self.bert.embeddings, name="mlm___cls") + + def get_lm_head(self) -> keras.layers.Layer: + return self.mlm.predictions + + def get_prefix_bias_name(self) -> str: + warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning) + return self.name + "/" + self.mlm.name + "/" + self.mlm.predictions.name + + @unpack_inputs + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFMaskedLMOutput, + config_class=_CONFIG_FOR_DOC, + expected_output="'paris'", + expected_loss=0.88, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: np.ndarray | tf.Tensor | None = None, + training: Optional[bool] = False, + ) -> Union[TFMaskedLMOutput, Tuple[tf.Tensor]]: + r""" + labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + """ + outputs = self.bert( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + sequence_output = outputs[0] + prediction_scores = self.mlm(sequence_output=sequence_output, training=training) + loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=prediction_scores) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TFMaskedLMOutput( + loss=loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "bert", None) is not None: + with tf.name_scope(self.bert.name): + self.bert.build(None) + if getattr(self, "mlm", None) is not None: + with tf.name_scope(self.mlm.name): + self.mlm.build(None) + + +class TFBertLMHeadModel(TFBertPreTrainedModel, TFCausalLanguageModelingLoss): + # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model + _keys_to_ignore_on_load_unexpected = [ + r"pooler", + r"cls.seq_relationship", + r"cls.predictions.decoder.weight", + r"nsp___cls", + ] + + def __init__(self, config: BertConfig, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + if not config.is_decoder: + logger.warning("If you want to use `TFBertLMHeadModel` as a standalone, add `is_decoder=True.`") + + self.bert = TFBertMainLayer(config, add_pooling_layer=False, name="bert") + self.mlm = TFBertMLMHead(config, input_embeddings=self.bert.embeddings, name="mlm___cls") + + def get_lm_head(self) -> keras.layers.Layer: + return self.mlm.predictions + + def get_prefix_bias_name(self) -> str: + warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning) + return self.name + "/" + self.mlm.name + "/" + self.mlm.predictions.name + + def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs): + input_shape = input_ids.shape + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + if attention_mask is None: + attention_mask = tf.ones(input_shape) + + # cut decoder_input_ids if past is used + if past_key_values is not None: + input_ids = input_ids[:, -1:] + + return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past_key_values} + + @unpack_inputs + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFCausalLMOutputWithCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + encoder_hidden_states: np.ndarray | tf.Tensor | None = None, + encoder_attention_mask: np.ndarray | tf.Tensor | None = None, + past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: np.ndarray | tf.Tensor | None = None, + training: Optional[bool] = False, + **kwargs, + ) -> Union[TFCausalLMOutputWithCrossAttentions, Tuple[tf.Tensor]]: + r""" + encoder_hidden_states (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + past_key_values (`Tuple[Tuple[tf.Tensor]]` of length `config.n_layers`) + contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*, defaults to `True`): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). Set to `False` during training, `True` during generation + labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the cross entropy classification loss. Indices should be in `[0, ..., + config.vocab_size - 1]`. + """ + outputs = self.bert( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + sequence_output = outputs[0] + logits = self.mlm(sequence_output=sequence_output, training=training) + loss = None + + if labels is not None: + # shift labels to the left and cut last logit token + shifted_logits = logits[:, :-1] + labels = labels[:, 1:] + loss = self.hf_compute_loss(labels=labels, logits=shifted_logits) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TFCausalLMOutputWithCrossAttentions( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "bert", None) is not None: + with tf.name_scope(self.bert.name): + self.bert.build(None) + if getattr(self, "mlm", None) is not None: + with tf.name_scope(self.mlm.name): + self.mlm.build(None) + + +@add_start_docstrings( + """Bert Model with a `next sentence prediction (classification)` head on top.""", + BERT_START_DOCSTRING, +) +class TFBertForNextSentencePrediction(TFBertPreTrainedModel, TFNextSentencePredictionLoss): + # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model + _keys_to_ignore_on_load_unexpected = [r"mlm___cls", r"cls.predictions"] + + def __init__(self, config: BertConfig, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.bert = TFBertMainLayer(config, name="bert") + self.nsp = TFBertNSPHead(config, name="nsp___cls") + + @unpack_inputs + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=TFNextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + next_sentence_label: np.ndarray | tf.Tensor | None = None, + training: Optional[bool] = False, + ) -> Union[TFNextSentencePredictorOutput, Tuple[tf.Tensor]]: + r""" + Return: + + Examples: + + ```python + >>> import tensorflow as tf + >>> from transformers import AutoTokenizer, TFBertForNextSentencePrediction + + >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased") + >>> model = TFBertForNextSentencePrediction.from_pretrained("google-bert/bert-base-uncased") + + >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced." + >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light." + >>> encoding = tokenizer(prompt, next_sentence, return_tensors="tf") + + >>> logits = model(encoding["input_ids"], token_type_ids=encoding["token_type_ids"])[0] + >>> assert logits[0][0] < logits[0][1] # the next sentence was random + ```""" + outputs = self.bert( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + pooled_output = outputs[1] + seq_relationship_scores = self.nsp(pooled_output=pooled_output) + next_sentence_loss = ( + None + if next_sentence_label is None + else self.hf_compute_loss(labels=next_sentence_label, logits=seq_relationship_scores) + ) + + if not return_dict: + output = (seq_relationship_scores,) + outputs[2:] + return ((next_sentence_loss,) + output) if next_sentence_loss is not None else output + + return TFNextSentencePredictorOutput( + loss=next_sentence_loss, + logits=seq_relationship_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "bert", None) is not None: + with tf.name_scope(self.bert.name): + self.bert.build(None) + if getattr(self, "nsp", None) is not None: + with tf.name_scope(self.nsp.name): + self.nsp.build(None) + + +@add_start_docstrings( + """ + Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled + output) e.g. for GLUE tasks. + """, + BERT_START_DOCSTRING, +) +class TFBertForSequenceClassification(TFBertPreTrainedModel, TFSequenceClassificationLoss): + # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model + _keys_to_ignore_on_load_unexpected = [r"mlm___cls", r"nsp___cls", r"cls.predictions", r"cls.seq_relationship"] + _keys_to_ignore_on_load_missing = [r"dropout"] + + def __init__(self, config: BertConfig, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.num_labels = config.num_labels + + self.bert = TFBertMainLayer(config, name="bert") + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = keras.layers.Dropout(rate=classifier_dropout) + self.classifier = keras.layers.Dense( + units=config.num_labels, + kernel_initializer=get_initializer(config.initializer_range), + name="classifier", + ) + self.config = config + + @unpack_inputs + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_SEQUENCE_CLASSIFICATION, + output_type=TFSequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + expected_output=_SEQ_CLASS_EXPECTED_OUTPUT, + expected_loss=_SEQ_CLASS_EXPECTED_LOSS, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: np.ndarray | tf.Tensor | None = None, + training: Optional[bool] = False, + ) -> Union[TFSequenceClassifierOutput, Tuple[tf.Tensor]]: + r""" + labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + outputs = self.bert( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + pooled_output = outputs[1] + pooled_output = self.dropout(inputs=pooled_output, training=training) + logits = self.classifier(inputs=pooled_output) + loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TFSequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "bert", None) is not None: + with tf.name_scope(self.bert.name): + self.bert.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build([None, None, self.config.hidden_size]) + + +@add_start_docstrings( + """ + Bert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a + softmax) e.g. for RocStories/SWAG tasks. + """, + BERT_START_DOCSTRING, +) +class TFBertForMultipleChoice(TFBertPreTrainedModel, TFMultipleChoiceLoss): + # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model + _keys_to_ignore_on_load_unexpected = [r"mlm___cls", r"nsp___cls", r"cls.predictions", r"cls.seq_relationship"] + _keys_to_ignore_on_load_missing = [r"dropout"] + + def __init__(self, config: BertConfig, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.bert = TFBertMainLayer(config, name="bert") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.classifier = keras.layers.Dense( + units=1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + ) + self.config = config + + @unpack_inputs + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFMultipleChoiceModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: np.ndarray | tf.Tensor | None = None, + training: Optional[bool] = False, + ) -> Union[TFMultipleChoiceModelOutput, Tuple[tf.Tensor]]: + r""" + labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*): + Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices]` + where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above) + """ + if input_ids is not None: + num_choices = shape_list(input_ids)[1] + seq_length = shape_list(input_ids)[2] + else: + num_choices = shape_list(inputs_embeds)[1] + seq_length = shape_list(inputs_embeds)[2] + + flat_input_ids = tf.reshape(tensor=input_ids, shape=(-1, seq_length)) if input_ids is not None else None + flat_attention_mask = ( + tf.reshape(tensor=attention_mask, shape=(-1, seq_length)) if attention_mask is not None else None + ) + flat_token_type_ids = ( + tf.reshape(tensor=token_type_ids, shape=(-1, seq_length)) if token_type_ids is not None else None + ) + flat_position_ids = ( + tf.reshape(tensor=position_ids, shape=(-1, seq_length)) if position_ids is not None else None + ) + flat_inputs_embeds = ( + tf.reshape(tensor=inputs_embeds, shape=(-1, seq_length, shape_list(inputs_embeds)[3])) + if inputs_embeds is not None + else None + ) + outputs = self.bert( + input_ids=flat_input_ids, + attention_mask=flat_attention_mask, + token_type_ids=flat_token_type_ids, + position_ids=flat_position_ids, + head_mask=head_mask, + inputs_embeds=flat_inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + pooled_output = outputs[1] + pooled_output = self.dropout(inputs=pooled_output, training=training) + logits = self.classifier(inputs=pooled_output) + reshaped_logits = tf.reshape(tensor=logits, shape=(-1, num_choices)) + loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=reshaped_logits) + + if not return_dict: + output = (reshaped_logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TFMultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "bert", None) is not None: + with tf.name_scope(self.bert.name): + self.bert.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build([None, None, self.config.hidden_size]) + + +@add_start_docstrings( + """ + Bert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for + Named-Entity-Recognition (NER) tasks. + """, + BERT_START_DOCSTRING, +) +class TFBertForTokenClassification(TFBertPreTrainedModel, TFTokenClassificationLoss): + # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model + _keys_to_ignore_on_load_unexpected = [ + r"pooler", + r"mlm___cls", + r"nsp___cls", + r"cls.predictions", + r"cls.seq_relationship", + ] + _keys_to_ignore_on_load_missing = [r"dropout"] + + def __init__(self, config: BertConfig, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.num_labels = config.num_labels + + self.bert = TFBertMainLayer(config, add_pooling_layer=False, name="bert") + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = keras.layers.Dropout(rate=classifier_dropout) + self.classifier = keras.layers.Dense( + units=config.num_labels, + kernel_initializer=get_initializer(config.initializer_range), + name="classifier", + ) + self.config = config + + @unpack_inputs + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_TOKEN_CLASSIFICATION, + output_type=TFTokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + expected_output=_TOKEN_CLASS_EXPECTED_OUTPUT, + expected_loss=_TOKEN_CLASS_EXPECTED_LOSS, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: np.ndarray | tf.Tensor | None = None, + training: Optional[bool] = False, + ) -> Union[TFTokenClassifierOutput, Tuple[tf.Tensor]]: + r""" + labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. + """ + outputs = self.bert( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + sequence_output = outputs[0] + sequence_output = self.dropout(inputs=sequence_output, training=training) + logits = self.classifier(inputs=sequence_output) + loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TFTokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "bert", None) is not None: + with tf.name_scope(self.bert.name): + self.bert.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build([None, None, self.config.hidden_size]) + + +@add_start_docstrings( + """ + Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layer on top of the hidden-states output to compute `span start logits` and `span end logits`). + """, + BERT_START_DOCSTRING, +) +class TFBertForQuestionAnswering(TFBertPreTrainedModel, TFQuestionAnsweringLoss): + # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model + _keys_to_ignore_on_load_unexpected = [ + r"pooler", + r"mlm___cls", + r"nsp___cls", + r"cls.predictions", + r"cls.seq_relationship", + ] + + def __init__(self, config: BertConfig, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.num_labels = config.num_labels + + self.bert = TFBertMainLayer(config, add_pooling_layer=False, name="bert") + self.qa_outputs = keras.layers.Dense( + units=config.num_labels, + kernel_initializer=get_initializer(config.initializer_range), + name="qa_outputs", + ) + self.config = config + + @unpack_inputs + @add_start_docstrings_to_model_forward(BERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_QA, + output_type=TFQuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + qa_target_start_index=_QA_TARGET_START_INDEX, + qa_target_end_index=_QA_TARGET_END_INDEX, + expected_output=_QA_EXPECTED_OUTPUT, + expected_loss=_QA_EXPECTED_LOSS, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + start_positions: np.ndarray | tf.Tensor | None = None, + end_positions: np.ndarray | tf.Tensor | None = None, + training: Optional[bool] = False, + ) -> Union[TFQuestionAnsweringModelOutput, Tuple[tf.Tensor]]: + r""" + start_positions (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + """ + outputs = self.bert( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + sequence_output = outputs[0] + logits = self.qa_outputs(inputs=sequence_output) + start_logits, end_logits = tf.split(value=logits, num_or_size_splits=2, axis=-1) + start_logits = tf.squeeze(input=start_logits, axis=-1) + end_logits = tf.squeeze(input=end_logits, axis=-1) + loss = None + + if start_positions is not None and end_positions is not None: + labels = {"start_position": start_positions} + labels["end_position"] = end_positions + loss = self.hf_compute_loss(labels=labels, logits=(start_logits, end_logits)) + + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TFQuestionAnsweringModelOutput( + loss=loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "bert", None) is not None: + with tf.name_scope(self.bert.name): + self.bert.build(None) + if getattr(self, "qa_outputs", None) is not None: + with tf.name_scope(self.qa_outputs.name): + self.qa_outputs.build([None, None, self.config.hidden_size]) + + +__all__ = [ + "TFBertEmbeddings", + "TFBertForMaskedLM", + "TFBertForMultipleChoice", + "TFBertForNextSentencePrediction", + "TFBertForPreTraining", + "TFBertForQuestionAnswering", + "TFBertForSequenceClassification", + "TFBertForTokenClassification", + "TFBertLMHeadModel", + "TFBertMainLayer", + "TFBertModel", + "TFBertPreTrainedModel", +] diff --git a/docs/transformers/build/lib/transformers/models/bert/tokenization_bert.py b/docs/transformers/build/lib/transformers/models/bert/tokenization_bert.py new file mode 100644 index 0000000000000000000000000000000000000000..42d4dd94554d411fd37b6328b51887a47401f114 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bert/tokenization_bert.py @@ -0,0 +1,507 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for Bert.""" + +import collections +import os +import unicodedata +from typing import List, Optional, Tuple + +from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace +from ...utils import logging + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"} + + +def load_vocab(vocab_file): + """Loads a vocabulary file into a dictionary.""" + vocab = collections.OrderedDict() + with open(vocab_file, "r", encoding="utf-8") as reader: + tokens = reader.readlines() + for index, token in enumerate(tokens): + token = token.rstrip("\n") + vocab[token] = index + return vocab + + +def whitespace_tokenize(text): + """Runs basic whitespace cleaning and splitting on a piece of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + + +class BertTokenizer(PreTrainedTokenizer): + r""" + Construct a BERT tokenizer. Based on WordPiece. + + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + File containing the vocabulary. + do_lower_case (`bool`, *optional*, defaults to `True`): + Whether or not to lowercase the input when tokenizing. + do_basic_tokenize (`bool`, *optional*, defaults to `True`): + Whether or not to do basic tokenization before WordPiece. + never_split (`Iterable`, *optional*): + Collection of tokens which will never be split during tokenization. Only has an effect when + `do_basic_tokenize=True` + unk_token (`str`, *optional*, defaults to `"[UNK]"`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + sep_token (`str`, *optional*, defaults to `"[SEP]"`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + pad_token (`str`, *optional*, defaults to `"[PAD]"`): + The token used for padding, for example when batching sequences of different lengths. + cls_token (`str`, *optional*, defaults to `"[CLS]"`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + mask_token (`str`, *optional*, defaults to `"[MASK]"`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + tokenize_chinese_chars (`bool`, *optional*, defaults to `True`): + Whether or not to tokenize Chinese characters. + + This should likely be deactivated for Japanese (see this + [issue](https://github.com/huggingface/transformers/issues/328)). + strip_accents (`bool`, *optional*): + Whether or not to strip all accents. If this option is not specified, then it will be determined by the + value for `lowercase` (as in the original BERT). + clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`): + Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like + extra spaces. + """ + + vocab_files_names = VOCAB_FILES_NAMES + + def __init__( + self, + vocab_file, + do_lower_case=True, + do_basic_tokenize=True, + never_split=None, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + tokenize_chinese_chars=True, + strip_accents=None, + clean_up_tokenization_spaces=True, + **kwargs, + ): + if not os.path.isfile(vocab_file): + raise ValueError( + f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained" + " model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`" + ) + self.vocab = load_vocab(vocab_file) + self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()]) + self.do_basic_tokenize = do_basic_tokenize + if do_basic_tokenize: + self.basic_tokenizer = BasicTokenizer( + do_lower_case=do_lower_case, + never_split=never_split, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + ) + + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token)) + + super().__init__( + do_lower_case=do_lower_case, + do_basic_tokenize=do_basic_tokenize, + never_split=never_split, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, + **kwargs, + ) + + @property + def do_lower_case(self): + return self.basic_tokenizer.do_lower_case + + @property + def vocab_size(self): + return len(self.vocab) + + def get_vocab(self): + return dict(self.vocab, **self.added_tokens_encoder) + + def _tokenize(self, text, split_special_tokens=False): + split_tokens = [] + if self.do_basic_tokenize: + for token in self.basic_tokenizer.tokenize( + text, never_split=self.all_special_tokens if not split_special_tokens else None + ): + # If the token is part of the never_split set + if token in self.basic_tokenizer.never_split: + split_tokens.append(token) + else: + split_tokens += self.wordpiece_tokenizer.tokenize(token) + else: + split_tokens = self.wordpiece_tokenizer.tokenize(text) + return split_tokens + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.vocab.get(token, self.vocab.get(self.unk_token)) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.ids_to_tokens.get(index, self.unk_token) + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + out_string = " ".join(tokens).replace(" ##", "").strip() + return out_string + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + token_ids_1 + sep + + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence + pair mask has the following format: + + ``` + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + ``` + + If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s). + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s). + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + index = 0 + if os.path.isdir(save_directory): + vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + else: + vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory + with open(vocab_file, "w", encoding="utf-8") as writer: + for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning( + f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive." + " Please check that the vocabulary is not corrupted!" + ) + index = token_index + writer.write(token + "\n") + index += 1 + return (vocab_file,) + + +class BasicTokenizer: + """ + Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.). + + Args: + do_lower_case (`bool`, *optional*, defaults to `True`): + Whether or not to lowercase the input when tokenizing. + never_split (`Iterable`, *optional*): + Collection of tokens which will never be split during tokenization. Only has an effect when + `do_basic_tokenize=True` + tokenize_chinese_chars (`bool`, *optional*, defaults to `True`): + Whether or not to tokenize Chinese characters. + + This should likely be deactivated for Japanese (see this + [issue](https://github.com/huggingface/transformers/issues/328)). + strip_accents (`bool`, *optional*): + Whether or not to strip all accents. If this option is not specified, then it will be determined by the + value for `lowercase` (as in the original BERT). + do_split_on_punc (`bool`, *optional*, defaults to `True`): + In some instances we want to skip the basic punctuation splitting so that later tokenization can capture + the full context of the words, such as contractions. + """ + + def __init__( + self, + do_lower_case=True, + never_split=None, + tokenize_chinese_chars=True, + strip_accents=None, + do_split_on_punc=True, + ): + if never_split is None: + never_split = [] + self.do_lower_case = do_lower_case + self.never_split = set(never_split) + self.tokenize_chinese_chars = tokenize_chinese_chars + self.strip_accents = strip_accents + self.do_split_on_punc = do_split_on_punc + + def tokenize(self, text, never_split=None): + """ + Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer. + + Args: + never_split (`List[str]`, *optional*) + Kept for backward compatibility purposes. Now implemented directly at the base class level (see + [`PreTrainedTokenizer.tokenize`]) List of token not to split. + """ + # union() returns a new set by concatenating the two sets. + never_split = self.never_split.union(set(never_split)) if never_split else self.never_split + text = self._clean_text(text) + + # This was added on November 1st, 2018 for the multilingual and Chinese + # models. This is also applied to the English models now, but it doesn't + # matter since the English models were not trained on any Chinese data + # and generally don't have any Chinese data in them (there are Chinese + # characters in the vocabulary because Wikipedia does have some Chinese + # words in the English Wikipedia.). + if self.tokenize_chinese_chars: + text = self._tokenize_chinese_chars(text) + # prevents treating the same character with different unicode codepoints as different characters + unicode_normalized_text = unicodedata.normalize("NFC", text) + orig_tokens = whitespace_tokenize(unicode_normalized_text) + split_tokens = [] + for token in orig_tokens: + if token not in never_split: + if self.do_lower_case: + token = token.lower() + if self.strip_accents is not False: + token = self._run_strip_accents(token) + elif self.strip_accents: + token = self._run_strip_accents(token) + split_tokens.extend(self._run_split_on_punc(token, never_split)) + + output_tokens = whitespace_tokenize(" ".join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output) + + def _run_split_on_punc(self, text, never_split=None): + """Splits punctuation on a piece of text.""" + if not self.do_split_on_punc or (never_split is not None and text in never_split): + return [text] + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return ["".join(x) for x in output] + + def _tokenize_chinese_chars(self, text): + """Adds whitespace around any CJK character.""" + output = [] + for char in text: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(" ") + output.append(char) + output.append(" ") + else: + output.append(char) + return "".join(output) + + def _is_chinese_char(self, cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. + if ( + (cp >= 0x4E00 and cp <= 0x9FFF) + or (cp >= 0x3400 and cp <= 0x4DBF) # + or (cp >= 0x20000 and cp <= 0x2A6DF) # + or (cp >= 0x2A700 and cp <= 0x2B73F) # + or (cp >= 0x2B740 and cp <= 0x2B81F) # + or (cp >= 0x2B820 and cp <= 0x2CEAF) # + or (cp >= 0xF900 and cp <= 0xFAFF) + or (cp >= 0x2F800 and cp <= 0x2FA1F) # + ): # + return True + + return False + + def _clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xFFFD or _is_control(char): + continue + if _is_whitespace(char): + output.append(" ") + else: + output.append(char) + return "".join(output) + + +class WordpieceTokenizer: + """Runs WordPiece tokenization.""" + + def __init__(self, vocab, unk_token, max_input_chars_per_word=100): + self.vocab = vocab + self.unk_token = unk_token + self.max_input_chars_per_word = max_input_chars_per_word + + def tokenize(self, text): + """ + Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform + tokenization using the given vocabulary. + + For example, `input = "unaffable"` wil return as output `["un", "##aff", "##able"]`. + + Args: + text: A single token or whitespace separated tokens. This should have + already been passed through *BasicTokenizer*. + + Returns: + A list of wordpiece tokens. + """ + + output_tokens = [] + for token in whitespace_tokenize(text): + chars = list(token) + if len(chars) > self.max_input_chars_per_word: + output_tokens.append(self.unk_token) + continue + + is_bad = False + start = 0 + sub_tokens = [] + while start < len(chars): + end = len(chars) + cur_substr = None + while start < end: + substr = "".join(chars[start:end]) + if start > 0: + substr = "##" + substr + if substr in self.vocab: + cur_substr = substr + break + end -= 1 + if cur_substr is None: + is_bad = True + break + sub_tokens.append(cur_substr) + start = end + + if is_bad: + output_tokens.append(self.unk_token) + else: + output_tokens.extend(sub_tokens) + return output_tokens + + +__all__ = ["BasicTokenizer", "BertTokenizer", "WordpieceTokenizer"] diff --git a/docs/transformers/build/lib/transformers/models/bert/tokenization_bert_fast.py b/docs/transformers/build/lib/transformers/models/bert/tokenization_bert_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..4a89e6053b988f5b9d8131304c7b3e7e74dba5fd --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bert/tokenization_bert_fast.py @@ -0,0 +1,175 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Fast Tokenization classes for Bert.""" + +import json +from typing import List, Optional, Tuple + +from tokenizers import normalizers + +from ...tokenization_utils_fast import PreTrainedTokenizerFast +from ...utils import logging +from .tokenization_bert import BertTokenizer + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"} + + +class BertTokenizerFast(PreTrainedTokenizerFast): + r""" + Construct a "fast" BERT tokenizer (backed by HuggingFace's *tokenizers* library). Based on WordPiece. + + This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should + refer to this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + File containing the vocabulary. + do_lower_case (`bool`, *optional*, defaults to `True`): + Whether or not to lowercase the input when tokenizing. + unk_token (`str`, *optional*, defaults to `"[UNK]"`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + sep_token (`str`, *optional*, defaults to `"[SEP]"`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + pad_token (`str`, *optional*, defaults to `"[PAD]"`): + The token used for padding, for example when batching sequences of different lengths. + cls_token (`str`, *optional*, defaults to `"[CLS]"`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + mask_token (`str`, *optional*, defaults to `"[MASK]"`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + clean_text (`bool`, *optional*, defaults to `True`): + Whether or not to clean the text before tokenization by removing any control characters and replacing all + whitespaces by the classic one. + tokenize_chinese_chars (`bool`, *optional*, defaults to `True`): + Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see [this + issue](https://github.com/huggingface/transformers/issues/328)). + strip_accents (`bool`, *optional*): + Whether or not to strip all accents. If this option is not specified, then it will be determined by the + value for `lowercase` (as in the original BERT). + wordpieces_prefix (`str`, *optional*, defaults to `"##"`): + The prefix for subwords. + """ + + vocab_files_names = VOCAB_FILES_NAMES + slow_tokenizer_class = BertTokenizer + + def __init__( + self, + vocab_file=None, + tokenizer_file=None, + do_lower_case=True, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + tokenize_chinese_chars=True, + strip_accents=None, + **kwargs, + ): + super().__init__( + vocab_file, + tokenizer_file=tokenizer_file, + do_lower_case=do_lower_case, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + **kwargs, + ) + + normalizer_state = json.loads(self.backend_tokenizer.normalizer.__getstate__()) + if ( + normalizer_state.get("lowercase", do_lower_case) != do_lower_case + or normalizer_state.get("strip_accents", strip_accents) != strip_accents + or normalizer_state.get("handle_chinese_chars", tokenize_chinese_chars) != tokenize_chinese_chars + ): + normalizer_class = getattr(normalizers, normalizer_state.pop("type")) + normalizer_state["lowercase"] = do_lower_case + normalizer_state["strip_accents"] = strip_accents + normalizer_state["handle_chinese_chars"] = tokenize_chinese_chars + self.backend_tokenizer.normalizer = normalizer_class(**normalizer_state) + + self.do_lower_case = do_lower_case + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + + if token_ids_1 is not None: + output += token_ids_1 + [self.sep_token_id] + + return output + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence + pair mask has the following format: + + ``` + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + ``` + + If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s). + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s). + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + files = self._tokenizer.model.save(save_directory, name=filename_prefix) + return tuple(files) + + +__all__ = ["BertTokenizerFast"] diff --git a/docs/transformers/build/lib/transformers/models/bert/tokenization_bert_tf.py b/docs/transformers/build/lib/transformers/models/bert/tokenization_bert_tf.py new file mode 100644 index 0000000000000000000000000000000000000000..e98fe340ee8b2bc90a0c7effa66c90867a42d884 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bert/tokenization_bert_tf.py @@ -0,0 +1,259 @@ +import os +from typing import List, Optional, Union + +import tensorflow as tf +from tensorflow_text import BertTokenizer as BertTokenizerLayer +from tensorflow_text import FastBertTokenizer, ShrinkLongestTrimmer, case_fold_utf8, combine_segments, pad_model_inputs + +from ...modeling_tf_utils import keras +from ...utils.import_utils import requires +from .tokenization_bert import BertTokenizer + + +@requires(backends=("tf", "tensorflow_text")) +class TFBertTokenizer(keras.layers.Layer): + """ + This is an in-graph tokenizer for BERT. It should be initialized similarly to other tokenizers, using the + `from_pretrained()` method. It can also be initialized with the `from_tokenizer()` method, which imports settings + from an existing standard tokenizer object. + + In-graph tokenizers, unlike other Hugging Face tokenizers, are actually Keras layers and are designed to be run + when the model is called, rather than during preprocessing. As a result, they have somewhat more limited options + than standard tokenizer classes. They are most useful when you want to create an end-to-end model that goes + straight from `tf.string` inputs to outputs. + + Args: + vocab_list (`list`): + List containing the vocabulary. + do_lower_case (`bool`, *optional*, defaults to `True`): + Whether or not to lowercase the input when tokenizing. + cls_token_id (`str`, *optional*, defaults to `"[CLS]"`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + sep_token_id (`str`, *optional*, defaults to `"[SEP]"`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + pad_token_id (`str`, *optional*, defaults to `"[PAD]"`): + The token used for padding, for example when batching sequences of different lengths. + padding (`str`, defaults to `"longest"`): + The type of padding to use. Can be either `"longest"`, to pad only up to the longest sample in the batch, + or `"max_length", to pad all inputs to the maximum length supported by the tokenizer. + truncation (`bool`, *optional*, defaults to `True`): + Whether to truncate the sequence to the maximum length. + max_length (`int`, *optional*, defaults to `512`): + The maximum length of the sequence, used for padding (if `padding` is "max_length") and/or truncation (if + `truncation` is `True`). + pad_to_multiple_of (`int`, *optional*, defaults to `None`): + If set, the sequence will be padded to a multiple of this value. + return_token_type_ids (`bool`, *optional*, defaults to `True`): + Whether to return token_type_ids. + return_attention_mask (`bool`, *optional*, defaults to `True`): + Whether to return the attention_mask. + use_fast_bert_tokenizer (`bool`, *optional*, defaults to `True`): + If True, will use the FastBertTokenizer class from Tensorflow Text. If False, will use the BertTokenizer + class instead. BertTokenizer supports some additional options, but is slower and cannot be exported to + TFLite. + """ + + def __init__( + self, + vocab_list: List, + do_lower_case: bool, + cls_token_id: Optional[int] = None, + sep_token_id: Optional[int] = None, + pad_token_id: Optional[int] = None, + padding: str = "longest", + truncation: bool = True, + max_length: int = 512, + pad_to_multiple_of: Optional[int] = None, + return_token_type_ids: bool = True, + return_attention_mask: bool = True, + use_fast_bert_tokenizer: bool = True, + **tokenizer_kwargs, + ): + super().__init__() + if use_fast_bert_tokenizer: + self.tf_tokenizer = FastBertTokenizer( + vocab_list, token_out_type=tf.int64, lower_case_nfd_strip_accents=do_lower_case, **tokenizer_kwargs + ) + else: + lookup_table = tf.lookup.StaticVocabularyTable( + tf.lookup.KeyValueTensorInitializer( + keys=vocab_list, + key_dtype=tf.string, + values=tf.range(tf.size(vocab_list, out_type=tf.int64), dtype=tf.int64), + value_dtype=tf.int64, + ), + num_oov_buckets=1, + ) + self.tf_tokenizer = BertTokenizerLayer( + lookup_table, token_out_type=tf.int64, lower_case=do_lower_case, **tokenizer_kwargs + ) + + self.vocab_list = vocab_list + self.do_lower_case = do_lower_case + self.cls_token_id = vocab_list.index("[CLS]") if cls_token_id is None else cls_token_id + self.sep_token_id = vocab_list.index("[SEP]") if sep_token_id is None else sep_token_id + self.pad_token_id = vocab_list.index("[PAD]") if pad_token_id is None else pad_token_id + self.paired_trimmer = ShrinkLongestTrimmer(max_length - 3, axis=1) # Allow room for special tokens + self.max_length = max_length + self.padding = padding + self.truncation = truncation + self.pad_to_multiple_of = pad_to_multiple_of + self.return_token_type_ids = return_token_type_ids + self.return_attention_mask = return_attention_mask + + @classmethod + def from_tokenizer(cls, tokenizer: "PreTrainedTokenizerBase", **kwargs): # noqa: F821 + """ + Initialize a `TFBertTokenizer` from an existing `Tokenizer`. + + Args: + tokenizer (`PreTrainedTokenizerBase`): + The tokenizer to use to initialize the `TFBertTokenizer`. + + Examples: + + ```python + from transformers import AutoTokenizer, TFBertTokenizer + + tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased") + tf_tokenizer = TFBertTokenizer.from_tokenizer(tokenizer) + ``` + """ + do_lower_case = kwargs.pop("do_lower_case", None) + do_lower_case = tokenizer.do_lower_case if do_lower_case is None else do_lower_case + cls_token_id = kwargs.pop("cls_token_id", None) + cls_token_id = tokenizer.cls_token_id if cls_token_id is None else cls_token_id + sep_token_id = kwargs.pop("sep_token_id", None) + sep_token_id = tokenizer.sep_token_id if sep_token_id is None else sep_token_id + pad_token_id = kwargs.pop("pad_token_id", None) + pad_token_id = tokenizer.pad_token_id if pad_token_id is None else pad_token_id + + vocab = tokenizer.get_vocab() + vocab = sorted(vocab.items(), key=lambda x: x[1]) + vocab_list = [entry[0] for entry in vocab] + return cls( + vocab_list=vocab_list, + do_lower_case=do_lower_case, + cls_token_id=cls_token_id, + sep_token_id=sep_token_id, + pad_token_id=pad_token_id, + **kwargs, + ) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], *init_inputs, **kwargs): + """ + Instantiate a `TFBertTokenizer` from a pre-trained tokenizer. + + Args: + pretrained_model_name_or_path (`str` or `os.PathLike`): + The name or path to the pre-trained tokenizer. + + Examples: + + ```python + from transformers import TFBertTokenizer + + tf_tokenizer = TFBertTokenizer.from_pretrained("google-bert/bert-base-uncased") + ``` + """ + try: + tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path, *init_inputs, **kwargs) + except: # noqa: E722 + from .tokenization_bert_fast import BertTokenizerFast + + tokenizer = BertTokenizerFast.from_pretrained(pretrained_model_name_or_path, *init_inputs, **kwargs) + return cls.from_tokenizer(tokenizer, **kwargs) + + def unpaired_tokenize(self, texts): + if self.do_lower_case: + texts = case_fold_utf8(texts) + tokens = self.tf_tokenizer.tokenize(texts) + return tokens.merge_dims(1, -1) + + def call( + self, + text, + text_pair=None, + padding=None, + truncation=None, + max_length=None, + pad_to_multiple_of=None, + return_token_type_ids=None, + return_attention_mask=None, + ): + if padding is None: + padding = self.padding + if padding not in ("longest", "max_length"): + raise ValueError("Padding must be either 'longest' or 'max_length'!") + if max_length is not None and text_pair is not None: + # Because we have to instantiate a Trimmer to do it properly + raise ValueError("max_length cannot be overridden at call time when truncating paired texts!") + if max_length is None: + max_length = self.max_length + if truncation is None: + truncation = self.truncation + if pad_to_multiple_of is None: + pad_to_multiple_of = self.pad_to_multiple_of + if return_token_type_ids is None: + return_token_type_ids = self.return_token_type_ids + if return_attention_mask is None: + return_attention_mask = self.return_attention_mask + if not isinstance(text, tf.Tensor): + text = tf.convert_to_tensor(text) + if text_pair is not None and not isinstance(text_pair, tf.Tensor): + text_pair = tf.convert_to_tensor(text_pair) + if text_pair is not None: + if text.shape.rank > 1: + raise ValueError("text argument should not be multidimensional when a text pair is supplied!") + if text_pair.shape.rank > 1: + raise ValueError("text_pair should not be multidimensional!") + if text.shape.rank == 2: + text, text_pair = text[:, 0], text[:, 1] + text = self.unpaired_tokenize(text) + if text_pair is None: # Unpaired text + if truncation: + text = text[:, : max_length - 2] # Allow room for special tokens + input_ids, token_type_ids = combine_segments( + (text,), start_of_sequence_id=self.cls_token_id, end_of_segment_id=self.sep_token_id + ) + else: # Paired text + text_pair = self.unpaired_tokenize(text_pair) + if truncation: + text, text_pair = self.paired_trimmer.trim([text, text_pair]) + input_ids, token_type_ids = combine_segments( + (text, text_pair), start_of_sequence_id=self.cls_token_id, end_of_segment_id=self.sep_token_id + ) + if padding == "longest": + pad_length = input_ids.bounding_shape(axis=1) + if pad_to_multiple_of is not None: + # No ceiling division in tensorflow, so we negate floordiv instead + pad_length = pad_to_multiple_of * (-tf.math.floordiv(-pad_length, pad_to_multiple_of)) + else: + pad_length = max_length + + input_ids, attention_mask = pad_model_inputs(input_ids, max_seq_length=pad_length, pad_value=self.pad_token_id) + output = {"input_ids": input_ids} + if return_attention_mask: + output["attention_mask"] = attention_mask + if return_token_type_ids: + token_type_ids, _ = pad_model_inputs( + token_type_ids, max_seq_length=pad_length, pad_value=self.pad_token_id + ) + output["token_type_ids"] = token_type_ids + return output + + def get_config(self): + return { + "vocab_list": self.vocab_list, + "do_lower_case": self.do_lower_case, + "cls_token_id": self.cls_token_id, + "sep_token_id": self.sep_token_id, + "pad_token_id": self.pad_token_id, + } + + +__all__ = ["TFBertTokenizer"] diff --git a/docs/transformers/build/lib/transformers/models/bert_generation/__init__.py b/docs/transformers/build/lib/transformers/models/bert_generation/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..3f83b1f6e5bba323d5636820ef89026f072be816 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bert_generation/__init__.py @@ -0,0 +1,28 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_bert_generation import * + from .modeling_bert_generation import * + from .tokenization_bert_generation import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/docs/transformers/build/lib/transformers/models/bert_generation/configuration_bert_generation.py b/docs/transformers/build/lib/transformers/models/bert_generation/configuration_bert_generation.py new file mode 100644 index 0000000000000000000000000000000000000000..1abe7c1a1c44ab206f4e3ac459ef599ec007b9bb --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bert_generation/configuration_bert_generation.py @@ -0,0 +1,127 @@ +# coding=utf-8 +# Copyright 2020 The Google AI Language Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""BertGeneration model configuration""" + +from ...configuration_utils import PretrainedConfig + + +class BertGenerationConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`BertGenerationPreTrainedModel`]. It is used to + instantiate a BertGeneration model according to the specified arguments, defining the model architecture. + Instantiating a configuration with the defaults will yield a similar configuration to that of the BertGeneration + [google/bert_for_seq_generation_L-24_bbc_encoder](https://huggingface.co/google/bert_for_seq_generation_L-24_bbc_encoder) + architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vocab_size (`int`, *optional*, defaults to 50358): + Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`BertGeneration`]. + hidden_size (`int`, *optional*, defaults to 1024): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 24): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 4096): + Dimensionality of the "intermediate" (often called feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + pad_token_id (`int`, *optional*, defaults to 0): + Padding token id. + bos_token_id (`int`, *optional*, defaults to 2): + Beginning of stream token id. + eos_token_id (`int`, *optional*, defaults to 1): + End of stream token id. + position_embedding_type (`str`, *optional*, defaults to `"absolute"`): + Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For + positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to + [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). + For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models + with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658). + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + + Examples: + + ```python + >>> from transformers import BertGenerationConfig, BertGenerationEncoder + + >>> # Initializing a BertGeneration config + >>> configuration = BertGenerationConfig() + + >>> # Initializing a model (with random weights) from the config + >>> model = BertGenerationEncoder(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "bert-generation" + + def __init__( + self, + vocab_size=50358, + hidden_size=1024, + num_hidden_layers=24, + num_attention_heads=16, + intermediate_size=4096, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=0, + bos_token_id=2, + eos_token_id=1, + position_embedding_type="absolute", + use_cache=True, + **kwargs, + ): + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.position_embedding_type = position_embedding_type + self.use_cache = use_cache + + +__all__ = ["BertGenerationConfig"] diff --git a/docs/transformers/build/lib/transformers/models/bert_generation/modeling_bert_generation.py b/docs/transformers/build/lib/transformers/models/bert_generation/modeling_bert_generation.py new file mode 100644 index 0000000000000000000000000000000000000000..75fd1b17168e10740cb2e90fc8357cedf12cfe8a --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bert_generation/modeling_bert_generation.py @@ -0,0 +1,1011 @@ +# coding=utf-8 +# Copyright 2020 The Google AI Language Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch BERT model specific for generation.""" + +import math +from typing import Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn + +from ...activations import ACT2FN +from ...generation import GenerationMixin +from ...modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, CausalLMOutputWithCrossAttentions +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer +from ...utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from .configuration_bert_generation import BertGenerationConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "google/bert_for_seq_generation_L-24_bbc_encoder" +_CONFIG_FOR_DOC = "BertGenerationConfig" + + +# Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert->BertGeneration +class BertGenerationSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertSelfAttention with Bert->BertGeneration +class BertGenerationSelfAttention(nn.Module): + def __init__(self, config, position_embedding_type=None): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = position_embedding_type or getattr( + config, "position_embedding_type", "absolute" + ) + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + + self.is_decoder = config.is_decoder + + def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + use_cache = past_key_value is not None + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + query_length, key_length = query_layer.shape[2], key_layer.shape[2] + if use_cache: + position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view( + -1, 1 + ) + else: + position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BertGenerationModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + if self.is_decoder: + outputs = outputs + (past_key_value,) + return outputs + + +BERT_GENERATION_SELF_ATTENTION_CLASSES = { + "eager": BertGenerationSelfAttention, +} + + +# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->BertGeneration,BERT->BERT_GENERATION +class BertGenerationAttention(nn.Module): + def __init__(self, config, position_embedding_type=None): + super().__init__() + self.self = BERT_GENERATION_SELF_ATTENTION_CLASSES[config._attn_implementation]( + config, position_embedding_type=position_embedding_type + ) + self.output = BertGenerationSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->BertGeneration +class BertGenerationIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->BertGeneration +class BertGenerationOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertLayer with Bert->BertGeneration +class BertGenerationLayer(nn.Module): + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = BertGenerationAttention(config) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + if not self.is_decoder: + raise ValueError(f"{self} should be used as a decoder model if cross attention is added") + self.crossattention = BertGenerationAttention(config, position_embedding_type="absolute") + self.intermediate = BertGenerationIntermediate(config) + self.output = BertGenerationOutput(config) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + + # if decoder, the last output is tuple of self-attn cache + if self.is_decoder: + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + else: + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + cross_attn_present_key_value = None + if self.is_decoder and encoder_hidden_states is not None: + if not hasattr(self, "crossattention"): + raise ValueError( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers" + " by setting `config.add_cross_attention=True`" + ) + + # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + cross_attn_past_key_value, + output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights + + # add cross-attn cache to positions 3,4 of present_key_value tuple + cross_attn_present_key_value = cross_attention_outputs[-1] + present_key_value = present_key_value + cross_attn_present_key_value + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output + ) + outputs = (layer_output,) + outputs + + # if decoder, return the attn key/values as the last output + if self.is_decoder: + outputs = outputs + (present_key_value,) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +# Copied from transformers.models.bert.modeling_bert.BertEncoder with Bert->BertGeneration +class BertEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList([BertGenerationLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + return_dict: Optional[bool] = True, + ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + next_decoder_cache = () if use_cache else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + layer_module.__call__, + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +def load_tf_weights_in_bert_generation( + model, tf_hub_path, model_class, is_encoder_named_decoder=False, is_encoder=False +): + try: + import numpy as np + import tensorflow.compat.v1 as tf + import tensorflow_hub as hub + import tensorflow_text # noqa: F401 + + tf.disable_eager_execution() + except ImportError: + logger.error( + "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions." + ) + raise + tf_model = hub.Module(tf_hub_path) + init = tf.global_variables_initializer() + with tf.Session() as sess: + init.run() + all_variables = tf_model.variable_map + keep_track_variables = all_variables.copy() + for key in list(all_variables.keys()): + if "global" in key: + logger.info(f"Skipping {key}...") + continue + if not is_encoder: + model_pointer = getattr(model, model_class) + else: + model_pointer = model + is_embedding = False + logger.info(f"Trying to match {key}...") + # remove start_string = "module/bert/" + sub_layers = key.split("/")[2:] + if is_encoder_named_decoder and sub_layers[0] == "encoder": + logger.info(f"Skipping encoder layer {key} for decoder") + continue + if is_encoder and sub_layers[0] == "decoder": + logger.info(f"Skipping decoder layer {key} for encoder") + continue + for i, sub_layer in enumerate(sub_layers): + if sub_layer == "embeddings": + is_embedding = True + elif sub_layer == "LayerNorm": + is_embedding = False + if "layer" in sub_layer: + model_pointer = model_pointer.layer[int(sub_layer.split("_")[-1])] + elif sub_layer in ["kernel", "gamma"]: + model_pointer = model_pointer.weight + elif sub_layer == "beta": + model_pointer = model_pointer.bias + elif sub_layer == "encdec": + model_pointer = model_pointer.crossattention.self + elif sub_layer == "encdec_output": + model_pointer = model_pointer.crossattention.output + elif is_encoder_named_decoder and sub_layer == "decoder": + model_pointer = model_pointer.encoder + else: + if sub_layer == "attention" and "encdec" in sub_layers[i + 1]: + continue + try: + model_pointer = getattr(model_pointer, sub_layer) + except AttributeError: + logger.info(f"Skipping to initialize {key} at {sub_layer}...") + raise AttributeError + + array = np.asarray(sess.run(all_variables[key])) + if not is_embedding: + logger.info(f"Transposing numpy weight of shape {array.shape} for {key}") + array = np.transpose(array) + else: + model_pointer = model_pointer.weight + + if model_pointer.shape != array.shape: + raise ValueError(f"Pointer shape {model_pointer.shape} and array shape {array.shape} mismatched") + logger.info(f"Initialize PyTorch weight {key}") + + model_pointer.data = torch.from_numpy(array.astype(np.float32)) + keep_track_variables.pop(key, None) + + logger.info(f"Weights not copied to PyTorch model: {', '.join(keep_track_variables.keys())}") + return model + + +class BertGenerationEmbeddings(nn.Module): + """Construct the embeddings from word and position embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) + + def forward(self, input_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0): + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length] + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + position_embeddings = self.position_embeddings(position_ids) + + embeddings = inputs_embeds + position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class BertGenerationPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = BertGenerationConfig + base_model_prefix = "bert" + supports_gradient_checkpointing = True + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, BertGenerationOnlyLMHead): + module.bias.data.zero_() + + +BERT_GENERATION_START_DOCSTRING = r""" + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`BertGenerationConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +BERT_GENERATION_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and + [`PreTrainedTokenizer.encode`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + position_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare BertGeneration model transformer outputting raw hidden-states without any specific head on top.", + BERT_GENERATION_START_DOCSTRING, +) +class BertGenerationEncoder(BertGenerationPreTrainedModel): + """ + + The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of + cross-attention is added between the self-attention layers, following the architecture described in [Attention is + all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, + Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. + + This model should be used when leveraging Bert or Roberta checkpoints for the [`EncoderDecoderModel`] class as + described in [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) + by Sascha Rothe, Shashi Narayan, and Aliaksei Severyn. + + To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set + to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and + `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass. + """ + + def __init__(self, config): + super().__init__(config) + self.config = config + + self.embeddings = BertGenerationEmbeddings(config) + self.encoder = BertEncoder(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward(BERT_GENERATION_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutputWithPastAndCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, # NOOP kwargs, for now + ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]: + r""" + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: `1` for + tokens that are NOT MASKED, `0` for MASKED tokens. + past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.config.is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + batch_size, seq_length = input_shape + device = input_ids.device if input_ids is not None else inputs_embeds.device + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + + if attention_mask is None: + attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + + if not return_dict: + return (sequence_output,) + encoder_outputs[1:] + + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=sequence_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + +class BertGenerationOnlyLMHead(nn.Module): + def __init__(self, config): + super().__init__() + self.decoder = nn.Linear(config.hidden_size, config.vocab_size) + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + self.decoder.bias = self.bias + + def forward(self, hidden_states): + logits = self.decoder(hidden_states) + return logits + + def _tie_weights(self): + # For accelerate compatibility and to not break backward compatibility + if self.decoder.bias.device.type == "meta": + self.decoder.bias = self.bias + else: + # To tie those two weights if they get disconnected (on TPU or when the bias is resized) + self.bias = self.decoder.bias + + +@add_start_docstrings( + """BertGeneration Model with a `language modeling` head on top for CLM fine-tuning.""", + BERT_GENERATION_START_DOCSTRING, +) +class BertGenerationDecoder(BertGenerationPreTrainedModel, GenerationMixin): + _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"] + + def __init__(self, config): + super().__init__(config) + + if not config.is_decoder: + logger.warning("If you want to use `BertGenerationDecoder` as a standalone, add `is_decoder=True.`") + + self.bert = BertGenerationEncoder(config) + self.lm_head = BertGenerationOnlyLMHead(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.lm_head.decoder + + def set_output_embeddings(self, new_embeddings): + self.lm_head.decoder = new_embeddings + self.lm_head.bias = new_embeddings.bias + + @add_start_docstrings_to_model_forward(BERT_GENERATION_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, + ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]: + r""" + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in + `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are + ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, BertGenerationDecoder, BertGenerationConfig + >>> import torch + + >>> tokenizer = AutoTokenizer.from_pretrained("google/bert_for_seq_generation_L-24_bbc_encoder") + >>> config = BertGenerationConfig.from_pretrained("google/bert_for_seq_generation_L-24_bbc_encoder") + >>> config.is_decoder = True + >>> model = BertGenerationDecoder.from_pretrained( + ... "google/bert_for_seq_generation_L-24_bbc_encoder", config=config + ... ) + + >>> inputs = tokenizer("Hello, my dog is cute", return_token_type_ids=False, return_tensors="pt") + >>> outputs = model(**inputs) + + >>> prediction_logits = outputs.logits + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None: + use_cache = False + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + **kwargs, + ) + + sequence_output = outputs[0] + prediction_scores = self.lm_head(sequence_output) + + lm_loss = None + if labels is not None: + lm_loss = self.loss_function( + prediction_scores, + labels, + vocab_size=self.config.vocab_size, + **kwargs, + ) + + if not return_dict: + output = (prediction_scores,) + outputs[1:] + return ((lm_loss,) + output) if lm_loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=lm_loss, + logits=prediction_scores, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + def _reorder_cache(self, past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past + + +__all__ = [ + "BertGenerationDecoder", + "BertGenerationEncoder", + "BertGenerationPreTrainedModel", + "load_tf_weights_in_bert_generation", +] diff --git a/docs/transformers/build/lib/transformers/models/bert_generation/tokenization_bert_generation.py b/docs/transformers/build/lib/transformers/models/bert_generation/tokenization_bert_generation.py new file mode 100644 index 0000000000000000000000000000000000000000..727727d4a11d6127872224f64db93772944b57a3 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bert_generation/tokenization_bert_generation.py @@ -0,0 +1,177 @@ +# coding=utf-8 +# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization class for model BertGeneration.""" + +import os +from shutil import copyfile +from typing import Any, Dict, List, Optional, Tuple + +import sentencepiece as spm + +from ...tokenization_utils import PreTrainedTokenizer +from ...utils import logging +from ...utils.import_utils import requires + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"} + + +@requires(backends=("sentencepiece",)) +class BertGenerationTokenizer(PreTrainedTokenizer): + """ + Construct a BertGeneration tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece). + + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that + contains the vocabulary necessary to instantiate a tokenizer. + bos_token (`str`, *optional*, defaults to `""`): + The begin of sequence token. + eos_token (`str`, *optional*, defaults to `""`): + The end of sequence token. + unk_token (`str`, *optional*, defaults to `""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + pad_token (`str`, *optional*, defaults to `""`): + The token used for padding, for example when batching sequences of different lengths. + sep_token (`str`, *optional*, defaults to `"<::::>"`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + sp_model_kwargs (`dict`, *optional*): + Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for + SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, + to set: + + - `enable_sampling`: Enable subword regularization. + - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - `nbest_size = {0,1}`: No sampling is performed. + - `nbest_size > 1`: samples from the nbest_size results. + - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. + + - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. + """ + + vocab_files_names = VOCAB_FILES_NAMES + prefix_tokens: List[int] = [] + model_input_names = ["input_ids", "attention_mask"] + + def __init__( + self, + vocab_file, + bos_token="", + eos_token="", + unk_token="", + pad_token="", + sep_token="<::::>", + sp_model_kwargs: Optional[Dict[str, Any]] = None, + **kwargs, + ) -> None: + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + + self.vocab_file = vocab_file + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(vocab_file) + + # Add extra_ids to the special token list + super().__init__( + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + pad_token=pad_token, + sep_token=sep_token, + sp_model_kwargs=self.sp_model_kwargs, + **kwargs, + ) + + @property + def vocab_size(self): + return self.sp_model.get_piece_size() + + def get_vocab(self): + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def __getstate__(self): + state = self.__dict__.copy() + state["sp_model"] = None + return state + + def __setstate__(self, d): + self.__dict__ = d + + # for backward compatibility + if not hasattr(self, "sp_model_kwargs"): + self.sp_model_kwargs = {} + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(self.vocab_file) + + def _tokenize(self, text: str) -> List[str]: + """Take as input a string and return a list of strings (tokens) for words/sub-words""" + return self.sp_model.encode(text, out_type=str) + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.sp_model.piece_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + token = self.sp_model.IdToPiece(index) + return token + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + current_sub_tokens = [] + out_string = "" + for token in tokens: + # make sure that special tokens are not decoded using sentencepiece model + if token in self.all_special_tokens: + out_string += self.sp_model.decode(current_sub_tokens) + token + current_sub_tokens = [] + else: + current_sub_tokens.append(token) + out_string += self.sp_model.decode(current_sub_tokens) + return out_string.strip() + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file): + copyfile(self.vocab_file, out_vocab_file) + elif not os.path.isfile(self.vocab_file): + with open(out_vocab_file, "wb") as fi: + content_spiece_model = self.sp_model.serialized_model_proto() + fi.write(content_spiece_model) + + return (out_vocab_file,) + + +__all__ = ["BertGenerationTokenizer"] diff --git a/docs/transformers/build/lib/transformers/models/bert_japanese/__init__.py b/docs/transformers/build/lib/transformers/models/bert_japanese/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..f5296087db1d007eab946f795d0c9c8fa4bdaafe --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bert_japanese/__init__.py @@ -0,0 +1,26 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .tokenization_bert_japanese import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/docs/transformers/build/lib/transformers/models/bert_japanese/tokenization_bert_japanese.py b/docs/transformers/build/lib/transformers/models/bert_japanese/tokenization_bert_japanese.py new file mode 100644 index 0000000000000000000000000000000000000000..8a841a3091623d2e50fd62e91a152238789f0159 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bert_japanese/tokenization_bert_japanese.py @@ -0,0 +1,982 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes.""" + +import collections +import copy +import os +import unicodedata +from typing import Any, Dict, List, Optional, Tuple + +from ...tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace +from ...utils import is_sentencepiece_available, is_sudachi_projection_available, logging + + +if is_sentencepiece_available(): + import sentencepiece as spm +else: + spm = None + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "spm_file": "spiece.model"} + +SPIECE_UNDERLINE = "▁" + + +# Copied from transformers.models.bert.tokenization_bert.load_vocab +def load_vocab(vocab_file): + """Loads a vocabulary file into a dictionary.""" + vocab = collections.OrderedDict() + with open(vocab_file, "r", encoding="utf-8") as reader: + tokens = reader.readlines() + for index, token in enumerate(tokens): + token = token.rstrip("\n") + vocab[token] = index + return vocab + + +# Copied from transformers.models.bert.tokenization_bert.whitespace_tokenize +def whitespace_tokenize(text): + """Runs basic whitespace cleaning and splitting on a piece of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + + +class BertJapaneseTokenizer(PreTrainedTokenizer): + r""" + Construct a BERT tokenizer for Japanese text. + + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer + to: this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + Path to a one-wordpiece-per-line vocabulary file. + spm_file (`str`, *optional*): + Path to [SentencePiece](https://github.com/google/sentencepiece) file (generally has a .spm or .model + extension) that contains the vocabulary. + do_lower_case (`bool`, *optional*, defaults to `True`): + Whether to lower case the input. Only has an effect when do_basic_tokenize=True. + do_word_tokenize (`bool`, *optional*, defaults to `True`): + Whether to do word tokenization. + do_subword_tokenize (`bool`, *optional*, defaults to `True`): + Whether to do subword tokenization. + word_tokenizer_type (`str`, *optional*, defaults to `"basic"`): + Type of word tokenizer. Choose from ["basic", "mecab", "sudachi", "jumanpp"]. + subword_tokenizer_type (`str`, *optional*, defaults to `"wordpiece"`): + Type of subword tokenizer. Choose from ["wordpiece", "character", "sentencepiece",]. + mecab_kwargs (`dict`, *optional*): + Dictionary passed to the `MecabTokenizer` constructor. + sudachi_kwargs (`dict`, *optional*): + Dictionary passed to the `SudachiTokenizer` constructor. + jumanpp_kwargs (`dict`, *optional*): + Dictionary passed to the `JumanppTokenizer` constructor. + """ + + vocab_files_names = VOCAB_FILES_NAMES + + def __init__( + self, + vocab_file, + spm_file=None, + do_lower_case=False, + do_word_tokenize=True, + do_subword_tokenize=True, + word_tokenizer_type="basic", + subword_tokenizer_type="wordpiece", + never_split=None, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + mecab_kwargs=None, + sudachi_kwargs=None, + jumanpp_kwargs=None, + **kwargs, + ): + if subword_tokenizer_type == "sentencepiece": + if not os.path.isfile(spm_file): + raise ValueError( + f"Can't find a vocabulary file at path '{spm_file}'. To load the vocabulary from a Google" + " pretrained model use `tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`" + ) + self.spm_file = spm_file + else: + if not os.path.isfile(vocab_file): + raise ValueError( + f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google" + " pretrained model use `tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`" + ) + self.vocab = load_vocab(vocab_file) + self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()]) + + self.do_word_tokenize = do_word_tokenize + self.word_tokenizer_type = word_tokenizer_type + self.lower_case = do_lower_case + self.never_split = never_split + self.mecab_kwargs = copy.deepcopy(mecab_kwargs) + self.sudachi_kwargs = copy.deepcopy(sudachi_kwargs) + self.jumanpp_kwargs = copy.deepcopy(jumanpp_kwargs) + if do_word_tokenize: + if word_tokenizer_type == "basic": + self.word_tokenizer = BasicTokenizer( + do_lower_case=do_lower_case, never_split=never_split, tokenize_chinese_chars=False + ) + elif word_tokenizer_type == "mecab": + self.word_tokenizer = MecabTokenizer( + do_lower_case=do_lower_case, never_split=never_split, **(mecab_kwargs or {}) + ) + elif word_tokenizer_type == "sudachi": + self.word_tokenizer = SudachiTokenizer( + do_lower_case=do_lower_case, never_split=never_split, **(sudachi_kwargs or {}) + ) + elif word_tokenizer_type == "jumanpp": + self.word_tokenizer = JumanppTokenizer( + do_lower_case=do_lower_case, never_split=never_split, **(jumanpp_kwargs or {}) + ) + else: + raise ValueError(f"Invalid word_tokenizer_type '{word_tokenizer_type}' is specified.") + + self.do_subword_tokenize = do_subword_tokenize + self.subword_tokenizer_type = subword_tokenizer_type + if do_subword_tokenize: + if subword_tokenizer_type == "wordpiece": + self.subword_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token)) + elif subword_tokenizer_type == "character": + self.subword_tokenizer = CharacterTokenizer(vocab=self.vocab, unk_token=str(unk_token)) + elif subword_tokenizer_type == "sentencepiece": + self.subword_tokenizer = SentencepieceTokenizer(vocab=self.spm_file, unk_token=str(unk_token)) + else: + raise ValueError(f"Invalid subword_tokenizer_type '{subword_tokenizer_type}' is specified.") + super().__init__( + spm_file=spm_file, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + do_lower_case=do_lower_case, + do_word_tokenize=do_word_tokenize, + do_subword_tokenize=do_subword_tokenize, + word_tokenizer_type=word_tokenizer_type, + subword_tokenizer_type=subword_tokenizer_type, + never_split=never_split, + mecab_kwargs=mecab_kwargs, + sudachi_kwargs=sudachi_kwargs, + jumanpp_kwargs=jumanpp_kwargs, + **kwargs, + ) + + @property + def do_lower_case(self): + return self.lower_case + + def __getstate__(self): + state = dict(self.__dict__) + if self.word_tokenizer_type in ["mecab", "sudachi", "jumanpp"]: + del state["word_tokenizer"] + return state + + def __setstate__(self, state): + self.__dict__ = state + if self.word_tokenizer_type == "mecab": + self.word_tokenizer = MecabTokenizer( + do_lower_case=self.do_lower_case, never_split=self.never_split, **(self.mecab_kwargs or {}) + ) + elif self.word_tokenizer_type == "sudachi": + self.word_tokenizer = SudachiTokenizer( + do_lower_case=self.do_lower_case, never_split=self.never_split, **(self.sudachi_kwargs or {}) + ) + elif self.word_tokenizer_type == "jumanpp": + self.word_tokenizer = JumanppTokenizer( + do_lower_case=self.do_lower_case, never_split=self.never_split, **(self.jumanpp_kwargs or {}) + ) + + def _tokenize(self, text): + if self.do_word_tokenize: + tokens = self.word_tokenizer.tokenize(text, never_split=self.all_special_tokens) + else: + tokens = [text] + + if self.do_subword_tokenize: + split_tokens = [sub_token for token in tokens for sub_token in self.subword_tokenizer.tokenize(token)] + else: + split_tokens = tokens + + return split_tokens + + @property + def vocab_size(self): + if self.subword_tokenizer_type == "sentencepiece": + return len(self.subword_tokenizer.sp_model) + return len(self.vocab) + + def get_vocab(self): + if self.subword_tokenizer_type == "sentencepiece": + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + return dict(self.vocab, **self.added_tokens_encoder) + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + if self.subword_tokenizer_type == "sentencepiece": + return self.subword_tokenizer.sp_model.PieceToId(token) + return self.vocab.get(token, self.vocab.get(self.unk_token)) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + if self.subword_tokenizer_type == "sentencepiece": + return self.subword_tokenizer.sp_model.IdToPiece(index) + return self.ids_to_tokens.get(index, self.unk_token) + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + if self.subword_tokenizer_type == "sentencepiece": + return self.subword_tokenizer.sp_model.decode(tokens) + out_string = " ".join(tokens).replace(" ##", "").strip() + return out_string + + # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.build_inputs_with_special_tokens + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + token_ids_1 + sep + + # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.get_special_tokens_mask + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + + # Copied from transformers.models.bert.tokenization_bert.BertTokenizer.create_token_type_ids_from_sequences + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence + pair mask has the following format: + + ``` + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + ``` + + If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s). + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s). + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + if os.path.isdir(save_directory): + if self.subword_tokenizer_type == "sentencepiece": + vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["spm_file"] + ) + else: + vocab_file = os.path.join( + save_directory, + (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"], + ) + else: + vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory + + if self.subword_tokenizer_type == "sentencepiece": + with open(vocab_file, "wb") as writer: + content_spiece_model = self.subword_tokenizer.sp_model.serialized_model_proto() + writer.write(content_spiece_model) + else: + with open(vocab_file, "w", encoding="utf-8") as writer: + index = 0 + for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning( + f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive." + " Please check that the vocabulary is not corrupted!" + ) + index = token_index + writer.write(token + "\n") + index += 1 + return (vocab_file,) + + +class MecabTokenizer: + """Runs basic tokenization with MeCab morphological parser.""" + + def __init__( + self, + do_lower_case=False, + never_split=None, + normalize_text=True, + mecab_dic: Optional[str] = "unidic_lite", + mecab_option: Optional[str] = None, + ): + """ + Constructs a MecabTokenizer. + + Args: + **do_lower_case**: (*optional*) boolean (default True) + Whether to lowercase the input. + **never_split**: (*optional*) list of str + Kept for backward compatibility purposes. Now implemented directly at the base class level (see + [`PreTrainedTokenizer.tokenize`]) List of tokens not to split. + **normalize_text**: (*optional*) boolean (default True) + Whether to apply unicode normalization to text before tokenization. + **mecab_dic**: (*optional*) string (default "ipadic") + Name of dictionary to be used for MeCab initialization. If you are using a system-installed dictionary, + set this option to `None` and modify *mecab_option*. + **mecab_option**: (*optional*) string + String passed to MeCab constructor. + """ + self.do_lower_case = do_lower_case + self.never_split = never_split if never_split is not None else [] + self.normalize_text = normalize_text + + try: + import fugashi + except ModuleNotFoundError as error: + raise error.__class__( + "You need to install fugashi to use MecabTokenizer. " + "See https://pypi.org/project/fugashi/ for installation." + ) + + mecab_option = mecab_option or "" + + if mecab_dic is not None: + if mecab_dic == "ipadic": + try: + import ipadic + except ModuleNotFoundError as error: + raise error.__class__( + "The ipadic dictionary is not installed. " + "See https://github.com/polm/ipadic-py for installation." + ) + + dic_dir = ipadic.DICDIR + + elif mecab_dic == "unidic_lite": + try: + import unidic_lite + except ModuleNotFoundError as error: + raise error.__class__( + "The unidic_lite dictionary is not installed. " + "See https://github.com/polm/unidic-lite for installation." + ) + + dic_dir = unidic_lite.DICDIR + + elif mecab_dic == "unidic": + try: + import unidic + except ModuleNotFoundError as error: + raise error.__class__( + "The unidic dictionary is not installed. " + "See https://github.com/polm/unidic-py for installation." + ) + + dic_dir = unidic.DICDIR + if not os.path.isdir(dic_dir): + raise RuntimeError( + "The unidic dictionary itself is not found. " + "See https://github.com/polm/unidic-py for installation." + ) + + else: + raise ValueError("Invalid mecab_dic is specified.") + + mecabrc = os.path.join(dic_dir, "mecabrc") + mecab_option = f'-d "{dic_dir}" -r "{mecabrc}" ' + mecab_option + + self.mecab = fugashi.GenericTagger(mecab_option) + + def tokenize(self, text, never_split=None, **kwargs): + """Tokenizes a piece of text.""" + if self.normalize_text: + text = unicodedata.normalize("NFKC", text) + + never_split = self.never_split + (never_split if never_split is not None else []) + tokens = [] + + for word in self.mecab(text): + token = word.surface + + if self.do_lower_case and token not in never_split: + token = token.lower() + + tokens.append(token) + + return tokens + + +class SudachiTokenizer: + """Runs basic tokenization with Sudachi morphological parser.""" + + def __init__( + self, + do_lower_case=False, + never_split=None, + normalize_text=True, + trim_whitespace=False, + sudachi_split_mode="A", + sudachi_config_path=None, + sudachi_resource_dir=None, + sudachi_dict_type="core", + sudachi_projection=None, + ): + """ + Constructs a SudachiTokenizer. + + Args: + **do_lower_case**: (*optional*) boolean (default True) + Whether to lowercase the input. + **never_split**: (*optional*) list of str + Kept for backward compatibility purposes. Now implemented directly at the base class level (see + [`PreTrainedTokenizer.tokenize`]) List of tokens not to split. + **normalize_text**: (*optional*) boolean (default True) + Whether to apply unicode normalization to text before tokenization. + **trim_whitespace**: (*optional*) boolean (default False) + Whether to trim all whitespace, tab, newline from tokens. + **sudachi_split_mode**: (*optional*) string + Split mode of sudachi, choose from `["A", "B", "C"]`. + **sudachi_config_path**: (*optional*) string + **sudachi_resource_dir**: (*optional*) string + **sudachi_dict_type**: (*optional*) string + dict type of sudachi, choose from `["small", "core", "full"]`. + **sudachi_projection**: (*optional*) string + Word projection mode of sudachi, choose from `["surface", "normalized", "reading", "dictionary", "dictionary_and_surface", "normalized_and_surface", "normalized_nouns"]`. + """ + + self.do_lower_case = do_lower_case + self.never_split = never_split if never_split is not None else [] + self.normalize_text = normalize_text + self.trim_whitespace = trim_whitespace + + try: + from sudachipy import dictionary, tokenizer + except ImportError: + raise ImportError( + "You need to install sudachipy to use SudachiTokenizer. " + "See https://github.com/WorksApplications/SudachiPy for installation." + ) + + if sudachi_split_mode == "A": + self.split_mode = tokenizer.Tokenizer.SplitMode.A + elif sudachi_split_mode == "B": + self.split_mode = tokenizer.Tokenizer.SplitMode.B + elif sudachi_split_mode == "C": + self.split_mode = tokenizer.Tokenizer.SplitMode.C + else: + raise ValueError("Invalid sudachi_split_mode is specified.") + + self.projection = sudachi_projection + + sudachi_dictionary = dictionary.Dictionary( + config_path=sudachi_config_path, resource_dir=sudachi_resource_dir, dict=sudachi_dict_type + ) + if is_sudachi_projection_available(): + self.sudachi = sudachi_dictionary.create(self.split_mode, projection=self.projection) + elif self.projection is not None: + raise ImportError("You need to install sudachipy>=0.6.8 to specify `projection` field in sudachi_kwargs.") + else: + self.sudachi = sudachi_dictionary.create(self.split_mode) + + def tokenize(self, text, never_split=None, **kwargs): + """Tokenizes a piece of text.""" + if self.normalize_text: + text = unicodedata.normalize("NFKC", text) + + never_split = self.never_split + (never_split if never_split is not None else []) + tokens = [] + + for word in self.sudachi.tokenize(text): + token = word.surface() + + if self.do_lower_case and token not in never_split: + token = token.lower() + + if self.trim_whitespace: + if token.strip() == "": + continue + else: + token = token.strip() + + tokens.append(token) + + return tokens + + +class JumanppTokenizer: + """Runs basic tokenization with jumanpp morphological parser.""" + + def __init__( + self, + do_lower_case=False, + never_split=None, + normalize_text=True, + trim_whitespace=False, + ): + """ + Constructs a JumanppTokenizer. + + Args: + **do_lower_case**: (*optional*) boolean (default True) + Whether to lowercase the input. + **never_split**: (*optional*) list of str + Kept for backward compatibility purposes. Now implemented directly at the base class level (see + [`PreTrainedTokenizer.tokenize`]) List of tokens not to split. + **normalize_text**: (*optional*) boolean (default True) + Whether to apply unicode normalization to text before tokenization. + **trim_whitespace**: (*optional*) boolean (default False) + Whether to trim all whitespace, tab, newline from tokens. + """ + + self.do_lower_case = do_lower_case + self.never_split = never_split if never_split is not None else [] + self.normalize_text = normalize_text + self.trim_whitespace = trim_whitespace + + try: + import rhoknp + except ImportError: + raise ImportError( + "You need to install rhoknp to use JumanppTokenizer. " + "See https://github.com/ku-nlp/rhoknp for installation." + ) + + self.juman = rhoknp.Jumanpp() + + def tokenize(self, text, never_split=None, **kwargs): + """Tokenizes a piece of text.""" + if self.normalize_text: + text = unicodedata.normalize("NFKC", text) + + text = text.strip() + + never_split = self.never_split + (never_split if never_split is not None else []) + tokens = [] + + for mrph in self.juman.apply_to_sentence(text).morphemes: + token = mrph.text + + if self.do_lower_case and token not in never_split: + token = token.lower() + + if self.trim_whitespace: + if token.strip() == "": + continue + else: + token = token.strip() + + tokens.append(token) + + return tokens + + +class CharacterTokenizer: + """Runs Character tokenization.""" + + def __init__(self, vocab, unk_token, normalize_text=True): + """ + Constructs a CharacterTokenizer. + + Args: + **vocab**: + Vocabulary object. + **unk_token**: str + A special symbol for out-of-vocabulary token. + **normalize_text**: (`optional`) boolean (default True) + Whether to apply unicode normalization to text before tokenization. + """ + self.vocab = vocab + self.unk_token = unk_token + self.normalize_text = normalize_text + + def tokenize(self, text): + """ + Tokenizes a piece of text into characters. + + For example, `input = "apple""` wil return as output `["a", "p", "p", "l", "e"]`. + + Args: + text: A single token or whitespace separated tokens. + This should have already been passed through *BasicTokenizer*. + + Returns: + A list of characters. + """ + if self.normalize_text: + text = unicodedata.normalize("NFKC", text) + + output_tokens = [] + for char in text: + if char not in self.vocab: + output_tokens.append(self.unk_token) + continue + + output_tokens.append(char) + + return output_tokens + + +# Copied from transformers.models.bert.tokenization_bert.BasicTokenizer +class BasicTokenizer: + """ + Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.). + + Args: + do_lower_case (`bool`, *optional*, defaults to `True`): + Whether or not to lowercase the input when tokenizing. + never_split (`Iterable`, *optional*): + Collection of tokens which will never be split during tokenization. Only has an effect when + `do_basic_tokenize=True` + tokenize_chinese_chars (`bool`, *optional*, defaults to `True`): + Whether or not to tokenize Chinese characters. + + This should likely be deactivated for Japanese (see this + [issue](https://github.com/huggingface/transformers/issues/328)). + strip_accents (`bool`, *optional*): + Whether or not to strip all accents. If this option is not specified, then it will be determined by the + value for `lowercase` (as in the original BERT). + do_split_on_punc (`bool`, *optional*, defaults to `True`): + In some instances we want to skip the basic punctuation splitting so that later tokenization can capture + the full context of the words, such as contractions. + """ + + def __init__( + self, + do_lower_case=True, + never_split=None, + tokenize_chinese_chars=True, + strip_accents=None, + do_split_on_punc=True, + ): + if never_split is None: + never_split = [] + self.do_lower_case = do_lower_case + self.never_split = set(never_split) + self.tokenize_chinese_chars = tokenize_chinese_chars + self.strip_accents = strip_accents + self.do_split_on_punc = do_split_on_punc + + def tokenize(self, text, never_split=None): + """ + Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer. + + Args: + never_split (`List[str]`, *optional*) + Kept for backward compatibility purposes. Now implemented directly at the base class level (see + [`PreTrainedTokenizer.tokenize`]) List of token not to split. + """ + # union() returns a new set by concatenating the two sets. + never_split = self.never_split.union(set(never_split)) if never_split else self.never_split + text = self._clean_text(text) + + # This was added on November 1st, 2018 for the multilingual and Chinese + # models. This is also applied to the English models now, but it doesn't + # matter since the English models were not trained on any Chinese data + # and generally don't have any Chinese data in them (there are Chinese + # characters in the vocabulary because Wikipedia does have some Chinese + # words in the English Wikipedia.). + if self.tokenize_chinese_chars: + text = self._tokenize_chinese_chars(text) + # prevents treating the same character with different unicode codepoints as different characters + unicode_normalized_text = unicodedata.normalize("NFC", text) + orig_tokens = whitespace_tokenize(unicode_normalized_text) + split_tokens = [] + for token in orig_tokens: + if token not in never_split: + if self.do_lower_case: + token = token.lower() + if self.strip_accents is not False: + token = self._run_strip_accents(token) + elif self.strip_accents: + token = self._run_strip_accents(token) + split_tokens.extend(self._run_split_on_punc(token, never_split)) + + output_tokens = whitespace_tokenize(" ".join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output) + + def _run_split_on_punc(self, text, never_split=None): + """Splits punctuation on a piece of text.""" + if not self.do_split_on_punc or (never_split is not None and text in never_split): + return [text] + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return ["".join(x) for x in output] + + def _tokenize_chinese_chars(self, text): + """Adds whitespace around any CJK character.""" + output = [] + for char in text: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(" ") + output.append(char) + output.append(" ") + else: + output.append(char) + return "".join(output) + + def _is_chinese_char(self, cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. + if ( + (cp >= 0x4E00 and cp <= 0x9FFF) + or (cp >= 0x3400 and cp <= 0x4DBF) # + or (cp >= 0x20000 and cp <= 0x2A6DF) # + or (cp >= 0x2A700 and cp <= 0x2B73F) # + or (cp >= 0x2B740 and cp <= 0x2B81F) # + or (cp >= 0x2B820 and cp <= 0x2CEAF) # + or (cp >= 0xF900 and cp <= 0xFAFF) + or (cp >= 0x2F800 and cp <= 0x2FA1F) # + ): # + return True + + return False + + def _clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xFFFD or _is_control(char): + continue + if _is_whitespace(char): + output.append(" ") + else: + output.append(char) + return "".join(output) + + +# Copied from transformers.models.bert.tokenization_bert.WordpieceTokenizer +class WordpieceTokenizer: + """Runs WordPiece tokenization.""" + + def __init__(self, vocab, unk_token, max_input_chars_per_word=100): + self.vocab = vocab + self.unk_token = unk_token + self.max_input_chars_per_word = max_input_chars_per_word + + def tokenize(self, text): + """ + Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform + tokenization using the given vocabulary. + + For example, `input = "unaffable"` wil return as output `["un", "##aff", "##able"]`. + + Args: + text: A single token or whitespace separated tokens. This should have + already been passed through *BasicTokenizer*. + + Returns: + A list of wordpiece tokens. + """ + + output_tokens = [] + for token in whitespace_tokenize(text): + chars = list(token) + if len(chars) > self.max_input_chars_per_word: + output_tokens.append(self.unk_token) + continue + + is_bad = False + start = 0 + sub_tokens = [] + while start < len(chars): + end = len(chars) + cur_substr = None + while start < end: + substr = "".join(chars[start:end]) + if start > 0: + substr = "##" + substr + if substr in self.vocab: + cur_substr = substr + break + end -= 1 + if cur_substr is None: + is_bad = True + break + sub_tokens.append(cur_substr) + start = end + + if is_bad: + output_tokens.append(self.unk_token) + else: + output_tokens.extend(sub_tokens) + return output_tokens + + +class SentencepieceTokenizer: + """ + Runs sentencepiece tokenization. Based on transformers.models.albert.tokenization_albert.AlbertTokenizer. + """ + + def __init__( + self, + vocab, + unk_token, + do_lower_case=False, + remove_space=True, + keep_accents=True, + sp_model_kwargs: Optional[Dict[str, Any]] = None, + ): + self.vocab = vocab + self.unk_token = unk_token + self.do_lower_case = do_lower_case + self.remove_space = remove_space + self.keep_accents = keep_accents + + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(self.vocab) + + def preprocess_text(self, inputs): + if self.remove_space: + outputs = " ".join(inputs.strip().split()) + else: + outputs = inputs + outputs = outputs.replace("``", '"').replace("''", '"') + + if not self.keep_accents: + outputs = unicodedata.normalize("NFKD", outputs) + outputs = "".join([c for c in outputs if not unicodedata.combining(c)]) + if self.do_lower_case: + outputs = outputs.lower() + + return outputs + + def tokenize(self, text): + """ + Tokenizes text by sentencepiece. Based on [SentencePiece](https://github.com/google/sentencepiece). + Tokenization needs the given vocabulary. + + Args: + text: A string needs to be tokenized. + + Returns: + A list of sentencepiece tokens. + """ + text = self.preprocess_text(text) + pieces = self.sp_model.encode(text, out_type=str) + new_pieces = [] + for piece in pieces: + if len(piece) > 1 and piece[-1] == str(",") and piece[-2].isdigit(): + cur_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(SPIECE_UNDERLINE, "")) + if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE: + if len(cur_pieces[0]) == 1: + cur_pieces = cur_pieces[1:] + else: + cur_pieces[0] = cur_pieces[0][1:] + cur_pieces.append(piece[-1]) + new_pieces.extend(cur_pieces) + else: + new_pieces.append(piece) + + return new_pieces + + +__all__ = ["BertJapaneseTokenizer", "CharacterTokenizer", "MecabTokenizer"] diff --git a/docs/transformers/build/lib/transformers/models/bertweet/__init__.py b/docs/transformers/build/lib/transformers/models/bertweet/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..432622f1595d1a0d8bb1b3c9b9774b7d1e387d3e --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bertweet/__init__.py @@ -0,0 +1,26 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .tokenization_bertweet import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/docs/transformers/build/lib/transformers/models/bertweet/tokenization_bertweet.py b/docs/transformers/build/lib/transformers/models/bertweet/tokenization_bertweet.py new file mode 100644 index 0000000000000000000000000000000000000000..499238e5955fe03fff0fbc51ebec5629dd3a45ba --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bertweet/tokenization_bertweet.py @@ -0,0 +1,769 @@ +# coding=utf-8 +# Copyright (c) 2020, VinAI Research and the HuggingFace Inc. team. +# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for BERTweet""" + +import html +import os +import re +from shutil import copyfile +from typing import List, Optional, Tuple + +import regex + +from ...tokenization_utils import PreTrainedTokenizer +from ...utils import logging + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = { + "vocab_file": "vocab.txt", + "merges_file": "bpe.codes", +} + + +def get_pairs(word): + """ + Return set of symbol pairs in a word. + + Word is represented as tuple of symbols (symbols being variable-length strings). + """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + + pairs = set(pairs) + return pairs + + +class BertweetTokenizer(PreTrainedTokenizer): + """ + Constructs a BERTweet tokenizer, using Byte-Pair-Encoding. + + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + Path to the vocabulary file. + merges_file (`str`): + Path to the merges file. + normalization (`bool`, *optional*, defaults to `False`): + Whether or not to apply a normalization preprocess. + bos_token (`str`, *optional*, defaults to `""`): + The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. + + + + When building a sequence using special tokens, this is not the token that is used for the beginning of + sequence. The token used is the `cls_token`. + + + + eos_token (`str`, *optional*, defaults to `""`): + The end of sequence token. + + + + When building a sequence using special tokens, this is not the token that is used for the end of sequence. + The token used is the `sep_token`. + + + + sep_token (`str`, *optional*, defaults to `""`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + cls_token (`str`, *optional*, defaults to `""`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + unk_token (`str`, *optional*, defaults to `""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + pad_token (`str`, *optional*, defaults to `""`): + The token used for padding, for example when batching sequences of different lengths. + mask_token (`str`, *optional*, defaults to `""`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + """ + + vocab_files_names = VOCAB_FILES_NAMES + + def __init__( + self, + vocab_file, + merges_file, + normalization=False, + bos_token="", + eos_token="", + sep_token="", + cls_token="", + unk_token="", + pad_token="", + mask_token="", + **kwargs, + ): + try: + from emoji import demojize + + self.demojizer = demojize + except ImportError: + logger.warning( + "emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3" + " install emoji==0.6.0" + ) + self.demojizer = None + + self.vocab_file = vocab_file + self.merges_file = merges_file + + self.encoder = {} + self.encoder[str(bos_token)] = 0 + self.encoder[str(pad_token)] = 1 + self.encoder[str(eos_token)] = 2 + self.encoder[str(unk_token)] = 3 + + self.add_from_file(vocab_file) + + self.decoder = {v: k for k, v in self.encoder.items()} + + with open(merges_file, encoding="utf-8") as merges_handle: + merges = merges_handle.read().split("\n")[:-1] + merges = [tuple(merge.split()[:-1]) for merge in merges] + self.bpe_ranks = dict(zip(merges, range(len(merges)))) + self.cache = {} + + self.normalization = normalization + self.tweetPreprocessor = TweetTokenizer() + self.special_puncts = {"’": "'", "…": "..."} + + super().__init__( + normalization=normalization, + bos_token=bos_token, + eos_token=eos_token, + sep_token=sep_token, + cls_token=cls_token, + unk_token=unk_token, + pad_token=pad_token, + mask_token=mask_token, + **kwargs, + ) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERTweet sequence has the following format: + + - single sequence: ` X ` + - pair of sequences: ` A B ` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + sep + token_ids_1 + sep + + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is None: + return [1] + ([0] * len(token_ids_0)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. BERTweet does + not make use of token type ids, therefore a list of zeros is returned. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of zeros. + """ + + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] + + @property + def vocab_size(self): + return len(self.encoder) + + def get_vocab(self): + return dict(self.encoder, **self.added_tokens_encoder) + + def bpe(self, token): + if token in self.cache: + return self.cache[token] + word = tuple(token) + word = tuple(list(word[:-1]) + [word[-1] + ""]) + pairs = get_pairs(word) + + if not pairs: + return token + + while True: + bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + except ValueError: + new_word.extend(word[i:]) + break + else: + new_word.extend(word[i:j]) + i = j + + if word[i] == first and i < len(word) - 1 and word[i + 1] == second: + new_word.append(first + second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = "@@ ".join(word) + word = word[:-4] + self.cache[token] = word + return word + + def _tokenize(self, text): + """Tokenize a string.""" + if self.normalization: # Perform Tweet normalization before performing BPE + text = self.normalizeTweet(text) + + split_tokens = [] + words = re.findall(r"\S+\n?", text) + for token in words: + split_tokens.extend(list(self.bpe(token).split(" "))) + return split_tokens + + def normalizeTweet(self, tweet): + """ + Normalize a raw Tweet + """ + for punct in self.special_puncts: + tweet = tweet.replace(punct, self.special_puncts[punct]) + + tokens = self.tweetPreprocessor.tokenize(tweet) + normTweet = " ".join([self.normalizeToken(token) for token in tokens]) + + normTweet = ( + normTweet.replace("cannot ", "can not ") + .replace("n't ", " n't ") + .replace("n 't ", " n't ") + .replace("ca n't", "can't") + .replace("ai n't", "ain't") + ) + normTweet = ( + normTweet.replace("'m ", " 'm ") + .replace("'re ", " 're ") + .replace("'s ", " 's ") + .replace("'ll ", " 'll ") + .replace("'d ", " 'd ") + .replace("'ve ", " 've ") + ) + normTweet = ( + normTweet.replace(" p . m .", " p.m.") + .replace(" p . m ", " p.m ") + .replace(" a . m .", " a.m.") + .replace(" a . m ", " a.m ") + ) + + return " ".join(normTweet.split()) + + def normalizeToken(self, token): + """ + Normalize tokens in a Tweet + """ + lowercased_token = token.lower() + if token.startswith("@"): + return "@USER" + elif lowercased_token.startswith("http") or lowercased_token.startswith("www"): + return "HTTPURL" + elif len(token) == 1: + if token in self.special_puncts: + return self.special_puncts[token] + if self.demojizer is not None: + return self.demojizer(token) + else: + return token + else: + return token + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.encoder.get(token, self.encoder.get(self.unk_token)) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.decoder.get(index, self.unk_token) + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + out_string = " ".join(tokens).replace("@@ ", "").strip() + return out_string + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + out_merge_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"] + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file): + copyfile(self.vocab_file, out_vocab_file) + elif not os.path.isfile(self.vocab_file): + with open(out_vocab_file, "wb") as fi: + content_spiece_model = self.sp_model.serialized_model_proto() + fi.write(content_spiece_model) + + if os.path.abspath(self.merges_file) != os.path.abspath(out_merge_file): + copyfile(self.merges_file, out_merge_file) + + return out_vocab_file, out_merge_file + + # def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True): + # filtered_tokens = ' '.join(self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)) + # tokens_generated_so_far = re.sub('(@@ )', '', string=filtered_tokens) + # tokens_generated_so_far = re.sub('(@@ ?$)', '', string=tokens_generated_so_far) + # return ''.join(tokens_generated_so_far) + + def add_from_file(self, f): + """ + Loads a pre-existing dictionary from a text file and adds its symbols to this instance. + """ + if isinstance(f, str): + try: + with open(f, "r", encoding="utf-8") as fd: + self.add_from_file(fd) + except FileNotFoundError as fnfe: + raise fnfe + except UnicodeError: + raise Exception(f"Incorrect encoding detected in {f}, please rebuild the dataset") + return + + lines = f.readlines() + for lineTmp in lines: + line = lineTmp.strip() + idx = line.rfind(" ") + if idx == -1: + raise ValueError("Incorrect dictionary format, expected ' '") + word = line[:idx] + self.encoder[word] = len(self.encoder) + + +# Natural Language Toolkit: Twitter Tokenizer +# +# Copyright (C) 2001-2020 NLTK Project +# Author: Christopher Potts +# Ewan Klein (modifications) +# Pierpaolo Pantone <> (modifications) +# URL: http://nltk.org/ +# For license information, see LICENSE.TXT +# + + +""" +Twitter-aware tokenizer, designed to be flexible and easy to adapt to new domains and tasks. The basic logic is this: + +1. The tuple regex_strings defines a list of regular expression strings. + +2. The regex_strings strings are put, in order, into a compiled regular expression object called word_re. + +3. The tokenization is done by word_re.findall(s), where s is the user-supplied string, inside the tokenize() method of + the class Tokenizer. + +4. When instantiating Tokenizer objects, there is a single option: preserve_case. By default, it is set to True. If it + is set to False, then the tokenizer will lowercase everything except for emoticons. + +""" + + +###################################################################### +# +# import regex # https://github.com/nltk/nltk/issues/2409 +# import html +# +###################################################################### +# The following strings are components in the regular expression +# that is used for tokenizing. It's important that phone_number +# appears first in the final regex (since it can contain whitespace). +# It also could matter that tags comes after emoticons, due to the +# possibility of having text like +# +# <:| and some text >:) +# +# Most importantly, the final element should always be last, since it +# does a last ditch whitespace-based tokenization of whatever is left. + +# ToDo: Update with http://en.wikipedia.org/wiki/List_of_emoticons ? + +# This particular element is used in a couple ways, so we define it +# with a name: +# docstyle-ignore +EMOTICONS = r""" + (?: + [<>]? + [:;=8] # eyes + [\-o\*\']? # optional nose + [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth + | + [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth + [\-o\*\']? # optional nose + [:;=8] # eyes + [<>]? + | + <3 # heart + )""" + +# URL pattern due to John Gruber, modified by Tom Winzig. See +# https://gist.github.com/winzig/8894715 +# docstyle-ignore +URLS = r""" # Capture 1: entire matched URL + (?: + https?: # URL protocol and colon + (?: + /{1,3} # 1-3 slashes + | # or + [a-z0-9%] # Single letter or digit or '%' + # (Trying not to match e.g. "URI::Escape") + ) + | # or + # looks like domain name followed by a slash: + [a-z0-9.\-]+[.] + (?:[a-z]{2,13}) + / + ) + (?: # One or more: + [^\s()<>{}\[\]]+ # Run of non-space, non-()<>{}[] + | # or + \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (...(...)...) + | + \([^\s]+?\) # balanced parens, non-recursive: (...) + )+ + (?: # End with: + \([^\s()]*?\([^\s()]+\)[^\s()]*?\) # balanced parens, one level deep: (...(...)...) + | + \([^\s]+?\) # balanced parens, non-recursive: (...) + | # or + [^\s`!()\[\]{};:'".,<>?«»“”‘’] # not a space or one of these punct chars + ) + | # OR, the following to match naked domains: + (?: + (?\s]+>""", + # ASCII Arrows + r"""[\-]+>|<[\-]+""", + # Twitter username: + r"""(?:@[\w_]+)""", + # Twitter hashtags: + r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)""", + # email addresses + r"""[\w.+-]+@[\w-]+\.(?:[\w-]\.?)+[\w-]""", + # docstyle-ignore + # Remaining word types: + r""" + (?:[^\W\d_](?:[^\W\d_]|['\-_])+[^\W\d_]) # Words with apostrophes or dashes. + | + (?:[+\-]?\d+[,/.:-]\d+[+\-]?) # Numbers, including fractions, decimals. + | + (?:[\w_]+) # Words without apostrophes or dashes. + | + (?:\.(?:\s*\.){1,}) # Ellipsis dots. + | + (?:\S) # Everything else that isn't whitespace. + """, +) + +###################################################################### +# This is the core tokenizing regex: + +WORD_RE = regex.compile(r"""(%s)""" % "|".join(REGEXPS), regex.VERBOSE | regex.I | regex.UNICODE) + +# WORD_RE performs poorly on these patterns: +HANG_RE = regex.compile(r"([^a-zA-Z0-9])\1{3,}") + +# The emoticon string gets its own regex so that we can preserve case for +# them as needed: +EMOTICON_RE = regex.compile(EMOTICONS, regex.VERBOSE | regex.I | regex.UNICODE) + +# These are for regularizing HTML entities to Unicode: +ENT_RE = regex.compile(r"&(#?(x?))([^&;\s]+);") + + +###################################################################### +# Functions for converting html entities +###################################################################### + + +def _str_to_unicode(text, encoding=None, errors="strict"): + if encoding is None: + encoding = "utf-8" + if isinstance(text, bytes): + return text.decode(encoding, errors) + return text + + +def _replace_html_entities(text, keep=(), remove_illegal=True, encoding="utf-8"): + """ + Remove entities from text by converting them to their corresponding unicode character. + + Args: + text: + A unicode string or a byte string encoded in the given *encoding* (which defaults to 'utf-8'). + keep (list): + List of entity names which should not be replaced. This supports both numeric entities (`&#nnnn;` and + `&#hhhh;`) and named entities (such as ` ` or `>`). + remove_illegal (bool): + If `True`, entities that can't be converted are removed. Otherwise, entities that can't be converted are + kept "as is". + + Returns: A unicode string with the entities removed. + + See https://github.com/scrapy/w3lib/blob/master/w3lib/html.py + + Examples: + + ```python + >>> from nltk.tokenize.casual import _replace_html_entities + + >>> _replace_html_entities(b"Price: £100") + 'Price: \\xa3100' + + >>> print(_replace_html_entities(b"Price: £100")) + Price: £100 + ```""" + + def _convert_entity(match): + entity_body = match.group(3) + if match.group(1): + try: + if match.group(2): + number = int(entity_body, 16) + else: + number = int(entity_body, 10) + # Numeric character references in the 80-9F range are typically + # interpreted by browsers as representing the characters mapped + # to bytes 80-9F in the Windows-1252 encoding. For more info + # see: https://en.wikipedia.org/wiki/ISO/IEC_8859-1#Similar_character_sets + if 0x80 <= number <= 0x9F: + return bytes((number,)).decode("cp1252") + except ValueError: + number = None + else: + if entity_body in keep: + return match.group(0) + else: + number = html.entities.name2codepoint.get(entity_body) + if number is not None: + try: + return chr(number) + except (ValueError, OverflowError): + pass + + return "" if remove_illegal else match.group(0) + + return ENT_RE.sub(_convert_entity, _str_to_unicode(text, encoding)) + + +###################################################################### + + +class TweetTokenizer: + r""" + Examples: + + ```python + >>> # Tokenizer for tweets. + >>> from nltk.tokenize import TweetTokenizer + + >>> tknzr = TweetTokenizer() + >>> s0 = "This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--" + >>> tknzr.tokenize(s0) + ['This', 'is', 'a', 'cooool', '#dummysmiley', ':', ':-)', ':-P', '<3', 'and', 'some', 'arrows', '<', '>', '->', '<--'] + + >>> # Examples using *strip_handles* and *reduce_len parameters*: + >>> tknzr = TweetTokenizer(strip_handles=True, reduce_len=True) + >>> s1 = "@remy: This is waaaaayyyy too much for you!!!!!!" + >>> tknzr.tokenize(s1) + [':', 'This', 'is', 'waaayyy', 'too', 'much', 'for', 'you', '!', '!', '!'] + ```""" + + def __init__(self, preserve_case=True, reduce_len=False, strip_handles=False): + self.preserve_case = preserve_case + self.reduce_len = reduce_len + self.strip_handles = strip_handles + + def tokenize(self, text): + """ + Args: + text: str + + Returns: list(str) A tokenized list of strings; concatenating this list returns the original string if + `preserve_case=False` + """ + # Fix HTML character entities: + text = _replace_html_entities(text) + # Remove username handles + if self.strip_handles: + text = remove_handles(text) + # Normalize word lengthening + if self.reduce_len: + text = reduce_lengthening(text) + # Shorten problematic sequences of characters + safe_text = HANG_RE.sub(r"\1\1\1", text) + # Tokenize: + words = WORD_RE.findall(safe_text) + # Possibly alter the case, but avoid changing emoticons like :D into :d: + if not self.preserve_case: + words = [x if EMOTICON_RE.search(x) else x.lower() for x in words] + return words + + +###################################################################### +# Normalization Functions +###################################################################### + + +def reduce_lengthening(text): + """ + Replace repeated character sequences of length 3 or greater with sequences of length 3. + """ + pattern = regex.compile(r"(.)\1{2,}") + return pattern.sub(r"\1\1\1", text) + + +def remove_handles(text): + """ + Remove Twitter username handles from text. + """ + pattern = regex.compile( + r"(?>> from transformers import BigBirdConfig, BigBirdModel + + >>> # Initializing a BigBird google/bigbird-roberta-base style configuration + >>> configuration = BigBirdConfig() + + >>> # Initializing a model (with random weights) from the google/bigbird-roberta-base style configuration + >>> model = BigBirdModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "big_bird" + + def __init__( + self, + vocab_size=50358, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu_new", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=4096, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + use_cache=True, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + sep_token_id=66, + attention_type="block_sparse", + use_bias=True, + rescale_embeddings=False, + block_size=64, + num_random_blocks=3, + classifier_dropout=None, + **kwargs, + ): + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + sep_token_id=sep_token_id, + **kwargs, + ) + + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.type_vocab_size = type_vocab_size + self.layer_norm_eps = layer_norm_eps + self.use_cache = use_cache + + self.rescale_embeddings = rescale_embeddings + self.attention_type = attention_type + self.use_bias = use_bias + self.block_size = block_size + self.num_random_blocks = num_random_blocks + self.classifier_dropout = classifier_dropout + + +class BigBirdOnnxConfig(OnnxConfig): + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + if self.task == "multiple-choice": + dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"} + else: + dynamic_axis = {0: "batch", 1: "sequence"} + return OrderedDict( + [ + ("input_ids", dynamic_axis), + ("attention_mask", dynamic_axis), + ] + ) + + +__all__ = ["BigBirdConfig", "BigBirdOnnxConfig"] diff --git a/docs/transformers/build/lib/transformers/models/big_bird/convert_bigbird_original_tf_checkpoint_to_pytorch.py b/docs/transformers/build/lib/transformers/models/big_bird/convert_bigbird_original_tf_checkpoint_to_pytorch.py new file mode 100644 index 0000000000000000000000000000000000000000..0b8e6590f937f959973cbd91cf52ba163e497e17 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/big_bird/convert_bigbird_original_tf_checkpoint_to_pytorch.py @@ -0,0 +1,69 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert BigBird checkpoint.""" + +import argparse + +from transformers import BigBirdConfig, BigBirdForPreTraining, BigBirdForQuestionAnswering, load_tf_weights_in_big_bird +from transformers.utils import logging + + +logging.set_verbosity_info() + + +def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, big_bird_config_file, pytorch_dump_path, is_trivia_qa): + # Initialise PyTorch model + config = BigBirdConfig.from_json_file(big_bird_config_file) + print(f"Building PyTorch model from configuration: {config}") + + if is_trivia_qa: + model = BigBirdForQuestionAnswering(config) + else: + model = BigBirdForPreTraining(config) + + # Load weights from tf checkpoint + load_tf_weights_in_big_bird(model, tf_checkpoint_path, is_trivia_qa=is_trivia_qa) + + # Save pytorch-model + print(f"Save PyTorch model to {pytorch_dump_path}") + model.save_pretrained(pytorch_dump_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument( + "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." + ) + parser.add_argument( + "--big_bird_config_file", + default=None, + type=str, + required=True, + help=( + "The config json file corresponding to the pre-trained BERT model. \n" + "This specifies the model architecture." + ), + ) + parser.add_argument( + "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." + ) + parser.add_argument( + "--is_trivia_qa", action="store_true", help="Whether to convert a model with a trivia_qa head." + ) + args = parser.parse_args() + convert_tf_checkpoint_to_pytorch( + args.tf_checkpoint_path, args.big_bird_config_file, args.pytorch_dump_path, args.is_trivia_qa + ) diff --git a/docs/transformers/build/lib/transformers/models/big_bird/modeling_big_bird.py b/docs/transformers/build/lib/transformers/models/big_bird/modeling_big_bird.py new file mode 100644 index 0000000000000000000000000000000000000000..abc5a1df440968ef24029bbdd5a83fdbfbba4749 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/big_bird/modeling_big_bird.py @@ -0,0 +1,3149 @@ +# coding=utf-8 +# Copyright 2021 Google Research and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch BigBird model.""" + +import math +import os +from dataclasses import dataclass +from typing import Optional, Tuple, Union + +import numpy as np +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ...activations import ACT2FN +from ...generation import GenerationMixin +from ...modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, + CausalLMOutputWithCrossAttentions, + MaskedLMOutput, + MultipleChoiceModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import apply_chunking_to_forward +from ...utils import ( + ModelOutput, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from .configuration_big_bird import BigBirdConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "google/bigbird-roberta-base" +_CONFIG_FOR_DOC = "BigBirdConfig" + + +_TRIVIA_QA_MAPPING = { + "big_bird_attention": "attention/self", + "output_layer_norm": "output/LayerNorm", + "attention_output": "attention/output/dense", + "output": "output/dense", + "self_attention_layer_norm": "attention/output/LayerNorm", + "intermediate": "intermediate/dense", + "word_embeddings": "bert/embeddings/word_embeddings", + "position_embedding": "bert/embeddings/position_embeddings", + "type_embeddings": "bert/embeddings/token_type_embeddings", + "embeddings": "bert/embeddings", + "layer_normalization": "output/LayerNorm", + "layer_norm": "LayerNorm", + "trivia_qa_head": "qa_classifier", + "dense": "intermediate/dense", + "dense_1": "qa_outputs", +} + + +def load_tf_weights_in_big_bird(model, tf_checkpoint_path, is_trivia_qa=False): + """Load tf checkpoints in a pytorch model.""" + + def load_tf_weights_bert(init_vars, tf_path): + names = [] + tf_weights = {} + + for name, shape in init_vars: + array = tf.train.load_variable(tf_path, name) + name = name.replace("bert/encoder/LayerNorm", "bert/embeddings/LayerNorm") + logger.info(f"Loading TF weight {name} with shape {shape}") + names.append(name) + tf_weights[name] = array + + return names, tf_weights + + def load_tf_weights_trivia_qa(init_vars): + names = [] + tf_weights = {} + + for i, var in enumerate(init_vars): + name_items = var.name.split("/") + + if "transformer_scaffold" in name_items[0]: + layer_name_items = name_items[0].split("_") + if len(layer_name_items) < 3: + layer_name_items += [0] + + name_items[0] = f"bert/encoder/layer_{layer_name_items[2]}" + + name = "/".join([_TRIVIA_QA_MAPPING[x] if x in _TRIVIA_QA_MAPPING else x for x in name_items])[ + :-2 + ] # remove last :0 in variable + + if "self/attention/output" in name: + name = name.replace("self/attention/output", "output") + + if i >= len(init_vars) - 2: + name = name.replace("intermediate", "output") + + logger.info(f"Loading TF weight {name} with shape {var.shape}") + array = var.value().numpy() + names.append(name) + tf_weights[name] = array + + return names, tf_weights + + try: + import re + + import numpy as np + import tensorflow as tf + except ImportError: + logger.error( + "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions." + ) + raise + tf_path = os.path.abspath(tf_checkpoint_path) + logger.info(f"Converting TensorFlow checkpoint from {tf_path}") + + # Load weights from TF model + init_vars = tf.saved_model.load(tf_path).variables if is_trivia_qa else tf.train.list_variables(tf_path) + + if len(init_vars) <= 0: + raise ValueError("Loaded trained variables cannot be empty.") + + pt_names = list(model.state_dict().keys()) + + if is_trivia_qa: + names, tf_weights = load_tf_weights_trivia_qa(init_vars) + else: + names, tf_weights = load_tf_weights_bert(init_vars, tf_path) + + for txt_name in names: + array = tf_weights[txt_name] + name = txt_name.split("/") + # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v + # which are not required for using pretrained model + if any( + n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"] + for n in name + ): + logger.info(f"Skipping {'/'.join(name)}") + continue + pointer = model + pt_name = [] + for m_name in name: + if re.fullmatch(r"[A-Za-z]+_\d+", m_name): + scope_names = re.split(r"_(\d+)", m_name) + else: + scope_names = [m_name] + if scope_names[0] == "kernel" or scope_names[0] == "gamma": + pointer = getattr(pointer, "weight") + pt_name.append("weight") + elif scope_names[0] == "output_bias" or scope_names[0] == "beta": + pointer = getattr(pointer, "bias") + pt_name.append("bias") + elif scope_names[0] == "output_weights": + pointer = getattr(pointer, "weight") + pt_name.append("weight") + elif scope_names[0] == "squad": + pointer = getattr(pointer, "classifier") + pt_name.append("classifier") + elif scope_names[0] == "transform": + pointer = getattr(pointer, "transform") + pt_name.append("transform") + if ("bias" in name) or ("kernel" in name): + pointer = getattr(pointer, "dense") + pt_name.append("dense") + elif ("beta" in name) or ("gamma" in name): + pointer = getattr(pointer, "LayerNorm") + pt_name.append("LayerNorm") + else: + try: + pointer = getattr(pointer, scope_names[0]) + pt_name.append(f"{scope_names[0]}") + except AttributeError: + logger.info(f"Skipping {m_name}") + continue + if len(scope_names) >= 2: + num = int(scope_names[1]) + pointer = pointer[num] + pt_name.append(f"{num}") + if m_name[-11:] == "_embeddings" or m_name == "embeddings": + pointer = getattr(pointer, "weight") + pt_name.append("weight") + elif m_name == "kernel": + array = np.transpose(array) + try: + if len(array.shape) > len(pointer.shape) and math.prod(array.shape) == math.prod(pointer.shape): + # print(txt_name, array.shape) + if ( + txt_name.endswith("attention/self/key/kernel") + or txt_name.endswith("attention/self/query/kernel") + or txt_name.endswith("attention/self/value/kernel") + ): + array = array.transpose(1, 0, 2).reshape(pointer.shape) + elif txt_name.endswith("attention/output/dense/kernel"): + array = array.transpose(0, 2, 1).reshape(pointer.shape) + else: + array = array.reshape(pointer.shape) + + if pointer.shape != array.shape: + raise ValueError( + f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched of {txt_name}." + ) + except ValueError as e: + e.args += (pointer.shape, array.shape) + raise + pt_weight_name = ".".join(pt_name) + logger.info(f"Initialize PyTorch weight {pt_weight_name} from {txt_name}.") + pointer.data = torch.from_numpy(array) + tf_weights.pop(txt_name, None) + pt_names.remove(pt_weight_name) + + logger.info(f"Weights not copied to PyTorch model: {', '.join(tf_weights.keys())}.") + logger.info(f"Weights not initialized in PyTorch model: {', '.join(pt_names)}.") + return model + + +class BigBirdEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings.""" + + # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.__init__ + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) + self.register_buffer( + "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False + ) + # End copy + + self.rescale_embeddings = config.rescale_embeddings + self.hidden_size = config.hidden_size + + def forward( + self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0 + ): + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length] + + # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs + # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves + # issue #5664 + if token_type_ids is None: + if hasattr(self, "token_type_ids"): + buffered_token_type_ids = self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + + if self.rescale_embeddings: + inputs_embeds = inputs_embeds * (self.hidden_size**0.5) + + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + + embeddings = self.dropout(embeddings) + embeddings = self.LayerNorm(embeddings) + return embeddings + + +class BigBirdSelfAttention(nn.Module): + def __init__(self, config): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.use_bias) + self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.use_bias) + self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.use_bias) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.is_decoder = config.is_decoder + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BigBirdModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + if self.is_decoder: + outputs = outputs + (past_key_value,) + return outputs + + +class BigBirdBlockSparseAttention(nn.Module): + def __init__(self, config, seed=None): + super().__init__() + + self.max_seqlen = config.max_position_embeddings + self.seed = seed + + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + f"The hidden size {config.hidden_size} is not a multiple of the number of attention " + f"heads {config.num_attention_heads}." + ) + + self.num_attention_heads = config.num_attention_heads + self.num_random_blocks = config.num_random_blocks + self.block_size = config.block_size + + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.use_bias) + self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.use_bias) + self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.use_bias) + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states, + band_mask=None, + from_mask=None, + to_mask=None, + from_blocked_mask=None, + to_blocked_mask=None, + output_attentions=None, + ): + # Currently this `class` can't be used in decoder. + + batch_size, seqlen, _ = hidden_states.size() + to_seq_length = from_seq_length = seqlen + from_block_size = to_block_size = self.block_size + + if from_seq_length % from_block_size != 0: + raise ValueError("Query sided sequence length must be multiple of block size") + + if to_seq_length % to_block_size != 0: + raise ValueError("Key/Value sided sequence length must be multiple of block size") + + query_layer = self.transpose_for_scores(self.query(hidden_states)) + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + context_layer, attention_probs = self.bigbird_block_sparse_attention( + query_layer, + key_layer, + value_layer, + band_mask, + from_mask, + to_mask, + from_blocked_mask, + to_blocked_mask, + self.num_attention_heads, + self.num_random_blocks, + self.attention_head_size, + from_block_size, + to_block_size, + batch_size, + from_seq_length, + to_seq_length, + seed=self.seed, + plan_from_length=None, + plan_num_rand_blocks=None, + output_attentions=output_attentions, + ) + + context_layer = context_layer.contiguous().view(batch_size, from_seq_length, -1) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + return outputs + + @staticmethod + def torch_bmm_nd(inp_1, inp_2, ndim=None): + """Fast nd matrix multiplication""" + # faster replacement of torch.einsum ("bhqk,bhkd->bhqd") + return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view( + inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 1]) + ) + + @staticmethod + def torch_bmm_nd_transpose(inp_1, inp_2, ndim=None): + """Fast nd matrix multiplication with transpose""" + # faster replacement of torch.einsum (bhqd,bhkd->bhqk) + return torch.bmm( + inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2) + ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2])) + + def bigbird_block_sparse_attention( + self, + query_layer, + key_layer, + value_layer, + band_mask, + from_mask, + to_mask, + from_blocked_mask, + to_blocked_mask, + n_heads, + n_rand_blocks, + attention_head_size, + from_block_size, + to_block_size, + batch_size, + from_seq_len, + to_seq_len, + seed, + plan_from_length, + plan_num_rand_blocks, + output_attentions, + ): + # BigBird block-sparse attention as suggested in paper + + # ITC: + # global tokens: 2 x block_size + # window tokens: 3 x block_size + # random tokens: num_rand_tokens x block_size + + # ETC: + # global tokens: extra_globals_tokens + 2 x block_size + # window tokens: 3 x block_size + # random tokens: num_rand_tokens x block_size + + # Note: + # 1) Currently, ETC is not supported. + # 2) Window size is fixed to 3 blocks & it can be changed only by + # changing `block_size`. + # 3) Number of global blocks are fixed (2 blocks here) & global tokens can be + # controlled only by `block_size`. + + # attention is calculated separately for q[0], q[1], q[2:-2], q[-2], q[-1] in order to use special trick of shifting tokens (for calculating sliding attention) + # hence following code can be divided into 5 parts. + + if from_seq_len // from_block_size != to_seq_len // to_block_size: + raise ValueError("Error the number of blocks needs to be same!") + + rsqrt_d = 1 / math.sqrt(attention_head_size) + bsz = batch_size + attn_mask_penalty = -10000.0 + + # generate random attention and corresponding masks + np.random.seed(seed) + if from_seq_len in [1024, 3072, 4096]: # old plans used in paper + rand_attn = [ + self._bigbird_block_rand_mask( + self.max_seqlen, self.max_seqlen, from_block_size, to_block_size, n_rand_blocks, last_idx=1024 + )[: (from_seq_len // from_block_size - 2)] + for _ in range(n_heads) + ] + else: + if plan_from_length is None: + plan_from_length, plan_num_rand_blocks = self._get_rand_attn_plan( + from_seq_len, from_block_size, n_rand_blocks + ) + + rand_attn = self._bigbird_block_rand_mask_with_head( + from_seq_length=from_seq_len, + to_seq_length=to_seq_len, + from_block_size=from_block_size, + to_block_size=to_block_size, + num_heads=n_heads, + plan_from_length=plan_from_length, + plan_num_rand_blocks=plan_num_rand_blocks, + ) + + rand_attn = np.stack(rand_attn, axis=0) + rand_attn = torch.tensor(rand_attn, device=query_layer.device, dtype=torch.long) + rand_attn.unsqueeze_(0) + rand_attn = torch.cat([rand_attn for _ in range(batch_size)], dim=0) + + rand_mask = self._create_rand_mask_from_inputs( + from_blocked_mask, to_blocked_mask, rand_attn, n_heads, n_rand_blocks, bsz, from_seq_len, from_block_size + ) + + blocked_query_matrix = query_layer.view(bsz, n_heads, from_seq_len // from_block_size, from_block_size, -1) + blocked_key_matrix = key_layer.view(bsz, n_heads, to_seq_len // to_block_size, to_block_size, -1) + blocked_value_matrix = value_layer.view(bsz, n_heads, to_seq_len // to_block_size, to_block_size, -1) + + # preparing block for randn attn + gathered_key = self.torch_gather_b2(blocked_key_matrix, rand_attn) + gathered_key = gathered_key.view( + bsz, n_heads, to_seq_len // to_block_size - 2, n_rand_blocks * to_block_size, -1 + ) # [bsz, n_heads, to_seq_len//to_block_size-2, n_rand_blocks, to_block_size, -1] + gathered_value = self.torch_gather_b2(blocked_value_matrix, rand_attn) + gathered_value = gathered_value.view( + bsz, n_heads, to_seq_len // to_block_size - 2, n_rand_blocks * to_block_size, -1 + ) # [bsz, n_heads, to_seq_len//to_block_size-2, n_rand_blocks, to_block_size, -1] + + # 1st PART + # 1st block (global block) attention scores + # q[0] x (k[0], k[1], k[2], k[3], k[4] .... ) + + # [bsz, n_heads, from_block_size, -1] x [bsz, n_heads, to_seq_len, -1] ==> [bsz, n_heads, from_block_size, to_seq_len] + first_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, 0], key_layer, ndim=4) + + first_product = first_product * rsqrt_d + first_product += (1.0 - to_mask) * attn_mask_penalty + first_attn_weights = nn.functional.softmax( + first_product, dim=-1 + ) # [bsz, n_heads, from_block_size, to_seq_len] + + # [bsz, n_heads, from_block_size, to_seq_len] x [bsz, n_heads, to_seq_len, -1] ==> [bsz, n_heads, from_block_size, -1] + first_context_layer = self.torch_bmm_nd(first_attn_weights, value_layer, ndim=4) + first_context_layer.unsqueeze_(2) + + # 2nd PART + # 2nd block attention scores + # q[1] x (sliding_keys, random_keys, global_keys) + # sliding key blocks -> 2nd, 3rd blocks + # global key blocks -> 1st block + + second_key_mat = torch.cat( + [ + blocked_key_matrix[:, :, 0], + blocked_key_matrix[:, :, 1], + blocked_key_matrix[:, :, 2], + blocked_key_matrix[:, :, -1], + gathered_key[:, :, 0], + ], + dim=2, + ) # [bsz, n_heads, (4+n_rand_blocks)*to_block_size, -1] + second_value_mat = torch.cat( + [ + blocked_value_matrix[:, :, 0], + blocked_value_matrix[:, :, 1], + blocked_value_matrix[:, :, 2], + blocked_value_matrix[:, :, -1], + gathered_value[:, :, 0], + ], + dim=2, + ) # [bsz, n_heads, (4+n_rand_blocks)*to_block_size, -1] + + # [bsz, n_heads, from_block_size, -1] x [bsz, n_heads, (4+n_rand_blocks)*to_block_size, -1] ==> [bsz, n_heads, from_block_size, (4+n_rand_blocks)*to_block_size] + second_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, 1], second_key_mat, ndim=4) + second_seq_pad = torch.cat( + [ + to_mask[:, :, :, : 3 * to_block_size], + to_mask[:, :, :, -to_block_size:], + to_mask.new_ones([bsz, 1, 1, n_rand_blocks * to_block_size]), + ], + dim=3, + ) + second_rand_pad = torch.cat( + [ + rand_mask.new_ones([bsz, n_heads, from_block_size, 4 * to_block_size]), + rand_mask[:, :, 0], + ], + dim=3, + ) + second_product = second_product * rsqrt_d + second_product += (1.0 - torch.minimum(second_seq_pad, second_rand_pad)) * attn_mask_penalty + second_attn_weights = nn.functional.softmax( + second_product, dim=-1 + ) # [bsz, n_heads, from_block_size, (4+n_rand_blocks)*to_block_size] + + # [bsz, n_heads, from_block_size, (4+n_rand_blocks)*to_block_size] x [bsz, n_heads, (4+n_rand_blocks)*to_block_size, -1] ==> [bsz, n_heads, from_block_size, -1] + second_context_layer = self.torch_bmm_nd(second_attn_weights, second_value_mat, ndim=4) + + second_context_layer.unsqueeze_(2) + + # 3rd PART + # Middle blocks attention scores + # q[-2:2] x (sliding_keys, random_keys, global_keys) + # sliding attn is calculated using special trick of shifting tokens as discussed in paper + # random keys are generated by taking random indices as per `rand_attn` + # global keys -> 1st & last block + + exp_blocked_key_matrix = torch.cat( + [blocked_key_matrix[:, :, 1:-3], blocked_key_matrix[:, :, 2:-2], blocked_key_matrix[:, :, 3:-1]], dim=3 + ) # [bsz, n_heads, from_seq_len//from_block_size-4, 3*to_block_size, -1] + exp_blocked_value_matrix = torch.cat( + [blocked_value_matrix[:, :, 1:-3], blocked_value_matrix[:, :, 2:-2], blocked_value_matrix[:, :, 3:-1]], + dim=3, + ) # [bsz, n_heads, from_seq_len//from_block_size-4, 3*to_block_size, -1] + middle_query_matrix = blocked_query_matrix[:, :, 2:-2] + + # sliding attention scores for q[-2:2] + # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] x [b, n_heads, from_seq_len//from_block_size-4, 3*to_block_size, -1] + inner_band_product = self.torch_bmm_nd_transpose(middle_query_matrix, exp_blocked_key_matrix, ndim=5) + # ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, 3*to_block_size] + inner_band_product = inner_band_product * rsqrt_d + + # randn attention scores for q[-2:2] + # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] x [bsz, n_heads, from_seq_len//from_block_size-4, n_rand_blocks*to_block_size, -1] + rand_band_product = self.torch_bmm_nd_transpose(middle_query_matrix, gathered_key[:, :, 1:-1], ndim=5) + # ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, n_rand_blocks*to_block_size] + rand_band_product = rand_band_product * rsqrt_d + + # Including 1st block (since it's global) + first_band_product = torch.einsum( + "bhlqd,bhkd->bhlqk", middle_query_matrix, blocked_key_matrix[:, :, 0] + ) # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] x [bsz, n_heads, to_block_size, -1] ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, to_block_size] + first_band_product = first_band_product * rsqrt_d + + # Including last block (since it's global) + last_band_product = torch.einsum( + "bhlqd,bhkd->bhlqk", middle_query_matrix, blocked_key_matrix[:, :, -1] + ) # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] x [bsz, n_heads, to_block_size, -1] ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, to_block_size] + last_band_product = last_band_product * rsqrt_d + + # masking padded tokens + inner_band_product += (1.0 - band_mask) * attn_mask_penalty + first_band_product += (1.0 - to_mask[:, :, :, :to_block_size].unsqueeze(3)) * attn_mask_penalty + last_band_product += (1.0 - to_mask[:, :, :, -to_block_size:].unsqueeze(3)) * attn_mask_penalty + rand_band_product += (1.0 - rand_mask[:, :, 1:-1]) * attn_mask_penalty + + # completing attention scores matrix for all q[-2:2] + band_product = torch.cat( + [first_band_product, inner_band_product, rand_band_product, last_band_product], dim=-1 + ) # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, (5+n_rand_blocks)*to_block_size] + + # safely doing softmax since attention matrix is completed + attn_weights = nn.functional.softmax( + band_product, dim=-1 + ) # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, (5+n_rand_blocks)*to_block_size] + + # contribution of sliding keys + # [bsz, n_heads, m//from_block_size-4, from_block_size, 3*to_block_size] x [bsz, n_heads, from_seq_len//from_block_size-4, 3*to_block_size, -1] + context_layer = self.torch_bmm_nd( + attn_weights[:, :, :, :, to_block_size : 4 * to_block_size], exp_blocked_value_matrix, ndim=5 + ) + # ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] + + # adding contribution of random keys + # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, n_rand_blocks*to_block_size] x [bsz, n_heads, from_seq_len//from_block_size-4, n_rand_blocks*to_block_size, -1] + context_layer += self.torch_bmm_nd( + attn_weights[:, :, :, :, 4 * to_block_size : -to_block_size], gathered_value[:, :, 1:-1], ndim=5 + ) + # ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] + + # adding contribution of global keys + context_layer += torch.einsum( + "bhlqk,bhkd->bhlqd", attn_weights[:, :, :, :, :to_block_size], blocked_value_matrix[:, :, 0] + ) # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, to_block_size] x [bsz, n_heads, to_block_size, -1] ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] + context_layer += torch.einsum( + "bhlqk,bhkd->bhlqd", attn_weights[:, :, :, :, -to_block_size:], blocked_value_matrix[:, :, -1] + ) # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, to_block_size] x [bsz, n_heads, to_block_size, -1] ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] + + # 4th PART + # last 2nd token attention scores + # q[-2] x (sliding_keys, random_keys, global_keys) + # sliding key blocks -> last 3 blocks + # global key block -> 1st block + # random key block -> based on indices stored in `randn_attn` + + second_last_key_mat = torch.cat( + [ + blocked_key_matrix[:, :, 0], + blocked_key_matrix[:, :, -3], + blocked_key_matrix[:, :, -2], + blocked_key_matrix[:, :, -1], + gathered_key[:, :, -1], + ], + dim=2, + ) # [bsz, n_heads, (4+n_random_blocks)*to_block_size, -1] + second_last_value_mat = torch.cat( + [ + blocked_value_matrix[:, :, 0], + blocked_value_matrix[:, :, -3], + blocked_value_matrix[:, :, -2], + blocked_value_matrix[:, :, -1], + gathered_value[:, :, -1], + ], + dim=2, + ) # [bsz, n_heads, (4+r)*to_block_size, -1] + + # [bsz, n_heads, from_block_size, -1] x [bsz, n_heads, (4+n_rand_blocks)*to_block_size, -1] ==> [bsz, n_heads, from_block_size, (4+n_rand_blocks)*to_block_size] + second_last_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, -2], second_last_key_mat, ndim=4) + second_last_seq_pad = torch.cat( + [ + to_mask[:, :, :, :to_block_size], + to_mask[:, :, :, -3 * to_block_size :], + to_mask.new_ones([bsz, 1, 1, n_rand_blocks * to_block_size]), + ], + dim=3, + ) + second_last_rand_pad = torch.cat( + [ + rand_mask.new_ones([bsz, n_heads, from_block_size, 4 * to_block_size]), + rand_mask[:, :, -1], + ], + dim=3, + ) + second_last_product = second_last_product * rsqrt_d + second_last_product += (1.0 - torch.minimum(second_last_seq_pad, second_last_rand_pad)) * attn_mask_penalty + second_last_attn_weights = nn.functional.softmax( + second_last_product, dim=-1 + ) # [bsz, n_heads, from_block_size, (4+n_rand_blocks)*to_block_size] + + # [bsz, n_heads, from_block_size, (4+n_rand_blocks)*to_block_size] x [bsz, n_heads, (4+n_rand_blocks)*to_block_size, -1] ==> [bsz, n_heads, from_block_size, -1] + second_last_context_layer = self.torch_bmm_nd(second_last_attn_weights, second_last_value_mat, ndim=4) + second_last_context_layer.unsqueeze_(2) + + # 5th PART + # last block (global) attention scores + # q[-1] x (k[0], k[1], k[2], k[3], .... ) + + # [bsz, n_heads, from_block_size, -1] x [bsz, n_heads, to_seq_len, -1] ==> [bsz, n_heads, from_block_size, to_seq_len] + last_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, -1], key_layer, ndim=4) + last_product = last_product * rsqrt_d + last_product += (1.0 - to_mask) * attn_mask_penalty + last_attn_weights = nn.functional.softmax(last_product, dim=-1) # [bsz, n_heads, from_block_size, n] + + # [bsz, n_heads, from_block_size, to_seq_len] x [bsz, n_heads, to_seq_len, -1] ==> [bsz, n_heads, from_block_size, -1] + last_context_layer = self.torch_bmm_nd(last_attn_weights, value_layer, ndim=4) + last_context_layer.unsqueeze_(2) + + # combining representations of all tokens + context_layer = torch.cat( + [first_context_layer, second_context_layer, context_layer, second_last_context_layer, last_context_layer], + dim=2, + ) + context_layer = context_layer.view((bsz, n_heads, from_seq_len, -1)) * from_mask + context_layer = torch.transpose(context_layer, 1, 2) + + # this is just for visualizing; forward pass doesn't depend on following code + if output_attentions: + # TODO(PVP): need to verify if below code is correct + attention_probs = torch.zeros( + bsz, n_heads, from_seq_len, to_seq_len, dtype=torch.float, device=context_layer.device + ) + + # 1st query block + # corresponding to `first_context_layer` + attention_probs[:, :, :from_block_size, :] = first_attn_weights # all keys global + + # 2nd query block + # corresponding to `second_context_layer` + attention_probs[:, :, from_block_size : 2 * from_block_size, : 3 * to_block_size] = second_attn_weights[ + :, :, :, : 3 * to_block_size + ] # 1st three key blocks (global + sliding) + attention_probs[:, :, from_block_size : 2 * from_block_size, -to_block_size:] = second_attn_weights[ + :, :, :, 3 * to_block_size : 4 * to_block_size + ] # last key block (global) + # random keys + for p1, i1, w1 in zip(range(bsz), rand_attn, second_attn_weights): + # p1, i1, w1 corresponds to batch_dim i.e. following operation is done for each sequence in batch + for p2, i2, w2 in zip(range(n_heads), i1, w1): + # p2, i2, w2 corresponds to head_dim i.e. following operation is done for each heads + attn_probs_view = attention_probs.view( + bsz, + n_heads, + from_seq_len // from_block_size, + from_block_size, + to_seq_len // to_block_size, + to_block_size, + ) + right_slice = w2[:, 4 * to_block_size :] + attn_probs_view[p1, p2, 1, :, i2[0]] = right_slice.view( + from_block_size, n_rand_blocks, to_block_size + ) + + # Middle query blocks + # corresponding to `context_layer` + # sliding keys + for q_idx in range(from_seq_len // from_block_size - 4): + attn_probs_view = attention_probs.view( + bsz, + n_heads, + from_seq_len // from_block_size, + from_block_size, + to_seq_len // to_block_size, + to_block_size, + )[:, :, 2:-2, :, 1:-1, :] + right_slice = attn_weights[:, :, q_idx, :, to_block_size : 4 * to_block_size] + attn_probs_view[:, :, q_idx, :, q_idx : q_idx + 3, :] = right_slice.view( + bsz, n_heads, from_block_size, 3, to_block_size + ) # inner_band_product + # global keys (corresponding to 1st key block) + attention_probs[:, :, 2 * from_block_size : -2 * from_block_size, :to_block_size] = attn_weights[ + :, :, :, :, :to_block_size + ].view(bsz, n_heads, -1, to_block_size) # first_band_product + # global keys (corresponding to last key block) + attention_probs[:, :, 2 * from_block_size : -2 * from_block_size, -to_block_size:] = attn_weights[ + :, :, :, :, -to_block_size: + ].view(bsz, n_heads, -1, to_block_size) # last_band_product + # random keys + for p1, i1, w1 in zip(range(bsz), rand_attn, attn_weights): + # p1, i1, w1 corresponds to batch_dim i.e. following operation is done for each sequence in batch + for p2, i2, w2 in zip(range(n_heads), i1, w1): + # p2, i2, w2 corresponds to head_dim i.e. following operation is done for each heads + for q_idx in range(1, len(i2) - 1): + attn_probs_view = attention_probs.view( + bsz, + n_heads, + from_seq_len // from_block_size, + from_block_size, + to_seq_len // to_block_size, + to_block_size, + ) + right_slice = w2[q_idx - 1, :, 4 * to_block_size : -to_block_size] + attn_probs_view[p1, p2, q_idx + 1, :, i2[q_idx]] = right_slice.view( + from_block_size, n_rand_blocks, to_block_size + ) + + # Second-last query block + # corresponding to `second_last_context_layer` + attention_probs[:, :, -2 * from_block_size : -from_block_size, :to_block_size] = second_last_attn_weights[ + :, :, :, :to_block_size + ] # 1st key block (global) + attention_probs[:, :, -2 * from_block_size : -from_block_size, -3 * to_block_size :] = ( + second_last_attn_weights[:, :, :, to_block_size : 4 * to_block_size] + ) # last three blocks (global + sliding) + # random keys + for p1, i1, w1 in zip(range(bsz), rand_attn, second_last_attn_weights): + # p1, i1, w1 corresponds to batch_dim i.e. following operation is done for each sequence in batch + for p2, i2, w2 in zip(range(n_heads), i1, w1): + # p2, i2, w2 corresponds to head_dim i.e. following operation is done for each heads + attn_probs_view = attention_probs.view( + bsz, + n_heads, + from_seq_len // from_block_size, + from_block_size, + to_seq_len // to_block_size, + to_block_size, + ) + right_slice = w2[:, 4 * to_block_size :] + attn_probs_view[p1, p2, -2, :, i2[-1]] = right_slice.view( + from_block_size, n_rand_blocks, to_block_size + ) + + # last query block + # corresponding to `last_context_layer` + attention_probs[:, :, -from_block_size:, :] = last_attn_weights # all keys global + + else: + attention_probs = None + + return context_layer, attention_probs + + @staticmethod + def torch_gather_b2(params, indices): + # this operation is equivalent to tf.gather when batch_dims=2 + + if params.shape[:2] != indices.shape[:2]: + raise ValueError( + "Make sure that the first two dimensions of params and indices are identical, but" + f" they are params: {params.shape[:2]} vs. indices: {indices.shape[:2]}" + ) + num_indices_to_gather = indices.shape[-2] * indices.shape[-1] + num_indices_to_pick_from = params.shape[2] + + shift = torch.arange(indices.shape[0] * indices.shape[1] * num_indices_to_gather, device=indices.device) + indices_shift = torch.div(shift, num_indices_to_gather, rounding_mode="floor") * num_indices_to_pick_from + + flattened_indices = indices.view(-1) + indices_shift + flattened_params = params.reshape(-1, params.shape[-2], params.shape[-1]) + + out_flattened = flattened_params.index_select(0, flattened_indices) + + out = out_flattened.reshape(params.shape[:2] + (num_indices_to_gather,) + params.shape[3:]) + return out + + @staticmethod + def _create_rand_mask_from_inputs( + from_blocked_mask, + to_blocked_mask, + rand_attn, + num_attention_heads, + num_rand_blocks, + batch_size, + from_seq_length, + from_block_size, + ): + """ + Create 3D attention mask from a 2D tensor mask. + + Args: + from_blocked_mask: 2D Tensor of shape [batch_size, + from_seq_length//from_block_size, from_block_size]. + to_blocked_mask: int32 Tensor of shape [batch_size, + to_seq_length//to_block_size, to_block_size]. + rand_attn: [batch_size, num_attention_heads, + from_seq_length//from_block_size-2, num_rand_blocks] + num_attention_heads: int. Number of attention heads. + num_rand_blocks: int. Number of random chunks per row. + batch_size: int. Batch size for computation. + from_seq_length: int. length of from sequence. + from_block_size: int. size of block in from sequence. + + Returns: + float Tensor of shape [batch_size, num_attention_heads, from_seq_length//from_block_size-2, + from_block_size, num_rand_blocks*to_block_size]. + """ + num_windows = from_seq_length // from_block_size - 2 + rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)]) + rand_mask = rand_mask.view(batch_size, num_attention_heads, num_windows, num_rand_blocks * from_block_size) + rand_mask = torch.einsum("blq,bhlk->bhlqk", from_blocked_mask[:, 1:-1], rand_mask) + return rand_mask + + @staticmethod + def _get_rand_attn_plan(from_seq_length, from_block_size, num_rand_blocks): + """ + Gives the plan of where to put random attention. + + Args: + from_seq_length: int. length of from sequence. + from_block_size: int. size of block in from sequence. + num_rand_blocks: int. Number of random chunks per row. + + Returns: + plan_from_length: ending location of from block plan_num_rand_blocks: number of random ending location for + each block + """ + + plan_from_length = [] + plan_num_rand_blocks = [] + if (2 * num_rand_blocks + 5) < (from_seq_length // from_block_size): + plan_from_length.append(int((2 * num_rand_blocks + 5) * from_block_size)) + plan_num_rand_blocks.append(num_rand_blocks) + plan_from_length.append(from_seq_length) + plan_num_rand_blocks.append(0) + elif (num_rand_blocks + 5) < (from_seq_length // from_block_size): + plan_from_length.append(int((num_rand_blocks + 5) * from_block_size)) + plan_num_rand_blocks.append(num_rand_blocks // 2) + plan_from_length.append(from_seq_length) + plan_num_rand_blocks.append(num_rand_blocks - (num_rand_blocks // 2)) + else: + plan_from_length.append(from_seq_length) + plan_num_rand_blocks.append(num_rand_blocks) + + return plan_from_length, plan_num_rand_blocks + + def _bigbird_block_rand_mask( + self, from_seq_length, to_seq_length, from_block_size, to_block_size, num_rand_blocks, last_idx=-1 + ): + """ + Create adjacency list of random attention. + + Args: + from_seq_length: int. length of from sequence. + to_seq_length: int. length of to sequence. + from_block_size: int. size of block in from sequence. + to_block_size: int. size of block in to sequence. + num_rand_blocks: int. Number of random chunks per row. + last_idx: if -1 then num_rand_blocks blocks chosen anywhere in to sequence, + if positive then num_rand_blocks blocks chosen only up to last_idx. + + Returns: + adjacency list of size from_seq_length//from_block_size-2 by num_rand_blocks + """ + # using this method when from_seq_length in [1024, 3072, 4096] + + if from_seq_length // from_block_size != to_seq_length // to_block_size: + raise ValueError("Error the number of blocks needs to be same!") + + rand_attn = np.zeros((from_seq_length // from_block_size - 2, num_rand_blocks), dtype=np.int32) + # During inference (eval) no randomness + if not self.training: + return rand_attn + middle_seq = np.arange(1, to_seq_length // to_block_size - 1, dtype=np.int32) + last = to_seq_length // to_block_size - 1 + if last_idx > (2 * to_block_size): + last = (last_idx // to_block_size) - 1 + + r = num_rand_blocks # shorthand + for i in range(1, from_seq_length // from_block_size - 1): + start = i - 2 + end = i + if i == 1: + rand_attn[i - 1, :] = np.random.permutation(middle_seq[2:last])[:r] + elif i == 2: + rand_attn[i - 1, :] = np.random.permutation(middle_seq[3:last])[:r] + elif i == from_seq_length // from_block_size - 3: + rand_attn[i - 1, :] = np.random.permutation(middle_seq[:last])[:r] + # Missing -3: should have been sliced till last-3 + elif i == from_seq_length // from_block_size - 2: + rand_attn[i - 1, :] = np.random.permutation(middle_seq[:last])[:r] + # Missing -4: should have been sliced till last-4 + else: + if start > last: + start = last + rand_attn[i - 1, :] = np.random.permutation(middle_seq[:start])[:r] + elif (end + 1) == last: + rand_attn[i - 1, :] = np.random.permutation(middle_seq[:start])[:r] + else: + rand_attn[i - 1, :] = np.random.permutation( + np.concatenate((middle_seq[:start], middle_seq[end + 1 : last])) + )[:r] + return rand_attn + + def _bigbird_block_rand_mask_with_head( + self, + from_seq_length, + to_seq_length, + from_block_size, + to_block_size, + num_heads, + plan_from_length, + plan_num_rand_blocks, + window_block_left=1, + window_block_right=1, + global_block_top=1, + global_block_bottom=1, + global_block_left=1, + global_block_right=1, + ): + """ + Create adjacency list of random attention. + + Args: + from_seq_length: int. length of from sequence. + to_seq_length: int. length of to sequence. + from_block_size: int. size of block in from sequence. + to_block_size: int. size of block in to sequence. + num_heads: int. total number of heads. + plan_from_length: list. plan from length where num_random_blocks are chosen from. + plan_num_rand_blocks: list. number of rand blocks within the plan. + window_block_left: int. number of blocks of window to left of a block. + window_block_right: int. number of blocks of window to right of a block. + global_block_top: int. number of blocks at the top. + global_block_bottom: int. number of blocks at the bottom. + global_block_left: int. Number of blocks globally used to the left. + global_block_right: int. Number of blocks globally used to the right. + + Returns: + adjacency list of size num_head where each element is of size from_seq_length//from_block_size-2 by + num_rand_blocks + """ + # using this method when from_seq_length not in [1024, 3072, 4096] + + if from_seq_length // from_block_size != to_seq_length // to_block_size: + raise ValueError("Error the number of blocks needs to be same!") + + if from_seq_length not in plan_from_length: + raise ValueError("Error from sequence length not in plan!") + + # Total number of blocks in the mmask + num_blocks = from_seq_length // from_block_size + # Number of blocks per plan + plan_block_length = np.array(plan_from_length) // from_block_size + # till when to follow plan + max_plan_idx = plan_from_length.index(from_seq_length) + + # Random Attention adjacency list + rand_attn = [ + np.zeros((num_blocks, np.sum(plan_num_rand_blocks[: max_plan_idx + 1])), dtype=np.int32) + for i in range(num_heads) + ] + # During inference (eval) no randomness + if not self.training: + for nh in range(num_heads): + rand_attn[nh] = rand_attn[nh][global_block_top : num_blocks - global_block_bottom, :] + return rand_attn + + # We will go iteratively over the plan blocks and pick random number of + # Attention blocks from the legally allowed blocks + for plan_idx in range(max_plan_idx + 1): + rnd_r_cnt = 0 + if plan_idx > 0: + # set the row for all from_blocks starting from 0 to + # plan_block_length[plan_idx-1] + # column indx start fromm plan_block_length[plan_idx-1] and ends at + # plan_block_length[plan_idx] + if plan_num_rand_blocks[plan_idx] > 0: + rnd_r_cnt = int(np.sum(plan_num_rand_blocks[:plan_idx])) + curr_r_cnt = int(np.sum(plan_num_rand_blocks[: plan_idx + 1])) + for blk_rw_idx in range(global_block_top, plan_block_length[plan_idx - 1]): + for h in range(num_heads): + rand_attn[h][blk_rw_idx, rnd_r_cnt:curr_r_cnt] = self._get_single_block_row_attention( + block_id=blk_rw_idx, + to_start_block_id=plan_block_length[plan_idx - 1], + to_end_block_id=plan_block_length[plan_idx], + num_rand_blocks=plan_num_rand_blocks[plan_idx], + window_block_left=window_block_left, + window_block_right=window_block_right, + global_block_left=global_block_left, + global_block_right=global_block_right, + ) + + for pl_id in range(plan_idx): + if plan_num_rand_blocks[pl_id] == 0: + continue + for blk_rw_idx in range(plan_block_length[plan_idx - 1], plan_block_length[plan_idx]): + rnd_r_cnt = 0 + to_start_block_id = 0 + if pl_id > 0: + rnd_r_cnt = int(np.sum(plan_num_rand_blocks[:pl_id])) + to_start_block_id = plan_block_length[pl_id - 1] + curr_r_cnt = int(np.sum(plan_num_rand_blocks[: pl_id + 1])) + for h in range(num_heads): + rand_attn[h][blk_rw_idx, rnd_r_cnt:curr_r_cnt] = self._get_single_block_row_attention( + block_id=blk_rw_idx, + to_start_block_id=to_start_block_id, + to_end_block_id=plan_block_length[pl_id], + num_rand_blocks=plan_num_rand_blocks[pl_id], + window_block_left=window_block_left, + window_block_right=window_block_right, + global_block_left=global_block_left, + global_block_right=global_block_right, + ) + + if plan_num_rand_blocks[plan_idx] == 0: + continue + curr_r_cnt = int(np.sum(plan_num_rand_blocks[: plan_idx + 1])) + from_start_block_id = global_block_top + to_start_block_id = 0 + if plan_idx > 0: + rnd_r_cnt = int(np.sum(plan_num_rand_blocks[:plan_idx])) + from_start_block_id = plan_block_length[plan_idx - 1] + to_start_block_id = plan_block_length[plan_idx - 1] + + for blk_rw_idx in range(from_start_block_id, plan_block_length[plan_idx]): + for h in range(num_heads): + rand_attn[h][blk_rw_idx, rnd_r_cnt:curr_r_cnt] = self._get_single_block_row_attention( + block_id=blk_rw_idx, + to_start_block_id=to_start_block_id, + to_end_block_id=plan_block_length[plan_idx], + num_rand_blocks=plan_num_rand_blocks[plan_idx], + window_block_left=window_block_left, + window_block_right=window_block_right, + global_block_left=global_block_left, + global_block_right=global_block_right, + ) + + for nh in range(num_heads): + rand_attn[nh] = rand_attn[nh][global_block_top : num_blocks - global_block_bottom, :] + + return rand_attn + + @staticmethod + def _get_single_block_row_attention( + block_id, + to_start_block_id, + to_end_block_id, + num_rand_blocks, + window_block_left=1, + window_block_right=1, + global_block_left=1, + global_block_right=1, + ): + """ + For a single row block get random row attention. + + Args: + block_id: int. block id of row. + to_start_block_id: int. random attention column start id. + to_end_block_id: int. random attention column end id. + num_rand_blocks: int. number of random blocks to be selected. + window_block_left: int. number of blocks of window to left of a block. + window_block_right: int. number of blocks of window to right of a block. + global_block_left: int. Number of blocks globally used to the left. + global_block_right: int. Number of blocks globally used to the right. + + Returns: + row containing the random attention vector of size num_rand_blocks. + """ + # list of to_blocks from which to choose random attention + to_block_list = np.arange(to_start_block_id, to_end_block_id, dtype=np.int32) + # permute the blocks + perm_block = np.random.permutation(to_block_list) + + # illegal blocks for the current block id, using window + illegal_blocks = list(range(block_id - window_block_left, block_id + window_block_right + 1)) + + # Add blocks at the start and at the end + illegal_blocks.extend(list(range(global_block_left))) + illegal_blocks.extend(list(range(to_end_block_id - global_block_right, to_end_block_id))) + + # The second from_block cannot choose random attention on second last to_block + if block_id == 1: + illegal_blocks.append(to_end_block_id - 2) + + # The second last from_block cannot choose random attention on second to_block + if block_id == to_end_block_id - 2: + illegal_blocks.append(1) + + selected_random_blokcs = [] + + for i in range(to_end_block_id - to_start_block_id): + if perm_block[i] not in illegal_blocks: + selected_random_blokcs.append(perm_block[i]) + if len(selected_random_blokcs) == num_rand_blocks: + break + return np.array(selected_random_blokcs, dtype=np.int32) + + +# Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert->BigBird +class BigBirdSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BigBirdAttention(nn.Module): + def __init__(self, config, seed=None): + super().__init__() + self.attention_type = config.attention_type + self.config = config + self.seed = seed + + if self.config.attention_type == "original_full": + self.self = BigBirdSelfAttention(config) + elif self.config.attention_type == "block_sparse": + self.self = BigBirdBlockSparseAttention(config, seed) + else: + raise ValueError( + f"attention_type can either be original_full or block_sparse, but is {self.config.attention_type}" + ) + + self.output = BigBirdSelfOutput(config) + + def set_attention_type(self, value: str): + if value not in ["original_full", "block_sparse"]: + raise ValueError( + f"attention_type can only be set to either 'original_full' or 'block_sparse', but is {value}" + ) + # attention type is already correctly set + if value == self.attention_type: + return + + self.attention_type = value + if value == "original_full": + # copy all weights to new full attention class + attn_weights = BigBirdSelfAttention(self.config) + else: + # copy all weights to new sparse attention class + attn_weights = BigBirdBlockSparseAttention(self.config, self.seed) + + attn_weights.query = self.self.query + attn_weights.value = self.self.value + attn_weights.key = self.self.key + self.self = attn_weights + self.attention_type = value + if not self.training: + self.self.eval() + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + # block_sparse config + band_mask=None, + from_mask=None, + to_mask=None, + from_blocked_mask=None, + to_blocked_mask=None, + ): + # fp16 compatibility + if band_mask is not None: + band_mask = band_mask.to(hidden_states.dtype) + if from_mask is not None: + from_mask = from_mask.to(hidden_states.dtype) + if to_mask is not None: + to_mask = to_mask.to(hidden_states.dtype) + if self.attention_type == "original_full": + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + else: + if encoder_hidden_states is not None: + raise ValueError("BigBird cannot be used as a decoder when config.attention_type != 'original_full'") + self_outputs = self.self( + hidden_states, band_mask, from_mask, to_mask, from_blocked_mask, to_blocked_mask, output_attentions + ) + + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->BigBird +class BigBirdIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->BigBird +class BigBirdOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BigBirdLayer(nn.Module): + def __init__(self, config, seed=None): + super().__init__() + self.config = config + self.attention_type = config.attention_type + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = BigBirdAttention(config, seed=seed) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + if not self.is_decoder: + raise TypeError(f"{self} should be used as a decoder model if cross attention is added") + self.crossattention = BigBirdAttention(config) + self.intermediate = BigBirdIntermediate(config) + self.output = BigBirdOutput(config) + + def set_attention_type(self, value: str): + if value not in ["original_full", "block_sparse"]: + raise ValueError( + f"attention_type can only be set to either 'original_full' or 'block_sparse', but is {value}" + ) + # attention type is already correctly set + if value == self.attention_type: + return + self.attention_type = value + self.attention.set_attention_type(value) + + if self.add_cross_attention: + self.crossattention.set_attention_type(value) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + band_mask=None, + from_mask=None, + to_mask=None, + blocked_encoder_mask=None, + past_key_value=None, + output_attentions=False, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_value=self_attn_past_key_value, + output_attentions=output_attentions, + band_mask=band_mask, + from_mask=from_mask, + to_mask=to_mask, + from_blocked_mask=blocked_encoder_mask, + to_blocked_mask=blocked_encoder_mask, + ) + attention_output = self_attention_outputs[0] + + # if decoder, the last output is tuple of self-attn cache + if self.is_decoder: + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + else: + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + cross_attn_present_key_value = None + if self.is_decoder and encoder_hidden_states is not None: + if not hasattr(self, "crossattention"): + raise ValueError( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with " + " cross-attention layers by setting `config.add_cross_attention=True`" + ) + + # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + cross_attn_past_key_value, + output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights + + # add cross-attn cache to positions 3,4 of present_key_value tuple + cross_attn_present_key_value = cross_attention_outputs[-1] + present_key_value = present_key_value + cross_attn_present_key_value + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output + ) + + outputs = (layer_output,) + outputs + + # if decoder, return the attn key/values as the last output + if self.is_decoder: + outputs = outputs + (present_key_value,) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +class BigBirdEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.attention_type = config.attention_type + + self.layer = nn.ModuleList( + [BigBirdLayer(config, seed=layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.gradient_checkpointing = False + + def set_attention_type(self, value: str): + if value not in ["original_full", "block_sparse"]: + raise ValueError( + f"attention_type can only be set to either 'original_full' or 'block_sparse', but is {value}" + ) + # attention type is already correctly set + if value == self.attention_type: + return + self.attention_type = value + for layer in self.layer: + layer.set_attention_type(value) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + band_mask=None, + from_mask=None, + to_mask=None, + blocked_encoder_mask=None, + return_dict=True, + ) -> Union[BaseModelOutputWithPastAndCrossAttentions, Tuple]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + next_decoder_cache = () if use_cache else None + + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + layer_module.__call__, + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + band_mask, + from_mask, + to_mask, + blocked_encoder_mask, + past_key_value, + output_attentions, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + band_mask, + from_mask, + to_mask, + blocked_encoder_mask, + past_key_value, + output_attentions, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +# Copied from transformers.models.bert.modeling_bert.BertPredictionHeadTransform with Bert->BigBird +class BigBirdPredictionHeadTransform(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertLMPredictionHead with Bert->BigBird +class BigBirdLMPredictionHead(nn.Module): + def __init__(self, config): + super().__init__() + self.transform = BigBirdPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` + self.decoder.bias = self.bias + + def _tie_weights(self): + self.decoder.bias = self.bias + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertOnlyMLMHead with Bert->BigBird +class BigBirdOnlyMLMHead(nn.Module): + def __init__(self, config): + super().__init__() + self.predictions = BigBirdLMPredictionHead(config) + + def forward(self, sequence_output: torch.Tensor) -> torch.Tensor: + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +# Copied from transformers.models.bert.modeling_bert.BertOnlyNSPHead with Bert->BigBird +class BigBirdOnlyNSPHead(nn.Module): + def __init__(self, config): + super().__init__() + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, pooled_output): + seq_relationship_score = self.seq_relationship(pooled_output) + return seq_relationship_score + + +# Copied from transformers.models.bert.modeling_bert.BertPreTrainingHeads with Bert->BigBird +class BigBirdPreTrainingHeads(nn.Module): + def __init__(self, config): + super().__init__() + self.predictions = BigBirdLMPredictionHead(config) + self.seq_relationship = nn.Linear(config.hidden_size, 2) + + def forward(self, sequence_output, pooled_output): + prediction_scores = self.predictions(sequence_output) + seq_relationship_score = self.seq_relationship(pooled_output) + return prediction_scores, seq_relationship_score + + +class BigBirdPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = BigBirdConfig + load_tf_weights = load_tf_weights_in_big_bird + base_model_prefix = "bert" + supports_gradient_checkpointing = True + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, BigBirdLMPredictionHead): + module.bias.data.zero_() + + +BIG_BIRD_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use + it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`BigBirdConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +BIG_BIRD_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + position_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert *input_ids* indices into associated vectors than the + model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@dataclass +class BigBirdForPreTrainingOutput(ModelOutput): + """ + Output type of [`BigBirdForPreTraining`]. + + Args: + loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): + Total loss as the sum of the masked language modeling loss and the next sequence prediction + (classification) loss. + prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`): + Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation + before SoftMax). + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[torch.FloatTensor] = None + prediction_logits: Optional[torch.FloatTensor] = None + seq_relationship_logits: Optional[torch.FloatTensor] = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +class BigBirdForQuestionAnsweringModelOutput(ModelOutput): + """ + Base class for outputs of question answering models. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Total span extraction loss is the sum of a Cross-Entropy for the start and end positions. + start_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): + Span-start scores (before SoftMax). + end_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length)`): + Span-end scores (before SoftMax). + pooler_output (`torch.FloatTensor` of shape `(batch_size, 1)`): + pooler output from BigBigModel + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of + shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[torch.FloatTensor] = None + start_logits: Optional[torch.FloatTensor] = None + end_logits: Optional[torch.FloatTensor] = None + pooler_output: Optional[torch.FloatTensor] = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +@add_start_docstrings( + "The bare BigBird Model transformer outputting raw hidden-states without any specific head on top.", + BIG_BIRD_START_DOCSTRING, +) +class BigBirdModel(BigBirdPreTrainedModel): + """ + + The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of + cross-attention is added between the self-attention layers, following the architecture described in [Attention is + all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, + Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. + + To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set + to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and + `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass. + """ + + def __init__(self, config, add_pooling_layer=True): + super().__init__(config) + self.attention_type = self.config.attention_type + self.config = config + + self.block_size = self.config.block_size + + self.embeddings = BigBirdEmbeddings(config) + self.encoder = BigBirdEncoder(config) + + if add_pooling_layer: + self.pooler = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + else: + self.pooler = None + self.activation = None + + if self.attention_type != "original_full" and config.add_cross_attention: + logger.warning( + "When using `BigBirdForCausalLM` as decoder, then `attention_type` must be `original_full`. Setting" + " `attention_type=original_full`" + ) + self.set_attention_type("original_full") + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def set_attention_type(self, value: str): + if value not in ["original_full", "block_sparse"]: + raise ValueError( + f"attention_type can only be set to either 'original_full' or 'block_sparse', but is {value}" + ) + # attention type is already correctly set + if value == self.attention_type: + return + self.attention_type = value + self.encoder.set_attention_type(value) + + @add_start_docstrings_to_model_forward(BIG_BIRD_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutputWithPoolingAndCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, # NOOP kwargs, for now + ) -> Union[BaseModelOutputWithPoolingAndCrossAttentions, Tuple[torch.FloatTensor]]: + r""" + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.config.is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + batch_size, seq_length = input_shape + device = input_ids.device if input_ids is not None else inputs_embeds.device + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + + if attention_mask is None: + attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device) + if token_type_ids is None: + if hasattr(self.embeddings, "token_type_ids"): + buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + # in order to use block_sparse attention, sequence_length has to be at least + # bigger than all global attentions: 2 * block_size + # + sliding tokens: 3 * block_size + # + random tokens: 2 * num_random_blocks * block_size + max_tokens_to_attend = (5 + 2 * self.config.num_random_blocks) * self.config.block_size + if self.attention_type == "block_sparse" and seq_length <= max_tokens_to_attend: + # change attention_type from block_sparse to original_full + sequence_length = input_ids.size(1) if input_ids is not None else inputs_embeds.size(1) + logger.warning( + "Attention type 'block_sparse' is not possible if sequence_length: " + f"{sequence_length} <= num global tokens: 2 * config.block_size " + "+ min. num sliding tokens: 3 * config.block_size " + "+ config.num_random_blocks * config.block_size " + "+ additional buffer: config.num_random_blocks * config.block_size " + f"= {max_tokens_to_attend} with config.block_size " + f"= {self.config.block_size}, config.num_random_blocks " + f"= {self.config.num_random_blocks}. " + "Changing attention type to 'original_full'..." + ) + self.set_attention_type("original_full") + + if self.attention_type == "block_sparse": + ( + padding_len, + input_ids, + attention_mask, + token_type_ids, + position_ids, + inputs_embeds, + ) = self._pad_to_block_size( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + pad_token_id=self.config.pad_token_id, + ) + else: + padding_len = 0 + + if self.attention_type == "block_sparse": + blocked_encoder_mask, band_mask, from_mask, to_mask = self.create_masks_for_block_sparse_attn( + attention_mask, self.block_size + ) + extended_attention_mask = None + + elif self.attention_type == "original_full": + blocked_encoder_mask = None + band_mask = None + from_mask = None + to_mask = None + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape) + else: + raise ValueError( + f"attention_type can either be original_full or block_sparse, but is {self.attention_type}" + ) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + band_mask=band_mask, + from_mask=from_mask, + to_mask=to_mask, + blocked_encoder_mask=blocked_encoder_mask, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + + pooler_output = self.activation(self.pooler(sequence_output[:, 0, :])) if (self.pooler is not None) else None + + # undo padding + if padding_len > 0: + # unpad `sequence_output` because the calling function is expecting a length == input_ids.size(1) + sequence_output = sequence_output[:, :-padding_len] + + if not return_dict: + return (sequence_output, pooler_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooler_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + @staticmethod + def create_masks_for_block_sparse_attn(attention_mask: torch.Tensor, block_size: int): + batch_size, seq_length = attention_mask.size() + if seq_length % block_size != 0: + raise ValueError( + f"Sequence length must be multiple of block size, but sequence length is {seq_length}, while block" + f" size is {block_size}." + ) + + def create_band_mask_from_inputs(from_blocked_mask, to_blocked_mask): + """ + Create 3D attention mask from a 2D tensor mask. + + Args: + from_blocked_mask: 2D Tensor of shape [batch_size, + from_seq_length//from_block_size, from_block_size]. + to_blocked_mask: int32 Tensor of shape [batch_size, + to_seq_length//to_block_size, to_block_size]. + + Returns: + float Tensor of shape [batch_size, 1, from_seq_length//from_block_size-4, from_block_size, + 3*to_block_size]. + """ + exp_blocked_to_pad = torch.cat( + [to_blocked_mask[:, 1:-3], to_blocked_mask[:, 2:-2], to_blocked_mask[:, 3:-1]], dim=2 + ) + band_mask = torch.einsum("blq,blk->blqk", from_blocked_mask[:, 2:-2], exp_blocked_to_pad) + band_mask.unsqueeze_(1) + return band_mask + + blocked_encoder_mask = attention_mask.view(batch_size, seq_length // block_size, block_size) + band_mask = create_band_mask_from_inputs(blocked_encoder_mask, blocked_encoder_mask) + + from_mask = attention_mask.view(batch_size, 1, seq_length, 1) + to_mask = attention_mask.view(batch_size, 1, 1, seq_length) + + return blocked_encoder_mask, band_mask, from_mask, to_mask + + def _pad_to_block_size( + self, + input_ids: torch.Tensor, + attention_mask: torch.Tensor, + token_type_ids: torch.Tensor, + position_ids: torch.Tensor, + inputs_embeds: torch.Tensor, + pad_token_id: int, + ): + """A helper function to pad tokens and mask to work with implementation of BigBird block-sparse attention.""" + # padding + block_size = self.config.block_size + + input_shape = input_ids.shape if input_ids is not None else inputs_embeds.shape + batch_size, seq_len = input_shape[:2] + + padding_len = (block_size - seq_len % block_size) % block_size + if padding_len > 0: + logger.warning_once( + f"Input ids are automatically padded from {seq_len} to {seq_len + padding_len} to be a multiple of " + f"`config.block_size`: {block_size}" + ) + if input_ids is not None: + input_ids = nn.functional.pad(input_ids, (0, padding_len), value=pad_token_id) + if position_ids is not None: + # pad with position_id = pad_token_id as in modeling_bigbird.BigBirdEmbeddings + position_ids = nn.functional.pad(position_ids, (0, padding_len), value=pad_token_id) + if inputs_embeds is not None: + input_ids_padding = inputs_embeds.new_full( + (batch_size, padding_len), + self.config.pad_token_id, + dtype=torch.long, + ) + inputs_embeds_padding = self.embeddings(input_ids_padding) + inputs_embeds = torch.cat([inputs_embeds, inputs_embeds_padding], dim=-2) + + attention_mask = nn.functional.pad( + attention_mask, (0, padding_len), value=False + ) # no attention on the padding tokens + token_type_ids = nn.functional.pad(token_type_ids, (0, padding_len), value=0) # pad with token_type_id = 0 + + return padding_len, input_ids, attention_mask, token_type_ids, position_ids, inputs_embeds + + +class BigBirdForPreTraining(BigBirdPreTrainedModel): + _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"] + + def __init__(self, config): + super().__init__(config) + + self.bert = BigBirdModel(config, add_pooling_layer=True) + self.cls = BigBirdPreTrainingHeads(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + self.cls.predictions.bias = new_embeddings.bias + + @add_start_docstrings_to_model_forward(BIG_BIRD_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=BigBirdForPreTrainingOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.FloatTensor] = None, + next_sentence_label: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[BigBirdForPreTrainingOutput, Tuple[torch.FloatTensor]]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + next_sentence_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the next sequence prediction (classification) loss. If specified, nsp loss will be + added to masked_lm loss. Input should be a sequence pair (see `input_ids` docstring) Indices should be in + `[0, 1]`: + + - 0 indicates sequence B is a continuation of sequence A, + - 1 indicates sequence B is a random sequence. + kwargs (`Dict[str, any]`, *optional*, defaults to `{}`): + Used to hide legacy arguments that have been deprecated. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, BigBirdForPreTraining + >>> import torch + + >>> tokenizer = AutoTokenizer.from_pretrained("google/bigbird-roberta-base") + >>> model = BigBirdForPreTraining.from_pretrained("google/bigbird-roberta-base") + + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + + >>> prediction_logits = outputs.prediction_logits + >>> seq_relationship_logits = outputs.seq_relationship_logits + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output, pooled_output = outputs[:2] + prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output) + + total_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + total_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + + if next_sentence_label is not None and total_loss is not None: + next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1)) + total_loss = total_loss + next_sentence_loss + + if not return_dict: + output = (prediction_scores, seq_relationship_score) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return BigBirdForPreTrainingOutput( + loss=total_loss, + prediction_logits=prediction_scores, + seq_relationship_logits=seq_relationship_score, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings("""BigBird Model with a `language modeling` head on top.""", BIG_BIRD_START_DOCSTRING) +class BigBirdForMaskedLM(BigBirdPreTrainedModel): + _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"] + + def __init__(self, config): + super().__init__(config) + + if config.is_decoder: + logger.warning( + "If you want to use `BigBirdForMaskedLM` make sure `config.is_decoder=False` for " + "bi-directional self-attention." + ) + + self.bert = BigBirdModel(config) + self.cls = BigBirdOnlyMLMHead(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + self.cls.predictions.bias = new_embeddings.bias + + @add_start_docstrings_to_model_forward(BIG_BIRD_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[MaskedLMOutput, Tuple[torch.FloatTensor]]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Returns: + + Example: + + ```python + >>> import torch + >>> from transformers import AutoTokenizer, BigBirdForMaskedLM + >>> from datasets import load_dataset + + >>> tokenizer = AutoTokenizer.from_pretrained("google/bigbird-roberta-base") + >>> model = BigBirdForMaskedLM.from_pretrained("google/bigbird-roberta-base") + >>> squad_ds = load_dataset("rajpurkar/squad_v2", split="train") # doctest: +IGNORE_RESULT + + >>> # select random long article + >>> LONG_ARTICLE_TARGET = squad_ds[81514]["context"] + >>> # select random sentence + >>> LONG_ARTICLE_TARGET[332:398] + 'the highest values are very close to the theoretical maximum value' + + >>> # add mask_token + >>> LONG_ARTICLE_TO_MASK = LONG_ARTICLE_TARGET.replace("maximum", "[MASK]") + >>> inputs = tokenizer(LONG_ARTICLE_TO_MASK, return_tensors="pt") + >>> # long article input + >>> list(inputs["input_ids"].shape) + [1, 919] + + >>> with torch.no_grad(): + ... logits = model(**inputs).logits + >>> # retrieve index of [MASK] + >>> mask_token_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0] + >>> predicted_token_id = logits[0, mask_token_index].argmax(axis=-1) + >>> tokenizer.decode(predicted_token_id) + 'maximum' + ``` + + ```python + >>> labels = tokenizer(LONG_ARTICLE_TARGET, return_tensors="pt")["input_ids"] + >>> labels = torch.where(inputs.input_ids == tokenizer.mask_token_id, labels, -100) + >>> outputs = model(**inputs, labels=labels) + >>> round(outputs.loss.item(), 2) + 1.99 + ``` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() # -100 index = padding token + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return MaskedLMOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs): + input_shape = input_ids.shape + effective_batch_size = input_shape[0] + + # add a dummy token + if self.config.pad_token_id is None: + raise ValueError("The PAD token should be defined for generation") + attention_mask = torch.cat([attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1) + dummy_token = torch.full( + (effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device + ) + input_ids = torch.cat([input_ids, dummy_token], dim=1) + + return {"input_ids": input_ids, "attention_mask": attention_mask} + + +@add_start_docstrings( + """BigBird Model with a `language modeling` head on top for CLM fine-tuning.""", BIG_BIRD_START_DOCSTRING +) +class BigBirdForCausalLM(BigBirdPreTrainedModel, GenerationMixin): + _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"] + + def __init__(self, config): + super().__init__(config) + + if not config.is_decoder: + logger.warning("If you want to use `BigBirdForCausalLM` as a standalone, add `is_decoder=True.`") + + self.bert = BigBirdModel(config) + self.cls = BigBirdOnlyMLMHead(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + self.cls.predictions.bias = new_embeddings.bias + + @add_start_docstrings_to_model_forward(BIG_BIRD_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=CausalLMOutputWithCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, + ) -> Union[CausalLMOutputWithCrossAttentions, Tuple[torch.FloatTensor]]: + r""" + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in + `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are + ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + **kwargs, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + lm_loss = None + if labels is not None: + lm_loss = self.loss_function( + prediction_scores, + labels, + vocab_size=self.config.vocab_size, + **kwargs, + ) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((lm_loss,) + output) if lm_loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=lm_loss, + logits=prediction_scores, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + def _reorder_cache(self, past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past[:2]) + + layer_past[2:], + ) + return reordered_past + + +class BigBirdClassificationHead(nn.Module): + """Head for sentence-level classification tasks.""" + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.out_proj = nn.Linear(config.hidden_size, config.num_labels) + + self.config = config + + def forward(self, features, **kwargs): + x = features[:, 0, :] # take token (equiv. to [CLS]) + x = self.dropout(x) + x = self.dense(x) + x = ACT2FN[self.config.hidden_act](x) + x = self.dropout(x) + x = self.out_proj(x) + return x + + +@add_start_docstrings( + """ + BigBird Model transformer with a sequence classification/regression head on top (a linear layer on top of the + pooled output) e.g. for GLUE tasks. + """, + BIG_BIRD_START_DOCSTRING, +) +class BigBirdForSequenceClassification(BigBirdPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + self.bert = BigBirdModel(config) + self.classifier = BigBirdClassificationHead(config) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(BIG_BIRD_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[SequenceClassifierOutput, Tuple[torch.FloatTensor]]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + + Returns: + + Example: + + ```python + >>> import torch + >>> from transformers import AutoTokenizer, BigBirdForSequenceClassification + >>> from datasets import load_dataset + + >>> tokenizer = AutoTokenizer.from_pretrained("l-yohai/bigbird-roberta-base-mnli") + >>> model = BigBirdForSequenceClassification.from_pretrained("l-yohai/bigbird-roberta-base-mnli") + >>> squad_ds = load_dataset("rajpurkar/squad_v2", split="train") # doctest: +IGNORE_RESULT + + >>> LONG_ARTICLE = squad_ds[81514]["context"] + >>> inputs = tokenizer(LONG_ARTICLE, return_tensors="pt") + >>> # long input article + >>> list(inputs["input_ids"].shape) + [1, 919] + + >>> with torch.no_grad(): + ... logits = model(**inputs).logits + >>> predicted_class_id = logits.argmax().item() + >>> model.config.id2label[predicted_class_id] + 'LABEL_0' + ``` + + ```python + >>> num_labels = len(model.config.id2label) + >>> model = BigBirdForSequenceClassification.from_pretrained( + ... "l-yohai/bigbird-roberta-base-mnli", num_labels=num_labels + ... ) + >>> labels = torch.tensor(1) + >>> loss = model(**inputs, labels=labels).loss + >>> round(loss.item(), 2) + 1.13 + ``` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + BigBird Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a + softmax) e.g. for RocStories/SWAG tasks. + """, + BIG_BIRD_START_DOCSTRING, +) +class BigBirdForMultipleChoice(BigBirdPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.bert = BigBirdModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, 1) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward( + BIG_BIRD_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length") + ) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=MultipleChoiceModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[MultipleChoiceModelOutput, Tuple[torch.FloatTensor]]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., + num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See + `input_ids` above) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] + + input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None + attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None + token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None + position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None + inputs_embeds = ( + inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) + if inputs_embeds is not None + else None + ) + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + reshaped_logits = logits.view(-1, num_choices) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + + if not return_dict: + output = (reshaped_logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return MultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + BigBird Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for + Named-Entity-Recognition (NER) tasks. + """, + BIG_BIRD_START_DOCSTRING, +) +class BigBirdForTokenClassification(BigBirdPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.bert = BigBirdModel(config) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(BIG_BIRD_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[TokenClassifierOutput, Tuple[torch.FloatTensor]]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class BigBirdForQuestionAnsweringHead(nn.Module): + """Head for question answering tasks.""" + + def __init__(self, config): + super().__init__() + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.intermediate = BigBirdIntermediate(config) + self.output = BigBirdOutput(config) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + def forward(self, encoder_output): + hidden_states = self.dropout(encoder_output) + hidden_states = self.intermediate(hidden_states) + hidden_states = self.output(hidden_states, encoder_output) + hidden_states = self.qa_outputs(hidden_states) + return hidden_states + + +@add_start_docstrings( + """ + BigBird Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layers on top of the hidden-states output to compute `span start logits` and `span end logits`). + """, + BIG_BIRD_START_DOCSTRING, +) +class BigBirdForQuestionAnswering(BigBirdPreTrainedModel): + def __init__(self, config, add_pooling_layer=False): + super().__init__(config) + + config.num_labels = 2 + self.num_labels = config.num_labels + self.sep_token_id = config.sep_token_id + + self.bert = BigBirdModel(config, add_pooling_layer=add_pooling_layer) + self.qa_classifier = BigBirdForQuestionAnsweringHead(config) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(BIG_BIRD_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=BigBirdForQuestionAnsweringModelOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + question_lengths: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + start_positions: Optional[torch.LongTensor] = None, + end_positions: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[BigBirdForQuestionAnsweringModelOutput, Tuple[torch.FloatTensor]]: + r""" + start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + + Returns: + + Example: + + ```python + >>> import torch + >>> from transformers import AutoTokenizer, BigBirdForQuestionAnswering + >>> from datasets import load_dataset + + >>> tokenizer = AutoTokenizer.from_pretrained("google/bigbird-roberta-base") + >>> model = BigBirdForQuestionAnswering.from_pretrained("google/bigbird-roberta-base") + >>> squad_ds = load_dataset("rajpurkar/squad_v2", split="train") # doctest: +IGNORE_RESULT + + >>> # select random article and question + >>> LONG_ARTICLE = squad_ds[81514]["context"] + >>> QUESTION = squad_ds[81514]["question"] + >>> QUESTION + 'During daytime how high can the temperatures reach?' + + >>> inputs = tokenizer(QUESTION, LONG_ARTICLE, return_tensors="pt") + >>> # long article and question input + >>> list(inputs["input_ids"].shape) + [1, 929] + + >>> with torch.no_grad(): + ... outputs = model(**inputs) + + >>> answer_start_index = outputs.start_logits.argmax() + >>> answer_end_index = outputs.end_logits.argmax() + >>> predict_answer_token_ids = inputs.input_ids[0, answer_start_index : answer_end_index + 1] + >>> predict_answer_token = tokenizer.decode(predict_answer_token_ids) + ``` + + ```python + >>> target_start_index, target_end_index = torch.tensor([130]), torch.tensor([132]) + >>> outputs = model(**inputs, start_positions=target_start_index, end_positions=target_end_index) + >>> loss = outputs.loss + ``` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + seqlen = input_ids.size(1) if input_ids is not None else inputs_embeds.size(1) + + if question_lengths is None and input_ids is not None: + # assuming input_ids format: context + question_lengths = torch.argmax(input_ids.eq(self.sep_token_id).int(), dim=-1) + 1 + question_lengths.unsqueeze_(1) + + logits_mask = None + if question_lengths is not None: + # setting lengths logits to `-inf` + logits_mask = self.prepare_question_mask(question_lengths, seqlen) + if token_type_ids is None: + token_type_ids = torch.ones(logits_mask.size(), dtype=int, device=logits_mask.device) - logits_mask + logits_mask = logits_mask + logits_mask[:, 0] = False + logits_mask.unsqueeze_(2) + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + logits = self.qa_classifier(sequence_output) + + if logits_mask is not None: + # removing question tokens from the competition + logits = logits - logits_mask * 1e6 + + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return BigBirdForQuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + pooler_output=outputs.pooler_output, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + @staticmethod + def prepare_question_mask(q_lengths: torch.Tensor, maxlen: int): + # q_lengths -> (bz, 1) + mask = torch.arange(0, maxlen).to(q_lengths.device) + mask.unsqueeze_(0) # -> (1, maxlen) + mask = torch.where(mask < q_lengths, 1, 0) + return mask + + +__all__ = [ + "BigBirdForCausalLM", + "BigBirdForMaskedLM", + "BigBirdForMultipleChoice", + "BigBirdForPreTraining", + "BigBirdForQuestionAnswering", + "BigBirdForSequenceClassification", + "BigBirdForTokenClassification", + "BigBirdLayer", + "BigBirdModel", + "BigBirdPreTrainedModel", + "load_tf_weights_in_big_bird", +] diff --git a/docs/transformers/build/lib/transformers/models/big_bird/modeling_flax_big_bird.py b/docs/transformers/build/lib/transformers/models/big_bird/modeling_flax_big_bird.py new file mode 100644 index 0000000000000000000000000000000000000000..7f43a4c5abb8d3eaa6869d39c318d57209e5448c --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/big_bird/modeling_flax_big_bird.py @@ -0,0 +1,2648 @@ +# coding=utf-8 +# Copyright 2021 The Google Flax Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Callable, Optional, Tuple + +import flax +import flax.linen as nn +import jax +import jax.numpy as jnp +from flax.core.frozen_dict import FrozenDict, freeze, unfreeze +from flax.linen import combine_masks, make_causal_mask +from flax.linen import partitioning as nn_partitioning +from flax.linen.attention import dot_product_attention_weights +from flax.traverse_util import flatten_dict, unflatten_dict +from jax import lax + +from ...modeling_flax_outputs import ( + FlaxBaseModelOutputWithPastAndCrossAttentions, + FlaxBaseModelOutputWithPooling, + FlaxBaseModelOutputWithPoolingAndCrossAttentions, + FlaxCausalLMOutputWithCrossAttentions, + FlaxMaskedLMOutput, + FlaxMultipleChoiceModelOutput, + FlaxSequenceClassifierOutput, + FlaxTokenClassifierOutput, +) +from ...modeling_flax_utils import ( + ACT2FN, + FlaxPreTrainedModel, + append_call_sample_docstring, + append_replace_return_docstrings, + overwrite_call_docstring, +) +from ...utils import ModelOutput, add_start_docstrings, add_start_docstrings_to_model_forward, logging +from .configuration_big_bird import BigBirdConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "google/bigbird-roberta-base" +_CONFIG_FOR_DOC = "BigBirdConfig" + +remat = nn_partitioning.remat + + +@flax.struct.dataclass +class FlaxBigBirdForPreTrainingOutput(ModelOutput): + """ + Output type of [`BigBirdForPreTraining`]. + + Args: + prediction_logits (`jnp.ndarray` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + seq_relationship_logits (`jnp.ndarray` of shape `(batch_size, 2)`): + Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation + before SoftMax). + hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + prediction_logits: jnp.ndarray = None + seq_relationship_logits: jnp.ndarray = None + hidden_states: Optional[Tuple[jnp.ndarray]] = None + attentions: Optional[Tuple[jnp.ndarray]] = None + + +@flax.struct.dataclass +class FlaxBigBirdForQuestionAnsweringModelOutput(ModelOutput): + """ + Base class for outputs of question answering models. + + Args: + start_logits (`jnp.ndarray` of shape `(batch_size, sequence_length)`): + Span-start scores (before SoftMax). + end_logits (`jnp.ndarray` of shape `(batch_size, sequence_length)`): + Span-end scores (before SoftMax). + pooled_output (`jnp.ndarray` of shape `(batch_size, hidden_size)`): + pooled_output returned by FlaxBigBirdModel. + hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape + `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the initial embedding outputs. + attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + start_logits: jnp.ndarray = None + end_logits: jnp.ndarray = None + pooled_output: jnp.ndarray = None + hidden_states: Optional[Tuple[jnp.ndarray]] = None + attentions: Optional[Tuple[jnp.ndarray]] = None + + +BIG_BIRD_START_DOCSTRING = r""" + + This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading, saving and converting weights from PyTorch models) + + This model is also a + [flax.linen.Module](https://flax.readthedocs.io/en/latest/api_reference/flax.linen/module.html) subclass. Use it as + a regular Flax linen Module and refer to the Flax documentation for all matter related to general usage and + behavior. + + Finally, this model supports inherent JAX features such as: + + - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit) + - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation) + - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap) + - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap) + + Parameters: + config ([`BigBirdConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights. + dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`): + The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and + `jax.numpy.bfloat16` (on TPUs). + + This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If + specified all the computation will be performed with the given `dtype`. + + **Note that this only specifies the dtype of the computation and does not influence the dtype of model + parameters.** + + If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and + [`~FlaxPreTrainedModel.to_bf16`]. +""" + +BIG_BIRD_INPUTS_DOCSTRING = r""" + Args: + input_ids (`numpy.ndarray` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`numpy.ndarray` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + token_type_ids (`numpy.ndarray` of shape `({0})`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + position_ids (`numpy.ndarray` of shape `({0})`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + head_mask (`numpy.ndarray` of shape `({0})`, `optional): + Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + +""" + + +class FlaxBigBirdEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings.""" + + config: BigBirdConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + # Copied from transformers.models.bert.modeling_flax_bert.FlaxBertEmbeddings.setup + def setup(self): + self.word_embeddings = nn.Embed( + self.config.vocab_size, + self.config.hidden_size, + embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range), + dtype=self.dtype, + ) + self.position_embeddings = nn.Embed( + self.config.max_position_embeddings, + self.config.hidden_size, + embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range), + dtype=self.dtype, + ) + self.token_type_embeddings = nn.Embed( + self.config.type_vocab_size, + self.config.hidden_size, + embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range), + dtype=self.dtype, + ) + self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype) + self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob) + + def __call__(self, input_ids, token_type_ids, position_ids, attention_mask, deterministic: bool = True): + # Embed + inputs_embeds = self.word_embeddings(input_ids.astype("i4")) + position_embeds = self.position_embeddings(position_ids.astype("i4")) + token_type_embeddings = self.token_type_embeddings(token_type_ids.astype("i4")) + + if self.config.rescale_embeddings: + inputs_embeds *= self.config.hidden_size**0.5 + + # Sum all embeddings + hidden_states = inputs_embeds + token_type_embeddings + position_embeds + + # Layer Norm + hidden_states = self.dropout(hidden_states, deterministic=deterministic) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertSelfAttention with Bert->BigBird +class FlaxBigBirdSelfAttention(nn.Module): + config: BigBirdConfig + causal: bool = False + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.head_dim = self.config.hidden_size // self.config.num_attention_heads + if self.config.hidden_size % self.config.num_attention_heads != 0: + raise ValueError( + "`config.hidden_size`: {self.config.hidden_size} has to be a multiple of `config.num_attention_heads` " + " : {self.config.num_attention_heads}" + ) + + self.query = nn.Dense( + self.config.hidden_size, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + ) + self.key = nn.Dense( + self.config.hidden_size, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + ) + self.value = nn.Dense( + self.config.hidden_size, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + ) + + if self.causal: + self.causal_mask = make_causal_mask( + jnp.ones((1, self.config.max_position_embeddings), dtype="bool"), dtype="bool" + ) + + def _split_heads(self, hidden_states): + return hidden_states.reshape(hidden_states.shape[:2] + (self.config.num_attention_heads, self.head_dim)) + + def _merge_heads(self, hidden_states): + return hidden_states.reshape(hidden_states.shape[:2] + (self.config.hidden_size,)) + + @nn.compact + # Copied from transformers.models.bart.modeling_flax_bart.FlaxBartAttention._concatenate_to_cache + def _concatenate_to_cache(self, key, value, query, attention_mask): + """ + This function takes projected key, value states from a single input token and concatenates the states to cached + states from previous steps. This function is slightly adapted from the official Flax repository: + https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252 + """ + # detect if we're initializing by absence of existing cache data. + is_initialized = self.has_variable("cache", "cached_key") + cached_key = self.variable("cache", "cached_key", jnp.zeros, key.shape, key.dtype) + cached_value = self.variable("cache", "cached_value", jnp.zeros, value.shape, value.dtype) + cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32)) + + if is_initialized: + *batch_dims, max_length, num_heads, depth_per_head = cached_key.value.shape + # update key, value caches with our new 1d spatial slices + cur_index = cache_index.value + indices = (0,) * len(batch_dims) + (cur_index, 0, 0) + key = lax.dynamic_update_slice(cached_key.value, key, indices) + value = lax.dynamic_update_slice(cached_value.value, value, indices) + cached_key.value = key + cached_value.value = value + num_updated_cache_vectors = query.shape[1] + cache_index.value = cache_index.value + num_updated_cache_vectors + # causal mask for cached decoder self-attention: our single query position should only attend to those key positions that have already been generated and cached, not the remaining zero elements. + pad_mask = jnp.broadcast_to( + jnp.arange(max_length) < cur_index + num_updated_cache_vectors, + tuple(batch_dims) + (1, num_updated_cache_vectors, max_length), + ) + attention_mask = combine_masks(pad_mask, attention_mask) + return key, value, attention_mask + + def __call__( + self, + hidden_states, + attention_mask, + layer_head_mask, + key_value_states: Optional[jnp.ndarray] = None, + init_cache: bool = False, + deterministic=True, + output_attentions: bool = False, + ): + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + batch_size = hidden_states.shape[0] + + # get query proj + query_states = self.query(hidden_states) + # get key, value proj + if is_cross_attention: + # cross_attentions + key_states = self.key(key_value_states) + value_states = self.value(key_value_states) + else: + # self_attention + key_states = self.key(hidden_states) + value_states = self.value(hidden_states) + + query_states = self._split_heads(query_states) + key_states = self._split_heads(key_states) + value_states = self._split_heads(value_states) + + # handle cache prepare causal attention mask + if self.causal: + query_length, key_length = query_states.shape[1], key_states.shape[1] + if self.has_variable("cache", "cached_key"): + mask_shift = self.variables["cache"]["cache_index"] + max_decoder_length = self.variables["cache"]["cached_key"].shape[1] + causal_mask = lax.dynamic_slice( + self.causal_mask, (0, 0, mask_shift, 0), (1, 1, query_length, max_decoder_length) + ) + else: + causal_mask = self.causal_mask[:, :, :query_length, :key_length] + causal_mask = jnp.broadcast_to(causal_mask, (batch_size,) + causal_mask.shape[1:]) + + # combine masks if needed + if attention_mask is not None and self.causal: + attention_mask = jnp.broadcast_to(jnp.expand_dims(attention_mask, axis=(-3, -2)), causal_mask.shape) + attention_mask = combine_masks(attention_mask, causal_mask) + elif self.causal: + attention_mask = causal_mask + elif attention_mask is not None: + attention_mask = jnp.expand_dims(attention_mask, axis=(-3, -2)) + + # During fast autoregressive decoding, we feed one position at a time, + # and cache the keys and values step by step. + if self.causal and (self.has_variable("cache", "cached_key") or init_cache): + key_states, value_states, attention_mask = self._concatenate_to_cache( + key_states, value_states, query_states, attention_mask + ) + + # Convert the boolean attention mask to an attention bias. + if attention_mask is not None: + # attention mask in the form of attention bias + attention_bias = lax.select( + attention_mask > 0, + jnp.full(attention_mask.shape, 0.0).astype(self.dtype), + jnp.full(attention_mask.shape, jnp.finfo(self.dtype).min).astype(self.dtype), + ) + else: + attention_bias = None + + dropout_rng = None + if not deterministic and self.config.attention_probs_dropout_prob > 0.0: + dropout_rng = self.make_rng("dropout") + + attn_weights = dot_product_attention_weights( + query_states, + key_states, + bias=attention_bias, + dropout_rng=dropout_rng, + dropout_rate=self.config.attention_probs_dropout_prob, + broadcast_dropout=True, + deterministic=deterministic, + dtype=self.dtype, + precision=None, + ) + + # Mask heads if we want to + if layer_head_mask is not None: + attn_weights = jnp.einsum("...hqk,h->...hqk", attn_weights, layer_head_mask) + + attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, value_states) + attn_output = attn_output.reshape(attn_output.shape[:2] + (-1,)) + + outputs = (attn_output, attn_weights) if output_attentions else (attn_output,) + return outputs + + +class FlaxBigBirdBlockSparseAttention(nn.Module): + config: BigBirdConfig + block_sparse_seed: Optional[int] = None + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.query = nn.Dense( + self.config.hidden_size, + dtype=self.dtype, + use_bias=self.config.use_bias, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + ) + self.key = nn.Dense( + self.config.hidden_size, + dtype=self.dtype, + use_bias=self.config.use_bias, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + ) + self.value = nn.Dense( + self.config.hidden_size, + dtype=self.dtype, + use_bias=self.config.use_bias, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + ) + + @staticmethod + def transpose_for_scores(x, n_heads, head_size): + new_x_shape = x.shape[:-1] + (n_heads, head_size) + x = x.reshape(*new_x_shape) + return jnp.transpose(x, axes=(0, 2, 1, 3)) + + def __call__( + self, + hidden_states, + attention_mask, + deterministic=True, + output_attentions=False, + ): + n_heads = self.config.num_attention_heads + head_size = self.config.hidden_size // n_heads + + blocked_encoder_mask, band_mask, from_mask, to_mask = self.create_masks_for_block_sparse_attn( + attention_mask, self.config.block_size + ) + + query_layer = self.transpose_for_scores(self.query(hidden_states), n_heads, head_size) + key_layer = self.transpose_for_scores(self.key(hidden_states), n_heads, head_size) + value_layer = self.transpose_for_scores(self.value(hidden_states), n_heads, head_size) + + indices_prng_key = None + if not deterministic: + indices_prng_key = self.make_rng("indices") + + attn_output, attn_weights = self.bigbird_block_sparse_attention( + query_layer, + key_layer, + value_layer, + band_mask, + from_mask, + to_mask, + blocked_encoder_mask, + blocked_encoder_mask, + n_heads, + head_size, + indices_prng_key=indices_prng_key, + deterministic=deterministic, + plan_from_length=None, + plan_num_rand_blocks=None, + output_attentions=output_attentions, + ) + + outputs = (attn_output, attn_weights) if output_attentions else (attn_output,) + return outputs + + @staticmethod + def create_masks_for_block_sparse_attn(attention_mask, block_size: int): + batch_size, seq_length = attention_mask.shape + if seq_length % block_size != 0: + raise ValueError( + f"Sequence length must be multiple of block size, but sequence length is {seq_length}, while block" + f" size is {block_size}." + ) + + def create_band_mask_from_inputs(from_blocked_mask, to_blocked_mask): + """ + Create 3D attention mask from a 2D tensor mask. + + Args: + from_blocked_mask: 2D Tensor of shape [batch_size, + from_seq_length//from_block_size, from_block_size]. + to_blocked_mask: int32 Tensor of shape [batch_size, + to_seq_length//to_block_size, to_block_size]. + + Returns: + float Tensor of shape [batch_size, 1, from_seq_length//from_block_size-4, from_block_size, + 3*to_block_size]. + """ + exp_blocked_to_pad = jnp.concatenate( + [to_blocked_mask[:, 1:-3], to_blocked_mask[:, 2:-2], to_blocked_mask[:, 3:-1]], axis=2 + ) + band_mask = jnp.einsum("blq,blk->blqk", from_blocked_mask[:, 2:-2], exp_blocked_to_pad) + band_mask = jnp.expand_dims(band_mask, 1) + return band_mask + + blocked_encoder_mask = attention_mask.reshape(batch_size, seq_length // block_size, block_size) + band_mask = create_band_mask_from_inputs(blocked_encoder_mask, blocked_encoder_mask) + + from_mask = attention_mask.reshape(batch_size, 1, seq_length, 1) + to_mask = attention_mask.reshape(batch_size, 1, 1, seq_length) + + return blocked_encoder_mask, band_mask, from_mask, to_mask + + def bigbird_block_sparse_attention( + self, + query_layer, + key_layer, + value_layer, + band_mask, + from_mask, + to_mask, + from_blocked_mask, + to_blocked_mask, + n_heads, + head_size, + indices_prng_key: Optional[jax.random.PRNGKey] = None, + deterministic: Optional[bool] = True, + plan_from_length=None, + plan_num_rand_blocks=None, + output_attentions=None, + ): + # BigBird block-sparse attention as suggested in paper + + # ITC: + # global tokens: 2 x block_size + # window tokens: 3 x block_size + # random tokens: num_rand_tokens x block_size + + # ETC: + # global tokens: extra_globals_tokens + 2 x block_size + # window tokens: 3 x block_size + # random tokens: num_rand_tokens x block_size + + # Note: + # 1) Currently, ETC is not supported. + # 2) Window size is fixed to 3 blocks & it can be changed only by + # changing `block_size`. + # 3) Number of global blocks are fixed (2 blocks here) & global tokens can be + # controlled only by `block_size`. + + # attention is calculated separately for q[0], q[1], q[2:-2], q[-2], q[-1] in order to use special trick of + # shifting tokens (for calculating sliding attention). hence following code can be divided into 5 parts. + + bsz, _, from_seq_len, _ = query_layer.shape + to_seq_len = key_layer.shape[2] + from_block_size = to_block_size = self.config.block_size + + if from_seq_len % from_block_size != 0: + raise ValueError("Query sided sequence length must be multiple of block size") + + if to_seq_len % to_block_size != 0: + raise ValueError("Key/Value sided sequence length must be multiple of block size") + + if from_seq_len // from_block_size != to_seq_len // to_block_size: + raise ValueError("Error the number of blocks needs to be same!") + + n_rand_blocks = self.config.num_random_blocks + rsqrt_d = 1 / jnp.sqrt(head_size) + attn_mask_penalty = -10000.0 + + if from_seq_len in [1024, 3072, 4096]: # old plans used in paper + max_seqlen = self.config.max_position_embeddings + rand_attn = [ + self._bigbird_block_rand_mask( + max_seqlen, + max_seqlen, + from_block_size, + to_block_size, + n_rand_blocks, + indices_prng_key=indices_prng_key, + deterministic=deterministic, + last_idx=1024, + )[: (from_seq_len // from_block_size - 2)] + for _ in range(n_heads) + ] + else: + if plan_from_length is None: + plan_from_length, plan_num_rand_blocks = self._get_rand_attn_plan( + from_seq_len, from_block_size, n_rand_blocks + ) + rand_attn = self._bigbird_block_rand_mask_with_head( + from_seq_length=from_seq_len, + to_seq_length=to_seq_len, + from_block_size=from_block_size, + to_block_size=to_block_size, + num_heads=n_heads, + plan_from_length=plan_from_length, + plan_num_rand_blocks=plan_num_rand_blocks, + indices_prng_key=indices_prng_key, + ) + + rand_attn = jnp.stack(rand_attn, axis=0) + rand_attn = jnp.broadcast_to(rand_attn, (bsz,) + rand_attn.shape) + + rand_mask = self._create_rand_mask_from_inputs( + from_blocked_mask, to_blocked_mask, rand_attn, n_heads, n_rand_blocks, bsz, from_seq_len, from_block_size + ) + + blocked_query_matrix = query_layer.reshape(bsz, n_heads, from_seq_len // from_block_size, from_block_size, -1) + blocked_key_matrix = key_layer.reshape(bsz, n_heads, to_seq_len // to_block_size, to_block_size, -1) + blocked_value_matrix = value_layer.reshape(bsz, n_heads, to_seq_len // to_block_size, to_block_size, -1) + + shape = (bsz, n_heads, to_seq_len // to_block_size - 2, n_rand_blocks * to_block_size, -1) + gathered_key = self.jax_gather(blocked_key_matrix, rand_attn, batch_dims=2).reshape(*shape) + gathered_value = self.jax_gather(blocked_value_matrix, rand_attn, batch_dims=2).reshape(*shape) + + # 1st PART + # 1st block (global block) attention scores + # q[0] x (k[0], k[1], k[2], k[3], k[4] .... ) + + # [bsz, n_heads, from_block_size, -1] x [bsz, n_heads, to_seq_len, -1] ==> [bsz, n_heads, from_block_size, to_seq_len] + first_product = jnp.einsum("bhqd,bhkd->bhqk", blocked_query_matrix[:, :, 0], key_layer) + + first_product = first_product * rsqrt_d + first_product += (1.0 - to_mask) * attn_mask_penalty + first_attn_weights = jax.nn.softmax(first_product, axis=-1) # [bsz, n_heads, from_block_size, to_seq_len] + + # [bsz, n_heads, from_block_size, to_seq_len] x [bsz, n_heads, to_seq_len, -1] ==> [bsz, n_heads, from_block_size, -1] + first_context_layer = jnp.einsum("bhqk,bhkd->bhqd", first_attn_weights, value_layer) + first_context_layer = jnp.expand_dims(first_context_layer, 2) + + # 2nd PART + # 2nd block attention scores + # q[1] x (sliding_keys, random_keys, global_keys) + # sliding key blocks -> 2nd, 3rd blocks + # global key blocks -> 1st block + + second_key_mat = jnp.concatenate( + [ + blocked_key_matrix[:, :, 0], + blocked_key_matrix[:, :, 1], + blocked_key_matrix[:, :, 2], + blocked_key_matrix[:, :, -1], + gathered_key[:, :, 0], + ], + axis=2, + ) # [bsz, n_heads, (4+n_rand_blocks)*to_block_size, -1] + second_value_mat = jnp.concatenate( + [ + blocked_value_matrix[:, :, 0], + blocked_value_matrix[:, :, 1], + blocked_value_matrix[:, :, 2], + blocked_value_matrix[:, :, -1], + gathered_value[:, :, 0], + ], + axis=2, + ) # [bsz, n_heads, (4+n_rand_blocks)*to_block_size, -1] + + # [bsz, n_heads, from_block_size, -1] x [bsz, n_heads, (4+n_rand_blocks)*to_block_size, -1] + # ==> [bsz, n_heads, from_block_size, (4+n_rand_blocks)*to_block_size] + second_product = jnp.einsum("bhqd,bhkd->bhqk", blocked_query_matrix[:, :, 1], second_key_mat) + second_seq_pad = jnp.concatenate( + [ + to_mask[:, :, :, : 3 * to_block_size], + to_mask[:, :, :, -to_block_size:], + jnp.ones([bsz, 1, 1, n_rand_blocks * to_block_size], dtype=to_mask.dtype), + ], + axis=3, + ) + second_rand_pad = jnp.concatenate( + [ + jnp.ones([bsz, n_heads, from_block_size, 4 * to_block_size], dtype=rand_mask.dtype), + rand_mask[:, :, 0], + ], + axis=3, + ) + second_product = second_product * rsqrt_d + second_product += (1.0 - jnp.minimum(second_seq_pad, second_rand_pad)) * attn_mask_penalty + second_attn_weights = jax.nn.softmax( + second_product, axis=-1 + ) # [bsz, n_heads, from_block_size, (4+n_rand_blocks)*to_block_size] + + # [bsz, n_heads, from_block_size, (4+r)*to_block_size] x [bsz, n_heads, (4+r)*to_block_size, -1] + # ==> [bsz, n_heads, from_block_size, -1] + second_context_layer = jnp.einsum("bhqk,bhkd->bhqd", second_attn_weights, second_value_mat) + second_context_layer = jnp.expand_dims(second_context_layer, 2) + + # 3rd PART + # Middle blocks attention scores + # q[-2:2] x (sliding_keys, random_keys, global_keys) + # sliding attn is calculated using special trick of shifting tokens as discussed in paper + # random keys are generated by taking random indices as per `rand_attn` + # global keys -> 1st & last block + + exp_blocked_key_matrix = jnp.concatenate( + [blocked_key_matrix[:, :, 1:-3], blocked_key_matrix[:, :, 2:-2], blocked_key_matrix[:, :, 3:-1]], axis=3 + ) # [bsz, n_heads, from_seq_len//from_block_size-4, 3*to_block_size, -1] + exp_blocked_value_matrix = jnp.concatenate( + [blocked_value_matrix[:, :, 1:-3], blocked_value_matrix[:, :, 2:-2], blocked_value_matrix[:, :, 3:-1]], + axis=3, + ) # [bsz, n_heads, from_seq_len//from_block_size-4, 3*to_block_size, -1] + middle_query_matrix = blocked_query_matrix[:, :, 2:-2] + + # sliding attention scores for q[-2:2] + # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] x [b, n_heads, from_seq_len//from_block_size-4, 3*to_block_size, -1] + inner_band_product = jnp.einsum("bhlqd,bhlkd->bhlqk", middle_query_matrix, exp_blocked_key_matrix) + # ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, 3*to_block_size] + inner_band_product = inner_band_product * rsqrt_d + + # randn attention scores for q[-2:2] + # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] + # x [bsz, n_heads, from_seq_len//from_block_size-4, n_rand_blocks*to_block_size, -1] + rand_band_product = jnp.einsum("bhlqd,bhlkd->bhlqk", middle_query_matrix, gathered_key[:, :, 1:-1]) + # ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, n_rand_blocks*to_block_size] + rand_band_product = rand_band_product * rsqrt_d + + # Including 1st block (since it's global) + # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] x [bsz, n_heads, to_block_size, -1] + # ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, to_block_size] + first_band_product = jnp.einsum("bhlqd,bhkd->bhlqk", middle_query_matrix, blocked_key_matrix[:, :, 0]) + first_band_product = first_band_product * rsqrt_d + + # Including last block (since it's global) + # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] x [bsz, n_heads, to_block_size, -1] + # ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, to_block_size] + last_band_product = jnp.einsum("bhlqd,bhkd->bhlqk", middle_query_matrix, blocked_key_matrix[:, :, -1]) + last_band_product = last_band_product * rsqrt_d + + # masking padded tokens + inner_band_product += (1.0 - band_mask) * attn_mask_penalty + first_band_product += (1.0 - jnp.expand_dims(to_mask[:, :, :, :to_block_size], 3)) * attn_mask_penalty + last_band_product += (1.0 - jnp.expand_dims(to_mask[:, :, :, -to_block_size:], 3)) * attn_mask_penalty + rand_band_product += (1.0 - rand_mask[:, :, 1:-1]) * attn_mask_penalty + + # completing attention scores matrix for all q[-2:2] + band_product = jnp.concatenate( + [first_band_product, inner_band_product, rand_band_product, last_band_product], axis=-1 + ) # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, (5+n_rand_blocks)*to_block_size] + + # safely doing softmax since attention matrix is completed + attn_weights = jax.nn.softmax( + band_product, axis=-1 + ) # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, (5+n_rand_blocks)*to_block_size] + + # contribution of sliding keys + # [bsz, n_heads, m//from_block_size-4, from_block_size, 3*to_block_size] + # x [bsz, n_heads, from_seq_len//from_block_size-4, 3*to_block_size, -1] + context_layer = jnp.einsum( + "bhlqk,bhlkd->bhlqd", attn_weights[:, :, :, :, to_block_size : 4 * to_block_size], exp_blocked_value_matrix + ) + # ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] + + # adding contribution of random keys + # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, n_rand_blocks*to_block_size] + # x [bsz, n_heads, from_seq_len//from_block_size-4, n_rand_blocks*to_block_size, -1] + context_layer += jnp.einsum( + "bhlqk,bhlkd->bhlqd", + attn_weights[:, :, :, :, 4 * to_block_size : -to_block_size], + gathered_value[:, :, 1:-1], + ) + # ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] + + # adding contribution of global keys + # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, to_block_size] x [bsz, n_heads, to_block_size, -1] + # ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] + context_layer += jnp.einsum( + "bhlqk,bhkd->bhlqd", attn_weights[:, :, :, :, :to_block_size], blocked_value_matrix[:, :, 0] + ) + # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, to_block_size] x [bsz, n_heads, to_block_size, -1] + # ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] + context_layer += jnp.einsum( + "bhlqk,bhkd->bhlqd", attn_weights[:, :, :, :, -to_block_size:], blocked_value_matrix[:, :, -1] + ) + + # 4th PART + # last 2nd token attention scores + # q[-2] x (sliding_keys, random_keys, global_keys) + # sliding key blocks -> last 3 blocks + # global key block -> 1st block + # random key block -> based on indices stored in `randn_attn` + + second_last_key_mat = jnp.concatenate( + [ + blocked_key_matrix[:, :, 0], + blocked_key_matrix[:, :, -3], + blocked_key_matrix[:, :, -2], + blocked_key_matrix[:, :, -1], + gathered_key[:, :, -1], + ], + axis=2, + ) # [bsz, n_heads, (4+n_random_blocks)*to_block_size, -1] + second_last_value_mat = jnp.concatenate( + [ + blocked_value_matrix[:, :, 0], + blocked_value_matrix[:, :, -3], + blocked_value_matrix[:, :, -2], + blocked_value_matrix[:, :, -1], + gathered_value[:, :, -1], + ], + axis=2, + ) # [bsz, n_heads, (4+r)*to_block_size, -1] + + # [bsz, n_heads, from_block_size, -1] x [bsz, n_heads, (4+n_rand_blocks)*to_block_size, -1] + # ==> [bsz, n_heads, from_block_size, (4+n_rand_blocks)*to_block_size] + second_last_product = jnp.einsum("bhqd,bhkd->bhqk", blocked_query_matrix[:, :, -2], second_last_key_mat) + second_last_seq_pad = jnp.concatenate( + [ + to_mask[:, :, :, :to_block_size], + to_mask[:, :, :, -3 * to_block_size :], + jnp.ones([bsz, 1, 1, n_rand_blocks * to_block_size], dtype=to_mask.dtype), + ], + axis=3, + ) + second_last_rand_pad = jnp.concatenate( + [ + jnp.ones([bsz, n_heads, from_block_size, 4 * to_block_size], dtype=rand_mask.dtype), + rand_mask[:, :, -1], + ], + axis=3, + ) + second_last_product = second_last_product * rsqrt_d + second_last_product += (1.0 - jnp.minimum(second_last_seq_pad, second_last_rand_pad)) * attn_mask_penalty + second_last_attn_weights = jax.nn.softmax( + second_last_product, axis=-1 + ) # [bsz, n_heads, from_block_size, (4+n_rand_blocks)*to_block_size] + + # [bsz, n_heads, from_block_size, (4+n_rand_blocks)*to_block_size] x [bsz, n_heads, (4+n_rand_blocks)*to_block_size, -1] + # ==> [bsz, n_heads, from_block_size, -1] + second_last_context_layer = jnp.einsum("bhqk,bhkd->bhqd", second_last_attn_weights, second_last_value_mat) + second_last_context_layer = jnp.expand_dims(second_last_context_layer, 2) + + # 5th PART + # last block (global) attention scores + # q[-1] x (k[0], k[1], k[2], k[3], .... ) + + # [bsz, n_heads, from_block_size, -1] x [bsz, n_heads, to_seq_len, -1] ==> [bsz, n_heads, from_block_size, to_seq_len] + last_product = jnp.einsum("bhqd,bhkd->bhqk", blocked_query_matrix[:, :, -1], key_layer) + last_product = last_product * rsqrt_d + last_product += (1.0 - to_mask) * attn_mask_penalty + last_attn_weights = jax.nn.softmax(last_product, axis=-1) # [bsz, n_heads, from_block_size, n] + + # [bsz, n_heads, from_block_size, to_seq_len] x [bsz, n_heads, to_seq_len, -1] ==> [bsz, n_heads, from_block_size, -1] + last_context_layer = jnp.einsum("bhqk,bhkd->bhqd", last_attn_weights, value_layer) + last_context_layer = jnp.expand_dims(last_context_layer, 2) + + # combining representations of all tokens + context_layer = jnp.concatenate( + [first_context_layer, second_context_layer, context_layer, second_last_context_layer, last_context_layer], + axis=2, + ) + context_layer = context_layer.reshape(bsz, n_heads, from_seq_len, -1) * from_mask + context_layer = jnp.transpose(context_layer, axes=(0, 2, 1, 3)).reshape(bsz, from_seq_len, -1) + + attention_probs = None + + return context_layer, attention_probs + + @staticmethod + def jax_gather(params, indices, batch_dims=2): + """ + Gather the indices from params correctly (equivalent to tf.gather but with modifications) + + Args: + params: (bsz, n_heads, num_blocks, block_size, head_dim) + indices: (bhlqk", from_blocked_mask[:, 1:-1], rand_mask) + return rand_mask + + @staticmethod + def _get_rand_attn_plan(from_seq_length, from_block_size, num_rand_blocks): + """ + Gives the plan of where to put random attention. + + Args: + from_seq_length: int. length of from sequence. + from_block_size: int. size of block in from sequence. + num_rand_blocks: int. Number of random chunks per row. + + Returns: + plan_from_length: ending location of from block plan_num_rand_blocks: number of random ending location for + each block + """ + + plan_from_length = [] + plan_num_rand_blocks = [] + if (2 * num_rand_blocks + 5) < (from_seq_length // from_block_size): + plan_from_length.append(int((2 * num_rand_blocks + 5) * from_block_size)) + plan_num_rand_blocks.append(num_rand_blocks) + plan_from_length.append(from_seq_length) + plan_num_rand_blocks.append(0) + elif (num_rand_blocks + 5) < (from_seq_length // from_block_size): + plan_from_length.append(int((num_rand_blocks + 5) * from_block_size)) + plan_num_rand_blocks.append(num_rand_blocks // 2) + plan_from_length.append(from_seq_length) + plan_num_rand_blocks.append(num_rand_blocks - (num_rand_blocks // 2)) + else: + plan_from_length.append(from_seq_length) + plan_num_rand_blocks.append(num_rand_blocks) + + return plan_from_length, plan_num_rand_blocks + + @staticmethod + def _bigbird_block_rand_mask( + from_seq_length, + to_seq_length, + from_block_size, + to_block_size, + num_rand_blocks, + indices_prng_key: Optional[jax.random.PRNGKey] = None, + deterministic: Optional[bool] = True, + last_idx: Optional[int] = -1, + ): + """ + Create adjacency list of random attention. + + Args: + from_seq_length: int. length of from sequence. + to_seq_length: int. length of to sequence. + from_block_size: int. size of block in from sequence. + to_block_size: int. size of block in to sequence. + num_rand_blocks: int. Number of random chunks per row. + indices_prng_key: jax.random.PRNGKey. PRNG key that is used to perform random jax operations. + deterministic: bool. When False random attention will be used. + last_idx: if -1 then num_rand_blocks blocks chosen anywhere in to sequence, + if positive then num_rand_blocks blocks chosen only up to last_idx. + + Returns: + adjacency list of size from_seq_length//from_block_size-2 by num_rand_blocks + """ + # using this method when from_seq_length in [1024, 3072, 4096] + + if from_seq_length // from_block_size != to_seq_length // to_block_size: + raise ValueError("Error the number of blocks needs to be same!") + rand_attn = jnp.zeros((from_seq_length // from_block_size - 2, num_rand_blocks), dtype=jnp.int32) + # deterministic nor randomness + if deterministic: + return rand_attn + + middle_seq = jnp.arange(1, to_seq_length // to_block_size - 1, dtype=jnp.int32) + last = to_seq_length // to_block_size - 1 + if last_idx > (2 * to_block_size): + last = (last_idx // to_block_size) - 1 + + r = num_rand_blocks # shorthand + for i in range(1, from_seq_length // from_block_size - 1): + start = i - 2 + end = i + if i == 1: + seq_values = jax.random.permutation(indices_prng_key, middle_seq[2:last])[:r] + rand_attn = rand_attn.at[i - 1].set(seq_values) + elif i == 2: + seq_values = jax.random.permutation(indices_prng_key, middle_seq[3:last])[:r] + rand_attn = rand_attn.at[i - 1].set(seq_values) + elif i == from_seq_length // from_block_size - 3: + seq_values = jax.random.permutation(indices_prng_key, middle_seq[:last])[:r] + rand_attn = rand_attn.at[i - 1].set(seq_values) + # Missing -3: should have been sliced till last-3 + elif i == from_seq_length // from_block_size - 2: + seq_values = jax.random.permutation(indices_prng_key, middle_seq[:last])[:r] + rand_attn = rand_attn.at[i - 1].set(seq_values) + # Missing -4: should have been sliced till last-4 + else: + if start > last: + start = last + seq_values = jax.random.permutation(indices_prng_key, middle_seq[:start])[:r] + rand_attn = rand_attn.at[i - 1].set(seq_values) + elif (end + 1) == last: + seq_values = jax.random.permutation(indices_prng_key, middle_seq[:start])[:r] + rand_attn = rand_attn.at[i - 1].set(seq_values) + else: + concat_values = jnp.concatenate((middle_seq[:start], middle_seq[end + 1 : last])) + seq_values = jax.random.permutation(indices_prng_key, concat_values)[:r] + rand_attn = rand_attn.at[i - 1].set(seq_values) + return rand_attn + + def _bigbird_block_rand_mask_with_head( + self, + from_seq_length, + to_seq_length, + from_block_size, + to_block_size, + num_heads, + plan_from_length, + plan_num_rand_blocks, + indices_prng_key: Optional[jax.random.PRNGKey] = None, + deterministic: Optional[bool] = True, + window_block_left=1, + window_block_right=1, + global_block_top=1, + global_block_bottom=1, + global_block_left=1, + global_block_right=1, + ): + """ + Create adjacency list of random attention. + + Args: + from_seq_length: int. length of from sequence. + to_seq_length: int. length of to sequence. + from_block_size: int. size of block in from sequence. + to_block_size: int. size of block in to sequence. + num_heads: int. total number of heads. + plan_from_length: list. plan from length where num_random_blocks are choosen from. + plan_num_rand_blocks: list. number of rand blocks within the plan. + indices_prng_key: jax.random.PRNGKey. PRNG key that is used to perform random jax operations. + deterministic: bool. When False random attention will be used. + window_block_left: int. number of blocks of window to left of a block. + window_block_right: int. number of blocks of window to right of a block. + global_block_top: int. number of blocks at the top. + global_block_bottom: int. number of blocks at the bottom. + global_block_left: int. Number of blocks globally used to the left. + global_block_right: int. Number of blocks globally used to the right. + + Returns: + adjacency list of size num_head where each element is of size from_seq_length//from_block_size-2 by + num_rand_blocks + """ + # using this method when from_seq_length not in [1024, 3072, 4096] + + if from_seq_length // from_block_size != to_seq_length // to_block_size: + raise ValueError("Error the number of blocks needs to be same!") + + if from_seq_length not in plan_from_length: + raise ValueError("Error from sequence length not in plan!") + + # Total number of blocks in the mmask + num_blocks = from_seq_length // from_block_size + # Number of blocks per plan + plan_block_length = jnp.array(plan_from_length) // from_block_size + # till when to follow plan + max_plan_idx = plan_from_length.index(from_seq_length) + + # Random Attention adjacency list + rand_attn = [ + jnp.zeros((num_blocks, sum(plan_num_rand_blocks[: max_plan_idx + 1])), dtype=jnp.int32) + for i in range(num_heads) + ] + + # deterministic + if deterministic: + for nh in range(num_heads): + rand_attn[nh] = rand_attn[nh][global_block_top : num_blocks - global_block_bottom, :] + return rand_attn + + # We will go iteratively over the plan blocks and pick random number of + # Attention blocks from the legally allowed blocks + for plan_idx in range(max_plan_idx + 1): + rnd_r_cnt = 0 + if plan_idx > 0: + # set the row for all from_blocks starting from 0 to + # plan_block_length[plan_idx-1] + # column indx start fromm plan_block_length[plan_idx-1] and ends at + # plan_block_length[plan_idx] + if plan_num_rand_blocks[plan_idx] > 0: + rnd_r_cnt = int(sum(plan_num_rand_blocks[:plan_idx])) + curr_r_cnt = int(sum(plan_num_rand_blocks[: plan_idx + 1])) + for blk_rw_idx in range(global_block_top, plan_block_length[plan_idx - 1]): + for h in range(num_heads): + single_block_row_attention = self._get_single_block_row_attention( + block_id=blk_rw_idx, + to_start_block_id=plan_block_length[plan_idx - 1], + to_end_block_id=plan_block_length[plan_idx], + num_rand_blocks=plan_num_rand_blocks[plan_idx], + window_block_left=window_block_left, + window_block_right=window_block_right, + global_block_left=global_block_left, + global_block_right=global_block_right, + indices_prng_key=indices_prng_key, + ) + rand_attn[h] = ( + rand_attn[h].at[blk_rw_idx, rnd_r_cnt:curr_r_cnt].set(single_block_row_attention) + ) + + for pl_id in range(plan_idx): + if plan_num_rand_blocks[pl_id] == 0: + continue + for blk_rw_idx in range(plan_block_length[plan_idx - 1], plan_block_length[plan_idx]): + rnd_r_cnt = 0 + to_start_block_id = 0 + if pl_id > 0: + rnd_r_cnt = int(sum(plan_num_rand_blocks[:pl_id])) + to_start_block_id = plan_block_length[pl_id - 1] + curr_r_cnt = int(sum(plan_num_rand_blocks[: pl_id + 1])) + for h in range(num_heads): + single_block_row_attention = self._get_single_block_row_attention( + block_id=blk_rw_idx, + to_start_block_id=to_start_block_id, + to_end_block_id=plan_block_length[pl_id], + num_rand_blocks=plan_num_rand_blocks[pl_id], + window_block_left=window_block_left, + window_block_right=window_block_right, + global_block_left=global_block_left, + global_block_right=global_block_right, + indices_prng_key=indices_prng_key, + ) + rand_attn[h] = ( + rand_attn[h].at[blk_rw_idx, rnd_r_cnt:curr_r_cnt].set(single_block_row_attention) + ) + + if plan_num_rand_blocks[plan_idx] == 0: + continue + curr_r_cnt = int(sum(plan_num_rand_blocks[: plan_idx + 1])) + from_start_block_id = global_block_top + to_start_block_id = 0 + if plan_idx > 0: + rnd_r_cnt = int(sum(plan_num_rand_blocks[:plan_idx])) + from_start_block_id = plan_block_length[plan_idx - 1] + to_start_block_id = plan_block_length[plan_idx - 1] + for blk_rw_idx in range(from_start_block_id, plan_block_length[plan_idx]): + for h in range(num_heads): + single_block_row_attention = self._get_single_block_row_attention( + block_id=blk_rw_idx, + to_start_block_id=to_start_block_id, + to_end_block_id=plan_block_length[plan_idx], + num_rand_blocks=plan_num_rand_blocks[plan_idx], + window_block_left=window_block_left, + window_block_right=window_block_right, + global_block_left=global_block_left, + global_block_right=global_block_right, + indices_prng_key=indices_prng_key, + ) + rand_attn[h] = rand_attn[h].at[blk_rw_idx, rnd_r_cnt:curr_r_cnt].set(single_block_row_attention) + + for nh in range(num_heads): + rand_attn[nh] = rand_attn[nh][global_block_top : num_blocks - global_block_bottom, :] + return rand_attn + + @staticmethod + def _get_single_block_row_attention( + block_id, + to_start_block_id, + to_end_block_id, + num_rand_blocks, + indices_prng_key: Optional[jax.random.PRNGKey] = None, + window_block_left=1, + window_block_right=1, + global_block_left=1, + global_block_right=1, + ): + """ + For a single row block get random row attention. + + Args: + block_id: int. block id of row. + to_start_block_id: int. random attention column start id. + to_end_block_id: int. random attention column end id. + num_rand_blocks: int. number of random blocks to be selected. + indices_prng_key: jax.random.PRNGKey. PRNG key that is used to perform random jax operations + window_block_left: int. number of blocks of window to left of a block. + window_block_right: int. number of blocks of window to right of a block. + global_block_left: int. Number of blocks globally used to the left. + global_block_right: int. Number of blocks globally used to the right. + + Returns: + row containing the random attention vector of size num_rand_blocks. + """ + # list of to_blocks from which to choose random attention + to_block_list = jnp.arange(to_start_block_id, to_end_block_id, dtype=jnp.int32) + # permute the blocks + perm_block = jax.random.permutation(indices_prng_key, to_block_list) + + # illegal blocks for the current block id, using window + illegal_blocks = list(range(block_id - window_block_left, block_id + window_block_right + 1)) + + # Add blocks at the start and at the end + illegal_blocks.extend(list(range(global_block_left))) + illegal_blocks.extend(list(range(to_end_block_id - global_block_right, to_end_block_id))) + + # The second from_block cannot choose random attention on second last to_block + if block_id == 1: + illegal_blocks.append(to_end_block_id - 2) + + # The second last from_block cannot choose random attention on second to_block + if block_id == to_end_block_id - 2: + illegal_blocks.append(1) + + selected_random_blocks = [] + + for i in range(to_end_block_id - to_start_block_id): + if perm_block[i] not in illegal_blocks: + selected_random_blocks.append(perm_block[i]) + if len(selected_random_blocks) == num_rand_blocks: + break + return jnp.array(selected_random_blocks, dtype=jnp.int32) + + +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertSelfOutput with Bert->BigBird +class FlaxBigBirdSelfOutput(nn.Module): + config: BigBirdConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.dense = nn.Dense( + self.config.hidden_size, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + dtype=self.dtype, + ) + self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype) + self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob) + + def __call__(self, hidden_states, input_tensor, deterministic: bool = True): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states, deterministic=deterministic) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class FlaxBigBirdAttention(nn.Module): + config: BigBirdConfig + layer_id: Optional[int] = None + causal: bool = False + dtype: jnp.dtype = jnp.float32 + + def setup(self): + if self.config.attention_type == "original_full": + self.self = FlaxBigBirdSelfAttention(self.config, causal=self.causal, dtype=self.dtype) + elif self.config.attention_type == "block_sparse": + self.self = FlaxBigBirdBlockSparseAttention(self.config, block_sparse_seed=self.layer_id, dtype=self.dtype) + else: + raise ValueError( + f"Your `config.attention_type` is {self.config.attention_type} but it can either be `original_full` or" + " `block_sparse`" + ) + + self.output = FlaxBigBirdSelfOutput(self.config, dtype=self.dtype) + + def __call__( + self, + hidden_states, + attention_mask, + layer_head_mask, + key_value_states=None, + init_cache=False, + deterministic=True, + output_attentions: bool = False, + ): + # Attention mask comes in as attention_mask.shape == (*batch_sizes, kv_length) + # FLAX expects: attention_mask.shape == (*batch_sizes, 1, 1, kv_length) such that it is broadcastable + # with attn_weights.shape == (*batch_sizes, num_heads, q_length, kv_length) + if self.config.attention_type == "original_full": + attn_outputs = self.self( + hidden_states, + attention_mask, + layer_head_mask=layer_head_mask, + key_value_states=key_value_states, + init_cache=init_cache, + deterministic=deterministic, + output_attentions=output_attentions, + ) + else: + attn_outputs = self.self( + hidden_states, + attention_mask, + deterministic=deterministic, + output_attentions=output_attentions, + ) + attn_output = attn_outputs[0] + hidden_states = self.output(attn_output, hidden_states, deterministic=deterministic) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_outputs[1],) + + return outputs + + +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertIntermediate with Bert->BigBird +class FlaxBigBirdIntermediate(nn.Module): + config: BigBirdConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.dense = nn.Dense( + self.config.intermediate_size, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + dtype=self.dtype, + ) + self.activation = ACT2FN[self.config.hidden_act] + + def __call__(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.activation(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertOutput with Bert->BigBird +class FlaxBigBirdOutput(nn.Module): + config: BigBirdConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.dense = nn.Dense( + self.config.hidden_size, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + dtype=self.dtype, + ) + self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob) + self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype) + + def __call__(self, hidden_states, attention_output, deterministic: bool = True): + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states, deterministic=deterministic) + hidden_states = self.LayerNorm(hidden_states + attention_output) + return hidden_states + + +class FlaxBigBirdLayer(nn.Module): + config: BigBirdConfig + layer_id: Optional[int] = None + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.attention = FlaxBigBirdAttention( + self.config, layer_id=self.layer_id, causal=self.config.is_decoder, dtype=self.dtype + ) + self.intermediate = FlaxBigBirdIntermediate(self.config, dtype=self.dtype) + self.output = FlaxBigBirdOutput(self.config, dtype=self.dtype) + if self.config.add_cross_attention: + self.crossattention = FlaxBigBirdAttention(self.config, causal=False, dtype=self.dtype) + + # Copied from transformers.models.bert.modeling_flax_bert.FlaxBertLayer.__call__ with Bert->BigBird + def __call__( + self, + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states: Optional[jnp.ndarray] = None, + encoder_attention_mask: Optional[jnp.ndarray] = None, + init_cache: bool = False, + deterministic: bool = True, + output_attentions: bool = False, + ): + # Self Attention + attention_outputs = self.attention( + hidden_states, + attention_mask, + layer_head_mask=layer_head_mask, + init_cache=init_cache, + deterministic=deterministic, + output_attentions=output_attentions, + ) + attention_output = attention_outputs[0] + + # Cross-Attention Block + if encoder_hidden_states is not None: + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask=encoder_attention_mask, + layer_head_mask=layer_head_mask, + key_value_states=encoder_hidden_states, + deterministic=deterministic, + output_attentions=output_attentions, + ) + attention_output = cross_attention_outputs[0] + + hidden_states = self.intermediate(attention_output) + hidden_states = self.output(hidden_states, attention_output, deterministic=deterministic) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attention_outputs[1],) + if encoder_hidden_states is not None: + outputs += (cross_attention_outputs[1],) + return outputs + + +class FlaxBigBirdLayerCollection(nn.Module): + config: BigBirdConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + gradient_checkpointing: bool = False + + def setup(self): + if self.gradient_checkpointing: + FlaxBigBirdCheckpointLayer = remat(FlaxBigBirdLayer, static_argnums=(5, 6, 7)) + self.layers = [ + FlaxBigBirdCheckpointLayer(self.config, layer_id=i, name=str(i), dtype=self.dtype) + for i in range(self.config.num_hidden_layers) + ] + else: + self.layers = [ + FlaxBigBirdLayer(self.config, layer_id=i, name=str(i), dtype=self.dtype) + for i in range(self.config.num_hidden_layers) + ] + + # Copied from transformers.models.bert.modeling_flax_bert.FlaxBertLayerCollection.__call__ with Bert->BigBird + def __call__( + self, + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states: Optional[jnp.ndarray] = None, + encoder_attention_mask: Optional[jnp.ndarray] = None, + init_cache: bool = False, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + all_attentions = () if output_attentions else None + all_hidden_states = () if output_hidden_states else None + all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None + + # Check if head_mask has a correct number of layers specified if desired + if head_mask is not None: + if head_mask.shape[0] != (len(self.layers)): + raise ValueError( + f"The head_mask should be specified for {len(self.layers)} layers, but it is for " + f" {head_mask.shape[0]}." + ) + + for i, layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + + layer_outputs = layer( + hidden_states, + attention_mask, + head_mask[i] if head_mask is not None else None, + encoder_hidden_states, + encoder_attention_mask, + init_cache, + deterministic, + output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions += (layer_outputs[1],) + + if encoder_hidden_states is not None: + all_cross_attentions += (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states += (hidden_states,) + + outputs = (hidden_states, all_hidden_states, all_attentions, all_cross_attentions) + + if not return_dict: + return tuple(v for v in outputs if v is not None) + + return FlaxBaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_attentions, + cross_attentions=all_cross_attentions, + ) + + +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertEncoder with Bert->BigBird +class FlaxBigBirdEncoder(nn.Module): + config: BigBirdConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + gradient_checkpointing: bool = False + + def setup(self): + self.layer = FlaxBigBirdLayerCollection( + self.config, + dtype=self.dtype, + gradient_checkpointing=self.gradient_checkpointing, + ) + + def __call__( + self, + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states: Optional[jnp.ndarray] = None, + encoder_attention_mask: Optional[jnp.ndarray] = None, + init_cache: bool = False, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + return self.layer( + hidden_states, + attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + init_cache=init_cache, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertPredictionHeadTransform with Bert->BigBird +class FlaxBigBirdPredictionHeadTransform(nn.Module): + config: BigBirdConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.dense = nn.Dense(self.config.hidden_size, dtype=self.dtype) + self.activation = ACT2FN[self.config.hidden_act] + self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype) + + def __call__(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.activation(hidden_states) + return self.LayerNorm(hidden_states) + + +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertLMPredictionHead with Bert->BigBird, np.ndarray->jnp.ndarray +class FlaxBigBirdLMPredictionHead(nn.Module): + config: BigBirdConfig + dtype: jnp.dtype = jnp.float32 + bias_init: Callable[..., jnp.ndarray] = jax.nn.initializers.zeros + + def setup(self): + self.transform = FlaxBigBirdPredictionHeadTransform(self.config, dtype=self.dtype) + self.decoder = nn.Dense(self.config.vocab_size, dtype=self.dtype, use_bias=False) + self.bias = self.param("bias", self.bias_init, (self.config.vocab_size,)) + + def __call__(self, hidden_states, shared_embedding=None): + hidden_states = self.transform(hidden_states) + + if shared_embedding is not None: + hidden_states = self.decoder.apply({"params": {"kernel": shared_embedding.T}}, hidden_states) + else: + hidden_states = self.decoder(hidden_states) + + bias = jnp.asarray(self.bias, self.dtype) + hidden_states += bias + return hidden_states + + +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertOnlyMLMHead with Bert->BigBird +class FlaxBigBirdOnlyMLMHead(nn.Module): + config: BigBirdConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.predictions = FlaxBigBirdLMPredictionHead(self.config, dtype=self.dtype) + + def __call__(self, hidden_states, shared_embedding=None): + hidden_states = self.predictions(hidden_states, shared_embedding=shared_embedding) + return hidden_states + + +class FlaxBigBirdPreTrainingHeads(nn.Module): + config: BigBirdConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.predictions = FlaxBigBirdLMPredictionHead(self.config, dtype=self.dtype) + self.seq_relationship = nn.Dense(2, dtype=self.dtype) + + def __call__(self, hidden_states, pooled_output, shared_embedding=None): + prediction_scores = self.predictions(hidden_states, shared_embedding=shared_embedding) + seq_relationship_score = self.seq_relationship(pooled_output) + return prediction_scores, seq_relationship_score + + +class FlaxBigBirdPreTrainedModel(FlaxPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = BigBirdConfig + base_model_prefix = "bert" + module_class: nn.Module = None + + def __init__( + self, + config: BigBirdConfig, + input_shape: Optional[tuple] = None, + seed: int = 0, + dtype: jnp.dtype = jnp.float32, + _do_init: bool = True, + gradient_checkpointing: bool = False, + **kwargs, + ): + module = self.module_class(config=config, dtype=dtype, gradient_checkpointing=gradient_checkpointing, **kwargs) + if config.attention_type == "block_sparse" and input_shape is None: + input_shape = (1, 12 * config.block_size) + elif input_shape is None: + input_shape = (1, 1) + + super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) + + # Copied from transformers.models.bert.modeling_flax_bert.FlaxBertPreTrainedModel.enable_gradient_checkpointing + def enable_gradient_checkpointing(self): + self._module = self.module_class( + config=self.config, + dtype=self.dtype, + gradient_checkpointing=True, + ) + + def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict: + # init input tensors + input_ids = jnp.zeros(input_shape, dtype="i4") + token_type_ids = jnp.zeros_like(input_ids) + position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_shape) + attention_mask = jnp.ones_like(input_ids) + head_mask = jnp.ones((self.config.num_hidden_layers, self.config.num_attention_heads)) + + params_rng, dropout_rng, indices_rng = jax.random.split(rng, num=3) + rngs = {"params": params_rng, "dropout": dropout_rng, "indices": indices_rng} + + if self.config.add_cross_attention: + encoder_hidden_states = jnp.zeros(input_shape + (self.config.hidden_size,)) + encoder_attention_mask = attention_mask + module_init_outputs = self.module.init( + rngs, + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + return_dict=False, + ) + else: + module_init_outputs = self.module.init( + rngs, + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + return_dict=False, + ) + + random_params = module_init_outputs["params"] + + if params is not None: + random_params = flatten_dict(unfreeze(random_params)) + params = flatten_dict(unfreeze(params)) + for missing_key in self._missing_keys: + params[missing_key] = random_params[missing_key] + self._missing_keys = set() + return freeze(unflatten_dict(params)) + else: + return random_params + + # Copied from transformers.models.bart.modeling_flax_bart.FlaxBartDecoderPreTrainedModel.init_cache + def init_cache(self, batch_size, max_length): + r""" + Args: + batch_size (`int`): + batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache. + max_length (`int`): + maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized + cache. + """ + # init input variables to retrieve cache + input_ids = jnp.ones((batch_size, max_length), dtype="i4") + attention_mask = jnp.ones_like(input_ids, dtype="i4") + position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape) + + init_variables = self.module.init( + jax.random.PRNGKey(0), input_ids, attention_mask, position_ids, return_dict=False, init_cache=True + ) + return unfreeze(init_variables["cache"]) + + @add_start_docstrings_to_model_forward(BIG_BIRD_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + def __call__( + self, + input_ids, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + params: dict = None, + dropout_rng: Optional[jax.random.PRNGKey] = None, + indices_rng: Optional[jax.random.PRNGKey] = None, + train: bool = False, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + past_key_values: dict = None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.return_dict + + # init input tensors if not passed + if token_type_ids is None: + token_type_ids = jnp.zeros_like(input_ids) + + if position_ids is None: + position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape) + + if attention_mask is None: + attention_mask = jnp.ones_like(input_ids) + + if head_mask is None: + head_mask = jnp.ones((self.config.num_hidden_layers, self.config.num_attention_heads)) + + # Handle any PRNG if needed + rngs = {} + if indices_rng is not None: + rngs["indices"] = indices_rng + + if dropout_rng is not None: + rngs["dropout"] = dropout_rng + + inputs = {"params": params or self.params} + + if self.config.add_cross_attention: + # if past_key_values are passed then cache is already initialized a private flag init_cache has to be passed + # down to ensure cache is used. It has to be made sure that cache is marked as mutable so that it can be + # changed by FlaxBigBirdAttention module + if past_key_values: + inputs["cache"] = past_key_values + mutable = ["cache"] + else: + mutable = False + + outputs = self.module.apply( + inputs, + jnp.array(input_ids, dtype="i4"), + jnp.array(attention_mask, dtype="i4"), + token_type_ids=jnp.array(token_type_ids, dtype="i4"), + position_ids=jnp.array(position_ids, dtype="i4"), + head_mask=jnp.array(head_mask, dtype="i4"), + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + deterministic=not train, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + rngs=rngs, + mutable=mutable, + ) + + # add updated cache to model output + if past_key_values is not None and return_dict: + outputs, past_key_values = outputs + outputs["past_key_values"] = unfreeze(past_key_values["cache"]) + return outputs + elif past_key_values is not None and not return_dict: + outputs, past_key_values = outputs + outputs = outputs[:1] + (unfreeze(past_key_values["cache"]),) + outputs[1:] + + else: + outputs = self.module.apply( + inputs, + jnp.array(input_ids, dtype="i4"), + jnp.array(attention_mask, dtype="i4"), + token_type_ids=jnp.array(token_type_ids, dtype="i4"), + position_ids=jnp.array(position_ids, dtype="i4"), + head_mask=jnp.array(head_mask, dtype="i4"), + deterministic=not train, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + rngs=rngs, + ) + + return outputs + + +class FlaxBigBirdModule(nn.Module): + config: BigBirdConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + add_pooling_layer: bool = True + gradient_checkpointing: bool = False + + def setup(self): + self.embeddings = FlaxBigBirdEmbeddings(self.config, dtype=self.dtype) + self.encoder = FlaxBigBirdEncoder( + self.config, dtype=self.dtype, gradient_checkpointing=self.gradient_checkpointing + ) + self.pooler = nn.Dense( + self.config.hidden_size, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + dtype=self.dtype, + ) + + def __call__( + self, + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + encoder_hidden_states: Optional[jnp.ndarray] = None, + encoder_attention_mask: Optional[jnp.ndarray] = None, + init_cache: bool = False, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + hidden_states = self.embeddings( + input_ids, token_type_ids, position_ids, attention_mask, deterministic=deterministic + ) + outputs = self.encoder( + hidden_states, + attention_mask, + head_mask=head_mask, + deterministic=deterministic, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + init_cache=init_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = outputs[0] + + pooled = nn.tanh(self.pooler(hidden_states[:, 0, :])) if self.add_pooling_layer else None + + if not return_dict: + # if pooled is None, don't return it + if pooled is None: + return (hidden_states,) + outputs[1:] + return (hidden_states, pooled) + outputs[1:] + + return FlaxBaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=hidden_states, + pooler_output=pooled, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + +@add_start_docstrings( + "The bare BigBird Model transformer outputting raw hidden-states without any specific head on top.", + BIG_BIRD_START_DOCSTRING, +) +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertModel with Bert->BigBird +class FlaxBigBirdModel(FlaxBigBirdPreTrainedModel): + module_class = FlaxBigBirdModule + + +append_call_sample_docstring(FlaxBigBirdModel, _CHECKPOINT_FOR_DOC, FlaxBaseModelOutputWithPooling, _CONFIG_FOR_DOC) + + +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertForPreTrainingModule with Bert->BigBird +class FlaxBigBirdForPreTrainingModule(nn.Module): + config: BigBirdConfig + dtype: jnp.dtype = jnp.float32 + gradient_checkpointing: bool = False + + def setup(self): + self.bert = FlaxBigBirdModule( + config=self.config, + dtype=self.dtype, + gradient_checkpointing=self.gradient_checkpointing, + ) + self.cls = FlaxBigBirdPreTrainingHeads(config=self.config, dtype=self.dtype) + + def __call__( + self, + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + # Model + outputs = self.bert( + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if self.config.tie_word_embeddings: + shared_embedding = self.bert.variables["params"]["embeddings"]["word_embeddings"]["embedding"] + else: + shared_embedding = None + + hidden_states = outputs[0] + pooled_output = outputs[1] + + prediction_scores, seq_relationship_score = self.cls( + hidden_states, pooled_output, shared_embedding=shared_embedding + ) + + if not return_dict: + return (prediction_scores, seq_relationship_score) + outputs[2:] + + return FlaxBigBirdForPreTrainingOutput( + prediction_logits=prediction_scores, + seq_relationship_logits=seq_relationship_score, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + BigBird Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next + sentence prediction (classification)` head. + """, + BIG_BIRD_START_DOCSTRING, +) +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertForPreTraining with Bert->BigBird +class FlaxBigBirdForPreTraining(FlaxBigBirdPreTrainedModel): + module_class = FlaxBigBirdForPreTrainingModule + + +FLAX_BIG_BIRD_FOR_PRETRAINING_DOCSTRING = """ + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, FlaxBigBirdForPreTraining + + >>> tokenizer = AutoTokenizer.from_pretrained("google/bigbird-roberta-base") + >>> model = FlaxBigBirdForPreTraining.from_pretrained("google/bigbird-roberta-base") + + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="np") + >>> outputs = model(**inputs) + + >>> prediction_logits = outputs.prediction_logits + >>> seq_relationship_logits = outputs.seq_relationship_logits + ``` +""" + +overwrite_call_docstring( + FlaxBigBirdForPreTraining, + BIG_BIRD_INPUTS_DOCSTRING.format("batch_size, sequence_length") + FLAX_BIG_BIRD_FOR_PRETRAINING_DOCSTRING, +) +append_replace_return_docstrings( + FlaxBigBirdForPreTraining, output_type=FlaxBigBirdForPreTrainingOutput, config_class=_CONFIG_FOR_DOC +) + + +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertForMaskedLMModule with Bert->BigBird +class FlaxBigBirdForMaskedLMModule(nn.Module): + config: BigBirdConfig + dtype: jnp.dtype = jnp.float32 + gradient_checkpointing: bool = False + + def setup(self): + self.bert = FlaxBigBirdModule( + config=self.config, + add_pooling_layer=False, + dtype=self.dtype, + gradient_checkpointing=self.gradient_checkpointing, + ) + self.cls = FlaxBigBirdOnlyMLMHead(config=self.config, dtype=self.dtype) + + def __call__( + self, + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + # Model + outputs = self.bert( + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + if self.config.tie_word_embeddings: + shared_embedding = self.bert.variables["params"]["embeddings"]["word_embeddings"]["embedding"] + else: + shared_embedding = None + + # Compute the prediction scores + logits = self.cls(hidden_states, shared_embedding=shared_embedding) + + if not return_dict: + return (logits,) + outputs[1:] + + return FlaxMaskedLMOutput( + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings("""BigBird Model with a `language modeling` head on top.""", BIG_BIRD_START_DOCSTRING) +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertForMaskedLM with Bert->BigBird +class FlaxBigBirdForMaskedLM(FlaxBigBirdPreTrainedModel): + module_class = FlaxBigBirdForMaskedLMModule + + +append_call_sample_docstring(FlaxBigBirdForMaskedLM, _CHECKPOINT_FOR_DOC, FlaxMaskedLMOutput, _CONFIG_FOR_DOC) + + +class FlaxBigBirdClassificationHead(nn.Module): + """Head for sentence-level classification tasks.""" + + config: BigBirdConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.dense = nn.Dense(self.config.hidden_size, dtype=self.dtype) + classifier_dropout = ( + self.config.classifier_dropout + if self.config.classifier_dropout is not None + else self.config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.out_proj = nn.Dense(self.config.num_labels, dtype=self.dtype) + + def __call__(self, features, deterministic=True): + x = features[:, 0, :] # take token (equiv. to [CLS]) + x = self.dropout(x, deterministic=deterministic) + x = self.dense(x) + x = ACT2FN[self.config.hidden_act](x) + x = self.dropout(x, deterministic=deterministic) + x = self.out_proj(x) + return x + + +class FlaxBigBirdForSequenceClassificationModule(nn.Module): + config: BigBirdConfig + dtype: jnp.dtype = jnp.float32 + gradient_checkpointing: bool = False + + def setup(self): + self.bert = FlaxBigBirdModule( + config=self.config, dtype=self.dtype, gradient_checkpointing=self.gradient_checkpointing + ) + self.classifier = FlaxBigBirdClassificationHead(self.config, dtype=self.dtype) + + def __call__( + self, + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + # Model + outputs = self.bert( + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + logits = self.classifier(sequence_output, deterministic=deterministic) + + if not return_dict: + return (logits,) + outputs[2:] + + return FlaxSequenceClassifierOutput( + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + BigBird Model transformer with a sequence classification/regression head on top (a linear layer on top of the + pooled output) e.g. for GLUE tasks. + """, + BIG_BIRD_START_DOCSTRING, +) +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertForSequenceClassification with Bert->BigBird +class FlaxBigBirdForSequenceClassification(FlaxBigBirdPreTrainedModel): + module_class = FlaxBigBirdForSequenceClassificationModule + + +append_call_sample_docstring( + FlaxBigBirdForSequenceClassification, + _CHECKPOINT_FOR_DOC, + FlaxSequenceClassifierOutput, + _CONFIG_FOR_DOC, +) + + +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertForMultipleChoiceModule with Bert->BigBird +class FlaxBigBirdForMultipleChoiceModule(nn.Module): + config: BigBirdConfig + dtype: jnp.dtype = jnp.float32 + gradient_checkpointing: bool = False + + def setup(self): + self.bert = FlaxBigBirdModule( + config=self.config, + dtype=self.dtype, + gradient_checkpointing=self.gradient_checkpointing, + ) + self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob) + self.classifier = nn.Dense(1, dtype=self.dtype) + + def __call__( + self, + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + num_choices = input_ids.shape[1] + input_ids = input_ids.reshape(-1, input_ids.shape[-1]) if input_ids is not None else None + attention_mask = attention_mask.reshape(-1, attention_mask.shape[-1]) if attention_mask is not None else None + token_type_ids = token_type_ids.reshape(-1, token_type_ids.shape[-1]) if token_type_ids is not None else None + position_ids = position_ids.reshape(-1, position_ids.shape[-1]) if position_ids is not None else None + + # Model + outputs = self.bert( + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + pooled_output = self.dropout(pooled_output, deterministic=deterministic) + logits = self.classifier(pooled_output) + + reshaped_logits = logits.reshape(-1, num_choices) + + if not return_dict: + return (reshaped_logits,) + outputs[2:] + + return FlaxMultipleChoiceModelOutput( + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + BigBird Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a + softmax) e.g. for RocStories/SWAG tasks. + """, + BIG_BIRD_START_DOCSTRING, +) +class FlaxBigBirdForMultipleChoice(FlaxBigBirdPreTrainedModel): + module_class = FlaxBigBirdForMultipleChoiceModule + + def __init__( + self, + config: BigBirdConfig, + input_shape: Optional[tuple] = None, + seed: int = 0, + dtype: jnp.dtype = jnp.float32, + _do_init: bool = True, + **kwargs, + ): + if config.attention_type == "block_sparse" and input_shape is None: + input_shape = (1, 1, 12 * config.block_size) + elif input_shape is None: + input_shape = (1, 1) + super().__init__(config, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) + + +overwrite_call_docstring( + FlaxBigBirdForMultipleChoice, BIG_BIRD_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length") +) +append_call_sample_docstring( + FlaxBigBirdForMultipleChoice, + _CHECKPOINT_FOR_DOC, + FlaxMultipleChoiceModelOutput, + _CONFIG_FOR_DOC, +) + + +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertForTokenClassificationModule with Bert->BigBird +class FlaxBigBirdForTokenClassificationModule(nn.Module): + config: BigBirdConfig + dtype: jnp.dtype = jnp.float32 + gradient_checkpointing: bool = False + + def setup(self): + self.bert = FlaxBigBirdModule( + config=self.config, + dtype=self.dtype, + add_pooling_layer=False, + gradient_checkpointing=self.gradient_checkpointing, + ) + classifier_dropout = ( + self.config.classifier_dropout + if self.config.classifier_dropout is not None + else self.config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(rate=classifier_dropout) + self.classifier = nn.Dense(self.config.num_labels, dtype=self.dtype) + + def __call__( + self, + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + # Model + outputs = self.bert( + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + hidden_states = self.dropout(hidden_states, deterministic=deterministic) + logits = self.classifier(hidden_states) + + if not return_dict: + return (logits,) + outputs[1:] + + return FlaxTokenClassifierOutput( + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + BigBird Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for + Named-Entity-Recognition (NER) tasks. + """, + BIG_BIRD_START_DOCSTRING, +) +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertForTokenClassification with Bert->BigBird +class FlaxBigBirdForTokenClassification(FlaxBigBirdPreTrainedModel): + module_class = FlaxBigBirdForTokenClassificationModule + + +append_call_sample_docstring( + FlaxBigBirdForTokenClassification, + _CHECKPOINT_FOR_DOC, + FlaxTokenClassifierOutput, + _CONFIG_FOR_DOC, +) + + +class FlaxBigBirdForQuestionAnsweringHead(nn.Module): + config: BigBirdConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob) + self.intermediate = FlaxBigBirdIntermediate(self.config, dtype=self.dtype) + self.output = FlaxBigBirdOutput(self.config, dtype=self.dtype) + self.qa_outputs = nn.Dense(self.config.num_labels, dtype=self.dtype) + + def __call__(self, encoder_output, deterministic=True): + hidden_states = self.dropout(encoder_output, deterministic=deterministic) + hidden_states = self.intermediate(hidden_states) + hidden_states = self.output(hidden_states, encoder_output) + hidden_states = self.qa_outputs(hidden_states) + return hidden_states + + +class FlaxBigBirdForQuestionAnsweringModule(nn.Module): + config: BigBirdConfig + dtype: jnp.dtype = jnp.float32 + add_pooling_layer: bool = False + gradient_checkpointing: bool = False + + def setup(self): + self.config.num_labels = 2 + self.bert = FlaxBigBirdModule( + self.config, + dtype=self.dtype, + add_pooling_layer=self.add_pooling_layer, + gradient_checkpointing=self.gradient_checkpointing, + ) + self.qa_classifier = FlaxBigBirdForQuestionAnsweringHead(self.config, dtype=self.dtype) + + def __call__( + self, + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + logits_mask=None, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + # Model + outputs = self.bert( + input_ids, + attention_mask, + token_type_ids, + position_ids, + head_mask, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + pooled_output = outputs[1] if self.add_pooling_layer else None + logits = self.qa_classifier(hidden_states, deterministic=deterministic) + + if logits_mask is not None: + # removing question tokens from the competition + logits = logits - logits_mask * 1e6 + + start_logits, end_logits = jnp.split(logits, self.config.num_labels, axis=-1) + start_logits = start_logits.squeeze(-1) + end_logits = end_logits.squeeze(-1) + + if not return_dict: + return (start_logits, end_logits) + outputs[1:] + + return FlaxBigBirdForQuestionAnsweringModelOutput( + start_logits=start_logits, + end_logits=end_logits, + pooled_output=pooled_output, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + BigBird Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layers on top of the hidden-states output to compute `span start logits` and `span end logits`). + """, + BIG_BIRD_START_DOCSTRING, +) +class FlaxBigBirdForQuestionAnswering(FlaxBigBirdPreTrainedModel): + module_class = FlaxBigBirdForQuestionAnsweringModule + + @add_start_docstrings_to_model_forward(BIG_BIRD_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + def __call__( + self, + input_ids, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + question_lengths=None, + params: dict = None, + dropout_rng: Optional[jax.random.PRNGKey] = None, + indices_rng: Optional[jax.random.PRNGKey] = None, + train: bool = False, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.return_dict + + if position_ids is None: + position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape) + + if attention_mask is None: + attention_mask = jnp.ones_like(input_ids) + + if head_mask is None: + head_mask = jnp.ones((self.config.num_hidden_layers, self.config.num_attention_heads)) + + if question_lengths is None and input_ids is not None: + # assuming input_ids format: context + question_lengths = jnp.argmax((input_ids == self.config.sep_token_id).astype("i4"), axis=-1) + 1 + question_lengths = jnp.expand_dims(question_lengths, axis=1) + + seqlen = input_ids.shape[1] + + logits_mask = None + if question_lengths is not None: + # setting lengths logits to `-inf` + logits_mask = self.prepare_question_mask(question_lengths, seqlen) + if token_type_ids is None: + token_type_ids = (~logits_mask).astype("i4") + logits_mask = jnp.expand_dims(logits_mask, axis=2) + logits_mask = logits_mask.at[:, 0].set(False) + + # init input tensors if not passed + if token_type_ids is None: + token_type_ids = jnp.zeros_like(input_ids) + + # Handle any PRNG if needed + rngs = {} + if dropout_rng is not None: + rngs["dropout"] = dropout_rng + + if indices_rng is not None: + rngs["indices"] = indices_rng + + return self.module.apply( + {"params": params or self.params}, + jnp.array(input_ids, dtype="i4"), + jnp.array(attention_mask, dtype="i4"), + token_type_ids, + jnp.array(position_ids, dtype="i4"), + jnp.array(head_mask, dtype="i4"), + logits_mask, + not train, + output_attentions, + output_hidden_states, + return_dict, + rngs=rngs, + ) + + @staticmethod + def prepare_question_mask(q_lengths, maxlen: int): + # q_lengths -> (bz, 1) + mask = jnp.arange(0, maxlen) + mask = jnp.expand_dims(mask, axis=0) < q_lengths + return mask + + +append_call_sample_docstring( + FlaxBigBirdForQuestionAnswering, + _CHECKPOINT_FOR_DOC, + FlaxBigBirdForQuestionAnsweringModelOutput, + _CONFIG_FOR_DOC, +) + + +class FlaxBigBirdForCausalLMModule(nn.Module): + config: BigBirdConfig + dtype: jnp.dtype = jnp.float32 + gradient_checkpointing: bool = False + + def setup(self): + self.bert = FlaxBigBirdModule( + config=self.config, + add_pooling_layer=False, + dtype=self.dtype, + gradient_checkpointing=self.gradient_checkpointing, + ) + self.cls = FlaxBigBirdOnlyMLMHead(config=self.config, dtype=self.dtype) + + def __call__( + self, + input_ids, + attention_mask, + position_ids, + token_type_ids: Optional[jnp.ndarray] = None, + head_mask: Optional[jnp.ndarray] = None, + encoder_hidden_states: Optional[jnp.ndarray] = None, + encoder_attention_mask: Optional[jnp.ndarray] = None, + init_cache: bool = False, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + # Model + outputs = self.bert( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + init_cache=init_cache, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + if self.config.tie_word_embeddings: + shared_embedding = self.bert.variables["params"]["embeddings"]["word_embeddings"]["embedding"] + else: + shared_embedding = None + + # Compute the prediction scores + logits = self.cls(hidden_states, shared_embedding=shared_embedding) + + if not return_dict: + return (logits,) + outputs[1:] + + return FlaxCausalLMOutputWithCrossAttentions( + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + +@add_start_docstrings( + """ + BigBird Model with a language modeling head on top (a linear layer on top of the hidden-states output) e.g for + autoregressive tasks. + """, + BIG_BIRD_START_DOCSTRING, +) +# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertForCausalLM with Bert->BigBird +class FlaxBigBirdForCausalLM(FlaxBigBirdPreTrainedModel): + module_class = FlaxBigBirdForCausalLMModule + + def prepare_inputs_for_generation(self, input_ids, max_length, attention_mask: Optional[jax.Array] = None): + # initializing the cache + batch_size, seq_length = input_ids.shape + + past_key_values = self.init_cache(batch_size, max_length) + # Note that usually one would have to put 0's in the attention_mask for x > input_ids.shape[-1] and x < cache_length. + # But since the decoder uses a causal mask, those positions are masked anyway. + # Thus, we can create a single static attention_mask here, which is more efficient for compilation + extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4") + if attention_mask is not None: + position_ids = attention_mask.cumsum(axis=-1) - 1 + extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, attention_mask, (0, 0)) + else: + position_ids = jnp.broadcast_to(jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length)) + + return { + "past_key_values": past_key_values, + "attention_mask": extended_attention_mask, + "position_ids": position_ids, + } + + def update_inputs_for_generation(self, model_outputs, model_kwargs): + model_kwargs["past_key_values"] = model_outputs.past_key_values + model_kwargs["position_ids"] = model_kwargs["position_ids"][:, -1:] + 1 + return model_kwargs + + +append_call_sample_docstring( + FlaxBigBirdForCausalLM, + _CHECKPOINT_FOR_DOC, + FlaxCausalLMOutputWithCrossAttentions, + _CONFIG_FOR_DOC, +) + + +__all__ = [ + "FlaxBigBirdForCausalLM", + "FlaxBigBirdForMaskedLM", + "FlaxBigBirdForMultipleChoice", + "FlaxBigBirdForPreTraining", + "FlaxBigBirdForQuestionAnswering", + "FlaxBigBirdForSequenceClassification", + "FlaxBigBirdForTokenClassification", + "FlaxBigBirdModel", + "FlaxBigBirdPreTrainedModel", +] diff --git a/docs/transformers/build/lib/transformers/models/big_bird/tokenization_big_bird.py b/docs/transformers/build/lib/transformers/models/big_bird/tokenization_big_bird.py new file mode 100644 index 0000000000000000000000000000000000000000..43172e20f35fd9df4b9a19fce0f0097bdec28403 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/big_bird/tokenization_big_bird.py @@ -0,0 +1,326 @@ +# coding=utf-8 +# Copyright 2021 Google Research and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for BigBird.""" + +import os +import re +from shutil import copyfile +from typing import Any, Dict, List, Optional, Tuple + +import sentencepiece as spm + +from ...tokenization_utils import AddedToken, PreTrainedTokenizer +from ...utils import logging +from ...utils.import_utils import requires + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"} + + +@requires(backends=("sentencepiece",)) +class BigBirdTokenizer(PreTrainedTokenizer): + """ + Construct a BigBird tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece). + + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that + contains the vocabulary necessary to instantiate a tokenizer. + unk_token (`str`, *optional*, defaults to `""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + bos_token (`str`, *optional*, defaults to `""`): + The begin of sequence token. + eos_token (`str`, *optional*, defaults to `""`): + The end of sequence token. + pad_token (`str`, *optional*, defaults to `""`): + The token used for padding, for example when batching sequences of different lengths. + sep_token (`str`, *optional*, defaults to `"[SEP]"`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + mask_token (`str`, *optional*, defaults to `"[MASK]"`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + cls_token (`str`, *optional*, defaults to `"[CLS]"`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + sp_model_kwargs (`dict`, *optional*): + Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for + SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, + to set: + + - `enable_sampling`: Enable subword regularization. + - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - `nbest_size = {0,1}`: No sampling is performed. + - `nbest_size > 1`: samples from the nbest_size results. + - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. + + - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. + """ + + vocab_files_names = VOCAB_FILES_NAMES + model_input_names = ["input_ids", "attention_mask"] + prefix_tokens: List[int] = [] + + def __init__( + self, + vocab_file, + unk_token="", + bos_token="", + eos_token="", + pad_token="", + sep_token="[SEP]", + mask_token="[MASK]", + cls_token="[CLS]", + sp_model_kwargs: Optional[Dict[str, Any]] = None, + **kwargs, + ) -> None: + bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token + eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token + unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token + pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token + cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token + sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token + + # Mask token behave like a normal word, i.e. include the space before it + mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token + + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + + self.vocab_file = vocab_file + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(vocab_file) + + super().__init__( + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + pad_token=pad_token, + sep_token=sep_token, + mask_token=mask_token, + cls_token=cls_token, + sp_model_kwargs=self.sp_model_kwargs, + **kwargs, + ) + + @property + def vocab_size(self): + return self.sp_model.get_piece_size() + + def get_vocab(self): + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def __getstate__(self): + state = self.__dict__.copy() + state["sp_model"] = None + return state + + def __setstate__(self, d): + self.__dict__ = d + + # for backward compatibility + if not hasattr(self, "sp_model_kwargs"): + self.sp_model_kwargs = {} + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(self.vocab_file) + + def _tokenize(self, text: str) -> List[str]: + """Take as input a string and return a list of strings (tokens) for words/sub-words""" + return self.sp_model.encode(text, out_type=str) + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.sp_model.piece_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + token = self.sp_model.IdToPiece(index) + return token + + # Copied from transformers.models.albert.tokenization_albert.AlbertTokenizer.convert_tokens_to_string + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + current_sub_tokens = [] + out_string = "" + prev_is_special = False + for token in tokens: + # make sure that special tokens are not decoded using sentencepiece model + if token in self.all_special_tokens: + if not prev_is_special: + out_string += " " + out_string += self.sp_model.decode(current_sub_tokens) + token + prev_is_special = True + current_sub_tokens = [] + else: + current_sub_tokens.append(token) + prev_is_special = False + out_string += self.sp_model.decode(current_sub_tokens) + return out_string.strip() + + def _decode( + self, + token_ids: List[int], + skip_special_tokens: bool = False, + clean_up_tokenization_spaces: Optional[bool] = None, + spaces_between_special_tokens: bool = True, + **kwargs, + ) -> str: + self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False) + + filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens) + + # To avoid mixing byte-level and unicode for byte-level BPT + # we need to build string separately for added tokens and byte-level tokens + # cf. https://github.com/huggingface/transformers/issues/1133 + sub_texts = [] + current_sub_text = [] + for token in filtered_tokens: + if skip_special_tokens and token in self.all_special_ids: + continue + if token in self.added_tokens_encoder: + if current_sub_text: + sub_texts.append(self.convert_tokens_to_string(current_sub_text)) + current_sub_text = [] + sub_texts.append(token) + else: + current_sub_text.append(token) + if current_sub_text: + sub_texts.append(self.convert_tokens_to_string(current_sub_text)) + + # Mimic the behavior of the Rust tokenizer: + # No space before [MASK] and [SEP] + if spaces_between_special_tokens: + text = re.sub(r" (\[(MASK|SEP)\])", r"\1", " ".join(sub_texts)) + else: + text = "".join(sub_texts) + + clean_up_tokenization_spaces = ( + clean_up_tokenization_spaces + if clean_up_tokenization_spaces is not None + else self.clean_up_tokenization_spaces + ) + if clean_up_tokenization_spaces: + clean_text = self.clean_up_tokenization(text) + return clean_text + else: + return text + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file): + copyfile(self.vocab_file, out_vocab_file) + elif not os.path.isfile(self.vocab_file): + with open(out_vocab_file, "wb") as fi: + content_spiece_model = self.sp_model.serialized_model_proto() + fi.write(content_spiece_model) + + return (out_vocab_file,) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A Big Bird sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + token_ids_1 + sep + + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is None: + return [1] + ([0] * len(token_ids_0)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. A BERT sequence + pair mask has the following format: :: 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 | first sequence | second + sequence | If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s). + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s). + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + + +__all__ = ["BigBirdTokenizer"] diff --git a/docs/transformers/build/lib/transformers/models/big_bird/tokenization_big_bird_fast.py b/docs/transformers/build/lib/transformers/models/big_bird/tokenization_big_bird_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..83f2fac07fae72b758f8671458ac0843538eba4b --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/big_bird/tokenization_big_bird_fast.py @@ -0,0 +1,232 @@ +# coding=utf-8 +# Copyright 2018 Google AI, Google Brain and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for Big Bird model.""" + +import os +from shutil import copyfile +from typing import List, Optional, Tuple + +from ...tokenization_utils import AddedToken +from ...tokenization_utils_fast import PreTrainedTokenizerFast +from ...utils import is_sentencepiece_available, logging + + +if is_sentencepiece_available(): + from .tokenization_big_bird import BigBirdTokenizer +else: + BigBirdTokenizer = None + +logger = logging.get_logger(__name__) +VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.json"} + + +SPIECE_UNDERLINE = "▁" + + +class BigBirdTokenizerFast(PreTrainedTokenizerFast): + """ + Construct a "fast" BigBird tokenizer (backed by HuggingFace's *tokenizers* library). Based on + [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models). This + tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods + + Args: + vocab_file (`str`): + [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that + contains the vocabulary necessary to instantiate a tokenizer. + bos_token (`str`, *optional*, defaults to `""`): + The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. + + + + When building a sequence using special tokens, this is not the token that is used for the beginning of + sequence. The token used is the `cls_token`. + + + + eos_token (`str`, *optional*, defaults to `""`): + The end of sequence token. .. note:: When building a sequence using special tokens, this is not the token + that is used for the end of sequence. The token used is the `sep_token`. + unk_token (`str`, *optional*, defaults to `""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + sep_token (`str`, *optional*, defaults to `"[SEP]"`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + pad_token (`str`, *optional*, defaults to `""`): + The token used for padding, for example when batching sequences of different lengths. + cls_token (`str`, *optional*, defaults to `"[CLS]"`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + mask_token (`str`, *optional*, defaults to `"[MASK]"`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + """ + + vocab_files_names = VOCAB_FILES_NAMES + slow_tokenizer_class = BigBirdTokenizer + model_input_names = ["input_ids", "attention_mask"] + prefix_tokens: List[int] = [] + + def __init__( + self, + vocab_file=None, + tokenizer_file=None, + unk_token="", + bos_token="", + eos_token="", + pad_token="", + sep_token="[SEP]", + mask_token="[MASK]", + cls_token="[CLS]", + **kwargs, + ): + bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token + eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token + unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token + pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token + cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token + sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token + + # Mask token behave like a normal word, i.e. include the space before it + mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token + + super().__init__( + vocab_file, + tokenizer_file=tokenizer_file, + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + **kwargs, + ) + + self.vocab_file = vocab_file + + @property + def can_save_slow_tokenizer(self) -> bool: + return os.path.isfile(self.vocab_file) if self.vocab_file else False + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. An BigBird sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + if token_ids_1 is None: + return cls + token_ids_0 + sep + return cls + token_ids_0 + sep + token_ids_1 + sep + + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`List[int]`): + List of ids. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Set to True if the token list is already formatted with special tokens for the model + + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formatted with special tokens for the model." + ) + return [1 if x in [self.sep_token_id, self.cls_token_id] else 0 for x in token_ids_0] + + if token_ids_1 is None: + return [1] + ([0] * len(token_ids_0)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT + sequence pair mask has the following format: + + ``` + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + ``` + + if token_ids_1 is None, only returns the first portion of the mask (0s). + + Args: + token_ids_0 (`List[int]`): + List of ids. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s). + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + if not self.can_save_slow_tokenizer: + raise ValueError( + "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow " + "tokenizer." + ) + + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): + copyfile(self.vocab_file, out_vocab_file) + + return (out_vocab_file,) + + +__all__ = ["BigBirdTokenizerFast"] diff --git a/docs/transformers/build/lib/transformers/models/bigbird_pegasus/__init__.py b/docs/transformers/build/lib/transformers/models/bigbird_pegasus/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..d203b2a578f9a4c6125cffb6041a627167518680 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bigbird_pegasus/__init__.py @@ -0,0 +1,27 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_bigbird_pegasus import * + from .modeling_bigbird_pegasus import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/docs/transformers/build/lib/transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py b/docs/transformers/build/lib/transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py new file mode 100644 index 0000000000000000000000000000000000000000..5d9c9bf1a4b0b228f60c90c2c8ffbd4856557a62 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py @@ -0,0 +1,412 @@ +# coding=utf-8 +# Copyright Google Research and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""BigBirdPegasus model configuration""" + +from collections import OrderedDict +from typing import Any, Mapping, Optional + +from ... import PreTrainedTokenizer +from ...configuration_utils import PretrainedConfig +from ...onnx import OnnxConfig, OnnxConfigWithPast, OnnxSeq2SeqConfigWithPast +from ...onnx.utils import compute_effective_axis_dimension +from ...utils import TensorType, is_torch_available, logging + + +logger = logging.get_logger(__name__) + + +class BigBirdPegasusConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`BigBirdPegasusModel`]. It is used to instantiate + an BigBirdPegasus model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the BigBirdPegasus + [google/bigbird-pegasus-large-arxiv](https://huggingface.co/google/bigbird-pegasus-large-arxiv) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 96103): + Vocabulary size of the BigBirdPegasus model. Defines the number of different tokens that can be represented + by the `inputs_ids` passed when calling [`BigBirdPegasusModel`]. + d_model (`int`, *optional*, defaults to 1024): + Dimension of the layers and the pooler layer. + encoder_layers (`int`, *optional*, defaults to 16): + Number of encoder layers. + decoder_layers (`int`, *optional*, defaults to 16): + Number of decoder layers. + encoder_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer encoder. + decoder_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer decoder. + decoder_ffn_dim (`int`, *optional*, defaults to 4096): + Dimension of the "intermediate" (often named feed-forward) layer in decoder. + encoder_ffn_dim (`int`, *optional*, defaults to 4096): + Dimension of the "intermediate" (often named feed-forward) layer in decoder. + activation_function (`str` or `function`, *optional*, defaults to `"gelu_new"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + activation_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for activations inside the fully connected layer. + classifier_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for classifier. + max_position_embeddings (`int`, *optional*, defaults to 4096): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 1024 or 2048 or 4096). + init_std (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + encoder_layerdrop (`float`, *optional*, defaults to 0.0): + The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556) + for more details. + decoder_layerdrop (`float`, *optional*, defaults to 0.0): + The LayerDrop probability for the decoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556) + for more details. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). + attention_type (`str`, *optional*, defaults to `"block_sparse"`) + Whether to use block sparse attention (with n complexity) as introduced in paper or original attention + layer (with n^2 complexity) in encoder. Possible values are `"original_full"` and `"block_sparse"`. + use_bias (`bool`, *optional*, defaults to `False`) + Whether to use bias in query, key, value. + block_size (`int`, *optional*, defaults to 64) + Size of each block. Useful only when `attention_type == "block_sparse"`. + num_random_blocks (`int`, *optional*, defaults to 3) + Each query is going to attend these many number of random blocks. Useful only when `attention_type == + "block_sparse"`. + scale_embeddings (`bool`, *optional*, defaults to `True`) + Whether to rescale embeddings with (hidden_size ** 0.5). + + Example: + + ```python + >>> from transformers import BigBirdPegasusConfig, BigBirdPegasusModel + + >>> # Initializing a BigBirdPegasus bigbird-pegasus-base style configuration + >>> configuration = BigBirdPegasusConfig() + + >>> # Initializing a model (with random weights) from the bigbird-pegasus-base style configuration + >>> model = BigBirdPegasusModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "bigbird_pegasus" + keys_to_ignore_at_inference = ["past_key_values"] + attribute_map = { + "num_attention_heads": "encoder_attention_heads", + "hidden_size": "d_model", + "attention_probs_dropout_prob": "attention_dropout", + } + + def __init__( + self, + vocab_size=96103, + max_position_embeddings=4096, + encoder_layers=16, + encoder_ffn_dim=4096, + encoder_attention_heads=16, + decoder_layers=16, + decoder_ffn_dim=4096, + decoder_attention_heads=16, + encoder_layerdrop=0.0, + decoder_layerdrop=0.0, + use_cache=True, + is_encoder_decoder=True, + activation_function="gelu_new", + d_model=1024, + dropout=0.1, + attention_dropout=0.0, + activation_dropout=0.0, + init_std=0.02, + decoder_start_token_id=2, + classifier_dropout=0.0, + scale_embedding=True, + pad_token_id=0, + bos_token_id=2, + eos_token_id=1, + attention_type="block_sparse", # only for encoder + block_size=64, + num_random_blocks=3, + use_bias=False, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.d_model = d_model + self.encoder_ffn_dim = encoder_ffn_dim + self.encoder_layers = encoder_layers + self.encoder_attention_heads = encoder_attention_heads + self.decoder_ffn_dim = decoder_ffn_dim + self.decoder_layers = decoder_layers + self.decoder_attention_heads = decoder_attention_heads + self.dropout = dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.activation_function = activation_function + self.init_std = init_std + self.encoder_layerdrop = encoder_layerdrop + self.decoder_layerdrop = decoder_layerdrop + self.classifier_dropout = classifier_dropout + self.use_cache = use_cache + self.num_hidden_layers = encoder_layers + self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True + + # extra config + self.attention_type = attention_type + self.block_size = block_size + self.num_random_blocks = num_random_blocks + self.use_bias = use_bias + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + is_encoder_decoder=is_encoder_decoder, + decoder_start_token_id=decoder_start_token_id, + **kwargs, + ) + + +# Copied from transformers.models.bart.configuration_bart.BartOnnxConfig +class BigBirdPegasusOnnxConfig(OnnxSeq2SeqConfigWithPast): + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + if self.task in ["default", "seq2seq-lm"]: + common_inputs = OrderedDict( + [ + ("input_ids", {0: "batch", 1: "encoder_sequence"}), + ("attention_mask", {0: "batch", 1: "encoder_sequence"}), + ] + ) + + if self.use_past: + common_inputs["decoder_input_ids"] = {0: "batch"} + common_inputs["decoder_attention_mask"] = {0: "batch", 1: "past_decoder_sequence + sequence"} + else: + common_inputs["decoder_input_ids"] = {0: "batch", 1: "decoder_sequence"} + common_inputs["decoder_attention_mask"] = {0: "batch", 1: "decoder_sequence"} + + if self.use_past: + self.fill_with_past_key_values_(common_inputs, direction="inputs") + elif self.task == "causal-lm": + # TODO: figure this case out. + common_inputs = OrderedDict( + [ + ("input_ids", {0: "batch", 1: "encoder_sequence"}), + ("attention_mask", {0: "batch", 1: "encoder_sequence"}), + ] + ) + if self.use_past: + num_encoder_layers, _ = self.num_layers + for i in range(num_encoder_layers): + common_inputs[f"past_key_values.{i}.key"] = {0: "batch", 2: "past_sequence + sequence"} + common_inputs[f"past_key_values.{i}.value"] = {0: "batch", 2: "past_sequence + sequence"} + else: + common_inputs = OrderedDict( + [ + ("input_ids", {0: "batch", 1: "encoder_sequence"}), + ("attention_mask", {0: "batch", 1: "encoder_sequence"}), + ("decoder_input_ids", {0: "batch", 1: "decoder_sequence"}), + ("decoder_attention_mask", {0: "batch", 1: "decoder_sequence"}), + ] + ) + + return common_inputs + + @property + def outputs(self) -> Mapping[str, Mapping[int, str]]: + if self.task in ["default", "seq2seq-lm"]: + common_outputs = super().outputs + else: + common_outputs = super(OnnxConfigWithPast, self).outputs + if self.use_past: + num_encoder_layers, _ = self.num_layers + for i in range(num_encoder_layers): + common_outputs[f"present.{i}.key"] = {0: "batch", 2: "past_sequence + sequence"} + common_outputs[f"present.{i}.value"] = {0: "batch", 2: "past_sequence + sequence"} + return common_outputs + + def _generate_dummy_inputs_for_default_and_seq2seq_lm( + self, + tokenizer: PreTrainedTokenizer, + batch_size: int = -1, + seq_length: int = -1, + is_pair: bool = False, + framework: Optional[TensorType] = None, + ) -> Mapping[str, Any]: + encoder_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering( + tokenizer, batch_size, seq_length, is_pair, framework + ) + + # Generate decoder inputs + decoder_seq_length = seq_length if not self.use_past else 1 + decoder_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering( + tokenizer, batch_size, decoder_seq_length, is_pair, framework + ) + decoder_inputs = {f"decoder_{name}": tensor for name, tensor in decoder_inputs.items()} + common_inputs = dict(**encoder_inputs, **decoder_inputs) + + if self.use_past: + if not is_torch_available(): + raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.") + else: + import torch + batch, encoder_seq_length = common_inputs["input_ids"].shape + decoder_seq_length = common_inputs["decoder_input_ids"].shape[1] + num_encoder_attention_heads, num_decoder_attention_heads = self.num_attention_heads + encoder_shape = ( + batch, + num_encoder_attention_heads, + encoder_seq_length, + self._config.hidden_size // num_encoder_attention_heads, + ) + decoder_past_length = decoder_seq_length + 3 + decoder_shape = ( + batch, + num_decoder_attention_heads, + decoder_past_length, + self._config.hidden_size // num_decoder_attention_heads, + ) + + common_inputs["decoder_attention_mask"] = torch.cat( + [common_inputs["decoder_attention_mask"], torch.ones(batch, decoder_past_length)], dim=1 + ) + + common_inputs["past_key_values"] = [] + # If the number of encoder and decoder layers are present in the model configuration, both are considered + num_encoder_layers, num_decoder_layers = self.num_layers + min_num_layers = min(num_encoder_layers, num_decoder_layers) + max_num_layers = max(num_encoder_layers, num_decoder_layers) - min_num_layers + remaining_side_name = "encoder" if num_encoder_layers > num_decoder_layers else "decoder" + + for _ in range(min_num_layers): + common_inputs["past_key_values"].append( + ( + torch.zeros(decoder_shape), + torch.zeros(decoder_shape), + torch.zeros(encoder_shape), + torch.zeros(encoder_shape), + ) + ) + # TODO: test this. + shape = encoder_shape if remaining_side_name == "encoder" else decoder_shape + for _ in range(min_num_layers, max_num_layers): + common_inputs["past_key_values"].append((torch.zeros(shape), torch.zeros(shape))) + return common_inputs + + def _generate_dummy_inputs_for_causal_lm( + self, + tokenizer: PreTrainedTokenizer, + batch_size: int = -1, + seq_length: int = -1, + is_pair: bool = False, + framework: Optional[TensorType] = None, + ) -> Mapping[str, Any]: + common_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering( + tokenizer, batch_size, seq_length, is_pair, framework + ) + + if self.use_past: + if not is_torch_available(): + raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.") + else: + import torch + batch, seqlen = common_inputs["input_ids"].shape + # Not using the same length for past_key_values + past_key_values_length = seqlen + 2 + num_encoder_layers, _ = self.num_layers + num_encoder_attention_heads, _ = self.num_attention_heads + past_shape = ( + batch, + num_encoder_attention_heads, + past_key_values_length, + self._config.hidden_size // num_encoder_attention_heads, + ) + + mask_dtype = common_inputs["attention_mask"].dtype + common_inputs["attention_mask"] = torch.cat( + [common_inputs["attention_mask"], torch.ones(batch, past_key_values_length, dtype=mask_dtype)], dim=1 + ) + common_inputs["past_key_values"] = [ + (torch.zeros(past_shape), torch.zeros(past_shape)) for _ in range(num_encoder_layers) + ] + return common_inputs + + def _generate_dummy_inputs_for_sequence_classification_and_question_answering( + self, + tokenizer: PreTrainedTokenizer, + batch_size: int = -1, + seq_length: int = -1, + is_pair: bool = False, + framework: Optional[TensorType] = None, + ) -> Mapping[str, Any]: + # Copied from OnnxConfig.generate_dummy_inputs + # Did not use super(OnnxConfigWithPast, self).generate_dummy_inputs for code clarity. + # If dynamic axis (-1) we forward with a fixed dimension of 2 samples to avoid optimizations made by ONNX + batch_size = compute_effective_axis_dimension( + batch_size, fixed_dimension=OnnxConfig.default_fixed_batch, num_token_to_add=0 + ) + + # If dynamic axis (-1) we forward with a fixed dimension of 8 tokens to avoid optimizations made by ONNX + token_to_add = tokenizer.num_special_tokens_to_add(is_pair) + seq_length = compute_effective_axis_dimension( + seq_length, fixed_dimension=OnnxConfig.default_fixed_sequence, num_token_to_add=token_to_add + ) + + # Generate dummy inputs according to compute batch and sequence + dummy_input = [" ".join([tokenizer.unk_token]) * seq_length] * batch_size + common_inputs = dict(tokenizer(dummy_input, return_tensors=framework)) + return common_inputs + + def generate_dummy_inputs( + self, + tokenizer: PreTrainedTokenizer, + batch_size: int = -1, + seq_length: int = -1, + is_pair: bool = False, + framework: Optional[TensorType] = None, + ) -> Mapping[str, Any]: + if self.task in ["default", "seq2seq-lm"]: + common_inputs = self._generate_dummy_inputs_for_default_and_seq2seq_lm( + tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework + ) + + elif self.task == "causal-lm": + common_inputs = self._generate_dummy_inputs_for_causal_lm( + tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework + ) + else: + common_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering( + tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework + ) + + return common_inputs + + def _flatten_past_key_values_(self, flattened_output, name, idx, t): + if self.task in ["default", "seq2seq-lm"]: + flattened_output = super()._flatten_past_key_values_(flattened_output, name, idx, t) + else: + flattened_output = super(OnnxSeq2SeqConfigWithPast, self)._flatten_past_key_values_( + flattened_output, name, idx, t + ) + + +__all__ = ["BigBirdPegasusConfig", "BigBirdPegasusOnnxConfig"] diff --git a/docs/transformers/build/lib/transformers/models/bigbird_pegasus/convert_bigbird_pegasus_tf_to_pytorch.py b/docs/transformers/build/lib/transformers/models/bigbird_pegasus/convert_bigbird_pegasus_tf_to_pytorch.py new file mode 100644 index 0000000000000000000000000000000000000000..e17369e48041c6e861cddd0d6e5681c2ca55ecea --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bigbird_pegasus/convert_bigbird_pegasus_tf_to_pytorch.py @@ -0,0 +1,170 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +from typing import Dict + +import tensorflow as tf +import torch +from tqdm import tqdm + +from transformers import BigBirdPegasusConfig, BigBirdPegasusForConditionalGeneration + + +INIT_COMMON = [ + # tf -> hf + ("/", "."), + ("layer_", "layers."), + ("kernel", "weight"), + ("beta", "bias"), + ("gamma", "weight"), + ("pegasus", "model"), +] +END_COMMON = [ + (".output.dense", ".fc2"), + ("intermediate.LayerNorm", "final_layer_norm"), + ("intermediate.dense", "fc1"), +] + +DECODER_PATTERNS = ( + INIT_COMMON + + [ + ("attention.self.LayerNorm", "self_attn_layer_norm"), + ("attention.output.dense", "self_attn.out_proj"), + ("attention.self", "self_attn"), + ("attention.encdec.LayerNorm", "encoder_attn_layer_norm"), + ("attention.encdec_output.dense", "encoder_attn.out_proj"), + ("attention.encdec", "encoder_attn"), + ("key", "k_proj"), + ("value", "v_proj"), + ("query", "q_proj"), + ("decoder.LayerNorm", "decoder.layernorm_embedding"), + ] + + END_COMMON +) + +REMAINING_PATTERNS = ( + INIT_COMMON + + [ + ("embeddings.word_embeddings", "shared.weight"), + ("embeddings.position_embeddings", "embed_positions.weight"), + ("attention.self.LayerNorm", "self_attn_layer_norm"), + ("attention.output.dense", "self_attn.output"), + ("attention.self", "self_attn.self"), + ("encoder.LayerNorm", "encoder.layernorm_embedding"), + ] + + END_COMMON +) + +KEYS_TO_IGNORE = [ + "encdec/key/bias", + "encdec/query/bias", + "encdec/value/bias", + "self/key/bias", + "self/query/bias", + "self/value/bias", + "encdec_output/dense/bias", + "attention/output/dense/bias", +] + + +def rename_state_dict_key(k, patterns): + for tf_name, hf_name in patterns: + k = k.replace(tf_name, hf_name) + return k + + +def convert_bigbird_pegasus(tf_weights: dict, config_update: dict) -> BigBirdPegasusForConditionalGeneration: + cfg = BigBirdPegasusConfig(**config_update) + torch_model = BigBirdPegasusForConditionalGeneration(cfg) + state_dict = torch_model.state_dict() + mapping = {} + + # separating decoder weights + decoder_weights = {k: tf_weights[k] for k in tf_weights if k.startswith("pegasus/decoder")} + remaining_weights = {k: tf_weights[k] for k in tf_weights if not k.startswith("pegasus/decoder")} + + for k, v in tqdm(decoder_weights.items(), "tf -> hf conversion"): + conditions = [k.endswith(ending) for ending in KEYS_TO_IGNORE] + if any(conditions): + continue + patterns = DECODER_PATTERNS + new_k = rename_state_dict_key(k, patterns) + if new_k not in state_dict: + raise ValueError(f"could not find new key {new_k} in state dict. (converted from {k})") + if any(True if i in k else False for i in ["dense", "query", "key", "value"]): + v = v.T + mapping[new_k] = torch.from_numpy(v) + assert v.shape == state_dict[new_k].shape, f"{new_k}, {k}, {v.shape}, {state_dict[new_k].shape}" + + for k, v in tqdm(remaining_weights.items(), "tf -> hf conversion"): + conditions = [k.endswith(ending) for ending in KEYS_TO_IGNORE] + if any(conditions): + continue + patterns = REMAINING_PATTERNS + new_k = rename_state_dict_key(k, patterns) + if new_k not in state_dict and k != "pegasus/embeddings/position_embeddings": + raise ValueError(f"could not find new key {new_k} in state dict. (converted from {k})") + if any(True if i in k else False for i in ["dense", "query", "key", "value"]): + v = v.T + mapping[new_k] = torch.from_numpy(v) + if k != "pegasus/embeddings/position_embeddings": + assert v.shape == state_dict[new_k].shape, f"{new_k}, {k}, {v.shape}, {state_dict[new_k].shape}" + + mapping["model.encoder.embed_positions.weight"] = mapping["model.embed_positions.weight"] + mapping["model.decoder.embed_positions.weight"] = mapping.pop("model.embed_positions.weight") + missing, extra = torch_model.load_state_dict(mapping, strict=False) + unexpected_missing = [ + k + for k in missing + if k + not in [ + "final_logits_bias", + "model.encoder.embed_tokens.weight", + "model.decoder.embed_tokens.weight", + "lm_head.weight", + ] + ] + assert unexpected_missing == [], f"no matches found for the following torch keys {unexpected_missing}" + assert extra == [], f"no matches found for the following tf keys {extra}" + return torch_model + + +def get_tf_weights_as_numpy(path) -> Dict: + init_vars = tf.train.list_variables(path) + tf_weights = {} + ignore_name = ["global_step"] + for name, shape in tqdm(init_vars, desc="converting tf checkpoint to dict"): + skip_key = any(pat in name for pat in ignore_name) + if skip_key: + continue + array = tf.train.load_variable(path, name) + tf_weights[name] = array + return tf_weights + + +def convert_bigbird_pegasus_ckpt_to_pytorch(ckpt_path: str, save_dir: str, config_update: dict): + tf_weights = get_tf_weights_as_numpy(ckpt_path) + torch_model = convert_bigbird_pegasus(tf_weights, config_update) + torch_model.save_pretrained(save_dir) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--tf_ckpt_path", type=str, help="passed to tf.train.list_variables") + parser.add_argument("--save_dir", default=None, type=str, help="Path to the output PyTorch model.") + args = parser.parse_args() + config_update = {} + convert_bigbird_pegasus_ckpt_to_pytorch(args.tf_ckpt_path, args.save_dir, config_update=config_update) diff --git a/docs/transformers/build/lib/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py b/docs/transformers/build/lib/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py new file mode 100644 index 0000000000000000000000000000000000000000..6827a4a1882fd1f5406dc51e1ea64fceeeb74358 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py @@ -0,0 +1,3045 @@ +# coding=utf-8 +# Copyright 2021 Google Research The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch BigBirdPegasus model.""" + +import copy +import math +from typing import List, Optional, Tuple, Union + +import numpy as np +import torch +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ...activations import ACT2FN +from ...generation import GenerationMixin +from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask +from ...modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPastAndCrossAttentions, + CausalLMOutputWithCrossAttentions, + Seq2SeqLMOutput, + Seq2SeqModelOutput, + Seq2SeqQuestionAnsweringModelOutput, + Seq2SeqSequenceClassifierOutput, +) +from ...modeling_utils import PreTrainedModel +from ...utils import ( + add_code_sample_docstrings, + add_end_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from .configuration_bigbird_pegasus import BigBirdPegasusConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "google/bigbird-pegasus-large-arxiv" +_CONFIG_FOR_DOC = "BigBirdPegasusConfig" +_EXPECTED_OUTPUT_SHAPE = [1, 7, 1024] + + +def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int): + """ + Shift input ids one token to the right. + """ + shifted_input_ids = input_ids.new_zeros(input_ids.shape) + shifted_input_ids[:, 1:] = input_ids[:, :-1].clone() + shifted_input_ids[:, 0] = decoder_start_token_id + + if pad_token_id is None: + raise ValueError("self.model.config.pad_token_id has to be defined.") + # replace possible -100 values in labels by `pad_token_id` + shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id) + + return shifted_input_ids + + +class BigBirdPegasusLearnedPositionalEmbedding(nn.Embedding): + """ + This module learns positional embeddings up to a fixed maximum size. + """ + + def __init__(self, num_embeddings: int, embedding_dim: int): + super().__init__(num_embeddings, embedding_dim) + + def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0): + """`input_ids_shape` is expected to be [bsz x seqlen].""" + bsz, seq_len = input_ids_shape[:2] + positions = torch.arange( + past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device + ) + return super().forward(positions) + + +# Copied from transformers.models.bart.modeling_bart.BartScaledWordEmbedding with Bart->BigBirdPegasus +class BigBirdPegasusScaledWordEmbedding(nn.Embedding): + """ + This module overrides nn.Embeddings' forward by multiplying with embeddings scale. + """ + + def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int, embed_scale: Optional[float] = 1.0): + super().__init__(num_embeddings, embedding_dim, padding_idx) + self.embed_scale = embed_scale + + def forward(self, input_ids: torch.Tensor): + return super().forward(input_ids) * self.embed_scale + + +# Copied from transformers.models.big_bird.modeling_big_bird.BigBirdSelfAttention with BigBird->BigBirdPegasus +class BigBirdPegasusSelfAttention(nn.Module): + def __init__(self, config): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.use_bias) + self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.use_bias) + self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.use_bias) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.is_decoder = config.is_decoder + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BigBirdPegasusModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + if self.is_decoder: + outputs = outputs + (past_key_value,) + return outputs + + +# Copied from transformers.models.big_bird.modeling_big_bird.BigBirdBlockSparseAttention with BigBird->BigBirdPegasus +class BigBirdPegasusBlockSparseAttention(nn.Module): + def __init__(self, config, seed=None): + super().__init__() + + self.max_seqlen = config.max_position_embeddings + self.seed = seed + + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + f"The hidden size {config.hidden_size} is not a multiple of the number of attention " + f"heads {config.num_attention_heads}." + ) + + self.num_attention_heads = config.num_attention_heads + self.num_random_blocks = config.num_random_blocks + self.block_size = config.block_size + + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.use_bias) + self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.use_bias) + self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.use_bias) + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states, + band_mask=None, + from_mask=None, + to_mask=None, + from_blocked_mask=None, + to_blocked_mask=None, + output_attentions=None, + ): + # Currently this `class` can't be used in decoder. + + batch_size, seqlen, _ = hidden_states.size() + to_seq_length = from_seq_length = seqlen + from_block_size = to_block_size = self.block_size + + if from_seq_length % from_block_size != 0: + raise ValueError("Query sided sequence length must be multiple of block size") + + if to_seq_length % to_block_size != 0: + raise ValueError("Key/Value sided sequence length must be multiple of block size") + + query_layer = self.transpose_for_scores(self.query(hidden_states)) + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + context_layer, attention_probs = self.bigbird_block_sparse_attention( + query_layer, + key_layer, + value_layer, + band_mask, + from_mask, + to_mask, + from_blocked_mask, + to_blocked_mask, + self.num_attention_heads, + self.num_random_blocks, + self.attention_head_size, + from_block_size, + to_block_size, + batch_size, + from_seq_length, + to_seq_length, + seed=self.seed, + plan_from_length=None, + plan_num_rand_blocks=None, + output_attentions=output_attentions, + ) + + context_layer = context_layer.contiguous().view(batch_size, from_seq_length, -1) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + return outputs + + @staticmethod + def torch_bmm_nd(inp_1, inp_2, ndim=None): + """Fast nd matrix multiplication""" + # faster replacement of torch.einsum ("bhqk,bhkd->bhqd") + return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view( + inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 1]) + ) + + @staticmethod + def torch_bmm_nd_transpose(inp_1, inp_2, ndim=None): + """Fast nd matrix multiplication with transpose""" + # faster replacement of torch.einsum (bhqd,bhkd->bhqk) + return torch.bmm( + inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2) + ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2])) + + def bigbird_block_sparse_attention( + self, + query_layer, + key_layer, + value_layer, + band_mask, + from_mask, + to_mask, + from_blocked_mask, + to_blocked_mask, + n_heads, + n_rand_blocks, + attention_head_size, + from_block_size, + to_block_size, + batch_size, + from_seq_len, + to_seq_len, + seed, + plan_from_length, + plan_num_rand_blocks, + output_attentions, + ): + # BigBirdPegasus block-sparse attention as suggested in paper + + # ITC: + # global tokens: 2 x block_size + # window tokens: 3 x block_size + # random tokens: num_rand_tokens x block_size + + # ETC: + # global tokens: extra_globals_tokens + 2 x block_size + # window tokens: 3 x block_size + # random tokens: num_rand_tokens x block_size + + # Note: + # 1) Currently, ETC is not supported. + # 2) Window size is fixed to 3 blocks & it can be changed only by + # changing `block_size`. + # 3) Number of global blocks are fixed (2 blocks here) & global tokens can be + # controlled only by `block_size`. + + # attention is calculated separately for q[0], q[1], q[2:-2], q[-2], q[-1] in order to use special trick of shifting tokens (for calculating sliding attention) + # hence following code can be divided into 5 parts. + + if from_seq_len // from_block_size != to_seq_len // to_block_size: + raise ValueError("Error the number of blocks needs to be same!") + + rsqrt_d = 1 / math.sqrt(attention_head_size) + bsz = batch_size + attn_mask_penalty = -10000.0 + + # generate random attention and corresponding masks + np.random.seed(seed) + if from_seq_len in [1024, 3072, 4096]: # old plans used in paper + rand_attn = [ + self._bigbird_block_rand_mask( + self.max_seqlen, self.max_seqlen, from_block_size, to_block_size, n_rand_blocks, last_idx=1024 + )[: (from_seq_len // from_block_size - 2)] + for _ in range(n_heads) + ] + else: + if plan_from_length is None: + plan_from_length, plan_num_rand_blocks = self._get_rand_attn_plan( + from_seq_len, from_block_size, n_rand_blocks + ) + + rand_attn = self._bigbird_block_rand_mask_with_head( + from_seq_length=from_seq_len, + to_seq_length=to_seq_len, + from_block_size=from_block_size, + to_block_size=to_block_size, + num_heads=n_heads, + plan_from_length=plan_from_length, + plan_num_rand_blocks=plan_num_rand_blocks, + ) + + rand_attn = np.stack(rand_attn, axis=0) + rand_attn = torch.tensor(rand_attn, device=query_layer.device, dtype=torch.long) + rand_attn.unsqueeze_(0) + rand_attn = torch.cat([rand_attn for _ in range(batch_size)], dim=0) + + rand_mask = self._create_rand_mask_from_inputs( + from_blocked_mask, to_blocked_mask, rand_attn, n_heads, n_rand_blocks, bsz, from_seq_len, from_block_size + ) + + blocked_query_matrix = query_layer.view(bsz, n_heads, from_seq_len // from_block_size, from_block_size, -1) + blocked_key_matrix = key_layer.view(bsz, n_heads, to_seq_len // to_block_size, to_block_size, -1) + blocked_value_matrix = value_layer.view(bsz, n_heads, to_seq_len // to_block_size, to_block_size, -1) + + # preparing block for randn attn + gathered_key = self.torch_gather_b2(blocked_key_matrix, rand_attn) + gathered_key = gathered_key.view( + bsz, n_heads, to_seq_len // to_block_size - 2, n_rand_blocks * to_block_size, -1 + ) # [bsz, n_heads, to_seq_len//to_block_size-2, n_rand_blocks, to_block_size, -1] + gathered_value = self.torch_gather_b2(blocked_value_matrix, rand_attn) + gathered_value = gathered_value.view( + bsz, n_heads, to_seq_len // to_block_size - 2, n_rand_blocks * to_block_size, -1 + ) # [bsz, n_heads, to_seq_len//to_block_size-2, n_rand_blocks, to_block_size, -1] + + # 1st PART + # 1st block (global block) attention scores + # q[0] x (k[0], k[1], k[2], k[3], k[4] .... ) + + # [bsz, n_heads, from_block_size, -1] x [bsz, n_heads, to_seq_len, -1] ==> [bsz, n_heads, from_block_size, to_seq_len] + first_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, 0], key_layer, ndim=4) + + first_product = first_product * rsqrt_d + first_product += (1.0 - to_mask) * attn_mask_penalty + first_attn_weights = nn.functional.softmax( + first_product, dim=-1 + ) # [bsz, n_heads, from_block_size, to_seq_len] + + # [bsz, n_heads, from_block_size, to_seq_len] x [bsz, n_heads, to_seq_len, -1] ==> [bsz, n_heads, from_block_size, -1] + first_context_layer = self.torch_bmm_nd(first_attn_weights, value_layer, ndim=4) + first_context_layer.unsqueeze_(2) + + # 2nd PART + # 2nd block attention scores + # q[1] x (sliding_keys, random_keys, global_keys) + # sliding key blocks -> 2nd, 3rd blocks + # global key blocks -> 1st block + + second_key_mat = torch.cat( + [ + blocked_key_matrix[:, :, 0], + blocked_key_matrix[:, :, 1], + blocked_key_matrix[:, :, 2], + blocked_key_matrix[:, :, -1], + gathered_key[:, :, 0], + ], + dim=2, + ) # [bsz, n_heads, (4+n_rand_blocks)*to_block_size, -1] + second_value_mat = torch.cat( + [ + blocked_value_matrix[:, :, 0], + blocked_value_matrix[:, :, 1], + blocked_value_matrix[:, :, 2], + blocked_value_matrix[:, :, -1], + gathered_value[:, :, 0], + ], + dim=2, + ) # [bsz, n_heads, (4+n_rand_blocks)*to_block_size, -1] + + # [bsz, n_heads, from_block_size, -1] x [bsz, n_heads, (4+n_rand_blocks)*to_block_size, -1] ==> [bsz, n_heads, from_block_size, (4+n_rand_blocks)*to_block_size] + second_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, 1], second_key_mat, ndim=4) + second_seq_pad = torch.cat( + [ + to_mask[:, :, :, : 3 * to_block_size], + to_mask[:, :, :, -to_block_size:], + to_mask.new_ones([bsz, 1, 1, n_rand_blocks * to_block_size]), + ], + dim=3, + ) + second_rand_pad = torch.cat( + [ + rand_mask.new_ones([bsz, n_heads, from_block_size, 4 * to_block_size]), + rand_mask[:, :, 0], + ], + dim=3, + ) + second_product = second_product * rsqrt_d + second_product += (1.0 - torch.minimum(second_seq_pad, second_rand_pad)) * attn_mask_penalty + second_attn_weights = nn.functional.softmax( + second_product, dim=-1 + ) # [bsz, n_heads, from_block_size, (4+n_rand_blocks)*to_block_size] + + # [bsz, n_heads, from_block_size, (4+n_rand_blocks)*to_block_size] x [bsz, n_heads, (4+n_rand_blocks)*to_block_size, -1] ==> [bsz, n_heads, from_block_size, -1] + second_context_layer = self.torch_bmm_nd(second_attn_weights, second_value_mat, ndim=4) + + second_context_layer.unsqueeze_(2) + + # 3rd PART + # Middle blocks attention scores + # q[-2:2] x (sliding_keys, random_keys, global_keys) + # sliding attn is calculated using special trick of shifting tokens as discussed in paper + # random keys are generated by taking random indices as per `rand_attn` + # global keys -> 1st & last block + + exp_blocked_key_matrix = torch.cat( + [blocked_key_matrix[:, :, 1:-3], blocked_key_matrix[:, :, 2:-2], blocked_key_matrix[:, :, 3:-1]], dim=3 + ) # [bsz, n_heads, from_seq_len//from_block_size-4, 3*to_block_size, -1] + exp_blocked_value_matrix = torch.cat( + [blocked_value_matrix[:, :, 1:-3], blocked_value_matrix[:, :, 2:-2], blocked_value_matrix[:, :, 3:-1]], + dim=3, + ) # [bsz, n_heads, from_seq_len//from_block_size-4, 3*to_block_size, -1] + middle_query_matrix = blocked_query_matrix[:, :, 2:-2] + + # sliding attention scores for q[-2:2] + # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] x [b, n_heads, from_seq_len//from_block_size-4, 3*to_block_size, -1] + inner_band_product = self.torch_bmm_nd_transpose(middle_query_matrix, exp_blocked_key_matrix, ndim=5) + # ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, 3*to_block_size] + inner_band_product = inner_band_product * rsqrt_d + + # randn attention scores for q[-2:2] + # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] x [bsz, n_heads, from_seq_len//from_block_size-4, n_rand_blocks*to_block_size, -1] + rand_band_product = self.torch_bmm_nd_transpose(middle_query_matrix, gathered_key[:, :, 1:-1], ndim=5) + # ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, n_rand_blocks*to_block_size] + rand_band_product = rand_band_product * rsqrt_d + + # Including 1st block (since it's global) + first_band_product = torch.einsum( + "bhlqd,bhkd->bhlqk", middle_query_matrix, blocked_key_matrix[:, :, 0] + ) # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] x [bsz, n_heads, to_block_size, -1] ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, to_block_size] + first_band_product = first_band_product * rsqrt_d + + # Including last block (since it's global) + last_band_product = torch.einsum( + "bhlqd,bhkd->bhlqk", middle_query_matrix, blocked_key_matrix[:, :, -1] + ) # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] x [bsz, n_heads, to_block_size, -1] ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, to_block_size] + last_band_product = last_band_product * rsqrt_d + + # masking padded tokens + inner_band_product += (1.0 - band_mask) * attn_mask_penalty + first_band_product += (1.0 - to_mask[:, :, :, :to_block_size].unsqueeze(3)) * attn_mask_penalty + last_band_product += (1.0 - to_mask[:, :, :, -to_block_size:].unsqueeze(3)) * attn_mask_penalty + rand_band_product += (1.0 - rand_mask[:, :, 1:-1]) * attn_mask_penalty + + # completing attention scores matrix for all q[-2:2] + band_product = torch.cat( + [first_band_product, inner_band_product, rand_band_product, last_band_product], dim=-1 + ) # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, (5+n_rand_blocks)*to_block_size] + + # safely doing softmax since attention matrix is completed + attn_weights = nn.functional.softmax( + band_product, dim=-1 + ) # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, (5+n_rand_blocks)*to_block_size] + + # contribution of sliding keys + # [bsz, n_heads, m//from_block_size-4, from_block_size, 3*to_block_size] x [bsz, n_heads, from_seq_len//from_block_size-4, 3*to_block_size, -1] + context_layer = self.torch_bmm_nd( + attn_weights[:, :, :, :, to_block_size : 4 * to_block_size], exp_blocked_value_matrix, ndim=5 + ) + # ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] + + # adding contribution of random keys + # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, n_rand_blocks*to_block_size] x [bsz, n_heads, from_seq_len//from_block_size-4, n_rand_blocks*to_block_size, -1] + context_layer += self.torch_bmm_nd( + attn_weights[:, :, :, :, 4 * to_block_size : -to_block_size], gathered_value[:, :, 1:-1], ndim=5 + ) + # ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] + + # adding contribution of global keys + context_layer += torch.einsum( + "bhlqk,bhkd->bhlqd", attn_weights[:, :, :, :, :to_block_size], blocked_value_matrix[:, :, 0] + ) # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, to_block_size] x [bsz, n_heads, to_block_size, -1] ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] + context_layer += torch.einsum( + "bhlqk,bhkd->bhlqd", attn_weights[:, :, :, :, -to_block_size:], blocked_value_matrix[:, :, -1] + ) # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, to_block_size] x [bsz, n_heads, to_block_size, -1] ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] + + # 4th PART + # last 2nd token attention scores + # q[-2] x (sliding_keys, random_keys, global_keys) + # sliding key blocks -> last 3 blocks + # global key block -> 1st block + # random key block -> based on indices stored in `randn_attn` + + second_last_key_mat = torch.cat( + [ + blocked_key_matrix[:, :, 0], + blocked_key_matrix[:, :, -3], + blocked_key_matrix[:, :, -2], + blocked_key_matrix[:, :, -1], + gathered_key[:, :, -1], + ], + dim=2, + ) # [bsz, n_heads, (4+n_random_blocks)*to_block_size, -1] + second_last_value_mat = torch.cat( + [ + blocked_value_matrix[:, :, 0], + blocked_value_matrix[:, :, -3], + blocked_value_matrix[:, :, -2], + blocked_value_matrix[:, :, -1], + gathered_value[:, :, -1], + ], + dim=2, + ) # [bsz, n_heads, (4+r)*to_block_size, -1] + + # [bsz, n_heads, from_block_size, -1] x [bsz, n_heads, (4+n_rand_blocks)*to_block_size, -1] ==> [bsz, n_heads, from_block_size, (4+n_rand_blocks)*to_block_size] + second_last_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, -2], second_last_key_mat, ndim=4) + second_last_seq_pad = torch.cat( + [ + to_mask[:, :, :, :to_block_size], + to_mask[:, :, :, -3 * to_block_size :], + to_mask.new_ones([bsz, 1, 1, n_rand_blocks * to_block_size]), + ], + dim=3, + ) + second_last_rand_pad = torch.cat( + [ + rand_mask.new_ones([bsz, n_heads, from_block_size, 4 * to_block_size]), + rand_mask[:, :, -1], + ], + dim=3, + ) + second_last_product = second_last_product * rsqrt_d + second_last_product += (1.0 - torch.minimum(second_last_seq_pad, second_last_rand_pad)) * attn_mask_penalty + second_last_attn_weights = nn.functional.softmax( + second_last_product, dim=-1 + ) # [bsz, n_heads, from_block_size, (4+n_rand_blocks)*to_block_size] + + # [bsz, n_heads, from_block_size, (4+n_rand_blocks)*to_block_size] x [bsz, n_heads, (4+n_rand_blocks)*to_block_size, -1] ==> [bsz, n_heads, from_block_size, -1] + second_last_context_layer = self.torch_bmm_nd(second_last_attn_weights, second_last_value_mat, ndim=4) + second_last_context_layer.unsqueeze_(2) + + # 5th PART + # last block (global) attention scores + # q[-1] x (k[0], k[1], k[2], k[3], .... ) + + # [bsz, n_heads, from_block_size, -1] x [bsz, n_heads, to_seq_len, -1] ==> [bsz, n_heads, from_block_size, to_seq_len] + last_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, -1], key_layer, ndim=4) + last_product = last_product * rsqrt_d + last_product += (1.0 - to_mask) * attn_mask_penalty + last_attn_weights = nn.functional.softmax(last_product, dim=-1) # [bsz, n_heads, from_block_size, n] + + # [bsz, n_heads, from_block_size, to_seq_len] x [bsz, n_heads, to_seq_len, -1] ==> [bsz, n_heads, from_block_size, -1] + last_context_layer = self.torch_bmm_nd(last_attn_weights, value_layer, ndim=4) + last_context_layer.unsqueeze_(2) + + # combining representations of all tokens + context_layer = torch.cat( + [first_context_layer, second_context_layer, context_layer, second_last_context_layer, last_context_layer], + dim=2, + ) + context_layer = context_layer.view((bsz, n_heads, from_seq_len, -1)) * from_mask + context_layer = torch.transpose(context_layer, 1, 2) + + # this is just for visualizing; forward pass doesn't depend on following code + if output_attentions: + # TODO(PVP): need to verify if below code is correct + attention_probs = torch.zeros( + bsz, n_heads, from_seq_len, to_seq_len, dtype=torch.float, device=context_layer.device + ) + + # 1st query block + # corresponding to `first_context_layer` + attention_probs[:, :, :from_block_size, :] = first_attn_weights # all keys global + + # 2nd query block + # corresponding to `second_context_layer` + attention_probs[:, :, from_block_size : 2 * from_block_size, : 3 * to_block_size] = second_attn_weights[ + :, :, :, : 3 * to_block_size + ] # 1st three key blocks (global + sliding) + attention_probs[:, :, from_block_size : 2 * from_block_size, -to_block_size:] = second_attn_weights[ + :, :, :, 3 * to_block_size : 4 * to_block_size + ] # last key block (global) + # random keys + for p1, i1, w1 in zip(range(bsz), rand_attn, second_attn_weights): + # p1, i1, w1 corresponds to batch_dim i.e. following operation is done for each sequence in batch + for p2, i2, w2 in zip(range(n_heads), i1, w1): + # p2, i2, w2 corresponds to head_dim i.e. following operation is done for each heads + attn_probs_view = attention_probs.view( + bsz, + n_heads, + from_seq_len // from_block_size, + from_block_size, + to_seq_len // to_block_size, + to_block_size, + ) + right_slice = w2[:, 4 * to_block_size :] + attn_probs_view[p1, p2, 1, :, i2[0]] = right_slice.view( + from_block_size, n_rand_blocks, to_block_size + ) + + # Middle query blocks + # corresponding to `context_layer` + # sliding keys + for q_idx in range(from_seq_len // from_block_size - 4): + attn_probs_view = attention_probs.view( + bsz, + n_heads, + from_seq_len // from_block_size, + from_block_size, + to_seq_len // to_block_size, + to_block_size, + )[:, :, 2:-2, :, 1:-1, :] + right_slice = attn_weights[:, :, q_idx, :, to_block_size : 4 * to_block_size] + attn_probs_view[:, :, q_idx, :, q_idx : q_idx + 3, :] = right_slice.view( + bsz, n_heads, from_block_size, 3, to_block_size + ) # inner_band_product + # global keys (corresponding to 1st key block) + attention_probs[:, :, 2 * from_block_size : -2 * from_block_size, :to_block_size] = attn_weights[ + :, :, :, :, :to_block_size + ].view(bsz, n_heads, -1, to_block_size) # first_band_product + # global keys (corresponding to last key block) + attention_probs[:, :, 2 * from_block_size : -2 * from_block_size, -to_block_size:] = attn_weights[ + :, :, :, :, -to_block_size: + ].view(bsz, n_heads, -1, to_block_size) # last_band_product + # random keys + for p1, i1, w1 in zip(range(bsz), rand_attn, attn_weights): + # p1, i1, w1 corresponds to batch_dim i.e. following operation is done for each sequence in batch + for p2, i2, w2 in zip(range(n_heads), i1, w1): + # p2, i2, w2 corresponds to head_dim i.e. following operation is done for each heads + for q_idx in range(1, len(i2) - 1): + attn_probs_view = attention_probs.view( + bsz, + n_heads, + from_seq_len // from_block_size, + from_block_size, + to_seq_len // to_block_size, + to_block_size, + ) + right_slice = w2[q_idx - 1, :, 4 * to_block_size : -to_block_size] + attn_probs_view[p1, p2, q_idx + 1, :, i2[q_idx]] = right_slice.view( + from_block_size, n_rand_blocks, to_block_size + ) + + # Second-last query block + # corresponding to `second_last_context_layer` + attention_probs[:, :, -2 * from_block_size : -from_block_size, :to_block_size] = second_last_attn_weights[ + :, :, :, :to_block_size + ] # 1st key block (global) + attention_probs[:, :, -2 * from_block_size : -from_block_size, -3 * to_block_size :] = ( + second_last_attn_weights[:, :, :, to_block_size : 4 * to_block_size] + ) # last three blocks (global + sliding) + # random keys + for p1, i1, w1 in zip(range(bsz), rand_attn, second_last_attn_weights): + # p1, i1, w1 corresponds to batch_dim i.e. following operation is done for each sequence in batch + for p2, i2, w2 in zip(range(n_heads), i1, w1): + # p2, i2, w2 corresponds to head_dim i.e. following operation is done for each heads + attn_probs_view = attention_probs.view( + bsz, + n_heads, + from_seq_len // from_block_size, + from_block_size, + to_seq_len // to_block_size, + to_block_size, + ) + right_slice = w2[:, 4 * to_block_size :] + attn_probs_view[p1, p2, -2, :, i2[-1]] = right_slice.view( + from_block_size, n_rand_blocks, to_block_size + ) + + # last query block + # corresponding to `last_context_layer` + attention_probs[:, :, -from_block_size:, :] = last_attn_weights # all keys global + + else: + attention_probs = None + + return context_layer, attention_probs + + @staticmethod + def torch_gather_b2(params, indices): + # this operation is equivalent to tf.gather when batch_dims=2 + + if params.shape[:2] != indices.shape[:2]: + raise ValueError( + "Make sure that the first two dimensions of params and indices are identical, but" + f" they are params: {params.shape[:2]} vs. indices: {indices.shape[:2]}" + ) + num_indices_to_gather = indices.shape[-2] * indices.shape[-1] + num_indices_to_pick_from = params.shape[2] + + shift = torch.arange(indices.shape[0] * indices.shape[1] * num_indices_to_gather, device=indices.device) + indices_shift = torch.div(shift, num_indices_to_gather, rounding_mode="floor") * num_indices_to_pick_from + + flattened_indices = indices.view(-1) + indices_shift + flattened_params = params.reshape(-1, params.shape[-2], params.shape[-1]) + + out_flattened = flattened_params.index_select(0, flattened_indices) + + out = out_flattened.reshape(params.shape[:2] + (num_indices_to_gather,) + params.shape[3:]) + return out + + @staticmethod + def _create_rand_mask_from_inputs( + from_blocked_mask, + to_blocked_mask, + rand_attn, + num_attention_heads, + num_rand_blocks, + batch_size, + from_seq_length, + from_block_size, + ): + """ + Create 3D attention mask from a 2D tensor mask. + + Args: + from_blocked_mask: 2D Tensor of shape [batch_size, + from_seq_length//from_block_size, from_block_size]. + to_blocked_mask: int32 Tensor of shape [batch_size, + to_seq_length//to_block_size, to_block_size]. + rand_attn: [batch_size, num_attention_heads, + from_seq_length//from_block_size-2, num_rand_blocks] + num_attention_heads: int. Number of attention heads. + num_rand_blocks: int. Number of random chunks per row. + batch_size: int. Batch size for computation. + from_seq_length: int. length of from sequence. + from_block_size: int. size of block in from sequence. + + Returns: + float Tensor of shape [batch_size, num_attention_heads, from_seq_length//from_block_size-2, + from_block_size, num_rand_blocks*to_block_size]. + """ + num_windows = from_seq_length // from_block_size - 2 + rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)]) + rand_mask = rand_mask.view(batch_size, num_attention_heads, num_windows, num_rand_blocks * from_block_size) + rand_mask = torch.einsum("blq,bhlk->bhlqk", from_blocked_mask[:, 1:-1], rand_mask) + return rand_mask + + @staticmethod + def _get_rand_attn_plan(from_seq_length, from_block_size, num_rand_blocks): + """ + Gives the plan of where to put random attention. + + Args: + from_seq_length: int. length of from sequence. + from_block_size: int. size of block in from sequence. + num_rand_blocks: int. Number of random chunks per row. + + Returns: + plan_from_length: ending location of from block plan_num_rand_blocks: number of random ending location for + each block + """ + + plan_from_length = [] + plan_num_rand_blocks = [] + if (2 * num_rand_blocks + 5) < (from_seq_length // from_block_size): + plan_from_length.append(int((2 * num_rand_blocks + 5) * from_block_size)) + plan_num_rand_blocks.append(num_rand_blocks) + plan_from_length.append(from_seq_length) + plan_num_rand_blocks.append(0) + elif (num_rand_blocks + 5) < (from_seq_length // from_block_size): + plan_from_length.append(int((num_rand_blocks + 5) * from_block_size)) + plan_num_rand_blocks.append(num_rand_blocks // 2) + plan_from_length.append(from_seq_length) + plan_num_rand_blocks.append(num_rand_blocks - (num_rand_blocks // 2)) + else: + plan_from_length.append(from_seq_length) + plan_num_rand_blocks.append(num_rand_blocks) + + return plan_from_length, plan_num_rand_blocks + + def _bigbird_block_rand_mask( + self, from_seq_length, to_seq_length, from_block_size, to_block_size, num_rand_blocks, last_idx=-1 + ): + """ + Create adjacency list of random attention. + + Args: + from_seq_length: int. length of from sequence. + to_seq_length: int. length of to sequence. + from_block_size: int. size of block in from sequence. + to_block_size: int. size of block in to sequence. + num_rand_blocks: int. Number of random chunks per row. + last_idx: if -1 then num_rand_blocks blocks chosen anywhere in to sequence, + if positive then num_rand_blocks blocks chosen only up to last_idx. + + Returns: + adjacency list of size from_seq_length//from_block_size-2 by num_rand_blocks + """ + # using this method when from_seq_length in [1024, 3072, 4096] + + if from_seq_length // from_block_size != to_seq_length // to_block_size: + raise ValueError("Error the number of blocks needs to be same!") + + rand_attn = np.zeros((from_seq_length // from_block_size - 2, num_rand_blocks), dtype=np.int32) + # During inference (eval) no randomness + if not self.training: + return rand_attn + middle_seq = np.arange(1, to_seq_length // to_block_size - 1, dtype=np.int32) + last = to_seq_length // to_block_size - 1 + if last_idx > (2 * to_block_size): + last = (last_idx // to_block_size) - 1 + + r = num_rand_blocks # shorthand + for i in range(1, from_seq_length // from_block_size - 1): + start = i - 2 + end = i + if i == 1: + rand_attn[i - 1, :] = np.random.permutation(middle_seq[2:last])[:r] + elif i == 2: + rand_attn[i - 1, :] = np.random.permutation(middle_seq[3:last])[:r] + elif i == from_seq_length // from_block_size - 3: + rand_attn[i - 1, :] = np.random.permutation(middle_seq[:last])[:r] + # Missing -3: should have been sliced till last-3 + elif i == from_seq_length // from_block_size - 2: + rand_attn[i - 1, :] = np.random.permutation(middle_seq[:last])[:r] + # Missing -4: should have been sliced till last-4 + else: + if start > last: + start = last + rand_attn[i - 1, :] = np.random.permutation(middle_seq[:start])[:r] + elif (end + 1) == last: + rand_attn[i - 1, :] = np.random.permutation(middle_seq[:start])[:r] + else: + rand_attn[i - 1, :] = np.random.permutation( + np.concatenate((middle_seq[:start], middle_seq[end + 1 : last])) + )[:r] + return rand_attn + + def _bigbird_block_rand_mask_with_head( + self, + from_seq_length, + to_seq_length, + from_block_size, + to_block_size, + num_heads, + plan_from_length, + plan_num_rand_blocks, + window_block_left=1, + window_block_right=1, + global_block_top=1, + global_block_bottom=1, + global_block_left=1, + global_block_right=1, + ): + """ + Create adjacency list of random attention. + + Args: + from_seq_length: int. length of from sequence. + to_seq_length: int. length of to sequence. + from_block_size: int. size of block in from sequence. + to_block_size: int. size of block in to sequence. + num_heads: int. total number of heads. + plan_from_length: list. plan from length where num_random_blocks are chosen from. + plan_num_rand_blocks: list. number of rand blocks within the plan. + window_block_left: int. number of blocks of window to left of a block. + window_block_right: int. number of blocks of window to right of a block. + global_block_top: int. number of blocks at the top. + global_block_bottom: int. number of blocks at the bottom. + global_block_left: int. Number of blocks globally used to the left. + global_block_right: int. Number of blocks globally used to the right. + + Returns: + adjacency list of size num_head where each element is of size from_seq_length//from_block_size-2 by + num_rand_blocks + """ + # using this method when from_seq_length not in [1024, 3072, 4096] + + if from_seq_length // from_block_size != to_seq_length // to_block_size: + raise ValueError("Error the number of blocks needs to be same!") + + if from_seq_length not in plan_from_length: + raise ValueError("Error from sequence length not in plan!") + + # Total number of blocks in the mmask + num_blocks = from_seq_length // from_block_size + # Number of blocks per plan + plan_block_length = np.array(plan_from_length) // from_block_size + # till when to follow plan + max_plan_idx = plan_from_length.index(from_seq_length) + + # Random Attention adjacency list + rand_attn = [ + np.zeros((num_blocks, np.sum(plan_num_rand_blocks[: max_plan_idx + 1])), dtype=np.int32) + for i in range(num_heads) + ] + # During inference (eval) no randomness + if not self.training: + for nh in range(num_heads): + rand_attn[nh] = rand_attn[nh][global_block_top : num_blocks - global_block_bottom, :] + return rand_attn + + # We will go iteratively over the plan blocks and pick random number of + # Attention blocks from the legally allowed blocks + for plan_idx in range(max_plan_idx + 1): + rnd_r_cnt = 0 + if plan_idx > 0: + # set the row for all from_blocks starting from 0 to + # plan_block_length[plan_idx-1] + # column indx start fromm plan_block_length[plan_idx-1] and ends at + # plan_block_length[plan_idx] + if plan_num_rand_blocks[plan_idx] > 0: + rnd_r_cnt = int(np.sum(plan_num_rand_blocks[:plan_idx])) + curr_r_cnt = int(np.sum(plan_num_rand_blocks[: plan_idx + 1])) + for blk_rw_idx in range(global_block_top, plan_block_length[plan_idx - 1]): + for h in range(num_heads): + rand_attn[h][blk_rw_idx, rnd_r_cnt:curr_r_cnt] = self._get_single_block_row_attention( + block_id=blk_rw_idx, + to_start_block_id=plan_block_length[plan_idx - 1], + to_end_block_id=plan_block_length[plan_idx], + num_rand_blocks=plan_num_rand_blocks[plan_idx], + window_block_left=window_block_left, + window_block_right=window_block_right, + global_block_left=global_block_left, + global_block_right=global_block_right, + ) + + for pl_id in range(plan_idx): + if plan_num_rand_blocks[pl_id] == 0: + continue + for blk_rw_idx in range(plan_block_length[plan_idx - 1], plan_block_length[plan_idx]): + rnd_r_cnt = 0 + to_start_block_id = 0 + if pl_id > 0: + rnd_r_cnt = int(np.sum(plan_num_rand_blocks[:pl_id])) + to_start_block_id = plan_block_length[pl_id - 1] + curr_r_cnt = int(np.sum(plan_num_rand_blocks[: pl_id + 1])) + for h in range(num_heads): + rand_attn[h][blk_rw_idx, rnd_r_cnt:curr_r_cnt] = self._get_single_block_row_attention( + block_id=blk_rw_idx, + to_start_block_id=to_start_block_id, + to_end_block_id=plan_block_length[pl_id], + num_rand_blocks=plan_num_rand_blocks[pl_id], + window_block_left=window_block_left, + window_block_right=window_block_right, + global_block_left=global_block_left, + global_block_right=global_block_right, + ) + + if plan_num_rand_blocks[plan_idx] == 0: + continue + curr_r_cnt = int(np.sum(plan_num_rand_blocks[: plan_idx + 1])) + from_start_block_id = global_block_top + to_start_block_id = 0 + if plan_idx > 0: + rnd_r_cnt = int(np.sum(plan_num_rand_blocks[:plan_idx])) + from_start_block_id = plan_block_length[plan_idx - 1] + to_start_block_id = plan_block_length[plan_idx - 1] + + for blk_rw_idx in range(from_start_block_id, plan_block_length[plan_idx]): + for h in range(num_heads): + rand_attn[h][blk_rw_idx, rnd_r_cnt:curr_r_cnt] = self._get_single_block_row_attention( + block_id=blk_rw_idx, + to_start_block_id=to_start_block_id, + to_end_block_id=plan_block_length[plan_idx], + num_rand_blocks=plan_num_rand_blocks[plan_idx], + window_block_left=window_block_left, + window_block_right=window_block_right, + global_block_left=global_block_left, + global_block_right=global_block_right, + ) + + for nh in range(num_heads): + rand_attn[nh] = rand_attn[nh][global_block_top : num_blocks - global_block_bottom, :] + + return rand_attn + + @staticmethod + def _get_single_block_row_attention( + block_id, + to_start_block_id, + to_end_block_id, + num_rand_blocks, + window_block_left=1, + window_block_right=1, + global_block_left=1, + global_block_right=1, + ): + """ + For a single row block get random row attention. + + Args: + block_id: int. block id of row. + to_start_block_id: int. random attention column start id. + to_end_block_id: int. random attention column end id. + num_rand_blocks: int. number of random blocks to be selected. + window_block_left: int. number of blocks of window to left of a block. + window_block_right: int. number of blocks of window to right of a block. + global_block_left: int. Number of blocks globally used to the left. + global_block_right: int. Number of blocks globally used to the right. + + Returns: + row containing the random attention vector of size num_rand_blocks. + """ + # list of to_blocks from which to choose random attention + to_block_list = np.arange(to_start_block_id, to_end_block_id, dtype=np.int32) + # permute the blocks + perm_block = np.random.permutation(to_block_list) + + # illegal blocks for the current block id, using window + illegal_blocks = list(range(block_id - window_block_left, block_id + window_block_right + 1)) + + # Add blocks at the start and at the end + illegal_blocks.extend(list(range(global_block_left))) + illegal_blocks.extend(list(range(to_end_block_id - global_block_right, to_end_block_id))) + + # The second from_block cannot choose random attention on second last to_block + if block_id == 1: + illegal_blocks.append(to_end_block_id - 2) + + # The second last from_block cannot choose random attention on second to_block + if block_id == to_end_block_id - 2: + illegal_blocks.append(1) + + selected_random_blokcs = [] + + for i in range(to_end_block_id - to_start_block_id): + if perm_block[i] not in illegal_blocks: + selected_random_blokcs.append(perm_block[i]) + if len(selected_random_blokcs) == num_rand_blocks: + break + return np.array(selected_random_blokcs, dtype=np.int32) + + +class BigBirdPegasusEncoderAttention(nn.Module): + def __init__(self, config, seed=None): + super().__init__() + self.config = config + self.seed = seed + + self.attention_type = config.attention_type + + if self.attention_type == "original_full": + self.self = BigBirdPegasusSelfAttention(config) + elif self.attention_type == "block_sparse": + self.self = BigBirdPegasusBlockSparseAttention(config, seed) + else: + raise ValueError( + f"attention_type can either be original_full or block_sparse, but is {self.config.attention_type}" + ) + + self.output = nn.Linear(config.hidden_size, config.hidden_size, bias=config.use_bias) + + def set_attention_type(self, value: str): + if value not in ["original_full", "block_sparse"]: + raise ValueError( + f"attention_type can only be set to either 'original_full' or 'block_sparse', but is {value}" + ) + # attention type is already correctly set + if value == self.attention_type: + return + + self.attention_type = value + if value == "original_full": + # copy all weights to new full attention class + attn_weights = BigBirdPegasusSelfAttention(self.config) + else: + # copy all weights to new sparse attention class + attn_weights = BigBirdPegasusBlockSparseAttention(self.config, self.seed) + + attn_weights.query = self.self.query + attn_weights.value = self.self.value + attn_weights.key = self.self.key + self.self = attn_weights + self.attention_type = value + + if not self.training: + self.self.eval() + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + past_key_value=None, + output_attentions=False, + band_mask=None, + from_mask=None, + to_mask=None, + from_blocked_mask=None, + to_blocked_mask=None, + ): + # Expand dims to enable multiplication in the self-attention module + head_mask = head_mask.reshape(1, -1, 1, 1) if head_mask is not None else None + + if self.config.attention_type == "original_full": + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + past_key_value=past_key_value, + output_attentions=output_attentions, + ) + else: + self_outputs = self.self( + hidden_states, band_mask, from_mask, to_mask, from_blocked_mask, to_blocked_mask, output_attentions + ) + + attention_output = self.output(self_outputs[0]) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +# Copied from transformers.models.bart.modeling_bart.BartAttention with BartConfig->BigBirdPegasusConfig, Bart->BigBirdPegasusDecoder +class BigBirdPegasusDecoderAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + is_decoder: bool = False, + bias: bool = True, + is_causal: bool = False, + config: Optional[BigBirdPegasusConfig] = None, + ): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + self.config = config + + if (self.head_dim * num_heads) != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" + f" and `num_heads`: {num_heads})." + ) + self.scaling = self.head_dim**-0.5 + self.is_decoder = is_decoder + self.is_causal = is_causal + + self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + key_value_states: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + + bsz, tgt_len, _ = hidden_states.size() + + # get query proj + query_states = self.q_proj(hidden_states) * self.scaling + # get key, value proj + # `past_key_value[0].shape[2] == key_value_states.shape[1]` + # is checking that the `sequence_length` of the `past_key_value` is the same as + # the provided `key_value_states` to support prefix tuning + if ( + is_cross_attention + and past_key_value is not None + and past_key_value[0].shape[2] == key_value_states.shape[1] + ): + # reuse k,v, cross_attentions + key_states = past_key_value[0] + value_states = past_key_value[1] + elif is_cross_attention: + # cross_attentions + key_states = self._shape(self.k_proj(key_value_states), -1, bsz) + value_states = self._shape(self.v_proj(key_value_states), -1, bsz) + elif past_key_value is not None: + # reuse k, v, self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + else: + # self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_states, value_states) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.reshape(*proj_shape) + value_states = value_states.reshape(*proj_shape) + + src_len = key_states.size(1) + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if layer_head_mask is not None: + if layer_head_mask.size() != (self.num_heads,): + raise ValueError( + f"Head mask for a single layer should be of size {(self.num_heads,)}, but is" + f" {layer_head_mask.size()}" + ) + attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + if output_attentions: + # this operation is a bit awkward, but it's required to + # make sure that attn_weights keeps its gradient. + # In order to do so, attn_weights have to be reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) + else: + attn_weights_reshaped = None + + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = torch.bmm(attn_probs, value_states) + + if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) + attn_output = attn_output.transpose(1, 2) + + # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be + # partitioned across GPUs when using tensor-parallelism. + attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped, past_key_value + + +class BigBirdPegasusEncoderLayer(nn.Module): + def __init__(self, config: BigBirdPegasusConfig, seed=None): + super().__init__() + self.attention_type = config.attention_type + self.embed_dim = config.d_model + self.self_attn = BigBirdPegasusEncoderAttention(config, seed=seed) + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim) + self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + layer_head_mask: torch.Tensor, + band_mask=None, + from_mask=None, + to_mask=None, + from_blocked_mask=None, + to_blocked_mask=None, + output_attentions: bool = False, + ): + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + self_attention_outputs = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + head_mask=layer_head_mask, + output_attentions=output_attentions, + band_mask=band_mask, + from_mask=from_mask, + to_mask=to_mask, + from_blocked_mask=from_blocked_mask, + to_blocked_mask=to_blocked_mask, + ) + hidden_states = self_attention_outputs[0] + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.final_layer_norm(hidden_states) + hidden_states = self.activation_fn(self.fc1(hidden_states)) + + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + + if hidden_states.dtype == torch.float16 and ( + torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any() + ): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attention_outputs[1],) + + return outputs + + def set_attention_type(self, value: str): + if value not in ["original_full", "block_sparse"]: + raise ValueError( + f"attention_type can only be set to either 'original_full' or 'block_sparse', but is {value}" + ) + # attention type is already correctly set + if value == self.attention_type: + return + self.attention_type = value + self.self_attn.set_attention_type(value) + + +class BigBirdPegasusDecoderLayer(nn.Module): + def __init__(self, config: BigBirdPegasusConfig): + super().__init__() + self.embed_dim = config.d_model + self.self_attn = BigBirdPegasusDecoderAttention( + embed_dim=self.embed_dim, + num_heads=config.decoder_attention_heads, + dropout=config.attention_dropout, + is_decoder=True, + bias=config.use_bias, + ) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.encoder_attn = BigBirdPegasusDecoderAttention( + self.embed_dim, + config.decoder_attention_heads, + dropout=config.attention_dropout, + is_decoder=True, + bias=config.use_bias, + ) + self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) + self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + # Copied from transformers.models.mbart.modeling_mbart.MBartDecoderLayer.forward + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + cross_attn_layer_head_mask: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = True, + ) -> torch.Tensor: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + encoder_hidden_states (`torch.FloatTensor`): + cross attention input to the layer of shape `(batch, seq_len, embed_dim)` + encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size + `(encoder_attention_heads,)`. + cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of + size `(decoder_attention_heads,)`. + past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + # Self Attention + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + # add present self-attn cache to positions 1,2 of present_key_value tuple + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + past_key_value=self_attn_past_key_value, + attention_mask=attention_mask, + layer_head_mask=layer_head_mask, + output_attentions=output_attentions, + ) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + + # Cross-Attention Block + cross_attn_present_key_value = None + cross_attn_weights = None + if encoder_hidden_states is not None: + residual = hidden_states + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn( + hidden_states=hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + layer_head_mask=cross_attn_layer_head_mask, + past_key_value=cross_attn_past_key_value, + output_attentions=output_attentions, + ) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + + # add cross-attn to positions 3,4 of present_key_value tuple + present_key_value = present_key_value + cross_attn_present_key_value + + # Fully Connected + residual = hidden_states + hidden_states = self.final_layer_norm(hidden_states) + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights, cross_attn_weights) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +# Copied from transformers.models.bart.modeling_bart.BartClassificationHead with Bart->BigBirdPegasus +class BigBirdPegasusClassificationHead(nn.Module): + """Head for sentence-level classification tasks.""" + + def __init__( + self, + input_dim: int, + inner_dim: int, + num_classes: int, + pooler_dropout: float, + ): + super().__init__() + self.dense = nn.Linear(input_dim, inner_dim) + self.dropout = nn.Dropout(p=pooler_dropout) + self.out_proj = nn.Linear(inner_dim, num_classes) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dropout(hidden_states) + hidden_states = self.dense(hidden_states) + hidden_states = torch.tanh(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.out_proj(hidden_states) + return hidden_states + + +class BigBirdPegasusPreTrainedModel(PreTrainedModel): + config_class = BigBirdPegasusConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["BigBirdPegasusEncoderLayer", "BigBirdPegasusDecoderLayer"] + _skip_keys_device_placement = "past_key_values" + _supports_param_buffer_assignment = False + + def _init_weights(self, module): + std = self.config.init_std + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + @property + def dummy_inputs(self): + pad_token = self.config.pad_token_id + input_ids = torch.tensor([[0, 6, 10, 4, 2], [0, 8, 12, 2, pad_token]], device=self.device) + dummy_inputs = { + "attention_mask": input_ids.ne(pad_token), + "input_ids": input_ids, + } + return dummy_inputs + + +BIGBIRD_PEGASUS_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`BigBirdPegasusConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +BIGBIRD_PEGASUS_GENERATION_EXAMPLE = r""" + Summarization example: + + ```python + >>> from transformers import AutoTokenizer, BigBirdPegasusForConditionalGeneration + + >>> model = BigBirdPegasusForConditionalGeneration.from_pretrained("google/bigbird-pegasus-large-arxiv") + >>> tokenizer = AutoTokenizer.from_pretrained("google/bigbird-pegasus-large-arxiv") + + >>> ARTICLE_TO_SUMMARIZE = ( + ... "The dominant sequence transduction models are based on complex recurrent or convolutional neural " + ... "networks in an encoder-decoder configuration. The best performing models also connect the encoder " + ... "and decoder through an attention mechanism. We propose a new simple network architecture, the Transformer, " + ... "based solely on attention mechanisms, dispensing with recurrence and convolutions entirely. " + ... "Experiments on two machine translation tasks show these models to be superior in quality " + ... "while being more parallelizable and requiring significantly less time to train." + ... ) + >>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=4096, return_tensors="pt", truncation=True) + + >>> # Generate Summary + >>> summary_ids = model.generate(inputs["input_ids"], num_beams=4, max_length=15) + >>> tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + 'dominant sequence models are based on recurrent or convolutional neural networks .' + ``` +""" + +BIGBIRD_PEGASUS_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Provide for translation and summarization training. By default, the model will create this tensor by + shifting the `input_ids` to the right, following the paper. + decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also + be used by default. + + If you want to change padding behavior, you should read + [`modeling_bigbird_pegasus._prepare_decoder_attention_mask`] and modify to your needs. See diagram 1 in + [the paper](https://arxiv.org/abs/1910.13461) for more information on the default strategy. + + decoder_head_mask (`torch.Tensor` of shape `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*): + Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`) + `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of + hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded + representation. If `past_key_values` is used, optionally only the last `decoder_inputs_embeds` have to be + input (see `past_key_values`). This is useful if you want more control over how to convert + `decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix. + + If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds` takes the value + of `inputs_embeds`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + +BIGBIRD_PEGASUS_STANDALONE_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`ProphetNetTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +class BigBirdPegasusEncoder(BigBirdPegasusPreTrainedModel): + """ + Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a + [`BigBirdPegasusEncoderLayer`]. + + Args: + config: BigBirdPegasusConfig + embed_tokens (nn.Embedding): output embedding + """ + + def __init__(self, config: BigBirdPegasusConfig, embed_tokens: Optional[nn.Embedding] = None): + super().__init__(config) + + self.attention_type = config.attention_type + self.block_size = config.block_size + + self.dropout = config.dropout + self.layerdrop = config.encoder_layerdrop + + embed_dim = config.d_model + self.padding_idx = config.pad_token_id + self.max_source_positions = config.max_position_embeddings + embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0 + + self.embed_tokens = BigBirdPegasusScaledWordEmbedding( + config.vocab_size, embed_dim, self.padding_idx, embed_scale=embed_scale + ) + + if embed_tokens is not None: + self.embed_tokens.weight = embed_tokens.weight + + self.embed_positions = BigBirdPegasusLearnedPositionalEmbedding( + config.max_position_embeddings, + embed_dim, + ) + self.layers = nn.ModuleList([BigBirdPegasusEncoderLayer(config, seed=i) for i in range(config.encoder_layers)]) + self.layernorm_embedding = nn.LayerNorm(embed_dim) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + embed_pos = self.embed_positions(input_shape) + + hidden_states = inputs_embeds + embed_pos + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + if attention_mask is None: + attention_mask = torch.ones(input_shape, device=hidden_states.device) + attention_mask = attention_mask.long() + + # in order to use block_sparse attention, sequence_length has to be at least + # bigger than all global attentions: 2 * block_size + # + sliding tokens: 3 * block_size + # + random tokens: 2 * num_random_blocks * block_size + max_tokens_to_attend = (5 + 2 * self.config.num_random_blocks) * self.config.block_size + if self.attention_type == "block_sparse" and input_shape[1] <= max_tokens_to_attend: + # change attention_type from block_sparse to original_full + sequence_length = input_shape[1] + logger.warning( + "Attention type 'block_sparse' is not possible if sequence_length: " + f"{sequence_length} <= num global tokens: 2 * config.block_size " + "+ min. num sliding tokens: 3 * config.block_size " + "+ config.num_random_blocks * config.block_size " + "+ additional buffer: config.num_random_blocks * config.block_size " + f"= {max_tokens_to_attend} with config.block_size " + f"= {self.config.block_size}, config.num_random_blocks " + f"= {self.config.num_random_blocks}. " + "Changing attention type to 'original_full'..." + ) + self.set_attention_type("original_full") + + if self.attention_type == "block_sparse": + padding_len, hidden_states, attention_mask = self._pad_to_block_size(hidden_states, attention_mask) + else: + padding_len = 0 + + # expand attention_mask + if self.attention_type == "original_full": + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype) + blocked_encoder_mask = band_mask = from_mask = to_mask = None + elif self.attention_type == "block_sparse": + blocked_encoder_mask, band_mask, from_mask, to_mask = self.create_masks_for_block_sparse_attn( + attention_mask, self.block_size + ) + attention_mask = None + else: + raise ValueError( + f"attention_type can either be original_full or block_sparse, but is {self.attention_type}" + ) + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + # check if head_mask has a correct number of layers specified if desired + if head_mask is not None: + if head_mask.size()[0] != len(self.layers): + raise ValueError( + f"The head_mask should be specified for {len(self.layers)} layers, but it is for" + f" {head_mask.size()[0]}." + ) + + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + to_drop = False + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: # skip the layer + to_drop = True + + if to_drop: + layer_outputs = (None, None) + else: + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + encoder_layer.__call__, + hidden_states, + attention_mask, + (head_mask[idx] if head_mask is not None else None), + band_mask, + from_mask, + to_mask, + blocked_encoder_mask, + blocked_encoder_mask, + output_attentions, + ) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + layer_head_mask=(head_mask[idx] if head_mask is not None else None), + band_mask=band_mask, + from_mask=from_mask, + to_mask=to_mask, + from_blocked_mask=blocked_encoder_mask, + to_blocked_mask=blocked_encoder_mask, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + hidden_states = self.layernorm_embedding(hidden_states) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if padding_len > 0: + # unpad `sequence_output` because the calling function is expecting a length == input_ids.size(1) + hidden_states = hidden_states[:, :-padding_len] + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + def set_attention_type(self, value: str): + if value not in ["original_full", "block_sparse"]: + raise ValueError( + f"attention_type can only be set to either 'original_full' or 'block_sparse', but is {value}" + ) + # attention type is already correctly set + if value == self.attention_type: + return + self.attention_type = value + for layer in self.layers: + layer.set_attention_type(value) + + @staticmethod # Copied from transformers.models.big_bird.modeling_big_bird.BigBirdModel.create_masks_for_block_sparse_attn + def create_masks_for_block_sparse_attn(attention_mask: torch.Tensor, block_size: int): + batch_size, seq_length = attention_mask.size() + if seq_length % block_size != 0: + raise ValueError( + f"Sequence length must be multiple of block size, but sequence length is {seq_length}, while block" + f" size is {block_size}." + ) + + def create_band_mask_from_inputs(from_blocked_mask, to_blocked_mask): + """ + Create 3D attention mask from a 2D tensor mask. + + Args: + from_blocked_mask: 2D Tensor of shape [batch_size, + from_seq_length//from_block_size, from_block_size]. + to_blocked_mask: int32 Tensor of shape [batch_size, + to_seq_length//to_block_size, to_block_size]. + + Returns: + float Tensor of shape [batch_size, 1, from_seq_length//from_block_size-4, from_block_size, + 3*to_block_size]. + """ + exp_blocked_to_pad = torch.cat( + [to_blocked_mask[:, 1:-3], to_blocked_mask[:, 2:-2], to_blocked_mask[:, 3:-1]], dim=2 + ) + band_mask = torch.einsum("blq,blk->blqk", from_blocked_mask[:, 2:-2], exp_blocked_to_pad) + band_mask.unsqueeze_(1) + return band_mask + + blocked_encoder_mask = attention_mask.view(batch_size, seq_length // block_size, block_size) + band_mask = create_band_mask_from_inputs(blocked_encoder_mask, blocked_encoder_mask) + + from_mask = attention_mask.view(batch_size, 1, seq_length, 1) + to_mask = attention_mask.view(batch_size, 1, 1, seq_length) + + return blocked_encoder_mask, band_mask, from_mask, to_mask + + def _pad_to_block_size(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor): + """A helper function to pad tokens and mask to work with implementation of BigBird block-sparse attention.""" + # padding + block_size = self.config.block_size + batch_size, seq_len = hidden_states.shape[:2] + + padding_len = (block_size - seq_len % block_size) % block_size + if padding_len > 0: + logger.warning_once( + f"Input ids are automatically padded from {seq_len} to {seq_len + padding_len} to be a multiple of " + f"`config.block_size`: {block_size}" + ) + pad_id = self.config.pad_token_id + device = hidden_states.device + input_ids_padding = torch.ones((batch_size, padding_len), dtype=torch.long, device=device) * pad_id + inputs_embeds_padding = self.embed_tokens(input_ids_padding) + hidden_states = torch.cat([hidden_states, inputs_embeds_padding], dim=-2) + + attention_mask = nn.functional.pad( + attention_mask, (0, padding_len), value=0 + ) # no attention on the padding tokens + + return padding_len, hidden_states, attention_mask + + +class BigBirdPegasusDecoder(BigBirdPegasusPreTrainedModel): + """ + Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`BigBirdPegasusDecoderLayer`] + + Args: + config: BigBirdPegasusConfig + embed_tokens (nn.Embedding): output embedding + """ + + def __init__(self, config: BigBirdPegasusConfig, embed_tokens: Optional[nn.Embedding] = None): + super().__init__(config) + self.dropout = config.dropout + self.layerdrop = config.decoder_layerdrop + self.padding_idx = config.pad_token_id + self.max_target_positions = config.max_position_embeddings + embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 + + self.embed_tokens = BigBirdPegasusScaledWordEmbedding( + config.vocab_size, config.d_model, self.padding_idx, embed_scale=embed_scale + ) + + if embed_tokens is not None: + self.embed_tokens.weight = embed_tokens.weight + + self.embed_positions = BigBirdPegasusLearnedPositionalEmbedding( + config.max_position_embeddings, + config.d_model, + ) + self.layers = nn.ModuleList([BigBirdPegasusDecoderLayer(config) for _ in range(config.decoder_layers)]) + self.layernorm_embedding = nn.LayerNorm(config.d_model) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + of the decoder. + encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*): + Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values + selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules in decoder to avoid performing + cross-attention on hidden heads. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of + shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the + cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those + that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of + all `decoder_input_ids` of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, input_shape, inputs_embeds, past_key_values_length + ) + + # expand encoder attention mask + if encoder_hidden_states is not None and encoder_attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + encoder_attention_mask = _prepare_4d_attention_mask( + encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1] + ) + + # embed positions + positions = self.embed_positions(input_shape, past_key_values_length) + positions = positions.to(inputs_embeds.device) + + hidden_states = inputs_embeds + positions + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None + next_decoder_cache = () if use_cache else None + + # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired + for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]): + if attn_mask is not None: + if attn_mask.size()[0] != len(self.layers): + raise ValueError( + f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for" + f" {head_mask.size()[0]}." + ) + for idx, decoder_layer in enumerate(self.layers): + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + if output_hidden_states: + all_hidden_states += (hidden_states,) + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: + continue + + past_key_value = past_key_values[idx] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + attention_mask, + encoder_hidden_states, + encoder_attention_mask, + head_mask[idx] if head_mask is not None else None, + cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None, + None, + output_attentions, + use_cache, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + layer_head_mask=(head_mask[idx] if head_mask is not None else None), + cross_attn_layer_head_mask=( + cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None + ), + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache += (layer_outputs[3 if output_attentions else 1],) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + if encoder_hidden_states is not None: + all_cross_attentions += (layer_outputs[2],) + + hidden_states = self.layernorm_embedding(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if not return_dict: + return tuple( + v + for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attentions, + ) + + +@add_start_docstrings( + "The bare BigBirdPegasus Model outputting raw hidden-states without any specific head on top.", + BIGBIRD_PEGASUS_START_DOCSTRING, +) +class BigBirdPegasusModel(BigBirdPegasusPreTrainedModel): + _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] + + def __init__(self, config: BigBirdPegasusConfig): + super().__init__(config) + + padding_idx, vocab_size = config.pad_token_id, config.vocab_size + embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 + self.shared = BigBirdPegasusScaledWordEmbedding( + vocab_size, config.d_model, padding_idx, embed_scale=embed_scale + ) + + self.encoder = BigBirdPegasusEncoder(config, self.shared) + self.decoder = BigBirdPegasusDecoder(config, self.shared) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.shared + + def set_input_embeddings(self, value): + self.shared = value + self.encoder.embed_tokens = self.shared + self.decoder.embed_tokens = self.shared + + def _tie_weights(self): + if self.config.tie_word_embeddings: + self._tie_or_clone_weights(self.encoder.embed_tokens, self.shared) + self._tie_or_clone_weights(self.decoder.embed_tokens, self.shared) + + def get_encoder(self): + return self.encoder + + def get_decoder(self): + return self.decoder + + @add_start_docstrings_to_model_forward(BIGBIRD_PEGASUS_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=Seq2SeqModelOutput, + config_class=_CONFIG_FOR_DOC, + expected_output=_EXPECTED_OUTPUT_SHAPE, + ) + # Copied from transformers.models.bart.modeling_bart.BartModel.forward with Bart->BigBirdPegasus + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.Tensor] = None, + decoder_head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[List[torch.FloatTensor]] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, Seq2SeqModelOutput]: + # different to other models, BigBirdPegasus automatically creates decoder_input_ids from + # input_ids if no decoder_input_ids are provided + if decoder_input_ids is None and decoder_inputs_embeds is None: + if input_ids is None: + raise ValueError( + "If no `decoder_input_ids` or `decoder_inputs_embeds` are " + "passed, `input_ids` cannot be `None`. Please pass either " + "`input_ids` or `decoder_input_ids` or `decoder_inputs_embeds`." + ) + + decoder_input_ids = shift_tokens_right( + input_ids, self.config.pad_token_id, self.config.decoder_start_token_id + ) + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if encoder_outputs is None: + encoder_outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn) + decoder_outputs = self.decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + encoder_hidden_states=encoder_outputs[0], + encoder_attention_mask=attention_mask, + head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if not return_dict: + return decoder_outputs + encoder_outputs + + return Seq2SeqModelOutput( + last_hidden_state=decoder_outputs.last_hidden_state, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + +@add_start_docstrings( + "The BigBirdPegasus Model with a language modeling head. Can be used for summarization.", + BIGBIRD_PEGASUS_START_DOCSTRING, +) +# Copied from transformers.models.bart.modeling_bart.BartForConditionalGeneration with Bart->BigBirdPegasus, BART->BIGBIRD_PEGASUS +class BigBirdPegasusForConditionalGeneration(BigBirdPegasusPreTrainedModel, GenerationMixin): + base_model_prefix = "model" + _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"] + _keys_to_ignore_on_load_missing = ["final_logits_bias"] + + def __init__(self, config: BigBirdPegasusConfig): + super().__init__(config) + self.model = BigBirdPegasusModel(config) + self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings))) + self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_encoder(self): + return self.model.get_encoder() + + def get_decoder(self): + return self.model.get_decoder() + + def resize_token_embeddings( + self, new_num_tokens: int, pad_to_multiple_of: Optional[int] = None, mean_resizing: bool = True + ) -> nn.Embedding: + new_embeddings = super().resize_token_embeddings(new_num_tokens, pad_to_multiple_of, mean_resizing) + self._resize_final_logits_bias(new_embeddings.weight.shape[0]) + return new_embeddings + + def _resize_final_logits_bias(self, new_num_tokens: int) -> None: + old_num_tokens = self.final_logits_bias.shape[-1] + if new_num_tokens <= old_num_tokens: + new_bias = self.final_logits_bias[:, :new_num_tokens] + else: + extra_bias = torch.zeros((1, new_num_tokens - old_num_tokens), device=self.final_logits_bias.device) + new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1) + self.register_buffer("final_logits_bias", new_bias) + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def _tie_weights(self): + if self.config.tie_word_embeddings: + self.model._tie_weights() + self._tie_or_clone_weights(self.lm_head, self.model.shared) + + @add_start_docstrings_to_model_forward(BIGBIRD_PEGASUS_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC) + @add_end_docstrings(BIGBIRD_PEGASUS_GENERATION_EXAMPLE) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.Tensor] = None, + decoder_head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[List[torch.FloatTensor]] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, Seq2SeqLMOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Returns: + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if labels is not None: + if use_cache: + logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.") + use_cache = False + if decoder_input_ids is None and decoder_inputs_embeds is None: + decoder_input_ids = shift_tokens_right( + labels, self.config.pad_token_id, self.config.decoder_start_token_id + ) + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + encoder_outputs=encoder_outputs, + decoder_attention_mask=decoder_attention_mask, + head_mask=head_mask, + decoder_head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + lm_logits = self.lm_head(outputs[0]) + lm_logits = lm_logits + self.final_logits_bias.to(lm_logits.device) + + masked_lm_loss = None + if labels is not None: + labels = labels.to(lm_logits.device) + loss_fct = CrossEntropyLoss() + masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (lm_logits,) + outputs[1:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return Seq2SeqLMOutput( + loss=masked_lm_loss, + logits=lm_logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor): + return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id) + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + # cached cross_attention states don't have to be reordered -> they are always the same + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past[:2]) + + layer_past[2:], + ) + return reordered_past + + +@add_start_docstrings( + """ + BigBirdPegasus model with a sequence classification/head on top (a linear layer on top of the pooled output) e.g. + for GLUE tasks. + """, + BIGBIRD_PEGASUS_START_DOCSTRING, +) +class BigBirdPegasusForSequenceClassification(BigBirdPegasusPreTrainedModel): + _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] + + def __init__(self, config: BigBirdPegasusConfig, **kwargs): + super().__init__(config, **kwargs) + self.model = BigBirdPegasusModel(config) + self.classification_head = BigBirdPegasusClassificationHead( + config.d_model, + config.d_model, + config.num_labels, + config.classifier_dropout, + ) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(BIGBIRD_PEGASUS_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=Seq2SeqSequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + # Copied from transformers.models.bart.modeling_bart.BartForSequenceClassification.forward + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.Tensor] = None, + decoder_head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, Seq2SeqSequenceClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None: + use_cache = False + + if input_ids is None and inputs_embeds is not None: + raise NotImplementedError( + f"Passing input embeddings is currently not supported for {self.__class__.__name__}" + ) + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + head_mask=head_mask, + decoder_head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + encoder_outputs=encoder_outputs, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = outputs[0] # last hidden state + + eos_mask = input_ids.eq(self.config.eos_token_id).to(hidden_states.device) + + if len(torch.unique_consecutive(eos_mask.sum(1))) > 1: + raise ValueError("All examples must have the same number of tokens.") + sentence_representation = hidden_states[eos_mask, :].view(hidden_states.size(0), -1, hidden_states.size(-1))[ + :, -1, : + ] + logits = self.classification_head(sentence_representation) + + loss = None + if labels is not None: + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.config.num_labels == 1: + self.config.problem_type = "regression" + elif self.config.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.config.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + if not return_dict: + output = (logits,) + outputs[1:] + return ((loss,) + output) if loss is not None else output + + return Seq2SeqSequenceClassifierOutput( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + +@add_start_docstrings( + """ + BigBirdPegasus Model with a span classification head on top for extractive question-answering tasks like SQuAD (a + linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`). + """, + BIGBIRD_PEGASUS_START_DOCSTRING, +) +class BigBirdPegasusForQuestionAnswering(BigBirdPegasusPreTrainedModel): + _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"] + + def __init__(self, config): + super().__init__(config) + + config.num_labels = 2 + self.num_labels = config.num_labels + + self.model = BigBirdPegasusModel(config) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(BIGBIRD_PEGASUS_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=Seq2SeqQuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + # Copied from transformers.models.bart.modeling_bart.BartForQuestionAnswering.forward + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.Tensor] = None, + decoder_head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[List[torch.FloatTensor]] = None, + start_positions: Optional[torch.LongTensor] = None, + end_positions: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, Seq2SeqQuestionAnsweringModelOutput]: + r""" + start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (*sequence_length*). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (*sequence_length*). Position outside of the sequence + are not taken into account for computing the loss. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if start_positions is not None and end_positions is not None: + use_cache = False + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + head_mask=head_mask, + decoder_head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + encoder_outputs=encoder_outputs, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = ( + start_logits, + end_logits, + ) + outputs[1:] + return ((total_loss,) + output) if total_loss is not None else output + + return Seq2SeqQuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + +# Copied from transformers.models.pegasus.modeling_pegasus.PegasusDecoderWrapper with Pegasus->BigBirdPegasus +class BigBirdPegasusDecoderWrapper(BigBirdPegasusPreTrainedModel): + """ + This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is + used in combination with the [`EncoderDecoderModel`] framework. + """ + + def __init__(self, config): + super().__init__(config) + self.decoder = BigBirdPegasusDecoder(config) + + def forward(self, *args, **kwargs): + return self.decoder(*args, **kwargs) + + +class BigBirdPegasusForCausalLM(BigBirdPegasusPreTrainedModel, GenerationMixin): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config): + config = copy.deepcopy(config) + config.is_decoder = True + config.is_encoder_decoder = False + super().__init__(config) + self.model = BigBirdPegasusDecoderWrapper(config) + + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.decoder.embed_tokens + + def set_input_embeddings(self, value): + self.model.decoder.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model.decoder = decoder + + def get_decoder(self): + return self.model.decoder + + @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]: + r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + if the model is configured as a decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used + in the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of + shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two additional + tensors are only required when the model is used as a decoder in a Sequence to Sequence model. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the + cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those + that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of + all `decoder_input_ids` of shape `(batch_size, sequence_length)`. + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, BigBirdPegasusForCausalLM + + >>> tokenizer = AutoTokenizer.from_pretrained("google/bigbird-pegasus-large-arxiv") + >>> model = BigBirdPegasusForCausalLM.from_pretrained( + ... "google/bigbird-pegasus-large-arxiv", add_cross_attention=False + ... ) + >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder." + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + + >>> logits = outputs.logits + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model.decoder( + input_ids=input_ids, + attention_mask=attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + head_mask=head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + logits = self.lm_head(outputs[0]) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past + + +__all__ = [ + "BigBirdPegasusForCausalLM", + "BigBirdPegasusForConditionalGeneration", + "BigBirdPegasusForQuestionAnswering", + "BigBirdPegasusForSequenceClassification", + "BigBirdPegasusModel", + "BigBirdPegasusPreTrainedModel", +] diff --git a/docs/transformers/build/lib/transformers/models/biogpt/__init__.py b/docs/transformers/build/lib/transformers/models/biogpt/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..641cdb592117b466cf49be31be701e494ee7f9fc --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/biogpt/__init__.py @@ -0,0 +1,28 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_biogpt import * + from .modeling_biogpt import * + from .tokenization_biogpt import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/docs/transformers/build/lib/transformers/models/biogpt/configuration_biogpt.py b/docs/transformers/build/lib/transformers/models/biogpt/configuration_biogpt.py new file mode 100644 index 0000000000000000000000000000000000000000..b338092edd1d0b544843276289afe6a58ed0c8de --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/biogpt/configuration_biogpt.py @@ -0,0 +1,134 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Team and Microsoft Research AI4Science All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""BioGPT model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class BioGptConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`BioGptModel`]. It is used to instantiate an + BioGPT model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the BioGPT + [microsoft/biogpt](https://huggingface.co/microsoft/biogpt) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 42384): + Vocabulary size of the BioGPT model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`BioGptModel`]. + hidden_size (`int`, *optional*, defaults to 1024): + Dimension of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 24): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 4096): + Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 1024): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + scale_embedding (`bool`, *optional*, defaults to `True`): + Scale embeddings by diving by sqrt(d_model). + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + layerdrop (`float`, *optional*, defaults to 0.0): + Please refer to the paper about LayerDrop: https://arxiv.org/abs/1909.11556 for further details + activation_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for activations inside the fully connected layer. + pad_token_id (`int`, *optional*, defaults to 1): + Padding token id. + bos_token_id (`int`, *optional*, defaults to 0): + Beginning of stream token id. + eos_token_id (`int`, *optional*, defaults to 2): + End of stream token id. + + Example: + + ```python + >>> from transformers import BioGptModel, BioGptConfig + + >>> # Initializing a BioGPT microsoft/biogpt style configuration + >>> configuration = BioGptConfig() + + >>> # Initializing a model from the microsoft/biogpt style configuration + >>> model = BioGptModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "biogpt" + + def __init__( + self, + vocab_size=42384, + hidden_size=1024, + num_hidden_layers=24, + num_attention_heads=16, + intermediate_size=4096, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=1024, + initializer_range=0.02, + layer_norm_eps=1e-12, + scale_embedding=True, + use_cache=True, + layerdrop=0.0, + activation_dropout=0.0, + pad_token_id=1, + bos_token_id=0, + eos_token_id=2, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.scale_embedding = scale_embedding + self.use_cache = use_cache + self.layerdrop = layerdrop + self.activation_dropout = activation_dropout + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + + +__all__ = ["BioGptConfig"] diff --git a/docs/transformers/build/lib/transformers/models/biogpt/convert_biogpt_original_pytorch_checkpoint_to_pytorch.py b/docs/transformers/build/lib/transformers/models/biogpt/convert_biogpt_original_pytorch_checkpoint_to_pytorch.py new file mode 100644 index 0000000000000000000000000000000000000000..c390d2e39f6e7056da4cceda5dcdd9e0c6caf8fd --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/biogpt/convert_biogpt_original_pytorch_checkpoint_to_pytorch.py @@ -0,0 +1,292 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import argparse +import json +import os +import re +import shutil + +import torch + +from transformers import BioGptConfig, BioGptForCausalLM +from transformers.models.biogpt.tokenization_biogpt import VOCAB_FILES_NAMES +from transformers.tokenization_utils_base import TOKENIZER_CONFIG_FILE +from transformers.utils import WEIGHTS_NAME, logging + + +logging.set_verbosity_warning() + +json_indent = 2 + + +# modified from https://github.com/facebookresearch/fairseq/blob/dd74992d0d143155998e9ed4076826bcea80fb06/fairseq/data/dictionary.py#L18 +class Dictionary: + """A mapping from symbols to consecutive integers""" + + def __init__( + self, + *, # begin keyword-only arguments + bos="", + pad="", + eos="", + unk="", + extra_special_symbols=None, + ): + self.bos_word, self.unk_word, self.pad_word, self.eos_word = bos, unk, pad, eos + self.symbols = [] + self.count = [] + self.indices = {} + self.bos_index = self.add_symbol(bos) + self.pad_index = self.add_symbol(pad) + self.eos_index = self.add_symbol(eos) + self.unk_index = self.add_symbol(unk) + if extra_special_symbols: + for s in extra_special_symbols: + self.add_symbol(s) + self.nspecial = len(self.symbols) + + def __eq__(self, other): + return self.indices == other.indices + + def __getitem__(self, idx): + if idx < len(self.symbols): + return self.symbols[idx] + return self.unk_word + + def __len__(self): + """Returns the number of symbols in the dictionary""" + return len(self.symbols) + + def __contains__(self, sym): + return sym in self.indices + + @classmethod + def load(cls, f): + """Loads the dictionary from a text file with the format: + + ``` + + + ... + ``` + """ + d = cls() + d.add_from_file(f) + return d + + def add_symbol(self, word, n=1, overwrite=False): + """Adds a word to the dictionary""" + if word in self.indices and not overwrite: + idx = self.indices[word] + self.count[idx] = self.count[idx] + n + return idx + else: + idx = len(self.symbols) + self.indices[word] = idx + self.symbols.append(word) + self.count.append(n) + return idx + + def _load_meta(self, lines): + return 0 + + def add_from_file(self, f): + """ + Loads a pre-existing dictionary from a text file and adds its symbols to this instance. + """ + if isinstance(f, str): + try: + with open(f, "r", encoding="utf-8") as fd: + self.add_from_file(fd) + except FileNotFoundError as fnfe: + raise fnfe + except UnicodeError: + raise Exception("Incorrect encoding detected in {}, please rebuild the dataset".format(f)) + return + + lines = f.readlines() + indices_start_line = self._load_meta(lines) + + for line in lines[indices_start_line:]: + try: + line, field = line.rstrip().rsplit(" ", 1) + if field == "#fairseq:overwrite": + overwrite = True + line, field = line.rsplit(" ", 1) + else: + overwrite = False + count = int(field) + word = line + if word in self and not overwrite: + raise RuntimeError( + "Duplicate word found when loading Dictionary: '{}'. " + "Duplicate words can overwrite earlier ones by adding the " + "#fairseq:overwrite flag at the end of the corresponding row " + "in the dictionary file. If using the Camembert model, please " + "download an updated copy of the model file.".format(word) + ) + self.add_symbol(word, n=count, overwrite=overwrite) + except ValueError: + raise ValueError("Incorrect dictionary format, expected ' [flags]'") + + +def rewrite_dict_keys(d): + # (1) remove word breaking symbol, (2) add word ending symbol where the word is not broken up, + # e.g.: d = {'le@@': 5, 'tt@@': 6, 'er': 7} => {'le': 5, 'tt': 6, 'er': 7} + d2 = dict((re.sub(r"@@$", "", k), v) if k.endswith("@@") else (re.sub(r"$", "", k), v) for k, v in d.items()) + keep_keys = " ".split() + # restore the special tokens + for k in keep_keys: + del d2[f"{k}"] + d2[k] = d[k] # restore + return d2 + + +def convert_biogpt_checkpoint_to_pytorch(biogpt_checkpoint_path, pytorch_dump_folder_path): + # prep + if not os.path.exists(biogpt_checkpoint_path): + raise ValueError(f"path {biogpt_checkpoint_path} does not exist!") + os.makedirs(pytorch_dump_folder_path, exist_ok=True) + print(f"Writing results to {pytorch_dump_folder_path}") + + # handle various types of models + + checkpoint_file = os.path.join(biogpt_checkpoint_path, "checkpoint.pt") + if not os.path.isfile(checkpoint_file): + raise ValueError(f"path to the file {checkpoint_file} does not exist!") + chkpt = torch.load(checkpoint_file, map_location="cpu", weights_only=True) + + args = chkpt["cfg"]["model"] + + # dicts + dict_file = os.path.join(biogpt_checkpoint_path, "dict.txt") + if not os.path.isfile(dict_file): + raise ValueError(f"path to the file {dict_file} does not exist!") + src_dict = Dictionary.load(dict_file) + src_vocab = rewrite_dict_keys(src_dict.indices) + src_vocab_size = len(src_vocab) + src_vocab_file = os.path.join(pytorch_dump_folder_path, VOCAB_FILES_NAMES["vocab_file"]) + print(f"Generating {src_vocab_file} of {src_vocab_size} records") + with open(src_vocab_file, "w", encoding="utf-8") as f: + f.write(json.dumps(src_vocab, ensure_ascii=False, indent=json_indent)) + + # merges_file (bpecodes) + bpecodes_file = os.path.join(biogpt_checkpoint_path, "bpecodes") + if not os.path.isfile(bpecodes_file): + raise ValueError(f"path to the file {bpecodes_file} does not exist!") + + merges_file = os.path.join(pytorch_dump_folder_path, VOCAB_FILES_NAMES["merges_file"]) + shutil.copyfile(bpecodes_file, merges_file) + + # model config + biogpt_model_config_file = os.path.join(pytorch_dump_folder_path, "config.json") + + model_conf = { + "activation_dropout": args["activation_dropout"], + "architectures": ["BioGptForCausalLM"], + "attention_probs_dropout_prob": args["attention_dropout"], + "bos_token_id": 0, + "eos_token_id": 2, + "hidden_act": args["activation_fn"], + "hidden_dropout_prob": args["dropout"], + "hidden_size": args["decoder_embed_dim"], + "initializer_range": 0.02, + "intermediate_size": args["decoder_ffn_embed_dim"], + "layer_norm_eps": 1e-12, + "layerdrop": args["decoder_layerdrop"], + "max_position_embeddings": args["max_target_positions"], + "model_type": "biogpt", + "num_attention_heads": args["decoder_attention_heads"], + "num_hidden_layers": args["decoder_layers"], + "pad_token_id": 1, + "scale_embedding": not args["no_scale_embedding"], + "tie_word_embeddings": args["share_decoder_input_output_embed"], + "vocab_size": src_vocab_size, + } + + # good hparam defaults to start with + + print(f"Generating {biogpt_model_config_file}") + with open(biogpt_model_config_file, "w", encoding="utf-8") as f: + f.write(json.dumps(model_conf, ensure_ascii=False, indent=json_indent)) + + # tokenizer config + biogpt_tokenizer_config_file = os.path.join(pytorch_dump_folder_path, TOKENIZER_CONFIG_FILE) + + tokenizer_conf = { + "bos_token": "", + "eos_token": "", + "model_max_length": 1024, + "pad_token": "", + "special_tokens_map_file": None, + "tokenizer_class": "BioGptTokenizer", + "unk_token": "", + } + + print(f"Generating {biogpt_tokenizer_config_file}") + with open(biogpt_tokenizer_config_file, "w", encoding="utf-8") as f: + f.write(json.dumps(tokenizer_conf, ensure_ascii=False, indent=json_indent)) + + # model + model_state_dict = chkpt["model"] + + # remove unneeded keys + ignore_keys = [ + "decoder.version", + ] + for k in ignore_keys: + model_state_dict.pop(k, None) + + layer_names = list(model_state_dict.keys()) + for layer_name in layer_names: + if layer_name.endswith("output_projection.weight"): + model_state_dict[layer_name.replace("decoder.", "")] = model_state_dict.pop(layer_name) + else: + model_state_dict[layer_name.replace("decoder", "biogpt")] = model_state_dict.pop(layer_name) + + config = BioGptConfig.from_pretrained(pytorch_dump_folder_path) + model_new = BioGptForCausalLM(config) + + # check that it loads ok + model_new.load_state_dict(model_state_dict) + + # save + pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME) + print(f"Generating {pytorch_weights_dump_path}") + torch.save(model_state_dict, pytorch_weights_dump_path) + + print("Conversion is done!") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument( + "--biogpt_checkpoint_path", + default=None, + type=str, + required=True, + help=( + "Path to the official PyTorch checkpoint file which is expected to reside in the dump dir with dicts," + " bpecodes, etc." + ), + ) + parser.add_argument( + "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." + ) + args = parser.parse_args() + convert_biogpt_checkpoint_to_pytorch(args.biogpt_checkpoint_path, args.pytorch_dump_folder_path) diff --git a/docs/transformers/build/lib/transformers/models/biogpt/modeling_biogpt.py b/docs/transformers/build/lib/transformers/models/biogpt/modeling_biogpt.py new file mode 100644 index 0000000000000000000000000000000000000000..51f298098ba1d64d22154bc6cf6e5e89e4edb479 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/biogpt/modeling_biogpt.py @@ -0,0 +1,1042 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Team and Microsoft Research AI4Science All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch BioGPT model.""" + +import math +from typing import Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ...activations import ACT2FN +from ...generation import GenerationMixin +from ...modeling_attn_mask_utils import _prepare_4d_causal_attention_mask, _prepare_4d_causal_attention_mask_for_sdpa +from ...modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + CausalLMOutputWithCrossAttentions, + SequenceClassifierOutputWithPast, + TokenClassifierOutput, +) +from ...modeling_utils import PreTrainedModel +from ...utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, +) +from .configuration_biogpt import BioGptConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "microsoft/biogpt" +_CONFIG_FOR_DOC = "BioGptConfig" + + +# copied from transformers.models.opt.modeling_opt.OPTLearnedPositionalEmbedding with OPT->BioGpt +# TODO @ArthurZucker bring copied from back +class BioGptLearnedPositionalEmbedding(nn.Embedding): + """ + This module learns positional embeddings up to a fixed maximum size. + """ + + def __init__(self, num_embeddings: int, embedding_dim: int): + # BioGpt is set up so that if padding_idx is specified then offset the embedding ids by 2 + # and adjust num_embeddings appropriately. Other models don't have this hack + self.offset = 2 + super().__init__(num_embeddings + self.offset, embedding_dim) + + def forward(self, attention_mask: torch.LongTensor, past_key_values_length: int = 0): + """`input_ids_shape` is expected to be [bsz x seqlen].""" + attention_mask = attention_mask.long() + + # create positions depending on attention_mask + positions = (torch.cumsum(attention_mask, dim=1).type_as(attention_mask) * attention_mask).long() - 1 + + # cut positions if `past_key_values_length` is > 0 + positions = positions[:, past_key_values_length:] + + return super().forward(positions + self.offset) + + +# Copied from transformers.models.bart.modeling_bart.BartScaledWordEmbedding with Bart->BioGpt +class BioGptScaledWordEmbedding(nn.Embedding): + """ + This module overrides nn.Embeddings' forward by multiplying with embeddings scale. + """ + + def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int, embed_scale: Optional[float] = 1.0): + super().__init__(num_embeddings, embedding_dim, padding_idx) + self.embed_scale = embed_scale + + def forward(self, input_ids: torch.Tensor): + return super().forward(input_ids) * self.embed_scale + + +# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->BioGpt +class BioGptAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + is_decoder: bool = False, + bias: bool = True, + is_causal: bool = False, + config: Optional[BioGptConfig] = None, + ): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + self.config = config + + if (self.head_dim * num_heads) != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" + f" and `num_heads`: {num_heads})." + ) + self.scaling = self.head_dim**-0.5 + self.is_decoder = is_decoder + self.is_causal = is_causal + + self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + key_value_states: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + + bsz, tgt_len, _ = hidden_states.size() + + # get query proj + query_states = self.q_proj(hidden_states) * self.scaling + # get key, value proj + # `past_key_value[0].shape[2] == key_value_states.shape[1]` + # is checking that the `sequence_length` of the `past_key_value` is the same as + # the provided `key_value_states` to support prefix tuning + if ( + is_cross_attention + and past_key_value is not None + and past_key_value[0].shape[2] == key_value_states.shape[1] + ): + # reuse k,v, cross_attentions + key_states = past_key_value[0] + value_states = past_key_value[1] + elif is_cross_attention: + # cross_attentions + key_states = self._shape(self.k_proj(key_value_states), -1, bsz) + value_states = self._shape(self.v_proj(key_value_states), -1, bsz) + elif past_key_value is not None: + # reuse k, v, self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + else: + # self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_states, value_states) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.reshape(*proj_shape) + value_states = value_states.reshape(*proj_shape) + + src_len = key_states.size(1) + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if layer_head_mask is not None: + if layer_head_mask.size() != (self.num_heads,): + raise ValueError( + f"Head mask for a single layer should be of size {(self.num_heads,)}, but is" + f" {layer_head_mask.size()}" + ) + attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + if output_attentions: + # this operation is a bit awkward, but it's required to + # make sure that attn_weights keeps its gradient. + # In order to do so, attn_weights have to be reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) + else: + attn_weights_reshaped = None + + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = torch.bmm(attn_probs, value_states) + + if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) + attn_output = attn_output.transpose(1, 2) + + # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be + # partitioned across GPUs when using tensor-parallelism. + attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped, past_key_value + + +# Copied from transformers.models.bart.modeling_bart.BartSdpaAttention with Bart->BioGpt +class BioGptSdpaAttention(BioGptAttention): + def forward( + self, + hidden_states: torch.Tensor, + key_value_states: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + if output_attentions or layer_head_mask is not None: + # TODO: Improve this warning with e.g. `model.config._attn_implementation = "manual"` once this is implemented. + logger.warning_once( + "BioGptModel is using BioGptSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True` or `layer_head_mask` not None. Falling back to the manual attention" + ' implementation, but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.' + ) + return super().forward( + hidden_states, + key_value_states=key_value_states, + past_key_value=past_key_value, + attention_mask=attention_mask, + layer_head_mask=layer_head_mask, + output_attentions=output_attentions, + ) + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + + bsz, tgt_len, _ = hidden_states.size() + + # get query proj + query_states = self.q_proj(hidden_states) + # get key, value proj + # `past_key_value[0].shape[2] == key_value_states.shape[1]` + # is checking that the `sequence_length` of the `past_key_value` is the same as + # the provided `key_value_states` to support prefix tuning + if ( + is_cross_attention + and past_key_value is not None + and past_key_value[0].shape[2] == key_value_states.shape[1] + ): + # reuse k,v, cross_attentions + key_states = past_key_value[0] + value_states = past_key_value[1] + elif is_cross_attention: + # cross_attentions + key_states = self._shape(self.k_proj(key_value_states), -1, bsz) + value_states = self._shape(self.v_proj(key_value_states), -1, bsz) + elif past_key_value is not None: + # reuse k, v, self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + else: + # self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_states, value_states) + + query_states = self._shape(query_states, tgt_len, bsz) + + # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment + # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. + # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case tgt_len == 1. + is_causal = True if self.is_causal and attention_mask is None and tgt_len > 1 else False + + # NOTE: SDPA with memory-efficient backend is currently (torch==2.1.2) bugged when using non-contiguous inputs and a custom attn_mask, + # but we are fine here as `_shape` do call `.contiguous()`. Reference: https://github.com/pytorch/pytorch/issues/112577 + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_states, + key_states, + value_states, + attn_mask=attention_mask, + dropout_p=self.dropout if self.training else 0.0, + is_causal=is_causal, + ) + + if attn_output.size() != (bsz, self.num_heads, tgt_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2) + + # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be + # partitioned across GPUs when using tensor-parallelism. + attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, None, past_key_value + + +BIOGPT_ATTENTION_CLASSES = { + "eager": BioGptAttention, + "sdpa": BioGptSdpaAttention, +} + + +class BioGptDecoderLayer(nn.Module): + def __init__(self, config: BioGptConfig): + super().__init__() + self.embed_dim = config.hidden_size + + self.self_attn = BIOGPT_ATTENTION_CLASSES[config._attn_implementation]( + embed_dim=self.embed_dim, + num_heads=config.num_attention_heads, + dropout=config.attention_probs_dropout_prob, + is_decoder=True, + is_causal=True, + ) + self.dropout = config.hidden_dropout_prob + self.activation_fn = ACT2FN[config.hidden_act] + self.activation_dropout = config.activation_dropout + + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + + self.fc1 = nn.Linear(self.embed_dim, config.intermediate_size) + self.fc2 = nn.Linear(config.intermediate_size, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = True, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size + `(encoder_attention_heads,)`. + past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + """ + residual = hidden_states + + hidden_states = self.self_attn_layer_norm(hidden_states) + + # Self Attention + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + # add present self-attn cache to positions 1,2 of present_key_value tuple + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + past_key_value=self_attn_past_key_value, + attention_mask=attention_mask, + layer_head_mask=layer_head_mask, + output_attentions=output_attentions, + ) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.final_layer_norm(hidden_states) + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +class BioGptPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = BioGptConfig + base_model_prefix = "biogpt" + supports_gradient_checkpointing = True + _supports_sdpa = True + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +BIOGPT_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use + it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`~BioGptConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +BIOGPT_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert *input_ids* indices into associated vectors than the + model's internal embedding lookup matrix. + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare BioGPT Model transformer outputting raw hidden-states without any specific head on top.", + BIOGPT_START_DOCSTRING, +) +class BioGptModel(BioGptPreTrainedModel): + def __init__(self, config: BioGptConfig): + super().__init__(config) + self.config = config + self.layerdrop = config.layerdrop + self.dropout = config.hidden_dropout_prob + self.embed_dim = config.hidden_size + self.padding_idx = config.pad_token_id + embed_scale = math.sqrt(config.hidden_size) if config.scale_embedding else 1.0 + + self.embed_tokens = BioGptScaledWordEmbedding( + config.vocab_size, self.embed_dim, self.padding_idx, embed_scale=embed_scale + ) + self.embed_positions = BioGptLearnedPositionalEmbedding(config.max_position_embeddings, self.embed_dim) + + self.layers = nn.ModuleList([BioGptDecoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.layer_norm = nn.LayerNorm(self.embed_dim) + + self.gradient_checkpointing = False + self._use_sdpa = config._attn_implementation == "sdpa" + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + @add_start_docstrings_to_model_forward(BIOGPT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutputWithPastAndCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, # NOOP kwargs, for now + ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input = input_ids + input_shape = input.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + input = inputs_embeds[:, :, -1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input) + + if attention_mask is None: + attention_mask = torch.ones( + (inputs_embeds.shape[0], inputs_embeds.shape[1] + past_key_values_length), + dtype=torch.bool, + device=inputs_embeds.device, + ) + elif attention_mask.shape[1] != past_key_values_length + input_shape[1]: + raise ValueError( + f"The provided attention mask has length {attention_mask.shape[1]}, but its length should be " + f"{past_key_values_length + input_shape[1]} (sum of the lengths of current and past inputs)" + ) + + # embed positions + positions = self.embed_positions(attention_mask, past_key_values_length) + + if self._use_sdpa and not output_attentions and head_mask is None: + # output_attentions=True & head_mask can not be supported when using SDPA, fall back to + # the manual implementation that requires a 4D causal mask in all cases. + attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask, input_shape, inputs_embeds, past_key_values_length + ) + else: + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, input_shape, inputs_embeds, past_key_values_length + ) + + hidden_states = inputs_embeds + positions + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_cross_attentions = None + next_decoder_cache = () if use_cache else None + + for idx, decoder_layer in enumerate(self.layers): + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + if output_hidden_states: + all_hidden_states += (hidden_states,) + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: + continue + + past_key_value = past_key_values[idx] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + attention_mask, + head_mask[idx] if head_mask is not None else None, + None, + output_attentions, + use_cache, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + layer_head_mask=(head_mask[idx] if head_mask is not None else None), + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + hidden_states = self.layer_norm(hidden_states) + + next_cache = next_decoder_cache if use_cache else None + + if not return_dict: + return tuple( + v + for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attentions, + ) + + +@add_start_docstrings( + """BioGPT Model with a `language modeling` head on top for CLM fine-tuning.""", BIOGPT_START_DOCSTRING +) +class BioGptForCausalLM(BioGptPreTrainedModel, GenerationMixin): + _tied_weights_keys = ["output_projection.weight"] + + def __init__(self, config): + super().__init__(config) + + self.biogpt = BioGptModel(config) + self.output_projection = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.output_projection + + def set_output_embeddings(self, new_embeddings): + self.output_projection = new_embeddings + + @add_start_docstrings_to_model_forward(BIOGPT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=CausalLMOutputWithCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, + ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set + `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` + are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.biogpt( + input_ids, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + prediction_scores = self.output_projection(sequence_output) + + lm_loss = None + if labels is not None: + lm_loss = self.loss_function( + prediction_scores, + labels, + vocab_size=self.config.vocab_size, + **kwargs, + ) + + if not return_dict: + output = (prediction_scores,) + outputs[1:] + return ((lm_loss,) + output) if lm_loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=lm_loss, + logits=prediction_scores, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past + + +@add_start_docstrings( + """ + BioGPT Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for + Named-Entity-Recognition (NER) tasks. + """, + BIOGPT_START_DOCSTRING, +) +class BioGptForTokenClassification(BioGptPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.biogpt = BioGptModel(config) + if hasattr(config, "classifier_dropout") and config.classifier_dropout is not None: + classifier_dropout = config.classifier_dropout + else: + classifier_dropout = config.hidden_dropout_prob + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + self.post_init() + + @add_start_docstrings_to_model_forward(BIOGPT_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, TokenClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.biogpt( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = transformer_outputs[0] + hidden_states = self.dropout(hidden_states) + logits = self.classifier(hidden_states) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + # Only keep active parts of the loss + if attention_mask is not None: + active_loss = attention_mask.view(-1) == 1 + active_logits = logits.view(-1, self.num_labels) + active_labels = torch.where( + active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels) + ) + loss = loss_fct(active_logits, active_labels) + else: + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + transformer_outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + +@add_start_docstrings( + """ + The BioGpt Model transformer with a sequence classification head on top (linear layer). + + [`BioGptForSequenceClassification`] uses the last token in order to do the classification, as other causal models + (e.g. GPT-2) do. + + Since it does classification on the last token, it is required to know the position of the last token. If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). + """, + BIOGPT_START_DOCSTRING, +) +class BioGptForSequenceClassification(BioGptPreTrainedModel): + def __init__(self, config: BioGptConfig): + super().__init__(config) + self.num_labels = config.num_labels + self.biogpt = BioGptModel(config) + self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(BIOGPT_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=SequenceClassifierOutputWithPast, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.biogpt( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size, sequence_length = input_ids.shape[:2] + else: + batch_size, sequence_length = inputs_embeds.shape[:2] + + if self.config.pad_token_id is None: + sequence_length = -1 + else: + if input_ids is not None: + sequence_length = (torch.ne(input_ids, self.config.pad_token_id).sum(-1) - 1).to(logits.device) + else: + sequence_length = -1 + logger.warning_once( + f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be " + "unexpected if using padding tokens in conjunction with `inputs_embeds.`" + ) + + pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_length] + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + def get_input_embeddings(self): + return self.biogpt.embed_tokens + + def set_input_embeddings(self, value): + self.biogpt.embed_tokens = value + + +__all__ = [ + "BioGptForCausalLM", + "BioGptForTokenClassification", + "BioGptForSequenceClassification", + "BioGptModel", + "BioGptPreTrainedModel", +] diff --git a/docs/transformers/build/lib/transformers/models/biogpt/tokenization_biogpt.py b/docs/transformers/build/lib/transformers/models/biogpt/tokenization_biogpt.py new file mode 100644 index 0000000000000000000000000000000000000000..a898976d985f582770602b3a5731081be5f59a0f --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/biogpt/tokenization_biogpt.py @@ -0,0 +1,361 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Team and Microsoft Research AI4Science. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for BioGPT.""" + +import json +import os +from typing import List, Optional, Tuple + +from ...tokenization_utils import PreTrainedTokenizer +from ...utils import logging + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = { + "vocab_file": "vocab.json", + "merges_file": "merges.txt", +} + + +def get_pairs(word): + """ + Return set of symbol pairs in a word. word is represented as tuple of symbols (symbols being variable-length + strings) + """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + + +class BioGptTokenizer(PreTrainedTokenizer): + """ + Construct an FAIRSEQ Transformer tokenizer. Moses tokenization followed by Byte-Pair Encoding. + + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + Path to the vocabulary file. + merges_file (`str`): + Merges file. + unk_token (`str`, *optional*, defaults to `""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + bos_token (`str`, *optional*, defaults to `""`): + The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. + + + + When building a sequence using special tokens, this is not the token that is used for the beginning of + sequence. The token used is the `cls_token`. + + + + eos_token (`str`, *optional*, defaults to `""`): + The end of sequence token. + + + + When building a sequence using special tokens, this is not the token that is used for the end of sequence. + The token used is the `sep_token`. + + + + sep_token (`str`, *optional*, defaults to `""`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + pad_token (`str`, *optional*, defaults to `""`): + The token used for padding, for example when batching sequences of different lengths. + """ + + vocab_files_names = VOCAB_FILES_NAMES + model_input_names = ["input_ids", "attention_mask"] + + def __init__( + self, + vocab_file, + merges_file, + unk_token="", + bos_token="", + eos_token="", + sep_token="", + pad_token="", + **kwargs, + ): + try: + import sacremoses + except ImportError: + raise ImportError( + "You need to install sacremoses to use BioGptTokenizer. " + "See https://pypi.org/project/sacremoses/ for installation." + ) + + self.lang = "en" + self.sm = sacremoses + # cache of sm.MosesTokenizer instance + self.cache_moses_tokenizer = {} + self.cache_moses_detokenizer = {} + + """ Initialisation""" + with open(vocab_file, encoding="utf-8") as vocab_handle: + self.encoder = json.load(vocab_handle) + self.decoder = {v: k for k, v in self.encoder.items()} + with open(merges_file, encoding="utf-8") as merges_handle: + merges = merges_handle.read().split("\n")[:-1] + merges = [tuple(merge.split()[:2]) for merge in merges] + self.bpe_ranks = dict(zip(merges, range(len(merges)))) + self.cache = {} + + super().__init__( + bos_token=bos_token, + eos_token=eos_token, + sep_token=sep_token, + unk_token=unk_token, + pad_token=pad_token, + **kwargs, + ) + + @property + def vocab_size(self): + """Returns vocab size""" + return len(self.encoder) + + def get_vocab(self): + return dict(self.encoder, **self.added_tokens_encoder) + + def moses_tokenize(self, text, lang): + if lang not in self.cache_moses_tokenizer: + moses_tokenizer = self.sm.MosesTokenizer(lang=lang) + self.cache_moses_tokenizer[lang] = moses_tokenizer + return self.cache_moses_tokenizer[lang].tokenize( + text, aggressive_dash_splits=True, return_str=False, escape=True + ) + + def moses_detokenize(self, tokens, lang): + if lang not in self.cache_moses_detokenizer: + moses_detokenizer = self.sm.MosesDetokenizer(lang=lang) + self.cache_moses_detokenizer[lang] = moses_detokenizer + return self.cache_moses_detokenizer[lang].detokenize(tokens) + + def bpe(self, token): + word = tuple(token[:-1]) + (token[-1] + "",) + if token in self.cache: + return self.cache[token] + pairs = get_pairs(word) + + if not pairs: + return token + "" + + while True: + bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + except ValueError: + new_word.extend(word[i:]) + break + else: + new_word.extend(word[i:j]) + i = j + + if word[i] == first and i < len(word) - 1 and word[i + 1] == second: + new_word.append(first + second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = " ".join(word) + if word == "\n ": + word = "\n" + self.cache[token] = word + return word + + def _tokenize(self, text, bypass_tokenizer=False): + """Returns a tokenized string.""" + if bypass_tokenizer: + text = text.split() + else: + text = self.moses_tokenize(text, self.lang) + + split_tokens = [] + for token in text: + if token: + split_tokens.extend(list(self.bpe(token).split(" "))) + + return split_tokens + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.encoder.get(token, self.encoder.get(self.unk_token)) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.decoder.get(index, self.unk_token) + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + # remove BPE + tokens = [t.replace(" ", "").replace("", " ") for t in tokens] + tokens = "".join(tokens).split() + # detokenize + text = self.moses_detokenize(tokens, self.lang) + return text + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BioGPT sequence has the following format: + + - single sequence: ` X ` + - pair of sequences: ` A B ` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + if token_ids_1 is None: + return [self.sep_token_id] + token_ids_0 + sep = [self.sep_token_id] + return sep + token_ids_0 + sep + token_ids_1 + + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + # no bos used in fairseq + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + return [1] + ([0] * len(token_ids_0)) + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. A FAIRSEQ + Transformer sequence pair mask has the following format: + + ``` + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + ``` + + If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s). + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s). + """ + sep = [self.sep_token_id] + + # no bos used in fairseq + if token_ids_1 is None: + return len(token_ids_0 + sep) * [0] + return len(token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + merge_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"] + ) + + with open(vocab_file, "w", encoding="utf-8") as f: + f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n") + + index = 0 + with open(merge_file, "w", encoding="utf-8") as writer: + for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning( + f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive." + " Please check that the tokenizer is not corrupted!" + ) + index = token_index + writer.write(" ".join(bpe_tokens) + "\n") + index += 1 + + return vocab_file, merge_file + + def __getstate__(self): + state = self.__dict__.copy() + state["sm"] = None + return state + + def __setstate__(self, d): + self.__dict__ = d + + try: + import sacremoses + except ImportError: + raise ImportError( + "You need to install sacremoses to use XLMTokenizer. " + "See https://pypi.org/project/sacremoses/ for installation." + ) + + self.sm = sacremoses + + +__all__ = ["BioGptTokenizer"] diff --git a/docs/transformers/build/lib/transformers/models/bit/__init__.py b/docs/transformers/build/lib/transformers/models/bit/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..edfeb4dbe75bb53c011719d6c550b245ac814b28 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bit/__init__.py @@ -0,0 +1,29 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_bit import * + from .image_processing_bit import * + from .image_processing_bit_fast import * + from .modeling_bit import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/docs/transformers/build/lib/transformers/models/bit/configuration_bit.py b/docs/transformers/build/lib/transformers/models/bit/configuration_bit.py new file mode 100644 index 0000000000000000000000000000000000000000..238749f1fbe70ff2decedacf7604de603d53f07c --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bit/configuration_bit.py @@ -0,0 +1,136 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""BiT model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging +from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices + + +logger = logging.get_logger(__name__) + + +class BitConfig(BackboneConfigMixin, PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`BitModel`]. It is used to instantiate an BiT + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to that of the BiT + [google/bit-50](https://huggingface.co/google/bit-50) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + num_channels (`int`, *optional*, defaults to 3): + The number of input channels. + embedding_size (`int`, *optional*, defaults to 64): + Dimensionality (hidden size) for the embedding layer. + hidden_sizes (`List[int]`, *optional*, defaults to `[256, 512, 1024, 2048]`): + Dimensionality (hidden size) at each stage. + depths (`List[int]`, *optional*, defaults to `[3, 4, 6, 3]`): + Depth (number of layers) for each stage. + layer_type (`str`, *optional*, defaults to `"preactivation"`): + The layer to use, it can be either `"preactivation"` or `"bottleneck"`. + hidden_act (`str`, *optional*, defaults to `"relu"`): + The non-linear activation function in each block. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` + are supported. + global_padding (`str`, *optional*): + Padding strategy to use for the convolutional layers. Can be either `"valid"`, `"same"`, or `None`. + num_groups (`int`, *optional*, defaults to 32): + Number of groups used for the `BitGroupNormActivation` layers. + drop_path_rate (`float`, *optional*, defaults to 0.0): + The drop path rate for the stochastic depth. + embedding_dynamic_padding (`bool`, *optional*, defaults to `False`): + Whether or not to make use of dynamic padding for the embedding layer. + output_stride (`int`, *optional*, defaults to 32): + The output stride of the model. + width_factor (`int`, *optional*, defaults to 1): + The width factor for the model. + out_features (`List[str]`, *optional*): + If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc. + (depending on how many stages the model has). If unset and `out_indices` is set, will default to the + corresponding stages. If unset and `out_indices` is unset, will default to the last stage. Must be in the + same order as defined in the `stage_names` attribute. + out_indices (`List[int]`, *optional*): + If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how + many stages the model has). If unset and `out_features` is set, will default to the corresponding stages. + If unset and `out_features` is unset, will default to the last stage. Must be in the + same order as defined in the `stage_names` attribute. + + Example: + ```python + >>> from transformers import BitConfig, BitModel + + >>> # Initializing a BiT bit-50 style configuration + >>> configuration = BitConfig() + + >>> # Initializing a model (with random weights) from the bit-50 style configuration + >>> model = BitModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ``` + """ + + model_type = "bit" + layer_types = ["preactivation", "bottleneck"] + supported_padding = ["SAME", "VALID"] + + def __init__( + self, + num_channels=3, + embedding_size=64, + hidden_sizes=[256, 512, 1024, 2048], + depths=[3, 4, 6, 3], + layer_type="preactivation", + hidden_act="relu", + global_padding=None, + num_groups=32, + drop_path_rate=0.0, + embedding_dynamic_padding=False, + output_stride=32, + width_factor=1, + out_features=None, + out_indices=None, + **kwargs, + ): + super().__init__(**kwargs) + if layer_type not in self.layer_types: + raise ValueError(f"layer_type={layer_type} is not one of {','.join(self.layer_types)}") + if global_padding is not None: + if global_padding.upper() in self.supported_padding: + global_padding = global_padding.upper() + else: + raise ValueError(f"Padding strategy {global_padding} not supported") + self.num_channels = num_channels + self.embedding_size = embedding_size + self.hidden_sizes = hidden_sizes + self.depths = depths + self.layer_type = layer_type + self.hidden_act = hidden_act + self.global_padding = global_padding + self.num_groups = num_groups + self.drop_path_rate = drop_path_rate + self.embedding_dynamic_padding = embedding_dynamic_padding + self.output_stride = output_stride + self.width_factor = width_factor + + self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)] + self._out_features, self._out_indices = get_aligned_output_features_output_indices( + out_features=out_features, out_indices=out_indices, stage_names=self.stage_names + ) + + +__all__ = ["BitConfig"] diff --git a/docs/transformers/build/lib/transformers/models/bit/convert_bit_to_pytorch.py b/docs/transformers/build/lib/transformers/models/bit/convert_bit_to_pytorch.py new file mode 100644 index 0000000000000000000000000000000000000000..abc24290ab26e57f0a2962003c1cd09d7bddb9ff --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bit/convert_bit_to_pytorch.py @@ -0,0 +1,177 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert BiT checkpoints from the timm library.""" + +import argparse +import json +from pathlib import Path + +import requests +import torch +from huggingface_hub import hf_hub_download +from PIL import Image +from timm import create_model +from timm.data import resolve_data_config +from timm.data.transforms_factory import create_transform + +from transformers import BitConfig, BitForImageClassification, BitImageProcessor +from transformers.image_utils import PILImageResampling +from transformers.utils import logging + + +logging.set_verbosity_info() +logger = logging.get_logger(__name__) + + +def get_config(model_name): + repo_id = "huggingface/label-files" + filename = "imagenet-1k-id2label.json" + id2label = json.load(open(hf_hub_download(repo_id, filename, repo_type="dataset"), "r")) + id2label = {int(k): v for k, v in id2label.items()} + label2id = {v: k for k, v in id2label.items()} + + conv_layer = "std_conv" if "bit" in model_name else False + + # note that when using BiT as backbone for ViT-hybrid checkpoints, + # one needs to additionally set config.layer_type = "bottleneck", config.stem_type = "same", + # config.conv_layer = "std_conv_same" + config = BitConfig( + conv_layer=conv_layer, + num_labels=1000, + id2label=id2label, + label2id=label2id, + ) + + return config + + +def rename_key(name): + if "stem.conv" in name: + name = name.replace("stem.conv", "bit.embedder.convolution") + if "blocks" in name: + name = name.replace("blocks", "layers") + if "head.fc" in name: + name = name.replace("head.fc", "classifier.1") + if name.startswith("norm"): + name = "bit." + name + if "bit" not in name and "classifier" not in name: + name = "bit.encoder." + name + + return name + + +# We will verify our results on an image of cute cats +def prepare_img(): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + im = Image.open(requests.get(url, stream=True).raw) + return im + + +@torch.no_grad() +def convert_bit_checkpoint(model_name, pytorch_dump_folder_path, push_to_hub=False): + """ + Copy/paste/tweak model's weights to our BiT structure. + """ + + # define default BiT configuration + config = get_config(model_name) + + # load original model from timm + timm_model = create_model(model_name, pretrained=True) + timm_model.eval() + + # load state_dict of original model + state_dict = timm_model.state_dict() + for key in state_dict.copy().keys(): + val = state_dict.pop(key) + state_dict[rename_key(key)] = val.squeeze() if "head" in key else val + + # load HuggingFace model + model = BitForImageClassification(config) + model.eval() + model.load_state_dict(state_dict) + + # create image processor + transform = create_transform(**resolve_data_config({}, model=timm_model)) + timm_transforms = transform.transforms + + pillow_resamplings = { + "bilinear": PILImageResampling.BILINEAR, + "bicubic": PILImageResampling.BICUBIC, + "nearest": PILImageResampling.NEAREST, + } + + processor = BitImageProcessor( + do_resize=True, + size={"shortest_edge": timm_transforms[0].size}, + resample=pillow_resamplings[timm_transforms[0].interpolation.value], + do_center_crop=True, + crop_size={"height": timm_transforms[1].size[0], "width": timm_transforms[1].size[1]}, + do_normalize=True, + image_mean=timm_transforms[-1].mean.tolist(), + image_std=timm_transforms[-1].std.tolist(), + ) + + image = prepare_img() + timm_pixel_values = transform(image).unsqueeze(0) + pixel_values = processor(image, return_tensors="pt").pixel_values + + # verify pixel values + assert torch.allclose(timm_pixel_values, pixel_values) + + # verify logits + with torch.no_grad(): + outputs = model(pixel_values) + logits = outputs.logits + + print("Logits:", logits[0, :3]) + print("Predicted class:", model.config.id2label[logits.argmax(-1).item()]) + timm_logits = timm_model(pixel_values) + assert timm_logits.shape == outputs.logits.shape + assert torch.allclose(timm_logits, outputs.logits, atol=1e-3) + print("Looks ok!") + + if pytorch_dump_folder_path is not None: + Path(pytorch_dump_folder_path).mkdir(exist_ok=True) + print(f"Saving model {model_name} and processor to {pytorch_dump_folder_path}") + model.save_pretrained(pytorch_dump_folder_path) + processor.save_pretrained(pytorch_dump_folder_path) + + if push_to_hub: + print(f"Pushing model {model_name} and processor to the hub") + model.push_to_hub(f"ybelkada/{model_name}") + processor.push_to_hub(f"ybelkada/{model_name}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument( + "--model_name", + default="resnetv2_50x1_bitm", + type=str, + help="Name of the BiT timm model you'd like to convert.", + ) + parser.add_argument( + "--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model directory." + ) + parser.add_argument( + "--push_to_hub", + action="store_true", + help="Whether to push the model to the hub.", + ) + + args = parser.parse_args() + convert_bit_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/docs/transformers/build/lib/transformers/models/bit/image_processing_bit.py b/docs/transformers/build/lib/transformers/models/bit/image_processing_bit.py new file mode 100644 index 0000000000000000000000000000000000000000..2b1f307a29fb091ec028fdd01f0bcd0851be73d0 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bit/image_processing_bit.py @@ -0,0 +1,324 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Image processor class for BiT.""" + +from typing import Dict, List, Optional, Union + +import numpy as np + +from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ...image_transforms import ( + convert_to_rgb, + get_resize_output_image_size, + resize, + to_channel_dimension_format, +) +from ...image_utils import ( + OPENAI_CLIP_MEAN, + OPENAI_CLIP_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + infer_channel_dimension_format, + is_scaled_image, + make_list_of_images, + to_numpy_array, + valid_images, + validate_preprocess_arguments, +) +from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging + + +logger = logging.get_logger(__name__) + + +if is_vision_available(): + import PIL + + +class BitImageProcessor(BaseImageProcessor): + r""" + Constructs a BiT image processor. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by + `do_resize` in the `preprocess` method. + size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 224}`): + Size of the image after resizing. The shortest edge of the image is resized to size["shortest_edge"], with + the longest edge resized to keep the input aspect ratio. Can be overridden by `size` in the `preprocess` + method. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method. + do_center_crop (`bool`, *optional*, defaults to `True`): + Whether to center crop the image to the specified `crop_size`. Can be overridden by `do_center_crop` in the + `preprocess` method. + crop_size (`Dict[str, int]` *optional*, defaults to 224): + Size of the output image after applying `center_crop`. Can be overridden by `crop_size` in the `preprocess` + method. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in + the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess` + method. + do_normalize: + Whether to normalize the image. Can be overridden by `do_normalize` in the `preprocess` method. + image_mean (`float` or `List[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`): + Mean to use if normalizing the image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `List[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`): + Standard deviation to use if normalizing the image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. + Can be overridden by the `image_std` parameter in the `preprocess` method. + do_convert_rgb (`bool`, *optional*, defaults to `True`): + Whether to convert the image to RGB. + """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize: bool = True, + size: Dict[str, int] = None, + resample: PILImageResampling = PILImageResampling.BICUBIC, + do_center_crop: bool = True, + crop_size: Dict[str, int] = None, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: bool = True, + **kwargs, + ) -> None: + super().__init__(**kwargs) + size = size if size is not None else {"shortest_edge": 224} + size = get_size_dict(size, default_to_square=False) + crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224} + crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size") + + self.do_resize = do_resize + self.size = size + self.resample = resample + self.do_center_crop = do_center_crop + self.crop_size = crop_size + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN + self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD + self.do_convert_rgb = do_convert_rgb + + # Copied from transformers.models.clip.image_processing_clip.CLIPImageProcessor.resize + def resize( + self, + image: np.ndarray, + size: Dict[str, int], + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Resize an image. The shortest edge of the image is resized to size["shortest_edge"], with the longest edge + resized to keep the input aspect ratio. + + Args: + image (`np.ndarray`): + Image to resize. + size (`Dict[str, int]`): + Size of the output image. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + Resampling filter to use when resiizing the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred. + """ + default_to_square = True + if "shortest_edge" in size: + size = size["shortest_edge"] + default_to_square = False + elif "height" in size and "width" in size: + size = (size["height"], size["width"]) + else: + raise ValueError("Size must contain either 'shortest_edge' or 'height' and 'width'.") + + output_size = get_resize_output_image_size( + image, + size=size, + default_to_square=default_to_square, + input_data_format=input_data_format, + ) + return resize( + image, + size=output_size, + resample=resample, + data_format=data_format, + input_data_format=input_data_format, + **kwargs, + ) + + @filter_out_non_signature_kwargs() + def preprocess( + self, + images: ImageInput, + do_resize: Optional[bool] = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_center_crop: Optional[bool] = None, + crop_size: Optional[int] = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: Optional[bool] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> PIL.Image.Image: + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If + passing in images with pixel values between 0 and 1, set `do_rescale=False`. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to `self.size`): + Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with + the longest edge resized to keep the input aspect ratio. + resample (`int`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only + has an effect if `do_resize` is set to `True`. + do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`): + Whether to center crop the image. + crop_size (`Dict[str, int]`, *optional*, defaults to `self.crop_size`): + Size of the center crop. Only has an effect if `do_center_crop` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): + Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`. + image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to + `True`. + do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): + Whether to convert the image to RGB. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + """ + do_resize = do_resize if do_resize is not None else self.do_resize + size = size if size is not None else self.size + size = get_size_dict(size, param_name="size", default_to_square=False) + resample = resample if resample is not None else self.resample + do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop + crop_size = crop_size if crop_size is not None else self.crop_size + crop_size = get_size_dict(crop_size, param_name="crop_size", default_to_square=True) + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb + + images = make_list_of_images(images) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + + validate_preprocess_arguments( + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_center_crop=do_center_crop, + crop_size=crop_size, + do_resize=do_resize, + size=size, + resample=resample, + ) + + # PIL RGBA images are converted to RGB + if do_convert_rgb: + images = [convert_to_rgb(image) for image in images] + + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + + if do_rescale and is_scaled_image(images[0]): + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + + if input_data_format is None: + # We assume that all images have the same channel dimension format. + input_data_format = infer_channel_dimension_format(images[0]) + + all_images = [] + for image in images: + if do_resize: + image = self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format) + + if do_center_crop: + image = self.center_crop(image=image, size=crop_size, input_data_format=input_data_format) + + if do_rescale: + image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) + + if do_normalize: + image = self.normalize( + image=image, mean=image_mean, std=image_std, input_data_format=input_data_format + ) + + all_images.append(image) + + images = [ + to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) + for image in all_images + ] + + data = {"pixel_values": images} + return BatchFeature(data=data, tensor_type=return_tensors) + + +__all__ = ["BitImageProcessor"] diff --git a/docs/transformers/build/lib/transformers/models/bit/image_processing_bit_fast.py b/docs/transformers/build/lib/transformers/models/bit/image_processing_bit_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..19b3fbfd32b3ae77fd6bda46587ac07bf4319515 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bit/image_processing_bit_fast.py @@ -0,0 +1,44 @@ +# coding=utf-8 +# Copyright 2025 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Fast Image processor class for BiT.""" + +from ...image_processing_utils_fast import ( + BASE_IMAGE_PROCESSOR_FAST_DOCSTRING, + BaseImageProcessorFast, +) +from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, PILImageResampling +from ...utils import add_start_docstrings + + +@add_start_docstrings( + "Constructs a fast Bit image processor.", + BASE_IMAGE_PROCESSOR_FAST_DOCSTRING, +) +class BitImageProcessorFast(BaseImageProcessorFast): + resample = PILImageResampling.BICUBIC + image_mean = OPENAI_CLIP_MEAN + image_std = OPENAI_CLIP_STD + size = {"shortest_edge": 224} + default_to_square = False + crop_size = {"height": 224, "width": 224} + rescale_factor = 1 / 255 + do_resize = True + do_center_crop = True + do_rescale = True + do_normalize = True + do_convert_rgb = True + + +__all__ = ["BitImageProcessorFast"] diff --git a/docs/transformers/build/lib/transformers/models/bit/modeling_bit.py b/docs/transformers/build/lib/transformers/models/bit/modeling_bit.py new file mode 100644 index 0000000000000000000000000000000000000000..d71e3bf94635215b93811afe9eca2fa21c2d5584 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bit/modeling_bit.py @@ -0,0 +1,906 @@ +# coding=utf-8 +# Copyright 2022 Google AI and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch BiT model. Also supports backbone for ViT hybrid.""" + +import collections +import math +from typing import Optional, Tuple + +import numpy as np +import torch +import torch.utils.checkpoint +from torch import Tensor, nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ...activations import ACT2FN +from ...modeling_outputs import ( + BackboneOutput, + BaseModelOutputWithNoAttention, + BaseModelOutputWithPoolingAndNoAttention, + ImageClassifierOutputWithNoAttention, +) +from ...modeling_utils import PreTrainedModel +from ...utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from ...utils.backbone_utils import BackboneMixin +from .configuration_bit import BitConfig + + +logger = logging.get_logger(__name__) + +# General docstring +_CONFIG_FOR_DOC = "BitConfig" + +# Base docstring +_CHECKPOINT_FOR_DOC = "google/bit-50" +_EXPECTED_OUTPUT_SHAPE = [1, 2048, 7, 7] + +# Image classification docstring +_IMAGE_CLASS_CHECKPOINT = "google/bit-50" +_IMAGE_CLASS_EXPECTED_OUTPUT = "tiger cat" + + +def get_padding_value(padding=None, kernel_size=7, stride=1, dilation=1) -> Tuple[Tuple, bool]: + r""" + Utility function to get the tuple padding value given the kernel_size and padding. + + Args: + padding (Union[`str`, `int`], *optional*): + Padding value, can be either `"same"`, `"valid"`. If a different value is provided the default padding from + PyTorch is used. + kernel_size (`int`, *optional*, defaults to 7): + Kernel size of the convolution layers. + stride (`int`, *optional*, defaults to 1): + Stride value of the convolution layers. + dilation (`int`, *optional*, defaults to 1): + Dilation value of the convolution layers. + """ + dynamic = False + if padding is None: + padding = ((stride - 1) + dilation * (kernel_size - 1)) // 2 + return padding, dynamic + + if isinstance(padding, str): + # for any string padding, the padding will be calculated for you, one of three ways + padding = padding.lower() + if padding == "same": + # TF compatible 'SAME' padding, has a performance and GPU memory allocation impact + if stride == 1 and (dilation * (kernel_size - 1)) % 2 == 0: + # static case, no extra overhead + padding = ((stride - 1) + dilation * (kernel_size - 1)) // 2 + else: + # dynamic 'SAME' padding, has runtime/GPU memory overhead + padding = 0 + dynamic = True + elif padding == "valid": + # 'VALID' padding, same as padding=0 + padding = 0 + else: + # Default to PyTorch style 'same'-ish symmetric padding + padding = ((stride - 1) + dilation * (kernel_size - 1)) // 2 + return padding, dynamic + + +class WeightStandardizedConv2d(nn.Conv2d): + """Conv2d with Weight Standardization. Includes TensorFlow compatible SAME padding. Used for ViT Hybrid model. + + Paper: [Micro-Batch Training with Batch-Channel Normalization and Weight + Standardization](https://arxiv.org/abs/1903.10520v2) + """ + + def __init__( + self, + in_channel, + out_channels, + kernel_size, + stride=1, + padding="SAME", + dilation=1, + groups=1, + bias=False, + eps=1e-6, + ): + padding, is_dynamic = get_padding_value(padding, kernel_size, stride=stride, dilation=dilation) + super().__init__( + in_channel, + out_channels, + kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + bias=bias, + ) + if is_dynamic: + self.pad = DynamicPad2d(kernel_size, stride, dilation) + else: + self.pad = None + self.eps = eps + + def forward(self, hidden_state): + if self.pad is not None: + hidden_state = self.pad(hidden_state) + weight = nn.functional.batch_norm( + self.weight.reshape(1, self.out_channels, -1), None, None, training=True, momentum=0.0, eps=self.eps + ).reshape_as(self.weight) + hidden_state = nn.functional.conv2d( + hidden_state, weight, self.bias, self.stride, self.padding, self.dilation, self.groups + ) + return hidden_state + + +class BitGroupNormActivation(nn.GroupNorm): + r""" + A module that combines group normalization with an activation function. + """ + + def __init__(self, config, num_channels, eps=1e-5, affine=True, apply_activation=True): + super(BitGroupNormActivation, self).__init__(config.num_groups, num_channels, eps=eps, affine=affine) + if apply_activation: + self.activation = ACT2FN[config.hidden_act] + else: + self.activation = nn.Identity() + + def forward(self, hidden_state): + hidden_state = nn.functional.group_norm(hidden_state, self.num_groups, self.weight, self.bias, self.eps) + hidden_state = self.activation(hidden_state) + return hidden_state + + +class DynamicPad2d(nn.Module): + r""" + A module that wraps dynamic padding of any input, given the parameters of the convolutional layer and the input + hidden states. + """ + + def __init__(self, kernel_size, stride, dilation, value=0): + super().__init__() + # Safety checkers + if isinstance(kernel_size, int): + kernel_size = (kernel_size, kernel_size) + + if isinstance(stride, int): + stride = (stride, stride) + + if isinstance(dilation, int): + dilation = (dilation, dilation) + + self.kernel_size = kernel_size + self.stride = stride + self.dilation = dilation + self.value = value + + def compute_padding(x, kernel_size, stride, dilation): + return max((math.ceil(x / stride) - 1) * stride + (kernel_size - 1) * dilation + 1 - x, 0) + + self.compute_padding = compute_padding + + def forward(self, input): + # Get width and height + input_height, input_width = input.size()[-2:] + + # Compute the padding values + padding_height = self.compute_padding(input_height, self.kernel_size[0], self.stride[0], self.dilation[0]) + padding_width = self.compute_padding(input_width, self.kernel_size[1], self.stride[1], self.dilation[1]) + + # apply pad + if padding_height > 0 or padding_width > 0: + input = nn.functional.pad( + input, + [ + padding_width // 2, + padding_width - padding_width // 2, + padding_height // 2, + padding_height - padding_height // 2, + ], + value=self.value, + ) + return input + + +class BitMaxPool2d(nn.MaxPool2d): + """Tensorflow like 'SAME' wrapper for 2D max pooling""" + + def __init__( + self, + kernel_size: int, + stride=None, + dilation=1, + ceil_mode=False, + padding=(0, 0), + padding_value=0, + use_dynamic_padding=True, + ): + kernel_size = kernel_size if isinstance(kernel_size, collections.abc.Iterable) else (kernel_size, kernel_size) + stride = stride if isinstance(stride, collections.abc.Iterable) else (stride, stride) + dilation = dilation if isinstance(dilation, collections.abc.Iterable) else (dilation, dilation) + super().__init__(kernel_size, stride, padding, dilation, ceil_mode) + if use_dynamic_padding: + self.pad = DynamicPad2d(kernel_size, stride, dilation, padding_value) + else: + self.pad = nn.Identity() + + def forward(self, hidden_states): + hidden_states = self.pad(hidden_states) + return nn.functional.max_pool2d( + hidden_states, self.kernel_size, self.stride, self.padding, self.dilation, self.ceil_mode + ) + + +class BitEmbeddings(nn.Module): + """ + BiT Embeddings (stem) composed of a single aggressive convolution. + """ + + def __init__(self, config: BitConfig): + super().__init__() + + self.convolution = WeightStandardizedConv2d( + config.num_channels, + config.embedding_size, + kernel_size=7, + stride=2, + eps=1e-8, + padding=config.global_padding, + ) + + self.pooler = BitMaxPool2d(kernel_size=3, stride=2, use_dynamic_padding=config.embedding_dynamic_padding) + + # Use the same padding strategy as convolutional layers + if config.global_padding is not None and config.global_padding.upper() == "SAME": + self.pad = nn.Identity() + else: + self.pad = nn.ConstantPad2d(padding=(1, 1, 1, 1), value=0.0) + + if not config.layer_type == "preactivation": + self.norm = BitGroupNormActivation(config, num_channels=config.embedding_size) + else: + self.norm = nn.Identity() + + self.num_channels = config.num_channels + + def forward(self, pixel_values: Tensor) -> Tensor: + num_channels = pixel_values.shape[1] + if num_channels != self.num_channels: + raise ValueError( + "Make sure that the channel dimension of the pixel values match with the one set in the configuration." + ) + + embedding = self.convolution(pixel_values) + + embedding = self.pad(embedding) + + embedding = self.norm(embedding) + + embedding = self.pooler(embedding) + + return embedding + + +# Copied from transformers.models.convnext.modeling_convnext.drop_path +def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor: + """ + Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks, + however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the + layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the + argument. + """ + if drop_prob == 0.0 or not training: + return input + keep_prob = 1 - drop_prob + shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets + random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device) + random_tensor.floor_() # binarize + output = input.div(keep_prob) * random_tensor + return output + + +# Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->Bit +class BitDropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob: Optional[float] = None) -> None: + super().__init__() + self.drop_prob = drop_prob + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + return drop_path(hidden_states, self.drop_prob, self.training) + + def extra_repr(self) -> str: + return "p={}".format(self.drop_prob) + + +def make_div(value, divisor=8): + min_value = divisor + new_value = max(min_value, int(value + divisor / 2) // divisor * divisor) + if new_value < 0.9 * value: + new_value += divisor + return new_value + + +class BitPreActivationBottleneckLayer(nn.Module): + """Pre-activation (v2) bottleneck block. + Follows the implementation of "Identity Mappings in Deep Residual Networks": + https://github.com/KaimingHe/resnet-1k-layers/blob/master/resnet-pre-act.lua + + Except it puts the stride on 3x3 conv when available. + """ + + def __init__( + self, + config, + in_channels, + out_channels=None, + bottle_ratio=0.25, + stride=1, + dilation=1, + first_dilation=None, + groups=1, + drop_path_rate=0.0, + is_first_layer=False, + ): + super().__init__() + + first_dilation = first_dilation or dilation + + out_channels = out_channels or in_channels + mid_channels = make_div(out_channels * bottle_ratio) + + if is_first_layer: + self.downsample = BitDownsampleConv( + config, + in_channels, + out_channels, + stride=stride, + preact=True, + ) + else: + self.downsample = None + + self.norm1 = BitGroupNormActivation(config, in_channels) + self.conv1 = WeightStandardizedConv2d(in_channels, mid_channels, 1, eps=1e-8, padding=config.global_padding) + + self.norm2 = BitGroupNormActivation(config, num_channels=mid_channels) + self.conv2 = WeightStandardizedConv2d( + mid_channels, mid_channels, 3, stride=stride, groups=groups, eps=1e-8, padding=config.global_padding + ) + + self.norm3 = BitGroupNormActivation(config, mid_channels) + self.conv3 = WeightStandardizedConv2d(mid_channels, out_channels, 1, eps=1e-8, padding=config.global_padding) + + self.drop_path = BitDropPath(drop_path_rate) if drop_path_rate > 0 else nn.Identity() + + def forward(self, hidden_states): + hidden_states_preact = self.norm1(hidden_states) + + # shortcut branch + shortcut = hidden_states + if self.downsample is not None: + shortcut = self.downsample(hidden_states_preact) + + # residual branch + hidden_states = self.conv1(hidden_states_preact) + hidden_states = self.conv2(self.norm2(hidden_states)) + hidden_states = self.conv3(self.norm3(hidden_states)) + hidden_states = self.drop_path(hidden_states) + return hidden_states + shortcut + + +class BitBottleneckLayer(nn.Module): + """Non Pre-activation bottleneck block, equivalent to V1.5/V1b bottleneck. Used for ViT Hybrid.""" + + def __init__( + self, + config, + in_channels, + out_channels=None, + bottle_ratio=0.25, + stride=1, + dilation=1, + first_dilation=None, + groups=1, + drop_path_rate=0.0, + is_first_layer=False, + ): + super().__init__() + first_dilation = first_dilation or dilation + + out_channels = out_channels or in_channels + mid_chs = make_div(out_channels * bottle_ratio) + + if is_first_layer: + self.downsample = BitDownsampleConv( + config, + in_channels, + out_channels, + stride=stride, + preact=False, + ) + else: + self.downsample = None + + self.conv1 = WeightStandardizedConv2d(in_channels, mid_chs, 1, eps=1e-8, padding=config.global_padding) + self.norm1 = BitGroupNormActivation(config, num_channels=mid_chs) + self.conv2 = WeightStandardizedConv2d( + mid_chs, + mid_chs, + 3, + stride=stride, + dilation=first_dilation, + groups=groups, + eps=1e-8, + padding=config.global_padding, + ) + self.norm2 = BitGroupNormActivation(config, num_channels=mid_chs) + self.conv3 = WeightStandardizedConv2d(mid_chs, out_channels, 1, eps=1e-8, padding=config.global_padding) + self.norm3 = BitGroupNormActivation(config, num_channels=out_channels, apply_activation=False) + self.drop_path = BitDropPath(drop_path_rate) if drop_path_rate > 0 else nn.Identity() + + self.activation = ACT2FN[config.hidden_act] + + def forward(self, hidden_states): + # shortcut branch + shortcut = hidden_states + if self.downsample is not None: + shortcut = self.downsample(hidden_states) + + # residual + hidden_states = self.conv1(hidden_states) + hidden_states = self.norm1(hidden_states) + + hidden_states = self.conv2(hidden_states) + hidden_states = self.norm2(hidden_states) + + hidden_states = self.conv3(hidden_states) + hidden_states = self.norm3(hidden_states) + + hidden_states = self.drop_path(hidden_states) + hidden_states = self.activation(hidden_states + shortcut) + return hidden_states + + +class BitDownsampleConv(nn.Module): + def __init__( + self, + config, + in_channels, + out_channels, + stride=1, + preact=True, + ): + super().__init__() + self.conv = WeightStandardizedConv2d( + in_channels, out_channels, 1, stride=stride, eps=1e-8, padding=config.global_padding + ) + self.norm = ( + nn.Identity() + if preact + else BitGroupNormActivation(config, num_channels=out_channels, apply_activation=False) + ) + + def forward(self, x): + return self.norm(self.conv(x)) + + +class BitStage(nn.Module): + """ + A ResNet v2 stage composed by stacked layers. + """ + + def __init__( + self, + config, + in_channels, + out_channels, + stride, + dilation, + depth, + bottle_ratio=0.25, + layer_dropout=None, + ): + super().__init__() + + first_dilation = 1 if dilation in (1, 2) else 2 + + # Get the layer type + if config.layer_type == "bottleneck": + layer_cls = BitBottleneckLayer + else: + layer_cls = BitPreActivationBottleneckLayer + + prev_chs = in_channels + self.layers = nn.Sequential() + for layer_idx in range(depth): + # Get the current hyper-parameters + stride, drop_path_rate, is_first_layer = self._get_updated_hyperparameters( + layer_idx, stride, layer_dropout + ) + + self.layers.add_module( + str(layer_idx), + layer_cls( + config, + prev_chs, + out_channels, + stride=stride, + dilation=dilation, + bottle_ratio=bottle_ratio, + first_dilation=first_dilation, + drop_path_rate=drop_path_rate, + is_first_layer=is_first_layer, + ), + ) + prev_chs = out_channels + first_dilation = dilation + + def _get_updated_hyperparameters(self, layer_idx, stride, layer_dropout): + r""" + Get the new hyper-parameters with respect to the previous ones and the index of the current layer. + """ + if layer_dropout: + drop_path_rate = layer_dropout[layer_idx] + else: + drop_path_rate = 0.0 + + if layer_idx != 0: + stride = 1 + + is_first_layer = layer_idx == 0 + + return stride, drop_path_rate, is_first_layer + + def forward(self, input: Tensor) -> Tensor: + hidden_state = input + for _, layer in enumerate(self.layers): + hidden_state = layer(hidden_state) + return hidden_state + + +class BitEncoder(nn.Module): + def __init__(self, config: BitConfig): + super().__init__() + self.stages = nn.ModuleList([]) + + prev_chs = config.embedding_size + + # These needs to stay hardcoded + current_stride = 4 + dilation = 1 + + layer_dropouts = [ + x.tolist() + for x in torch.Tensor(np.linspace(0, config.drop_path_rate, sum(config.depths))).split(config.depths) + ] + + for stage_idx, (current_depth, current_hidden_size, layer_dropout) in enumerate( + zip(config.depths, config.hidden_sizes, layer_dropouts) + ): + # Get the updated hyper params + out_channels, stride, dilation = self._get_updated_hyperparameters( + stage_idx, current_stride, current_hidden_size, dilation, config + ) + + stage = BitStage( + config, + prev_chs, + out_channels, + stride=stride, + dilation=dilation, + depth=current_depth, + layer_dropout=layer_dropout, + ) + + prev_chs = out_channels + current_stride *= stride + + self.stages.add_module(str(stage_idx), stage) + + def _get_updated_hyperparameters(self, stage_idx, current_stride, current_hidden_size, dilation, config): + out_channels = make_div(current_hidden_size * config.width_factor) + stride = 1 if stage_idx == 0 else 2 + if current_stride >= config.output_stride: + dilation *= stride + stride = 1 + return out_channels, stride, dilation + + def forward( + self, hidden_state: Tensor, output_hidden_states: bool = False, return_dict: bool = True + ) -> BaseModelOutputWithNoAttention: + hidden_states = () if output_hidden_states else None + + for stage_module in self.stages: + if output_hidden_states: + hidden_states = hidden_states + (hidden_state,) + + hidden_state = stage_module(hidden_state) + + if output_hidden_states: + hidden_states = hidden_states + (hidden_state,) + + if not return_dict: + return tuple(v for v in [hidden_state, hidden_states] if v is not None) + + return BaseModelOutputWithNoAttention( + last_hidden_state=hidden_state, + hidden_states=hidden_states, + ) + + +class BitPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = BitConfig + base_model_prefix = "bit" + main_input_name = "pixel_values" + _no_split_modules = ["BitEmbeddings"] + + def _init_weights(self, module): + if isinstance(module, nn.Conv2d): + nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu") + # copied from the `reset_parameters` method of `class Linear(Module)` in `torch`. + elif isinstance(module, nn.Linear): + nn.init.kaiming_uniform_(module.weight, a=math.sqrt(5)) + if module.bias is not None: + fan_in, _ = nn.init._calculate_fan_in_and_fan_out(module.weight) + bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0 + nn.init.uniform_(module.bias, -bound, bound) + elif isinstance(module, (nn.BatchNorm2d, nn.GroupNorm)): + nn.init.constant_(module.weight, 1) + nn.init.constant_(module.bias, 0) + + +BIT_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it + as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`BitConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +BIT_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`BitImageProcessor.__call__`] + for details. + + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare BiT model outputting raw features without any specific head on top.", + BIT_START_DOCSTRING, +) +class BitModel(BitPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.config = config + + self.embedder = BitEmbeddings(config) + + self.encoder = BitEncoder(config) + self.norm = ( + BitGroupNormActivation(config, num_channels=config.hidden_sizes[-1]) + if config.layer_type == "preactivation" + else nn.Identity() + ) + + self.pooler = nn.AdaptiveAvgPool2d((1, 1)) + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(BIT_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutputWithPoolingAndNoAttention, + config_class=_CONFIG_FOR_DOC, + modality="vision", + expected_output=_EXPECTED_OUTPUT_SHAPE, + ) + def forward( + self, pixel_values: Tensor, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None + ) -> BaseModelOutputWithPoolingAndNoAttention: + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + embedding_output = self.embedder(pixel_values) + + encoder_outputs = self.encoder( + embedding_output, output_hidden_states=output_hidden_states, return_dict=return_dict + ) + + last_hidden_state = encoder_outputs[0] + + last_hidden_state = self.norm(last_hidden_state) + + pooled_output = self.pooler(last_hidden_state) + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndNoAttention( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + ) + + +@add_start_docstrings( + """ + BiT Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for + ImageNet. + """, + BIT_START_DOCSTRING, +) +class BitForImageClassification(BitPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.bit = BitModel(config) + # classification head + self.classifier = nn.Sequential( + nn.Flatten(), + nn.Linear(config.hidden_sizes[-1], config.num_labels) if config.num_labels > 0 else nn.Identity(), + ) + # initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(BIT_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_IMAGE_CLASS_CHECKPOINT, + output_type=ImageClassifierOutputWithNoAttention, + config_class=_CONFIG_FOR_DOC, + expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT, + ) + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> ImageClassifierOutputWithNoAttention: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the image classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bit(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict) + + pooled_output = outputs.pooler_output if return_dict else outputs[1] + + logits = self.classifier(pooled_output) + + loss = None + + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + outputs[2:] + return (loss,) + output if loss is not None else output + + return ImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states) + + +@add_start_docstrings( + """ + BiT backbone, to be used with frameworks like DETR and MaskFormer. + """, + BIT_START_DOCSTRING, +) +class BitBackbone(BitPreTrainedModel, BackboneMixin): + def __init__(self, config): + super().__init__(config) + super()._init_backbone(config) + + self.bit = BitModel(config) + self.num_features = [config.embedding_size] + config.hidden_sizes + + # initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(BIT_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=BackboneOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, pixel_values: Tensor, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None + ) -> BackboneOutput: + """ + Returns: + + Examples: + + ```python + >>> from transformers import AutoImageProcessor, AutoBackbone + >>> import torch + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> processor = AutoImageProcessor.from_pretrained("google/bit-50") + >>> model = AutoBackbone.from_pretrained("google/bit-50") + + >>> inputs = processor(image, return_tensors="pt") + >>> outputs = model(**inputs) + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + outputs = self.bit(pixel_values, output_hidden_states=True, return_dict=True) + + hidden_states = outputs.hidden_states + + feature_maps = () + for idx, stage in enumerate(self.stage_names): + if stage in self.out_features: + feature_maps += (hidden_states[idx],) + + if not return_dict: + output = (feature_maps,) + if output_hidden_states: + output += (outputs.hidden_states,) + return output + + return BackboneOutput( + feature_maps=feature_maps, + hidden_states=outputs.hidden_states if output_hidden_states else None, + attentions=None, + ) + + +__all__ = ["BitForImageClassification", "BitModel", "BitPreTrainedModel", "BitBackbone"] diff --git a/docs/transformers/build/lib/transformers/models/blenderbot/__init__.py b/docs/transformers/build/lib/transformers/models/blenderbot/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..76ece6853b381b0853f52022078f9c580deb0f04 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/blenderbot/__init__.py @@ -0,0 +1,31 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_blenderbot import * + from .modeling_blenderbot import * + from .modeling_flax_blenderbot import * + from .modeling_tf_blenderbot import * + from .tokenization_blenderbot import * + from .tokenization_blenderbot_fast import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/docs/transformers/build/lib/transformers/models/blenderbot/configuration_blenderbot.py b/docs/transformers/build/lib/transformers/models/blenderbot/configuration_blenderbot.py new file mode 100644 index 0000000000000000000000000000000000000000..c9f323210e8c477dad96575c24884295051cbac4 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/blenderbot/configuration_blenderbot.py @@ -0,0 +1,395 @@ +# coding=utf-8 +# Copyright 2021 The Facebook, Inc. and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Blenderbot model configuration""" + +from collections import OrderedDict +from typing import Any, Mapping, Optional + +from ... import PreTrainedTokenizer +from ...configuration_utils import PretrainedConfig +from ...file_utils import TensorType, is_torch_available +from ...onnx import OnnxConfig, OnnxConfigWithPast, OnnxSeq2SeqConfigWithPast +from ...onnx.utils import compute_effective_axis_dimension +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class BlenderbotConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`BlenderbotModel`]. It is used to instantiate an + Blenderbot model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the Blenderbot + [facebook/blenderbot-3B](https://huggingface.co/facebook/blenderbot-3B) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 50265): + Vocabulary size of the Blenderbot model. Defines the number of different tokens that can be represented by + the `inputs_ids` passed when calling [`BlenderbotModel`] or [`TFBlenderbotModel`]. + d_model (`int`, *optional*, defaults to 1024): + Dimensionality of the layers and the pooler layer. + encoder_layers (`int`, *optional*, defaults to 12): + Number of encoder layers. + decoder_layers (`int`, *optional*, defaults to 12): + Number of decoder layers. + encoder_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer encoder. + decoder_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer decoder. + decoder_ffn_dim (`int`, *optional*, defaults to 4096): + Dimensionality of the "intermediate" (often named feed-forward) layer in decoder. + encoder_ffn_dim (`int`, *optional*, defaults to 4096): + Dimensionality of the "intermediate" (often named feed-forward) layer in decoder. + activation_function (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + activation_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for activations inside the fully connected layer. + max_position_embeddings (`int`, *optional*, defaults to 128): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + init_std (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + encoder_layerdrop (`float`, *optional*, defaults to 0.0): + The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556) + for more details. + decoder_layerdrop (`float`, *optional*, defaults to 0.0): + The LayerDrop probability for the decoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556) + for more details. + scale_embedding (`bool`, *optional*, defaults to `False`): + Scale embeddings by diving by sqrt(d_model). + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models) + forced_eos_token_id (`int`, *optional*, defaults to 2): + The id of the token to force as the last generated token when `max_length` is reached. Usually set to + `eos_token_id`. + + Example: + + ```python + >>> from transformers import BlenderbotConfig, BlenderbotModel + + >>> # Initializing a Blenderbot facebook/blenderbot-3B style configuration + >>> configuration = BlenderbotConfig() + + >>> # Initializing a model (with random weights) from the facebook/blenderbot-3B style configuration + >>> model = BlenderbotModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "blenderbot" + keys_to_ignore_at_inference = ["past_key_values"] + attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"} + + def __init__( + self, + vocab_size=8008, + max_position_embeddings=128, + encoder_layers=2, + encoder_ffn_dim=10240, + encoder_attention_heads=32, + decoder_layers=24, + decoder_ffn_dim=10240, + decoder_attention_heads=32, + encoder_layerdrop=0.0, + decoder_layerdrop=0.0, + use_cache=True, + is_encoder_decoder=True, + activation_function="gelu", + d_model=2560, + dropout=0.1, + attention_dropout=0.0, + activation_dropout=0.0, + init_std=0.02, + decoder_start_token_id=1, + scale_embedding=False, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + encoder_no_repeat_ngram_size=3, + forced_eos_token_id=2, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.d_model = d_model + self.encoder_ffn_dim = encoder_ffn_dim + self.encoder_layers = encoder_layers + self.encoder_attention_heads = encoder_attention_heads + self.decoder_ffn_dim = decoder_ffn_dim + self.decoder_layers = decoder_layers + self.decoder_attention_heads = decoder_attention_heads + self.dropout = dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.activation_function = activation_function + self.init_std = init_std + self.encoder_layerdrop = encoder_layerdrop + self.decoder_layerdrop = decoder_layerdrop + self.use_cache = use_cache + self.num_hidden_layers = encoder_layers + self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + is_encoder_decoder=is_encoder_decoder, + decoder_start_token_id=decoder_start_token_id, + encoder_no_repeat_ngram_size=encoder_no_repeat_ngram_size, + forced_eos_token_id=forced_eos_token_id, + **kwargs, + ) + + +class BlenderbotOnnxConfig(OnnxSeq2SeqConfigWithPast): + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + if self.task in ["default", "seq2seq-lm"]: + common_inputs = OrderedDict( + [ + ("input_ids", {0: "batch", 1: "encoder_sequence"}), + ("attention_mask", {0: "batch", 1: "encoder_sequence"}), + ] + ) + if self.use_past: + common_inputs["decoder_input_ids"] = {0: "batch"} + common_inputs["decoder_attention_mask"] = {0: "batch", 1: "past_decoder_sequence + sequence"} + else: + common_inputs["decoder_input_ids"] = {0: "batch", 1: "decoder_sequence"} + common_inputs["decoder_attention_mask"] = {0: "batch", 1: "decoder_sequence"} + if self.use_past: + self.fill_with_past_key_values_(common_inputs, direction="inputs") + elif self.task == "causal-lm": + common_inputs = OrderedDict( + [ + ("input_ids", {0: "batch", 1: "encoder_sequence"}), + ("attention_mask", {0: "batch", 1: "encoder_sequence"}), + ] + ) + if self.use_past: + _, num_decoder_layers = self.num_layers + for i in range(num_decoder_layers): + common_inputs[f"past_key_values.{i}.key"] = {0: "batch", 2: "past_sequence + sequence"} + common_inputs[f"past_key_values.{i}.value"] = {0: "batch", 2: "past_sequence + sequence"} + else: + common_inputs = OrderedDict( + [ + ("input_ids", {0: "batch", 1: "encoder_sequence"}), + ("attention_mask", {0: "batch", 1: "encoder_sequence"}), + ("decoder_input_ids", {0: "batch", 1: "decoder_sequence"}), + ("decoder_attention_mask", {0: "batch", 1: "decoder_sequence"}), + ] + ) + + return common_inputs + + @property + # Copied from transformers.models.bart.configuration_bart.BartOnnxConfig.outputs + def outputs(self) -> Mapping[str, Mapping[int, str]]: + if self.task in ["default", "seq2seq-lm"]: + common_outputs = super().outputs + else: + common_outputs = super(OnnxConfigWithPast, self).outputs + if self.use_past: + num_encoder_layers, _ = self.num_layers + for i in range(num_encoder_layers): + common_outputs[f"present.{i}.key"] = {0: "batch", 2: "past_sequence + sequence"} + common_outputs[f"present.{i}.value"] = {0: "batch", 2: "past_sequence + sequence"} + return common_outputs + + def _generate_dummy_inputs_for_default_and_seq2seq_lm( + self, + tokenizer: PreTrainedTokenizer, + batch_size: int = -1, + seq_length: int = -1, + is_pair: bool = False, + framework: Optional[TensorType] = None, + ) -> Mapping[str, Any]: + encoder_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering( + tokenizer, batch_size, seq_length, is_pair, framework + ) + # Generate decoder inputs + decoder_seq_length = seq_length if not self.use_past else 1 + decoder_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering( + tokenizer, batch_size, decoder_seq_length, is_pair, framework + ) + decoder_inputs = {f"decoder_{name}": tensor for name, tensor in decoder_inputs.items()} + common_inputs = dict(**encoder_inputs, **decoder_inputs) + + if self.use_past: + if not is_torch_available(): + raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.") + else: + import torch + batch, encoder_seq_length = common_inputs["input_ids"].shape + decoder_seq_length = common_inputs["decoder_input_ids"].shape[1] + num_encoder_attention_heads, num_decoder_attention_heads = self.num_attention_heads + encoder_shape = ( + batch, + num_encoder_attention_heads, + encoder_seq_length, + self._config.hidden_size // num_encoder_attention_heads, + ) + decoder_past_length = decoder_seq_length + decoder_shape = ( + batch, + num_decoder_attention_heads, + decoder_past_length, + self._config.hidden_size // num_decoder_attention_heads, + ) + common_inputs["decoder_attention_mask"] = torch.cat( + [common_inputs["decoder_attention_mask"], torch.ones(batch, decoder_past_length)], dim=1 + ) + common_inputs["past_key_values"] = [] + _, num_decoder_layers = self.num_layers + + for _ in range(num_decoder_layers): + common_inputs["past_key_values"].append( + ( + torch.zeros(decoder_shape), + torch.zeros(decoder_shape), + torch.zeros(encoder_shape), + torch.zeros(encoder_shape), + ) + ) + return common_inputs + + def _generate_dummy_inputs_for_causal_lm( + self, + tokenizer: PreTrainedTokenizer, + batch_size: int = -1, + seq_length: int = -1, + is_pair: bool = False, + framework: Optional[TensorType] = None, + ) -> Mapping[str, Any]: + common_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering( + tokenizer, batch_size, seq_length, is_pair, framework + ) + + if self.use_past: + if not is_torch_available(): + raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.") + else: + import torch + batch, seqlen = common_inputs["input_ids"].shape + past_key_values_length = seqlen + _, num_decoder_layers = self.num_layers + num_encoder_attention_heads, _ = self.num_attention_heads + past_shape = ( + batch, + num_encoder_attention_heads, + past_key_values_length, + self._config.hidden_size // num_encoder_attention_heads, + ) + mask_dtype = common_inputs["attention_mask"].dtype + common_inputs["attention_mask"] = torch.cat( + [common_inputs["attention_mask"], torch.ones(batch, past_key_values_length, dtype=mask_dtype)], dim=1 + ) + common_inputs["past_key_values"] = [ + (torch.zeros(past_shape), torch.zeros(past_shape)) for _ in range(num_decoder_layers) + ] + return common_inputs + + # Copied from transformers.models.bart.configuration_bart.BartOnnxConfig._generate_dummy_inputs_for_sequence_classification_and_question_answering + def _generate_dummy_inputs_for_sequence_classification_and_question_answering( + self, + tokenizer: PreTrainedTokenizer, + batch_size: int = -1, + seq_length: int = -1, + is_pair: bool = False, + framework: Optional[TensorType] = None, + ) -> Mapping[str, Any]: + # Copied from OnnxConfig.generate_dummy_inputs + # Did not use super(OnnxConfigWithPast, self).generate_dummy_inputs for code clarity. + # If dynamic axis (-1) we forward with a fixed dimension of 2 samples to avoid optimizations made by ONNX + batch_size = compute_effective_axis_dimension( + batch_size, fixed_dimension=OnnxConfig.default_fixed_batch, num_token_to_add=0 + ) + + # If dynamic axis (-1) we forward with a fixed dimension of 8 tokens to avoid optimizations made by ONNX + token_to_add = tokenizer.num_special_tokens_to_add(is_pair) + seq_length = compute_effective_axis_dimension( + seq_length, fixed_dimension=OnnxConfig.default_fixed_sequence, num_token_to_add=token_to_add + ) + + # Generate dummy inputs according to compute batch and sequence + dummy_input = [" ".join([tokenizer.unk_token]) * seq_length] * batch_size + common_inputs = dict(tokenizer(dummy_input, return_tensors=framework)) + return common_inputs + + # Copied from transformers.models.bart.configuration_bart.BartOnnxConfig.generate_dummy_inputs + def generate_dummy_inputs( + self, + tokenizer: PreTrainedTokenizer, + batch_size: int = -1, + seq_length: int = -1, + is_pair: bool = False, + framework: Optional[TensorType] = None, + ) -> Mapping[str, Any]: + if self.task in ["default", "seq2seq-lm"]: + common_inputs = self._generate_dummy_inputs_for_default_and_seq2seq_lm( + tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework + ) + + elif self.task == "causal-lm": + common_inputs = self._generate_dummy_inputs_for_causal_lm( + tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework + ) + else: + common_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering( + tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework + ) + + return common_inputs + + # Copied from transformers.models.bart.configuration_bart.BartOnnxConfig._flatten_past_key_values_ + def _flatten_past_key_values_(self, flattened_output, name, idx, t): + if self.task in ["default", "seq2seq-lm"]: + flattened_output = super()._flatten_past_key_values_(flattened_output, name, idx, t) + else: + flattened_output = super(OnnxSeq2SeqConfigWithPast, self)._flatten_past_key_values_( + flattened_output, name, idx, t + ) + + def fill_with_past_key_values_(self, inputs_or_outputs: Mapping[str, Mapping[int, str]], direction: str): + if direction not in ["inputs", "outputs"]: + raise ValueError(f'direction must either be "inputs" or "outputs", but {direction} was given') + + name = "past_key_values" if direction == "inputs" else "present" + _, num_decoder_layers = self.num_layers + + encoder_sequence = "past_encoder_sequence" + decoder_sequence = "past_decoder_sequence" if direction == "inputs" else "past_decoder_sequence + sequence" + + for i in range(num_decoder_layers): + inputs_or_outputs[f"{name}.{i}.decoder.key"] = {0: "batch", 2: decoder_sequence} + inputs_or_outputs[f"{name}.{i}.decoder.value"] = {0: "batch", 2: decoder_sequence} + inputs_or_outputs[f"{name}.{i}.encoder.key"] = {0: "batch", 2: encoder_sequence} + inputs_or_outputs[f"{name}.{i}.encoder.value"] = {0: "batch", 2: encoder_sequence} + + +__all__ = ["BlenderbotConfig", "BlenderbotOnnxConfig"] diff --git a/docs/transformers/build/lib/transformers/models/blenderbot/convert_blenderbot_original_pytorch_checkpoint_to_pytorch.py b/docs/transformers/build/lib/transformers/models/blenderbot/convert_blenderbot_original_pytorch_checkpoint_to_pytorch.py new file mode 100644 index 0000000000000000000000000000000000000000..d8ce9b056c3da15911b90810f2ed1d7de7d078af --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/blenderbot/convert_blenderbot_original_pytorch_checkpoint_to_pytorch.py @@ -0,0 +1,114 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert Blenderbot checkpoint.""" + +import argparse + +import torch + +from transformers import BlenderbotConfig, BlenderbotForConditionalGeneration +from transformers.utils import logging + + +logging.set_verbosity_info() +logger = logging.get_logger(__name__) + +PATTERNS = [ + ["attention", "attn"], + ["encoder_attention", "encoder_attn"], + ["q_lin", "q_proj"], + ["k_lin", "k_proj"], + ["v_lin", "v_proj"], + ["out_lin", "out_proj"], + ["norm_embeddings", "layernorm_embedding"], + ["position_embeddings", "embed_positions"], + ["embeddings", "embed_tokens"], + ["ffn.lin", "fc"], +] + + +def rename_state_dict_key(k): + if k == "embeddings.weight": + return "shared.weight" + + for parlai_name, hf_name in PATTERNS: + k = k.replace(parlai_name, hf_name) + + if k.startswith("encoder"): + k = k.replace(".attn", ".self_attn") + k = k.replace("norm1", "self_attn_layer_norm") + k = k.replace("norm2", "final_layer_norm") + elif k.startswith("decoder"): + k = k.replace("norm1", "self_attn_layer_norm") + k = k.replace("norm2", "encoder_attn_layer_norm") + k = k.replace("norm3", "final_layer_norm") + return k + + +def rename_layernorm_keys(sd): + keys = [ + "model.encoder.layernorm_embedding.weight", + "model.encoder.layernorm_embedding.bias", + "model.decoder.layernorm_embedding.weight", + "model.decoder.layernorm_embedding.bias", + ] + for k in keys: + v = sd.pop(k) + new_k = k.replace("layernorm_embedding", "layer_norm") + assert new_k not in sd + sd[new_k] = v + + +IGNORE_KEYS = ["START"] + + +@torch.no_grad() +def convert_parlai_checkpoint(checkpoint_path, pytorch_dump_folder_path, config_json_path): + """ + Copy/paste/tweak model's weights to our BERT structure. + """ + model = torch.load(checkpoint_path, map_location="cpu", weights_only=True) + sd = model["model"] + cfg = BlenderbotConfig.from_json_file(config_json_path) + m = BlenderbotForConditionalGeneration(cfg) + valid_keys = m.model.state_dict().keys() + failures = [] + mapping = {} + for k, v in sd.items(): + if k in IGNORE_KEYS: + continue + + new_k = rename_state_dict_key(k) + if new_k not in valid_keys: + failures.append([k, new_k]) + else: + mapping[new_k] = v + if cfg.normalize_before: # Blenderbot-3B checkpoints. Rename layernorm_embedding -> layer_norm + rename_layernorm_keys(sd) + m.model.load_state_dict(mapping, strict=True) + m.half() + m.save_pretrained(pytorch_dump_folder_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument("--src_path", type=str, help="like blenderbot-model.bin") + parser.add_argument("--save_dir", default="hf_blenderbot", type=str, help="Where to save converted model.") + parser.add_argument( + "--hf_config_json", default="blenderbot-3b-config.json", type=str, help="Path to config to use" + ) + args = parser.parse_args() + convert_parlai_checkpoint(args.src_path, args.save_dir, args.hf_config_json) diff --git a/docs/transformers/build/lib/transformers/models/blenderbot/modeling_blenderbot.py b/docs/transformers/build/lib/transformers/models/blenderbot/modeling_blenderbot.py new file mode 100644 index 0000000000000000000000000000000000000000..9ce51bebe70c74b264c5290a96e303573c597d05 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/blenderbot/modeling_blenderbot.py @@ -0,0 +1,1557 @@ +# coding=utf-8 +# Copyright 2021 The Facebook, Inc. and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch Blenderbot model.""" + +import copy +import math +import os +import warnings +from typing import List, Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import CrossEntropyLoss + +from ...activations import ACT2FN +from ...generation import GenerationMixin +from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask +from ...modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPastAndCrossAttentions, + CausalLMOutputWithCrossAttentions, + Seq2SeqLMOutput, + Seq2SeqModelOutput, +) +from ...modeling_utils import PreTrainedModel +from ...utils import ( + add_end_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from ..blenderbot_small import BlenderbotSmallForConditionalGeneration, BlenderbotSmallModel +from .configuration_blenderbot import BlenderbotConfig + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "BlenderbotConfig" +_CHECKPOINT_FOR_DOC = "facebook/blenderbot-400M-distill" + + +# Copied from transformers.models.bart.modeling_bart.shift_tokens_right +def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int): + """ + Shift input ids one token to the right. + """ + shifted_input_ids = input_ids.new_zeros(input_ids.shape) + shifted_input_ids[:, 1:] = input_ids[:, :-1].clone() + shifted_input_ids[:, 0] = decoder_start_token_id + + if pad_token_id is None: + raise ValueError("self.model.config.pad_token_id has to be defined.") + # replace possible -100 values in labels by `pad_token_id` + shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id) + + return shifted_input_ids + + +class BlenderbotLearnedPositionalEmbedding(nn.Embedding): + """ + This module learns positional embeddings up to a fixed maximum size. + """ + + def __init__(self, num_embeddings: int, embedding_dim: int): + super().__init__(num_embeddings, embedding_dim) + + def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0): + """`input_ids_shape` is expected to be [bsz x seqlen].""" + bsz, seq_len = input_ids_shape[:2] + positions = torch.arange( + past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device + ) + return super().forward(positions) + + +# Copied from transformers.models.bart.modeling_bart.BartScaledWordEmbedding with Bart->Blenderbot +class BlenderbotScaledWordEmbedding(nn.Embedding): + """ + This module overrides nn.Embeddings' forward by multiplying with embeddings scale. + """ + + def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int, embed_scale: Optional[float] = 1.0): + super().__init__(num_embeddings, embedding_dim, padding_idx) + self.embed_scale = embed_scale + + def forward(self, input_ids: torch.Tensor): + return super().forward(input_ids) * self.embed_scale + + +# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->Blenderbot +class BlenderbotAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + is_decoder: bool = False, + bias: bool = True, + is_causal: bool = False, + config: Optional[BlenderbotConfig] = None, + ): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + self.config = config + + if (self.head_dim * num_heads) != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" + f" and `num_heads`: {num_heads})." + ) + self.scaling = self.head_dim**-0.5 + self.is_decoder = is_decoder + self.is_causal = is_causal + + self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + key_value_states: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + + bsz, tgt_len, _ = hidden_states.size() + + # get query proj + query_states = self.q_proj(hidden_states) * self.scaling + # get key, value proj + # `past_key_value[0].shape[2] == key_value_states.shape[1]` + # is checking that the `sequence_length` of the `past_key_value` is the same as + # the provided `key_value_states` to support prefix tuning + if ( + is_cross_attention + and past_key_value is not None + and past_key_value[0].shape[2] == key_value_states.shape[1] + ): + # reuse k,v, cross_attentions + key_states = past_key_value[0] + value_states = past_key_value[1] + elif is_cross_attention: + # cross_attentions + key_states = self._shape(self.k_proj(key_value_states), -1, bsz) + value_states = self._shape(self.v_proj(key_value_states), -1, bsz) + elif past_key_value is not None: + # reuse k, v, self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + else: + # self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_states, value_states) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.reshape(*proj_shape) + value_states = value_states.reshape(*proj_shape) + + src_len = key_states.size(1) + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if layer_head_mask is not None: + if layer_head_mask.size() != (self.num_heads,): + raise ValueError( + f"Head mask for a single layer should be of size {(self.num_heads,)}, but is" + f" {layer_head_mask.size()}" + ) + attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + if output_attentions: + # this operation is a bit awkward, but it's required to + # make sure that attn_weights keeps its gradient. + # In order to do so, attn_weights have to be reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) + else: + attn_weights_reshaped = None + + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = torch.bmm(attn_probs, value_states) + + if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) + attn_output = attn_output.transpose(1, 2) + + # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be + # partitioned across GPUs when using tensor-parallelism. + attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped, past_key_value + + +BLENDERBOT_ATTENTION_CLASSES = {"eager": BlenderbotAttention} + + +# Copied from transformers.models.mbart.modeling_mbart.MBartEncoderLayer with MBart->Blenderbot, MBART->BLENDERBOT +class BlenderbotEncoderLayer(nn.Module): + def __init__(self, config: BlenderbotConfig): + super().__init__() + self.embed_dim = config.d_model + + self.self_attn = BLENDERBOT_ATTENTION_CLASSES[config._attn_implementation]( + embed_dim=self.embed_dim, + num_heads=config.encoder_attention_heads, + dropout=config.attention_dropout, + config=config, + ) + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim) + self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + layer_head_mask: torch.Tensor, + output_attentions: bool = False, + ) -> torch.Tensor: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size + `(encoder_attention_heads,)`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + hidden_states, attn_weights, _ = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + layer_head_mask=layer_head_mask, + output_attentions=output_attentions, + ) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.final_layer_norm(hidden_states) + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + + if hidden_states.dtype == torch.float16: + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +# Copied from transformers.models.mbart.modeling_mbart.MBartDecoderLayer with MBart->Blenderbot, MBART->BLENDERBOT +class BlenderbotDecoderLayer(nn.Module): + def __init__(self, config: BlenderbotConfig): + super().__init__() + self.embed_dim = config.d_model + + self.self_attn = BLENDERBOT_ATTENTION_CLASSES[config._attn_implementation]( + embed_dim=self.embed_dim, + num_heads=config.decoder_attention_heads, + dropout=config.attention_dropout, + is_decoder=True, + is_causal=True, + config=config, + ) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.encoder_attn = BLENDERBOT_ATTENTION_CLASSES[config._attn_implementation]( + self.embed_dim, + config.decoder_attention_heads, + dropout=config.attention_dropout, + is_decoder=True, + config=config, + ) + self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) + self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + cross_attn_layer_head_mask: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = True, + ) -> torch.Tensor: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + encoder_hidden_states (`torch.FloatTensor`): + cross attention input to the layer of shape `(batch, seq_len, embed_dim)` + encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size + `(encoder_attention_heads,)`. + cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of + size `(decoder_attention_heads,)`. + past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + # Self Attention + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + # add present self-attn cache to positions 1,2 of present_key_value tuple + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + past_key_value=self_attn_past_key_value, + attention_mask=attention_mask, + layer_head_mask=layer_head_mask, + output_attentions=output_attentions, + ) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + + # Cross-Attention Block + cross_attn_present_key_value = None + cross_attn_weights = None + if encoder_hidden_states is not None: + residual = hidden_states + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn( + hidden_states=hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + layer_head_mask=cross_attn_layer_head_mask, + past_key_value=cross_attn_past_key_value, + output_attentions=output_attentions, + ) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + + # add cross-attn to positions 3,4 of present_key_value tuple + present_key_value = present_key_value + cross_attn_present_key_value + + # Fully Connected + residual = hidden_states + hidden_states = self.final_layer_norm(hidden_states) + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights, cross_attn_weights) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +class BlenderbotPreTrainedModel(PreTrainedModel): + config_class = BlenderbotConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + + def _init_weights(self, module): + std = self.config.init_std + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + @property + def dummy_inputs(self): + pad_token = self.config.pad_token_id + input_ids = torch.tensor([[0, 6, 10, 4, 2], [0, 8, 12, 2, pad_token]], device=self.device) + dummy_inputs = { + "attention_mask": input_ids.ne(pad_token), + "input_ids": input_ids, + "decoder_input_ids": input_ids, + } + return dummy_inputs + + +BLENDERBOT_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`BlenderbotConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +BLENDERBOT_GENERATION_EXAMPLE = r""" + Conversation example: + + ```python + >>> from transformers import AutoTokenizer, BlenderbotForConditionalGeneration + + >>> mname = "facebook/blenderbot-400M-distill" + >>> model = BlenderbotForConditionalGeneration.from_pretrained(mname) + >>> tokenizer = AutoTokenizer.from_pretrained(mname) + >>> UTTERANCE = "My friends are cool but they eat too many carbs." + >>> print("Human: ", UTTERANCE) + Human: My friends are cool but they eat too many carbs. + + >>> inputs = tokenizer([UTTERANCE], return_tensors="pt") + >>> reply_ids = model.generate(**inputs) + >>> print("Bot: ", tokenizer.batch_decode(reply_ids, skip_special_tokens=True)[0]) + Bot: That's unfortunate. Are they trying to lose weight or are they just trying to be healthier? + + >>> REPLY = "I'm not sure" + >>> print("Human: ", REPLY) + Human: I'm not sure + + >>> NEXT_UTTERANCE = ( + ... "My friends are cool but they eat too many carbs. That's unfortunate. " + ... "Are they trying to lose weight or are they just trying to be healthier? " + ... " I'm not sure." + ... ) + >>> inputs = tokenizer([NEXT_UTTERANCE], return_tensors="pt") + >>> next_reply_ids = model.generate(**inputs) + >>> print("Bot: ", tokenizer.batch_decode(next_reply_ids, skip_special_tokens=True)[0]) + Bot: I see. Well, it's good that they're trying to change their eating habits. + ``` +""" + +BLENDERBOT_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Indices of decoder input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are decoder input IDs?](../glossary#decoder-input-ids) + + Blenderbot uses the `bos_token_id` as the starting token for `decoder_input_ids` generation. If + `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also + be used by default. + head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + decoder_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0, + 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*): + Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`) + `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of + hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded + representation. If `past_key_values` is used, optionally only the last `decoder_inputs_embeds` have to be + input (see `past_key_values`). This is useful if you want more control over how to convert + `decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix. + + If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds` takes the value + of `inputs_embeds`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +class BlenderbotEncoder(BlenderbotPreTrainedModel): + """ + Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a + [`BlenderbotEncoderLayer`]. + + Args: + config: BlenderbotConfig + embed_tokens (nn.Embedding): output embedding + """ + + def __init__(self, config: BlenderbotConfig, embed_tokens: Optional[nn.Embedding] = None): + super().__init__(config) + + self.dropout = config.dropout + self.layerdrop = config.encoder_layerdrop + + embed_dim = config.d_model + self.padding_idx = config.pad_token_id + self.max_source_positions = config.max_position_embeddings + embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0 + + if embed_tokens is not None: + self.embed_tokens = embed_tokens + else: + self.embed_tokens = BlenderbotScaledWordEmbedding( + config.vocab_size, embed_dim, self.padding_idx, embed_scale=embed_scale + ) + + self.embed_positions = BlenderbotLearnedPositionalEmbedding( + config.max_position_embeddings, + embed_dim, + ) + self.layers = nn.ModuleList([BlenderbotEncoderLayer(config) for _ in range(config.encoder_layers)]) + self.layer_norm = nn.LayerNorm(config.d_model) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + input_ids=None, + attention_mask=None, + head_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + embed_pos = self.embed_positions(input_shape) + + hidden_states = inputs_embeds + embed_pos + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + # expand attention_mask + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype) + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + # check if head_mask has a correct number of layers specified if desired + if head_mask is not None: + if head_mask.size()[0] != len(self.layers): + raise ValueError( + f"The head_mask should be specified for {len(self.layers)} layers, but it is for" + f" {head_mask.size()[0]}." + ) + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + to_drop = False + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: # skip the layer + to_drop = True + + if to_drop: + layer_outputs = (None, None) + else: + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + encoder_layer.__call__, + hidden_states, + attention_mask, + (head_mask[idx] if head_mask is not None else None), + output_attentions, + ) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + layer_head_mask=(head_mask[idx] if head_mask is not None else None), + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + # add final layer norm + hidden_states = self.layer_norm(hidden_states) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +class BlenderbotDecoder(BlenderbotPreTrainedModel): + """ + Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`BlenderbotDecoderLayer`] + + Args: + config: BlenderbotConfig + embed_tokens (nn.Embedding): output embedding + """ + + def __init__(self, config: BlenderbotConfig, embed_tokens: Optional[nn.Embedding] = None): + super().__init__(config) + self.dropout = config.dropout + self.layerdrop = config.decoder_layerdrop + self.padding_idx = config.pad_token_id + self.max_target_positions = config.max_position_embeddings + embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 + + if embed_tokens is not None: + self.embed_tokens = embed_tokens + else: + self.embed_tokens = BlenderbotScaledWordEmbedding( + config.vocab_size, config.d_model, self.padding_idx, embed_scale=embed_scale + ) + + self.embed_positions = BlenderbotLearnedPositionalEmbedding( + config.max_position_embeddings, + config.d_model, + ) + self.layers = nn.ModuleList([BlenderbotDecoderLayer(config) for _ in range(config.decoder_layers)]) + self.layer_norm = nn.LayerNorm(config.d_model) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + def forward( + self, + input_ids=None, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + head_mask=None, + cross_attn_head_mask=None, + past_key_values=None, + inputs_embeds=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + of the decoder. + encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*): + Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values + selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, + 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules in the decoder to avoid performing + cross-attention on hidden heads. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of + shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the + cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those + that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of + all `decoder_input_ids` of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, input_shape, inputs_embeds, past_key_values_length + ) + + # expand encoder attention mask + if encoder_hidden_states is not None and encoder_attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + encoder_attention_mask = _prepare_4d_attention_mask( + encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1] + ) + + # embed positions + positions = self.embed_positions(input_shape, past_key_values_length) + + hidden_states = inputs_embeds + positions + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None + next_decoder_cache = () if use_cache else None + + # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired + for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]): + if attn_mask is not None: + if attn_mask.size()[0] != len(self.layers): + raise ValueError( + f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for" + f" {head_mask.size()[0]}." + ) + for idx, decoder_layer in enumerate(self.layers): + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + if output_hidden_states: + all_hidden_states += (hidden_states,) + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: + continue + + past_key_value = past_key_values[idx] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + attention_mask, + encoder_hidden_states, + encoder_attention_mask, + head_mask[idx] if head_mask is not None else None, + cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None, + None, + output_attentions, + use_cache, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + layer_head_mask=(head_mask[idx] if head_mask is not None else None), + cross_attn_layer_head_mask=( + cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None + ), + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache += (layer_outputs[3 if output_attentions else 1],) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + if encoder_hidden_states is not None: + all_cross_attentions += (layer_outputs[2],) + + # add final layer norm + hidden_states = self.layer_norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if not return_dict: + return tuple( + v + for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attentions, + ) + + +@add_start_docstrings( + "The bare Blenderbot Model outputting raw hidden-states without any specific head on top.", + BLENDERBOT_START_DOCSTRING, +) +class BlenderbotModel(BlenderbotPreTrainedModel): + _tied_weights_keys = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight"] + + def __init__(self, config: BlenderbotConfig): + super().__init__(config) + + padding_idx, vocab_size = config.pad_token_id, config.vocab_size + embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 + self.shared = BlenderbotScaledWordEmbedding(vocab_size, config.d_model, padding_idx, embed_scale=embed_scale) + self.encoder = BlenderbotEncoder(config, self.shared) + self.decoder = BlenderbotDecoder(config, self.shared) + + # Initialize weights and apply final processing + self.post_init() + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], *model_args, **kwargs): + if pretrained_model_name_or_path == "facebook/blenderbot-90M": + warnings.warn( + "The checkpoint `facebook/blenderbot-90M` is deprecated. In the future, please use the identical" + " checkpoint `facebook/small_blenderbot-90M` with" + " `BlenderbotSmallModel.from_pretrained('facebook/small_blenderbot-90M')` instead.", + FutureWarning, + ) + return BlenderbotSmallModel.from_pretrained(pretrained_model_name_or_path) + + return super(BlenderbotModel, cls).from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + + def get_input_embeddings(self): + return self.shared + + def set_input_embeddings(self, value): + self.shared = value + self.encoder.embed_tokens = self.shared + self.decoder.embed_tokens = self.shared + + def get_encoder(self): + return self.encoder + + def get_decoder(self): + return self.decoder + + @add_start_docstrings_to_model_forward(BLENDERBOT_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=Seq2SeqModelOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.Tensor] = None, + decoder_head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[Union[Tuple, BaseModelOutput]] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.Tensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.FloatTensor], Seq2SeqModelOutput]: + r""" + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, BlenderbotModel + + >>> model = BlenderbotModel.from_pretrained("facebook/blenderbot-400M-distill") + >>> tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill") + + >>> inputs = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt") + >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids # Batch size 1 + >>> outputs = model(input_ids=inputs.input_ids, decoder_input_ids=decoder_input_ids) + + >>> last_hidden_states = outputs.last_hidden_state + >>> list(last_hidden_states.shape) + [1, 6, 1280] + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if encoder_outputs is None: + encoder_outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn) + decoder_outputs = self.decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + encoder_hidden_states=encoder_outputs[0], + encoder_attention_mask=attention_mask, + head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if not return_dict: + return decoder_outputs + encoder_outputs + + return Seq2SeqModelOutput( + last_hidden_state=decoder_outputs.last_hidden_state, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + +@add_start_docstrings( + "The Blenderbot Model with a language modeling head. Can be used for summarization.", BLENDERBOT_START_DOCSTRING +) +class BlenderbotForConditionalGeneration(BlenderbotPreTrainedModel, GenerationMixin): + base_model_prefix = "model" + _keys_to_ignore_on_load_missing = ["final_logits_bias"] + _tied_weights_keys = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "lm_head.weight"] + + def __init__(self, config: BlenderbotConfig): + super().__init__(config) + self.model = BlenderbotModel(config) + self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings))) + self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], *model_args, **kwargs): + if pretrained_model_name_or_path == "facebook/blenderbot-90M": + warnings.warn( + "The checkpoint `facebook/blenderbot-90M` is deprecated. In the future, please use the identical" + " checkpoint `facebook/small_blenderbot-90M` with" + " `BlenderbotSmallForConditionalGeneration.from_pretrained('facebook/small_blenderbot-90M')` instead.", + FutureWarning, + ) + return BlenderbotSmallForConditionalGeneration.from_pretrained(pretrained_model_name_or_path) + + return super(BlenderbotForConditionalGeneration, cls).from_pretrained( + pretrained_model_name_or_path, *model_args, **kwargs + ) + + def get_encoder(self): + return self.model.get_encoder() + + def get_decoder(self): + return self.model.get_decoder() + + def resize_token_embeddings( + self, new_num_tokens: int, pad_to_multiple_of: Optional[int] = None, mean_resizing: bool = True + ) -> nn.Embedding: + new_embeddings = super().resize_token_embeddings(new_num_tokens, pad_to_multiple_of, mean_resizing) + self._resize_final_logits_bias(new_embeddings.weight.shape[0]) + return new_embeddings + + def _resize_final_logits_bias(self, new_num_tokens: int) -> None: + old_num_tokens = self.final_logits_bias.shape[-1] + if new_num_tokens <= old_num_tokens: + new_bias = self.final_logits_bias[:, :new_num_tokens] + else: + extra_bias = torch.zeros((1, new_num_tokens - old_num_tokens), device=self.final_logits_bias.device) + new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1) + self.register_buffer("final_logits_bias", new_bias) + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + @add_start_docstrings_to_model_forward(BLENDERBOT_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC) + @add_end_docstrings(BLENDERBOT_GENERATION_EXAMPLE) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.Tensor] = None, + decoder_head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[Union[Tuple, BaseModelOutput]] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.Tensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Returns: + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if labels is not None: + if use_cache: + logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.") + use_cache = False + if decoder_input_ids is None and decoder_inputs_embeds is None: + decoder_input_ids = shift_tokens_right( + labels, self.config.pad_token_id, self.config.decoder_start_token_id + ) + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + encoder_outputs=encoder_outputs, + decoder_attention_mask=decoder_attention_mask, + head_mask=head_mask, + decoder_head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + lm_logits = self.lm_head(outputs[0]) + self.final_logits_bias + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (lm_logits,) + outputs[1:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return Seq2SeqLMOutput( + loss=masked_lm_loss, + logits=lm_logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + # cached cross_attention states don't have to be reordered -> they are always the same + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past[:2]) + + layer_past[2:], + ) + return reordered_past + + +# Copied from transformers.models.bart.modeling_bart.BartDecoderWrapper with Bart->Blenderbot +class BlenderbotDecoderWrapper(BlenderbotPreTrainedModel): + """ + This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is + used in combination with the [`EncoderDecoderModel`] framework. + """ + + def __init__(self, config): + super().__init__(config) + self.decoder = BlenderbotDecoder(config) + + def forward(self, *args, **kwargs): + return self.decoder(*args, **kwargs) + + +# Copied from transformers.models.bart.modeling_bart.BartForCausalLM with Bart->Blenderbot, facebook/bart-base->facebook/blenderbot-400M-distill +class BlenderbotForCausalLM(BlenderbotPreTrainedModel, GenerationMixin): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config): + config = copy.deepcopy(config) + config.is_decoder = True + config.is_encoder_decoder = False + super().__init__(config) + self.model = BlenderbotDecoderWrapper(config) + + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.decoder.embed_tokens + + def set_input_embeddings(self, value): + self.model.decoder.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model.decoder = decoder + + def get_decoder(self): + return self.model.decoder + + @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]: + r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + if the model is configured as a decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used + in the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of + shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two additional + tensors are only required when the model is used as a decoder in a Sequence to Sequence model. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the + cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those + that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of + all `decoder_input_ids` of shape `(batch_size, sequence_length)`. + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, BlenderbotForCausalLM + + >>> tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill") + >>> model = BlenderbotForCausalLM.from_pretrained("facebook/blenderbot-400M-distill", add_cross_attention=False) + >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder." + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + + >>> logits = outputs.logits + >>> expected_shape = [1, inputs.input_ids.shape[-1], model.config.vocab_size] + >>> list(logits.shape) == expected_shape + True + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model.decoder( + input_ids=input_ids, + attention_mask=attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + head_mask=head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + logits = self.lm_head(outputs[0]) + + loss = None + if labels is not None: + labels = labels.to(logits.device) + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past + + +__all__ = [ + "BlenderbotForCausalLM", + "BlenderbotForConditionalGeneration", + "BlenderbotModel", + "BlenderbotPreTrainedModel", +] diff --git a/docs/transformers/build/lib/transformers/models/blenderbot/modeling_flax_blenderbot.py b/docs/transformers/build/lib/transformers/models/blenderbot/modeling_flax_blenderbot.py new file mode 100644 index 0000000000000000000000000000000000000000..1e0775cd08cc560d13f8a3885e0be201c67ac6a8 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/blenderbot/modeling_flax_blenderbot.py @@ -0,0 +1,1508 @@ +# coding=utf-8 +# Copyright 2021 The Fairseq Authors and The Google Flax Team Authors And The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Flax Blenderbot model.""" + +import math +import random +from functools import partial +from typing import Callable, Optional, Tuple + +import flax.linen as nn +import jax +import jax.numpy as jnp +from flax.core.frozen_dict import FrozenDict, freeze, unfreeze +from flax.linen import combine_masks, make_causal_mask +from flax.linen.attention import dot_product_attention_weights +from flax.traverse_util import flatten_dict, unflatten_dict +from jax import lax +from jax.random import PRNGKey + +from ...modeling_flax_outputs import ( + FlaxBaseModelOutput, + FlaxBaseModelOutputWithPastAndCrossAttentions, + FlaxCausalLMOutputWithCrossAttentions, + FlaxSeq2SeqLMOutput, + FlaxSeq2SeqModelOutput, +) +from ...modeling_flax_utils import ( + ACT2FN, + FlaxPreTrainedModel, + append_call_sample_docstring, + append_replace_return_docstrings, + overwrite_call_docstring, +) +from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings +from .configuration_blenderbot import BlenderbotConfig + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "BlenderbotConfig" +_CHECKPOINT_FOR_DOC = "facebook/blenderbot-400M-distill" + + +BLENDERBOT_START_DOCSTRING = r""" + This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a Flax Linen + [flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a + regular Flax Module and refer to the Flax documentation for all matter related to general usage and behavior. + + Finally, this model supports inherent JAX features such as: + + - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit) + - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation) + - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap) + - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap) + + Parameters: + config ([`BlenderbotConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights. +""" + +BLENDERBOT_INPUTS_DOCSTRING = r""" + Args: + input_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + decoder_input_ids (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*): + Indices of decoder input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are decoder input IDs?](../glossary#decoder-input-ids) + + For translation and summarization training, `decoder_input_ids` should be provided. If no + `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right + for denoising pre-training following the paper. + decoder_attention_mask (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*): + Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also + be used by default. + + If you want to change padding behavior, you should modify to your needs. See diagram 1 in [the + paper](https://arxiv.org/abs/1910.13461) for more information on the default strategy. + position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + decoder_position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the + range `[0, config.max_position_embeddings - 1]`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +BLENDERBOT_ENCODE_INPUTS_DOCSTRING = r""" + Args: + input_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + +BLENDERBOT_DECODE_INPUTS_DOCSTRING = r""" + Args: + decoder_input_ids (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`): + Indices of decoder input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are decoder input IDs?](../glossary#decoder-input-ids) + + For translation and summarization training, `decoder_input_ids` should be provided. If no + `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right + for denoising pre-training following the paper. + encoder_outputs (`tuple(tuple(jnp.ndarray)`): + Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`) + `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of + hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. + encoder_attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + decoder_attention_mask (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*): + Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also + be used by default. + + If you want to change padding behavior, you should modify to your needs. See diagram 1 in [the + paper](https://arxiv.org/abs/1910.13461) for more information on the default strategy. + decoder_position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the + range `[0, config.max_position_embeddings - 1]`. + past_key_values (`Dict[str, np.ndarray]`, *optional*, returned by `init_cache` or when passing previous `past_key_values`): + Dictionary of pre-computed hidden-states (key and values in the attention blocks) that can be used for fast + auto-regressive decoding. Pre-computed key and value hidden-states are of shape *[batch_size, max_length]*. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +# Copied from transformers.models.bart.modeling_flax_bart.shift_tokens_right +def shift_tokens_right(input_ids: jnp.ndarray, pad_token_id: int, decoder_start_token_id: int) -> jnp.ndarray: + """ + Shift input ids one token to the right. + """ + shifted_input_ids = jnp.zeros_like(input_ids) + shifted_input_ids = shifted_input_ids.at[:, 1:].set(input_ids[:, :-1]) + shifted_input_ids = shifted_input_ids.at[:, 0].set(decoder_start_token_id) + + shifted_input_ids = jnp.where(shifted_input_ids == -100, pad_token_id, shifted_input_ids) + return shifted_input_ids + + +# Copied from transformers.models.bart.modeling_flax_bart.FlaxBartAttention with Bart->Blenderbot +class FlaxBlenderbotAttention(nn.Module): + config: BlenderbotConfig + embed_dim: int + num_heads: int + dropout: float = 0.0 + causal: bool = False + bias: bool = True + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self) -> None: + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" + f" and `num_heads`: {self.num_heads})." + ) + + dense = partial( + nn.Dense, + self.embed_dim, + use_bias=self.bias, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(self.config.init_std), + ) + + self.q_proj, self.k_proj, self.v_proj = dense(), dense(), dense() + self.out_proj = dense() + + self.dropout_layer = nn.Dropout(rate=self.dropout) + + if self.causal: + self.causal_mask = make_causal_mask( + jnp.ones((1, self.config.max_position_embeddings), dtype="bool"), dtype="bool" + ) + + def _split_heads(self, hidden_states): + return hidden_states.reshape(hidden_states.shape[:2] + (self.num_heads, self.head_dim)) + + def _merge_heads(self, hidden_states): + return hidden_states.reshape(hidden_states.shape[:2] + (self.embed_dim,)) + + @nn.compact + def _concatenate_to_cache(self, key, value, query, attention_mask): + """ + This function takes projected key, value states from a single input token and concatenates the states to cached + states from previous steps. This function is slightly adapted from the official Flax repository: + https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252 + """ + # detect if we're initializing by absence of existing cache data. + is_initialized = self.has_variable("cache", "cached_key") + cached_key = self.variable("cache", "cached_key", jnp.zeros, key.shape, key.dtype) + cached_value = self.variable("cache", "cached_value", jnp.zeros, value.shape, value.dtype) + cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32)) + + if is_initialized: + *batch_dims, max_length, num_heads, depth_per_head = cached_key.value.shape + # update key, value caches with our new 1d spatial slices + cur_index = cache_index.value + indices = (0,) * len(batch_dims) + (cur_index, 0, 0) + key = lax.dynamic_update_slice(cached_key.value, key, indices) + value = lax.dynamic_update_slice(cached_value.value, value, indices) + cached_key.value = key + cached_value.value = value + num_updated_cache_vectors = query.shape[1] + cache_index.value = cache_index.value + num_updated_cache_vectors + # causal mask for cached decoder self-attention: our single query position should only attend to those key positions that have already been generated and cached, not the remaining zero elements. + pad_mask = jnp.broadcast_to( + jnp.arange(max_length) < cur_index + num_updated_cache_vectors, + tuple(batch_dims) + (1, num_updated_cache_vectors, max_length), + ) + attention_mask = combine_masks(pad_mask, attention_mask) + return key, value, attention_mask + + def __call__( + self, + hidden_states: jnp.ndarray, + key_value_states: Optional[jnp.ndarray] = None, + attention_mask: Optional[jnp.ndarray] = None, + init_cache: bool = False, + deterministic: bool = True, + ) -> Tuple[jnp.ndarray]: + """Input shape: Batch x Time x Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + batch_size = hidden_states.shape[0] + + # get query proj + query_states = self.q_proj(hidden_states) + # get key, value proj + if is_cross_attention: + # cross_attentions + key_states = self.k_proj(key_value_states) + value_states = self.v_proj(key_value_states) + else: + # self_attention + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = self._split_heads(query_states) + key_states = self._split_heads(key_states) + value_states = self._split_heads(value_states) + + # handle cache prepare causal attention mask + if self.causal: + query_length, key_length = query_states.shape[1], key_states.shape[1] + if self.has_variable("cache", "cached_key"): + mask_shift = self.variables["cache"]["cache_index"] + max_decoder_length = self.variables["cache"]["cached_key"].shape[1] + causal_mask = lax.dynamic_slice( + self.causal_mask, (0, 0, mask_shift, 0), (1, 1, query_length, max_decoder_length) + ) + else: + causal_mask = self.causal_mask[:, :, :query_length, :key_length] + causal_mask = jnp.broadcast_to(causal_mask, (batch_size,) + causal_mask.shape[1:]) + + # combine masks if needed + if attention_mask is not None and self.causal: + attention_mask = jnp.broadcast_to(jnp.expand_dims(attention_mask, axis=(-3, -2)), causal_mask.shape) + attention_mask = combine_masks(attention_mask, causal_mask) + elif self.causal: + attention_mask = causal_mask + elif attention_mask is not None: + attention_mask = jnp.expand_dims(attention_mask, axis=(-3, -2)) + + # During fast autoregressive decoding, we feed one position at a time, + # and cache the keys and values step by step. + if self.causal and (self.has_variable("cache", "cached_key") or init_cache): + key_states, value_states, attention_mask = self._concatenate_to_cache( + key_states, value_states, query_states, attention_mask + ) + + # Convert the boolean attention mask to an attention bias. + if attention_mask is not None: + # attention mask in the form of attention bias + attention_bias = lax.select( + attention_mask > 0, + jnp.full(attention_mask.shape, 0.0).astype(self.dtype), + jnp.full(attention_mask.shape, jnp.finfo(self.dtype).min).astype(self.dtype), + ) + else: + attention_bias = None + + dropout_rng = None + if not deterministic and self.dropout > 0.0: + dropout_rng = self.make_rng("dropout") + + attn_weights = dot_product_attention_weights( + query_states, + key_states, + bias=attention_bias, + dropout_rng=dropout_rng, + dropout_rate=self.dropout, + broadcast_dropout=True, + deterministic=deterministic, + dtype=self.dtype, + precision=None, + ) + + attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, value_states) + attn_output = self._merge_heads(attn_output) + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights + + +# Copied from transformers.models.mbart.modeling_flax_mbart.FlaxMBartEncoderLayer with MBart->Blenderbot +class FlaxBlenderbotEncoderLayer(nn.Module): + config: BlenderbotConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self) -> None: + self.embed_dim = self.config.d_model + self.self_attn = FlaxBlenderbotAttention( + config=self.config, + embed_dim=self.embed_dim, + num_heads=self.config.encoder_attention_heads, + dropout=self.config.attention_dropout, + dtype=self.dtype, + ) + self.self_attn_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05) + self.dropout_layer = nn.Dropout(rate=self.config.dropout) + self.activation_fn = ACT2FN[self.config.activation_function] + self.activation_dropout_layer = nn.Dropout(rate=self.config.activation_dropout) + self.fc1 = nn.Dense( + self.config.encoder_ffn_dim, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(self.config.init_std), + ) + self.fc2 = nn.Dense( + self.embed_dim, dtype=self.dtype, kernel_init=jax.nn.initializers.normal(self.config.init_std) + ) + self.final_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05) + + def __call__( + self, + hidden_states: jnp.ndarray, + attention_mask: jnp.ndarray, + output_attentions: bool = True, + deterministic: bool = True, + ) -> Tuple[jnp.ndarray]: + residual = hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + hidden_states, attn_weights = self.self_attn(hidden_states=hidden_states, attention_mask=attention_mask) + hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.final_layer_norm(hidden_states) + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = self.activation_dropout_layer(hidden_states, deterministic=deterministic) + hidden_states = self.fc2(hidden_states) + hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +# Copied from transformers.models.bart.modeling_flax_bart.FlaxBartEncoderLayerCollection with Bart->Blenderbot +class FlaxBlenderbotEncoderLayerCollection(nn.Module): + config: BlenderbotConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.layers = [ + FlaxBlenderbotEncoderLayer(self.config, name=str(i), dtype=self.dtype) + for i in range(self.config.encoder_layers) + ] + self.layerdrop = self.config.encoder_layerdrop + + def __call__( + self, + hidden_states, + attention_mask, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + all_attentions = () if output_attentions else None + all_hidden_states = () if output_hidden_states else None + + for encoder_layer in self.layers: + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + dropout_probability = random.uniform(0, 1) + if not deterministic and (dropout_probability < self.layerdrop): # skip the layer + layer_outputs = (None, None) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + output_attentions, + deterministic, + ) + hidden_states = layer_outputs[0] + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states += (hidden_states,) + + outputs = (hidden_states, all_hidden_states, all_attentions) + + if not return_dict: + return tuple(v for v in outputs if v is not None) + + return FlaxBaseModelOutput( + last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions + ) + + +# Copied from transformers.models.mbart.modeling_flax_mbart.FlaxMBartDecoderLayer with MBart->Blenderbot +class FlaxBlenderbotDecoderLayer(nn.Module): + config: BlenderbotConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self) -> None: + self.embed_dim = self.config.d_model + self.self_attn = FlaxBlenderbotAttention( + config=self.config, + embed_dim=self.embed_dim, + num_heads=self.config.decoder_attention_heads, + dropout=self.config.attention_dropout, + causal=True, + dtype=self.dtype, + ) + self.dropout_layer = nn.Dropout(rate=self.config.dropout) + self.activation_fn = ACT2FN[self.config.activation_function] + self.activation_dropout_layer = nn.Dropout(rate=self.config.activation_dropout) + + self.self_attn_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05) + self.encoder_attn = FlaxBlenderbotAttention( + config=self.config, + embed_dim=self.embed_dim, + num_heads=self.config.decoder_attention_heads, + dropout=self.config.attention_dropout, + dtype=self.dtype, + ) + self.encoder_attn_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05) + self.fc1 = nn.Dense( + self.config.decoder_ffn_dim, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(self.config.init_std), + ) + self.fc2 = nn.Dense( + self.embed_dim, dtype=self.dtype, kernel_init=jax.nn.initializers.normal(self.config.init_std) + ) + self.final_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05) + + def __call__( + self, + hidden_states: jnp.ndarray, + attention_mask: jnp.ndarray, + encoder_hidden_states: Optional[jnp.ndarray] = None, + encoder_attention_mask: Optional[jnp.ndarray] = None, + init_cache: bool = False, + output_attentions: bool = True, + deterministic: bool = True, + ) -> Tuple[jnp.ndarray]: + residual = hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights = self.self_attn( + hidden_states=hidden_states, attention_mask=attention_mask, init_cache=init_cache + ) + hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic) + hidden_states = residual + hidden_states + + # Cross-Attention Block + cross_attn_weights = None + if encoder_hidden_states is not None: + residual = hidden_states + + hidden_states = self.encoder_attn_layer_norm(hidden_states) + hidden_states, cross_attn_weights = self.encoder_attn( + hidden_states=hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + ) + hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic) + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.final_layer_norm(hidden_states) + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = self.activation_dropout_layer(hidden_states, deterministic=deterministic) + hidden_states = self.fc2(hidden_states) + hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights, cross_attn_weights) + + return outputs + + +# Copied from transformers.models.bart.modeling_flax_bart.FlaxBartDecoderLayerCollection with Bart->Blenderbot +class FlaxBlenderbotDecoderLayerCollection(nn.Module): + config: BlenderbotConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.layers = [ + FlaxBlenderbotDecoderLayer(self.config, name=str(i), dtype=self.dtype) + for i in range(self.config.decoder_layers) + ] + self.layerdrop = self.config.decoder_layerdrop + + def __call__( + self, + hidden_states, + attention_mask, + encoder_hidden_states: Optional[jnp.ndarray] = None, + encoder_attention_mask: Optional[jnp.ndarray] = None, + deterministic: bool = True, + init_cache: bool = False, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None + + for decoder_layer in self.layers: + if output_hidden_states: + all_hidden_states += (hidden_states,) + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + dropout_probability = random.uniform(0, 1) + if not deterministic and (dropout_probability < self.layerdrop): + layer_outputs = (None, None, None) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + init_cache=init_cache, + output_attentions=output_attentions, + deterministic=deterministic, + ) + + hidden_states = layer_outputs[0] + if output_attentions: + all_self_attns += (layer_outputs[1],) + + if encoder_hidden_states is not None: + all_cross_attentions += (layer_outputs[2],) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + outputs = [hidden_states, all_hidden_states, all_self_attns, all_cross_attentions] + + if not return_dict: + return tuple(v for v in outputs if v is not None) + + return FlaxBaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attentions, + ) + + +class FlaxBlenderbotEncoder(nn.Module): + config: BlenderbotConfig + embed_tokens: nn.Embed + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.dropout_layer = nn.Dropout(rate=self.config.dropout) + + embed_dim = self.config.d_model + self.padding_idx = self.config.pad_token_id + self.max_source_positions = self.config.max_position_embeddings + self.embed_scale = math.sqrt(embed_dim) if self.config.scale_embedding else 1.0 + + self.embed_positions = nn.Embed( + self.config.max_position_embeddings, + embed_dim, + embedding_init=jax.nn.initializers.normal(self.config.init_std), + ) + self.layers = FlaxBlenderbotEncoderLayerCollection(self.config, self.dtype) + self.layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05) + + def __call__( + self, + input_ids, + attention_mask, + position_ids, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + deterministic: bool = True, + ): + input_shape = input_ids.shape + input_ids = input_ids.reshape(-1, input_shape[-1]) + + inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale + + embed_pos = self.embed_positions(position_ids) + + hidden_states = inputs_embeds + embed_pos + hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic) + + outputs = self.layers( + hidden_states, + attention_mask, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + last_hidden_states = outputs[0] + last_hidden_states = self.layer_norm(last_hidden_states) + + # update the last element in `hidden_states` after applying `layernorm` above + hidden_states = None + if output_hidden_states: + hidden_states = outputs[1] + hidden_states = hidden_states[:-1] + (last_hidden_states,) + + if not return_dict: + outputs = (last_hidden_states, hidden_states) + (outputs[2:] if output_hidden_states else outputs[1:]) + return tuple(v for v in outputs if v is not None) + + return FlaxBaseModelOutput( + last_hidden_state=last_hidden_states, + hidden_states=hidden_states, + attentions=outputs.attentions, + ) + + +class FlaxBlenderbotDecoder(nn.Module): + config: BlenderbotConfig + embed_tokens: nn.Embed + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.dropout_layer = nn.Dropout(rate=self.config.dropout) + + embed_dim = self.config.d_model + self.padding_idx = self.config.pad_token_id + self.max_target_positions = self.config.max_position_embeddings + self.embed_scale = math.sqrt(self.config.d_model) if self.config.scale_embedding else 1.0 + + self.embed_positions = nn.Embed( + self.config.max_position_embeddings, + embed_dim, + embedding_init=jax.nn.initializers.normal(self.config.init_std), + ) + + self.layers = FlaxBlenderbotDecoderLayerCollection(self.config, self.dtype) + self.layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05) + + def __call__( + self, + input_ids, + attention_mask, + position_ids, + encoder_hidden_states: Optional[jnp.ndarray] = None, + encoder_attention_mask: Optional[jnp.ndarray] = None, + init_cache: bool = False, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + deterministic: bool = True, + ): + input_shape = input_ids.shape + input_ids = input_ids.reshape(-1, input_shape[-1]) + + inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale + + # embed positions + positions = self.embed_positions(position_ids) + + hidden_states = inputs_embeds + positions + hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic) + + outputs = self.layers( + hidden_states, + attention_mask, + encoder_hidden_states, + encoder_attention_mask, + deterministic=deterministic, + init_cache=init_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_states = outputs[0] + last_hidden_states = self.layer_norm(last_hidden_states) + + # update the last element in `hidden_states` after applying `layernorm` above + hidden_states = None + if output_hidden_states: + hidden_states = outputs[1] + hidden_states = hidden_states[:-1] + (last_hidden_states,) + + if not return_dict: + outputs = (last_hidden_states, hidden_states) + (outputs[2:] if output_hidden_states else outputs[1:]) + return tuple(v for v in outputs if v is not None) + + return FlaxBaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=last_hidden_states, + hidden_states=hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + +# Copied from transformers.models.bart.modeling_flax_bart.FlaxBartModule with Bart->Blenderbot +class FlaxBlenderbotModule(nn.Module): + config: BlenderbotConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.shared = nn.Embed( + self.config.vocab_size, + self.config.d_model, + embedding_init=jax.nn.initializers.normal(self.config.init_std), + dtype=self.dtype, + ) + + self.encoder = FlaxBlenderbotEncoder(self.config, dtype=self.dtype, embed_tokens=self.shared) + self.decoder = FlaxBlenderbotDecoder(self.config, dtype=self.dtype, embed_tokens=self.shared) + + def _get_encoder_module(self): + return self.encoder + + def _get_decoder_module(self): + return self.decoder + + def __call__( + self, + input_ids, + attention_mask, + decoder_input_ids, + decoder_attention_mask, + position_ids, + decoder_position_ids, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + deterministic: bool = True, + ): + encoder_outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + deterministic=deterministic, + ) + + decoder_outputs = self.decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + position_ids=decoder_position_ids, + encoder_hidden_states=encoder_outputs[0], + encoder_attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + deterministic=deterministic, + ) + + if not return_dict: + return decoder_outputs + encoder_outputs + + return FlaxSeq2SeqModelOutput( + last_hidden_state=decoder_outputs.last_hidden_state, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + +class FlaxBlenderbotPreTrainedModel(FlaxPreTrainedModel): + config_class = BlenderbotConfig + base_model_prefix: str = "model" + module_class: nn.Module = None + + def __init__( + self, + config: BlenderbotConfig, + input_shape: Tuple[int] = (1, 1), + seed: int = 0, + dtype: jnp.dtype = jnp.float32, + _do_init: bool = True, + **kwargs, + ): + module = self.module_class(config=config, dtype=dtype, **kwargs) + super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) + + def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict: + # init input tensors + input_ids = jnp.zeros(input_shape, dtype="i4") + # make sure initialization pass will work for FlaxBlenderbotForSequenceClassificationModule + input_ids = input_ids.at[(..., -1)].set(self.config.eos_token_id) + attention_mask = jnp.ones_like(input_ids) + decoder_input_ids = input_ids + decoder_attention_mask = jnp.ones_like(input_ids) + + batch_size, sequence_length = input_ids.shape + position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)) + decoder_position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)) + + params_rng, dropout_rng = jax.random.split(rng) + rngs = {"params": params_rng, "dropout": dropout_rng} + + random_params = self.module.init( + rngs, + input_ids, + attention_mask, + decoder_input_ids, + decoder_attention_mask, + position_ids, + decoder_position_ids, + )["params"] + + if params is not None: + random_params = flatten_dict(unfreeze(random_params)) + params = flatten_dict(unfreeze(params)) + for missing_key in self._missing_keys: + params[missing_key] = random_params[missing_key] + self._missing_keys = set() + return freeze(unflatten_dict(params)) + else: + return random_params + + def init_cache(self, batch_size, max_length, encoder_outputs): + r""" + Args: + batch_size (`int`): + batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache. + max_length (`int`): + maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized + cache. + encoder_outputs (`Union[FlaxBaseModelOutput, tuple(tuple(jnp.ndarray)]`): + `encoder_outputs` consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: + `attentions`). `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) + is a sequence of hidden-states at the output of the last layer of the encoder. Used in the + cross-attention of the decoder. + """ + # init input variables to retrieve cache + decoder_input_ids = jnp.ones((batch_size, max_length), dtype="i4") + decoder_attention_mask = jnp.ones_like(decoder_input_ids) + decoder_position_ids = jnp.broadcast_to( + jnp.arange(jnp.atleast_2d(decoder_input_ids).shape[-1]), decoder_input_ids.shape + ) + + def _decoder_forward(module, decoder_input_ids, decoder_attention_mask, decoder_position_ids, **kwargs): + decoder_module = module._get_decoder_module() + return decoder_module( + decoder_input_ids, + decoder_attention_mask, + decoder_position_ids, + **kwargs, + ) + + init_variables = self.module.init( + jax.random.PRNGKey(0), + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + decoder_position_ids=decoder_position_ids, + encoder_hidden_states=encoder_outputs[0], + init_cache=True, + method=_decoder_forward, # we only need to call the decoder to init the cache + ) + return unfreeze(init_variables["cache"]) + + @add_start_docstrings(BLENDERBOT_ENCODE_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=FlaxBaseModelOutput, config_class=BlenderbotConfig) + def encode( + self, + input_ids: jnp.ndarray, + attention_mask: Optional[jnp.ndarray] = None, + position_ids: Optional[jnp.ndarray] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + train: bool = False, + params: dict = None, + dropout_rng: PRNGKey = None, + ): + r""" + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, FlaxBlenderbotForConditionalGeneration + + >>> model = FlaxBlenderbotForConditionalGeneration.from_pretrained("facebook/blenderbot-400M-distill") + >>> tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill") + + >>> text = "My friends are cool but they eat too many carbs." + >>> inputs = tokenizer(text, max_length=1024, return_tensors="jax") + >>> encoder_outputs = model.encode(**inputs) + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.return_dict + + if attention_mask is None: + attention_mask = jnp.ones_like(input_ids) + if position_ids is None: + batch_size, sequence_length = input_ids.shape + position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)) + + # Handle any PRNG if needed + rngs = {} + if dropout_rng is not None: + rngs["dropout"] = dropout_rng + + def _encoder_forward(module, input_ids, attention_mask, position_ids, **kwargs): + encode_module = module._get_encoder_module() + return encode_module(input_ids, attention_mask, position_ids, **kwargs) + + return self.module.apply( + {"params": params or self.params}, + input_ids=jnp.array(input_ids, dtype="i4"), + attention_mask=jnp.array(attention_mask, dtype="i4"), + position_ids=jnp.array(position_ids, dtype="i4"), + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + deterministic=not train, + rngs=rngs, + method=_encoder_forward, + ) + + @add_start_docstrings(BLENDERBOT_DECODE_INPUTS_DOCSTRING) + @replace_return_docstrings( + output_type=FlaxBaseModelOutputWithPastAndCrossAttentions, config_class=BlenderbotConfig + ) + def decode( + self, + decoder_input_ids, + encoder_outputs, + encoder_attention_mask: Optional[jnp.ndarray] = None, + decoder_attention_mask: Optional[jnp.ndarray] = None, + decoder_position_ids: Optional[jnp.ndarray] = None, + past_key_values: dict = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + train: bool = False, + params: dict = None, + dropout_rng: PRNGKey = None, + ): + r""" + Returns: + + Example: + + ```python + >>> import jax.numpy as jnp + >>> from transformers import AutoTokenizer, FlaxBlenderbotForConditionalGeneration + + >>> model = FlaxBlenderbotForConditionalGeneration.from_pretrained("facebook/blenderbot-400M-distill") + >>> tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill") + + >>> text = "My friends are cool but they eat too many carbs." + >>> inputs = tokenizer(text, max_length=1024, return_tensors="jax") + >>> encoder_outputs = model.encode(**inputs) + + >>> decoder_start_token_id = model.config.decoder_start_token_id + >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id + + >>> outputs = model.decode(decoder_input_ids, encoder_outputs) + >>> last_decoder_hidden_states = outputs.last_hidden_state + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.return_dict + + encoder_hidden_states = encoder_outputs[0] + if encoder_attention_mask is None: + batch_size, sequence_length = encoder_hidden_states.shape[:2] + encoder_attention_mask = jnp.ones((batch_size, sequence_length)) + + batch_size, sequence_length = decoder_input_ids.shape + if decoder_attention_mask is None: + decoder_attention_mask = jnp.ones((batch_size, sequence_length)) + + if decoder_position_ids is None: + if past_key_values is not None: + raise ValueError("Make sure to provide `decoder_position_ids` when passing `past_key_values`.") + + decoder_position_ids = jnp.broadcast_to( + jnp.arange(sequence_length)[None, :], (batch_size, sequence_length) + ) + + # Handle any PRNG if needed + rngs = {} + if dropout_rng is not None: + rngs["dropout"] = dropout_rng + + inputs = {"params": params or self.params} + + # if past_key_values are passed then cache is already initialized a private flag init_cache has to be + # passed down to ensure cache is used. It has to be made sure that cache is marked as mutable so that + # it can be changed by FlaxBlenderbotAttention module + if past_key_values: + inputs["cache"] = past_key_values + mutable = ["cache"] + else: + mutable = False + + def _decoder_forward(module, decoder_input_ids, decoder_attention_mask, decoder_position_ids, **kwargs): + decoder_module = module._get_decoder_module() + return decoder_module( + decoder_input_ids, + decoder_attention_mask, + decoder_position_ids, + **kwargs, + ) + + outputs = self.module.apply( + inputs, + decoder_input_ids=jnp.array(decoder_input_ids, dtype="i4"), + decoder_attention_mask=jnp.array(decoder_attention_mask, dtype="i4"), + decoder_position_ids=jnp.array(decoder_position_ids, dtype="i4"), + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=jnp.array(encoder_attention_mask, dtype="i4"), + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + deterministic=not train, + rngs=rngs, + mutable=mutable, + method=_decoder_forward, + ) + + # add updated cache to model output + if past_key_values is not None and return_dict: + outputs, past = outputs + outputs["past_key_values"] = unfreeze(past["cache"]) + return outputs + elif past_key_values is not None and not return_dict: + outputs, past = outputs + outputs = outputs[:1] + (unfreeze(past["cache"]),) + outputs[1:] + + return outputs + + @add_start_docstrings_to_model_forward(BLENDERBOT_INPUTS_DOCSTRING) + def __call__( + self, + input_ids: jnp.ndarray, + attention_mask: Optional[jnp.ndarray] = None, + decoder_input_ids: Optional[jnp.ndarray] = None, + decoder_attention_mask: Optional[jnp.ndarray] = None, + position_ids: Optional[jnp.ndarray] = None, + decoder_position_ids: Optional[jnp.ndarray] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + train: bool = False, + params: dict = None, + dropout_rng: PRNGKey = None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.return_dict + + # prepare encoder inputs + if attention_mask is None: + attention_mask = jnp.ones_like(input_ids) + if position_ids is None: + batch_size, sequence_length = input_ids.shape + position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)) + + # prepare decoder inputs + if decoder_input_ids is None: + decoder_input_ids = shift_tokens_right( + input_ids, self.config.pad_token_id, decoder_start_token_id=self.config.decoder_start_token_id + ) + if decoder_attention_mask is None: + decoder_attention_mask = jnp.ones_like(decoder_input_ids) + if decoder_position_ids is None: + batch_size, sequence_length = decoder_input_ids.shape + decoder_position_ids = jnp.broadcast_to( + jnp.arange(sequence_length)[None, :], (batch_size, sequence_length) + ) + + # Handle any PRNG if needed + rngs = {"dropout": dropout_rng} if dropout_rng is not None else {} + + return self.module.apply( + {"params": params or self.params}, + input_ids=jnp.array(input_ids, dtype="i4"), + attention_mask=jnp.array(attention_mask, dtype="i4"), + position_ids=jnp.array(position_ids, dtype="i4"), + decoder_input_ids=jnp.array(decoder_input_ids, dtype="i4"), + decoder_attention_mask=jnp.array(decoder_attention_mask, dtype="i4"), + decoder_position_ids=jnp.array(decoder_position_ids, dtype="i4"), + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + deterministic=not train, + rngs=rngs, + ) + + +@add_start_docstrings( + "The bare MBart Model transformer outputting raw hidden-states without any specific head on top.", + BLENDERBOT_START_DOCSTRING, +) +class FlaxBlenderbotModel(FlaxBlenderbotPreTrainedModel): + config: BlenderbotConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + module_class = FlaxBlenderbotModule + + +append_call_sample_docstring(FlaxBlenderbotModel, _CHECKPOINT_FOR_DOC, FlaxSeq2SeqModelOutput, _CONFIG_FOR_DOC) + + +# Copied from transformers.models.bart.modeling_flax_bart.FlaxBartForConditionalGenerationModule with Bart->Blenderbot +class FlaxBlenderbotForConditionalGenerationModule(nn.Module): + config: BlenderbotConfig + dtype: jnp.dtype = jnp.float32 + bias_init: Callable[..., jnp.ndarray] = jax.nn.initializers.zeros + + def setup(self): + self.model = FlaxBlenderbotModule(config=self.config, dtype=self.dtype) + self.lm_head = nn.Dense( + self.model.shared.num_embeddings, + use_bias=False, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(self.config.init_std), + ) + self.final_logits_bias = self.param("final_logits_bias", self.bias_init, (1, self.model.shared.num_embeddings)) + + def _get_encoder_module(self): + return self.model.encoder + + def _get_decoder_module(self): + return self.model.decoder + + def __call__( + self, + input_ids, + attention_mask, + decoder_input_ids, + decoder_attention_mask, + position_ids, + decoder_position_ids, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + deterministic: bool = True, + ): + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + position_ids=position_ids, + decoder_position_ids=decoder_position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + deterministic=deterministic, + ) + + hidden_states = outputs[0] + + if self.config.tie_word_embeddings: + shared_embedding = self.model.variables["params"]["shared"]["embedding"] + lm_logits = self.lm_head.apply({"params": {"kernel": shared_embedding.T}}, hidden_states) + else: + lm_logits = self.lm_head(hidden_states) + + lm_logits += jax.lax.stop_gradient(self.final_logits_bias.astype(self.dtype)) + + if not return_dict: + output = (lm_logits,) + outputs[1:] + return output + + return FlaxSeq2SeqLMOutput( + logits=lm_logits, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + +@add_start_docstrings( + "The Blenderbot Model with a language modeling head. Can be used for summarization.", BLENDERBOT_START_DOCSTRING +) +class FlaxBlenderbotForConditionalGeneration(FlaxBlenderbotPreTrainedModel): + module_class = FlaxBlenderbotForConditionalGenerationModule + dtype: jnp.dtype = jnp.float32 + + @add_start_docstrings(BLENDERBOT_DECODE_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=FlaxCausalLMOutputWithCrossAttentions, config_class=BlenderbotConfig) + def decode( + self, + decoder_input_ids, + encoder_outputs, + encoder_attention_mask: Optional[jnp.ndarray] = None, + decoder_attention_mask: Optional[jnp.ndarray] = None, + decoder_position_ids: Optional[jnp.ndarray] = None, + past_key_values: dict = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + train: bool = False, + params: dict = None, + dropout_rng: PRNGKey = None, + ): + r""" + Returns: + + Example: + + ```python + >>> import jax.numpy as jnp + >>> from transformers import AutoTokenizer, FlaxBlenderbotForConditionalGeneration + + >>> model = FlaxBlenderbotForConditionalGeneration.from_pretrained("facebook/blenderbot-400M-distill") + >>> tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill") + + >>> text = "My friends are cool but they eat too many carbs." + >>> inputs = tokenizer(text, max_length=1024, return_tensors="jax") + >>> encoder_outputs = model.encode(**inputs) + + >>> decoder_start_token_id = model.config.decoder_start_token_id + >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id + + >>> outputs = model.decode(decoder_input_ids, encoder_outputs) + >>> logits = outputs.logits + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.return_dict + + encoder_hidden_states = encoder_outputs[0] + if encoder_attention_mask is None: + batch_size, sequence_length = encoder_hidden_states.shape[:2] + encoder_attention_mask = jnp.ones((batch_size, sequence_length)) + + batch_size, sequence_length = decoder_input_ids.shape + if decoder_attention_mask is None: + decoder_attention_mask = jnp.ones((batch_size, sequence_length)) + + if decoder_position_ids is None: + if past_key_values is not None: + raise ValueError("Make sure to provide `decoder_position_ids` when passing `past_key_values`.") + + decoder_position_ids = jnp.broadcast_to( + jnp.arange(sequence_length)[None, :], (batch_size, sequence_length) + ) + + # Handle any PRNG if needed + rngs = {} + if dropout_rng is not None: + rngs["dropout"] = dropout_rng + + inputs = {"params": params or self.params} + + # if past_key_values are passed then cache is already initialized a private flag init_cache has to be + # passed down to ensure cache is used. It has to be made sure that cache is marked as mutable so that + # it can be changed by FlaxBlenderbotAttention module + if past_key_values: + inputs["cache"] = past_key_values + mutable = ["cache"] + else: + mutable = False + + def _decoder_forward(module, decoder_input_ids, decoder_attention_mask, decoder_position_ids, **kwargs): + decoder_module = module._get_decoder_module() + outputs = decoder_module( + decoder_input_ids, + decoder_attention_mask, + decoder_position_ids, + **kwargs, + ) + hidden_states = outputs[0] + + if self.config.tie_word_embeddings: + shared_embedding = module.model.variables["params"]["shared"]["embedding"] + lm_logits = module.lm_head.apply({"params": {"kernel": shared_embedding.T}}, hidden_states) + else: + lm_logits = module.lm_head(hidden_states) + + lm_logits += module.final_logits_bias + return lm_logits, outputs + + outputs = self.module.apply( + inputs, + decoder_input_ids=jnp.array(decoder_input_ids, dtype="i4"), + decoder_attention_mask=jnp.array(decoder_attention_mask, dtype="i4"), + decoder_position_ids=jnp.array(decoder_position_ids, dtype="i4"), + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=jnp.array(encoder_attention_mask, dtype="i4"), + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + deterministic=not train, + rngs=rngs, + mutable=mutable, + method=_decoder_forward, + ) + + if past_key_values is None: + lm_logits, decoder_outputs = outputs + else: + (lm_logits, decoder_outputs), past = outputs + + if return_dict: + outputs = FlaxCausalLMOutputWithCrossAttentions( + logits=lm_logits, + hidden_states=decoder_outputs.hidden_states, + attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + ) + else: + outputs = (lm_logits,) + decoder_outputs[1:] + + # add updated cache to model output + if past_key_values is not None and return_dict: + outputs["past_key_values"] = unfreeze(past["cache"]) + return outputs + elif past_key_values is not None and not return_dict: + outputs = outputs[:1] + (unfreeze(past["cache"]),) + outputs[1:] + + return outputs + + def prepare_inputs_for_generation( + self, + decoder_input_ids, + max_length, + attention_mask: Optional[jax.Array] = None, + decoder_attention_mask: Optional[jax.Array] = None, + encoder_outputs=None, + **kwargs, + ): + # initializing the cache + batch_size, seq_length = decoder_input_ids.shape + + past_key_values = self.init_cache(batch_size, max_length, encoder_outputs) + # Note that usually one would have to put 0's in the attention_mask for x > input_ids.shape[-1] and x < cache_length. + # But since the decoder uses a causal mask, those positions are masked anyways. + # Thus we can create a single static attention_mask here, which is more efficient for compilation + extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4") + if decoder_attention_mask is not None: + position_ids = decoder_attention_mask.cumsum(axis=-1) - 1 + extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, decoder_attention_mask, (0, 0)) + else: + position_ids = jnp.broadcast_to(jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length)) + + return { + "past_key_values": past_key_values, + "encoder_outputs": encoder_outputs, + "encoder_attention_mask": attention_mask, + "decoder_attention_mask": extended_attention_mask, + "decoder_position_ids": position_ids, + } + + def update_inputs_for_generation(self, model_outputs, model_kwargs): + model_kwargs["past_key_values"] = model_outputs.past_key_values + model_kwargs["decoder_position_ids"] = model_kwargs["decoder_position_ids"][:, -1:] + 1 + return model_kwargs + + +FLAX_BLENDERBOT_CONDITIONAL_GENERATION_DOCSTRING = r""" + Returns: + + Conversation example:: + + ```py + >>> from transformers import AutoTokenizer, FlaxBlenderbotForConditionalGeneration + + >>> model = FlaxBlenderbotForConditionalGeneration.from_pretrained("facebook/blenderbot-400M-distill") + >>> tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill") + + >>> UTTERANCE = "My friends are cool but they eat too many carbs." + >>> inputs = tokenizer([UTTERANCE], max_length=1024, return_tensors="np") + + >>> # Generate Reply + >>> reply_ids = model.generate(inputs["input_ids"], num_beams=4, max_length=5, early_stopping=True).sequences + >>> print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in reply_ids]) + ``` +""" + +overwrite_call_docstring( + FlaxBlenderbotForConditionalGeneration, + BLENDERBOT_INPUTS_DOCSTRING + FLAX_BLENDERBOT_CONDITIONAL_GENERATION_DOCSTRING, +) +append_replace_return_docstrings( + FlaxBlenderbotForConditionalGeneration, output_type=FlaxSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC +) + + +__all__ = ["FlaxBlenderbotForConditionalGeneration", "FlaxBlenderbotModel", "FlaxBlenderbotPreTrainedModel"] diff --git a/docs/transformers/build/lib/transformers/models/blenderbot/modeling_tf_blenderbot.py b/docs/transformers/build/lib/transformers/models/blenderbot/modeling_tf_blenderbot.py new file mode 100644 index 0000000000000000000000000000000000000000..f3476cb925b6b4c056ddcf8470b66c2b6b50e9b3 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/blenderbot/modeling_tf_blenderbot.py @@ -0,0 +1,1558 @@ +# coding=utf-8 +# Copyright 2021 The Facebook, Inc and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""TF 2.0 Blenderbot model.""" + +from __future__ import annotations + +import os +import random +import warnings +from typing import List, Optional, Tuple, Union + +import tensorflow as tf + +from ...activations_tf import get_tf_activation +from ...modeling_tf_outputs import ( + TFBaseModelOutput, + TFBaseModelOutputWithPastAndCrossAttentions, + TFSeq2SeqLMOutput, + TFSeq2SeqModelOutput, +) + +# Public API +from ...modeling_tf_utils import ( + TFCausalLanguageModelingLoss, + TFPreTrainedModel, + keras, + keras_serializable, + unpack_inputs, +) +from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax +from ...utils import ( + add_code_sample_docstrings, + add_end_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from .configuration_blenderbot import BlenderbotConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "facebook/blenderbot-400M-distill" +_CONFIG_FOR_DOC = "BlenderbotConfig" + + +LARGE_NEGATIVE = -1e8 + + +# Copied from transformers.models.bart.modeling_tf_bart.shift_tokens_right +def shift_tokens_right(input_ids: tf.Tensor, pad_token_id: int, decoder_start_token_id: int): + pad_token_id = tf.cast(pad_token_id, input_ids.dtype) + decoder_start_token_id = tf.cast(decoder_start_token_id, input_ids.dtype) + start_tokens = tf.fill( + (shape_list(input_ids)[0], 1), tf.convert_to_tensor(decoder_start_token_id, input_ids.dtype) + ) + shifted_input_ids = tf.concat([start_tokens, input_ids[:, :-1]], -1) + # replace possible -100 values in labels by `pad_token_id` + shifted_input_ids = tf.where( + shifted_input_ids == -100, + tf.fill(shape_list(shifted_input_ids), tf.convert_to_tensor(pad_token_id, input_ids.dtype)), + shifted_input_ids, + ) + + # "Verify that `labels` has only positive values and -100" + assert_gte0 = tf.debugging.assert_greater_equal(shifted_input_ids, tf.constant(0, dtype=input_ids.dtype)) + + # Make sure the assertion op is called by wrapping the result in an identity no-op + with tf.control_dependencies([assert_gte0]): + shifted_input_ids = tf.identity(shifted_input_ids) + + return shifted_input_ids + + +# Copied from transformers.models.bart.modeling_tf_bart._make_causal_mask +def _make_causal_mask(input_ids_shape: tf.TensorShape, past_key_values_length: int = 0): + """ + Make causal mask used for bi-directional self-attention. + """ + bsz = input_ids_shape[0] + tgt_len = input_ids_shape[1] + mask = tf.ones((tgt_len, tgt_len)) * LARGE_NEGATIVE + mask_cond = tf.range(shape_list(mask)[-1]) + + mask = tf.where(mask_cond < tf.reshape(mask_cond + 1, (shape_list(mask)[-1], 1)), 0.0, mask) + + if past_key_values_length > 0: + mask = tf.concat([tf.zeros((tgt_len, past_key_values_length)), mask], axis=-1) + + return tf.tile(mask[None, None, :, :], (bsz, 1, 1, 1)) + + +# Copied from transformers.models.bart.modeling_tf_bart._expand_mask +def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. + """ + src_len = shape_list(mask)[1] + tgt_len = tgt_len if tgt_len is not None else src_len + one_cst = tf.constant(1.0) + mask = tf.cast(mask, dtype=one_cst.dtype) + expanded_mask = tf.tile(mask[:, None, None, :], (1, 1, tgt_len, 1)) + + return (one_cst - expanded_mask) * LARGE_NEGATIVE + + +class TFBlenderbotLearnedPositionalEmbedding(keras.layers.Embedding): + """ + This module learns positional embeddings up to a fixed maximum size. + """ + + def __init__(self, num_embeddings: int, embedding_dim: int, **kwargs): + super().__init__(num_embeddings, embedding_dim, **kwargs) + + def call( + self, input_shape: tf.TensorShape, past_key_values_length: int = 0, position_ids: tf.Tensor | None = None + ): + """Input is expected to be of size [bsz x seqlen].""" + if position_ids is None: + seq_len = input_shape[1] + position_ids = tf.range(seq_len, delta=1, name="range") + position_ids += past_key_values_length + + return super().call(tf.cast(position_ids, dtype=tf.int32)) + + +# Copied from transformers.models.bart.modeling_tf_bart.TFBartAttention with Bart->Blenderbot +class TFBlenderbotAttention(keras.layers.Layer): + """Multi-headed attention from "Attention Is All You Need""" + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + is_decoder: bool = False, + bias: bool = True, + **kwargs, + ): + super().__init__(**kwargs) + self.embed_dim = embed_dim + + self.num_heads = num_heads + self.dropout = keras.layers.Dropout(dropout) + self.head_dim = embed_dim // num_heads + if (self.head_dim * num_heads) != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" + f" and `num_heads`: {num_heads})." + ) + self.scaling = self.head_dim**-0.5 + self.is_decoder = is_decoder + + self.k_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj") + self.q_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj") + self.v_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj") + self.out_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj") + + def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int): + return tf.transpose(tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)), (0, 2, 1, 3)) + + def call( + self, + hidden_states: tf.Tensor, + key_value_states: tf.Tensor | None = None, + past_key_value: Tuple[Tuple[tf.Tensor]] | None = None, + attention_mask: tf.Tensor | None = None, + layer_head_mask: tf.Tensor | None = None, + training: Optional[bool] = False, + ) -> Tuple[tf.Tensor, tf.Tensor | None]: + """Input shape: Batch x Time x Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + bsz, tgt_len, embed_dim = shape_list(hidden_states) + + # get query proj + query_states = self.q_proj(hidden_states) * self.scaling + # get key, value proj + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_states = past_key_value[0] + value_states = past_key_value[1] + elif is_cross_attention: + # cross_attentions + key_states = self._shape(self.k_proj(key_value_states), -1, bsz) + value_states = self._shape(self.v_proj(key_value_states), -1, bsz) + elif past_key_value is not None: + # reuse k, v, self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + key_states = tf.concat([past_key_value[0], key_states], axis=2) + value_states = tf.concat([past_key_value[1], value_states], axis=2) + else: + # self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + if self.is_decoder: + # if cross_attention save Tuple(tf.Tensor, tf.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(tf.Tensor, tf.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_states, value_states) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = tf.reshape(self._shape(query_states, tgt_len, bsz), proj_shape) + key_states = tf.reshape(key_states, proj_shape) + value_states = tf.reshape(value_states, proj_shape) + + src_len = shape_list(key_states)[1] + attn_weights = tf.matmul(query_states, key_states, transpose_b=True) + + tf.debugging.assert_equal( + shape_list(attn_weights), + [bsz * self.num_heads, tgt_len, src_len], + message=( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" + f" {shape_list(attn_weights)}" + ), + ) + + if attention_mask is not None: + tf.debugging.assert_equal( + shape_list(attention_mask), + [bsz, 1, tgt_len, src_len], + message=( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is" + f" {shape_list(attention_mask)}" + ), + ) + + attention_mask = tf.cast(attention_mask, dtype=attn_weights.dtype) + attn_weights = tf.reshape(attn_weights, (bsz, self.num_heads, tgt_len, src_len)) + attention_mask + attn_weights = tf.reshape(attn_weights, (bsz * self.num_heads, tgt_len, src_len)) + + attn_weights = stable_softmax(attn_weights, axis=-1) + + if layer_head_mask is not None: + tf.debugging.assert_equal( + shape_list(layer_head_mask), + [self.num_heads], + message=( + f"Head mask for a single layer should be of size {(self.num_heads)}, but is" + f" {shape_list(layer_head_mask)}" + ), + ) + + attn_weights = tf.reshape(layer_head_mask, (1, -1, 1, 1)) * tf.reshape( + attn_weights, (bsz, self.num_heads, tgt_len, src_len) + ) + attn_weights = tf.reshape(attn_weights, (bsz * self.num_heads, tgt_len, src_len)) + + attn_probs = self.dropout(attn_weights, training=training) + attn_output = tf.matmul(attn_probs, value_states) + + tf.debugging.assert_equal( + shape_list(attn_output), + [bsz * self.num_heads, tgt_len, self.head_dim], + message=( + f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is" + f" {shape_list(attn_output)}" + ), + ) + + attn_output = tf.transpose( + tf.reshape(attn_output, (bsz, self.num_heads, tgt_len, self.head_dim)), (0, 2, 1, 3) + ) + attn_output = tf.reshape(attn_output, (bsz, tgt_len, embed_dim)) + + attn_output = self.out_proj(attn_output) + attn_weights: tf.Tensor = tf.reshape(attn_weights, (bsz, self.num_heads, tgt_len, src_len)) + + return attn_output, attn_weights, past_key_value + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "k_proj", None) is not None: + with tf.name_scope(self.k_proj.name): + self.k_proj.build([None, None, self.embed_dim]) + if getattr(self, "q_proj", None) is not None: + with tf.name_scope(self.q_proj.name): + self.q_proj.build([None, None, self.embed_dim]) + if getattr(self, "v_proj", None) is not None: + with tf.name_scope(self.v_proj.name): + self.v_proj.build([None, None, self.embed_dim]) + if getattr(self, "out_proj", None) is not None: + with tf.name_scope(self.out_proj.name): + self.out_proj.build([None, None, self.embed_dim]) + + +# Copied from transformers.models.mbart.modeling_tf_mbart.TFMBartEncoderLayer with MBart->Blenderbot +class TFBlenderbotEncoderLayer(keras.layers.Layer): + def __init__(self, config: BlenderbotConfig, **kwargs): + super().__init__(**kwargs) + self.embed_dim = config.d_model + self.self_attn = TFBlenderbotAttention( + self.embed_dim, config.encoder_attention_heads, dropout=config.attention_dropout, name="self_attn" + ) + self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") + self.dropout = keras.layers.Dropout(config.dropout) + self.activation_fn = get_tf_activation(config.activation_function) + self.activation_dropout = keras.layers.Dropout(config.activation_dropout) + self.fc1 = keras.layers.Dense(config.encoder_ffn_dim, name="fc1") + self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2") + self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") + self.config = config + + def call( + self, + hidden_states: tf.Tensor, + attention_mask: tf.Tensor, + layer_head_mask: tf.Tensor, + training: Optional[bool] = False, + ): + """ + Args: + hidden_states (`tf.Tensor`): input to the layer of shape *(batch, seq_len, embed_dim)* + attention_mask (`tf.Tensor`): attention mask of size + *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values. + layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size + *(encoder_attention_heads,)* + """ + residual = hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + hidden_states, self_attn_weights, _ = self.self_attn( + hidden_states=hidden_states, attention_mask=attention_mask, layer_head_mask=layer_head_mask + ) + + tf.debugging.assert_equal( + shape_list(hidden_states), + shape_list(residual), + message=f"Self attn modified the shape of query {shape_list(residual)} to {shape_list(hidden_states)}", + ) + + hidden_states = self.dropout(hidden_states, training=training) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.final_layer_norm(hidden_states) + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = self.activation_dropout(hidden_states, training=training) + hidden_states = self.fc2(hidden_states) + hidden_states = self.dropout(hidden_states, training=training) + hidden_states = residual + hidden_states + + return hidden_states, self_attn_weights + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self_attn", None) is not None: + with tf.name_scope(self.self_attn.name): + self.self_attn.build(None) + if getattr(self, "self_attn_layer_norm", None) is not None: + with tf.name_scope(self.self_attn_layer_norm.name): + self.self_attn_layer_norm.build([None, None, self.embed_dim]) + if getattr(self, "fc1", None) is not None: + with tf.name_scope(self.fc1.name): + self.fc1.build([None, None, self.embed_dim]) + if getattr(self, "fc2", None) is not None: + with tf.name_scope(self.fc2.name): + self.fc2.build([None, None, self.config.encoder_ffn_dim]) + if getattr(self, "final_layer_norm", None) is not None: + with tf.name_scope(self.final_layer_norm.name): + self.final_layer_norm.build([None, None, self.embed_dim]) + + +# Copied from transformers.models.mbart.modeling_tf_mbart.TFMBartDecoderLayer with MBart->Blenderbot +class TFBlenderbotDecoderLayer(keras.layers.Layer): + def __init__(self, config: BlenderbotConfig, **kwargs): + super().__init__(**kwargs) + self.embed_dim = config.d_model + self.self_attn = TFBlenderbotAttention( + embed_dim=self.embed_dim, + num_heads=config.decoder_attention_heads, + dropout=config.attention_dropout, + name="self_attn", + is_decoder=True, + ) + self.dropout = keras.layers.Dropout(config.dropout) + self.activation_fn = get_tf_activation(config.activation_function) + self.activation_dropout = keras.layers.Dropout(config.activation_dropout) + + self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") + self.encoder_attn = TFBlenderbotAttention( + self.embed_dim, + config.decoder_attention_heads, + dropout=config.attention_dropout, + name="encoder_attn", + is_decoder=True, + ) + self.encoder_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="encoder_attn_layer_norm") + self.fc1 = keras.layers.Dense(config.decoder_ffn_dim, name="fc1") + self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2") + self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") + self.config = config + + def call( + self, + hidden_states: tf.Tensor, + attention_mask: tf.Tensor | None = None, + encoder_hidden_states: tf.Tensor | None = None, + encoder_attention_mask: tf.Tensor | None = None, + layer_head_mask: tf.Tensor | None = None, + cross_attn_layer_head_mask: tf.Tensor | None = None, + past_key_value: Tuple[tf.Tensor] | None = None, + training: Optional[bool] = False, + ) -> Tuple[tf.Tensor, tf.Tensor, Tuple[Tuple[tf.Tensor]]]: + """ + Args: + hidden_states (`tf.Tensor`): input to the layer of shape *(batch, seq_len, embed_dim)* + attention_mask (`tf.Tensor`): attention mask of size + *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values. + encoder_hidden_states (`tf.Tensor`): + cross attention input to the layer of shape *(batch, seq_len, embed_dim)* + encoder_attention_mask (`tf.Tensor`): encoder attention mask of size + *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values. + layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size + *(decoder_attention_heads,)* + cross_attn_layer_head_mask (`tf.Tensor`): mask for heads of the cross-attention module. + *(decoder_attention_heads,)* + past_key_value (`Tuple(tf.Tensor)`): cached past key and value projection states + """ + residual = hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + # Self Attention + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + # add present self-attn cache to positions 1,2 of present_key_value tuple + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + past_key_value=self_attn_past_key_value, + attention_mask=attention_mask, + layer_head_mask=layer_head_mask, + ) + hidden_states = self.dropout(hidden_states, training=training) + hidden_states = residual + hidden_states + + # Cross-Attention Block + cross_attn_present_key_value = None + cross_attn_weights = None + if encoder_hidden_states is not None: + residual = hidden_states + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn( + hidden_states=hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + layer_head_mask=cross_attn_layer_head_mask, + past_key_value=cross_attn_past_key_value, + ) + hidden_states = self.dropout(hidden_states, training=training) + hidden_states = residual + hidden_states + + # add cross-attn to positions 3,4 of present_key_value tuple + present_key_value = present_key_value + cross_attn_present_key_value + + # Fully Connected + residual = hidden_states + hidden_states = self.final_layer_norm(hidden_states) + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = self.activation_dropout(hidden_states, training=training) + hidden_states = self.fc2(hidden_states) + hidden_states = self.dropout(hidden_states, training=training) + hidden_states = residual + hidden_states + + return ( + hidden_states, + self_attn_weights, + cross_attn_weights, + present_key_value, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self_attn", None) is not None: + with tf.name_scope(self.self_attn.name): + self.self_attn.build(None) + if getattr(self, "self_attn_layer_norm", None) is not None: + with tf.name_scope(self.self_attn_layer_norm.name): + self.self_attn_layer_norm.build([None, None, self.embed_dim]) + if getattr(self, "encoder_attn", None) is not None: + with tf.name_scope(self.encoder_attn.name): + self.encoder_attn.build(None) + if getattr(self, "encoder_attn_layer_norm", None) is not None: + with tf.name_scope(self.encoder_attn_layer_norm.name): + self.encoder_attn_layer_norm.build([None, None, self.embed_dim]) + if getattr(self, "fc1", None) is not None: + with tf.name_scope(self.fc1.name): + self.fc1.build([None, None, self.embed_dim]) + if getattr(self, "fc2", None) is not None: + with tf.name_scope(self.fc2.name): + self.fc2.build([None, None, self.config.decoder_ffn_dim]) + if getattr(self, "final_layer_norm", None) is not None: + with tf.name_scope(self.final_layer_norm.name): + self.final_layer_norm.build([None, None, self.embed_dim]) + + +class TFBlenderbotPreTrainedModel(TFPreTrainedModel): + config_class = BlenderbotConfig + base_model_prefix = "model" + + +BLENDERBOT_START_DOCSTRING = r""" + This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and + behavior. + + + + TensorFlow models and layers in `transformers` accept two formats as input: + + - having all inputs as keyword arguments (like PyTorch models), or + - having all inputs as a list, tuple or dict in the first positional argument. + + The reason the second format is supported is that Keras methods prefer this format when passing inputs to models + and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just + pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second + format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with + the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first + positional argument: + + - a single Tensor with `input_ids` only and nothing else: `model(input_ids)` + - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: + `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])` + - a dictionary with one or several input Tensors associated to the input names given in the docstring: + `model({"input_ids": input_ids, "token_type_ids": token_type_ids})` + + Note that when creating models and layers with + [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry + about any of this, as you can just pass inputs like you would to any other Python function! + + + + Args: + config ([`BlenderbotConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights. +""" + +BLENDERBOT_GENERATION_EXAMPLE = r""" + Conversation example:: + + ```py + >>> from transformers import AutoTokenizer, TFBlenderbotForConditionalGeneration + + >>> mname = "facebook/blenderbot-400M-distill" + >>> model = TFBlenderbotForConditionalGeneration.from_pretrained(mname) + >>> tokenizer = AutoTokenizer.from_pretrained(mname) + >>> UTTERANCE = "My friends are cool but they eat too many carbs." + >>> print("Human: ", UTTERANCE) + + >>> inputs = tokenizer([UTTERANCE], return_tensors="tf") + >>> reply_ids = model.generate(**inputs) + >>> print("Bot: ", tokenizer.batch_decode(reply_ids, skip_special_tokens=True)[0]) + + >>> REPLY = "I'm not sure" + >>> print("Human: ", REPLY) + >>> NEXT_UTTERANCE = ( + ... "My friends are cool but they eat too many carbs. That's unfortunate. " + ... "Are they trying to lose weight or are they just trying to be healthier? " + ... " I'm not sure." + ... ) + >>> inputs = tokenizer([NEXT_UTTERANCE], return_tensors="tf") + >>> next_reply_ids = model.generate(**inputs) + >>> print("Bot: ", tokenizer.batch_decode(next_reply_ids, skip_special_tokens=True)[0]) + ``` +""" + +BLENDERBOT_INPUTS_DOCSTRING = r""" + Args: + input_ids (`tf.Tensor` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`tf.Tensor` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + decoder_input_ids (`tf.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Indices of decoder input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are decoder input IDs?](../glossary#decoder-input-ids) + + Blenderbot uses the `bos_token_id` as the starting token for `decoder_input_ids` generation. If + `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + decoder_attention_mask (`tf.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*): + will be made by default and ignore pad tokens. It is not recommended to set this for most use cases. + decoder_position_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the + range `[0, config.max_position_embeddings - 1]`. + head_mask (`tf.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + decoder_head_mask (`tf.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + cross_attn_head_mask (`tf.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + encoder_outputs (`tf.FloatTensor`, *optional*): + hidden states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. + of shape `(batch_size, sequence_length, hidden_size)` is a sequence of + past_key_values (`Tuple[Tuple[tf.Tensor]]` of length `config.n_layers`) + contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*, defaults to `True`): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). Set to `False` during training, `True` during generation + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the + config will be used instead. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. This argument can be used only in eager mode, in graph mode the value in the config will be + used instead. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in + eager mode, in graph mode the value will always be set to True. + training (`bool`, *optional*, defaults to `False`): + Whether or not to use the model in training mode (some modules like dropout modules have different + behaviors between training and evaluation). +""" + + +@keras_serializable +class TFBlenderbotEncoder(keras.layers.Layer): + config_class = BlenderbotConfig + """ + Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a + [`TFBlenderbotEncoderLayer`]. + + Args: + config: BlenderbotConfig + """ + + def __init__(self, config: BlenderbotConfig, embed_tokens: Optional[keras.layers.Embedding] = None, **kwargs): + super().__init__(**kwargs) + self.config = config + self.dropout = keras.layers.Dropout(config.dropout) + self.layerdrop = config.encoder_layerdrop + self.padding_idx = config.pad_token_id + self.max_source_positions = config.max_position_embeddings + self.embed_scale = tf.math.sqrt(float(config.d_model)) if config.scale_embedding else 1.0 + + self.embed_tokens = embed_tokens + self.embed_positions = TFBlenderbotLearnedPositionalEmbedding( + config.max_position_embeddings, + config.d_model, + name="embed_positions", + ) + self.layers = [TFBlenderbotEncoderLayer(config, name=f"layers.{i}") for i in range(config.encoder_layers)] + self.layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm") + + def get_embed_tokens(self): + return self.embed_tokens + + def set_embed_tokens(self, embed_tokens): + self.embed_tokens = embed_tokens + + @unpack_inputs + def call( + self, + input_ids=None, + inputs_embeds=None, + attention_mask=None, + head_mask=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + training=False, + ): + """ + Args: + input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + head_mask (`tf.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, `optional): + Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. This argument can be used only in eager mode, in graph mode the value + in the config will be used instead. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. This argument can be used only in eager mode, in graph mode the value in the config + will be used instead. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used + in eager mode, in graph mode the value will always be set to True. + training (`bool`, *optional*, defaults to `False`): + Whether or not to use the model in training mode (some modules like dropout modules have different + behaviors between training and evaluation). + """ + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = shape_list(input_ids) + elif inputs_embeds is not None: + input_shape = shape_list(inputs_embeds)[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if inputs_embeds is None: + check_embeddings_within_bounds(input_ids, self.embed_tokens.input_dim) + inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale + + embed_pos = self.embed_positions(input_shape) + hidden_states = inputs_embeds + embed_pos + hidden_states = self.dropout(hidden_states, training=training) + + # check attention mask and invert + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _expand_mask(attention_mask) + else: + attention_mask = None + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + # check if head_mask has a correct number of layers specified if desired + if head_mask is not None: + tf.debugging.assert_equal( + shape_list(head_mask)[0], + len(self.layers), + message=( + f"The head_mask should be specified for {len(self.layers)} layers, but it is for" + f" {shape_list(head_mask)[0]}." + ), + ) + + # encoder layers + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + dropout_probability = random.uniform(0, 1) + if training and (dropout_probability < self.layerdrop): # skip the layer + continue + + hidden_states, attn = encoder_layer( + hidden_states, + attention_mask, + head_mask[idx] if head_mask is not None else None, + ) + + if output_attentions: + all_attentions += (attn,) + + hidden_states = self.layer_norm(hidden_states) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return TFBaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embed_positions", None) is not None: + with tf.name_scope(self.embed_positions.name): + self.embed_positions.build(None) + if getattr(self, "layer_norm", None) is not None: + with tf.name_scope(self.layer_norm.name): + self.layer_norm.build([None, None, self.config.d_model]) + if getattr(self, "layers", None) is not None: + for layer in self.layers: + with tf.name_scope(layer.name): + layer.build(None) + + +@keras_serializable +class TFBlenderbotDecoder(keras.layers.Layer): + config_class = BlenderbotConfig + """ + Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`TFBlenderbotDecoderLayer`] + + Args: + config: BlenderbotConfig + embed_tokens: output embedding + """ + + def __init__(self, config: BlenderbotConfig, embed_tokens: Optional[keras.layers.Embedding] = None, **kwargs): + super().__init__(**kwargs) + self.config = config + self.padding_idx = config.pad_token_id + self.embed_tokens = embed_tokens + self.layerdrop = config.decoder_layerdrop + self.embed_positions = TFBlenderbotLearnedPositionalEmbedding( + config.max_position_embeddings, + config.d_model, + name="embed_positions", + ) + self.embed_scale = tf.math.sqrt(float(config.d_model)) if config.scale_embedding else 1.0 + self.layers = [TFBlenderbotDecoderLayer(config, name=f"layers.{i}") for i in range(config.decoder_layers)] + self.layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm") + + self.dropout = keras.layers.Dropout(config.dropout) + + def get_embed_tokens(self): + return self.embed_tokens + + def set_embed_tokens(self, embed_tokens): + self.embed_tokens = embed_tokens + + @unpack_inputs + def call( + self, + input_ids=None, + inputs_embeds=None, + attention_mask=None, + position_ids=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + head_mask=None, + cross_attn_head_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + training=False, + ): + r""" + Args: + input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + position_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the + range `[0, config.max_position_embeddings - 1]`. + encoder_hidden_states (`tf.Tensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + of the decoder. + encoder_attention_mask (`tf.Tensor` of shape `(batch_size, encoder_sequence_length)`, *optional*): + Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values + selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + head_mask (`tf.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + cross_attn_head_mask (`tf.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + past_key_values (`Tuple[Tuple[tf.Tensor]]` of length `config.n_layers` with each tuple having 2 tuples each of which has 2 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up + decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those + that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of + all `decoder_input_ids` of shape `(batch_size, sequence_length)`. + inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. This argument can be used only in eager mode, in graph mode the value + in the config will be used instead. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. This argument can be used only in eager mode, in graph mode the value in the config + will be used instead. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used + in eager mode, in graph mode the value will always be set to True. + training (`bool`, *optional*, defaults to `False`): + Whether or not to use the model in training mode (some modules like dropout modules have different + behaviors between training and evaluation). + """ + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + input_shape = shape_list(input_ids) + elif inputs_embeds is not None: + input_shape = shape_list(inputs_embeds)[:-1] + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + past_key_values_length = shape_list(past_key_values[0][0])[2] if past_key_values is not None else 0 + + # embed positions + if position_ids is None: + positions = self.embed_positions(input_shape, past_key_values_length) + else: + positions = self.embed_positions(input_shape, position_ids=position_ids) + + if inputs_embeds is None: + check_embeddings_within_bounds(input_ids, self.embed_tokens.input_dim) + inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale + + hidden_states = inputs_embeds + + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + if input_shape[-1] > 1: + combined_attention_mask = _make_causal_mask(input_shape, past_key_values_length=past_key_values_length) + else: + combined_attention_mask = _expand_mask( + tf.ones((input_shape[0], input_shape[1] + past_key_values_length)), tgt_len=input_shape[-1] + ) + + if attention_mask is not None: + combined_attention_mask = combined_attention_mask + _expand_mask(attention_mask, tgt_len=input_shape[-1]) + + if encoder_hidden_states is not None and encoder_attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + encoder_attention_mask = _expand_mask(encoder_attention_mask, tgt_len=input_shape[-1]) + + hidden_states = hidden_states + positions + hidden_states = self.dropout(hidden_states, training=training) + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_cross_attns = () if (output_attentions and encoder_hidden_states is not None) else None + present_key_values = () if use_cache else None + + # check if head_mask and cross_attn_head_mask have a correct number of layers specified if desired + for attn_mask_name, attn_mask in [("head_mask", head_mask), ("cross_attn_head_mask", cross_attn_head_mask)]: + if attn_mask is not None: + tf.debugging.assert_equal( + shape_list(attn_mask)[0], + len(self.layers), + message=( + f"The {attn_mask_name} should be specified for {len(self.layers)} layers, but it is for" + f" {shape_list(attn_mask)[0]}." + ), + ) + for idx, decoder_layer in enumerate(self.layers): + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + if output_hidden_states: + all_hidden_states += (hidden_states,) + dropout_probability = random.uniform(0, 1) + + if training and (dropout_probability < self.layerdrop): + continue + + past_key_value = past_key_values[idx] if past_key_values is not None else None + + hidden_states, layer_self_attn, layer_cross_attn, present_key_value = decoder_layer( + hidden_states, + attention_mask=combined_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + layer_head_mask=head_mask[idx] if head_mask is not None else None, + cross_attn_layer_head_mask=cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None, + past_key_value=past_key_value, + ) + + if use_cache: + present_key_values += (present_key_value,) + + if output_attentions: + all_self_attns += (layer_self_attn,) + + if encoder_hidden_states is not None: + all_cross_attns += (layer_cross_attn,) + + hidden_states = self.layer_norm(hidden_states) + + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if not return_dict: + return hidden_states, present_key_values, all_hidden_states, all_self_attns, all_cross_attns + else: + return TFBaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=present_key_values, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attns, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embed_positions", None) is not None: + with tf.name_scope(self.embed_positions.name): + self.embed_positions.build(None) + if getattr(self, "layer_norm", None) is not None: + with tf.name_scope(self.layer_norm.name): + self.layer_norm.build([None, None, self.config.d_model]) + if getattr(self, "layers", None) is not None: + for layer in self.layers: + with tf.name_scope(layer.name): + layer.build(None) + + +@keras_serializable +class TFBlenderbotMainLayer(keras.layers.Layer): + config_class = BlenderbotConfig + + def __init__(self, config: BlenderbotConfig, **kwargs): + super().__init__(**kwargs) + + self.config = config + self.shared = keras.layers.Embedding( + input_dim=config.vocab_size, + output_dim=config.d_model, + embeddings_initializer=keras.initializers.TruncatedNormal(stddev=self.config.init_std), + name="model.shared", + ) + # Additional attribute to specify the expected name scope of the layer (for loading/storing weights) + self.shared.load_weight_prefix = "model.shared" + + self.encoder = TFBlenderbotEncoder(config, self.shared, name="encoder") + self.decoder = TFBlenderbotDecoder(config, self.shared, name="decoder") + + def get_input_embeddings(self): + return self.shared + + def set_input_embeddings(self, new_embeddings): + self.shared = new_embeddings + self.encoder.embed_tokens = self.shared + self.decoder.embed_tokens = self.shared + + @unpack_inputs + def call( + self, + input_ids=None, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + decoder_position_ids=None, + head_mask=None, + decoder_head_mask=None, + cross_attn_head_mask=None, + encoder_outputs: Optional[Union[Tuple, TFBaseModelOutput]] = None, + past_key_values=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + training=False, + **kwargs, + ): + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + if encoder_outputs is None: + encoder_outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a TFBaseModelOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, TFBaseModelOutput): + encoder_outputs = TFBaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + # If the user passed a TFBaseModelOutput for encoder_outputs, we wrap it in a tuple when return_dict=False + elif not return_dict and not isinstance(encoder_outputs, tuple): + encoder_outputs = encoder_outputs.to_tuple() + + decoder_outputs = self.decoder( + decoder_input_ids, + attention_mask=decoder_attention_mask, + position_ids=decoder_position_ids, + encoder_hidden_states=encoder_outputs[0], + encoder_attention_mask=attention_mask, + head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + if not return_dict: + return decoder_outputs + encoder_outputs + + return TFSeq2SeqModelOutput( + last_hidden_state=decoder_outputs.last_hidden_state, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + # The shared/tied weights expect to be in the model base namespace + # Adding "/" to the end (not the start!) of a tf.name_scope puts it in the root namespace rather than + # the current one. + with tf.name_scope(self.shared.load_weight_prefix + "/" + self.shared.name + "/"): + self.shared.build(None) + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "decoder", None) is not None: + with tf.name_scope(self.decoder.name): + self.decoder.build(None) + + +@add_start_docstrings( + "The bare BLENDERBOT Model outputting raw hidden-states without any specific head on top.", + BLENDERBOT_START_DOCSTRING, +) +class TFBlenderbotModel(TFBlenderbotPreTrainedModel): + def __init__(self, config: BlenderbotConfig, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.model = TFBlenderbotMainLayer(config, name="model") + + def get_encoder(self): + return self.model.encoder + + def get_decoder(self): + return self.model.decoder + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], *model_args, **kwargs): + if pretrained_model_name_or_path == "facebook/blenderbot-90M": + from ..blenderbot_small import TFBlenderbotSmallModel + + warnings.warn( + "The checkpoint `facebook/blenderbot-90M` is deprecated. In the future, please use the identical" + " checkpoint `facebook/small_blenderbot-90M` with" + " `TFBlenderbotSmallForConditionalGeneration.from_pretrained('facebook/small_blenderbot-90M')`" + " instead.", + FutureWarning, + ) + return TFBlenderbotSmallModel.from_pretrained(pretrained_model_name_or_path) + + return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + + @unpack_inputs + @add_start_docstrings_to_model_forward(BLENDERBOT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFSeq2SeqModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: tf.Tensor | None = None, + attention_mask: tf.Tensor | None = None, + decoder_input_ids: tf.Tensor | None = None, + decoder_attention_mask: tf.Tensor | None = None, + decoder_position_ids: tf.Tensor | None = None, + head_mask: tf.Tensor | None = None, + decoder_head_mask: tf.Tensor | None = None, + cross_attn_head_mask: tf.Tensor | None = None, + encoder_outputs: Optional[Union[Tuple, TFBaseModelOutput]] = None, + past_key_values: List[tf.Tensor] | None = None, + inputs_embeds: tf.Tensor | None = None, + decoder_inputs_embeds: tf.Tensor | None = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: Optional[bool] = False, + **kwargs, + ) -> Union[Tuple[tf.Tensor], TFSeq2SeqModelOutput]: + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + decoder_position_ids=decoder_position_ids, + head_mask=head_mask, + decoder_head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + encoder_outputs=encoder_outputs, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + return outputs + + # Copied from transformers.models.bart.modeling_tf_bart.TFBartModel.serving_output + def serving_output(self, output): + pkv = tf.tuple(output.past_key_values)[1] if self.config.use_cache else None + dec_hs = tf.convert_to_tensor(output.decoder_hidden_states) if self.config.output_hidden_states else None + dec_attns = tf.convert_to_tensor(output.decoder_attentions) if self.config.output_attentions else None + cross_attns = tf.convert_to_tensor(output.cross_attentions) if self.config.output_attentions else None + enc_hs = tf.convert_to_tensor(output.encoder_hidden_states) if self.config.output_hidden_states else None + enc_attns = tf.convert_to_tensor(output.encoder_attentions) if self.config.output_attentions else None + + return TFSeq2SeqModelOutput( + last_hidden_state=output.last_hidden_state, + past_key_values=pkv, + decoder_hidden_states=dec_hs, + decoder_attentions=dec_attns, + cross_attentions=cross_attns, + encoder_last_hidden_state=output.encoder_last_hidden_state, + encoder_hidden_states=enc_hs, + encoder_attentions=enc_attns, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "model", None) is not None: + with tf.name_scope(self.model.name): + self.model.build(None) + + +# Copied from transformers.models.bart.modeling_tf_bart.BiasLayer +class BiasLayer(keras.layers.Layer): + """ + Bias as a layer. It is used for serialization purposes: `keras.Model.save_weights` stores on a per-layer basis, + so all weights have to be registered in a layer. + """ + + def __init__(self, shape, initializer, trainable, name, **kwargs): + super().__init__(name=name, **kwargs) + # Note: the name of this variable will NOT be scoped when serialized, i.e. it will not be in the format of + # "outer_layer/inner_layer/.../name:0". Instead, it will be "name:0". For further details, see: + # https://github.com/huggingface/transformers/pull/18833#issuecomment-1233090214 + self.bias = self.add_weight(name=name, shape=shape, initializer=initializer, trainable=trainable) + + def call(self, x): + return x + self.bias + + +@add_start_docstrings( + "The BLENDERBOT Model with a language modeling head. Can be used for summarization.", + BLENDERBOT_START_DOCSTRING, +) +class TFBlenderbotForConditionalGeneration(TFBlenderbotPreTrainedModel, TFCausalLanguageModelingLoss): + _keys_to_ignore_on_load_unexpected = [ + r"model.encoder.embed_tokens.weight", + r"model.decoder.embed_tokens.weight", + ] + + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.model = TFBlenderbotMainLayer(config, name="model") + self.use_cache = config.use_cache + # final_bias_logits is registered as a buffer in pytorch, so not trainable for the sake of consistency. + self.bias_layer = BiasLayer( + name="final_logits_bias", shape=[1, config.vocab_size], initializer="zeros", trainable=False + ) + + def get_decoder(self): + return self.model.decoder + + def get_encoder(self): + return self.model.encoder + + def get_output_embeddings(self): + return self.get_input_embeddings() + + def set_output_embeddings(self, value): + self.set_input_embeddings(value) + + def get_bias(self): + return {"final_logits_bias": self.bias_layer.bias} + + def set_bias(self, value): + # Replaces the existing layers containing bias for correct (de)serialization. + vocab_size = value["final_logits_bias"].shape[-1] + self.bias_layer = BiasLayer( + name="final_logits_bias", shape=[1, vocab_size], initializer="zeros", trainable=False + ) + self.bias_layer.bias.assign(value["final_logits_bias"]) + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], *model_args, **kwargs): + if pretrained_model_name_or_path == "facebook/blenderbot-90M": + from ..blenderbot_small import TFBlenderbotSmallForConditionalGeneration + + warnings.warn( + "The checkpoint `facebook/blenderbot-90M` is deprecated. In the future, please use the identical" + " checkpoint `facebook/small_blenderbot-90M` with" + " `TFBlenderbotSmallForConditionalGeneration.from_pretrained('facebook/small_blenderbot-90M')`" + " instead.", + FutureWarning, + ) + return TFBlenderbotSmallForConditionalGeneration.from_pretrained(pretrained_model_name_or_path) + + return super().from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + + @unpack_inputs + @add_start_docstrings_to_model_forward(BLENDERBOT_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=TFSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC) + @add_end_docstrings(BLENDERBOT_GENERATION_EXAMPLE) + def call( + self, + input_ids: tf.Tensor | None = None, + attention_mask: tf.Tensor | None = None, + decoder_input_ids: tf.Tensor | None = None, + decoder_attention_mask: tf.Tensor | None = None, + decoder_position_ids: tf.Tensor | None = None, + head_mask: tf.Tensor | None = None, + decoder_head_mask: tf.Tensor | None = None, + cross_attn_head_mask: tf.Tensor | None = None, + encoder_outputs: Optional[Union[Tuple, TFBaseModelOutput]] = None, + past_key_values: List[tf.Tensor] | None = None, + inputs_embeds: tf.Tensor | None = None, + decoder_inputs_embeds: tf.Tensor | None = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: tf.Tensor | None = None, + training: Optional[bool] = False, + ) -> Union[Tuple[tf.Tensor], TFSeq2SeqLMOutput]: + r""" + labels (`tf.tensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Returns: + + """ + if labels is not None: + labels = tf.where( + labels == self.config.pad_token_id, + tf.cast(tf.fill(shape_list(labels), -100), labels.dtype), + labels, + ) + use_cache = False + if decoder_input_ids is None and decoder_inputs_embeds is None: + decoder_input_ids = shift_tokens_right( + labels, self.config.pad_token_id, self.config.decoder_start_token_id + ) + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + encoder_outputs=encoder_outputs, + decoder_attention_mask=decoder_attention_mask, + decoder_position_ids=decoder_position_ids, + head_mask=head_mask, + decoder_head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + lm_logits = tf.matmul(outputs[0], self.model.shared.weights, transpose_b=True) + lm_logits = self.bias_layer(lm_logits) + masked_lm_loss = None if labels is None else self.hf_compute_loss(labels, lm_logits) + + if not return_dict: + output = (lm_logits,) + outputs[1:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + return TFSeq2SeqLMOutput( + loss=masked_lm_loss, + logits=lm_logits, + past_key_values=outputs.past_key_values, # index 1 of d outputs + decoder_hidden_states=outputs.decoder_hidden_states, # index 2 of d outputs + decoder_attentions=outputs.decoder_attentions, # index 3 of d outputs + cross_attentions=outputs.cross_attentions, # index 4 of d outputs + encoder_last_hidden_state=outputs.encoder_last_hidden_state, # index 0 of encoder outputs + encoder_hidden_states=outputs.encoder_hidden_states, # 1 of e out + encoder_attentions=outputs.encoder_attentions, # 2 of e out + ) + + # Copied from transformers.models.bart.modeling_tf_bart.TFBartForConditionalGeneration.serving_output + def serving_output(self, output): + pkv = tf.tuple(output.past_key_values)[1] if self.config.use_cache else None + dec_hs = tf.convert_to_tensor(output.decoder_hidden_states) if self.config.output_hidden_states else None + dec_attns = tf.convert_to_tensor(output.decoder_attentions) if self.config.output_attentions else None + cross_attns = tf.convert_to_tensor(output.cross_attentions) if self.config.output_attentions else None + enc_hs = tf.convert_to_tensor(output.encoder_hidden_states) if self.config.output_hidden_states else None + enc_attns = tf.convert_to_tensor(output.encoder_attentions) if self.config.output_attentions else None + + return TFSeq2SeqLMOutput( + logits=output.logits, + past_key_values=pkv, + decoder_hidden_states=dec_hs, + decoder_attentions=dec_attns, + cross_attentions=cross_attns, + encoder_last_hidden_state=output.encoder_last_hidden_state, + encoder_hidden_states=enc_hs, + encoder_attentions=enc_attns, + ) + + # Copied from transformers.models.bart.modeling_tf_bart.TFBartForConditionalGeneration.prepare_inputs_for_generation + def prepare_inputs_for_generation( + self, + decoder_input_ids, + past_key_values=None, + attention_mask=None, + decoder_attention_mask=None, + head_mask=None, + decoder_head_mask=None, + cross_attn_head_mask=None, + use_cache=None, + encoder_outputs=None, + **kwargs, + ): + # cut decoder_input_ids if past_key_values is used + if past_key_values is not None: + decoder_input_ids = decoder_input_ids[:, -1:] + + if decoder_attention_mask is not None: # xla + decoder_position_ids = tf.math.cumsum(decoder_attention_mask, axis=-1, exclusive=True)[:, -1:] + elif past_key_values is not None: # no xla + past_key_values + decoder_position_ids = past_key_values[0][0].shape[2] + else: # no xla + no past_key_values + decoder_position_ids = tf.range(decoder_input_ids.shape[1]) + + return { + "input_ids": None, # encoder_outputs is defined. input_ids not needed + "encoder_outputs": encoder_outputs, + "past_key_values": past_key_values, + "decoder_input_ids": decoder_input_ids, + "attention_mask": attention_mask, + "decoder_attention_mask": decoder_attention_mask, + "decoder_position_ids": decoder_position_ids, + "head_mask": head_mask, + "decoder_head_mask": decoder_head_mask, + "cross_attn_head_mask": cross_attn_head_mask, + "use_cache": use_cache, # change this to avoid caching (presumably for debugging) + } + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "model", None) is not None: + with tf.name_scope(self.model.name): + self.model.build(None) + if getattr(self, "bias_layer", None) is not None: + with tf.name_scope(self.bias_layer.name): + self.bias_layer.build(None) + + +__all__ = ["TFBlenderbotForConditionalGeneration", "TFBlenderbotModel", "TFBlenderbotPreTrainedModel"] diff --git a/docs/transformers/build/lib/transformers/models/blenderbot/tokenization_blenderbot.py b/docs/transformers/build/lib/transformers/models/blenderbot/tokenization_blenderbot.py new file mode 100644 index 0000000000000000000000000000000000000000..08b2a8c1283b6753a25ce0348d1393ca24f32553 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/blenderbot/tokenization_blenderbot.py @@ -0,0 +1,410 @@ +# coding=utf-8 +# Copyright 2021 The Facebook Inc. and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization class for Blenderbot.""" + +import json +import os +from functools import lru_cache +from typing import List, Optional, Tuple + +import regex as re + +from ...tokenization_utils import AddedToken, PreTrainedTokenizer +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +VOCAB_FILES_NAMES = { + "vocab_file": "vocab.json", + "merges_file": "merges.txt", + "tokenizer_config_file": "tokenizer_config.json", +} + + +@lru_cache() +# Copied from transformers.models.roberta.tokenization_roberta.bytes_to_unicode +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control + characters the bpe code barfs on. + + The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab + if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for + decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup + tables between utf-8 bytes and unicode strings. + """ + bs = ( + list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1)) + ) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + + +# Copied from transformers.models.roberta.tokenization_roberta.get_pairs +def get_pairs(word): + """ + Return set of symbol pairs in a word. + + Word is represented as tuple of symbols (symbols being variable-length strings). + """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + + +class BlenderbotTokenizer(PreTrainedTokenizer): + """ + Constructs a Blenderbot tokenizer, derived from the GPT-2 tokenizer, using byte-level Byte-Pair-Encoding. + + This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will + be encoded differently whether it is at the beginning of the sentence (without space) or not: + + ```python + >>> from transformers import BlenderbotTokenizer + + >>> tokenizer = BlenderbotTokenizer.from_pretrained("facebook/blenderbot-3B") + >>> tokenizer.add_prefix_space = False + >>> tokenizer("Hello world")["input_ids"] + [47, 921, 86, 1085, 2] + + >>> tokenizer(" Hello world")["input_ids"] + [6950, 1085, 2] + ``` + + You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you + call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance. + + + + When used with `is_split_into_words=True`, this tokenizer will add a space before each word (even the first one). + + + + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + Path to the vocabulary file. + merges_file (`str`): + Path to the merges file. + errors (`str`, *optional*, defaults to `"replace"`): + Paradigm to follow when decoding bytes to UTF-8. See + [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information. + bos_token (`str`, *optional*, defaults to `""`): + The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. + + + + When building a sequence using special tokens, this is not the token that is used for the beginning of + sequence. The token used is the `cls_token`. + + + + eos_token (`str`, *optional*, defaults to `""`): + The end of sequence token. + + + + When building a sequence using special tokens, this is not the token that is used for the end of sequence. + The token used is the `sep_token`. + + + + sep_token (`str`, *optional*, defaults to `""`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + cls_token (`str`, *optional*, defaults to `""`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + unk_token (`str`, *optional*, defaults to `""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + pad_token (`str`, *optional*, defaults to `""`): + The token used for padding, for example when batching sequences of different lengths. + mask_token (`str`, *optional*, defaults to `""`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + add_prefix_space (`bool`, *optional*, defaults to `False`): + Whether or not to add an initial space to the input. This allows to treat the leading word just as any + other word. (Blenderbot tokenizer detect beginning of words by the preceding space). + """ + + vocab_files_names = VOCAB_FILES_NAMES + model_input_names = ["input_ids", "attention_mask"] + + # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.__init__ with Roberta->Blenderbot, RoBERTa->Blenderbot + def __init__( + self, + vocab_file, + merges_file, + errors="replace", + bos_token="", + eos_token="", + sep_token="", + cls_token="", + unk_token="", + pad_token="", + mask_token="", + add_prefix_space=False, + **kwargs, + ): + bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token + pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token + eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token + unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token + sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token + cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token + + # Mask token behave like a normal word, i.e. include the space before it + mask_token = ( + AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False) + if isinstance(mask_token, str) + else mask_token + ) + + # these special tokens are not part of the vocab.json, let's add them in the correct order + + with open(vocab_file, encoding="utf-8") as vocab_handle: + self.encoder = json.load(vocab_handle) + self.decoder = {v: k for k, v in self.encoder.items()} + self.errors = errors # how to handle errors in decoding + self.byte_encoder = bytes_to_unicode() + self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} + with open(merges_file, encoding="utf-8") as merges_handle: + bpe_merges = merges_handle.read().split("\n")[1:-1] + bpe_merges = [tuple(merge.split()) for merge in bpe_merges] + self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges)))) + self.cache = {} + self.add_prefix_space = add_prefix_space + + # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions + self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""") + + super().__init__( + errors=errors, + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + cls_token=cls_token, + pad_token=pad_token, + mask_token=mask_token, + add_prefix_space=add_prefix_space, + **kwargs, + ) + + @property + # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.vocab_size with Roberta->Blenderbot, RoBERTa->Blenderbot + def vocab_size(self): + return len(self.encoder) + + # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.get_vocab with Roberta->Blenderbot, RoBERTa->Blenderbot + def get_vocab(self): + vocab = dict(self.encoder).copy() + vocab.update(self.added_tokens_encoder) + return vocab + + # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.bpe with Roberta->Blenderbot, RoBERTa->Blenderbot + def bpe(self, token): + if token in self.cache: + return self.cache[token] + word = tuple(token) + pairs = get_pairs(word) + + if not pairs: + return token + + while True: + bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + except ValueError: + new_word.extend(word[i:]) + break + else: + new_word.extend(word[i:j]) + i = j + + if word[i] == first and i < len(word) - 1 and word[i + 1] == second: + new_word.append(first + second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = " ".join(word) + self.cache[token] = word + return word + + # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer._tokenize with Roberta->Blenderbot, RoBERTa->Blenderbot + def _tokenize(self, text): + """Tokenize a string.""" + bpe_tokens = [] + for token in re.findall(self.pat, text): + token = "".join( + self.byte_encoder[b] for b in token.encode("utf-8") + ) # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case) + bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" ")) + return bpe_tokens + + # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer._convert_token_to_id with Roberta->Blenderbot, RoBERTa->Blenderbot + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + return self.encoder.get(token, self.encoder.get(self.unk_token)) + + # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer._convert_id_to_token with Roberta->Blenderbot, RoBERTa->Blenderbot + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.decoder.get(index) + + # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.convert_tokens_to_string with Roberta->Blenderbot, RoBERTa->Blenderbot + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + text = "".join(tokens) + text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors) + return text + + # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.save_vocabulary with Roberta->Blenderbot, RoBERTa->Blenderbot + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + merge_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"] + ) + + with open(vocab_file, "w", encoding="utf-8") as f: + f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n") + + index = 0 + with open(merge_file, "w", encoding="utf-8") as writer: + writer.write("#version: 0.2\n") + for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning( + f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive." + " Please check that the tokenizer is not corrupted!" + ) + index = token_index + writer.write(" ".join(bpe_tokens) + "\n") + index += 1 + + return vocab_file, merge_file + + # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.get_special_tokens_mask with Roberta->Blenderbot, RoBERTa->Blenderbot + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is None: + return [1] + ([0] * len(token_ids_0)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] + + # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.create_token_type_ids_from_sequences with Roberta->Blenderbot, RoBERTa->Blenderbot + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. Blenderbot does not + make use of token type ids, therefore a list of zeros is returned. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of zeros. + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] + + # Copied from transformers.models.roberta.tokenization_roberta.RobertaTokenizer.prepare_for_tokenization with Roberta->Blenderbot, RoBERTa->Blenderbot + def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs): + add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space) + if (is_split_into_words or add_prefix_space) and (len(text) > 0 and not text[0].isspace()): + text = " " + text + return (text, kwargs) + + def build_inputs_with_special_tokens(self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None): + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A Blenderbot sequence has the following format: + - single sequence: ` X ` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added + token_ids_1 (`List[int]`, *optional*): + Will be ignored + Returns: + `List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + return token_ids_0 + [self.eos_token_id] + + +__all__ = ["BlenderbotTokenizer"] diff --git a/docs/transformers/build/lib/transformers/models/blenderbot/tokenization_blenderbot_fast.py b/docs/transformers/build/lib/transformers/models/blenderbot/tokenization_blenderbot_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..8667fe76349dfa786180a59581e2d3eff8040b67 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/blenderbot/tokenization_blenderbot_fast.py @@ -0,0 +1,284 @@ +# coding=utf-8 +# Copyright 2021 The Facebook Inc. and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Fast Tokenization class for Blenderbot.""" + +import json +from typing import List, Optional, Tuple + +from tokenizers import processors + +from ...tokenization_utils_base import AddedToken, BatchEncoding +from ...tokenization_utils_fast import PreTrainedTokenizerFast +from ...utils import logging +from .tokenization_blenderbot import BlenderbotTokenizer + + +logger = logging.get_logger(__name__) + + +VOCAB_FILES_NAMES = { + "vocab_file": "vocab.json", + "merges_file": "merges.txt", + "tokenizer_config_file": "tokenizer_config.json", +} + + +class BlenderbotTokenizerFast(PreTrainedTokenizerFast): + """ + Construct a "fast" Blenderbot tokenizer (backed by HuggingFace's *tokenizers* library), derived from the GPT-2 + tokenizer, using byte-level Byte-Pair-Encoding. + + This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will + be encoded differently whether it is at the beginning of the sentence (without space) or not: + + ```python + >>> from transformers import BlenderbotTokenizerFast + + >>> tokenizer = BlenderbotTokenizerFast.from_pretrained("facebook/blenderbot-3B") + >>> tokenizer("Hello world")["input_ids"] + [6950, 1085, 2] + + >>> tokenizer(" Hello world")["input_ids"] + [6950, 1085, 2] + ``` + + You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you + call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance. + + + + When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with `add_prefix_space=True`. + + + + This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should + refer to this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + Path to the vocabulary file. + merges_file (`str`): + Path to the merges file. + errors (`str`, *optional*, defaults to `"replace"`): + Paradigm to follow when decoding bytes to UTF-8. See + [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information. + bos_token (`str`, *optional*, defaults to `""`): + The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. + + + + When building a sequence using special tokens, this is not the token that is used for the beginning of + sequence. The token used is the `cls_token`. + + + + eos_token (`str`, *optional*, defaults to `""`): + The end of sequence token. + + + + When building a sequence using special tokens, this is not the token that is used for the end of sequence. + The token used is the `sep_token`. + + + + sep_token (`str`, *optional*, defaults to `""`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + cls_token (`str`, *optional*, defaults to `""`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + unk_token (`str`, *optional*, defaults to `""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + pad_token (`str`, *optional*, defaults to `""`): + The token used for padding, for example when batching sequences of different lengths. + mask_token (`str`, *optional*, defaults to `""`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + add_prefix_space (`bool`, *optional*, defaults to `False`): + Whether or not to add an initial space to the input. This allows to treat the leading word just as any + other word. (Blenderbot tokenizer detect beginning of words by the preceding space). + trim_offsets (`bool`, *optional*, defaults to `True`): + Whether the post processing step should trim offsets to avoid including whitespaces. + """ + + vocab_files_names = VOCAB_FILES_NAMES + model_input_names = ["input_ids", "attention_mask"] + slow_tokenizer_class = BlenderbotTokenizer + + # Copied from transformers.models.roberta.tokenization_roberta_fast.RobertaTokenizerFast.__init__ with Roberta->Blenderbot, RoBERTa->Blenderbot + def __init__( + self, + vocab_file=None, + merges_file=None, + tokenizer_file=None, + errors="replace", + bos_token="", + eos_token="", + sep_token="", + cls_token="", + unk_token="", + pad_token="", + mask_token="", + add_prefix_space=False, + trim_offsets=True, + **kwargs, + ): + mask_token = ( + AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False) + if isinstance(mask_token, str) + else mask_token + ) + super().__init__( + vocab_file, + merges_file, + tokenizer_file=tokenizer_file, + errors=errors, + bos_token=bos_token, + eos_token=eos_token, + sep_token=sep_token, + cls_token=cls_token, + unk_token=unk_token, + pad_token=pad_token, + mask_token=mask_token, + add_prefix_space=add_prefix_space, + trim_offsets=trim_offsets, + **kwargs, + ) + + tokenizer_component = "post_processor" + tokenizer_component_instance = getattr(self.backend_tokenizer, tokenizer_component, None) + if tokenizer_component_instance: + state = json.loads(tokenizer_component_instance.__getstate__()) + + # The lists 'sep' and 'cls' must be cased in tuples for the object `post_processor_class` + if "sep" in state: + state["sep"] = tuple(state["sep"]) + if "cls" in state: + state["cls"] = tuple(state["cls"]) + + changes_to_apply = False + + if state.get("add_prefix_space", add_prefix_space) != add_prefix_space: + state["add_prefix_space"] = add_prefix_space + changes_to_apply = True + + if state.get("trim_offsets", trim_offsets) != trim_offsets: + state["trim_offsets"] = trim_offsets + changes_to_apply = True + + if changes_to_apply: + component_class = getattr(processors, state.pop("type")) + new_value = component_class(**state) + setattr(self.backend_tokenizer, tokenizer_component, new_value) + + @property + # Copied from transformers.models.roberta.tokenization_roberta_fast.RobertaTokenizerFast.mask_token with Roberta->Blenderbot, RoBERTa->Blenderbot + def mask_token(self) -> str: + """ + `str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while not + having been set. + + Blenderbot tokenizer has a special mask token to be usable in the fill-mask pipeline. The mask token will greedily + comprise the space before the **. + """ + if self._mask_token is None: + if self.verbose: + logger.error("Using mask_token, but it is not set yet.") + return None + return str(self._mask_token) + + @mask_token.setter + def mask_token(self, value): + """ + Overriding the default behavior of the mask token to have it eat the space before it. + + This is needed to preserve backward compatibility with all the previously used models based on Roberta. + """ + # Mask token behave like a normal word, i.e. include the space before it + # So we set lstrip to True + value = AddedToken(value, lstrip=True, rstrip=False) if isinstance(value, str) else value + self._mask_token = value + + # Copied from transformers.models.roberta.tokenization_roberta_fast.RobertaTokenizerFast._batch_encode_plus with Roberta->Blenderbot, RoBERTa->Blenderbot + def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding: + is_split_into_words = kwargs.get("is_split_into_words", False) + assert self.add_prefix_space or not is_split_into_words, ( + f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True " + "to use it with pretokenized inputs." + ) + + return super()._batch_encode_plus(*args, **kwargs) + + # Copied from transformers.models.roberta.tokenization_roberta_fast.RobertaTokenizerFast._encode_plus with Roberta->Blenderbot, RoBERTa->Blenderbot + def _encode_plus(self, *args, **kwargs) -> BatchEncoding: + is_split_into_words = kwargs.get("is_split_into_words", False) + + assert self.add_prefix_space or not is_split_into_words, ( + f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True " + "to use it with pretokenized inputs." + ) + + return super()._encode_plus(*args, **kwargs) + + # Copied from transformers.models.roberta.tokenization_roberta_fast.RobertaTokenizerFast.save_vocabulary with Roberta->Blenderbot, RoBERTa->Blenderbot + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + files = self._tokenizer.model.save(save_directory, name=filename_prefix) + return tuple(files) + + # Copied from transformers.models.roberta.tokenization_roberta_fast.RobertaTokenizerFast.create_token_type_ids_from_sequences with Roberta->Blenderbot, RoBERTa->Blenderbot + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. Blenderbot does not + make use of token type ids, therefore a list of zeros is returned. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of zeros. + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] + + def build_inputs_with_special_tokens(self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None): + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A Blenderbot sequence has the following format: + - single sequence: ` X ` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added + token_ids_1 (`List[int]`, *optional*): + Will be ignored + Returns: + `List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + return token_ids_0 + [self.eos_token_id] + + +__all__ = ["BlenderbotTokenizerFast"] diff --git a/docs/transformers/build/lib/transformers/models/blenderbot_small/__init__.py b/docs/transformers/build/lib/transformers/models/blenderbot_small/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..075d0070e4c4e2b95cce8a3bcaa2818021daee99 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/blenderbot_small/__init__.py @@ -0,0 +1,31 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_blenderbot_small import * + from .modeling_blenderbot_small import * + from .modeling_flax_blenderbot_small import * + from .modeling_tf_blenderbot_small import * + from .tokenization_blenderbot_small import * + from .tokenization_blenderbot_small_fast import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/docs/transformers/build/lib/transformers/models/blenderbot_small/configuration_blenderbot_small.py b/docs/transformers/build/lib/transformers/models/blenderbot_small/configuration_blenderbot_small.py new file mode 100644 index 0000000000000000000000000000000000000000..5865486370e5b972c07b1b4749ca2dd7778158d9 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/blenderbot_small/configuration_blenderbot_small.py @@ -0,0 +1,390 @@ +# coding=utf-8 +# Copyright 2021 The Facebook, Inc. and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""BlenderbotSmall model configuration""" + +from collections import OrderedDict +from typing import Any, Mapping, Optional + +from ... import PreTrainedTokenizer +from ...configuration_utils import PretrainedConfig +from ...file_utils import TensorType, is_torch_available +from ...onnx import OnnxConfig, OnnxConfigWithPast, OnnxSeq2SeqConfigWithPast +from ...onnx.utils import compute_effective_axis_dimension +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class BlenderbotSmallConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`BlenderbotSmallModel`]. It is used to instantiate + an BlenderbotSmall model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the BlenderbotSmall + [facebook/blenderbot_small-90M](https://huggingface.co/facebook/blenderbot_small-90M) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 50265): + Vocabulary size of the BlenderbotSmall model. Defines the number of different tokens that can be + represented by the `inputs_ids` passed when calling [`BlenderbotSmallModel`] or [`TFBlenderbotSmallModel`]. + d_model (`int`, *optional*, defaults to 512): + Dimensionality of the layers and the pooler layer. + encoder_layers (`int`, *optional*, defaults to 8): + Number of encoder layers. + decoder_layers (`int`, *optional*, defaults to 8): + Number of decoder layers. + encoder_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer encoder. + decoder_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer decoder. + decoder_ffn_dim (`int`, *optional*, defaults to 2048): + Dimensionality of the "intermediate" (often named feed-forward) layer in decoder. + encoder_ffn_dim (`int`, *optional*, defaults to 2048): + Dimensionality of the "intermediate" (often named feed-forward) layer in decoder. + activation_function (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + dropout (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + activation_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for activations inside the fully connected layer. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + init_std (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + encoder_layerdrop (`float`, *optional*, defaults to 0.0): + The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556) + for more details. + decoder_layerdrop (`float`, *optional*, defaults to 0.0): + The LayerDrop probability for the decoder. See the [LayerDrop paper](see https://arxiv.org/abs/1909.11556) + for more details. + scale_embedding (`bool`, *optional*, defaults to `False`): + Scale embeddings by diving by sqrt(d_model). + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models) + forced_eos_token_id (`int`, *optional*, defaults to 2): + The id of the token to force as the last generated token when `max_length` is reached. Usually set to + `eos_token_id`. + + Example: + + ```python + >>> from transformers import BlenderbotSmallConfig, BlenderbotSmallModel + + >>> # Initializing a BlenderbotSmall facebook/blenderbot_small-90M style configuration + >>> configuration = BlenderbotSmallConfig() + + >>> # Initializing a model (with random weights) from the facebook/blenderbot_small-90M style configuration + >>> model = BlenderbotSmallModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "blenderbot-small" + keys_to_ignore_at_inference = ["past_key_values"] + attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"} + + def __init__( + self, + vocab_size=50265, + max_position_embeddings=512, + encoder_layers=8, + encoder_ffn_dim=2048, + encoder_attention_heads=16, + decoder_layers=8, + decoder_ffn_dim=2048, + decoder_attention_heads=16, + encoder_layerdrop=0.0, + decoder_layerdrop=0.0, + use_cache=True, + is_encoder_decoder=True, + activation_function="gelu", + d_model=512, + dropout=0.1, + attention_dropout=0.0, + activation_dropout=0.0, + init_std=0.02, + decoder_start_token_id=1, + scale_embedding=False, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + forced_eos_token_id=2, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.d_model = d_model + self.encoder_ffn_dim = encoder_ffn_dim + self.encoder_layers = encoder_layers + self.encoder_attention_heads = encoder_attention_heads + self.decoder_ffn_dim = decoder_ffn_dim + self.decoder_layers = decoder_layers + self.decoder_attention_heads = decoder_attention_heads + self.dropout = dropout + self.attention_dropout = attention_dropout + self.activation_dropout = activation_dropout + self.activation_function = activation_function + self.init_std = init_std + self.encoder_layerdrop = encoder_layerdrop + self.decoder_layerdrop = decoder_layerdrop + self.use_cache = use_cache + self.num_hidden_layers = encoder_layers + self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + is_encoder_decoder=is_encoder_decoder, + decoder_start_token_id=decoder_start_token_id, + forced_eos_token_id=forced_eos_token_id, + **kwargs, + ) + + +# Copied from transformers.models.bart.configuration_bart.BartOnnxConfig +class BlenderbotSmallOnnxConfig(OnnxSeq2SeqConfigWithPast): + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + if self.task in ["default", "seq2seq-lm"]: + common_inputs = OrderedDict( + [ + ("input_ids", {0: "batch", 1: "encoder_sequence"}), + ("attention_mask", {0: "batch", 1: "encoder_sequence"}), + ] + ) + + if self.use_past: + common_inputs["decoder_input_ids"] = {0: "batch"} + common_inputs["decoder_attention_mask"] = {0: "batch", 1: "past_decoder_sequence + sequence"} + else: + common_inputs["decoder_input_ids"] = {0: "batch", 1: "decoder_sequence"} + common_inputs["decoder_attention_mask"] = {0: "batch", 1: "decoder_sequence"} + + if self.use_past: + self.fill_with_past_key_values_(common_inputs, direction="inputs") + elif self.task == "causal-lm": + # TODO: figure this case out. + common_inputs = OrderedDict( + [ + ("input_ids", {0: "batch", 1: "encoder_sequence"}), + ("attention_mask", {0: "batch", 1: "encoder_sequence"}), + ] + ) + if self.use_past: + num_encoder_layers, _ = self.num_layers + for i in range(num_encoder_layers): + common_inputs[f"past_key_values.{i}.key"] = {0: "batch", 2: "past_sequence + sequence"} + common_inputs[f"past_key_values.{i}.value"] = {0: "batch", 2: "past_sequence + sequence"} + else: + common_inputs = OrderedDict( + [ + ("input_ids", {0: "batch", 1: "encoder_sequence"}), + ("attention_mask", {0: "batch", 1: "encoder_sequence"}), + ("decoder_input_ids", {0: "batch", 1: "decoder_sequence"}), + ("decoder_attention_mask", {0: "batch", 1: "decoder_sequence"}), + ] + ) + + return common_inputs + + @property + def outputs(self) -> Mapping[str, Mapping[int, str]]: + if self.task in ["default", "seq2seq-lm"]: + common_outputs = super().outputs + else: + common_outputs = super(OnnxConfigWithPast, self).outputs + if self.use_past: + num_encoder_layers, _ = self.num_layers + for i in range(num_encoder_layers): + common_outputs[f"present.{i}.key"] = {0: "batch", 2: "past_sequence + sequence"} + common_outputs[f"present.{i}.value"] = {0: "batch", 2: "past_sequence + sequence"} + return common_outputs + + def _generate_dummy_inputs_for_default_and_seq2seq_lm( + self, + tokenizer: PreTrainedTokenizer, + batch_size: int = -1, + seq_length: int = -1, + is_pair: bool = False, + framework: Optional[TensorType] = None, + ) -> Mapping[str, Any]: + encoder_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering( + tokenizer, batch_size, seq_length, is_pair, framework + ) + + # Generate decoder inputs + decoder_seq_length = seq_length if not self.use_past else 1 + decoder_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering( + tokenizer, batch_size, decoder_seq_length, is_pair, framework + ) + decoder_inputs = {f"decoder_{name}": tensor for name, tensor in decoder_inputs.items()} + common_inputs = dict(**encoder_inputs, **decoder_inputs) + + if self.use_past: + if not is_torch_available(): + raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.") + else: + import torch + batch, encoder_seq_length = common_inputs["input_ids"].shape + decoder_seq_length = common_inputs["decoder_input_ids"].shape[1] + num_encoder_attention_heads, num_decoder_attention_heads = self.num_attention_heads + encoder_shape = ( + batch, + num_encoder_attention_heads, + encoder_seq_length, + self._config.hidden_size // num_encoder_attention_heads, + ) + decoder_past_length = decoder_seq_length + 3 + decoder_shape = ( + batch, + num_decoder_attention_heads, + decoder_past_length, + self._config.hidden_size // num_decoder_attention_heads, + ) + + common_inputs["decoder_attention_mask"] = torch.cat( + [common_inputs["decoder_attention_mask"], torch.ones(batch, decoder_past_length)], dim=1 + ) + + common_inputs["past_key_values"] = [] + # If the number of encoder and decoder layers are present in the model configuration, both are considered + num_encoder_layers, num_decoder_layers = self.num_layers + min_num_layers = min(num_encoder_layers, num_decoder_layers) + max_num_layers = max(num_encoder_layers, num_decoder_layers) - min_num_layers + remaining_side_name = "encoder" if num_encoder_layers > num_decoder_layers else "decoder" + + for _ in range(min_num_layers): + common_inputs["past_key_values"].append( + ( + torch.zeros(decoder_shape), + torch.zeros(decoder_shape), + torch.zeros(encoder_shape), + torch.zeros(encoder_shape), + ) + ) + # TODO: test this. + shape = encoder_shape if remaining_side_name == "encoder" else decoder_shape + for _ in range(min_num_layers, max_num_layers): + common_inputs["past_key_values"].append((torch.zeros(shape), torch.zeros(shape))) + return common_inputs + + def _generate_dummy_inputs_for_causal_lm( + self, + tokenizer: PreTrainedTokenizer, + batch_size: int = -1, + seq_length: int = -1, + is_pair: bool = False, + framework: Optional[TensorType] = None, + ) -> Mapping[str, Any]: + common_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering( + tokenizer, batch_size, seq_length, is_pair, framework + ) + + if self.use_past: + if not is_torch_available(): + raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.") + else: + import torch + batch, seqlen = common_inputs["input_ids"].shape + # Not using the same length for past_key_values + past_key_values_length = seqlen + 2 + num_encoder_layers, _ = self.num_layers + num_encoder_attention_heads, _ = self.num_attention_heads + past_shape = ( + batch, + num_encoder_attention_heads, + past_key_values_length, + self._config.hidden_size // num_encoder_attention_heads, + ) + + mask_dtype = common_inputs["attention_mask"].dtype + common_inputs["attention_mask"] = torch.cat( + [common_inputs["attention_mask"], torch.ones(batch, past_key_values_length, dtype=mask_dtype)], dim=1 + ) + common_inputs["past_key_values"] = [ + (torch.zeros(past_shape), torch.zeros(past_shape)) for _ in range(num_encoder_layers) + ] + return common_inputs + + def _generate_dummy_inputs_for_sequence_classification_and_question_answering( + self, + tokenizer: PreTrainedTokenizer, + batch_size: int = -1, + seq_length: int = -1, + is_pair: bool = False, + framework: Optional[TensorType] = None, + ) -> Mapping[str, Any]: + # Copied from OnnxConfig.generate_dummy_inputs + # Did not use super(OnnxConfigWithPast, self).generate_dummy_inputs for code clarity. + # If dynamic axis (-1) we forward with a fixed dimension of 2 samples to avoid optimizations made by ONNX + batch_size = compute_effective_axis_dimension( + batch_size, fixed_dimension=OnnxConfig.default_fixed_batch, num_token_to_add=0 + ) + + # If dynamic axis (-1) we forward with a fixed dimension of 8 tokens to avoid optimizations made by ONNX + token_to_add = tokenizer.num_special_tokens_to_add(is_pair) + seq_length = compute_effective_axis_dimension( + seq_length, fixed_dimension=OnnxConfig.default_fixed_sequence, num_token_to_add=token_to_add + ) + + # Generate dummy inputs according to compute batch and sequence + dummy_input = [" ".join([tokenizer.unk_token]) * seq_length] * batch_size + common_inputs = dict(tokenizer(dummy_input, return_tensors=framework)) + return common_inputs + + def generate_dummy_inputs( + self, + tokenizer: PreTrainedTokenizer, + batch_size: int = -1, + seq_length: int = -1, + is_pair: bool = False, + framework: Optional[TensorType] = None, + ) -> Mapping[str, Any]: + if self.task in ["default", "seq2seq-lm"]: + common_inputs = self._generate_dummy_inputs_for_default_and_seq2seq_lm( + tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework + ) + + elif self.task == "causal-lm": + common_inputs = self._generate_dummy_inputs_for_causal_lm( + tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework + ) + else: + common_inputs = self._generate_dummy_inputs_for_sequence_classification_and_question_answering( + tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework + ) + + return common_inputs + + def _flatten_past_key_values_(self, flattened_output, name, idx, t): + if self.task in ["default", "seq2seq-lm"]: + flattened_output = super()._flatten_past_key_values_(flattened_output, name, idx, t) + else: + flattened_output = super(OnnxSeq2SeqConfigWithPast, self)._flatten_past_key_values_( + flattened_output, name, idx, t + ) + + +__all__ = ["BlenderbotSmallConfig", "BlenderbotSmallOnnxConfig"] diff --git a/docs/transformers/build/lib/transformers/models/blenderbot_small/modeling_blenderbot_small.py b/docs/transformers/build/lib/transformers/models/blenderbot_small/modeling_blenderbot_small.py new file mode 100644 index 0000000000000000000000000000000000000000..7bb4a49bb8e384f26133b52a50e8ea9633940762 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/blenderbot_small/modeling_blenderbot_small.py @@ -0,0 +1,1511 @@ +# coding=utf-8 +# Copyright 2021 The Facebook, Inc. and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch BlenderbotSmall model.""" + +import copy +import math +from typing import List, Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import CrossEntropyLoss + +from ...activations import ACT2FN +from ...generation import GenerationMixin +from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask +from ...modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPastAndCrossAttentions, + CausalLMOutputWithCrossAttentions, + Seq2SeqLMOutput, + Seq2SeqModelOutput, +) +from ...modeling_utils import PreTrainedModel +from ...utils import ( + add_end_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from .configuration_blenderbot_small import BlenderbotSmallConfig + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "BlenderbotSmallConfig" + + +# Copied from transformers.models.bart.modeling_bart.shift_tokens_right +def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int): + """ + Shift input ids one token to the right. + """ + shifted_input_ids = input_ids.new_zeros(input_ids.shape) + shifted_input_ids[:, 1:] = input_ids[:, :-1].clone() + shifted_input_ids[:, 0] = decoder_start_token_id + + if pad_token_id is None: + raise ValueError("self.model.config.pad_token_id has to be defined.") + # replace possible -100 values in labels by `pad_token_id` + shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id) + + return shifted_input_ids + + +# Copied from transformers.models.blenderbot.modeling_blenderbot.BlenderbotLearnedPositionalEmbedding with Blenderbot->BlenderbotSmall +class BlenderbotSmallLearnedPositionalEmbedding(nn.Embedding): + """ + This module learns positional embeddings up to a fixed maximum size. + """ + + def __init__(self, num_embeddings: int, embedding_dim: int): + super().__init__(num_embeddings, embedding_dim) + + def forward(self, input_ids_shape: torch.Size, past_key_values_length: int = 0): + """`input_ids_shape` is expected to be [bsz x seqlen].""" + bsz, seq_len = input_ids_shape[:2] + positions = torch.arange( + past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device + ) + return super().forward(positions) + + +# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->BlenderbotSmall +class BlenderbotSmallAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + is_decoder: bool = False, + bias: bool = True, + is_causal: bool = False, + config: Optional[BlenderbotSmallConfig] = None, + ): + super().__init__() + self.embed_dim = embed_dim + self.num_heads = num_heads + self.dropout = dropout + self.head_dim = embed_dim // num_heads + self.config = config + + if (self.head_dim * num_heads) != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" + f" and `num_heads`: {num_heads})." + ) + self.scaling = self.head_dim**-0.5 + self.is_decoder = is_decoder + self.is_causal = is_causal + + self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + key_value_states: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + output_attentions: bool = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + + bsz, tgt_len, _ = hidden_states.size() + + # get query proj + query_states = self.q_proj(hidden_states) * self.scaling + # get key, value proj + # `past_key_value[0].shape[2] == key_value_states.shape[1]` + # is checking that the `sequence_length` of the `past_key_value` is the same as + # the provided `key_value_states` to support prefix tuning + if ( + is_cross_attention + and past_key_value is not None + and past_key_value[0].shape[2] == key_value_states.shape[1] + ): + # reuse k,v, cross_attentions + key_states = past_key_value[0] + value_states = past_key_value[1] + elif is_cross_attention: + # cross_attentions + key_states = self._shape(self.k_proj(key_value_states), -1, bsz) + value_states = self._shape(self.v_proj(key_value_states), -1, bsz) + elif past_key_value is not None: + # reuse k, v, self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + else: + # self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_states, value_states) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) + key_states = key_states.reshape(*proj_shape) + value_states = value_states.reshape(*proj_shape) + + src_len = key_states.size(1) + attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) + + if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): + raise ValueError( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (bsz, 1, tgt_len, src_len): + raise ValueError( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + attn_weights = nn.functional.softmax(attn_weights, dim=-1) + + if layer_head_mask is not None: + if layer_head_mask.size() != (self.num_heads,): + raise ValueError( + f"Head mask for a single layer should be of size {(self.num_heads,)}, but is" + f" {layer_head_mask.size()}" + ) + attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + if output_attentions: + # this operation is a bit awkward, but it's required to + # make sure that attn_weights keeps its gradient. + # In order to do so, attn_weights have to be reshaped + # twice and have to be reused in the following + attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) + else: + attn_weights_reshaped = None + + attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + + attn_output = torch.bmm(attn_probs, value_states) + + if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) + attn_output = attn_output.transpose(1, 2) + + # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be + # partitioned across GPUs when using tensor-parallelism. + attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights_reshaped, past_key_value + + +# Copied from transformers.models.bart.modeling_bart.BartEncoderLayer with Bart->BlenderbotSmall, BART->BLENDERBOT_SMALL +class BlenderbotSmallEncoderLayer(nn.Module): + def __init__(self, config: BlenderbotSmallConfig): + super().__init__() + self.embed_dim = config.d_model + + self.self_attn = BLENDERBOT_SMALL_ATTENTION_CLASSES[config._attn_implementation]( + embed_dim=self.embed_dim, + num_heads=config.encoder_attention_heads, + dropout=config.attention_dropout, + config=config, + ) + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim) + self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.FloatTensor, + attention_mask: torch.FloatTensor, + layer_head_mask: torch.FloatTensor, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size + `(encoder_attention_heads,)`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + hidden_states, attn_weights, _ = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + layer_head_mask=layer_head_mask, + output_attentions=output_attentions, + ) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + if hidden_states.dtype == torch.float16 and ( + torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any() + ): + clamp_value = torch.finfo(hidden_states.dtype).max - 1000 + hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +# TODO: Implement attention with SDPA for TimeSeriesTransformer. +BLENDERBOT_SMALL_ATTENTION_CLASSES = { + "eager": BlenderbotSmallAttention, +} + + +# Copied from transformers.models.bart.modeling_bart.BartDecoderLayer with Bart->BlenderbotSmall, BART->BLENDERBOT_SMALL +class BlenderbotSmallDecoderLayer(nn.Module): + def __init__(self, config: BlenderbotSmallConfig): + super().__init__() + self.embed_dim = config.d_model + + self.self_attn = BLENDERBOT_SMALL_ATTENTION_CLASSES[config._attn_implementation]( + embed_dim=self.embed_dim, + num_heads=config.decoder_attention_heads, + dropout=config.attention_dropout, + is_decoder=True, + is_causal=True, + config=config, + ) + self.dropout = config.dropout + self.activation_fn = ACT2FN[config.activation_function] + self.activation_dropout = config.activation_dropout + + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.encoder_attn = BLENDERBOT_SMALL_ATTENTION_CLASSES[config._attn_implementation]( + self.embed_dim, + config.decoder_attention_heads, + dropout=config.attention_dropout, + is_decoder=True, + config=config, + ) + self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) + self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim) + self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim) + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + layer_head_mask: Optional[torch.Tensor] = None, + cross_attn_layer_head_mask: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = True, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + encoder_hidden_states (`torch.FloatTensor`): + cross attention input to the layer of shape `(batch, seq_len, embed_dim)` + encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size + `(encoder_attention_heads,)`. + cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of + size `(decoder_attention_heads,)`. + past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + # Self Attention + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + # add present self-attn cache to positions 1,2 of present_key_value tuple + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + past_key_value=self_attn_past_key_value, + attention_mask=attention_mask, + layer_head_mask=layer_head_mask, + output_attentions=output_attentions, + ) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + # Cross-Attention Block + cross_attn_present_key_value = None + cross_attn_weights = None + if encoder_hidden_states is not None: + residual = hidden_states + + # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn( + hidden_states=hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + layer_head_mask=cross_attn_layer_head_mask, + past_key_value=cross_attn_past_key_value, + output_attentions=output_attentions, + ) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + # add cross-attn to positions 3,4 of present_key_value tuple + present_key_value = present_key_value + cross_attn_present_key_value + + # Fully Connected + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training) + hidden_states = self.fc2(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights, cross_attn_weights) + + if use_cache: + outputs += (present_key_value,) + + return outputs + + +class BlenderbotSmallPreTrainedModel(PreTrainedModel): + config_class = BlenderbotSmallConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + + def _init_weights(self, module): + std = self.config.init_std + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + @property + def dummy_inputs(self): + pad_token = self.config.pad_token_id + input_ids = torch.tensor([[0, 6, 10, 4, 2], [0, 8, 12, 2, pad_token]], device=self.device) + dummy_inputs = { + "attention_mask": input_ids.ne(pad_token), + "input_ids": input_ids, + "decoder_input_ids": input_ids, + } + return dummy_inputs + + +BLENDERBOT_SMALL_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`BlenderbotSmallConfig`]): + Model configuration class with all the parameters of the model. Initializing with a config file does not + load the weights associated with the model, only the configuration. Check out the + [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +BLENDERBOT_SMALL_GENERATION_EXAMPLE = r""" + Conversation example: + + ```python + >>> from transformers import AutoTokenizer, BlenderbotSmallForConditionalGeneration + + >>> mname = "facebook/blenderbot_small-90M" + >>> model = BlenderbotSmallForConditionalGeneration.from_pretrained(mname) + >>> tokenizer = AutoTokenizer.from_pretrained(mname) + >>> UTTERANCE = "My friends are cool but they eat too many carbs." + >>> print("Human: ", UTTERANCE) + Human: My friends are cool but they eat too many carbs. + + >>> inputs = tokenizer([UTTERANCE], return_tensors="pt") + >>> reply_ids = model.generate(**inputs) + >>> print("Bot: ", tokenizer.batch_decode(reply_ids, skip_special_tokens=True)[0]) + Bot: what kind of carbs do they eat? i don't know much about carbs. + + >>> REPLY = "I'm not sure" + >>> print("Human: ", REPLY) + Human: I'm not sure + + >>> NEXT_UTTERANCE = ( + ... "My friends are cool but they eat too many carbs.__end__ __start__what kind of carbs do they eat? " + ... "i don't know much about carbs__end__ " + ... "__start__ I'm not sure." + ... ) + >>> inputs = tokenizer([NEXT_UTTERANCE], return_tensors="pt") + >>> next_reply_ids = model.generate(**inputs) + >>> print("Bot: ", tokenizer.batch_decode(next_reply_ids, skip_special_tokens=True)[0]) + Bot: they eat a lot of carbs. carbs are high in fat, protein, and fats. + ``` +""" + +BLENDERBOT_SMALL_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Indices of decoder input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are decoder input IDs?](../glossary#decoder-input-ids) + + BlenderbotSmall uses the `bos_token_id` as the starting token for `decoder_input_ids` generation. If + `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also + be used by default. + head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + decoder_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0, + 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*): + Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`) + `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of + hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape + `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded + representation. If `past_key_values` is used, optionally only the last `decoder_inputs_embeds` have to be + input (see `past_key_values`). This is useful if you want more control over how to convert + `decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix. + + If `decoder_input_ids` and `decoder_inputs_embeds` are both unset, `decoder_inputs_embeds` takes the value + of `inputs_embeds`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +class BlenderbotSmallEncoder(BlenderbotSmallPreTrainedModel): + """ + Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a + [`BlenderbotSmallEncoderLayer`]. + + Args: + config: BlenderbotSmallConfig + embed_tokens (nn.Embedding): output embedding + """ + + def __init__(self, config: BlenderbotSmallConfig, embed_tokens: Optional[nn.Embedding] = None): + super().__init__(config) + + self.dropout = config.dropout + self.layerdrop = config.encoder_layerdrop + + embed_dim = config.d_model + self.padding_idx = config.pad_token_id + self.max_source_positions = config.max_position_embeddings + self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0 + + if embed_tokens is not None: + self.embed_tokens = embed_tokens + else: + self.embed_tokens = nn.Embedding(config.vocab_size, embed_dim, self.padding_idx) + + self.embed_positions = BlenderbotSmallLearnedPositionalEmbedding( + config.max_position_embeddings, + embed_dim, + ) + self.layers = nn.ModuleList([BlenderbotSmallEncoderLayer(config) for _ in range(config.encoder_layers)]) + self.layernorm_embedding = nn.LayerNorm(embed_dim) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def forward( + self, + input_ids=None, + attention_mask=None, + head_mask=None, + inputs_embeds=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale + + embed_pos = self.embed_positions(input_shape) + + hidden_states = inputs_embeds + embed_pos + hidden_states = self.layernorm_embedding(hidden_states) + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + # expand attention_mask + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype) + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + # check if head_mask has a correct number of layers specified if desired + if head_mask is not None: + if head_mask.size()[0] != len(self.layers): + raise ValueError( + f"The head_mask should be specified for {len(self.layers)} layers, but it is for" + f" {head_mask.size()[0]}." + ) + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + to_drop = False + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: # skip the layer + to_drop = True + + if to_drop: + layer_outputs = (None, None) + else: + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + encoder_layer.__call__, + hidden_states, + attention_mask, + (head_mask[idx] if head_mask is not None else None), + output_attentions, + ) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + layer_head_mask=(head_mask[idx] if head_mask is not None else None), + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +class BlenderbotSmallDecoder(BlenderbotSmallPreTrainedModel): + """ + Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`BlenderbotSmallDecoderLayer`] + + Args: + config: BlenderbotSmallConfig + embed_tokens (nn.Embedding): output embedding + """ + + def __init__(self, config: BlenderbotSmallConfig, embed_tokens: Optional[nn.Embedding] = None): + super().__init__(config) + self.dropout = config.dropout + self.layerdrop = config.decoder_layerdrop + self.padding_idx = config.pad_token_id + self.max_target_positions = config.max_position_embeddings + self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 + + if embed_tokens is not None: + self.embed_tokens = embed_tokens + else: + self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx) + + self.embed_positions = BlenderbotSmallLearnedPositionalEmbedding( + config.max_position_embeddings, + config.d_model, + ) + self.layers = nn.ModuleList([BlenderbotSmallDecoderLayer(config) for _ in range(config.decoder_layers)]) + self.layernorm_embedding = nn.LayerNorm(config.d_model) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + def forward( + self, + input_ids=None, + attention_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + head_mask=None, + cross_attn_head_mask=None, + past_key_values=None, + inputs_embeds=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + of the decoder. + encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*): + Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values + selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules in the decoder to avoid performing + cross-attention on hidden heads. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of + shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the + cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those + that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of + all `decoder_input_ids` of shape `(batch_size, sequence_length)`. + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale + + attention_mask = _prepare_4d_causal_attention_mask( + attention_mask, input_shape, inputs_embeds, past_key_values_length + ) + + # expand encoder attention mask + if encoder_hidden_states is not None and encoder_attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + encoder_attention_mask = _prepare_4d_attention_mask( + encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1] + ) + + # embed positions + positions = self.embed_positions(input_shape, past_key_values_length) + + # BlenderbotSmall applies layer norm on hidden_states + inputs_embeds = self.layernorm_embedding(inputs_embeds) + hidden_states = inputs_embeds + positions + + hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training) + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None + next_decoder_cache = () if use_cache else None + + # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired + for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]): + if attn_mask is not None: + if attn_mask.size()[0] != len(self.layers): + raise ValueError( + f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for" + f" {head_mask.size()[0]}." + ) + for idx, decoder_layer in enumerate(self.layers): + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + if output_hidden_states: + all_hidden_states += (hidden_states,) + if self.training: + dropout_probability = torch.rand([]) + if dropout_probability < self.layerdrop: + continue + + past_key_value = past_key_values[idx] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + decoder_layer.__call__, + hidden_states, + attention_mask, + encoder_hidden_states, + encoder_attention_mask, + head_mask[idx] if head_mask is not None else None, + cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None, + None, + output_attentions, + use_cache, + ) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + layer_head_mask=(head_mask[idx] if head_mask is not None else None), + cross_attn_layer_head_mask=( + cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None + ), + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache += (layer_outputs[3 if output_attentions else 1],) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + if encoder_hidden_states is not None: + all_cross_attentions += (layer_outputs[2],) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if not return_dict: + return tuple( + v + for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attentions, + ) + + +@add_start_docstrings( + "The bare BlenderbotSmall Model outputting raw hidden-states without any specific head on top.", + BLENDERBOT_SMALL_START_DOCSTRING, +) +class BlenderbotSmallModel(BlenderbotSmallPreTrainedModel): + _tied_weights_keys = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight"] + + def __init__(self, config: BlenderbotSmallConfig): + super().__init__(config) + + padding_idx, vocab_size = config.pad_token_id, config.vocab_size + self.shared = nn.Embedding(vocab_size, config.d_model, padding_idx) + + self.encoder = BlenderbotSmallEncoder(config, self.shared) + self.decoder = BlenderbotSmallDecoder(config, self.shared) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.shared + + def set_input_embeddings(self, value): + self.shared = value + self.encoder.embed_tokens = self.shared + self.decoder.embed_tokens = self.shared + + def get_encoder(self): + return self.encoder + + def get_decoder(self): + return self.decoder + + @add_start_docstrings_to_model_forward(BLENDERBOT_SMALL_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=Seq2SeqModelOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.Tensor] = None, + decoder_head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[Union[Tuple, BaseModelOutput]] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.Tensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.FloatTensor], Seq2SeqModelOutput]: + r""" + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, BlenderbotSmallModel + + >>> model = BlenderbotSmallModel.from_pretrained("facebook/blenderbot_small-90M") + >>> tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot_small-90M") + + >>> inputs = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt") + >>> decoder_inputs = tokenizer("Studies show that", return_tensors="pt") # Batch size 1 + >>> outputs = model(input_ids=inputs.input_ids, decoder_input_ids=decoder_inputs.input_ids) + + >>> last_hidden_states = outputs.last_hidden_state + >>> list(last_hidden_states.shape) + [1, 3, 512] + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if encoder_outputs is None: + encoder_outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, BaseModelOutput): + encoder_outputs = BaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + + # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn) + decoder_outputs = self.decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + encoder_hidden_states=encoder_outputs[0], + encoder_attention_mask=attention_mask, + head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if not return_dict: + return decoder_outputs + encoder_outputs + + return Seq2SeqModelOutput( + last_hidden_state=decoder_outputs.last_hidden_state, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + +@add_start_docstrings( + "The BlenderbotSmall Model with a language modeling head. Can be used for summarization.", + BLENDERBOT_SMALL_START_DOCSTRING, +) +class BlenderbotSmallForConditionalGeneration(BlenderbotSmallPreTrainedModel, GenerationMixin): + base_model_prefix = "model" + _keys_to_ignore_on_load_missing = ["final_logits_bias"] + _tied_weights_keys = ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight", "lm_head.weight"] + + def __init__(self, config: BlenderbotSmallConfig): + super().__init__(config) + self.model = BlenderbotSmallModel(config) + self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings))) + self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_encoder(self): + return self.model.get_encoder() + + def get_decoder(self): + return self.model.get_decoder() + + def resize_token_embeddings( + self, new_num_tokens: int, pad_to_multiple_of: Optional[int] = None, mean_resizing: bool = True + ) -> nn.Embedding: + new_embeddings = super().resize_token_embeddings(new_num_tokens, pad_to_multiple_of, mean_resizing) + self._resize_final_logits_bias(new_embeddings.weight.shape[0]) + return new_embeddings + + def _resize_final_logits_bias(self, new_num_tokens: int) -> None: + old_num_tokens = self.final_logits_bias.shape[-1] + if new_num_tokens <= old_num_tokens: + new_bias = self.final_logits_bias[:, :new_num_tokens] + else: + extra_bias = torch.zeros((1, new_num_tokens - old_num_tokens), device=self.final_logits_bias.device) + new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1) + self.register_buffer("final_logits_bias", new_bias) + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + @add_start_docstrings_to_model_forward(BLENDERBOT_SMALL_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=Seq2SeqLMOutput, config_class=_CONFIG_FOR_DOC) + @add_end_docstrings(BLENDERBOT_SMALL_GENERATION_EXAMPLE) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.Tensor] = None, + decoder_head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + encoder_outputs: Optional[Union[Tuple, BaseModelOutput]] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.Tensor] = None, + decoder_inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Returns: + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if labels is not None: + if use_cache: + logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.") + use_cache = False + if decoder_input_ids is None and decoder_inputs_embeds is None: + decoder_input_ids = shift_tokens_right( + labels, self.config.pad_token_id, self.config.decoder_start_token_id + ) + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + encoder_outputs=encoder_outputs, + decoder_attention_mask=decoder_attention_mask, + head_mask=head_mask, + decoder_head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + lm_logits = self.lm_head(outputs[0]) + self.final_logits_bias + + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (lm_logits,) + outputs[1:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return Seq2SeqLMOutput( + loss=masked_lm_loss, + logits=lm_logits, + past_key_values=outputs.past_key_values, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + # cached cross_attention states don't have to be reordered -> they are always the same + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past[:2]) + + layer_past[2:], + ) + return reordered_past + + +# Copied from transformers.models.bart.modeling_bart.BartDecoderWrapper with Bart->BlenderbotSmall +class BlenderbotSmallDecoderWrapper(BlenderbotSmallPreTrainedModel): + """ + This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is + used in combination with the [`EncoderDecoderModel`] framework. + """ + + def __init__(self, config): + super().__init__(config) + self.decoder = BlenderbotSmallDecoder(config) + + def forward(self, *args, **kwargs): + return self.decoder(*args, **kwargs) + + +# Copied from transformers.models.bart.modeling_bart.BartForCausalLM with Bart->BlenderbotSmall, facebook/bart-base->facebook/blenderbot_small-90M +class BlenderbotSmallForCausalLM(BlenderbotSmallPreTrainedModel, GenerationMixin): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config): + config = copy.deepcopy(config) + config.is_decoder = True + config.is_encoder_decoder = False + super().__init__(config) + self.model = BlenderbotSmallDecoderWrapper(config) + + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.decoder.embed_tokens + + def set_input_embeddings(self, value): + self.model.decoder.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model.decoder = decoder + + def get_decoder(self): + return self.model.decoder + + @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.Tensor] = None, + cross_attn_head_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CausalLMOutputWithCrossAttentions]: + r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + if the model is configured as a decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used + in the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of + shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two additional + tensors are only required when the model is used as a decoder in a Sequence to Sequence model. + + Contains pre-computed hidden-states (key and values in the self-attention blocks and in the + cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those + that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of + all `decoder_input_ids` of shape `(batch_size, sequence_length)`. + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding + (see `past_key_values`). + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, BlenderbotSmallForCausalLM + + >>> tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot_small-90M") + >>> model = BlenderbotSmallForCausalLM.from_pretrained("facebook/blenderbot_small-90M", add_cross_attention=False) + >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder." + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + + >>> logits = outputs.logits + >>> expected_shape = [1, inputs.input_ids.shape[-1], model.config.vocab_size] + >>> list(logits.shape) == expected_shape + True + ```""" + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model.decoder( + input_ids=input_ids, + attention_mask=attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + head_mask=head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + logits = self.lm_head(outputs[0]) + + loss = None + if labels is not None: + labels = labels.to(logits.device) + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + @staticmethod + def _reorder_cache(past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past + + +__all__ = [ + "BlenderbotSmallForCausalLM", + "BlenderbotSmallForConditionalGeneration", + "BlenderbotSmallModel", + "BlenderbotSmallPreTrainedModel", +] diff --git a/docs/transformers/build/lib/transformers/models/blenderbot_small/modeling_flax_blenderbot_small.py b/docs/transformers/build/lib/transformers/models/blenderbot_small/modeling_flax_blenderbot_small.py new file mode 100644 index 0000000000000000000000000000000000000000..6aceaa611c956f2eabfee7fb04ee7bef81fbc097 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/blenderbot_small/modeling_flax_blenderbot_small.py @@ -0,0 +1,1528 @@ +# coding=utf-8 +# Copyright 2021 The Facebook, Inc. and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Flax BlenderbotSmall model.""" + +import math +import random +from functools import partial +from typing import Callable, Optional, Tuple + +import flax.linen as nn +import jax +import jax.numpy as jnp +from flax.core.frozen_dict import FrozenDict, freeze, unfreeze +from flax.linen import combine_masks, make_causal_mask +from flax.linen.attention import dot_product_attention_weights +from flax.traverse_util import flatten_dict, unflatten_dict +from jax import lax +from jax.random import PRNGKey + +from ...modeling_flax_outputs import ( + FlaxBaseModelOutput, + FlaxBaseModelOutputWithPastAndCrossAttentions, + FlaxCausalLMOutputWithCrossAttentions, + FlaxSeq2SeqLMOutput, + FlaxSeq2SeqModelOutput, +) +from ...modeling_flax_utils import ( + ACT2FN, + FlaxPreTrainedModel, + append_call_sample_docstring, + append_replace_return_docstrings, + overwrite_call_docstring, +) +from ...utils import add_start_docstrings, logging, replace_return_docstrings +from .configuration_blenderbot_small import BlenderbotSmallConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "facebook/blenderbot_small-90M" +_CONFIG_FOR_DOC = "BlenderbotSmallConfig" + +BLENDERBOT_SMALL_START_DOCSTRING = r""" + This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a Flax Linen + [flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a + regular Flax Module and refer to the Flax documentation for all matter related to general usage and behavior. + + Finally, this model supports inherent JAX features such as: + + - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit) + - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation) + - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap) + - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap) + + Parameters: + config ([`BlenderbotSmallConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights. + dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`): + The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and + `jax.numpy.bfloat16` (on TPUs). + + This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If + specified all the computation will be performed with the given `dtype`. + + **Note that this only specifies the dtype of the computation and does not influence the dtype of model + parameters.** + + If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and + [`~FlaxPreTrainedModel.to_bf16`]. +""" + +BLENDERBOT_SMALL_INPUTS_DOCSTRING = r""" + Args: + input_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + decoder_input_ids (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*): + Indices of decoder input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are decoder input IDs?](../glossary#decoder-input-ids) + + For translation and summarization training, `decoder_input_ids` should be provided. If no + `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right + for denoising pre-training following the paper. + decoder_attention_mask (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*): + Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also + be used by default. + + If you want to change padding behavior, you should modify to your needs. See diagram 1 in [the + paper](https://arxiv.org/abs/1910.13461) for more information on the default strategy. + position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + decoder_position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the + range `[0, config.max_position_embeddings - 1]`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +BLENDERBOT_SMALL_ENCODE_INPUTS_DOCSTRING = r""" + Args: + input_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + +BLENDERBOT_SMALL_DECODE_INPUTS_DOCSTRING = r""" + Args: + decoder_input_ids (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`): + Indices of decoder input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are decoder input IDs?](../glossary#decoder-input-ids) + + For translation and summarization training, `decoder_input_ids` should be provided. If no + `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right + for denoising pre-training following the paper. + encoder_outputs (`tuple(tuple(jnp.ndarray)`): + Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`) + `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of + hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. + encoder_attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + decoder_attention_mask (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*): + Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also + be used by default. + + If you want to change padding behavior, you should modify to your needs. See diagram 1 in [the + paper](https://arxiv.org/abs/1910.13461) for more information on the default strategy. + decoder_position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the + range `[0, config.max_position_embeddings - 1]`. + past_key_values (`Dict[str, np.ndarray]`, *optional*, returned by `init_cache` or when passing previous `past_key_values`): + Dictionary of pre-computed hidden-states (key and values in the attention blocks) that can be used for fast + auto-regressive decoding. Pre-computed key and value hidden-states are of shape *[batch_size, max_length]*. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +# Copied from transformers.models.bart.modeling_flax_bart.shift_tokens_right +def shift_tokens_right(input_ids: jnp.ndarray, pad_token_id: int, decoder_start_token_id: int) -> jnp.ndarray: + """ + Shift input ids one token to the right. + """ + shifted_input_ids = jnp.zeros_like(input_ids) + shifted_input_ids = shifted_input_ids.at[:, 1:].set(input_ids[:, :-1]) + shifted_input_ids = shifted_input_ids.at[:, 0].set(decoder_start_token_id) + + shifted_input_ids = jnp.where(shifted_input_ids == -100, pad_token_id, shifted_input_ids) + return shifted_input_ids + + +# Copied from transformers.models.bart.modeling_flax_bart.FlaxBartAttention with Bart->BlenderbotSmall +class FlaxBlenderbotSmallAttention(nn.Module): + config: BlenderbotSmallConfig + embed_dim: int + num_heads: int + dropout: float = 0.0 + causal: bool = False + bias: bool = True + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self) -> None: + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" + f" and `num_heads`: {self.num_heads})." + ) + + dense = partial( + nn.Dense, + self.embed_dim, + use_bias=self.bias, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(self.config.init_std), + ) + + self.q_proj, self.k_proj, self.v_proj = dense(), dense(), dense() + self.out_proj = dense() + + self.dropout_layer = nn.Dropout(rate=self.dropout) + + if self.causal: + self.causal_mask = make_causal_mask( + jnp.ones((1, self.config.max_position_embeddings), dtype="bool"), dtype="bool" + ) + + def _split_heads(self, hidden_states): + return hidden_states.reshape(hidden_states.shape[:2] + (self.num_heads, self.head_dim)) + + def _merge_heads(self, hidden_states): + return hidden_states.reshape(hidden_states.shape[:2] + (self.embed_dim,)) + + @nn.compact + def _concatenate_to_cache(self, key, value, query, attention_mask): + """ + This function takes projected key, value states from a single input token and concatenates the states to cached + states from previous steps. This function is slightly adapted from the official Flax repository: + https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252 + """ + # detect if we're initializing by absence of existing cache data. + is_initialized = self.has_variable("cache", "cached_key") + cached_key = self.variable("cache", "cached_key", jnp.zeros, key.shape, key.dtype) + cached_value = self.variable("cache", "cached_value", jnp.zeros, value.shape, value.dtype) + cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32)) + + if is_initialized: + *batch_dims, max_length, num_heads, depth_per_head = cached_key.value.shape + # update key, value caches with our new 1d spatial slices + cur_index = cache_index.value + indices = (0,) * len(batch_dims) + (cur_index, 0, 0) + key = lax.dynamic_update_slice(cached_key.value, key, indices) + value = lax.dynamic_update_slice(cached_value.value, value, indices) + cached_key.value = key + cached_value.value = value + num_updated_cache_vectors = query.shape[1] + cache_index.value = cache_index.value + num_updated_cache_vectors + # causal mask for cached decoder self-attention: our single query position should only attend to those key positions that have already been generated and cached, not the remaining zero elements. + pad_mask = jnp.broadcast_to( + jnp.arange(max_length) < cur_index + num_updated_cache_vectors, + tuple(batch_dims) + (1, num_updated_cache_vectors, max_length), + ) + attention_mask = combine_masks(pad_mask, attention_mask) + return key, value, attention_mask + + def __call__( + self, + hidden_states: jnp.ndarray, + key_value_states: Optional[jnp.ndarray] = None, + attention_mask: Optional[jnp.ndarray] = None, + init_cache: bool = False, + deterministic: bool = True, + ) -> Tuple[jnp.ndarray]: + """Input shape: Batch x Time x Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + batch_size = hidden_states.shape[0] + + # get query proj + query_states = self.q_proj(hidden_states) + # get key, value proj + if is_cross_attention: + # cross_attentions + key_states = self.k_proj(key_value_states) + value_states = self.v_proj(key_value_states) + else: + # self_attention + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = self._split_heads(query_states) + key_states = self._split_heads(key_states) + value_states = self._split_heads(value_states) + + # handle cache prepare causal attention mask + if self.causal: + query_length, key_length = query_states.shape[1], key_states.shape[1] + if self.has_variable("cache", "cached_key"): + mask_shift = self.variables["cache"]["cache_index"] + max_decoder_length = self.variables["cache"]["cached_key"].shape[1] + causal_mask = lax.dynamic_slice( + self.causal_mask, (0, 0, mask_shift, 0), (1, 1, query_length, max_decoder_length) + ) + else: + causal_mask = self.causal_mask[:, :, :query_length, :key_length] + causal_mask = jnp.broadcast_to(causal_mask, (batch_size,) + causal_mask.shape[1:]) + + # combine masks if needed + if attention_mask is not None and self.causal: + attention_mask = jnp.broadcast_to(jnp.expand_dims(attention_mask, axis=(-3, -2)), causal_mask.shape) + attention_mask = combine_masks(attention_mask, causal_mask) + elif self.causal: + attention_mask = causal_mask + elif attention_mask is not None: + attention_mask = jnp.expand_dims(attention_mask, axis=(-3, -2)) + + # During fast autoregressive decoding, we feed one position at a time, + # and cache the keys and values step by step. + if self.causal and (self.has_variable("cache", "cached_key") or init_cache): + key_states, value_states, attention_mask = self._concatenate_to_cache( + key_states, value_states, query_states, attention_mask + ) + + # Convert the boolean attention mask to an attention bias. + if attention_mask is not None: + # attention mask in the form of attention bias + attention_bias = lax.select( + attention_mask > 0, + jnp.full(attention_mask.shape, 0.0).astype(self.dtype), + jnp.full(attention_mask.shape, jnp.finfo(self.dtype).min).astype(self.dtype), + ) + else: + attention_bias = None + + dropout_rng = None + if not deterministic and self.dropout > 0.0: + dropout_rng = self.make_rng("dropout") + + attn_weights = dot_product_attention_weights( + query_states, + key_states, + bias=attention_bias, + dropout_rng=dropout_rng, + dropout_rate=self.dropout, + broadcast_dropout=True, + deterministic=deterministic, + dtype=self.dtype, + precision=None, + ) + + attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, value_states) + attn_output = self._merge_heads(attn_output) + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights + + +# Copied from transformers.models.bart.modeling_flax_bart.FlaxBartEncoderLayer with Bart->BlenderbotSmall +class FlaxBlenderbotSmallEncoderLayer(nn.Module): + config: BlenderbotSmallConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self) -> None: + self.embed_dim = self.config.d_model + self.self_attn = FlaxBlenderbotSmallAttention( + config=self.config, + embed_dim=self.embed_dim, + num_heads=self.config.encoder_attention_heads, + dropout=self.config.attention_dropout, + dtype=self.dtype, + ) + self.self_attn_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05) + self.dropout_layer = nn.Dropout(rate=self.config.dropout) + self.activation_fn = ACT2FN[self.config.activation_function] + self.activation_dropout_layer = nn.Dropout(rate=self.config.activation_dropout) + self.fc1 = nn.Dense( + self.config.encoder_ffn_dim, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(self.config.init_std), + ) + self.fc2 = nn.Dense( + self.embed_dim, dtype=self.dtype, kernel_init=jax.nn.initializers.normal(self.config.init_std) + ) + self.final_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05) + + def __call__( + self, + hidden_states: jnp.ndarray, + attention_mask: jnp.ndarray, + output_attentions: bool = True, + deterministic: bool = True, + ) -> Tuple[jnp.ndarray]: + residual = hidden_states + hidden_states, attn_weights = self.self_attn(hidden_states=hidden_states, attention_mask=attention_mask) + + hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = self.activation_dropout_layer(hidden_states, deterministic=deterministic) + hidden_states = self.fc2(hidden_states) + hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic) + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +# Copied from transformers.models.bart.modeling_flax_bart.FlaxBartEncoderLayerCollection with Bart->BlenderbotSmall +class FlaxBlenderbotSmallEncoderLayerCollection(nn.Module): + config: BlenderbotSmallConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.layers = [ + FlaxBlenderbotSmallEncoderLayer(self.config, name=str(i), dtype=self.dtype) + for i in range(self.config.encoder_layers) + ] + self.layerdrop = self.config.encoder_layerdrop + + def __call__( + self, + hidden_states, + attention_mask, + deterministic: bool = True, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + all_attentions = () if output_attentions else None + all_hidden_states = () if output_hidden_states else None + + for encoder_layer in self.layers: + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + dropout_probability = random.uniform(0, 1) + if not deterministic and (dropout_probability < self.layerdrop): # skip the layer + layer_outputs = (None, None) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + output_attentions, + deterministic, + ) + hidden_states = layer_outputs[0] + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states += (hidden_states,) + + outputs = (hidden_states, all_hidden_states, all_attentions) + + if not return_dict: + return tuple(v for v in outputs if v is not None) + + return FlaxBaseModelOutput( + last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions + ) + + +# Copied from transformers.models.bart.modeling_flax_bart.FlaxBartDecoderLayer with Bart->BlenderbotSmall +class FlaxBlenderbotSmallDecoderLayer(nn.Module): + config: BlenderbotSmallConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self) -> None: + self.embed_dim = self.config.d_model + self.self_attn = FlaxBlenderbotSmallAttention( + config=self.config, + embed_dim=self.embed_dim, + num_heads=self.config.decoder_attention_heads, + dropout=self.config.attention_dropout, + causal=True, + dtype=self.dtype, + ) + self.dropout_layer = nn.Dropout(rate=self.config.dropout) + self.activation_fn = ACT2FN[self.config.activation_function] + self.activation_dropout_layer = nn.Dropout(rate=self.config.activation_dropout) + + self.self_attn_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05) + self.encoder_attn = FlaxBlenderbotSmallAttention( + config=self.config, + embed_dim=self.embed_dim, + num_heads=self.config.decoder_attention_heads, + dropout=self.config.attention_dropout, + dtype=self.dtype, + ) + self.encoder_attn_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05) + self.fc1 = nn.Dense( + self.config.decoder_ffn_dim, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(self.config.init_std), + ) + self.fc2 = nn.Dense( + self.embed_dim, dtype=self.dtype, kernel_init=jax.nn.initializers.normal(self.config.init_std) + ) + self.final_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05) + + def __call__( + self, + hidden_states: jnp.ndarray, + attention_mask: jnp.ndarray, + encoder_hidden_states: Optional[jnp.ndarray] = None, + encoder_attention_mask: Optional[jnp.ndarray] = None, + init_cache: bool = False, + output_attentions: bool = True, + deterministic: bool = True, + ) -> Tuple[jnp.ndarray]: + residual = hidden_states + + # Self Attention + hidden_states, self_attn_weights = self.self_attn( + hidden_states=hidden_states, attention_mask=attention_mask, init_cache=init_cache + ) + hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + # Cross-Attention Block + cross_attn_weights = None + if encoder_hidden_states is not None: + residual = hidden_states + + hidden_states, cross_attn_weights = self.encoder_attn( + hidden_states=hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + ) + hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic) + hidden_states = residual + hidden_states + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + # Fully Connected + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = self.activation_dropout_layer(hidden_states, deterministic=deterministic) + hidden_states = self.fc2(hidden_states) + hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic) + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights, cross_attn_weights) + + return outputs + + +# Copied from transformers.models.bart.modeling_flax_bart.FlaxBartDecoderLayerCollection with Bart->BlenderbotSmall +class FlaxBlenderbotSmallDecoderLayerCollection(nn.Module): + config: BlenderbotSmallConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.layers = [ + FlaxBlenderbotSmallDecoderLayer(self.config, name=str(i), dtype=self.dtype) + for i in range(self.config.decoder_layers) + ] + self.layerdrop = self.config.decoder_layerdrop + + def __call__( + self, + hidden_states, + attention_mask, + encoder_hidden_states: Optional[jnp.ndarray] = None, + encoder_attention_mask: Optional[jnp.ndarray] = None, + deterministic: bool = True, + init_cache: bool = False, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None + + for decoder_layer in self.layers: + if output_hidden_states: + all_hidden_states += (hidden_states,) + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + dropout_probability = random.uniform(0, 1) + if not deterministic and (dropout_probability < self.layerdrop): + layer_outputs = (None, None, None) + else: + layer_outputs = decoder_layer( + hidden_states, + attention_mask=attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + init_cache=init_cache, + output_attentions=output_attentions, + deterministic=deterministic, + ) + + hidden_states = layer_outputs[0] + if output_attentions: + all_self_attns += (layer_outputs[1],) + + if encoder_hidden_states is not None: + all_cross_attentions += (layer_outputs[2],) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + outputs = [hidden_states, all_hidden_states, all_self_attns, all_cross_attentions] + + if not return_dict: + return tuple(v for v in outputs if v is not None) + + return FlaxBaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attentions, + ) + + +class FlaxBlenderbotSmallEncoder(nn.Module): + config: BlenderbotSmallConfig + embed_tokens: nn.Embed + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.dropout_layer = nn.Dropout(rate=self.config.dropout) + + embed_dim = self.config.d_model + self.padding_idx = self.config.pad_token_id + self.max_source_positions = self.config.max_position_embeddings + self.embed_scale = math.sqrt(embed_dim) if self.config.scale_embedding else 1.0 + + self.embed_positions = nn.Embed( + self.config.max_position_embeddings, + embed_dim, + embedding_init=jax.nn.initializers.normal(self.config.init_std), + ) + self.layers = FlaxBlenderbotSmallEncoderLayerCollection(self.config, self.dtype) + self.layernorm_embedding = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05) + + def __call__( + self, + input_ids, + attention_mask, + position_ids, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + deterministic: bool = True, + ): + input_shape = input_ids.shape + input_ids = input_ids.reshape(-1, input_shape[-1]) + + inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale + + embed_pos = self.embed_positions(position_ids) + + hidden_states = inputs_embeds + embed_pos + hidden_states = self.layernorm_embedding(hidden_states) + hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic) + + outputs = self.layers( + hidden_states, + attention_mask, + deterministic=deterministic, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if not return_dict: + return outputs + + return FlaxBaseModelOutput( + last_hidden_state=outputs.last_hidden_state, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class FlaxBlenderbotSmallDecoder(nn.Module): + config: BlenderbotSmallConfig + embed_tokens: nn.Embed + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.dropout_layer = nn.Dropout(rate=self.config.dropout) + + embed_dim = self.config.d_model + self.padding_idx = self.config.pad_token_id + self.max_target_positions = self.config.max_position_embeddings + self.embed_scale = math.sqrt(self.config.d_model) if self.config.scale_embedding else 1.0 + + self.embed_positions = nn.Embed( + self.config.max_position_embeddings, + embed_dim, + embedding_init=jax.nn.initializers.normal(self.config.init_std), + ) + + self.layers = FlaxBlenderbotSmallDecoderLayerCollection(self.config, self.dtype) + self.layernorm_embedding = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05) + + def __call__( + self, + input_ids, + attention_mask, + position_ids, + encoder_hidden_states: Optional[jnp.ndarray] = None, + encoder_attention_mask: Optional[jnp.ndarray] = None, + init_cache: bool = False, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + deterministic: bool = True, + ): + input_shape = input_ids.shape + input_ids = input_ids.reshape(-1, input_shape[-1]) + + inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale + + # embed positions + positions = self.embed_positions(position_ids) + + # BlenderbotSmall applies layer norm on inputs_embeds in decoder + inputs_embeds = self.layernorm_embedding(inputs_embeds) + hidden_states = inputs_embeds + positions + + hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic) + + outputs = self.layers( + hidden_states, + attention_mask, + encoder_hidden_states, + encoder_attention_mask, + deterministic=deterministic, + init_cache=init_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + if not return_dict: + return outputs + + return FlaxBaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=outputs.last_hidden_state, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + +# Copied from transformers.models.bart.modeling_flax_bart.FlaxBartModule with Bart->BlenderbotSmall +class FlaxBlenderbotSmallModule(nn.Module): + config: BlenderbotSmallConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + + def setup(self): + self.shared = nn.Embed( + self.config.vocab_size, + self.config.d_model, + embedding_init=jax.nn.initializers.normal(self.config.init_std), + dtype=self.dtype, + ) + + self.encoder = FlaxBlenderbotSmallEncoder(self.config, dtype=self.dtype, embed_tokens=self.shared) + self.decoder = FlaxBlenderbotSmallDecoder(self.config, dtype=self.dtype, embed_tokens=self.shared) + + def _get_encoder_module(self): + return self.encoder + + def _get_decoder_module(self): + return self.decoder + + def __call__( + self, + input_ids, + attention_mask, + decoder_input_ids, + decoder_attention_mask, + position_ids, + decoder_position_ids, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + deterministic: bool = True, + ): + encoder_outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + deterministic=deterministic, + ) + + decoder_outputs = self.decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + position_ids=decoder_position_ids, + encoder_hidden_states=encoder_outputs[0], + encoder_attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + deterministic=deterministic, + ) + + if not return_dict: + return decoder_outputs + encoder_outputs + + return FlaxSeq2SeqModelOutput( + last_hidden_state=decoder_outputs.last_hidden_state, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + +class FlaxBlenderbotSmallPreTrainedModel(FlaxPreTrainedModel): + config_class = BlenderbotSmallConfig + base_model_prefix: str = "model" + module_class: nn.Module = None + + def __init__( + self, + config: BlenderbotSmallConfig, + input_shape: Tuple[int] = (1, 1), + seed: int = 0, + dtype: jnp.dtype = jnp.float32, + _do_init: bool = True, + **kwargs, + ): + module = self.module_class(config=config, dtype=dtype, **kwargs) + super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) + + def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict: + # init input tensors + input_ids = jnp.zeros(input_shape, dtype="i4") + # make sure initialization pass will work for FlaxBlenderbotSmallForSequenceClassificationModule + input_ids = input_ids.at[(..., -1)].set(self.config.eos_token_id) + attention_mask = jnp.ones_like(input_ids) + decoder_input_ids = input_ids + decoder_attention_mask = jnp.ones_like(input_ids) + + batch_size, sequence_length = input_ids.shape + position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)) + decoder_position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)) + + params_rng, dropout_rng = jax.random.split(rng) + rngs = {"params": params_rng, "dropout": dropout_rng} + + random_params = self.module.init( + rngs, + input_ids, + attention_mask, + decoder_input_ids, + decoder_attention_mask, + position_ids, + decoder_position_ids, + )["params"] + + if params is not None: + random_params = flatten_dict(unfreeze(random_params)) + params = flatten_dict(unfreeze(params)) + for missing_key in self._missing_keys: + params[missing_key] = random_params[missing_key] + self._missing_keys = set() + return freeze(unflatten_dict(params)) + else: + return random_params + + def init_cache(self, batch_size, max_length, encoder_outputs): + r""" + Args: + batch_size (`int`): + batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache. + max_length (`int`): + maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized + cache. + encoder_outputs (`Union[FlaxBaseModelOutput, tuple(tuple(jnp.ndarray)]`): + `encoder_outputs` consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: + `attentions`). `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) + is a sequence of hidden-states at the output of the last layer of the encoder. Used in the + cross-attention of the decoder. + """ + # init input variables to retrieve cache + decoder_input_ids = jnp.ones((batch_size, max_length), dtype="i4") + decoder_attention_mask = jnp.ones_like(decoder_input_ids) + decoder_position_ids = jnp.broadcast_to( + jnp.arange(jnp.atleast_2d(decoder_input_ids).shape[-1]), decoder_input_ids.shape + ) + + def _decoder_forward(module, decoder_input_ids, decoder_attention_mask, decoder_position_ids, **kwargs): + decoder_module = module._get_decoder_module() + return decoder_module( + decoder_input_ids, + decoder_attention_mask, + decoder_position_ids, + **kwargs, + ) + + init_variables = self.module.init( + jax.random.PRNGKey(0), + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + decoder_position_ids=decoder_position_ids, + encoder_hidden_states=encoder_outputs[0], + init_cache=True, + method=_decoder_forward, # we only need to call the decoder to init the cache + ) + return unfreeze(init_variables["cache"]) + + @add_start_docstrings(BLENDERBOT_SMALL_ENCODE_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=FlaxBaseModelOutput, config_class=BlenderbotSmallConfig) + def encode( + self, + input_ids: jnp.ndarray, + attention_mask: Optional[jnp.ndarray] = None, + position_ids: Optional[jnp.ndarray] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + train: bool = False, + params: dict = None, + dropout_rng: PRNGKey = None, + ): + r""" + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, FlaxBlenderbotSmallForConditionalGeneration + + >>> model = FlaxBlenderbotSmallForConditionalGeneration.from_pretrained("facebook/blenderbot_small-90M") + >>> tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot_small-90M") + + >>> text = "My friends are cool but they eat too many carbs." + >>> inputs = tokenizer(text, max_length=1024, return_tensors="np") + >>> encoder_outputs = model.encode(**inputs) + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.return_dict + + if attention_mask is None: + attention_mask = jnp.ones_like(input_ids) + if position_ids is None: + batch_size, sequence_length = input_ids.shape + position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)) + + # Handle any PRNG if needed + rngs = {} + if dropout_rng is not None: + rngs["dropout"] = dropout_rng + + def _encoder_forward(module, input_ids, attention_mask, position_ids, **kwargs): + encode_module = module._get_encoder_module() + return encode_module(input_ids, attention_mask, position_ids, **kwargs) + + return self.module.apply( + {"params": params or self.params}, + input_ids=jnp.array(input_ids, dtype="i4"), + attention_mask=jnp.array(attention_mask, dtype="i4"), + position_ids=jnp.array(position_ids, dtype="i4"), + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + deterministic=not train, + rngs=rngs, + method=_encoder_forward, + ) + + @add_start_docstrings(BLENDERBOT_SMALL_DECODE_INPUTS_DOCSTRING) + @replace_return_docstrings( + output_type=FlaxBaseModelOutputWithPastAndCrossAttentions, config_class=BlenderbotSmallConfig + ) + def decode( + self, + decoder_input_ids, + encoder_outputs, + encoder_attention_mask: Optional[jnp.ndarray] = None, + decoder_attention_mask: Optional[jnp.ndarray] = None, + decoder_position_ids: Optional[jnp.ndarray] = None, + past_key_values: dict = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + train: bool = False, + params: dict = None, + dropout_rng: PRNGKey = None, + ): + r""" + Returns: + + Example: + + ```python + >>> import jax.numpy as jnp + >>> from transformers import AutoTokenizer, FlaxBlenderbotSmallForConditionalGeneration + + >>> model = FlaxBlenderbotSmallForConditionalGeneration.from_pretrained("facebook/blenderbot_small-90M") + >>> tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot_small-90M") + + >>> text = "My friends are cool but they eat too many carbs." + >>> inputs = tokenizer(text, max_length=1024, return_tensors="np") + >>> encoder_outputs = model.encode(**inputs) + + >>> decoder_start_token_id = model.config.decoder_start_token_id + >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id + + >>> outputs = model.decode(decoder_input_ids, encoder_outputs) + >>> last_decoder_hidden_states = outputs.last_hidden_state + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.return_dict + + encoder_hidden_states = encoder_outputs[0] + if encoder_attention_mask is None: + batch_size, sequence_length = encoder_hidden_states.shape[:2] + encoder_attention_mask = jnp.ones((batch_size, sequence_length)) + + batch_size, sequence_length = decoder_input_ids.shape + if decoder_attention_mask is None: + decoder_attention_mask = jnp.ones((batch_size, sequence_length)) + + if decoder_position_ids is None: + if past_key_values is not None: + raise ValueError("Make sure to provide `decoder_position_ids` when passing `past_key_values`.") + + decoder_position_ids = jnp.broadcast_to( + jnp.arange(sequence_length)[None, :], (batch_size, sequence_length) + ) + + # Handle any PRNG if needed + rngs = {} + if dropout_rng is not None: + rngs["dropout"] = dropout_rng + + inputs = {"params": params or self.params} + + # if past_key_values are passed then cache is already initialized a private flag init_cache has to be + # passed down to ensure cache is used. It has to be made sure that cache is marked as mutable so that + # it can be changed by FlaxBlenderbotSmallAttention module + if past_key_values: + inputs["cache"] = past_key_values + mutable = ["cache"] + else: + mutable = False + + def _decoder_forward(module, decoder_input_ids, decoder_attention_mask, decoder_position_ids, **kwargs): + decoder_module = module._get_decoder_module() + return decoder_module( + decoder_input_ids, + decoder_attention_mask, + decoder_position_ids, + **kwargs, + ) + + outputs = self.module.apply( + inputs, + decoder_input_ids=jnp.array(decoder_input_ids, dtype="i4"), + decoder_attention_mask=jnp.array(decoder_attention_mask, dtype="i4"), + decoder_position_ids=jnp.array(decoder_position_ids, dtype="i4"), + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=jnp.array(encoder_attention_mask, dtype="i4"), + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + deterministic=not train, + rngs=rngs, + mutable=mutable, + method=_decoder_forward, + ) + + # add updated cache to model output + if past_key_values is not None and return_dict: + outputs, past = outputs + outputs["past_key_values"] = unfreeze(past["cache"]) + return outputs + elif past_key_values is not None and not return_dict: + outputs, past = outputs + outputs = outputs[:1] + (unfreeze(past["cache"]),) + outputs[1:] + + return outputs + + def __call__( + self, + input_ids: jnp.ndarray, + attention_mask: Optional[jnp.ndarray] = None, + decoder_input_ids: Optional[jnp.ndarray] = None, + decoder_attention_mask: Optional[jnp.ndarray] = None, + position_ids: Optional[jnp.ndarray] = None, + decoder_position_ids: Optional[jnp.ndarray] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + train: bool = False, + params: dict = None, + dropout_rng: PRNGKey = None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.return_dict + + # prepare encoder inputs + if attention_mask is None: + attention_mask = jnp.ones_like(input_ids) + if position_ids is None: + batch_size, sequence_length = input_ids.shape + position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)) + + # prepare decoder inputs + if decoder_input_ids is None: + decoder_input_ids = shift_tokens_right( + input_ids, self.config.pad_token_id, decoder_start_token_id=self.config.decoder_start_token_id + ) + if decoder_attention_mask is None: + decoder_attention_mask = jnp.ones_like(decoder_input_ids) + if decoder_position_ids is None: + batch_size, sequence_length = decoder_input_ids.shape + decoder_position_ids = jnp.broadcast_to( + jnp.arange(sequence_length)[None, :], (batch_size, sequence_length) + ) + + # Handle any PRNG if needed + rngs = {"dropout": dropout_rng} if dropout_rng is not None else {} + + return self.module.apply( + {"params": params or self.params}, + input_ids=jnp.array(input_ids, dtype="i4"), + attention_mask=jnp.array(attention_mask, dtype="i4"), + position_ids=jnp.array(position_ids, dtype="i4"), + decoder_input_ids=jnp.array(decoder_input_ids, dtype="i4"), + decoder_attention_mask=jnp.array(decoder_attention_mask, dtype="i4"), + decoder_position_ids=jnp.array(decoder_position_ids, dtype="i4"), + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + deterministic=not train, + rngs=rngs, + ) + + +@add_start_docstrings( + "The bare BlenderbotSmall Model transformer outputting raw hidden-states without any specific head on top.", + BLENDERBOT_SMALL_START_DOCSTRING, +) +class FlaxBlenderbotSmallModel(FlaxBlenderbotSmallPreTrainedModel): + config: BlenderbotSmallConfig + dtype: jnp.dtype = jnp.float32 # the dtype of the computation + module_class = FlaxBlenderbotSmallModule + + +append_call_sample_docstring(FlaxBlenderbotSmallModel, _CHECKPOINT_FOR_DOC, FlaxSeq2SeqModelOutput, _CONFIG_FOR_DOC) + + +# Copied from transformers.models.bart.modeling_flax_bart.FlaxBartForConditionalGenerationModule with Bart->BlenderbotSmall +class FlaxBlenderbotSmallForConditionalGenerationModule(nn.Module): + config: BlenderbotSmallConfig + dtype: jnp.dtype = jnp.float32 + bias_init: Callable[..., jnp.ndarray] = jax.nn.initializers.zeros + + def setup(self): + self.model = FlaxBlenderbotSmallModule(config=self.config, dtype=self.dtype) + self.lm_head = nn.Dense( + self.model.shared.num_embeddings, + use_bias=False, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(self.config.init_std), + ) + self.final_logits_bias = self.param("final_logits_bias", self.bias_init, (1, self.model.shared.num_embeddings)) + + def _get_encoder_module(self): + return self.model.encoder + + def _get_decoder_module(self): + return self.model.decoder + + def __call__( + self, + input_ids, + attention_mask, + decoder_input_ids, + decoder_attention_mask, + position_ids, + decoder_position_ids, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + deterministic: bool = True, + ): + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + position_ids=position_ids, + decoder_position_ids=decoder_position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + deterministic=deterministic, + ) + + hidden_states = outputs[0] + + if self.config.tie_word_embeddings: + shared_embedding = self.model.variables["params"]["shared"]["embedding"] + lm_logits = self.lm_head.apply({"params": {"kernel": shared_embedding.T}}, hidden_states) + else: + lm_logits = self.lm_head(hidden_states) + + lm_logits += jax.lax.stop_gradient(self.final_logits_bias.astype(self.dtype)) + + if not return_dict: + output = (lm_logits,) + outputs[1:] + return output + + return FlaxSeq2SeqLMOutput( + logits=lm_logits, + decoder_hidden_states=outputs.decoder_hidden_states, + decoder_attentions=outputs.decoder_attentions, + cross_attentions=outputs.cross_attentions, + encoder_last_hidden_state=outputs.encoder_last_hidden_state, + encoder_hidden_states=outputs.encoder_hidden_states, + encoder_attentions=outputs.encoder_attentions, + ) + + +@add_start_docstrings( + "The BLENDERBOT_SMALL Model with a language modeling head. Can be used for summarization.", + BLENDERBOT_SMALL_START_DOCSTRING, +) +class FlaxBlenderbotSmallForConditionalGeneration(FlaxBlenderbotSmallPreTrainedModel): + module_class = FlaxBlenderbotSmallForConditionalGenerationModule + dtype: jnp.dtype = jnp.float32 + + @add_start_docstrings(BLENDERBOT_SMALL_DECODE_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=FlaxCausalLMOutputWithCrossAttentions, config_class=BlenderbotSmallConfig) + def decode( + self, + decoder_input_ids, + encoder_outputs, + encoder_attention_mask: Optional[jnp.ndarray] = None, + decoder_attention_mask: Optional[jnp.ndarray] = None, + decoder_position_ids: Optional[jnp.ndarray] = None, + past_key_values: dict = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + deterministic: bool = True, + params: dict = None, + dropout_rng: PRNGKey = None, + ): + r""" + Returns: + + Example: + + ```python + >>> import jax.numpy as jnp + >>> from transformers import AutoTokenizer, FlaxBlenderbotSmallForConditionalGeneration + + >>> model = FlaxBlenderbotSmallForConditionalGeneration.from_pretrained("facebook/blenderbot_small-90M") + >>> tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot_small-90M") + + >>> text = "My friends are cool but they eat too many carbs." + >>> inputs = tokenizer(text, max_length=1024, return_tensors="np") + >>> encoder_outputs = model.encode(**inputs) + + >>> decoder_start_token_id = model.config.decoder_start_token_id + >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id + + >>> outputs = model.decode(decoder_input_ids, encoder_outputs) + >>> logits = outputs.logits + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.return_dict + + encoder_hidden_states = encoder_outputs[0] + if encoder_attention_mask is None: + batch_size, sequence_length = encoder_hidden_states.shape[:2] + encoder_attention_mask = jnp.ones((batch_size, sequence_length)) + + batch_size, sequence_length = decoder_input_ids.shape + if decoder_attention_mask is None: + decoder_attention_mask = jnp.ones((batch_size, sequence_length)) + + if decoder_position_ids is None: + if past_key_values is not None: + raise ValueError("Make sure to provide `decoder_position_ids` when passing `past_key_values`.") + + decoder_position_ids = jnp.broadcast_to( + jnp.arange(sequence_length)[None, :], (batch_size, sequence_length) + ) + + # Handle any PRNG if needed + rngs = {} + if dropout_rng is not None: + rngs["dropout"] = dropout_rng + + inputs = {"params": params or self.params} + + # if past_key_values are passed then cache is already initialized a private flag init_cache has to be + # passed down to ensure cache is used. It has to be made sure that cache is marked as mutable so that + # it can be changed by FlaxBlenderbotSmallAttention module + if past_key_values: + inputs["cache"] = past_key_values + mutable = ["cache"] + else: + mutable = False + + def _decoder_forward(module, decoder_input_ids, decoder_attention_mask, decoder_position_ids, **kwargs): + decoder_module = module._get_decoder_module() + outputs = decoder_module( + decoder_input_ids, + decoder_attention_mask, + decoder_position_ids, + **kwargs, + ) + hidden_states = outputs[0] + + if self.config.tie_word_embeddings: + shared_embedding = module.model.variables["params"]["shared"]["embedding"] + lm_logits = module.lm_head.apply({"params": {"kernel": shared_embedding.T}}, hidden_states) + else: + lm_logits = module.lm_head(hidden_states) + + lm_logits += module.final_logits_bias.astype(self.dtype) + return lm_logits, outputs + + outputs = self.module.apply( + inputs, + decoder_input_ids=jnp.array(decoder_input_ids, dtype="i4"), + decoder_attention_mask=jnp.array(decoder_attention_mask, dtype="i4"), + decoder_position_ids=jnp.array(decoder_position_ids, dtype="i4"), + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=jnp.array(encoder_attention_mask, dtype="i4"), + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + deterministic=deterministic, + rngs=rngs, + mutable=mutable, + method=_decoder_forward, + ) + + if past_key_values is None: + lm_logits, decoder_outputs = outputs + else: + (lm_logits, decoder_outputs), past = outputs + + if return_dict: + outputs = FlaxCausalLMOutputWithCrossAttentions( + logits=lm_logits, + hidden_states=decoder_outputs.hidden_states, + attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + ) + else: + outputs = (lm_logits,) + decoder_outputs[1:] + + # add updated cache to model output + if past_key_values is not None and return_dict: + outputs["past_key_values"] = unfreeze(past["cache"]) + return outputs + elif past_key_values is not None and not return_dict: + outputs = outputs[:1] + (unfreeze(past["cache"]),) + outputs[1:] + + return outputs + + def prepare_inputs_for_generation( + self, + decoder_input_ids, + max_length, + attention_mask: Optional[jax.Array] = None, + decoder_attention_mask: Optional[jax.Array] = None, + encoder_outputs=None, + **kwargs, + ): + # initializing the cache + batch_size, seq_length = decoder_input_ids.shape + + past_key_values = self.init_cache(batch_size, max_length, encoder_outputs) + # Note that usually one would have to put 0's in the attention_mask for x > input_ids.shape[-1] and x < cache_length. + # But since the decoder uses a causal mask, those positions are masked anyways. + # Thus we can create a single static attention_mask here, which is more efficient for compilation + extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4") + if decoder_attention_mask is not None: + position_ids = decoder_attention_mask.cumsum(axis=-1) - 1 + extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, decoder_attention_mask, (0, 0)) + else: + position_ids = jnp.broadcast_to(jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length)) + + return { + "past_key_values": past_key_values, + "encoder_outputs": encoder_outputs, + "encoder_attention_mask": attention_mask, + "decoder_attention_mask": extended_attention_mask, + "decoder_position_ids": position_ids, + } + + def update_inputs_for_generation(self, model_outputs, model_kwargs): + model_kwargs["past_key_values"] = model_outputs.past_key_values + model_kwargs["decoder_position_ids"] = model_kwargs["decoder_position_ids"][:, -1:] + 1 + return model_kwargs + + +FLAX_BLENDERBOT_SMALL_CONDITIONAL_GENERATION_DOCSTRING = """ + Returns: + + Summarization example: + + ```py + >>> from transformers import AutoTokenizer, FlaxBlenderbotSmallForConditionalGeneration + + >>> model = FlaxBlenderbotSmallForConditionalGeneration.from_pretrained("facebook/blenderbot_small-90M") + >>> tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot_small-90M") + + >>> ARTICLE_TO_SUMMARIZE = "My friends are cool but they eat too many carbs." + >>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors="np") + + >>> # Generate Summary + >>> summary_ids = model.generate(inputs["input_ids"]).sequences + >>> print(tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)) + ``` + + Mask filling example: + + ```py + >>> from transformers import AutoTokenizer, FlaxBlenderbotSmallForConditionalGeneration + + >>> tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot_small-90M") + >>> TXT = "My friends are but they eat too many carbs." + + >>> model = FlaxBlenderbotSmallForConditionalGeneration.from_pretrained("facebook/blenderbot_small-90M") + >>> input_ids = tokenizer([TXT], return_tensors="np")["input_ids"] + >>> logits = model(input_ids).logits + + >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item() + >>> probs = jax.nn.softmax(logits[0, masked_index], axis=0) + >>> values, predictions = jax.lax.top_k(probs) + + >>> tokenizer.decode(predictions).split() + ``` +""" + +overwrite_call_docstring( + FlaxBlenderbotSmallForConditionalGeneration, + BLENDERBOT_SMALL_INPUTS_DOCSTRING + FLAX_BLENDERBOT_SMALL_CONDITIONAL_GENERATION_DOCSTRING, +) +append_replace_return_docstrings( + FlaxBlenderbotSmallForConditionalGeneration, output_type=FlaxSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC +) + + +__all__ = [ + "FlaxBlenderbotSmallForConditionalGeneration", + "FlaxBlenderbotSmallModel", + "FlaxBlenderbotSmallPreTrainedModel", +] diff --git a/docs/transformers/build/lib/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py b/docs/transformers/build/lib/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py new file mode 100644 index 0000000000000000000000000000000000000000..4de98280836d4a21a1abf1cb4cf27b0567ddad2f --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/blenderbot_small/modeling_tf_blenderbot_small.py @@ -0,0 +1,1528 @@ +# coding=utf-8 +# Copyright 2021 The Facebook, Inc and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""TF 2.0 BlenderbotSmall model.""" + +from __future__ import annotations + +import random +from typing import List, Optional, Tuple, Union + +import numpy as np +import tensorflow as tf + +from ...activations_tf import get_tf_activation +from ...modeling_tf_outputs import ( + TFBaseModelOutput, + TFBaseModelOutputWithPastAndCrossAttentions, + TFSeq2SeqLMOutput, + TFSeq2SeqModelOutput, +) + +# Public API +from ...modeling_tf_utils import ( + TFCausalLanguageModelingLoss, + TFPreTrainedModel, + keras, + keras_serializable, + unpack_inputs, +) +from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax +from ...utils import ( + add_code_sample_docstrings, + add_end_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from .configuration_blenderbot_small import BlenderbotSmallConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "facebook/blenderbot_small-90M" +_CONFIG_FOR_DOC = "BlenderbotSmallConfig" + + +LARGE_NEGATIVE = -1e8 + + +# Copied from transformers.models.bart.modeling_tf_bart.shift_tokens_right +def shift_tokens_right(input_ids: tf.Tensor, pad_token_id: int, decoder_start_token_id: int): + pad_token_id = tf.cast(pad_token_id, input_ids.dtype) + decoder_start_token_id = tf.cast(decoder_start_token_id, input_ids.dtype) + start_tokens = tf.fill( + (shape_list(input_ids)[0], 1), tf.convert_to_tensor(decoder_start_token_id, input_ids.dtype) + ) + shifted_input_ids = tf.concat([start_tokens, input_ids[:, :-1]], -1) + # replace possible -100 values in labels by `pad_token_id` + shifted_input_ids = tf.where( + shifted_input_ids == -100, + tf.fill(shape_list(shifted_input_ids), tf.convert_to_tensor(pad_token_id, input_ids.dtype)), + shifted_input_ids, + ) + + # "Verify that `labels` has only positive values and -100" + assert_gte0 = tf.debugging.assert_greater_equal(shifted_input_ids, tf.constant(0, dtype=input_ids.dtype)) + + # Make sure the assertion op is called by wrapping the result in an identity no-op + with tf.control_dependencies([assert_gte0]): + shifted_input_ids = tf.identity(shifted_input_ids) + + return shifted_input_ids + + +# Copied from transformers.models.bart.modeling_tf_bart._make_causal_mask +def _make_causal_mask(input_ids_shape: tf.TensorShape, past_key_values_length: int = 0): + """ + Make causal mask used for bi-directional self-attention. + """ + bsz = input_ids_shape[0] + tgt_len = input_ids_shape[1] + mask = tf.ones((tgt_len, tgt_len)) * LARGE_NEGATIVE + mask_cond = tf.range(shape_list(mask)[-1]) + + mask = tf.where(mask_cond < tf.reshape(mask_cond + 1, (shape_list(mask)[-1], 1)), 0.0, mask) + + if past_key_values_length > 0: + mask = tf.concat([tf.zeros((tgt_len, past_key_values_length)), mask], axis=-1) + + return tf.tile(mask[None, None, :, :], (bsz, 1, 1, 1)) + + +# Copied from transformers.models.bart.modeling_tf_bart._expand_mask +def _expand_mask(mask: tf.Tensor, tgt_len: Optional[int] = None): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. + """ + src_len = shape_list(mask)[1] + tgt_len = tgt_len if tgt_len is not None else src_len + one_cst = tf.constant(1.0) + mask = tf.cast(mask, dtype=one_cst.dtype) + expanded_mask = tf.tile(mask[:, None, None, :], (1, 1, tgt_len, 1)) + + return (one_cst - expanded_mask) * LARGE_NEGATIVE + + +# Copied from transformers.models.blenderbot.modeling_tf_blenderbot.TFBlenderbotLearnedPositionalEmbedding with Blenderbot->BlenderbotSmall +class TFBlenderbotSmallLearnedPositionalEmbedding(keras.layers.Embedding): + """ + This module learns positional embeddings up to a fixed maximum size. + """ + + def __init__(self, num_embeddings: int, embedding_dim: int, **kwargs): + super().__init__(num_embeddings, embedding_dim, **kwargs) + + def call( + self, input_shape: tf.TensorShape, past_key_values_length: int = 0, position_ids: tf.Tensor | None = None + ): + """Input is expected to be of size [bsz x seqlen].""" + if position_ids is None: + seq_len = input_shape[1] + position_ids = tf.range(seq_len, delta=1, name="range") + position_ids += past_key_values_length + + return super().call(tf.cast(position_ids, dtype=tf.int32)) + + +# Copied from transformers.models.bart.modeling_tf_bart.TFBartAttention with Bart->BlenderbotSmall +class TFBlenderbotSmallAttention(keras.layers.Layer): + """Multi-headed attention from "Attention Is All You Need""" + + def __init__( + self, + embed_dim: int, + num_heads: int, + dropout: float = 0.0, + is_decoder: bool = False, + bias: bool = True, + **kwargs, + ): + super().__init__(**kwargs) + self.embed_dim = embed_dim + + self.num_heads = num_heads + self.dropout = keras.layers.Dropout(dropout) + self.head_dim = embed_dim // num_heads + if (self.head_dim * num_heads) != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}" + f" and `num_heads`: {num_heads})." + ) + self.scaling = self.head_dim**-0.5 + self.is_decoder = is_decoder + + self.k_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj") + self.q_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj") + self.v_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj") + self.out_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj") + + def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int): + return tf.transpose(tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)), (0, 2, 1, 3)) + + def call( + self, + hidden_states: tf.Tensor, + key_value_states: tf.Tensor | None = None, + past_key_value: Tuple[Tuple[tf.Tensor]] | None = None, + attention_mask: tf.Tensor | None = None, + layer_head_mask: tf.Tensor | None = None, + training: Optional[bool] = False, + ) -> Tuple[tf.Tensor, tf.Tensor | None]: + """Input shape: Batch x Time x Channel""" + + # if key_value_states are provided this layer is used as a cross-attention layer + # for the decoder + is_cross_attention = key_value_states is not None + bsz, tgt_len, embed_dim = shape_list(hidden_states) + + # get query proj + query_states = self.q_proj(hidden_states) * self.scaling + # get key, value proj + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_states = past_key_value[0] + value_states = past_key_value[1] + elif is_cross_attention: + # cross_attentions + key_states = self._shape(self.k_proj(key_value_states), -1, bsz) + value_states = self._shape(self.v_proj(key_value_states), -1, bsz) + elif past_key_value is not None: + # reuse k, v, self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + key_states = tf.concat([past_key_value[0], key_states], axis=2) + value_states = tf.concat([past_key_value[1], value_states], axis=2) + else: + # self_attention + key_states = self._shape(self.k_proj(hidden_states), -1, bsz) + value_states = self._shape(self.v_proj(hidden_states), -1, bsz) + + if self.is_decoder: + # if cross_attention save Tuple(tf.Tensor, tf.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(tf.Tensor, tf.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_states, value_states) + + proj_shape = (bsz * self.num_heads, -1, self.head_dim) + query_states = tf.reshape(self._shape(query_states, tgt_len, bsz), proj_shape) + key_states = tf.reshape(key_states, proj_shape) + value_states = tf.reshape(value_states, proj_shape) + + src_len = shape_list(key_states)[1] + attn_weights = tf.matmul(query_states, key_states, transpose_b=True) + + tf.debugging.assert_equal( + shape_list(attn_weights), + [bsz * self.num_heads, tgt_len, src_len], + message=( + f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" + f" {shape_list(attn_weights)}" + ), + ) + + if attention_mask is not None: + tf.debugging.assert_equal( + shape_list(attention_mask), + [bsz, 1, tgt_len, src_len], + message=( + f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is" + f" {shape_list(attention_mask)}" + ), + ) + + attention_mask = tf.cast(attention_mask, dtype=attn_weights.dtype) + attn_weights = tf.reshape(attn_weights, (bsz, self.num_heads, tgt_len, src_len)) + attention_mask + attn_weights = tf.reshape(attn_weights, (bsz * self.num_heads, tgt_len, src_len)) + + attn_weights = stable_softmax(attn_weights, axis=-1) + + if layer_head_mask is not None: + tf.debugging.assert_equal( + shape_list(layer_head_mask), + [self.num_heads], + message=( + f"Head mask for a single layer should be of size {(self.num_heads)}, but is" + f" {shape_list(layer_head_mask)}" + ), + ) + + attn_weights = tf.reshape(layer_head_mask, (1, -1, 1, 1)) * tf.reshape( + attn_weights, (bsz, self.num_heads, tgt_len, src_len) + ) + attn_weights = tf.reshape(attn_weights, (bsz * self.num_heads, tgt_len, src_len)) + + attn_probs = self.dropout(attn_weights, training=training) + attn_output = tf.matmul(attn_probs, value_states) + + tf.debugging.assert_equal( + shape_list(attn_output), + [bsz * self.num_heads, tgt_len, self.head_dim], + message=( + f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is" + f" {shape_list(attn_output)}" + ), + ) + + attn_output = tf.transpose( + tf.reshape(attn_output, (bsz, self.num_heads, tgt_len, self.head_dim)), (0, 2, 1, 3) + ) + attn_output = tf.reshape(attn_output, (bsz, tgt_len, embed_dim)) + + attn_output = self.out_proj(attn_output) + attn_weights: tf.Tensor = tf.reshape(attn_weights, (bsz, self.num_heads, tgt_len, src_len)) + + return attn_output, attn_weights, past_key_value + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "k_proj", None) is not None: + with tf.name_scope(self.k_proj.name): + self.k_proj.build([None, None, self.embed_dim]) + if getattr(self, "q_proj", None) is not None: + with tf.name_scope(self.q_proj.name): + self.q_proj.build([None, None, self.embed_dim]) + if getattr(self, "v_proj", None) is not None: + with tf.name_scope(self.v_proj.name): + self.v_proj.build([None, None, self.embed_dim]) + if getattr(self, "out_proj", None) is not None: + with tf.name_scope(self.out_proj.name): + self.out_proj.build([None, None, self.embed_dim]) + + +# Copied from transformers.models.bart.modeling_tf_bart.TFBartEncoderLayer with Bart->BlenderbotSmall +class TFBlenderbotSmallEncoderLayer(keras.layers.Layer): + def __init__(self, config: BlenderbotSmallConfig, **kwargs): + super().__init__(**kwargs) + self.embed_dim = config.d_model + self.self_attn = TFBlenderbotSmallAttention( + self.embed_dim, config.encoder_attention_heads, dropout=config.attention_dropout, name="self_attn" + ) + self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") + self.dropout = keras.layers.Dropout(config.dropout) + self.activation_fn = get_tf_activation(config.activation_function) + self.activation_dropout = keras.layers.Dropout(config.activation_dropout) + self.fc1 = keras.layers.Dense(config.encoder_ffn_dim, name="fc1") + self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2") + self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") + self.config = config + + def call( + self, + hidden_states: tf.Tensor, + attention_mask: np.ndarray | tf.Tensor | None, + layer_head_mask: tf.Tensor | None, + training: Optional[bool] = False, + ) -> tf.Tensor: + """ + Args: + hidden_states (`tf.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`tf.Tensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size + `(encoder_attention_heads,)` + """ + residual = hidden_states + hidden_states, self_attn_weights, _ = self.self_attn( + hidden_states=hidden_states, attention_mask=attention_mask, layer_head_mask=layer_head_mask + ) + + tf.debugging.assert_equal( + shape_list(hidden_states), + shape_list(residual), + message=f"Self attn modified the shape of query {shape_list(residual)} to {shape_list(hidden_states)}", + ) + + hidden_states = self.dropout(hidden_states, training=training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = self.activation_dropout(hidden_states, training=training) + hidden_states = self.fc2(hidden_states) + hidden_states = self.dropout(hidden_states, training=training) + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + return hidden_states, self_attn_weights + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self_attn", None) is not None: + with tf.name_scope(self.self_attn.name): + self.self_attn.build(None) + if getattr(self, "self_attn_layer_norm", None) is not None: + with tf.name_scope(self.self_attn_layer_norm.name): + self.self_attn_layer_norm.build([None, None, self.embed_dim]) + if getattr(self, "fc1", None) is not None: + with tf.name_scope(self.fc1.name): + self.fc1.build([None, None, self.embed_dim]) + if getattr(self, "fc2", None) is not None: + with tf.name_scope(self.fc2.name): + self.fc2.build([None, None, self.config.encoder_ffn_dim]) + if getattr(self, "final_layer_norm", None) is not None: + with tf.name_scope(self.final_layer_norm.name): + self.final_layer_norm.build([None, None, self.embed_dim]) + + +# Copied from transformers.models.bart.modeling_tf_bart.TFBartDecoderLayer with Bart->BlenderbotSmall +class TFBlenderbotSmallDecoderLayer(keras.layers.Layer): + def __init__(self, config: BlenderbotSmallConfig, **kwargs): + super().__init__(**kwargs) + self.embed_dim = config.d_model + self.self_attn = TFBlenderbotSmallAttention( + embed_dim=self.embed_dim, + num_heads=config.decoder_attention_heads, + dropout=config.attention_dropout, + name="self_attn", + is_decoder=True, + ) + self.dropout = keras.layers.Dropout(config.dropout) + self.activation_fn = get_tf_activation(config.activation_function) + self.activation_dropout = keras.layers.Dropout(config.activation_dropout) + + self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm") + self.encoder_attn = TFBlenderbotSmallAttention( + self.embed_dim, + config.decoder_attention_heads, + dropout=config.attention_dropout, + name="encoder_attn", + is_decoder=True, + ) + self.encoder_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="encoder_attn_layer_norm") + self.fc1 = keras.layers.Dense(config.decoder_ffn_dim, name="fc1") + self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2") + self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm") + self.config = config + + def call( + self, + hidden_states: tf.Tensor, + attention_mask: np.ndarray | tf.Tensor | None = None, + encoder_hidden_states: np.ndarray | tf.Tensor | None = None, + encoder_attention_mask: np.ndarray | tf.Tensor | None = None, + layer_head_mask: tf.Tensor | None = None, + cross_attn_layer_head_mask: tf.Tensor | None = None, + past_key_value: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None, + training: Optional[bool] = False, + ) -> Tuple[tf.Tensor, tf.Tensor, Tuple[Tuple[tf.Tensor]]]: + """ + Args: + hidden_states (`tf.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`tf.Tensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + encoder_hidden_states (`tf.Tensor`): + cross attention input to the layer of shape `(batch, seq_len, embed_dim)` + encoder_attention_mask (`tf.Tensor`): encoder attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size + `(decoder_attention_heads,)` + cross_attn_layer_head_mask (`tf.Tensor`): mask for heads of the cross-attention module. + `(decoder_attention_heads,)` + past_key_value (`Tuple(tf.Tensor)`): cached past key and value projection states + """ + residual = hidden_states + + # Self Attention + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + # add present self-attn cache to positions 1,2 of present_key_value tuple + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + past_key_value=self_attn_past_key_value, + attention_mask=attention_mask, + layer_head_mask=layer_head_mask, + ) + hidden_states = self.dropout(hidden_states, training=training) + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + # Cross-Attention Block + cross_attn_present_key_value = None + cross_attn_weights = None + if encoder_hidden_states is not None: + residual = hidden_states + + # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn( + hidden_states=hidden_states, + key_value_states=encoder_hidden_states, + attention_mask=encoder_attention_mask, + layer_head_mask=cross_attn_layer_head_mask, + past_key_value=cross_attn_past_key_value, + ) + hidden_states = self.dropout(hidden_states, training=training) + hidden_states = residual + hidden_states + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + # add cross-attn to positions 3,4 of present_key_value tuple + present_key_value = present_key_value + cross_attn_present_key_value + + # Fully Connected + residual = hidden_states + hidden_states = self.activation_fn(self.fc1(hidden_states)) + hidden_states = self.activation_dropout(hidden_states, training=training) + hidden_states = self.fc2(hidden_states) + hidden_states = self.dropout(hidden_states, training=training) + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + return ( + hidden_states, + self_attn_weights, + cross_attn_weights, + present_key_value, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self_attn", None) is not None: + with tf.name_scope(self.self_attn.name): + self.self_attn.build(None) + if getattr(self, "self_attn_layer_norm", None) is not None: + with tf.name_scope(self.self_attn_layer_norm.name): + self.self_attn_layer_norm.build([None, None, self.embed_dim]) + if getattr(self, "encoder_attn", None) is not None: + with tf.name_scope(self.encoder_attn.name): + self.encoder_attn.build(None) + if getattr(self, "encoder_attn_layer_norm", None) is not None: + with tf.name_scope(self.encoder_attn_layer_norm.name): + self.encoder_attn_layer_norm.build([None, None, self.embed_dim]) + if getattr(self, "fc1", None) is not None: + with tf.name_scope(self.fc1.name): + self.fc1.build([None, None, self.embed_dim]) + if getattr(self, "fc2", None) is not None: + with tf.name_scope(self.fc2.name): + self.fc2.build([None, None, self.config.decoder_ffn_dim]) + if getattr(self, "final_layer_norm", None) is not None: + with tf.name_scope(self.final_layer_norm.name): + self.final_layer_norm.build([None, None, self.embed_dim]) + + +class TFBlenderbotSmallPreTrainedModel(TFPreTrainedModel): + config_class = BlenderbotSmallConfig + base_model_prefix = "model" + + +BLENDERBOT_SMALL_START_DOCSTRING = r""" + This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and + behavior. + + + + TensorFlow models and layers in `transformers` accept two formats as input: + + - having all inputs as keyword arguments (like PyTorch models), or + - having all inputs as a list, tuple or dict in the first positional argument. + + The reason the second format is supported is that Keras methods prefer this format when passing inputs to models + and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just + pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second + format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with + the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first + positional argument: + + - a single Tensor with `input_ids` only and nothing else: `model(input_ids)` + - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: + `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])` + - a dictionary with one or several input Tensors associated to the input names given in the docstring: + `model({"input_ids": input_ids, "token_type_ids": token_type_ids})` + + Note that when creating models and layers with + [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry + about any of this, as you can just pass inputs like you would to any other Python function! + + + + Args: + config ([`BlenderbotSmallConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights. +""" + +BLENDERBOT_SMALL_GENERATION_EXAMPLE = r""" + Conversation example:: + + ```py + >>> from transformers import AutoTokenizer, TFBlenderbotSmallForConditionalGeneration + + >>> mname = "facebook/blenderbot_small-90M" + >>> model = BlenderbotSmallForConditionalGeneration.from_pretrained(mname) + >>> tokenizer = AutoTokenizer.from_pretrained(mname) + + >>> UTTERANCE = "My friends are cool but they eat too many carbs." + >>> print("Human: ", UTTERANCE) + >>> inputs = tokenizer([UTTERANCE], return_tensors="tf") + + >>> reply_ids = model.generate(**inputs) + >>> print("Bot: ", tokenizer.batch_decode(reply_ids, skip_special_tokens=True)[0]) + what kind of carbs do they eat? i don't know much about carbs. + + >>> REPLY = "I'm not sure" + >>> print("Human: ", REPLY) + >>> NEXT_UTTERANCE = ( + ... "My friends are cool but they eat too many carbs. " + ... "what kind of carbs do they eat? i don't know much about carbs. " + ... "I'm not sure." + ... ) + + >>> inputs = tokenizer([NEXT_UTTERANCE], return_tensors="tf") + >>> inputs.pop("token_type_ids") + >>> next_reply_ids = model.generate(**inputs) + >>> print("Bot: ", tokenizer.batch_decode(next_reply_ids, skip_special_tokens=True)[0]) + ``` +""" + +BLENDERBOT_SMALL_INPUTS_DOCSTRING = r""" + Args: + input_ids (`tf.Tensor` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`tf.Tensor` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + decoder_input_ids (`tf.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Indices of decoder input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are decoder input IDs?](../glossary#decoder-input-ids) + + BlenderbotSmall uses the `bos_token_id` as the starting token for `decoder_input_ids` generation. If + `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see + `past_key_values`). + decoder_attention_mask (`tf.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*): + will be made by default and ignore pad tokens. It is not recommended to set this for most use cases. + decoder_position_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the + range `[0, config.max_position_embeddings - 1]`. + head_mask (`tf.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + decoder_head_mask (`tf.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + cross_attn_head_mask (`tf.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + encoder_outputs (`tf.FloatTensor`, *optional*): + hidden states at the output of the last layer of the encoder. Used in the cross-attention of the decoder. + of shape `(batch_size, sequence_length, hidden_size)` is a sequence of + past_key_values (`Tuple[Tuple[tf.Tensor]]` of length `config.n_layers`) + contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*, defaults to `True`): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). Set to `False` during training, `True` during generation + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the + config will be used instead. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. This argument can be used only in eager mode, in graph mode the value in the config will be + used instead. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in + eager mode, in graph mode the value will always be set to True. + training (`bool`, *optional*, defaults to `False`): + Whether or not to use the model in training mode (some modules like dropout modules have different + behaviors between training and evaluation). +""" + + +@keras_serializable +class TFBlenderbotSmallEncoder(keras.layers.Layer): + config_class = BlenderbotSmallConfig + """ + Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a + [`TFBlenderbotSmallEncoderLayer`]. + + Args: + config: BlenderbotSmallConfig + """ + + def __init__(self, config: BlenderbotSmallConfig, embed_tokens: Optional[keras.layers.Embedding] = None, **kwargs): + super().__init__(**kwargs) + self.config = config + self.dropout = keras.layers.Dropout(config.dropout) + self.layerdrop = config.encoder_layerdrop + self.padding_idx = config.pad_token_id + self.max_source_positions = config.max_position_embeddings + self.embed_scale = tf.math.sqrt(float(config.d_model)) if config.scale_embedding else 1.0 + + self.embed_tokens = embed_tokens + self.embed_positions = TFBlenderbotSmallLearnedPositionalEmbedding( + config.max_position_embeddings, + config.d_model, + name="embed_positions", + ) + self.layers = [TFBlenderbotSmallEncoderLayer(config, name=f"layers.{i}") for i in range(config.encoder_layers)] + self.layernorm_embedding = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding") + self.embed_dim = config.d_model + + def get_embed_tokens(self): + return self.embed_tokens + + def set_embed_tokens(self, embed_tokens): + self.embed_tokens = embed_tokens + + @unpack_inputs + def call( + self, + input_ids=None, + inputs_embeds=None, + attention_mask=None, + head_mask=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + training=False, + ): + """ + Args: + input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + head_mask (`tf.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, `optional): + Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. This argument can be used only in eager mode, in graph mode the value + in the config will be used instead. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. This argument can be used only in eager mode, in graph mode the value in the config + will be used instead. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used + in eager mode, in graph mode the value will always be set to True. + training (`bool`, *optional*, defaults to `False`): + Whether or not to use the model in training mode (some modules like dropout modules have different + behaviors between training and evaluation). + """ + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = shape_list(input_ids) + elif inputs_embeds is not None: + input_shape = shape_list(inputs_embeds)[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if inputs_embeds is None: + check_embeddings_within_bounds(input_ids, self.embed_tokens.input_dim) + inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale + + embed_pos = self.embed_positions(input_shape) + hidden_states = inputs_embeds + embed_pos + hidden_states = self.layernorm_embedding(hidden_states) + hidden_states = self.dropout(hidden_states, training=training) + + # check attention mask and invert + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _expand_mask(attention_mask) + else: + attention_mask = None + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + # check if head_mask has a correct number of layers specified if desired + if head_mask is not None: + tf.debugging.assert_equal( + shape_list(head_mask)[0], + len(self.layers), + message=( + f"The head_mask should be specified for {len(self.layers)} layers, but it is for" + f" {shape_list(head_mask)[0]}." + ), + ) + + # encoder layers + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + dropout_probability = random.uniform(0, 1) + if training and (dropout_probability < self.layerdrop): # skip the layer + continue + + hidden_states, attn = encoder_layer( + hidden_states, + attention_mask, + head_mask[idx] if head_mask is not None else None, + ) + + if output_attentions: + all_attentions += (attn,) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return TFBaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embed_positions", None) is not None: + with tf.name_scope(self.embed_positions.name): + self.embed_positions.build(None) + if getattr(self, "layernorm_embedding", None) is not None: + with tf.name_scope(self.layernorm_embedding.name): + self.layernorm_embedding.build([None, None, self.embed_dim]) + if getattr(self, "layers", None) is not None: + for layer in self.layers: + with tf.name_scope(layer.name): + layer.build(None) + + +@keras_serializable +class TFBlenderbotSmallDecoder(keras.layers.Layer): + config_class = BlenderbotSmallConfig + """ + Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`TFBlenderbotSmallDecoderLayer`] + + Args: + config: BlenderbotSmallConfig + embed_tokens: output embedding + """ + + def __init__(self, config: BlenderbotSmallConfig, embed_tokens: Optional[keras.layers.Embedding] = None, **kwargs): + super().__init__(**kwargs) + self.config = config + self.padding_idx = config.pad_token_id + self.embed_tokens = embed_tokens + self.layerdrop = config.decoder_layerdrop + self.embed_positions = TFBlenderbotSmallLearnedPositionalEmbedding( + config.max_position_embeddings, + config.d_model, + name="embed_positions", + ) + self.embed_scale = tf.math.sqrt(float(config.d_model)) if config.scale_embedding else 1.0 + self.layers = [TFBlenderbotSmallDecoderLayer(config, name=f"layers.{i}") for i in range(config.decoder_layers)] + self.layernorm_embedding = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_embedding") + + self.dropout = keras.layers.Dropout(config.dropout) + + def get_embed_tokens(self): + return self.embed_tokens + + def set_embed_tokens(self, embed_tokens): + self.embed_tokens = embed_tokens + + @unpack_inputs + def call( + self, + input_ids=None, + inputs_embeds=None, + attention_mask=None, + position_ids=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + head_mask=None, + cross_attn_head_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + training=False, + ): + r""" + Args: + input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you + provide it. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + position_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the + range `[0, config.max_position_embeddings - 1]`. + encoder_hidden_states (`tf.Tensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention + of the decoder. + encoder_attention_mask (`tf.Tensor` of shape `(batch_size, encoder_sequence_length)`, *optional*): + Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values + selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + head_mask (`tf.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + cross_attn_head_mask (`tf.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*): + Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + past_key_values (`Tuple[Tuple[tf.Tensor]]` of length `config.n_layers` with each tuple having 2 tuples each of which has 2 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up + decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those + that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of + all `decoder_input_ids` of shape `(batch_size, sequence_length)`. + inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. This argument can be used only in eager mode, in graph mode the value + in the config will be used instead. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. This argument can be used only in eager mode, in graph mode the value in the config + will be used instead. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used + in eager mode, in graph mode the value will always be set to True. + training (`bool`, *optional*, defaults to `False`): + Whether or not to use the model in training mode (some modules like dropout modules have different + behaviors between training and evaluation). + """ + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + input_shape = shape_list(input_ids) + elif inputs_embeds is not None: + input_shape = shape_list(inputs_embeds)[:-1] + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + past_key_values_length = shape_list(past_key_values[0][0])[2] if past_key_values is not None else 0 + + if inputs_embeds is None: + check_embeddings_within_bounds(input_ids, self.embed_tokens.input_dim) + inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale + + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + if input_shape[-1] > 1: + combined_attention_mask = _make_causal_mask(input_shape, past_key_values_length=past_key_values_length) + else: + combined_attention_mask = _expand_mask( + tf.ones((input_shape[0], input_shape[1] + past_key_values_length)), tgt_len=input_shape[-1] + ) + + if attention_mask is not None: + combined_attention_mask = combined_attention_mask + _expand_mask(attention_mask, tgt_len=input_shape[-1]) + + if encoder_hidden_states is not None and encoder_attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + encoder_attention_mask = _expand_mask(encoder_attention_mask, tgt_len=input_shape[-1]) + + # embed positions + if position_ids is None: + positions = self.embed_positions(input_shape, past_key_values_length) + else: + positions = self.embed_positions(input_shape, position_ids=position_ids) + + hidden_states = self.layernorm_embedding(inputs_embeds) + positions + hidden_states = self.dropout(hidden_states, training=training) + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + all_cross_attns = () if (output_attentions and encoder_hidden_states is not None) else None + present_key_values = () if use_cache else None + + # check if head_mask and cross_attn_head_mask have a correct number of layers specified if desired + for attn_mask_name, attn_mask in [("head_mask", head_mask), ("cross_attn_head_mask", cross_attn_head_mask)]: + if attn_mask is not None: + tf.debugging.assert_equal( + shape_list(attn_mask)[0], + len(self.layers), + message=( + f"The {attn_mask_name} should be specified for {len(self.layers)} layers, but it is for" + f" {shape_list(attn_mask)[0]}." + ), + ) + + for idx, decoder_layer in enumerate(self.layers): + # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description) + if output_hidden_states: + all_hidden_states += (hidden_states,) + dropout_probability = random.uniform(0, 1) + + if training and (dropout_probability < self.layerdrop): + continue + + past_key_value = past_key_values[idx] if past_key_values is not None else None + + hidden_states, layer_self_attn, layer_cross_attn, present_key_value = decoder_layer( + hidden_states, + attention_mask=combined_attention_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + layer_head_mask=head_mask[idx] if head_mask is not None else None, + cross_attn_layer_head_mask=cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None, + past_key_value=past_key_value, + ) + + if use_cache: + present_key_values += (present_key_value,) + + if output_attentions: + all_self_attns += (layer_self_attn,) + + if encoder_hidden_states is not None: + all_cross_attns += (layer_cross_attn,) + + if output_hidden_states: + all_hidden_states += (hidden_states,) + + if not return_dict: + return hidden_states, present_key_values, all_hidden_states, all_self_attns, all_cross_attns + else: + return TFBaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=present_key_values, + hidden_states=all_hidden_states, + attentions=all_self_attns, + cross_attentions=all_cross_attns, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embed_positions", None) is not None: + with tf.name_scope(self.embed_positions.name): + self.embed_positions.build(None) + if getattr(self, "layernorm_embedding", None) is not None: + with tf.name_scope(self.layernorm_embedding.name): + self.layernorm_embedding.build([None, None, self.config.d_model]) + if getattr(self, "layers", None) is not None: + for layer in self.layers: + with tf.name_scope(layer.name): + layer.build(None) + + +@keras_serializable +class TFBlenderbotSmallMainLayer(keras.layers.Layer): + config_class = BlenderbotSmallConfig + + def __init__(self, config: BlenderbotSmallConfig, **kwargs): + super().__init__(**kwargs) + + self.config = config + self.shared = keras.layers.Embedding( + input_dim=config.vocab_size, + output_dim=config.d_model, + embeddings_initializer=keras.initializers.TruncatedNormal(stddev=self.config.init_std), + name="model.shared", + ) + # Additional attribute to specify the expected name scope of the layer (for loading/storing weights) + self.shared.load_weight_prefix = "model.shared" + + self.encoder = TFBlenderbotSmallEncoder(config, self.shared, name="encoder") + self.decoder = TFBlenderbotSmallDecoder(config, self.shared, name="decoder") + + def get_input_embeddings(self): + return self.shared + + def set_input_embeddings(self, new_embeddings): + self.shared = new_embeddings + self.encoder.embed_tokens = self.shared + self.decoder.embed_tokens = self.shared + + @unpack_inputs + def call( + self, + input_ids=None, + attention_mask=None, + decoder_input_ids=None, + decoder_attention_mask=None, + decoder_position_ids=None, + head_mask=None, + decoder_head_mask=None, + cross_attn_head_mask=None, + encoder_outputs: Optional[Union[Tuple, TFBaseModelOutput]] = None, + past_key_values=None, + inputs_embeds=None, + decoder_inputs_embeds=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + training=False, + **kwargs, + ): + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + if encoder_outputs is None: + encoder_outputs = self.encoder( + input_ids=input_ids, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + # If the user passed a tuple for encoder_outputs, we wrap it in a TFBaseModelOutput when return_dict=True + elif return_dict and not isinstance(encoder_outputs, TFBaseModelOutput): + encoder_outputs = TFBaseModelOutput( + last_hidden_state=encoder_outputs[0], + hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None, + attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None, + ) + # If the user passed a TFBaseModelOutput for encoder_outputs, we wrap it in a tuple when return_dict=False + elif not return_dict and not isinstance(encoder_outputs, tuple): + encoder_outputs = encoder_outputs.to_tuple() + + decoder_outputs = self.decoder( + decoder_input_ids, + attention_mask=decoder_attention_mask, + position_ids=decoder_position_ids, + encoder_hidden_states=encoder_outputs[0], + encoder_attention_mask=attention_mask, + head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + past_key_values=past_key_values, + inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + if not return_dict: + return decoder_outputs + encoder_outputs + + return TFSeq2SeqModelOutput( + last_hidden_state=decoder_outputs.last_hidden_state, + past_key_values=decoder_outputs.past_key_values, + decoder_hidden_states=decoder_outputs.hidden_states, + decoder_attentions=decoder_outputs.attentions, + cross_attentions=decoder_outputs.cross_attentions, + encoder_last_hidden_state=encoder_outputs.last_hidden_state, + encoder_hidden_states=encoder_outputs.hidden_states, + encoder_attentions=encoder_outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + # The shared/tied weights expect to be in the model base namespace + # Adding "/" to the end (not the start!) of a tf.name_scope puts it in the root namespace rather than + # the current one. + with tf.name_scope(self.shared.load_weight_prefix + "/" + self.shared.name + "/"): + self.shared.build(None) + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "decoder", None) is not None: + with tf.name_scope(self.decoder.name): + self.decoder.build(None) + + +@add_start_docstrings( + "The bare BLENDERBOT_SMALL Model outputting raw hidden-states without any specific head on top.", + BLENDERBOT_SMALL_START_DOCSTRING, +) +class TFBlenderbotSmallModel(TFBlenderbotSmallPreTrainedModel): + def __init__(self, config: BlenderbotSmallConfig, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.model = TFBlenderbotSmallMainLayer(config, name="model") + + def get_encoder(self): + return self.model.encoder + + def get_decoder(self): + return self.model.decoder + + @unpack_inputs + @add_start_docstrings_to_model_forward(BLENDERBOT_SMALL_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFSeq2SeqModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: tf.Tensor | None = None, + attention_mask: tf.Tensor | None = None, + decoder_input_ids: tf.Tensor | None = None, + decoder_attention_mask: tf.Tensor | None = None, + decoder_position_ids: tf.Tensor | None = None, + head_mask: tf.Tensor | None = None, + decoder_head_mask: tf.Tensor | None = None, + cross_attn_head_mask: tf.Tensor | None = None, + encoder_outputs: Optional[Union[Tuple, TFBaseModelOutput]] = None, + past_key_values: List[tf.Tensor] | None = None, + inputs_embeds: tf.Tensor | None = None, + decoder_inputs_embeds: tf.Tensor | None = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: Optional[bool] = False, + **kwargs, + ) -> Union[Tuple[tf.Tensor], TFSeq2SeqModelOutput]: + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + decoder_position_ids=decoder_position_ids, + head_mask=head_mask, + decoder_head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + encoder_outputs=encoder_outputs, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + return outputs + + # Copied from transformers.models.bart.modeling_tf_bart.TFBartModel.serving_output + def serving_output(self, output): + pkv = tf.tuple(output.past_key_values)[1] if self.config.use_cache else None + dec_hs = tf.convert_to_tensor(output.decoder_hidden_states) if self.config.output_hidden_states else None + dec_attns = tf.convert_to_tensor(output.decoder_attentions) if self.config.output_attentions else None + cross_attns = tf.convert_to_tensor(output.cross_attentions) if self.config.output_attentions else None + enc_hs = tf.convert_to_tensor(output.encoder_hidden_states) if self.config.output_hidden_states else None + enc_attns = tf.convert_to_tensor(output.encoder_attentions) if self.config.output_attentions else None + + return TFSeq2SeqModelOutput( + last_hidden_state=output.last_hidden_state, + past_key_values=pkv, + decoder_hidden_states=dec_hs, + decoder_attentions=dec_attns, + cross_attentions=cross_attns, + encoder_last_hidden_state=output.encoder_last_hidden_state, + encoder_hidden_states=enc_hs, + encoder_attentions=enc_attns, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "model", None) is not None: + with tf.name_scope(self.model.name): + self.model.build(None) + + +# Copied from transformers.models.bart.modeling_tf_bart.BiasLayer +class BiasLayer(keras.layers.Layer): + """ + Bias as a layer. It is used for serialization purposes: `keras.Model.save_weights` stores on a per-layer basis, + so all weights have to be registered in a layer. + """ + + def __init__(self, shape, initializer, trainable, name, **kwargs): + super().__init__(name=name, **kwargs) + # Note: the name of this variable will NOT be scoped when serialized, i.e. it will not be in the format of + # "outer_layer/inner_layer/.../name:0". Instead, it will be "name:0". For further details, see: + # https://github.com/huggingface/transformers/pull/18833#issuecomment-1233090214 + self.bias = self.add_weight(name=name, shape=shape, initializer=initializer, trainable=trainable) + + def call(self, x): + return x + self.bias + + +@add_start_docstrings( + "The BLENDERBOT_SMALL Model with a language modeling head. Can be used for summarization.", + BLENDERBOT_SMALL_START_DOCSTRING, +) +class TFBlenderbotSmallForConditionalGeneration(TFBlenderbotSmallPreTrainedModel, TFCausalLanguageModelingLoss): + _keys_to_ignore_on_load_unexpected = [ + r"model.encoder.embed_tokens.weight", + r"model.decoder.embed_tokens.weight", + ] + + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.model = TFBlenderbotSmallMainLayer(config, name="model") + self.use_cache = config.use_cache + # final_bias_logits is registered as a buffer in pytorch, so not trainable for the sake of consistency. + self.bias_layer = BiasLayer( + name="final_logits_bias", shape=[1, config.vocab_size], initializer="zeros", trainable=False + ) + + def get_decoder(self): + return self.model.decoder + + def get_encoder(self): + return self.model.encoder + + def get_output_embeddings(self): + return self.get_input_embeddings() + + def set_output_embeddings(self, value): + self.set_input_embeddings(value) + + def get_bias(self): + return {"final_logits_bias": self.bias_layer.bias} + + def set_bias(self, value): + # Replaces the existing layers containing bias for correct (de)serialization. + vocab_size = value["final_logits_bias"].shape[-1] + self.bias_layer = BiasLayer( + name="final_logits_bias", shape=[1, vocab_size], initializer="zeros", trainable=False + ) + self.bias_layer.bias.assign(value["final_logits_bias"]) + + @unpack_inputs + @add_start_docstrings_to_model_forward(BLENDERBOT_SMALL_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=TFSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC) + @add_end_docstrings(BLENDERBOT_SMALL_GENERATION_EXAMPLE) + def call( + self, + input_ids: tf.Tensor | None = None, + attention_mask: tf.Tensor | None = None, + decoder_input_ids: tf.Tensor | None = None, + decoder_attention_mask: tf.Tensor | None = None, + decoder_position_ids: tf.Tensor | None = None, + head_mask: tf.Tensor | None = None, + decoder_head_mask: tf.Tensor | None = None, + cross_attn_head_mask: tf.Tensor | None = None, + encoder_outputs: Optional[TFBaseModelOutput] = None, + past_key_values: List[tf.Tensor] | None = None, + inputs_embeds: tf.Tensor | None = None, + decoder_inputs_embeds: tf.Tensor | None = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: tf.Tensor | None = None, + training: Optional[bool] = False, + ) -> Union[Tuple[tf.Tensor], TFSeq2SeqLMOutput]: + r""" + labels (`tf.tensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Returns: + + """ + + if labels is not None: + labels = tf.where( + labels == self.config.pad_token_id, + tf.cast(tf.fill(shape_list(labels), -100), labels.dtype), + labels, + ) + use_cache = False + if decoder_input_ids is None and decoder_inputs_embeds is None: + decoder_input_ids = shift_tokens_right( + labels, self.config.pad_token_id, self.config.decoder_start_token_id + ) + + outputs = self.model( + input_ids, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + decoder_position_ids=decoder_position_ids, + head_mask=head_mask, + decoder_head_mask=decoder_head_mask, + cross_attn_head_mask=cross_attn_head_mask, + encoder_outputs=encoder_outputs, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + decoder_inputs_embeds=decoder_inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + lm_logits = tf.matmul(outputs[0], self.model.shared.weights, transpose_b=True) + lm_logits = self.bias_layer(lm_logits) + masked_lm_loss = None if labels is None else self.hf_compute_loss(labels, lm_logits) + + if not return_dict: + output = (lm_logits,) + outputs[1:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + return TFSeq2SeqLMOutput( + loss=masked_lm_loss, + logits=lm_logits, + past_key_values=outputs.past_key_values, # index 1 of d outputs + decoder_hidden_states=outputs.decoder_hidden_states, # index 2 of d outputs + decoder_attentions=outputs.decoder_attentions, # index 3 of d outputs + cross_attentions=outputs.cross_attentions, # index 4 of d outputs + encoder_last_hidden_state=outputs.encoder_last_hidden_state, # index 0 of encoder outputs + encoder_hidden_states=outputs.encoder_hidden_states, # 1 of e out + encoder_attentions=outputs.encoder_attentions, # 2 of e out + ) + + # Copied from transformers.models.bart.modeling_tf_bart.TFBartForConditionalGeneration.serving_output + def serving_output(self, output): + pkv = tf.tuple(output.past_key_values)[1] if self.config.use_cache else None + dec_hs = tf.convert_to_tensor(output.decoder_hidden_states) if self.config.output_hidden_states else None + dec_attns = tf.convert_to_tensor(output.decoder_attentions) if self.config.output_attentions else None + cross_attns = tf.convert_to_tensor(output.cross_attentions) if self.config.output_attentions else None + enc_hs = tf.convert_to_tensor(output.encoder_hidden_states) if self.config.output_hidden_states else None + enc_attns = tf.convert_to_tensor(output.encoder_attentions) if self.config.output_attentions else None + + return TFSeq2SeqLMOutput( + logits=output.logits, + past_key_values=pkv, + decoder_hidden_states=dec_hs, + decoder_attentions=dec_attns, + cross_attentions=cross_attns, + encoder_last_hidden_state=output.encoder_last_hidden_state, + encoder_hidden_states=enc_hs, + encoder_attentions=enc_attns, + ) + + # Copied from transformers.models.bart.modeling_tf_bart.TFBartForConditionalGeneration.prepare_inputs_for_generation + def prepare_inputs_for_generation( + self, + decoder_input_ids, + past_key_values=None, + attention_mask=None, + decoder_attention_mask=None, + head_mask=None, + decoder_head_mask=None, + cross_attn_head_mask=None, + use_cache=None, + encoder_outputs=None, + **kwargs, + ): + # cut decoder_input_ids if past_key_values is used + if past_key_values is not None: + decoder_input_ids = decoder_input_ids[:, -1:] + + if decoder_attention_mask is not None: # xla + decoder_position_ids = tf.math.cumsum(decoder_attention_mask, axis=-1, exclusive=True)[:, -1:] + elif past_key_values is not None: # no xla + past_key_values + decoder_position_ids = past_key_values[0][0].shape[2] + else: # no xla + no past_key_values + decoder_position_ids = tf.range(decoder_input_ids.shape[1]) + + return { + "input_ids": None, # encoder_outputs is defined. input_ids not needed + "encoder_outputs": encoder_outputs, + "past_key_values": past_key_values, + "decoder_input_ids": decoder_input_ids, + "attention_mask": attention_mask, + "decoder_attention_mask": decoder_attention_mask, + "decoder_position_ids": decoder_position_ids, + "head_mask": head_mask, + "decoder_head_mask": decoder_head_mask, + "cross_attn_head_mask": cross_attn_head_mask, + "use_cache": use_cache, # change this to avoid caching (presumably for debugging) + } + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "model", None) is not None: + with tf.name_scope(self.model.name): + self.model.build(None) + if getattr(self, "bias_layer", None) is not None: + with tf.name_scope(self.bias_layer.name): + self.bias_layer.build(None) + + +__all__ = ["TFBlenderbotSmallForConditionalGeneration", "TFBlenderbotSmallModel", "TFBlenderbotSmallPreTrainedModel"] diff --git a/docs/transformers/build/lib/transformers/models/blenderbot_small/tokenization_blenderbot_small.py b/docs/transformers/build/lib/transformers/models/blenderbot_small/tokenization_blenderbot_small.py new file mode 100644 index 0000000000000000000000000000000000000000..be950f0dbe629bb2ec1eef9e85ee1023132afd75 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/blenderbot_small/tokenization_blenderbot_small.py @@ -0,0 +1,222 @@ +# coding=utf-8 +# Copyright 2021 The Facebook Inc. and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization class for BlenderbotSmall.""" + +import json +import os +from typing import Dict, List, Optional, Tuple + +import regex as re + +from ...tokenization_utils import PreTrainedTokenizer +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +VOCAB_FILES_NAMES = { + "vocab_file": "vocab.json", + "merges_file": "merges.txt", + "tokenizer_config_file": "tokenizer_config.json", +} + + +def get_pairs(word): + """ + Return set of symbol pairs in a word. + + Word is represented as tuple of symbols (symbols being variable-length strings). + """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + + pairs = set(pairs) + return pairs + + +class BlenderbotSmallTokenizer(PreTrainedTokenizer): + """ + Constructs a Blenderbot-90M tokenizer based on BPE (Byte-Pair-Encoding) + + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to + the superclass for more information regarding methods. + + Args: + vocab_file (`str`): + File containing the vocabulary. + merges_file (`str`): + Path to the merges file. + bos_token (`str`, *optional*, defaults to `"__start__"`): + The beginning of sentence token. + eos_token (`str`, *optional*, defaults to `"__end__"`): + The end of sentence token. + unk_token (`str`, *optional*, defaults to `"__unk__"`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + pad_token (`str`, *optional*, defaults to `"__null__"`): + The token used for padding, for example when batching sequences of different lengths. + kwargs (*optional*): + Additional keyword arguments passed along to [`PreTrainedTokenizer`] + """ + + vocab_files_names = VOCAB_FILES_NAMES + model_input_names = ["input_ids", "attention_mask"] + + def __init__( + self, + vocab_file, + merges_file, + bos_token="__start__", + eos_token="__end__", + unk_token="__unk__", + pad_token="__null__", + **kwargs, + ): + with open(vocab_file, encoding="utf-8") as vocab_handle: + self.encoder = json.load(vocab_handle) + self.decoder = {v: k for k, v in self.encoder.items()} + with open(merges_file, encoding="utf-8") as merges_handle: + merges = merges_handle.read().split("\n")[1:-1] + merges = [tuple(merge.split()) for merge in merges] + self.bpe_ranks = dict(zip(merges, range(len(merges)))) + self.cache = {} + super().__init__(unk_token=unk_token, bos_token=bos_token, eos_token=eos_token, pad_token=pad_token, **kwargs) + + @property + def vocab_size(self) -> int: + return len(self.encoder) + + def get_vocab(self) -> Dict: + return dict(self.encoder, **self.added_tokens_encoder) + + def bpe(self, token: str) -> str: + if token in self.cache: + return self.cache[token] + token = re.sub("([.,!?()])", r" \1", token) + token = re.sub("(')", r" \1 ", token) + token = re.sub(r"\s{2,}", " ", token) + if "\n" in token: + token = token.replace("\n", " __newln__") + + tokens = token.split(" ") + words = [] + for token in tokens: + if not len(token): + continue + + token = token.lower() + word = tuple(token) + word = tuple(list(word[:-1]) + [word[-1] + ""]) + pairs = get_pairs(word) + + if not pairs: + words.append(token) + continue + + while True: + bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf"))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + + while i < len(word): + try: + j = word.index(first, i) + new_word.extend(word[i:j]) + i = j + except ValueError: + new_word.extend(word[i:]) + break + + if word[i] == first and i < len(word) - 1 and word[i + 1] == second: + new_word.append(first + second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = "@@ ".join(word) + word = word[:-4] + + self.cache[token] = word + words.append(word) + return " ".join(words) + + def _tokenize(self, text: str) -> List[str]: + """Split a string into tokens using BPE.""" + split_tokens = [] + + words = re.findall(r"\S+\n?", text) + + for token in words: + split_tokens.extend(list(self.bpe(token).split(" "))) + return split_tokens + + def _convert_token_to_id(self, token: str) -> int: + """Converts a token to an id using the vocab.""" + token = token.lower() + return self.encoder.get(token, self.encoder.get(self.unk_token)) + + def _convert_id_to_token(self, index: int) -> str: + """Converts an index (integer) in a token (str) using the vocab.""" + return self.decoder.get(index, self.unk_token) + + def convert_tokens_to_string(self, tokens: List[str]) -> str: + """Converts a sequence of tokens in a single string.""" + out_string = " ".join(tokens).replace("@@ ", "").strip() + return out_string + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + merge_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"] + ) + + with open(vocab_file, "w", encoding="utf-8") as f: + f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n") + + index = 0 + with open(merge_file, "w", encoding="utf-8") as writer: + writer.write("#version: 0.2\n") + for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]): + if index != token_index: + logger.warning( + f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive." + " Please check that the tokenizer is not corrupted!" + ) + index = token_index + writer.write(" ".join(bpe_tokens) + "\n") + index += 1 + + return vocab_file, merge_file + + +__all__ = ["BlenderbotSmallTokenizer"] diff --git a/docs/transformers/build/lib/transformers/models/blenderbot_small/tokenization_blenderbot_small_fast.py b/docs/transformers/build/lib/transformers/models/blenderbot_small/tokenization_blenderbot_small_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..ac98ce008baad8d425afde9d76e46827cebd423c --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/blenderbot_small/tokenization_blenderbot_small_fast.py @@ -0,0 +1,103 @@ +# coding=utf-8 +# Copyright 2021, The Facebook, Inc. and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Fast tokenization class for BlenderbotSmall.""" + +from typing import List, Optional + +from tokenizers import ByteLevelBPETokenizer + +from ...tokenization_utils_fast import PreTrainedTokenizerFast +from ...utils import logging +from .tokenization_blenderbot_small import BlenderbotSmallTokenizer + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = { + "vocab_file": "vocab.json", + "merges_file": "merges.txt", + "tokenizer_config_file": "tokenizer_config.json", +} + + +class BlenderbotSmallTokenizerFast(PreTrainedTokenizerFast): + """ + Construct a "fast" BlenderbotSmall tokenizer (backed by HuggingFace's *tokenizers* library). + + Args: + vocab_file (`str`): + Path to the vocabulary file. + """ + + vocab_files_names = VOCAB_FILES_NAMES + slow_tokenizer_class = BlenderbotSmallTokenizer + + def __init__( + self, + vocab_file=None, + merges_file=None, + unk_token="<|endoftext|>", + bos_token="<|endoftext|>", + eos_token="<|endoftext|>", + add_prefix_space=False, + trim_offsets=True, + **kwargs, + ): + super().__init__( + ByteLevelBPETokenizer( + vocab=vocab_file, + merges=merges_file, + add_prefix_space=add_prefix_space, + trim_offsets=trim_offsets, + ), + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + **kwargs, + ) + self.add_prefix_space = add_prefix_space + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + output = [self.bos_token_id] + token_ids_0 + [self.eos_token_id] + if token_ids_1 is None: + return output + + return output + [self.eos_token_id] + token_ids_1 + [self.eos_token_id] + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. BlenderbotSmall + does not make use of token type ids, therefore a list of zeros is returned. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of zeros. + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] + + +__all__ = ["BlenderbotSmallTokenizerFast"] diff --git a/docs/transformers/build/lib/transformers/models/blip/__init__.py b/docs/transformers/build/lib/transformers/models/blip/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..952de2f855a71591e950b31751af1fe6cfbae20a --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/blip/__init__.py @@ -0,0 +1,33 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_blip import * + from .image_processing_blip import * + from .image_processing_blip_fast import * + from .modeling_blip import * + from .modeling_blip_text import * + from .modeling_tf_blip import * + from .modeling_tf_blip_text import * + from .processing_blip import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/docs/transformers/build/lib/transformers/models/blip/configuration_blip.py b/docs/transformers/build/lib/transformers/models/blip/configuration_blip.py new file mode 100644 index 0000000000000000000000000000000000000000..c46cd2a08be28e9240ccf7d580d0c950b3d318f8 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/blip/configuration_blip.py @@ -0,0 +1,329 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Blip model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class BlipTextConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`BlipTextModel`]. It is used to instantiate a BLIP + text model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the `BlipText` used by the [base + architectures](https://huggingface.co/Salesforce/blip-vqa-base). + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 30524): + Vocabulary size of the `Blip` text model. Defines the number of different tokens that can be represented by + the `inputs_ids` passed when calling [`BlipModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + encoder_hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers from the vision model. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer encoder. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` `"gelu"` are supported. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + hidden_dropout_prob (`float`, *optional*, defaults to 0.0): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + bos_token_id (`int`, *optional*, defaults to 30522): + The id of the `beginning-of-sequence` token. + eos_token_id (`int`, *optional*, defaults to 2): + The id of the `end-of-sequence` token. + pad_token_id (`int`, *optional*, defaults to 0): + The id of the `padding` token. + sep_token_id (`int`, *optional*, defaults to 102): + The id of the `separator` token. + is_decoder (`bool`, *optional*, defaults to `True`): + Whether the model is used as a decoder. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). + label_smoothing (float, *optional*): + A float in [0.0, 1.0]. Specifies the amount of smoothing when computing the loss, where 0.0 means no smoothing. The targets + become a mixture of the original ground truth and a uniform distribution as described in + `Rethinking the Inception Architecture for Computer Vision `__. Default: :math:`0.0`. + + Example: + + ```python + >>> from transformers import BlipTextConfig, BlipTextModel + + >>> # Initializing a BlipTextConfig with Salesforce/blip-vqa-base style configuration + >>> configuration = BlipTextConfig() + + >>> # Initializing a BlipTextModel (with random weights) from the Salesforce/blip-vqa-base style configuration + >>> model = BlipTextModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "blip_text_model" + base_config_key = "text_config" + + def __init__( + self, + vocab_size=30524, + hidden_size=768, + encoder_hidden_size=768, + intermediate_size=3072, + projection_dim=768, + num_hidden_layers=12, + num_attention_heads=8, + max_position_embeddings=512, + hidden_act="gelu", + layer_norm_eps=1e-12, + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + initializer_range=0.02, + bos_token_id=30522, + eos_token_id=2, + pad_token_id=0, + sep_token_id=102, + is_decoder=True, + use_cache=True, + label_smoothing=0.0, + **kwargs, + ): + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + sep_token_id=sep_token_id, + **kwargs, + ) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.encoder_hidden_size = encoder_hidden_size + self.intermediate_size = intermediate_size + self.projection_dim = projection_dim + self.hidden_dropout_prob = hidden_dropout_prob + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.max_position_embeddings = max_position_embeddings + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.is_decoder = is_decoder + self.use_cache = use_cache + self.label_smoothing = label_smoothing + + +class BlipVisionConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`BlipVisionModel`]. It is used to instantiate a + BLIP vision model according to the specified arguments, defining the model architecture. Instantiating a + configuration defaults will yield a similar configuration to that of the Blip-base + [Salesforce/blip-vqa-base](https://huggingface.co/Salesforce/blip-vqa-base) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + image_size (`int`, *optional*, defaults to 384): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 16): + The size (resolution) of each patch. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` `"gelu"` are supported. + layer_norm_eps (`float`, *optional*, defaults to 1e-5): + The epsilon used by the layer normalization layers. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 1e-10): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + + Example: + + ```python + >>> from transformers import BlipVisionConfig, BlipVisionModel + + >>> # Initializing a BlipVisionConfig with Salesforce/blip-vqa-base style configuration + >>> configuration = BlipVisionConfig() + + >>> # Initializing a BlipVisionModel (with random weights) from the Salesforce/blip-vqa-base style configuration + >>> model = BlipVisionModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "blip_vision_model" + base_config_key = "vision_config" + + def __init__( + self, + hidden_size=768, + intermediate_size=3072, + projection_dim=512, + num_hidden_layers=12, + num_attention_heads=12, + image_size=384, + patch_size=16, + hidden_act="gelu", + layer_norm_eps=1e-5, + attention_dropout=0.0, + initializer_range=1e-10, + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.projection_dim = projection_dim + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.patch_size = patch_size + self.image_size = image_size + self.initializer_range = initializer_range + self.attention_dropout = attention_dropout + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + + +class BlipConfig(PretrainedConfig): + r""" + [`BlipConfig`] is the configuration class to store the configuration of a [`BlipModel`]. It is used to instantiate + a BLIP model according to the specified arguments, defining the text model and vision model configs. Instantiating + a configuration with the defaults will yield a similar configuration to that of the BLIP-base + [Salesforce/blip-vqa-base](https://huggingface.co/Salesforce/blip-vqa-base) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + text_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`BlipTextConfig`]. + vision_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`BlipVisionConfig`]. + projection_dim (`int`, *optional*, defaults to 512): + Dimensionality of text and vision projection layers. + logit_scale_init_value (`float`, *optional*, defaults to 2.6592): + The initial value of the *logit_scale* parameter. Default is used as per the original BLIP implementation. + image_text_hidden_size (`int`, *optional*, defaults to 256): + Dimensionality of the hidden state of the image-text fusion layer. + label_smoothing (float, optional, *optional*, defaults to 0.0): + A float in [0.0, 1.0]. Specifies the amount of smoothing when computing the loss, where 0.0 means no smoothing. The targets + become a mixture of the original ground truth and a uniform distribution as described in + `Rethinking the Inception Architecture for Computer Vision `__. Default: :math:`0.0`. + kwargs (*optional*): + Dictionary of keyword arguments. + + Example: + + ```python + >>> from transformers import BlipConfig, BlipModel + + >>> # Initializing a BlipConfig with Salesforce/blip-vqa-base style configuration + >>> configuration = BlipConfig() + + >>> # Initializing a BlipPModel (with random weights) from the Salesforce/blip-vqa-base style configuration + >>> model = BlipModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + + >>> # We can also initialize a BlipConfig from a BlipTextConfig and a BlipVisionConfig + + >>> # Initializing a BLIPText and BLIPVision configuration + >>> config_text = BlipTextConfig() + >>> config_vision = BlipVisionConfig() + + >>> config = BlipConfig.from_text_vision_configs(config_text, config_vision) + ```""" + + model_type = "blip" + sub_configs = {"text_config": BlipTextConfig, "vision_config": BlipVisionConfig} + + def __init__( + self, + text_config=None, + vision_config=None, + projection_dim=512, + logit_scale_init_value=2.6592, + image_text_hidden_size=256, + label_smoothing=0.0, + **kwargs, + ): + super().__init__(**kwargs) + + if text_config is None: + text_config = {} + logger.info("`text_config` is `None`. Initializing the `BlipTextConfig` with default values.") + + if vision_config is None: + vision_config = {} + logger.info("`vision_config` is `None`. Initializing the `BlipVisionConfig` with default values.") + + self.text_config = BlipTextConfig(**text_config) + self.vision_config = BlipVisionConfig(**vision_config) + + self.text_config.encoder_hidden_size = self.vision_config.hidden_size + + self.projection_dim = projection_dim + self.logit_scale_init_value = logit_scale_init_value + self.initializer_factor = 1.0 + self.initializer_range = 0.02 + self.image_text_hidden_size = image_text_hidden_size + self.label_smoothing = label_smoothing + + @classmethod + def from_text_vision_configs(cls, text_config: BlipTextConfig, vision_config: BlipVisionConfig, **kwargs): + r""" + Instantiate a [`BlipConfig`] (or a derived class) from blip text model configuration and blip vision model + configuration. + + Returns: + [`BlipConfig`]: An instance of a configuration object + """ + + return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs) + + +__all__ = ["BlipConfig", "BlipTextConfig", "BlipVisionConfig"] diff --git a/docs/transformers/build/lib/transformers/models/blip/convert_blip_original_pytorch_to_hf.py b/docs/transformers/build/lib/transformers/models/blip/convert_blip_original_pytorch_to_hf.py new file mode 100644 index 0000000000000000000000000000000000000000..3de18c294ae8981712ee939730434e350dfbdfb8 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/blip/convert_blip_original_pytorch_to_hf.py @@ -0,0 +1,191 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import re + +import requests +import torch + +# git clone https://github.com/salesforce/BLIP.git +from models.blip import blip_decoder +from models.blip_itm import blip_itm +from models.blip_vqa import blip_vqa +from PIL import Image +from torchvision import transforms +from torchvision.transforms.functional import InterpolationMode + +from transformers import ( + BertTokenizer, + BlipConfig, + BlipForConditionalGeneration, + BlipForImageTextRetrieval, + BlipForQuestionAnswering, +) + + +def load_demo_image(image_size, device): + img_url = "https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg" + raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB") + + transform = transforms.Compose( + [ + transforms.Resize((image_size, image_size), interpolation=InterpolationMode.BICUBIC), + transforms.ToTensor(), + transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)), + ] + ) + image = transform(raw_image).unsqueeze(0).to(device) + return image + + +def rename_key(key): + if "visual_encoder" in key: + key = re.sub("visual_encoder*", "vision_model.encoder", key) + if "blocks" in key: + key = re.sub(r"blocks", "layers", key) + if "attn" in key: + key = re.sub(r"attn", "self_attn", key) + if "norm1" in key: + key = re.sub(r"norm1", "layer_norm1", key) + if "norm2" in key: + key = re.sub(r"norm2", "layer_norm2", key) + if "encoder.norm" in key: + key = re.sub(r"encoder.norm", "post_layernorm", key) + if "encoder.patch_embed.proj" in key: + key = re.sub(r"encoder.patch_embed.proj", "embeddings.patch_embedding", key) + + if "encoder.pos_embed" in key: + key = re.sub(r"encoder.pos_embed", "embeddings.position_embedding", key) + if "encoder.cls_token" in key: + key = re.sub(r"encoder.cls_token", "embeddings.class_embedding", key) + + if "self_attn" in key: + key = re.sub(r"self_attn.proj", "self_attn.projection", key) + + return key + + +@torch.no_grad() +def convert_blip_checkpoint(pytorch_dump_folder_path, config_path=None): + """ + Copy/paste/tweak model's weights to transformers design. + """ + if config_path is not None: + config = BlipConfig.from_pretrained(config_path) + else: + config = BlipConfig(projection_dim=512, text_config={}, vision_config={}) + + hf_model = BlipForConditionalGeneration(config).eval() + + model_url = "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" + + pt_model = blip_decoder(pretrained=model_url, image_size=384, vit="base") + pt_model = pt_model.eval() + + modified_state_dict = pt_model.state_dict() + for key in modified_state_dict.copy(): + value = modified_state_dict.pop(key) + renamed_key = rename_key(key) + modified_state_dict[renamed_key] = value + + hf_model.load_state_dict(modified_state_dict) + + image_size = 384 + image = load_demo_image(image_size=image_size, device="cpu") + tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased") + input_ids = tokenizer(["a picture of"]).input_ids + + out = hf_model.generate(image, input_ids) + + assert out[0].tolist() == [30522, 1037, 3861, 1997, 1037, 2450, 3564, 2006, 1996, 3509, 2007, 2014, 3899, 102] + + out = hf_model.generate(image) + + assert out[0].tolist() == [30522, 1037, 2450, 3564, 2006, 1996, 3509, 2007, 2014, 3899, 102] + + if pytorch_dump_folder_path is not None: + hf_model.save_pretrained(pytorch_dump_folder_path) + + # model_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_vqa.pth' + model_url = ( + "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth" + ) + + vqa_model = blip_vqa(pretrained=model_url, image_size=image_size, vit="base") + vqa_model.eval() + + modified_state_dict = vqa_model.state_dict() + for key in modified_state_dict.copy(): + value = modified_state_dict.pop(key) + renamed_key = rename_key(key) + modified_state_dict[renamed_key] = value + + hf_vqa_model = BlipForQuestionAnswering(config) + + hf_vqa_model.load_state_dict(modified_state_dict) + + question = ["How many dogs are in this image?"] + question_input_ids = tokenizer(question, return_tensors="pt").input_ids + + answer = hf_vqa_model.generate(question_input_ids, image) + print(tokenizer.decode(answer[0])) + + assert tokenizer.decode(answer[0]) == "[UNK] 1 [SEP]" + if pytorch_dump_folder_path is not None: + hf_vqa_model.save_pretrained(pytorch_dump_folder_path + "_vqa") + + model_url = "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_retrieval_coco.pth" + + itm_model = blip_itm(pretrained=model_url, image_size=image_size, vit="base") + itm_model.eval() + + modified_state_dict = itm_model.state_dict() + for key in modified_state_dict.copy(): + value = modified_state_dict.pop(key) + renamed_key = rename_key(key) + modified_state_dict[renamed_key] = value + + hf_itm_model = BlipForImageTextRetrieval(config) + + question = ["A picture of a woman with a dog sitting in a beach"] + question_input_ids = tokenizer( + question, + return_tensors="pt", + padding="max_length", + truncation=True, + max_length=35, + ).input_ids + + hf_itm_model.load_state_dict(modified_state_dict) + hf_itm_model.eval() + + out_itm = hf_itm_model(question_input_ids, image, use_itm_head=True) + out = hf_itm_model(question_input_ids, image, use_itm_head=False) + + assert out[0].item() == 0.2110687494277954 + assert torch.nn.functional.softmax(out_itm[0], dim=1)[:, 1].item() == 0.45698845386505127 + + if pytorch_dump_folder_path is not None: + hf_itm_model.save_pretrained(pytorch_dump_folder_path + "_itm") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") + parser.add_argument("--config_path", default=None, type=str, help="Path to hf config.json of model to convert") + args = parser.parse_args() + + convert_blip_checkpoint(args.pytorch_dump_folder_path, args.config_path) diff --git a/docs/transformers/build/lib/transformers/models/blip/image_processing_blip.py b/docs/transformers/build/lib/transformers/models/blip/image_processing_blip.py new file mode 100644 index 0000000000000000000000000000000000000000..9f28b33a66b57315286abb26901ab96a5cf42c76 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/blip/image_processing_blip.py @@ -0,0 +1,296 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Image processor class for BLIP.""" + +from typing import Dict, List, Optional, Union + +import numpy as np + +from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ...image_transforms import convert_to_rgb, resize, to_channel_dimension_format +from ...image_utils import ( + OPENAI_CLIP_MEAN, + OPENAI_CLIP_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + infer_channel_dimension_format, + is_scaled_image, + make_flat_list_of_images, + to_numpy_array, + valid_images, + validate_preprocess_arguments, +) +from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging + + +if is_vision_available(): + import PIL + + +logger = logging.get_logger(__name__) + + +class BlipImageProcessor(BaseImageProcessor): + r""" + Constructs a BLIP image processor. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the + `do_resize` parameter in the `preprocess` method. + size (`dict`, *optional*, defaults to `{"height": 384, "width": 384}`): + Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess` + method. + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): + Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be + overridden by the `resample` parameter in the `preprocess` method. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the + `do_rescale` parameter in the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Only has an effect if `do_rescale` is set to `True`. Can be + overridden by the `rescale_factor` parameter in the `preprocess` method. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess` + method. Can be overridden by the `do_normalize` parameter in the `preprocess` method. + image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`): + Mean to use if normalizing the image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. Can be + overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): + Standard deviation to use if normalizing the image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. + Can be overridden by the `image_std` parameter in the `preprocess` method. + do_convert_rgb (`bool`, *optional*, defaults to `True`): + Whether to convert the image to RGB. + """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize: bool = True, + size: Dict[str, int] = None, + resample: PILImageResampling = PILImageResampling.BICUBIC, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: bool = True, + **kwargs, + ) -> None: + super().__init__(**kwargs) + size = size if size is not None else {"height": 384, "width": 384} + size = get_size_dict(size, default_to_square=True) + + self.do_resize = do_resize + self.size = size + self.resample = resample + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN + self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD + self.do_convert_rgb = do_convert_rgb + + # Copied from transformers.models.vit.image_processing_vit.ViTImageProcessor.resize with PILImageResampling.BILINEAR->PILImageResampling.BICUBIC + def resize( + self, + image: np.ndarray, + size: Dict[str, int], + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Resize an image to `(size["height"], size["width"])`. + + Args: + image (`np.ndarray`): + Image to resize. + size (`Dict[str, int]`): + Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`. + data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the output image. If unset, the channel dimension format of the input + image is used. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + + Returns: + `np.ndarray`: The resized image. + """ + size = get_size_dict(size) + if "height" not in size or "width" not in size: + raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}") + output_size = (size["height"], size["width"]) + return resize( + image, + size=output_size, + resample=resample, + data_format=data_format, + input_data_format=input_data_format, + **kwargs, + ) + + @filter_out_non_signature_kwargs() + def preprocess( + self, + images: ImageInput, + do_resize: Optional[bool] = None, + size: Optional[Dict[str, int]] = None, + resample: PILImageResampling = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + do_convert_rgb: Optional[bool] = None, + data_format: ChannelDimension = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> PIL.Image.Image: + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If + passing in images with pixel values between 0 and 1, set `do_rescale=False`. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to `self.size`): + Controls the size of the image after `resize`. The shortest edge of the image is resized to + `size["shortest_edge"]` whilst preserving the aspect ratio. If the longest edge of this resized image + is > `int(size["shortest_edge"] * (1333 / 800))`, then the image is resized again to make the longest + edge equal to `int(size["shortest_edge"] * (1333 / 800))`. + resample (`PILImageResampling`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image values between [0 - 1]. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): + Image mean to normalize the image by if `do_normalize` is set to `True`. + image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to normalize the image by if `do_normalize` is set to `True`. + do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): + Whether to convert the image to RGB. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + """ + do_resize = do_resize if do_resize is not None else self.do_resize + resample = resample if resample is not None else self.resample + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb + + size = size if size is not None else self.size + size = get_size_dict(size, default_to_square=False) + images = make_flat_list_of_images(images) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + + validate_preprocess_arguments( + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_resize=do_resize, + size=size, + resample=resample, + ) + # PIL RGBA images are converted to RGB + if do_convert_rgb: + images = [convert_to_rgb(image) for image in images] + + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + + if do_rescale and is_scaled_image(images[0]): + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + + if input_data_format is None: + # We assume that all images have the same channel dimension format. + input_data_format = infer_channel_dimension_format(images[0]) + + if do_resize: + images = [ + self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format) + for image in images + ] + + if do_rescale: + images = [ + self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) + for image in images + ] + + if do_normalize: + images = [ + self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format) + for image in images + ] + + images = [ + to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images + ] + + encoded_outputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors) + + return encoded_outputs + + +__all__ = ["BlipImageProcessor"] diff --git a/docs/transformers/build/lib/transformers/models/blip/image_processing_blip_fast.py b/docs/transformers/build/lib/transformers/models/blip/image_processing_blip_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..acd5bae891ce192628442b085fdb8faa84eae1a2 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/blip/image_processing_blip_fast.py @@ -0,0 +1,39 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Fast Image processor class for BLIP.""" + +from ...image_processing_utils_fast import BASE_IMAGE_PROCESSOR_FAST_DOCSTRING, BaseImageProcessorFast +from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, PILImageResampling +from ...utils import add_start_docstrings + + +@add_start_docstrings( + "Constructs a fast BLIP image processor.", + BASE_IMAGE_PROCESSOR_FAST_DOCSTRING, +) +class BlipImageProcessorFast(BaseImageProcessorFast): + # To be checked against the slow image processor + # None values left after checking can be removed + resample = PILImageResampling.BICUBIC + image_mean = OPENAI_CLIP_MEAN + image_std = OPENAI_CLIP_STD + size = {"height": 384, "width": 384} + do_resize = True + do_rescale = True + do_normalize = True + do_convert_rgb = True + + +__all__ = ["BlipImageProcessorFast"] diff --git a/docs/transformers/build/lib/transformers/models/blip/modeling_blip.py b/docs/transformers/build/lib/transformers/models/blip/modeling_blip.py new file mode 100644 index 0000000000000000000000000000000000000000..1f248ab8bee669cd9f2582c509ec4a1e8e0d06bd --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/blip/modeling_blip.py @@ -0,0 +1,1605 @@ +# coding=utf-8 +# Copyright 2022 The Salesforce Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch BLIP model.""" + +import warnings +from dataclasses import dataclass +from typing import Any, Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn.functional import normalize + +from ...activations import ACT2FN +from ...generation import GenerationMixin +from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling +from ...modeling_utils import PreTrainedModel +from ...utils import ( + ModelOutput, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, + torch_int, +) +from .configuration_blip import BlipConfig, BlipTextConfig, BlipVisionConfig +from .modeling_blip_text import BlipTextLMHeadModel, BlipTextModel + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "Salesforce/blip-vqa-base" + + +# Copied from transformers.models.clip.modeling_clip.contrastive_loss +def contrastive_loss(logits: torch.Tensor) -> torch.Tensor: + return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device)) + + +# Copied from transformers.models.clip.modeling_clip.clip_loss with clip->blip +def blip_loss(similarity: torch.Tensor) -> torch.Tensor: + caption_loss = contrastive_loss(similarity) + image_loss = contrastive_loss(similarity.t()) + return (caption_loss + image_loss) / 2.0 + + +@dataclass +class BlipForConditionalGenerationModelOutput(ModelOutput): + """ + Adapted from the base class for vision model's outputs that also contains image embeddings of the pooling of the + last hidden states. This class also adds the loss term from the text decoder. + + Args: + loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): + Languge modeling loss from the text decoder. + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`, *optional*): + Prediction scores of the language modeling head of the text decoder model. + image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*): + The image embeddings obtained after applying the Vision Transformer model to the input image. + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[Tuple[torch.FloatTensor]] = None + logits: Optional[Tuple[torch.FloatTensor]] = None + image_embeds: Optional[torch.FloatTensor] = None + last_hidden_state: Optional[torch.FloatTensor] = None + hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + + @property + def decoder_logits(self): + warnings.warn( + "`decoder_logits` attribute is deprecated and will be removed in version 5 of Transformers." + " Please use the `logits` attribute to retrieve the final output instead.", + FutureWarning, + ) + return self.logits + + +@dataclass +class BlipTextVisionModelOutput(ModelOutput): + """ + Adapted from the base class for vision model's outputs that also contains image embeddings of the pooling of the + last hidden states. This class also adds the loss term from the text decoder. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Languge modeling loss from the text decoder. + image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`): + The image embeddings obtained by applying the projection layer to the pooler_output. + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[torch.FloatTensor] = None + image_embeds: Optional[torch.FloatTensor] = None + last_hidden_state: Optional[torch.FloatTensor] = None + hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + + +@dataclass +class BlipImageTextMatchingModelOutput(ModelOutput): + """ + Adapted from the base class for vision model's outputs that also contains image embeddings of the pooling of the + last hidden states. This class also adds the loss term from the text decoder as well as the image-text similarity + scores. + + Args: + itm_score (`torch.FloatTensor`): + The image-text similarity scores. + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Languge modeling loss from the text decoder. + image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`): + The image embeddings obtained by applying the projection layer to the pooler_output. + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + vision_pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*): + Last layer hidden-state of the vision of the vision-only branch of the model. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + question_embeds (`torch.FloatTensor`): + The question embeddings obtained by the text projection layer. + """ + + itm_score: Optional[torch.FloatTensor] = None + loss: Optional[torch.FloatTensor] = None + image_embeds: Optional[torch.FloatTensor] = None + last_hidden_state: Optional[torch.FloatTensor] = None + hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + vision_pooler_output: Optional[torch.FloatTensor] = None + attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + question_embeds: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +class BlipOutput(ModelOutput): + """ + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`): + Contrastive loss for image-text similarity. + logits_per_image:(`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`): + The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text + similarity scores. + logits_per_text:(`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`): + The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image + similarity scores. + text_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`): + The text embeddings obtained by applying the projection layer to the pooled output of [`BlipTextModel`]. + image_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`): + The image embeddings obtained by applying the projection layer to the pooled output of [`BlipVisionModel`]. + text_model_output(`BaseModelOutputWithPooling`): + The output of the [`BlipTextModel`]. + vision_model_output(`BaseModelOutputWithPooling`): + The output of the [`BlipVisionModel`]. + """ + + loss: Optional[torch.FloatTensor] = None + logits_per_image: Optional[torch.FloatTensor] = None + logits_per_text: Optional[torch.FloatTensor] = None + text_embeds: Optional[torch.FloatTensor] = None + image_embeds: Optional[torch.FloatTensor] = None + text_model_output: BaseModelOutputWithPooling = None + vision_model_output: BaseModelOutputWithPooling = None + + def to_tuple(self) -> Tuple[Any]: + return tuple( + self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple() + for k in self.keys() + ) + + +class BlipVisionEmbeddings(nn.Module): + def __init__(self, config: BlipVisionConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.class_embedding = nn.Parameter(torch.randn(1, 1, self.embed_dim)) + + self.patch_embedding = nn.Conv2d( + in_channels=3, out_channels=self.embed_dim, kernel_size=self.patch_size, stride=self.patch_size + ) + + self.num_patches = (self.image_size // self.patch_size) ** 2 + self.num_positions = self.num_patches + 1 + + self.position_embedding = nn.Parameter(torch.randn(1, self.num_positions, self.embed_dim)) + + def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: + """ + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution + images. This method is also adapted to support torch.jit tracing. + + Adapted from: + - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and + - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211 + """ + + num_patches = embeddings.shape[1] - 1 + num_positions = self.position_embedding.shape[1] - 1 + + # always interpolate when tracing to ensure the exported model works for dynamic input shapes + if not torch.jit.is_tracing() and num_patches == num_positions and height == width: + return self.position_embedding + + class_pos_embed = self.position_embedding[:, :1] + patch_pos_embed = self.position_embedding[:, 1:] + + dim = embeddings.shape[-1] + + new_height = height // self.patch_size + new_width = width // self.patch_size + + sqrt_num_positions = torch_int(num_positions**0.5) + patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) + patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed, + size=(new_height, new_width), + mode="bicubic", + align_corners=False, + ) + + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + + return torch.cat((class_pos_embed, patch_pos_embed), dim=1) + + def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding: bool = False) -> torch.Tensor: + batch_size, _, height, width = pixel_values.shape + target_dtype = self.patch_embedding.weight.dtype + patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid] + patch_embeds = patch_embeds.flatten(2).transpose(1, 2) + class_embeds = self.class_embedding.expand(batch_size, 1, -1).to(target_dtype) + embeddings = torch.cat([class_embeds, patch_embeds], dim=1) + if interpolate_pos_encoding: + position_embedding = self.interpolate_pos_encoding(embeddings, height, width) + else: + position_embedding = self.position_embedding + embeddings = embeddings + position_embedding[:, : embeddings.size(1), :].to(target_dtype) + return embeddings + + +# Copied from transformers.models.clip.modeling_clip.CLIPTextEmbeddings with CLIP->Blip +class BlipTextEmbeddings(nn.Module): + def __init__(self, config: BlipTextConfig): + super().__init__() + embed_dim = config.hidden_size + + self.token_embedding = nn.Embedding(config.vocab_size, embed_dim) + self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + ) -> torch.Tensor: + seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2] + max_position_embedding = self.position_embedding.weight.shape[0] + + if seq_length > max_position_embedding: + raise ValueError( + f"Sequence length must be less than max_position_embeddings (got `sequence length`: " + f"{seq_length} and max_position_embeddings: {max_position_embedding}" + ) + + if position_ids is None: + position_ids = self.position_ids[:, :seq_length] + + if inputs_embeds is None: + inputs_embeds = self.token_embedding(input_ids) + + position_embeddings = self.position_embedding(position_ids) + embeddings = inputs_embeds + position_embeddings + + return embeddings + + +class BlipAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." + ) + self.scale = self.head_dim**-0.5 + self.dropout = nn.Dropout(config.attention_dropout) + + self.qkv = nn.Linear(self.embed_dim, 3 * self.embed_dim) + + self.projection = nn.Linear(self.embed_dim, self.embed_dim) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + bsz, tgt_len, embed_dim = hidden_states.size() + + mixed_qkv = ( + self.qkv(hidden_states) + .reshape(bsz, tgt_len, 3, self.num_heads, embed_dim // self.num_heads) + .permute(2, 0, 3, 1, 4) + ) + query_states, key_states, value_states = mixed_qkv[0], mixed_qkv[1], mixed_qkv[2] + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_states, key_states.transpose(-1, -2)) + + attention_scores = attention_scores * self.scale + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_states).permute(0, 2, 1, 3) + + new_context_layer_shape = context_layer.size()[:-2] + (self.embed_dim,) + context_layer = context_layer.reshape(new_context_layer_shape) + + output = self.projection(context_layer) + + outputs = (output, attention_probs) if output_attentions else (output, None) + + return outputs + + +# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->Blip +class BlipMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.activation_fn = ACT2FN[config.hidden_act] + self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) + self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +class BlipEncoderLayer(nn.Module): + def __init__(self, config: BlipConfig): + super().__init__() + self.embed_dim = config.hidden_size + self.self_attn = BlipAttention(config) + self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) + self.mlp = BlipMLP(config) + self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.FloatTensor]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + `(config.encoder_attention_heads,)`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + hidden_states = self.layer_norm1(hidden_states) + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + head_mask=attention_mask, + output_attentions=output_attentions, + ) + hidden_states = hidden_states + residual + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + + hidden_states = hidden_states + residual + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class BlipPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = BlipConfig + base_model_prefix = "blip" + supports_gradient_checkpointing = True + _no_split_modules = ["BlipEncoderLayer", "BlipTextEmbeddings"] + _skip_keys_device_placement = ["past_key_value"] + + def _init_weights(self, module): + """Initialize the weights""" + factor = self.config.initializer_range + if isinstance(module, nn.Conv2d) or isinstance(module, nn.Embedding) or isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=factor) + if hasattr(module, "bias") and module.bias is not None: + module.bias.data.zero_() + + if isinstance(module, BlipVisionEmbeddings): + if hasattr(self.config, "vision_config"): + factor = self.config.vision_config.initializer_range + nn.init.trunc_normal_( + module.position_embedding, + mean=0.0, + std=factor, + ) + + nn.init.trunc_normal_( + module.class_embedding, + mean=0.0, + std=factor, + ) + + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + +BLIP_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`BlipConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +BLIP_TEXT_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoProcessor`]. See [`BlipProcessor.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + +BLIP_VISION_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using + [`BlipImageProcessor`]. See [`BlipImageProcessor.__call__`] for details. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + interpolate_pos_encoding (`bool`, *optional*, defaults to `False`): + Whether to interpolate the pre-trained position encodings. +""" + +BLIP_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoProcessor`]. See [`BlipProcessor.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using + [`BlipImageProcessor`]. See [`BlipImageProcessor.__call__`] for details. + return_loss (`bool`, *optional*): + Whether or not to return the contrastive loss. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + interpolate_pos_encoding (`bool`, *optional*, defaults to `False`): + Whether to interpolate the pre-trained position encodings. +""" + + +class BlipEncoder(nn.Module): + """ + Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a + [`BlipEncoderLayer`]. + + Args: + config (`BlipConfig`): + The corresponding vision configuration for the `BlipEncoder`. + """ + + def __init__(self, config: BlipConfig): + super().__init__() + self.config = config + self.layers = nn.ModuleList([BlipEncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + inputs_embeds, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Embedded representation of the inputs. Should be float, not int tokens. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + hidden_states = inputs_embeds + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + encoder_layer.__call__, + hidden_states, + attention_mask, + output_attentions, + ) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +class BlipVisionModel(BlipPreTrainedModel): + main_input_name = "pixel_values" + config_class = BlipVisionConfig + + def __init__(self, config: BlipVisionConfig): + super().__init__(config) + self.config = config + embed_dim = config.hidden_size + + self.embeddings = BlipVisionEmbeddings(config) + self.encoder = BlipEncoder(config) + self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) + + self.post_init() + + @add_start_docstrings_to_model_forward(BLIP_VISION_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=BlipVisionConfig) + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + r""" + Returns: + + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = encoder_outputs[0] + last_hidden_state = self.post_layernorm(last_hidden_state) + + pooled_output = last_hidden_state[:, 0, :] + pooled_output = self.post_layernorm(pooled_output) + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + def get_input_embeddings(self): + return self.embeddings + + +@add_start_docstrings( + """ + This model is going to be deprecated in future versions. Please use `BlipForConditionalGeneration`, `BlipForQuestionAnswering` or `BlipForImageTextRetrieval` depending on your usecase. + """, + BLIP_START_DOCSTRING, +) +class BlipModel(BlipPreTrainedModel): + config_class = BlipConfig + + def __init__(self, config: BlipConfig): + super().__init__(config) + + if not isinstance(config.text_config, BlipTextConfig): + raise TypeError( + "config.text_config is expected to be of type BlipTextConfig but is of type" + f" {type(config.text_config)}." + ) + + if not isinstance(config.vision_config, BlipVisionConfig): + raise TypeError( + "config.vision_config is expected to be of type BlipVisionConfig but is of type" + f" {type(config.vision_config)}." + ) + + text_config = config.text_config + vision_config = config.vision_config + + self.projection_dim = config.projection_dim + self.text_embed_dim = text_config.hidden_size + self.vision_embed_dim = vision_config.hidden_size + + self.text_model = BlipTextModel(text_config) + self.vision_model = BlipVisionModel(vision_config) + + self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False) + self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False) + self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value)) + + logger.warning( + "`BlipModel` is going to be deprecated in future release, please use `BlipForConditionalGeneration`, `BlipForQuestionAnswering` or `BlipForImageTextRetrieval` depending on your usecase." + ) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.text_model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.text_model.set_input_embeddings(value) + + @add_start_docstrings_to_model_forward(BLIP_TEXT_INPUTS_DOCSTRING) + def get_text_features( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + return_dict: Optional[bool] = None, + ) -> torch.FloatTensor: + r""" + Returns: + text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by + applying the projection layer to the pooled output of [`BlipTextModel`]. + + Examples: + + ```python + >>> from transformers import AutoProcessor, BlipModel + + >>> model = BlipModel.from_pretrained("Salesforce/blip-image-captioning-base") + >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base") + + >>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt") + >>> text_features = model.get_text_features(**inputs) + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + text_outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + return_dict=return_dict, + ) + + pooled_output = text_outputs[1] + text_features = self.text_projection(pooled_output) + + return text_features + + @add_start_docstrings_to_model_forward(BLIP_VISION_INPUTS_DOCSTRING) + def get_image_features( + self, + pixel_values: Optional[torch.FloatTensor] = None, + return_dict: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + ) -> torch.FloatTensor: + r""" + Returns: + image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by + applying the projection layer to the pooled output of [`BlipVisionModel`]. + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, BlipModel + + >>> model = BlipModel.from_pretrained("Salesforce/blip-image-captioning-base") + >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="pt") + + >>> image_features = model.get_image_features(**inputs) + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + return_dict=return_dict, + interpolate_pos_encoding=interpolate_pos_encoding, + ) + + pooled_output = vision_outputs[1] # pooled_output + image_features = self.visual_projection(pooled_output) + + return image_features + + @add_start_docstrings_to_model_forward(BLIP_INPUTS_DOCSTRING) + def get_multimodal_features( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + return_dict: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + ) -> torch.FloatTensor: + r""" + Returns: + multimodal_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The multimodal embeddings + obtained by applying the image embeddings to the text encoder using the cross-attention mechanism. + + Examples: + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, BlipModel + + >>> model = BlipModel.from_pretrained("Salesforce/blip-image-captioning-base") + >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> texts = ["a photo of a cat", "a photo of a dog"] + >>> inputs = processor(images=image, text=texts, padding=True, return_tensors="pt") + + >>> multimodal_features = model.get_multimodal_features(**inputs) + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=True, + output_hidden_states=True, + return_dict=return_dict, + interpolate_pos_encoding=interpolate_pos_encoding, + ) + + image_embeds = vision_outputs[0] + image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long) + + text_outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_atts, + return_dict=return_dict, + ) + + pooled_output = text_outputs[1] # pooled_output + multimodal_features = self.text_projection(pooled_output) + + return multimodal_features + + @add_start_docstrings_to_model_forward(BLIP_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=BlipOutput, config_class=BlipConfig) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + return_loss: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + ) -> Union[Tuple, BlipOutput]: + r""" + Returns: + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, BlipModel + + >>> model = BlipModel.from_pretrained("Salesforce/blip-image-captioning-base") + >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor( + ... text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True + ... ) + + >>> outputs = model(**inputs) + >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score + >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities + ```""" + # Use BLIP model's config for some fields (if specified) instead of those of vision & text components. + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + interpolate_pos_encoding=interpolate_pos_encoding, + ) + + text_outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + image_embeds = vision_outputs[1] + image_embeds = self.visual_projection(image_embeds) + + text_embeds = text_outputs[1] + text_embeds = self.text_projection(text_embeds) + + # normalized features + image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True) + text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True) + + # cosine similarity as logits + logit_scale = self.logit_scale.exp().to(device=text_embeds.device) + image_embeds = image_embeds.to(device=text_embeds.device, dtype=text_embeds.dtype) + logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale + logits_per_image = logits_per_text.t() + + loss = None + if return_loss: + loss = blip_loss(logits_per_text) + + if not return_dict: + output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs) + return ((loss,) + output) if loss is not None else output + + return BlipOutput( + loss=loss, + logits_per_image=logits_per_image, + logits_per_text=logits_per_text, + text_embeds=text_embeds, + image_embeds=image_embeds, + text_model_output=text_outputs, + vision_model_output=vision_outputs, + ) + + +@add_start_docstrings( + """ + BLIP Model for image captioning. The model consists of a vision encoder and a text decoder. One can optionally pass + `input_ids` to the model, which serve as a text prompt, to make the text decoder continue the prompt. Otherwise, + the decoder starts generating text from the [BOS] (beginning-of-sequence) token. will start generating the caption + from the text input. If no text input is provided, the decoder will start with the [BOS] token only. + """, + BLIP_START_DOCSTRING, +) +class BlipForConditionalGeneration(BlipPreTrainedModel, GenerationMixin): + config_class = BlipConfig + _tied_weights_keys = ["text_decoder.cls.predictions.decoder.bias"] + main_input_name = "pixel_values" + + def __init__(self, config: BlipConfig): + super().__init__(config) + + self.vision_model = BlipVisionModel(config.vision_config) + + self.text_decoder = BlipTextLMHeadModel(config.text_config) + + self.decoder_input_ids = config.text_config.bos_token_id + self.decoder_pad_token_id = config.text_config.pad_token_id + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.text_decoder.get_input_embeddings() + + def set_input_embeddings(self, value): + self.text_decoder.set_input_embeddings(value) + + @add_start_docstrings_to_model_forward(BLIP_VISION_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=BlipForConditionalGenerationModelOutput, config_class=BlipVisionConfig) + def forward( + self, + pixel_values: torch.FloatTensor, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + labels: Optional[torch.LongTensor] = None, + return_dict: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + ) -> Union[Tuple, BlipForConditionalGenerationModelOutput]: + r""" + Returns: + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, BlipForConditionalGeneration + + >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base") + >>> model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> text = "A picture of" + + >>> inputs = processor(images=image, text=text, return_tensors="pt") + + >>> outputs = model(**inputs) + ```""" + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + interpolate_pos_encoding=interpolate_pos_encoding, + ) + + image_embeds = vision_outputs[0] + + outputs = self.text_decoder( + input_ids=input_ids, + attention_mask=attention_mask, + encoder_hidden_states=image_embeds, + labels=labels, + return_dict=return_dict, + reduction="mean", + ) + + if not return_dict: + outputs = (outputs[0], outputs[1]) if labels is not None else (outputs[0],) + outputs += (image_embeds, vision_outputs[0]) + vision_outputs[2:] + return tuple(output for output in outputs if output is not None) + + return BlipForConditionalGenerationModelOutput( + loss=outputs.loss, + logits=outputs.logits, + image_embeds=image_embeds, + last_hidden_state=vision_outputs.last_hidden_state, + hidden_states=vision_outputs.hidden_states, + attentions=vision_outputs.attentions, + ) + + @torch.no_grad() + def generate( + self, + pixel_values: torch.FloatTensor, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, + interpolate_pos_encoding: bool = False, + **generate_kwargs, + ) -> torch.LongTensor: + r""" + Overrides *generate* function to be able to use the model as a conditional generator + + Parameters: + pixel_values (*torch.FloatTensor* of shape *(batch_size, num_channels, image_height, image_width)*: + Input image to be processed + input_ids (*torch.LongTensor* of shape *(batch_size, sequence_length)*, *optional*): + The sequence used as a prompt for the generation. + attention_mask (*torch.LongTensor* of shape *(batch_size, sequence_length)*, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + + Examples: + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, BlipForConditionalGeneration + + >>> model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base") + >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="pt") + + >>> outputs = model.generate(**inputs) + >>> print(processor.decode(outputs[0], skip_special_tokens=True)) + two cats sleeping on a couch + ``` + """ + + batch_size = pixel_values.shape[0] + vision_outputs = self.vision_model( + pixel_values=pixel_values, + interpolate_pos_encoding=interpolate_pos_encoding, + ) + + image_embeds = vision_outputs[0] + + image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device) + + if isinstance(input_ids, list): + input_ids = torch.LongTensor(input_ids) + elif input_ids is None: + input_ids = ( + torch.LongTensor([[self.decoder_input_ids, self.config.text_config.eos_token_id]]) + .repeat(batch_size, 1) + .to(image_embeds.device) + ) + + input_ids[:, 0] = self.config.text_config.bos_token_id + attention_mask = attention_mask[:, :-1] if attention_mask is not None else None + + outputs = self.text_decoder.generate( + input_ids=input_ids[:, :-1], + eos_token_id=self.config.text_config.sep_token_id, + pad_token_id=self.config.text_config.pad_token_id, + attention_mask=attention_mask, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + **generate_kwargs, + ) + + return outputs + + +@add_start_docstrings( + """ + BLIP Model for visual question answering. The model consists of a vision encoder, a text encoder as well as a text + decoder. The vision encoder will encode the input image, the text encoder will encode the input question together + with the encoding of the image, and the text decoder will output the answer to the question. + """, + BLIP_START_DOCSTRING, +) +class BlipForQuestionAnswering(BlipPreTrainedModel, GenerationMixin): + config_class = BlipConfig + _tied_weights_keys = ["text_decoder.cls.predictions.decoder.bias"] + + def __init__(self, config: BlipConfig): + super().__init__(config) + + self.vision_model = BlipVisionModel(config.vision_config) + + self.text_encoder = BlipTextModel(config.text_config, add_pooling_layer=False) + + self.text_decoder = BlipTextLMHeadModel(config.text_config) + + self.decoder_pad_token_id = config.text_config.pad_token_id + self.decoder_start_token_id = config.text_config.bos_token_id + + # Initialize weights and apply final processing + self.post_init() + + def set_input_embeddings(self, value): + self.text_encoder.set_input_embeddings(value) + + def get_input_embeddings(self): + # This will return shared embeddings if they are shared else specific to encoder. + return self.text_encoder.get_input_embeddings() + + @add_start_docstrings_to_model_forward(BLIP_VISION_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=BlipTextVisionModelOutput, config_class=BlipVisionConfig) + def forward( + self, + input_ids: torch.LongTensor, + pixel_values: torch.FloatTensor, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + labels: Optional[torch.LongTensor] = None, + return_dict: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + ) -> Union[Tuple, BlipTextVisionModelOutput]: + r""" + Returns: + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, BlipForQuestionAnswering + + >>> model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base") + >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-vqa-base") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> # training + >>> text = "How many cats are in the picture?" + >>> label = "2" + >>> inputs = processor(images=image, text=text, return_tensors="pt") + >>> labels = processor(text=label, return_tensors="pt").input_ids + + >>> inputs["labels"] = labels + >>> outputs = model(**inputs) + >>> loss = outputs.loss + >>> loss.backward() + + >>> # inference + >>> text = "How many cats are in the picture?" + >>> inputs = processor(images=image, text=text, return_tensors="pt") + >>> outputs = model.generate(**inputs) + >>> print(processor.decode(outputs[0], skip_special_tokens=True)) + 2 + ```""" + if labels is None and decoder_input_ids is None: + raise ValueError( + "Either `decoder_input_ids` or `labels` should be passed when calling `forward` with" + " `BlipForQuestionAnswering`. if you are training the model make sure that `labels` is passed, if you" + " are using the model for inference make sure that `decoder_input_ids` is passed or call `generate`" + ) + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + interpolate_pos_encoding=interpolate_pos_encoding, + ) + + image_embeds = vision_outputs[0] + image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long) + + question_embeds = self.text_encoder( + input_ids=input_ids, + attention_mask=attention_mask, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + return_dict=return_dict, + ) + + if labels is not None and decoder_input_ids is None: + # labels are already shifted right, see: https://github.com/huggingface/transformers/pull/23153 + decoder_input_ids = labels + + question_embeds = question_embeds[0] if not return_dict else question_embeds.last_hidden_state + + answer_output = self.text_decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + encoder_hidden_states=question_embeds, + encoder_attention_mask=attention_mask, + labels=labels, + return_dict=return_dict, + reduction="mean", + ) + + if labels is not None: + decoder_loss = answer_output.loss.mean() if return_dict else answer_output[0].mean() + else: + decoder_loss = None + + if not return_dict: + outputs = (decoder_loss, image_embeds, vision_outputs[0]) + vision_outputs[2:] + return tuple(output for output in outputs if output is not None) + + return BlipTextVisionModelOutput( + loss=decoder_loss, + image_embeds=image_embeds, + last_hidden_state=vision_outputs.last_hidden_state, + hidden_states=vision_outputs.hidden_states, + attentions=vision_outputs.attentions, + ) + + @torch.no_grad() + def generate( + self, + input_ids: torch.LongTensor, + pixel_values: torch.FloatTensor, + attention_mask: Optional[torch.LongTensor] = None, + interpolate_pos_encoding: bool = False, + **generate_kwargs, + ) -> torch.LongTensor: + r""" + Overrides *generate* function to be able to use the model as a conditional generator + + Parameters: + input_ids (*torch.LongTensor* of shape *(batch_size, sequence_length)*): + The sequence used as a prompt for the generation. + pixel_values (*torch.FloatTensor* of shape *(batch_size, num_channels, image_height, image_width)*: + Input image to be processed + attention_mask (*torch.LongTensor* of shape *(batch_size, sequence_length)*, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`. `1` for + tokens that are NOT MASKED, `0` for MASKED tokens. + **generate_kwargs: + Additional arguments passed to the *generate* function of the decoder + + + Examples: + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, BlipForQuestionAnswering + + >>> model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base") + >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-vqa-base") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> text = "How many cats are in the picture?" + + >>> inputs = processor(images=image, text=text, return_tensors="pt") + + >>> outputs = model.generate(**inputs) + >>> print(processor.decode(outputs[0], skip_special_tokens=True)) + 2 + ``` + """ + vision_outputs = self.vision_model( + pixel_values=pixel_values, + interpolate_pos_encoding=interpolate_pos_encoding, + ) + + image_embeds = vision_outputs[0] + + image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device) + + if isinstance(input_ids, list): + input_ids = torch.LongTensor(input_ids) + + question_outputs = self.text_encoder( + input_ids=input_ids, + attention_mask=attention_mask, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + return_dict=False, + ) + + question_embeds = question_outputs[0] + + question_attention_mask = torch.ones( + question_embeds.size()[:-1], dtype=torch.long, device=question_embeds.device + ) + + bos_ids = torch.full( + (question_embeds.size(0), 1), fill_value=self.decoder_start_token_id, device=question_embeds.device + ) + + outputs = self.text_decoder.generate( + input_ids=bos_ids, + eos_token_id=self.config.text_config.sep_token_id, + pad_token_id=self.config.text_config.pad_token_id, + encoder_hidden_states=question_embeds, + encoder_attention_mask=question_attention_mask, + **generate_kwargs, + ) + + return outputs + + +@add_start_docstrings( + """ + BLIP Model with a vision and text projector, and a classification head on top. The model is used in the context of + image-text retrieval. Given an image and a text, the model returns the probability of the text being relevant to + the image. + """, + BLIP_START_DOCSTRING, +) +class BlipForImageTextRetrieval(BlipPreTrainedModel): + config_class = BlipConfig + + def __init__(self, config: BlipConfig): + super().__init__(config) + + self.vision_model = BlipVisionModel(config.vision_config) + + self.text_encoder = BlipTextModel(config.text_config, add_pooling_layer=False) + + # vision projection layer + self.vision_proj = nn.Linear(config.vision_config.hidden_size, config.image_text_hidden_size) + + # text projection layer + self.text_proj = nn.Linear(config.text_config.hidden_size, config.image_text_hidden_size) + + # image text matching head + self.itm_head = nn.Linear(config.text_config.hidden_size, 2) + + self.decoder_pad_token_id = ( + config.text_config.pad_token_id + if not hasattr(config, "decoder_pad_token_id") + else config.decoder_pad_token_id + ) + self.decoder_start_token_id = ( + config.text_config.bos_token_id + if not hasattr(config, "decoder_start_token_id") + else config.decoder_start_token_id + ) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.text_encoder.get_input_embeddings() + + def set_input_embeddings(self, value): + self.text_encoder.set_input_embeddings(value) + + @add_start_docstrings_to_model_forward(BLIP_VISION_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=BlipTextVisionModelOutput, config_class=BlipVisionConfig) + def forward( + self, + input_ids: torch.LongTensor, + pixel_values: torch.FloatTensor, + use_itm_head: Optional[bool] = True, + attention_mask: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + ) -> Union[Tuple, BlipTextVisionModelOutput]: + r""" + Returns: + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, BlipForImageTextRetrieval + + >>> model = BlipForImageTextRetrieval.from_pretrained("Salesforce/blip-itm-base-coco") + >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-itm-base-coco") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> text = "an image of a cat" + + >>> inputs = processor(images=image, text=text, return_tensors="pt") + >>> outputs = model(**inputs) + ``` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + interpolate_pos_encoding=interpolate_pos_encoding, + ) + + image_embeds = vision_outputs[0] + image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long) + + if use_itm_head: + question_embeds = self.text_encoder( + input_ids=input_ids, + attention_mask=attention_mask, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_atts, + return_dict=return_dict, + ) + question_embeds = question_embeds[0] if not return_dict else question_embeds.last_hidden_state + + output = self.itm_head(question_embeds[:, 0, :]) + else: + question_embeds = self.text_encoder( + input_ids=input_ids, + attention_mask=attention_mask, + return_dict=return_dict, + ) + question_embeds = question_embeds[0] if not return_dict else question_embeds.last_hidden_state + + image_feat = normalize(self.vision_proj(image_embeds[:, 0, :]), dim=-1) + text_feat = normalize(self.text_proj(question_embeds[:, 0, :]), dim=-1) + + output = image_feat @ text_feat.t() + + if not return_dict: + outputs = (output, vision_outputs[0]) + vision_outputs[2:] + (question_embeds,) + return tuple(output for output in outputs if output is not None) + + return BlipImageTextMatchingModelOutput( + itm_score=output, + last_hidden_state=vision_outputs.last_hidden_state, + hidden_states=vision_outputs.hidden_states, + attentions=vision_outputs.attentions, + question_embeds=question_embeds, + ) + + +__all__ = [ + "BlipModel", + "BlipPreTrainedModel", + "BlipForConditionalGeneration", + "BlipForQuestionAnswering", + "BlipVisionModel", + "BlipTextModel", + "BlipForImageTextRetrieval", +] diff --git a/docs/transformers/build/lib/transformers/models/blip/modeling_blip_text.py b/docs/transformers/build/lib/transformers/models/blip/modeling_blip_text.py new file mode 100644 index 0000000000000000000000000000000000000000..f26f269c7b90fc8d5c11001531d95aa81db02729 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/blip/modeling_blip_text.py @@ -0,0 +1,960 @@ +# coding=utf-8 +# Copyright 2022 The Salesforce Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the BSD-3-clause license (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import math +from typing import List, Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import Tensor, device, nn +from torch.nn import CrossEntropyLoss + +from ...activations import ACT2FN +from ...generation import GenerationMixin +from ...modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, + CausalLMOutputWithCrossAttentions, +) +from ...modeling_utils import ( + PreTrainedModel, + apply_chunking_to_forward, + find_pruneable_heads_and_indices, + prune_linear_layer, +) +from ...utils import logging +from .configuration_blip import BlipTextConfig + + +logger = logging.get_logger(__name__) + + +# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L52 +class BlipTextEmbeddings(nn.Module): + """Construct the embeddings from word and position embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + + self.config = config + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + past_key_values_length: int = 0, + ) -> torch.Tensor: + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length] + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + + embeddings = inputs_embeds + + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L97 +class BlipTextSelfAttention(nn.Module): + def __init__(self, config, is_cross_attention): + super().__init__() + self.config = config + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention heads (%d)" + % (config.hidden_size, config.num_attention_heads) + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + if is_cross_attention: + self.key = nn.Linear(config.encoder_hidden_size, self.all_head_size) + self.value = nn.Linear(config.encoder_hidden_size, self.all_head_size) + else: + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + + def save_attn_gradients(self, attn_gradients): + self.attn_gradients = attn_gradients + + def get_attn_gradients(self): + return self.attn_gradients + + def save_attention_map(self, attention_map): + self.attention_map = attention_map + + def get_attention_map(self): + return self.attention_map + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + seq_length = hidden_states.size()[1] + position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BlipTextModel forward() function) + attention_scores = attention_scores + attention_mask.to(attention_scores.device) + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs_dropped = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs_dropped = attention_probs_dropped * head_mask + + context_layer = torch.matmul(attention_probs_dropped, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + outputs = outputs + (past_key_value,) + return outputs + + +# Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert -> BlipText +class BlipTextSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#242 +class BlipTextAttention(nn.Module): + def __init__(self, config, is_cross_attention=False): + super().__init__() + self.self = BlipTextSelfAttention(config, is_cross_attention) + self.output = BlipTextSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert -> BlipText +class BlipTextIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert -> BlipText +class BlipTextOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BlipTextLayer(nn.Module): + def __init__(self, config, layer_num): + super().__init__() + self.config = config + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = BlipTextAttention(config) + self.layer_num = layer_num + if self.config.is_decoder: + self.crossattention = BlipTextAttention(config, is_cross_attention=self.config.is_decoder) + self.intermediate = BlipTextIntermediate(config) + self.output = BlipTextOutput(config) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + + if encoder_hidden_states is not None: + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + output_attentions=output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output + ) + outputs = (layer_output,) + outputs + + outputs = outputs + (present_key_value,) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L386 +class BlipTextEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList([BlipTextLayer(config, i) for i in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + return_dict: Optional[bool] = True, + ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.is_decoder else None + + next_decoder_cache = () if use_cache else None + + for i in range(self.config.num_hidden_layers): + layer_module = self.layer[i] + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + layer_module.__call__, + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +# Copied from transformers.models.bert.modeling_bert.BertPooler with Bert->BlipText +class BlipTextPooler(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +# Copied from transformers.models.bert.modeling_bert.BertPredictionHeadTransform with Bert->BlipText +class BlipTextPredictionHeadTransform(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertLMPredictionHead with Bert->BlipText +class BlipTextLMPredictionHead(nn.Module): + def __init__(self, config): + super().__init__() + self.transform = BlipTextPredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` + self.decoder.bias = self.bias + + def _tie_weights(self): + self.decoder.bias = self.bias + + def forward(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertOnlyMLMHead with Bert->BlipText +class BlipTextOnlyMLMHead(nn.Module): + def __init__(self, config): + super().__init__() + self.predictions = BlipTextLMPredictionHead(config) + + def forward(self, sequence_output: torch.Tensor) -> torch.Tensor: + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L548 +class BlipTextPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = BlipTextConfig + base_model_prefix = "bert" + _no_split_modules = [] + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Embedding)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + +# Adapted from https://github.com/salesforce/BLIP/blob/3a29b7410476bf5f2ba0955827390eb6ea1f4f9d/models/med.py#L571 +class BlipTextModel(BlipTextPreTrainedModel): + """ + The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of + cross-attention is added between the self-attention layers, following the architecture described in [Attention is + all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, + Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. argument and `is_decoder` set to `True`; an + `encoder_hidden_states` is then expected as an input to the forward pass. + """ + + def __init__(self, config, add_pooling_layer=True): + super().__init__(config) + self.config = config + + self.embeddings = BlipTextEmbeddings(config) + self.encoder = BlipTextEncoder(config) + self.pooler = BlipTextPooler(config) if add_pooling_layer else None + + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + # Copied from transformers.models.bert.modeling_bert.BertModel._prune_heads + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def get_extended_attention_mask( + self, attention_mask: Tensor, input_shape: Tuple[int], device: device, is_decoder: bool + ) -> Tensor: + """ + Makes broadcastable attention and causal masks so that future and masked tokens are ignored. + + Arguments: + attention_mask (`torch.Tensor`): + Mask with ones indicating tokens to attend to, zeros for tokens to ignore. + input_shape (`Tuple[int]`): + The shape of the input to the model. + device (`torch.device`): + The device of the input to the model. + + Returns: + `torch.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`. + """ + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + if attention_mask.dim() == 3: + extended_attention_mask = attention_mask[:, None, :, :] + elif attention_mask.dim() == 2: + # Provided a padding mask of dimensions [batch_size, seq_length] + # - if the model is a decoder, apply a causal mask in addition to the padding mask + # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] + if is_decoder: + batch_size, seq_length = input_shape + + seq_ids = torch.arange(seq_length, device=device) + causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None] + # in case past_key_values are used we need to add a prefix ones mask to the causal mask + causal_mask = causal_mask.to(attention_mask.dtype) + + if causal_mask.shape[1] < attention_mask.shape[1]: + prefix_seq_len = attention_mask.shape[1] - causal_mask.shape[1] + causal_mask = torch.cat( + [ + torch.ones( + (batch_size, seq_length, prefix_seq_len), device=device, dtype=causal_mask.dtype + ), + causal_mask, + ], + axis=-1, + ) + + extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :] + else: + extended_attention_mask = attention_mask[:, None, None, :] + else: + raise ValueError( + "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format( + input_shape, attention_mask.shape + ) + ) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + extended_attention_mask = extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + return extended_attention_mask + + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + encoder_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + is_decoder: Optional[bool] = False, + ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: + r""" + encoder_hidden_states (`torch.FloatTensor`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`torch.FloatTensor`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) + input_shape = input_ids.size() + batch_size, seq_length = input_shape + device = input_ids.device + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + batch_size, seq_length = input_shape + device = inputs_embeds.device + elif encoder_embeds is not None: + input_shape = encoder_embeds.size()[:-1] + batch_size, seq_length = input_shape + device = encoder_embeds.device + else: + raise ValueError("You have to specify either input_ids or inputs_embeds or encoder_embeds") + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + + if attention_mask is None: + attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length))).to(device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + extended_attention_mask: torch.Tensor = self.get_extended_attention_mask( + attention_mask, input_shape, device, is_decoder + ) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if encoder_hidden_states is not None: + if isinstance(encoder_hidden_states, list): + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[0].size() + else: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + + if isinstance(encoder_attention_mask, list): + encoder_extended_attention_mask = [self.invert_attention_mask(mask) for mask in encoder_attention_mask] + elif encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + if encoder_embeds is None: + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + else: + embedding_output = encoder_embeds + + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + +# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L811 +class BlipTextLMHeadModel(BlipTextPreTrainedModel, GenerationMixin): + def __init__(self, config): + super().__init__(config) + + self.bert = BlipTextModel(config, add_pooling_layer=False) + self.cls = BlipTextOnlyMLMHead(config) + self.label_smoothing = config.label_smoothing + + def get_input_embeddings(self): + return self.bert.get_input_embeddings() + + def set_input_embeddings(self, new_embeddings): + self.bert.set_input_embeddings(new_embeddings) + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + self.cls.predictions.bias = new_embeddings.bias + + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + past_key_values: Optional[List[torch.Tensor]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + return_logits: Optional[bool] = False, + is_decoder: Optional[bool] = True, + reduction: Optional[str] = "mean", + ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]: + r""" + encoder_hidden_states (`torch.FloatTensor`, *optional*): Sequence of + hidden-states at the output of the last layer of the encoder. Used in the cross-attention if the model is + configured as a decoder. + encoder_attention_mask (`torch.FloatTensor`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + labels (`torch.LongTensor`, *optional*): + Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in + `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are + ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]` + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None: + use_cache = False + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + is_decoder=is_decoder, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + if return_logits: + return prediction_scores[:, :-1, :].contiguous() + + lm_loss = None + if labels is not None: + # we are doing next-token prediction; shift prediction scores and input ids by one + shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous() + labels = labels[:, 1:].contiguous().to(shifted_prediction_scores.device) + loss_fct = CrossEntropyLoss(reduction=reduction, label_smoothing=self.label_smoothing) + lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + if reduction == "none": + lm_loss = lm_loss.view(prediction_scores.size(0), -1).sum(1) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((lm_loss,) + output) if lm_loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=lm_loss, + logits=prediction_scores, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs): + # Overwrite -- hardcoded key return (`is_decoder=True`) + + input_shape = input_ids.shape + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + if attention_mask is None: + attention_mask = input_ids.new_ones(input_shape) + + # cut decoder_input_ids if past_key_values is used + if past_key_values is not None: + past_length = past_key_values[0][0].shape[2] + + # Some generation methods already pass only the last input ID + if input_ids.shape[1] > past_length: + remove_prefix_length = past_length + else: + # Default to old behavior: keep only final ID + remove_prefix_length = input_ids.shape[1] - 1 + + input_ids = input_ids[:, remove_prefix_length:] + + return { + "input_ids": input_ids, + "attention_mask": attention_mask, + "past_key_values": past_key_values, + "encoder_hidden_states": model_kwargs.get("encoder_hidden_states", None), + "encoder_attention_mask": model_kwargs.get("encoder_attention_mask", None), + "is_decoder": True, + } + + def _reorder_cache(self, past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past + + +__all__ = ["BlipTextModel", "BlipTextLMHeadModel", "BlipTextPreTrainedModel"] diff --git a/docs/transformers/build/lib/transformers/models/blip/modeling_tf_blip.py b/docs/transformers/build/lib/transformers/models/blip/modeling_tf_blip.py new file mode 100644 index 0000000000000000000000000000000000000000..9573ca0fbbc553a9fb9943f84dd142be5633d9b2 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/blip/modeling_tf_blip.py @@ -0,0 +1,1709 @@ +# coding=utf-8 +# Copyright 2023 The Salesforce Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""TensorFlow BLIP model.""" + +from __future__ import annotations + +import warnings +from dataclasses import dataclass +from typing import Any, Optional, Tuple, Union + +import tensorflow as tf + +from ...modeling_tf_outputs import TFBaseModelOutput, TFBaseModelOutputWithPooling +from ...modeling_tf_utils import ( + TFPreTrainedModel, + get_initializer, + get_tf_activation, + keras, + keras_serializable, + shape_list, + unpack_inputs, +) +from ...tf_utils import check_embeddings_within_bounds, stable_softmax +from ...utils import ( + ModelOutput, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from .configuration_blip import BlipConfig, BlipTextConfig, BlipVisionConfig +from .modeling_tf_blip_text import BLIP_TEXT_INPUTS_DOCSTRING, TFBlipTextLMHeadModel, TFBlipTextModel + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "Salesforce/blip-vqa-base" + + +# Copied from transformers.models.clip.modeling_tf_clip.contrastive_loss +def contrastive_loss(logits: tf.Tensor) -> tf.Tensor: + return tf.math.reduce_mean( + keras.metrics.sparse_categorical_crossentropy( + y_true=tf.range(shape_list(logits)[0]), y_pred=logits, from_logits=True + ) + ) + + +# Copied from transformers.models.clip.modeling_tf_clip.clip_loss with clip->blip +def blip_loss(similarity: tf.Tensor) -> tf.Tensor: + caption_loss = contrastive_loss(similarity) + image_loss = contrastive_loss(tf.transpose(similarity)) + return (caption_loss + image_loss) / 2.0 + + +@dataclass +class TFBlipForConditionalGenerationModelOutput(ModelOutput): + """ + Adapted from the base class for vision model's outputs that also contains image embeddings of the pooling of the + last hidden states. This class also adds the loss term from the text decoder. + + Args: + loss (`tf.Tensor`, *optional*, returned when `labels` is provided, `tf.Tensor` of shape `(1,)`): + Languge modeling loss from the text decoder. + logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`, *optional*): + Prediction scores of the language modeling head of the text decoder model. + image_embeds (`tf.Tensor` of shape `(batch_size, output_dim)`, *optional*): + The image embeddings obtained after applying the Vision Transformer model to the input image. + last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for + the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed): + Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads.` + """ + + loss: Tuple[tf.Tensor] | None = None + logits: Tuple[tf.Tensor] | None = None + image_embeds: tf.Tensor | None = None + last_hidden_state: Optional[tf.Tensor] = None + hidden_states: Tuple[tf.Tensor, ...] | None = None + attentions: Tuple[tf.Tensor, ...] | None = None + + @property + def decoder_logits(self): + warnings.warn( + "`decoder_logits` attribute is deprecated and will be removed in version 5 of Transformers." + " Please use the `logits` attribute to retrieve the final output instead.", + FutureWarning, + ) + return self.logits + + +@dataclass +class TFBlipTextVisionModelOutput(ModelOutput): + """ + Adapted from the base class for vision model's outputs that also contains image embeddings of the pooling of the + last hidden states. This class also adds the loss term from the text decoder. + + Args: + loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Languge modeling loss from the text decoder. + image_embeds (`tf.Tensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`): + The image embeddings obtained by applying the projection layer to the pooler_output. + last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for + the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: tf.Tensor | None = None + image_embeds: tf.Tensor | None = None + last_hidden_state: Optional[tf.Tensor] = None + hidden_states: Tuple[tf.Tensor, ...] | None = None + attentions: Tuple[tf.Tensor, ...] | None = None + + +@dataclass +class TFBlipImageTextMatchingModelOutput(ModelOutput): + """ + Adapted from the base class for vision model's outputs that also contains image embeddings of the pooling of the + last hidden states. This class also adds the loss term from the text decoder as well as the image-text similarity + scores. + + Args: + itm_score (`tf.Tensor`): + The image-text similarity scores. + loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Languge modeling loss from the text decoder. + image_embeds (`tf.Tensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`): + The image embeddings obtained by applying the projection layer to the pooler_output. + last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for + the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + vision_pooler_output (`tf.Tensor` of shape `(batch_size, hidden_size)`, *optional*): + Last layer hidden-state of the vision of the vision-only branch of the model. + attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + question_embeds (`tf.Tensor`): + The question embeddings obtained by the text projection layer. + """ + + itm_score: tf.Tensor | None = None + loss: tf.Tensor | None = None + image_embeds: tf.Tensor | None = None + last_hidden_state: Optional[tf.Tensor] = None + hidden_states: Tuple[tf.Tensor, ...] | None = None + vision_pooler_output: tf.Tensor | None = None + attentions: Tuple[tf.Tensor, ...] | None = None + question_embeds: Tuple[tf.Tensor] | None = None + + +@dataclass +class TFBlipOutput(ModelOutput): + """ + Args: + loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`): + Contrastive loss for image-text similarity. + logits_per_image:(`tf.Tensor` of shape `(image_batch_size, text_batch_size)`): + The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text + similarity scores. + logits_per_text:(`tf.Tensor` of shape `(text_batch_size, image_batch_size)`): + The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image + similarity scores. + text_embeds(`tf.Tensor` of shape `(batch_size, output_dim`): + The text embeddings obtained by applying the projection layer to the pooled output of [`BlipTextModel`]. + image_embeds(`tf.Tensor` of shape `(batch_size, output_dim`): + The image embeddings obtained by applying the projection layer to the pooled output of [`BlipVisionModel`]. + text_model_output(`BaseModelOutputWithPooling`): + The output of the [`BlipTextModel`]. + vision_model_output(`BaseModelOutputWithPooling`): + The output of the [`BlipVisionModel`]. + """ + + loss: tf.Tensor | None = None + logits_per_image: Optional[tf.Tensor] = None + logits_per_text: Optional[tf.Tensor] = None + text_embeds: Optional[tf.Tensor] = None + image_embeds: Optional[tf.Tensor] = None + text_model_output: TFBaseModelOutputWithPooling = None + vision_model_output: TFBaseModelOutputWithPooling = None + + def to_tuple(self) -> Tuple[Any]: + return tuple( + self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple() + for k in self.keys() + ) + + +class TFBlipVisionEmbeddings(keras.layers.Layer): + def __init__(self, config: BlipVisionConfig, **kwargs): + super().__init__(**kwargs) + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.patch_embedding = keras.layers.Conv2D( + filters=self.embed_dim, + kernel_size=self.patch_size, + strides=self.patch_size, + kernel_initializer=get_initializer(self.config.initializer_range), + data_format="channels_last", + name="patch_embedding", + ) + + self.num_patches = (self.image_size // self.patch_size) ** 2 + self.num_positions = self.num_patches + 1 + + def build(self, input_shape=None): + self.class_embedding = self.add_weight( + shape=(1, 1, self.embed_dim), + initializer=get_initializer(self.config.initializer_range), + trainable=True, + name="class_embedding", + ) + + self.position_embedding = self.add_weight( + shape=(1, self.num_positions, self.embed_dim), + initializer=get_initializer(self.config.initializer_range), + trainable=True, + name="position_embedding", + ) + + if self.built: + return + self.built = True + if getattr(self, "patch_embedding", None) is not None: + with tf.name_scope(self.patch_embedding.name): + self.patch_embedding.build([None, None, None, 3]) + + def call(self, pixel_values: tf.Tensor) -> tf.Tensor: + # Input is channels-first, we transpose. PyTorch transposes after the conv because PyTorch + # likes channels-first convs. + batch_size = tf.shape(pixel_values)[0] + pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1)) + patch_embeds = self.patch_embedding(pixel_values) + patch_embeds = tf.reshape(patch_embeds, (batch_size, self.num_patches, -1)) + + class_embeds = tf.broadcast_to(self.class_embedding, (batch_size, 1, self.embed_dim)) + embeddings = tf.concat([class_embeds, patch_embeds], axis=1) + embeddings = embeddings + self.position_embedding[:, : tf.shape(embeddings)[1], :] + return embeddings + + +# Copied from transformers.models.clip.modeling_tf_clip.TFCLIPTextEmbeddings with CLIP->Blip +class TFBlipTextEmbeddings(keras.layers.Layer): + def __init__(self, config: BlipTextConfig, **kwargs): + super().__init__(**kwargs) + + self.embed_dim = config.hidden_size + + self.config = config + + def build(self, input_shape: tf.TensorShape = None): + with tf.name_scope("token_embedding"): + self.weight = self.add_weight( + shape=(self.config.vocab_size, self.embed_dim), + initializer=get_initializer(self.config.initializer_factor * self.config.initializer_range), + trainable=True, + name="weight", + ) + + with tf.name_scope("position_embedding"): + self.position_embedding = self.add_weight( + shape=(self.config.max_position_embeddings, self.embed_dim), + initializer=get_initializer(self.config.initializer_factor * self.config.initializer_range), + trainable=True, + name="embeddings", + ) + + super().build(input_shape) + + def call( + self, + input_ids: Optional[tf.Tensor] = None, + position_ids: Optional[tf.Tensor] = None, + inputs_embeds: Optional[tf.Tensor] = None, + ) -> tf.Tensor: + """ + Applies embedding based on inputs tensor. + + Returns: + final_embeddings (`tf.Tensor`): output embedding tensor. + """ + if input_ids is None and inputs_embeds is None: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if inputs_embeds is None: + check_embeddings_within_bounds(input_ids, self.config.vocab_size) + inputs_embeds = tf.gather(params=self.weight, indices=input_ids) + + input_shape = shape_list(inputs_embeds)[:-1] + + if position_ids is None: + position_ids = tf.expand_dims(tf.range(start=0, limit=input_shape[-1]), axis=0) + + position_embeds = tf.gather(params=self.position_embedding, indices=position_ids) + position_embeds = tf.tile(input=position_embeds, multiples=(input_shape[0], 1, 1)) + final_embeddings = inputs_embeds + position_embeds + + return final_embeddings + + +class TFBlipAttention(keras.layers.Layer): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." + ) + self.scale = self.head_dim**-0.5 + self.dropout = keras.layers.Dropout(config.attention_dropout, name="dropout") + + self.qkv = keras.layers.Dense( + 3 * self.embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="qkv" + ) + + self.projection = keras.layers.Dense( + self.embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="projection" + ) + + def call( + self, + hidden_states: tf.Tensor, + head_mask: tf.Tensor | None = None, + output_attentions: Optional[bool] = False, + training: Optional[bool] = None, + ) -> Tuple[tf.Tensor, tf.Tensor | None, Tuple[tf.Tensor] | None]: + """Input shape: Batch x Time x Channel""" + + bsz, tgt_len, embed_dim = shape_list(hidden_states) + + mixed_qkv = self.qkv(hidden_states) + mixed_qkv = tf.reshape(mixed_qkv, (bsz, tgt_len, 3, self.num_heads, self.head_dim)) + mixed_qkv = tf.transpose(mixed_qkv, perm=(2, 0, 3, 1, 4)) + + query_states, key_states, value_states = mixed_qkv[0], mixed_qkv[1], mixed_qkv[2] + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = query_states @ tf.transpose(key_states, (0, 1, 3, 2)) + + attention_scores = attention_scores * self.scale + + # Normalize the attention scores to probabilities. + attention_probs = stable_softmax(attention_scores, axis=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs, training=training) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = tf.transpose(attention_probs @ value_states, perm=(0, 2, 1, 3)) + + new_context_layer_shape = shape_list(context_layer)[:-2] + [self.embed_dim] + context_layer = tf.reshape(context_layer, new_context_layer_shape) + + output = self.projection(context_layer) + + outputs = (output, attention_probs) if output_attentions else (output, None) + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dropout", None) is not None: + with tf.name_scope(self.dropout.name): + self.dropout.build(None) + if getattr(self, "qkv", None) is not None: + with tf.name_scope(self.qkv.name): + self.qkv.build([None, None, self.embed_dim]) + if getattr(self, "projection", None) is not None: + with tf.name_scope(self.projection.name): + self.projection.build([None, None, self.embed_dim]) + + +class TFBlipMLP(keras.layers.Layer): + def __init__(self, config: BlipConfig, **kwargs): + super().__init__(**kwargs) + + self.activation_fn = get_tf_activation(config.hidden_act) + + in_proj_std = (config.hidden_size**-0.5) * ((2 * config.num_hidden_layers) ** -0.5) + fc_std = (2 * config.hidden_size) ** -0.5 + + self.fc1 = keras.layers.Dense( + units=config.intermediate_size, kernel_initializer=get_initializer(fc_std), name="fc1" + ) + self.fc2 = keras.layers.Dense( + units=config.hidden_size, kernel_initializer=get_initializer(in_proj_std), name="fc2" + ) + self.config = config + + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: + hidden_states = self.fc1(inputs=hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(inputs=hidden_states) + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "fc1", None) is not None: + with tf.name_scope(self.fc1.name): + self.fc1.build([None, None, self.config.hidden_size]) + if getattr(self, "fc2", None) is not None: + with tf.name_scope(self.fc2.name): + self.fc2.build([None, None, self.config.intermediate_size]) + + +class TFBlipEncoderLayer(keras.layers.Layer): + def __init__(self, config: BlipConfig, **kwargs): + super().__init__(**kwargs) + self.embed_dim = config.hidden_size + self.self_attn = TFBlipAttention(config, name="self_attn") + self.layer_norm1 = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm1") + self.mlp = TFBlipMLP(config, name="mlp") + self.layer_norm2 = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm2") + + def call( + self, + hidden_states: tf.Tensor, + attention_mask: tf.Tensor, + output_attentions: Optional[bool] = False, + training: Optional[bool] = None, + ) -> Tuple[tf.Tensor]: + """ + Args: + hidden_states (`tf.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`tf.Tensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + `(config.encoder_attention_heads,)`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + hidden_states = self.layer_norm1(hidden_states) + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + head_mask=attention_mask, + output_attentions=output_attentions, + training=training, + ) + hidden_states = hidden_states + residual + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + + hidden_states = hidden_states + residual + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self_attn", None) is not None: + with tf.name_scope(self.self_attn.name): + self.self_attn.build(None) + if getattr(self, "layer_norm1", None) is not None: + with tf.name_scope(self.layer_norm1.name): + self.layer_norm1.build([None, None, self.embed_dim]) + if getattr(self, "mlp", None) is not None: + with tf.name_scope(self.mlp.name): + self.mlp.build(None) + if getattr(self, "layer_norm2", None) is not None: + with tf.name_scope(self.layer_norm2.name): + self.layer_norm2.build([None, None, self.embed_dim]) + + +class TFBlipPreTrainedModel(TFPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = BlipConfig + base_model_prefix = "blip" + _keys_to_ignore_on_load_missing = [r"position_ids"] + + +BLIP_START_DOCSTRING = r""" + This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`BlipConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights. +""" + +BLIP_VISION_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`tf.Tensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using + [`BlipImageProcessor`]. See [`BlipImageProcessor.__call__`] for details. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + +BLIP_INPUTS_DOCSTRING = r""" + Args: + input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoProcessor`]. See [`BlipProcessor.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + position_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + pixel_values (`tf.Tensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using + [`BlipImageProcessor`]. See [`BlipImageProcessor.__call__`] for details. + return_loss (`bool`, *optional*): + Whether or not to return the contrastive loss. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@keras_serializable +class TFBlipEncoder(keras.layers.Layer): + config_class = BlipConfig + """ + Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a + [`BlipEncoderLayer`]. + + Args: + config (`BlipConfig`): + The corresponding vision configuration for the `BlipEncoder`. + """ + + def __init__(self, config: BlipConfig, **kwargs): + super().__init__(**kwargs) + self.config = config + self.layers = [TFBlipEncoderLayer(config, name=f"layers_._{i}") for i in range(config.num_hidden_layers)] + + @unpack_inputs + def call( + self, + inputs_embeds, + attention_mask: tf.Tensor | None = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: Optional[bool] = None, + ) -> Union[Tuple, TFBaseModelOutput]: + r""" + Args: + inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`): + Embedded representation of the inputs. Should be float, not int tokens. + attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + hidden_states = inputs_embeds + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + output_attentions=output_attentions, + training=training, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return TFBaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "layers", None) is not None: + for layer in self.layers: + with tf.name_scope(layer.name): + layer.build(None) + + +class TFBlipVisionModel(TFBlipPreTrainedModel): + main_input_name = "pixel_values" + config_class = BlipVisionConfig + + def __init__(self, config: BlipVisionConfig, *args, **kwargs): + super().__init__(config, *args, **kwargs) + self.config = config + + self.embeddings = TFBlipVisionEmbeddings(config, name="embeddings") + self.encoder = TFBlipEncoder(config, name="encoder") + self.post_layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="post_layernorm") + self.embed_dim = config.hidden_size + + def serving_output(self, output: TFBaseModelOutputWithPooling) -> TFBaseModelOutputWithPooling: + hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None + attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None + + return TFBaseModelOutputWithPooling( + last_hidden_state=output.last_hidden_state, + pooler_output=output.pooler_output, + hidden_states=hs, + attentions=attns, + ) + + @unpack_inputs + @add_start_docstrings_to_model_forward(BLIP_VISION_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=TFBaseModelOutputWithPooling, config_class=BlipVisionConfig) + def call( + self, + pixel_values: tf.Tensor | None = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: Optional[bool] = None, + ) -> Union[Tuple, TFBaseModelOutputWithPooling]: + r""" + Returns: + + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + hidden_states = self.embeddings(pixel_values) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + last_hidden_state = encoder_outputs[0] + last_hidden_state = self.post_layernorm(last_hidden_state) + + pooled_output = last_hidden_state[:, 0, :] + # TF gets confused if we call the layer with inputs of different ranks, so insert a singleton dimension + pooled_output = self.post_layernorm(tf.expand_dims(pooled_output, 1)) + pooled_output = tf.squeeze(pooled_output, 1) + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return TFBaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + def get_input_embeddings(self): + return self.embeddings + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embeddings", None) is not None: + with tf.name_scope(self.embeddings.name): + self.embeddings.build(None) + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "post_layernorm", None) is not None: + with tf.name_scope(self.post_layernorm.name): + self.post_layernorm.build([None, None, self.embed_dim]) + + +class TFBlipMainLayer(keras.layers.Layer): + config_class = BlipConfig + + def __init__(self, config: BlipConfig, *args, **kwargs): + super().__init__(*args, **kwargs) + + if not isinstance(config.text_config, BlipTextConfig): + raise TypeError( + "config.text_config is expected to be of type BlipTextConfig but is of type" + f" {type(config.text_config)}." + ) + + if not isinstance(config.vision_config, BlipVisionConfig): + raise TypeError( + "config.vision_config is expected to be of type BlipVisionConfig but is of type" + f" {type(config.vision_config)}." + ) + + text_config = config.text_config + vision_config = config.vision_config + + self.projection_dim = config.projection_dim + self.text_embed_dim = text_config.hidden_size + self.vision_embed_dim = vision_config.hidden_size + + self.text_model = TFBlipTextModel(text_config, name="text_model") + self.vision_model = TFBlipVisionModel(vision_config, name="vision_model") + + self.visual_projection = keras.layers.Dense( + self.projection_dim, + use_bias=False, + kernel_initializer=get_initializer(config.initializer_range), + name="visual_projection", + ) + self.text_projection = keras.layers.Dense( + self.projection_dim, + use_bias=False, + kernel_initializer=get_initializer(config.initializer_range), + name="text_projection", + ) + + self.config = config + + def build(self, input_shape=None): + self.logit_scale = self.add_weight( + name="logit_scale", + shape=[], + initializer=keras.initializers.Constant(self.config.logit_scale_init_value), + trainable=True, + ) + + if self.built: + return + self.built = True + if getattr(self, "text_model", None) is not None: + with tf.name_scope(self.text_model.name): + self.text_model.build(None) + if getattr(self, "vision_model", None) is not None: + with tf.name_scope(self.vision_model.name): + self.vision_model.build(None) + if getattr(self, "visual_projection", None) is not None: + with tf.name_scope(self.visual_projection.name): + self.visual_projection.build([None, None, self.vision_embed_dim]) + if getattr(self, "text_projection", None) is not None: + with tf.name_scope(self.text_projection.name): + self.text_projection.build([None, None, self.text_embed_dim]) + + @unpack_inputs + def call( + self, + input_ids: tf.Tensor | None = None, + pixel_values: tf.Tensor | None = None, + attention_mask: tf.Tensor | None = None, + position_ids: tf.Tensor | None = None, + return_loss: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: Optional[bool] = None, + ) -> Union[Tuple, TFBlipOutput]: + # Use BLIP model's config for some fields (if specified) instead of those of vision & text components. + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + text_outputs = self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + image_embeds = vision_outputs[1] + image_embeds = self.visual_projection(image_embeds) + + text_embeds = text_outputs[1] + text_embeds = self.text_projection(text_embeds) + + # normalized features + image_embeds = image_embeds / tf.norm(image_embeds, ord=2, axis=-1, keepdims=True) + text_embeds = text_embeds / tf.norm(text_embeds, ord=2, axis=-1, keepdims=True) + + # cosine similarity as logits + logit_scale = tf.exp(self.logit_scale) + logits_per_text = tf.matmul(text_embeds, image_embeds, transpose_b=True) * logit_scale + logits_per_image = tf.transpose(logits_per_text) + + loss = None + if return_loss: + loss = blip_loss(logits_per_text) + loss = tf.reshape(loss, (1,)) + + if not return_dict: + output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs) + return ((loss,) + output) if loss is not None else output + + return TFBlipOutput( + loss=loss, + logits_per_image=logits_per_image, + logits_per_text=logits_per_text, + text_embeds=text_embeds, + image_embeds=image_embeds, + text_model_output=text_outputs, + vision_model_output=vision_outputs, + ) + + +class TFBlipModel(TFBlipPreTrainedModel): + config_class = BlipConfig + _keys_to_ignore_on_load_missing = [r"text_decoder.cls.predictions.decoder.bias"] + main_input_name = "input_ids" + + def __init__(self, config: BlipConfig, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.blip = TFBlipMainLayer(config, name="blip") + + def serving_output(self, output: TFBlipOutput) -> TFBlipOutput: + return TFBlipOutput( + logits_per_image=output.logits_per_image, + logits_per_text=output.logits_per_text, + text_embeds=output.text_embeds, + image_embeds=output.image_embeds, + ) + + @unpack_inputs + @add_start_docstrings_to_model_forward(BLIP_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=TFBlipOutput, config_class=BlipConfig) + def call( + self, + input_ids: tf.Tensor | None = None, + pixel_values: tf.Tensor | None = None, + attention_mask: tf.Tensor | None = None, + position_ids: tf.Tensor | None = None, + return_loss: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: Optional[bool] = None, + ) -> Union[Tuple, TFBlipOutput]: + r""" + Returns: + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, TFBlipModel + + >>> model = TFBlipModel.from_pretrained("Salesforce/blip-image-captioning-base") + >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor( + ... text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="tf", padding=True + ... ) + + >>> outputs = model(**inputs) + >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score + >>> probs = tf.nn.softmax(logits_per_image, axis=1) # we can take the softmax to get the label probabilities + ```""" + outputs = self.blip( + input_ids=input_ids, + pixel_values=pixel_values, + attention_mask=attention_mask, + position_ids=position_ids, + return_loss=return_loss, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + return outputs + + @add_start_docstrings_to_model_forward(BLIP_TEXT_INPUTS_DOCSTRING) + def get_text_features( + self, + input_ids: tf.Tensor | None = None, + attention_mask: tf.Tensor | None = None, + position_ids: tf.Tensor | None = None, + return_dict: Optional[bool] = None, + ) -> tf.Tensor: + r""" + Returns: + text_features (`tf.Tensor` of shape `(batch_size, output_dim`): The text embeddings obtained by applying + the projection layer to the pooled output of [`TFBlipTextModel`]. + + Examples: + + ```python + >>> from transformers import AutoProcessor, TFBlipModel + + >>> model = TFBlipModel.from_pretrained("Salesforce/blip-image-captioning-base") + >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base") + + >>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="tf") + >>> text_features = model.get_text_features(**inputs) + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + text_outputs = self.blip.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + return_dict=return_dict, + ) + + pooled_output = text_outputs[1] + text_features = self.blip.text_projection(pooled_output) + + return text_features + + @add_start_docstrings_to_model_forward(BLIP_VISION_INPUTS_DOCSTRING) + def get_image_features( + self, + pixel_values: tf.Tensor | None = None, + return_dict: Optional[bool] = None, + ) -> tf.Tensor: + r""" + Returns: + image_features (`tf.Tensor` of shape `(batch_size, output_dim`): The image embeddings obtained by applying + the projection layer to the pooled output of [`TFBlipVisionModel`]. + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, TFBlipModel + + >>> model = TFBlipModel.from_pretrained("Salesforce/blip-image-captioning-base") + >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="tf") + + >>> image_features = model.get_image_features(**inputs) + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + vision_outputs = self.blip.vision_model(pixel_values=pixel_values, return_dict=return_dict) + + pooled_output = vision_outputs[1] # pooled_output + image_features = self.blip.visual_projection(pooled_output) + + return image_features + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "blip", None) is not None: + with tf.name_scope(self.blip.name): + self.blip.build(None) + + +@add_start_docstrings( + """ + BLIP Model for image captioning. The model consists of a vision encoder and a text decoder. One can optionally pass + `input_ids` to the model, which serve as a text prompt, to make the text decoder continue the prompt. Otherwise, + the decoder starts generating text from the [BOS] (beginning-of-sequence) token. will start generating the caption + from the text input. If no text input is provided, the decoder will start with the [BOS] token only. + """, + BLIP_START_DOCSTRING, +) +class TFBlipForConditionalGeneration(TFBlipPreTrainedModel): + config_class = BlipConfig + _keys_to_ignore_on_load_missing = [r"text_decoder.cls.predictions.decoder.bias"] + main_input_name = "pixel_values" + + def __init__(self, config: BlipConfig, *args, **kwargs): + super().__init__(config, *args, **kwargs) + + self.vision_model = TFBlipVisionModel(config.vision_config, name="vision_model") + + self.text_decoder = TFBlipTextLMHeadModel(config.text_config, name="text_decoder") + + self.decoder_input_ids = config.text_config.bos_token_id + self.decoder_pad_token_id = config.text_config.pad_token_id + + def get_input_embeddings(self) -> keras.layers.Layer: + return self.vision_model.embeddings.patch_embedding + + @unpack_inputs + @add_start_docstrings_to_model_forward(BLIP_VISION_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=TFBlipForConditionalGenerationModelOutput, config_class=BlipConfig) + def call( + self, + pixel_values: tf.Tensor, + input_ids: tf.Tensor | None = None, + attention_mask: tf.Tensor | None = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + labels: tf.Tensor | None = None, + return_dict: Optional[bool] = None, + training: Optional[bool] = None, + ) -> Union[Tuple, TFBlipForConditionalGenerationModelOutput]: + r""" + Returns: + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, TFBlipForConditionalGeneration + + >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base") + >>> model = TFBlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> text = "A picture of" + + >>> inputs = processor(images=image, text=text, return_tensors="tf") + + >>> outputs = model(**inputs) + ```""" + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + image_embeds = vision_outputs[0] + + outputs = self.text_decoder( + input_ids=input_ids, + attention_mask=attention_mask, + encoder_hidden_states=image_embeds, + labels=labels, + return_dict=False, + training=training, + ) + + if not return_dict: + outputs = (outputs[0], outputs[1], image_embeds, vision_outputs[0]) + vision_outputs[2:] + return tuple(output for output in outputs if output is not None) + + if labels is not None: + loss = outputs[0] + logits = outputs[1] + else: + loss = None + logits = outputs[0] + + if loss is not None and loss.shape.rank == 0: + loss = tf.reshape(loss, (1,)) + + return TFBlipForConditionalGenerationModelOutput( + loss=loss, + logits=logits, + image_embeds=image_embeds, + last_hidden_state=vision_outputs.last_hidden_state, + hidden_states=vision_outputs.hidden_states, + attentions=vision_outputs.attentions, + ) + + def generate( + self, + pixel_values: tf.Tensor, + input_ids: tf.Tensor | None = None, + attention_mask: tf.Tensor | None = None, + **generate_kwargs, + ) -> tf.Tensor: + r""" + Overrides *generate* function to be able to use the model as a conditional generator + + Parameters: + pixel_values (`tf.Tensor` of shape `(batch_size, num_channels, image_height, image_width)`: + Input image to be processed + input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + The sequence used as a prompt for the generation. + attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + + Examples: + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, TFBlipForConditionalGeneration + + >>> model = TFBlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base") + >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="tf") + + >>> outputs = model.generate(**inputs) + >>> print(processor.decode(outputs[0], skip_special_tokens=True)) + two cats sleeping on a couch + ``` + """ + + batch_size = pixel_values.shape[0] + vision_outputs = self.vision_model(pixel_values=pixel_values) + + image_embeds = vision_outputs[0] + + image_attention_mask = tf.ones(shape_list(image_embeds)[:-1], dtype=tf.int32) + + if isinstance(input_ids, list): + input_ids = tf.convert_to_tensor(input_ids, dtype=tf.int32) + elif input_ids is None: + input_ids = tf.convert_to_tensor( + [[self.decoder_input_ids, self.config.text_config.eos_token_id]], dtype=tf.int32 + ) + + input_ids = tf.tile(input_ids, (batch_size, 1)) + + # PyTorch: input_ids[:, 0] = self.config.text_config.bos_token_id + input_ids = tf.concat( + [tf.ones((batch_size, 1), dtype=tf.int32) * self.config.text_config.bos_token_id, input_ids[:, 1:]], axis=1 + ) + attention_mask = attention_mask[:, :-1] if attention_mask is not None else None + + outputs = self.text_decoder.generate( + input_ids=input_ids[:, :-1], + eos_token_id=self.config.text_config.sep_token_id, + pad_token_id=self.config.text_config.pad_token_id, + attention_mask=attention_mask, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + **generate_kwargs, + ) + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "vision_model", None) is not None: + with tf.name_scope(self.vision_model.name): + self.vision_model.build(None) + if getattr(self, "text_decoder", None) is not None: + with tf.name_scope(self.text_decoder.name): + self.text_decoder.build(None) + + +@add_start_docstrings( + """ + BLIP Model for visual question answering. The model consists of a vision encoder, a text encoder as well as a text + decoder. The vision encoder will encode the input image, the text encoder will encode the input question together + with the encoding of the image, and the text decoder will output the answer to the question. + """, + BLIP_START_DOCSTRING, +) +class TFBlipForQuestionAnswering(TFBlipPreTrainedModel): + config_class = BlipConfig + _keys_to_ignore_on_load_missing = [r"text_decoder.cls.predictions.decoder.bias"] + + def __init__(self, config: BlipConfig, *args, **kwargs): + super().__init__(config, *args, **kwargs) + + self.vision_model = TFBlipVisionModel(config.vision_config, name="vision_model") + + self.text_encoder = TFBlipTextModel(config.text_config, name="text_encoder", add_pooling_layer=False) + + self.text_decoder = TFBlipTextLMHeadModel(config.text_config, name="text_decoder") + + self.decoder_pad_token_id = config.text_config.pad_token_id + self.decoder_start_token_id = config.text_config.bos_token_id + + def get_input_embeddings(self) -> keras.layers.Layer: + return self.vision_model.embeddings.patch_embedding + + # Adapted from transformers.models.t5.modeling_tf_t5.TFT5PreTrainedModel._shift_right + def _shift_right(self, input_ids): + decoder_start_token_id = self.decoder_start_token_id + pad_token_id = self.decoder_pad_token_id + + if decoder_start_token_id is None or pad_token_id is None: + raise ValueError("decoder_start_token_id and pad_token_id must be defined!") + + start_tokens = tf.fill((shape_list(input_ids)[0], 1), decoder_start_token_id) + start_tokens = tf.cast(start_tokens, input_ids.dtype) # Ensure compatible dtypes for concatenation + shifted_input_ids = tf.concat([start_tokens, input_ids[:, :-1]], -1) + + # replace possible -100 values in labels by `pad_token_id` + shifted_input_ids = tf.where( + shifted_input_ids == -100, + tf.cast(tf.fill(shape_list(shifted_input_ids), pad_token_id), shifted_input_ids.dtype), + shifted_input_ids, + ) + + # "Verify that `labels` has only positive values and -100" + tf.debugging.assert_greater_equal(shifted_input_ids, tf.constant(0, dtype=shifted_input_ids.dtype)) + + return shifted_input_ids + + @unpack_inputs + @add_start_docstrings_to_model_forward(BLIP_VISION_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=TFBlipTextVisionModelOutput, config_class=BlipVisionConfig) + def call( + self, + input_ids: tf.Tensor, + pixel_values: tf.Tensor | None = None, + decoder_input_ids: tf.Tensor | None = None, + decoder_attention_mask: tf.Tensor | None = None, + attention_mask: tf.Tensor | None = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + labels: tf.Tensor | None = None, + return_dict: Optional[bool] = None, + training: Optional[bool] = None, + ) -> Union[Tuple, TFBlipTextVisionModelOutput]: + r""" + Returns: + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, TFBlipForQuestionAnswering + + >>> model = TFBlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base") + >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-vqa-base") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> # training + >>> text = "How many cats are in the picture?" + >>> label = "2" + >>> inputs = processor(images=image, text=text, return_tensors="tf") + >>> labels = processor(text=label, return_tensors="tf").input_ids + + >>> inputs["labels"] = labels + >>> outputs = model(**inputs) + >>> loss = outputs.loss + + >>> # inference + >>> text = "How many cats are in the picture?" + >>> inputs = processor(images=image, text=text, return_tensors="tf") + >>> outputs = model.generate(**inputs) + >>> print(processor.decode(outputs[0], skip_special_tokens=True)) + 2 + ```""" + if labels is None and decoder_input_ids is None: + raise ValueError( + "Either `decoder_input_ids` or `labels` should be passed when calling" + " `TFBlipForQuestionAnswering`. if you are training the model make sure that `labels` is passed, if you" + " are using the model for inference make sure that `decoder_input_ids` is passed or call `generate`" + ) + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + image_embeds = vision_outputs[0] + image_attention_mask = tf.ones(shape_list(image_embeds)[:-1], dtype=tf.int64) + + question_embeds = self.text_encoder( + input_ids=input_ids, + attention_mask=attention_mask, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + return_dict=return_dict, + training=training, + ) + + question_embeds = question_embeds[0] if not return_dict else question_embeds.last_hidden_state + + if labels is not None and decoder_input_ids is None: + # labels are already shifted right, see: https://github.com/huggingface/transformers/pull/23153 + decoder_input_ids = labels + + answer_output = self.text_decoder( + input_ids=decoder_input_ids, + attention_mask=decoder_attention_mask, + encoder_hidden_states=question_embeds, + encoder_attention_mask=attention_mask, + labels=labels, + return_dict=return_dict, + training=training, + ) + + if labels is not None: + decoder_loss = tf.reduce_mean(answer_output.loss) if return_dict else tf.reduce_mean(answer_output[0]) + else: + decoder_loss = None + + if not return_dict: + outputs = (decoder_loss, image_embeds, vision_outputs[0]) + vision_outputs[2:] + return tuple(output for output in outputs if output is not None) + + return TFBlipTextVisionModelOutput( + loss=decoder_loss, + image_embeds=image_embeds, + last_hidden_state=vision_outputs.last_hidden_state, + hidden_states=vision_outputs.hidden_states, + attentions=vision_outputs.attentions, + ) + + def generate( + self, + input_ids: tf.Tensor, + pixel_values: tf.Tensor, + attention_mask: tf.Tensor | None = None, + **generate_kwargs, + ) -> tf.Tensor: + r""" + Overrides *generate* function to be able to use the model as a conditional generator + + Parameters: + input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`): + The sequence used as a prompt for the generation. + pixel_values (`tf.Tensor` of shape `(batch_size, num_channels, image_height, image_width)`: + Input image to be processed + attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`. `1` for + tokens that are NOT MASKED, `0` for MASKED tokens. + generate_kwargs (dict, *optional*): + Additional arguments passed to the `generate` function of the decoder + + + Examples: + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, TFBlipForQuestionAnswering + + >>> model = TFBlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base") + >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-vqa-base") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> text = "How many cats are in the picture?" + + >>> inputs = processor(images=image, text=text, return_tensors="tf") + + >>> outputs = model.generate(**inputs) + >>> print(processor.decode(outputs[0], skip_special_tokens=True)) + 2 + ``` + """ + vision_outputs = self.vision_model(pixel_values=pixel_values) + + image_embeds = vision_outputs[0] + + image_attention_mask = tf.ones(shape_list(image_embeds)[:-1], dtype=tf.int32) + + if isinstance(input_ids, list): + input_ids = tf.Tensor(input_ids) + + question_outputs = self.text_encoder( + input_ids=input_ids, + attention_mask=attention_mask, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + return_dict=False, + ) + + question_embeds = question_outputs[0] + + question_attention_mask = tf.ones(shape_list(question_embeds)[:-1], dtype=tf.int32) + + bos_ids = tf.fill( + (tf.shape(question_embeds)[0], 1), value=tf.cast(self.decoder_start_token_id, input_ids.dtype) + ) + + outputs = self.text_decoder.generate( + input_ids=bos_ids, + eos_token_id=self.config.text_config.sep_token_id, + pad_token_id=self.config.text_config.pad_token_id, + encoder_hidden_states=question_embeds, + encoder_attention_mask=question_attention_mask, + **generate_kwargs, + ) + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "vision_model", None) is not None: + with tf.name_scope(self.vision_model.name): + self.vision_model.build(None) + if getattr(self, "text_encoder", None) is not None: + with tf.name_scope(self.text_encoder.name): + self.text_encoder.build(None) + if getattr(self, "text_decoder", None) is not None: + with tf.name_scope(self.text_decoder.name): + self.text_decoder.build(None) + + +@add_start_docstrings( + """ + BLIP Model with a vision and text projector, and a classification head on top. The model is used in the context of + image-text retrieval. Given an image and a text, the model returns the probability of the text being relevant to + the image. + """, + BLIP_START_DOCSTRING, +) +class TFBlipForImageTextRetrieval(TFBlipPreTrainedModel): + config_class = BlipConfig + + def __init__(self, config: BlipConfig, *args, **kwargs): + super().__init__(config, *args, **kwargs) + + self.vision_model = TFBlipVisionModel(config.vision_config, name="vision_model") + + self.text_encoder = TFBlipTextModel(config.text_config, name="text_encoder", add_pooling_layer=False) + + # vision projection layer + self.vision_proj = keras.layers.Dense( + config.image_text_hidden_size, + kernel_initializer=get_initializer(config.initializer_range), + name="vision_proj", + ) + + # text projection layer + self.text_proj = keras.layers.Dense( + config.image_text_hidden_size, + kernel_initializer=get_initializer(config.initializer_range), + name="text_proj", + ) + + # image text matching head + self.itm_head = keras.layers.Dense( + 2, kernel_initializer=get_initializer(config.initializer_range), name="itm_head" + ) + + self.decoder_pad_token_id = ( + config.text_config.pad_token_id + if not hasattr(config, "decoder_pad_token_id") + else config.decoder_pad_token_id + ) + self.decoder_start_token_id = ( + config.text_config.bos_token_id + if not hasattr(config, "decoder_start_token_id") + else config.decoder_start_token_id + ) + self.config = config + + def get_input_embeddings(self) -> keras.layers.Layer: + return self.vision_model.embeddings.patch_embedding + + @unpack_inputs + @add_start_docstrings_to_model_forward(BLIP_VISION_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=TFBlipImageTextMatchingModelOutput, config_class=BlipVisionConfig) + def call( + self, + input_ids: tf.Tensor, + pixel_values: tf.Tensor | None = None, + use_itm_head: Optional[bool] = True, + attention_mask: tf.Tensor | None = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: Optional[bool] = None, + ) -> Union[Tuple, TFBlipImageTextMatchingModelOutput]: + r""" + Returns: + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, TFBlipForImageTextRetrieval + + >>> model = TFBlipForImageTextRetrieval.from_pretrained("Salesforce/blip-itm-base-coco") + >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-itm-base-coco") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> text = "an image of a cat" + + >>> inputs = processor(images=image, text=text, return_tensors="tf") + >>> outputs = model(**inputs) + ``` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + image_embeds = vision_outputs[0] + image_atts = tf.ones(shape_list(image_embeds)[:-1], dtype=tf.int64) + + # Matt: In PyTorch, only one path (itm/non-itm) is taken. However, in TensorFlow this can result in + # some layers not being built! To avoid this, we always call both paths, then use an if statement to select + # which output to pass to the final output. The unnecessary nodes will be pruned from the final graph, but + # not before the layers have all been built correctly. + itm_question_embeds = self.text_encoder( + input_ids=input_ids, + attention_mask=attention_mask, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_atts, + return_dict=return_dict, + training=training, + ) + itm_question_embeds = itm_question_embeds[0] if not return_dict else itm_question_embeds.last_hidden_state + + itm_output = self.itm_head(itm_question_embeds[:, 0, :]) + + no_itm_question_embeds = self.text_encoder( + input_ids=input_ids, + attention_mask=attention_mask, + return_dict=return_dict, + training=training, + ) + no_itm_question_embeds = ( + no_itm_question_embeds[0] if not return_dict else no_itm_question_embeds.last_hidden_state + ) + + image_feat, _ = tf.linalg.normalize(self.vision_proj(image_embeds[:, 0, :]), ord=2, axis=-1) + text_feat, _ = tf.linalg.normalize(self.text_proj(no_itm_question_embeds[:, 0, :]), ord=2, axis=-1) + + no_itm_output = tf.matmul(image_feat, text_feat, transpose_b=True) + + if use_itm_head: + output = itm_output + question_embeds = itm_question_embeds + else: + output = no_itm_output + question_embeds = no_itm_question_embeds + + if not return_dict: + outputs = (output, vision_outputs[0]) + vision_outputs[2:] + (question_embeds,) + return tuple(output for output in outputs if output is not None) + + return TFBlipImageTextMatchingModelOutput( + itm_score=output, + last_hidden_state=vision_outputs.last_hidden_state, + hidden_states=vision_outputs.hidden_states, + attentions=vision_outputs.attentions, + question_embeds=question_embeds, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "vision_model", None) is not None: + with tf.name_scope(self.vision_model.name): + self.vision_model.build(None) + if getattr(self, "text_encoder", None) is not None: + with tf.name_scope(self.text_encoder.name): + self.text_encoder.build(None) + if getattr(self, "vision_proj", None) is not None: + with tf.name_scope(self.vision_proj.name): + self.vision_proj.build([None, None, self.config.vision_config.hidden_size]) + if getattr(self, "text_proj", None) is not None: + with tf.name_scope(self.text_proj.name): + self.text_proj.build([None, None, self.config.text_config.hidden_size]) + if getattr(self, "itm_head", None) is not None: + with tf.name_scope(self.itm_head.name): + self.itm_head.build([None, None, self.config.text_config.hidden_size]) + + +__all__ = [ + "TFBlipModel", + "TFBlipPreTrainedModel", + "TFBlipForConditionalGeneration", + "TFBlipForQuestionAnswering", + "TFBlipVisionModel", + "TFBlipTextModel", + "TFBlipForImageTextRetrieval", +] diff --git a/docs/transformers/build/lib/transformers/models/blip/modeling_tf_blip_text.py b/docs/transformers/build/lib/transformers/models/blip/modeling_tf_blip_text.py new file mode 100644 index 0000000000000000000000000000000000000000..6414bfa3b7e1de5f244b0e67caef0e4af343b804 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/blip/modeling_tf_blip_text.py @@ -0,0 +1,1125 @@ +# coding=utf-8 +# Copyright 2023 The Salesforce Team Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the BSD-3-clause license (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://opensource.org/licenses/BSD-3-Clause +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +from __future__ import annotations + +import math +from typing import Optional, Tuple + +import tensorflow as tf + +from ...modeling_tf_outputs import ( + TFBaseModelOutputWithPastAndCrossAttentions, + TFBaseModelOutputWithPoolingAndCrossAttentions, + TFCausalLMOutputWithCrossAttentions, +) +from ...modeling_tf_utils import ( + TFModelInputType, + TFPreTrainedModel, + get_initializer, + get_tf_activation, + keras, + keras_serializable, + shape_list, + unpack_inputs, +) +from ...tf_utils import check_embeddings_within_bounds, invert_attention_mask, stable_softmax +from ...utils import add_start_docstrings_to_model_forward, logging +from .configuration_blip import BlipTextConfig + + +logger = logging.get_logger(__name__) + +BLIP_TEXT_INPUTS_DOCSTRING = r""" + Args: + input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. + + Indices can be obtained using [`AutoProcessor`]. See [`BlipProcessor.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + position_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L52 +class TFBlipTextEmbeddings(keras.layers.Layer): + """Construct the embeddings from word and position embeddings.""" + + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + self.word_embeddings = keras.layers.Embedding( + config.vocab_size, + config.hidden_size, + embeddings_initializer=get_initializer(config.initializer_range), + name="word_embeddings", + ) + self.position_embeddings = keras.layers.Embedding( + config.max_position_embeddings, + config.hidden_size, + embeddings_initializer=get_initializer(config.initializer_range), + name="position_embeddings", + ) + + # self.LayerNorm is not snake-cased to stick with PyTorch model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(config.hidden_dropout_prob, name="dropout") + + self.position_ids = tf.expand_dims(tf.range(config.max_position_embeddings), 0) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + + self.config = config + + def call(self, input_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0, training=None): + if input_ids is not None: + input_shape = tf.shape(input_ids) + else: + input_shape = tf.shape(inputs_embeds)[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length] + + if inputs_embeds is None: + check_embeddings_within_bounds(input_ids, self.config.vocab_size) + inputs_embeds = self.word_embeddings(input_ids) + + embeddings = inputs_embeds + + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings, training=training) + return embeddings + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "word_embeddings", None) is not None: + with tf.name_scope(self.word_embeddings.name): + self.word_embeddings.build(None) + if getattr(self, "position_embeddings", None) is not None: + with tf.name_scope(self.position_embeddings.name): + self.position_embeddings.build(None) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) + if getattr(self, "dropout", None) is not None: + with tf.name_scope(self.dropout.name): + self.dropout.build(None) + + +# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L97 +class TFBlipTextSelfAttention(keras.layers.Layer): + def __init__(self, config, is_cross_attention, **kwargs): + super().__init__(**kwargs) + self.config = config + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention heads (%d)" + % (config.hidden_size, config.num_attention_heads) + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = keras.layers.Dense( + self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query" + ) + self.key = keras.layers.Dense( + self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key" + ) + self.value = keras.layers.Dense( + self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value" + ) + + self.dropout = keras.layers.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = keras.layers.Embedding( + 2 * config.max_position_embeddings - 1, self.attention_head_size + ) + self.is_cross_attention = is_cross_attention + + def transpose_for_scores(self, x): + new_x_shape = tf.concat( + [tf.shape(x)[:-1], tf.constant([self.num_attention_heads, self.attention_head_size], dtype=tf.int32)], + axis=0, + ) + x = tf.reshape(x, new_x_shape) + return tf.transpose(x, perm=(0, 2, 1, 3)) + + def call( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + training=None, + ): + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = tf.concat([past_key_value[0], key_layer], axis=2) + value_layer = tf.concat([past_key_value[1], value_layer], axis=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + seq_length = shape_list(hidden_states)[1] + position_ids_l = tf.expand_dims(tf.range(seq_length, dtype=tf.int64, device=hidden_states.device), 1) + position_ids_r = tf.expand_dims(tf.range(seq_length, dtype=tf.int64, device=hidden_states.device), 0) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = tf.cast(positional_embedding, query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = tf.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = tf.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = tf.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BlipTextModel forward() function) + attention_scores = attention_scores + tf.cast(attention_mask, attention_scores.dtype) + + # Normalize the attention scores to probabilities. + attention_probs = stable_softmax(attention_scores, axis=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs_dropped = self.dropout(attention_probs, training=training) + + # Mask heads if we want to + if head_mask is not None: + attention_probs_dropped = attention_probs_dropped * head_mask + + context_layer = attention_probs_dropped @ value_layer + + context_layer = tf.transpose(context_layer, perm=(0, 2, 1, 3)) + new_context_layer_shape = shape_list(context_layer)[:-2] + [self.all_head_size] + context_layer = tf.reshape(context_layer, new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + outputs = outputs + (past_key_value,) + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "query", None) is not None: + with tf.name_scope(self.query.name): + self.query.build([None, None, self.config.hidden_size]) + if self.is_cross_attention: + if getattr(self, "key", None) is not None: + with tf.name_scope(self.key.name): + self.key.build([None, None, self.config.encoder_hidden_size]) + if getattr(self, "value", None) is not None: + with tf.name_scope(self.value.name): + self.value.build([None, None, self.config.encoder_hidden_size]) + else: + if getattr(self, "key", None) is not None: + with tf.name_scope(self.key.name): + self.key.build([None, None, self.config.hidden_size]) + if getattr(self, "value", None) is not None: + with tf.name_scope(self.value.name): + self.value.build([None, None, self.config.hidden_size]) + + +class TFBlipTextSelfOutput(keras.layers.Layer): + def __init__(self, config: BlipTextConfig, **kwargs): + super().__init__(**kwargs) + + self.dense = keras.layers.Dense( + units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.config = config + + def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: Optional[bool] = None) -> tf.Tensor: + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.dropout(inputs=hidden_states, training=training) + hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) + + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.hidden_size]) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) + + +# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#242 +class TFBlipTextAttention(keras.layers.Layer): + def __init__(self, config, is_cross_attention=False, **kwargs): + super().__init__(**kwargs) + self.self = TFBlipTextSelfAttention(config, is_cross_attention, name="self") + # "output" is a protected attribute on TF models + self.self_output = TFBlipTextSelfOutput(config, name="output") + + def call( + self, + hidden_states: tf.Tensor, + attention_mask: tf.Tensor | None = None, + head_mask: tf.Tensor | None = None, + encoder_hidden_states: tf.Tensor | None = None, + encoder_attention_mask: tf.Tensor | None = None, + past_key_value: Tuple[Tuple[tf.Tensor]] | None = None, + output_attentions: Optional[bool] = False, + training: Optional[bool] = None, + ): + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + training=training, + ) + attention_output = self.self_output(self_outputs[0], hidden_states, training=training) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self", None) is not None: + with tf.name_scope(self.self.name): + self.self.build(None) + if getattr(self, "self_output", None) is not None: + with tf.name_scope(self.self_output.name): + self.self_output.build(None) + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->BlipText +class TFBlipTextIntermediate(keras.layers.Layer): + def __init__(self, config: BlipTextConfig, **kwargs): + super().__init__(**kwargs) + + self.dense = keras.layers.Dense( + units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = get_tf_activation(config.hidden_act) + else: + self.intermediate_act_fn = config.hidden_act + self.config = config + + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.hidden_size]) + + +class TFBlipTextOutput(keras.layers.Layer): + def __init__(self, config: BlipTextConfig, **kwargs): + super().__init__(**kwargs) + + self.dense = keras.layers.Dense( + units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.config = config + + def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.dropout(inputs=hidden_states, training=training) + hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) + + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.intermediate_size]) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) + + +class TFBlipTextLayer(keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + self.config = config + self.attention = TFBlipTextAttention(config, name="attention") + if self.config.is_decoder: + self.crossattention = TFBlipTextAttention( + config, is_cross_attention=self.config.is_decoder, name="crossattention" + ) + self.intermediate = TFBlipTextIntermediate(config, name="intermediate") + self.self_output = TFBlipTextOutput(config, name="output") + + def call( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + training=None, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + training=training, + ) + attention_output = self_attention_outputs[0] + + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + + if encoder_hidden_states is not None: + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + output_attentions=output_attentions, + training=training, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights + intermediate_output = self.intermediate(attention_output) + layer_output = self.self_output(intermediate_output, attention_output, training=training) + outputs = (layer_output,) + outputs + + outputs = outputs + (present_key_value,) + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "attention", None) is not None: + with tf.name_scope(self.attention.name): + self.attention.build(None) + if getattr(self, "intermediate", None) is not None: + with tf.name_scope(self.intermediate.name): + self.intermediate.build(None) + if getattr(self, "self_output", None) is not None: + with tf.name_scope(self.self_output.name): + self.self_output.build(None) + if getattr(self, "crossattention", None) is not None: + with tf.name_scope(self.crossattention.name): + self.crossattention.build(None) + + +# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L386 +@keras_serializable +class TFBlipTextEncoder(keras.layers.Layer): + config_class = BlipTextConfig + + def __init__(self, config, name=None, **kwargs): + super().__init__(name=name, **kwargs) + self.config = config + self.layer = [TFBlipTextLayer(config, name=f"layer_._{i}") for i in range(config.num_hidden_layers)] + + @unpack_inputs + def call( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + training=None, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.is_decoder else None + + next_decoder_cache = () if use_cache else None + + for i in range(self.config.num_hidden_layers): + layer_module = self.layer[i] + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + training=training, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return TFBaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "layer", None) is not None: + for layer in self.layer: + with tf.name_scope(layer.name): + layer.build(None) + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->BlipText +class TFBlipTextPooler(keras.layers.Layer): + def __init__(self, config: BlipTextConfig, **kwargs): + super().__init__(**kwargs) + + self.dense = keras.layers.Dense( + units=config.hidden_size, + kernel_initializer=get_initializer(config.initializer_range), + activation="tanh", + name="dense", + ) + self.config = config + + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(inputs=first_token_tensor) + + return pooled_output + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.hidden_size]) + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertPredictionHeadTransform with Bert->BlipText +class TFBlipTextPredictionHeadTransform(keras.layers.Layer): + def __init__(self, config: BlipTextConfig, **kwargs): + super().__init__(**kwargs) + + self.dense = keras.layers.Dense( + units=config.hidden_size, + kernel_initializer=get_initializer(config.initializer_range), + name="dense", + ) + + if isinstance(config.hidden_act, str): + self.transform_act_fn = get_tf_activation(config.hidden_act) + else: + self.transform_act_fn = config.hidden_act + + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.config = config + + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(inputs=hidden_states) + + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.hidden_size]) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) + + +class TFBlipTextLMPredictionHead(keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + self.transform = TFBlipTextPredictionHeadTransform(config, name="transform") + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.decoder = keras.layers.Dense( + config.vocab_size, + kernel_initializer=get_initializer(config.initializer_range), + name="decoder", + use_bias=False, + ) + self.config = config + + def build(self, input_shape=None): + self.bias = self.add_weight(name="bias", shape=(self.config.vocab_size,), initializer="zeros", trainable=True) + + if self.built: + return + self.built = True + if getattr(self, "transform", None) is not None: + with tf.name_scope(self.transform.name): + self.transform.build(None) + if getattr(self, "decoder", None) is not None: + with tf.name_scope(self.decoder.name): + self.decoder.build([None, None, self.config.hidden_size]) + + def call(self, hidden_states): + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + self.bias + return hidden_states + + +class TFBlipTextOnlyMLMHead(keras.layers.Layer): + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + self.predictions = TFBlipTextLMPredictionHead(config, name="predictions") + + def call(self, sequence_output: tf.Tensor) -> tf.Tensor: + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "predictions", None) is not None: + with tf.name_scope(self.predictions.name): + self.predictions.build(None) + + +# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L548 +class TFBlipTextPreTrainedModel(TFPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = BlipTextConfig + base_model_prefix = "bert" + _keys_to_ignore_on_load_missing = [r"position_ids"] + + +# Adapted from https://github.com/salesforce/BLIP/blob/3a29b7410476bf5f2ba0955827390eb6ea1f4f9d/models/med.py#L571 +class TFBlipTextModel(TFBlipTextPreTrainedModel): + """ + The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of + cross-attention is added between the self-attention layers, following the architecture described in [Attention is + all you need](https://arxiv.org/abs/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, + Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. argument and `is_decoder` set to `True`; an + `encoder_hidden_states` is then expected as an input to the forward pass. + """ + + def __init__(self, config, add_pooling_layer=True, name=None, **kwargs): + super().__init__(config, name=name, **kwargs) + self.config = config + + self.embeddings = TFBlipTextEmbeddings(config, name="embeddings") + self.encoder = TFBlipTextEncoder(config, name="encoder") + self.pooler = TFBlipTextPooler(config, name="pooler") if add_pooling_layer else None + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + @tf.function + def get_extended_attention_mask( + self, attention_mask: tf.Tensor, input_shape: Tuple[int], is_decoder: bool + ) -> tf.Tensor: + """ + Makes broadcastable attention and causal masks so that future and masked tokens are ignored. + + Arguments: + attention_mask (`tf.Tensor`): + Mask with ones indicating tokens to attend to, zeros for tokens to ignore. + input_shape (`Tuple[int]`): + The shape of the input to the model. + is_decoder (`bool`): + Whether the model is used as a decoder. + + Returns: + `tf.Tensor` The extended attention mask, with the same dtype as `attention_mask.dtype`. + """ + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + if not isinstance(attention_mask, tf.Tensor): + attention_mask = tf.convert_to_tensor(attention_mask) # Catches NumPy inputs that haven't been cast yet + if attention_mask.shape.rank == 3: + extended_attention_mask = attention_mask[:, None, :, :] + elif attention_mask.shape.rank == 2: + # Provided a padding mask of dimensions [batch_size, seq_length] + # - if the model is a decoder, apply a causal mask in addition to the padding mask + # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] + if is_decoder: + batch_size, seq_length = input_shape + + seq_ids = tf.range(seq_length, dtype=attention_mask.dtype) + causal_mask = tf.broadcast_to(seq_ids, (batch_size, seq_length, seq_length)) <= seq_ids[None, :, None] + # in case past_key_values are used we need to add a prefix ones mask to the causal mask + + if shape_list(causal_mask)[1] < shape_list(attention_mask)[1]: + prefix_seq_len = tf.shape(attention_mask)[1] - tf.shape(causal_mask)[1] + causal_mask = tf.concat( + [ + tf.ones((batch_size, seq_length, prefix_seq_len), dtype=causal_mask.dtype), + causal_mask, + ], + axis=-1, + ) + extended_attention_mask = ( + tf.cast(causal_mask[:, None, :, :], attention_mask.dtype) * attention_mask[:, None, None, :] + ) + else: + extended_attention_mask = attention_mask[:, None, None, :] + else: + raise ValueError( + "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format( + input_shape, attention_mask.shape + ) + ) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + extended_attention_mask = tf.cast(extended_attention_mask, self.dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + return extended_attention_mask + + @add_start_docstrings_to_model_forward(BLIP_TEXT_INPUTS_DOCSTRING) + @unpack_inputs + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: tf.Tensor | None = None, + position_ids: tf.Tensor | None = None, + head_mask: tf.Tensor | None = None, + inputs_embeds: tf.Tensor | None = None, + encoder_embeds: tf.Tensor | None = None, + encoder_hidden_states: tf.Tensor | None = None, + encoder_attention_mask: tf.Tensor | None = None, + past_key_values: Tuple[Tuple[tf.Tensor]] | None = None, + use_cache: bool | None = None, + output_attentions: bool | None = None, + output_hidden_states: bool | None = None, + return_dict: bool | None = None, + is_decoder: bool = False, + training: bool = False, + ) -> Tuple[tf.Tensor] | TFBaseModelOutputWithPoolingAndCrossAttentions: + r""" + encoder_hidden_states (`tf.Tensor`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`tf.Tensor`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (`tuple(tuple(tf.Tensor))`, *optional*): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = shape_list(input_ids) + batch_size, seq_length = input_shape + elif inputs_embeds is not None: + input_shape = shape_list(inputs_embeds)[:-1] + batch_size, seq_length = input_shape + elif encoder_embeds is not None: + input_shape = shape_list(encoder_embeds)[:-1] + batch_size, seq_length = input_shape + else: + raise ValueError("You have to specify either input_ids or inputs_embeds or encoder_embeds") + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + + if attention_mask is None: + attention_mask = tf.ones(((batch_size, seq_length + past_key_values_length))) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + extended_attention_mask: tf.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, is_decoder) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if encoder_hidden_states is not None: + if isinstance(encoder_hidden_states, list): + encoder_batch_size, encoder_sequence_length, _ = shape_list(encoder_hidden_states[0]) + else: + encoder_batch_size, encoder_sequence_length, _ = shape_list(encoder_hidden_states) + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + + if isinstance(encoder_attention_mask, list): + encoder_extended_attention_mask = [invert_attention_mask(mask) for mask in encoder_attention_mask] + elif encoder_attention_mask is None: + encoder_attention_mask = tf.ones(encoder_hidden_shape) + encoder_extended_attention_mask = invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + if encoder_embeds is None: + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + else: + embedding_output = encoder_embeds + + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return TFBaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "embeddings", None) is not None: + with tf.name_scope(self.embeddings.name): + self.embeddings.build(None) + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "pooler", None) is not None: + with tf.name_scope(self.pooler.name): + self.pooler.build(None) + + +# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L811 +class TFBlipTextLMHeadModel(TFBlipTextPreTrainedModel): + _keys_to_ignore_on_load_unexpected = [r"pooler"] + _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"] + + def __init__(self, config, **kwargs): + super().__init__(config, **kwargs) + + self.bert = TFBlipTextModel(config, add_pooling_layer=False, name="bert") + self.cls = TFBlipTextOnlyMLMHead(config, name="cls") + self.label_smoothing = config.label_smoothing + + def get_output_embeddings(self): + return self.cls.predictions.decoder + + def set_output_embeddings(self, new_embeddings): + self.cls.predictions.decoder = new_embeddings + + @add_start_docstrings_to_model_forward(BLIP_TEXT_INPUTS_DOCSTRING) + @unpack_inputs + def call( + self, + input_ids=None, + attention_mask=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + labels=None, + past_key_values=None, + use_cache=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + return_logits=False, + is_decoder=True, + training=None, + ): + r""" + encoder_hidden_states (`tf.Tensor`, *optional*): Sequence of + hidden-states at the output of the last layer of the encoder. Used in the cross-attention if the model is + configured as a decoder. + encoder_attention_mask (`tf.Tensor`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + labels (`tf.Tensor`, *optional*): + Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in + `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are + ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]` + past_key_values (`tuple(tuple(tf.Tensor))`, *optional*): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None: + use_cache = False + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + is_decoder=is_decoder, + training=training, + ) + + sequence_output = outputs[0] + prediction_scores = self.cls(sequence_output) + + if return_logits: + return prediction_scores[:, :-1, :] + + lm_loss = None + if labels is not None: + # we are doing next-token prediction; shift prediction scores and input ids by one + shifted_prediction_scores = prediction_scores[:, :-1, :] + shifted_prediction_scores = tf.reshape(shifted_prediction_scores, (-1, self.config.vocab_size)) + labels = labels[:, 1:] + labels = tf.reshape(labels, (-1,)) + # Keras won't give us label smoothing for sparse CE, so we de-sparsify things here + # Use relu to clamp masked labels at 0 to avoid NaN (we will be zeroing those out later anyway) + one_hot_labels = tf.one_hot(tf.nn.relu(labels), depth=self.config.vocab_size, dtype=tf.float32) + loss_fct = keras.losses.CategoricalCrossentropy( + from_logits=True, label_smoothing=self.label_smoothing, reduction="none" + ) + masked_positions = tf.cast(tf.not_equal(labels, -100), dtype=tf.float32) + lm_loss = loss_fct(one_hot_labels, shifted_prediction_scores) + lm_loss *= masked_positions + lm_loss = tf.reduce_sum(lm_loss, axis=0) / tf.math.count_nonzero(masked_positions, dtype=tf.float32) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((lm_loss,) + output) if lm_loss is not None else output + + return TFCausalLMOutputWithCrossAttentions( + loss=lm_loss, + logits=prediction_scores, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs): + input_shape = input_ids.shape + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + if attention_mask is None: + attention_mask = input_ids.new_ones(input_shape) + + # cut decoder_input_ids if past_key_values is used + if past_key_values is not None: + input_ids = input_ids[:, -1:] + + return { + "input_ids": input_ids, + "attention_mask": attention_mask, + "past_key_values": past_key_values, + "encoder_hidden_states": model_kwargs.get("encoder_hidden_states", None), + "encoder_attention_mask": model_kwargs.get("encoder_attention_mask", None), + "is_decoder": True, + } + + def _reorder_cache(self, past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),) + return reordered_past + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "bert", None) is not None: + with tf.name_scope(self.bert.name): + self.bert.build(None) + if getattr(self, "cls", None) is not None: + with tf.name_scope(self.cls.name): + self.cls.build(None) + + +__all__ = ["TFBlipTextLMHeadModel", "TFBlipTextModel", "TFBlipTextPreTrainedModel"] diff --git a/docs/transformers/build/lib/transformers/models/blip/processing_blip.py b/docs/transformers/build/lib/transformers/models/blip/processing_blip.py new file mode 100644 index 0000000000000000000000000000000000000000..c65ff6b66fdcbc1c04a5c8f89b358da93d56ae2b --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/blip/processing_blip.py @@ -0,0 +1,139 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Processor class for Blip. +""" + +from typing import List, Optional, Union + +from ...image_utils import ImageInput +from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack +from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput + + +class BlipProcessorKwargs(ProcessingKwargs, total=False): + _defaults = { + "text_kwargs": { + "add_special_tokens": True, + "padding": False, + "stride": 0, + "return_overflowing_tokens": False, + "return_special_tokens_mask": False, + "return_offsets_mapping": False, + "return_token_type_ids": False, + "return_length": False, + "verbose": True, + }, + "images_kwargs": {}, + } + + +class BlipProcessor(ProcessorMixin): + r""" + Constructs a BLIP processor which wraps a BERT tokenizer and BLIP image processor into a single processor. + + [`BlipProcessor`] offers all the functionalities of [`BlipImageProcessor`] and [`BertTokenizerFast`]. See the + docstring of [`~BlipProcessor.__call__`] and [`~BlipProcessor.decode`] for more information. + + Args: + image_processor (`BlipImageProcessor`): + An instance of [`BlipImageProcessor`]. The image processor is a required input. + tokenizer (`BertTokenizerFast`): + An instance of ['BertTokenizerFast`]. The tokenizer is a required input. + """ + + attributes = ["image_processor", "tokenizer"] + valid_kwargs = [] + image_processor_class = ("BlipImageProcessor", "BlipImageProcessorFast") + tokenizer_class = ("BertTokenizer", "BertTokenizerFast") + + def __init__(self, image_processor, tokenizer, **kwargs): + tokenizer.return_token_type_ids = False + super().__init__(image_processor, tokenizer) + self.current_processor = self.image_processor + + def __call__( + self, + images: ImageInput = None, + text: Optional[Union[str, List[str], TextInput, PreTokenizedInput]] = None, + audio=None, + videos=None, + **kwargs: Unpack[BlipProcessorKwargs], + ) -> BatchEncoding: + """ + This method uses [`BlipImageProcessor.__call__`] method to prepare image(s) for the model, and + [`BertTokenizerFast.__call__`] to prepare text for the model. + + Please refer to the docstring of the above two methods for more information. + Args: + images (`ImageInput`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. Both channels-first and channels-last formats are supported. + text (`TextInput`, `PreTokenizedInput`, `List[TextInput]`, `List[PreTokenizedInput]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Acceptable values are: + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. + """ + if images is None and text is None: + raise ValueError("You have to specify either images or text.") + + text_encoding = None + + # add pixel_values encoding. If we also have text_encoding, update image encoding and return it. + # else, return the text encoding. + output_kwargs = self._merge_kwargs( + BlipProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + if text is not None: + text_encoding = self.tokenizer(text, **output_kwargs["text_kwargs"]) + if images is not None: + encoding_image_processor = self.image_processor(images, **output_kwargs["images_kwargs"]) + + if text_encoding is not None: + encoding_image_processor.update(text_encoding) + return encoding_image_processor + + return text_encoding + + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to BertTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. + """ + return self.tokenizer.batch_decode(*args, **kwargs) + + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to BertTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to + the docstring of this method for more information. + """ + return self.tokenizer.decode(*args, **kwargs) + + @property + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + image_processor_input_names = self.image_processor.model_input_names + return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) + + +__all__ = ["BlipProcessor"] diff --git a/docs/transformers/build/lib/transformers/models/blip_2/__init__.py b/docs/transformers/build/lib/transformers/models/blip_2/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..0717e81fca606edde774ad19092472bf59b4701a --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/blip_2/__init__.py @@ -0,0 +1,28 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_blip_2 import * + from .modeling_blip_2 import * + from .processing_blip_2 import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/docs/transformers/build/lib/transformers/models/blip_2/configuration_blip_2.py b/docs/transformers/build/lib/transformers/models/blip_2/configuration_blip_2.py new file mode 100644 index 0000000000000000000000000000000000000000..db55e39ab7319b7286237922568b925b7b85607a --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/blip_2/configuration_blip_2.py @@ -0,0 +1,350 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""BLIP-2 model configuration""" + +from typing import Optional + +from ...configuration_utils import PretrainedConfig +from ...models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES +from ...utils import logging +from ..auto import CONFIG_MAPPING, AutoConfig + + +logger = logging.get_logger(__name__) + + +class Blip2VisionConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Blip2VisionModel`]. It is used to instantiate a + BLIP-2 vision encoder according to the specified arguments, defining the model architecture. Instantiating a + configuration defaults will yield a similar configuration to that of the BLIP-2 + [Salesforce/blip2-opt-2.7b](https://huggingface.co/Salesforce/blip2-opt-2.7b) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + hidden_size (`int`, *optional*, defaults to 1408): + Dimensionality of the encoder layers and the pooler layer. + intermediate_size (`int`, *optional*, defaults to 6144): + Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder. + num_hidden_layers (`int`, *optional*, defaults to 39): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 16): + Number of attention heads for each attention layer in the Transformer encoder. + image_size (`int`, *optional*, defaults to 224): + The size (resolution) of each image. + patch_size (`int`, *optional*, defaults to 14): + The size (resolution) of each patch. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` `"gelu"` are supported. layer_norm_eps (`float`, *optional*, defaults + to 1e-5): The epsilon used by the layer normalization layers. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + qkv_bias (`bool`, *optional*, defaults to `True`): + Whether to add a bias to the queries and values in the self-attention layers. + + Example: + + ```python + >>> from transformers import Blip2VisionConfig, Blip2VisionModel + + >>> # Initializing a Blip2VisionConfig with Salesforce/blip2-opt-2.7b style configuration + >>> configuration = Blip2VisionConfig() + + >>> # Initializing a Blip2VisionModel (with random weights) from the Salesforce/blip2-opt-2.7b style configuration + >>> model = Blip2VisionModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "blip_2_vision_model" + base_config_key = "vision_config" + + def __init__( + self, + hidden_size=1408, + intermediate_size=6144, + num_hidden_layers=39, + num_attention_heads=16, + image_size=224, + patch_size=14, + hidden_act="gelu", + layer_norm_eps=1e-6, + attention_dropout=0.0, + initializer_range=1e-10, + qkv_bias=True, + **kwargs, + ): + super().__init__(**kwargs) + + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.patch_size = patch_size + self.image_size = image_size + self.initializer_range = initializer_range + self.attention_dropout = attention_dropout + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + self.qkv_bias = qkv_bias + + +class Blip2QFormerConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`Blip2QFormerModel`]. It is used to instantiate a + BLIP-2 Querying Transformer (Q-Former) model according to the specified arguments, defining the model architecture. + Instantiating a configuration with the defaults will yield a similar configuration to that of the BLIP-2 + [Salesforce/blip2-opt-2.7b](https://huggingface.co/Salesforce/blip2-opt-2.7b) architecture. Configuration objects + inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from + [`PretrainedConfig`] for more information. + + Note that [`Blip2QFormerModel`] is very similar to [`BertLMHeadModel`] with interleaved cross-attention. + + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the Q-Former model. Defines the number of different tokens that can be represented by + the `inputs_ids` passed when calling the model. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + pad_token_id (`int`, *optional*, defaults to 0): + Index to be used for padding token. + position_embedding_type (`str`, *optional*, defaults to `"absolute"`): + Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For + positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to + [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). + For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models + with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658). + cross_attention_frequency (`int`, *optional*, defaults to 2): + The frequency of adding cross-attention to the Transformer layers. + encoder_hidden_size (`int`, *optional*, defaults to 1408): + The hidden size of the hidden states for cross-attention. + use_qformer_text_input (`bool`, *optional*, defaults to `False`): + Whether to use BERT-style embeddings. + + Examples: + + ```python + >>> from transformers import Blip2QFormerConfig, Blip2QFormerModel + + >>> # Initializing a BLIP-2 Salesforce/blip2-opt-2.7b style configuration + >>> configuration = Blip2QFormerConfig() + + >>> # Initializing a model (with random weights) from the Salesforce/blip2-opt-2.7b style configuration + >>> model = Blip2QFormerModel(configuration) + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "blip_2_qformer" + base_config_key = "qformer_config" + + def __init__( + self, + vocab_size=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=0, + position_embedding_type="absolute", + cross_attention_frequency=2, + encoder_hidden_size=1408, + use_qformer_text_input=False, + **kwargs, + ): + super().__init__(pad_token_id=pad_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.position_embedding_type = position_embedding_type + self.cross_attention_frequency = cross_attention_frequency + self.encoder_hidden_size = encoder_hidden_size + self.use_qformer_text_input = use_qformer_text_input + + +class Blip2Config(PretrainedConfig): + r""" + [`Blip2Config`] is the configuration class to store the configuration of a [`Blip2ForConditionalGeneration`]. It is + used to instantiate a BLIP-2 model according to the specified arguments, defining the vision model, Q-Former model + and language model configs. Instantiating a configuration with the defaults will yield a similar configuration to + that of the BLIP-2 [Salesforce/blip2-opt-2.7b](https://huggingface.co/Salesforce/blip2-opt-2.7b) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vision_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`Blip2VisionConfig`]. + qformer_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`Blip2QFormerConfig`]. + text_config (`dict`, *optional*): + Dictionary of configuration options used to initialize any [`PretrainedConfig`]. + num_query_tokens (`int`, *optional*, defaults to 32): + The number of query tokens passed through the Transformer. + image_text_hidden_size (`int`, *optional*, defaults to 256): + Dimentionality of the hidden state of the image-text fusion layer. + + image_token_index (`int`, *optional*): + Token index of special image token. + kwargs (*optional*): + Dictionary of keyword arguments. + + Example: + + ```python + >>> from transformers import ( + ... Blip2VisionConfig, + ... Blip2QFormerConfig, + ... OPTConfig, + ... Blip2Config, + ... Blip2ForConditionalGeneration, + ... ) + + >>> # Initializing a Blip2Config with Salesforce/blip2-opt-2.7b style configuration + >>> configuration = Blip2Config() + + >>> # Initializing a Blip2ForConditionalGeneration (with random weights) from the Salesforce/blip2-opt-2.7b style configuration + >>> model = Blip2ForConditionalGeneration(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + + >>> # We can also initialize a Blip2Config from a Blip2VisionConfig, Blip2QFormerConfig and any PretrainedConfig + + >>> # Initializing BLIP-2 vision, BLIP-2 Q-Former and language model configurations + >>> vision_config = Blip2VisionConfig() + >>> qformer_config = Blip2QFormerConfig() + >>> text_config = OPTConfig() + + >>> config = Blip2Config.from_text_vision_configs(vision_config, qformer_config, text_config) + ```""" + + model_type = "blip-2" + attribute_map = { + "image_token_id": "image_token_index", + } + sub_configs = {"text_config": AutoConfig, "qformer_config": Blip2QFormerConfig, "vision_config": Blip2VisionConfig} + + def __init__( + self, + vision_config=None, + qformer_config=None, + text_config=None, + num_query_tokens=32, + image_text_hidden_size=256, + image_token_index=None, + **kwargs, + ): + super().__init__(**kwargs) + + if vision_config is None: + vision_config = {} + logger.info("vision_config is None. initializing the Blip2VisionConfig with default values.") + + if qformer_config is None: + qformer_config = {} + logger.info("qformer_config is None. Initializing the Blip2QFormerConfig with default values.") + + if text_config is None: + text_config = {} + logger.info("text_config is None. Initializing the text config with default values (`OPTConfig`).") + + self.vision_config = Blip2VisionConfig(**vision_config) + self.qformer_config = Blip2QFormerConfig(**qformer_config) + text_model_type = text_config["model_type"] if "model_type" in text_config else "opt" + self.text_config = CONFIG_MAPPING[text_model_type](**text_config) + + self.num_query_tokens = num_query_tokens + self.image_text_hidden_size = image_text_hidden_size + self.image_token_index = image_token_index + self.qformer_config.encoder_hidden_size = self.vision_config.hidden_size + self.use_decoder_only_language_model = self.text_config.model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES + self.initializer_factor = 1.0 + self.initializer_range = 0.02 + + @classmethod + def from_vision_qformer_text_configs( + cls, + vision_config: Blip2VisionConfig, + qformer_config: Blip2QFormerConfig, + text_config: Optional[PretrainedConfig] = None, + **kwargs, + ): + r""" + Instantiate a [`Blip2Config`] (or a derived class) from a BLIP-2 vision model, Q-Former and language model + configurations. + + Args: + vision_config (`dict`): + Dictionary of configuration options used to initialize [`Blip2VisionConfig`]. + qformer_config (`dict`): + Dictionary of configuration options used to initialize [`Blip2QFormerConfig`]. + text_config (`dict`, *optional*): + Dictionary of configuration options used to initialize any [`PretrainedConfig`]. + + Returns: + [`Blip2Config`]: An instance of a configuration object + """ + + return cls( + vision_config=vision_config.to_dict(), + qformer_config=qformer_config.to_dict(), + text_config=text_config.to_dict() if text_config is not None else None, + **kwargs, + ) + + +__all__ = ["Blip2Config", "Blip2QFormerConfig", "Blip2VisionConfig"] diff --git a/docs/transformers/build/lib/transformers/models/blip_2/convert_blip_2_original_to_pytorch.py b/docs/transformers/build/lib/transformers/models/blip_2/convert_blip_2_original_to_pytorch.py new file mode 100644 index 0000000000000000000000000000000000000000..d6640045b80c30ccda3e33b7eef9345d55e3d41f --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/blip_2/convert_blip_2_original_to_pytorch.py @@ -0,0 +1,390 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Convert BLIP-2 checkpoints from the original repository. + +URL: https://github.com/salesforce/LAVIS/tree/main/projects/blip2 +""" + +import argparse + +import requests +import torch + +# pip3 install salesforce-lavis +# I'm actually installing a slightly modified version: pip3 install -U git+https://github.com/nielsrogge/LAVIS.git@blip2_float32 +# to make sure we can compare both original and HF implementation in float32 +from lavis.models import load_model_and_preprocess +from PIL import Image + +from transformers import ( + AutoTokenizer, + BertTokenizer, + Blip2Config, + Blip2ForConditionalGeneration, + Blip2ForImageTextRetrieval, + Blip2Processor, + Blip2QFormerConfig, + Blip2VisionConfig, + BlipImageProcessor, + OPTConfig, + T5Config, + set_seed, +) +from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD + + +def load_demo_image(): + url = "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/assets/merlion.png" + image = Image.open(requests.get(url, stream=True).raw).convert("RGB") + + return image + + +# here we list all keys to be renamed (original name on the left, our name on the right) +def create_rename_keys(config, model_name): + rename_keys = [] + # fmt: off + + # vision encoder + rename_keys.append(("visual_encoder.cls_token", "vision_model.embeddings.class_embedding")) + rename_keys.append(("visual_encoder.pos_embed", "vision_model.embeddings.position_embedding")) + rename_keys.append(("visual_encoder.patch_embed.proj.weight", "vision_model.embeddings.patch_embedding.weight")) + rename_keys.append(("visual_encoder.patch_embed.proj.bias", "vision_model.embeddings.patch_embedding.bias")) + rename_keys.append(("ln_vision.weight", "vision_model.post_layernorm.weight")) + rename_keys.append(("ln_vision.bias", "vision_model.post_layernorm.bias")) + + for i in range(config.vision_config.num_hidden_layers): + rename_keys.append((f"visual_encoder.blocks.{i}.norm1.weight", f"vision_model.encoder.layers.{i}.layer_norm1.weight")) + rename_keys.append((f"visual_encoder.blocks.{i}.norm1.bias", f"vision_model.encoder.layers.{i}.layer_norm1.bias")) + rename_keys.append((f"visual_encoder.blocks.{i}.norm2.weight", f"vision_model.encoder.layers.{i}.layer_norm2.weight")) + rename_keys.append((f"visual_encoder.blocks.{i}.norm2.bias", f"vision_model.encoder.layers.{i}.layer_norm2.bias")) + rename_keys.append((f"visual_encoder.blocks.{i}.attn.qkv.weight", f"vision_model.encoder.layers.{i}.self_attn.qkv.weight")) + rename_keys.append((f"visual_encoder.blocks.{i}.attn.proj.weight", f"vision_model.encoder.layers.{i}.self_attn.projection.weight",)) + rename_keys.append((f"visual_encoder.blocks.{i}.attn.proj.bias", f"vision_model.encoder.layers.{i}.self_attn.projection.bias")) + rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc1.weight", f"vision_model.encoder.layers.{i}.mlp.fc1.weight")) + rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc1.bias", f"vision_model.encoder.layers.{i}.mlp.fc1.bias")) + rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc2.weight", f"vision_model.encoder.layers.{i}.mlp.fc2.weight")) + rename_keys.append((f"visual_encoder.blocks.{i}.mlp.fc2.bias", f"vision_model.encoder.layers.{i}.mlp.fc2.bias")) + + # QFormer + rename_keys.append(("Qformer.bert.embeddings.LayerNorm.weight", "qformer.layernorm.weight")) + rename_keys.append(("Qformer.bert.embeddings.LayerNorm.bias", "qformer.layernorm.bias")) + if "itm" in model_name: + rename_keys.append(("Qformer.bert.embeddings.word_embeddings.weight", "embeddings.word_embeddings.weight")) + rename_keys.append(("Qformer.bert.embeddings.position_embeddings.weight", "embeddings.position_embeddings.weight")) + rename_keys.append(("vision_proj.weight", "vision_projection.weight")) + rename_keys.append(("vision_proj.bias", "vision_projection.bias")) + rename_keys.append(("text_proj.weight", "text_projection.weight")) + rename_keys.append(("text_proj.bias", "text_projection.bias")) + + # fmt: on + return rename_keys + + +def rename_key(dct, old, new): + val = dct.pop(old) + dct[new] = val + + +def read_in_q_v_bias(state_dict, config): + for i in range(config.vision_config.num_hidden_layers): + # read in original q and v biases + q_bias = state_dict.pop(f"visual_encoder.blocks.{i}.attn.q_bias") + v_bias = state_dict.pop(f"visual_encoder.blocks.{i}.attn.v_bias") + + # next, set bias in the state dict + qkv_bias = torch.cat((q_bias, torch.zeros_like(v_bias, requires_grad=False), v_bias)) + state_dict[f"vision_model.encoder.layers.{i}.self_attn.qkv.bias"] = qkv_bias + + +def get_blip2_config(model_name, eos_token_id): + image_size = 364 if "coco" in model_name else 224 + vision_config = Blip2VisionConfig(image_size=image_size).to_dict() + + # make sure the models have proper bos_token_id and eos_token_id set (important for generation) + # seems like flan-T5 models don't have bos_token_id properly set? + if "opt-2.7b" in model_name: + text_config = OPTConfig.from_pretrained("facebook/opt-2.7b", eos_token_id=eos_token_id).to_dict() + elif "opt-6.7b" in model_name: + text_config = OPTConfig.from_pretrained("facebook/opt-6.7b", eos_token_id=eos_token_id).to_dict() + elif "t5-xl" in model_name: + text_config = T5Config.from_pretrained("google/flan-t5-xl", dense_act_fn="gelu", bos_token_id=1).to_dict() + elif "t5-xxl" in model_name: + text_config = T5Config.from_pretrained("google/flan-t5-xxl", dense_act_fn="gelu", bos_token_id=1).to_dict() + elif "itm" in model_name: + text_config = {} + else: + raise ValueError("Model name not supported") + + if "itm" in model_name: + config = Blip2Config( + vision_config=vision_config, + qformer_config=Blip2QFormerConfig(vocab_size=30523, use_qformer_text_input=True).to_dict(), + ) + else: + config = Blip2Config(vision_config=vision_config, text_config=text_config) + + return config, image_size + + +@torch.no_grad() +def convert_blip2_checkpoint( + model_name, pytorch_dump_folder_path=None, push_to_hub=False, lavis_device="cpu", hf_model_device="cpu" +): + """ + Copy/paste/tweak model's weights to Transformers design. + """ + if "opt" in model_name: + tokenizer = AutoTokenizer.from_pretrained("facebook/opt-2.7b") + elif "itm" in model_name: + tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", truncation_side="right") + tokenizer.add_special_tokens({"bos_token": "[DEC]"}) + else: + tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-xl") + + if "itm" in model_name: + eos_token_id = None + else: + eos_token_id = tokenizer("\n", add_special_tokens=False).input_ids[0] + config, image_size = get_blip2_config(model_name, eos_token_id=eos_token_id) + + if "itm" in model_name: + hf_model = Blip2ForImageTextRetrieval(config).eval() + else: + hf_model = Blip2ForConditionalGeneration(config).eval() + + model_name_to_original = { + "blip2-opt-2.7b": ("blip2_opt", "pretrain_opt2.7b"), + "blip2-opt-6.7b": ("blip2_opt", "pretrain_opt6.7b"), + "blip2-opt-2.7b-coco": ("blip2_opt", "caption_coco_opt2.7b"), + "blip2-opt-6.7b-coco": ("blip2_opt", "caption_coco_opt6.7b"), + "blip2-flan-t5-xl": ("blip2_t5", "pretrain_flant5xl"), + "blip2-flan-t5-xl-coco": ("blip2_t5", "caption_coco_flant5xl"), + "blip2-flan-t5-xxl": ("blip2_t5", "pretrain_flant5xxl"), + "blip2-itm-vit-g": ("blip2_image_text_matching", "pretrain"), + "blip2-itm-vit-g-coco": ("blip2_image_text_matching", "coco"), + } + + name, type = model_name_to_original[model_name] + + # load original model + print("Loading original model...") + original_model, vis_processors, _ = load_model_and_preprocess( + name=name, model_type=type, is_eval=True, device=lavis_device + ) + original_model.eval() + print("Done!") + + # update state dict keys + state_dict = original_model.state_dict() + rename_keys = create_rename_keys(config, model_name) + for src, dest in rename_keys: + rename_key(state_dict, src, dest) + + # some keys can be renamed efficiently + for key, val in state_dict.copy().items(): + val = state_dict.pop(key) + if key.startswith("Qformer.bert"): + key = key.replace("Qformer.bert", "qformer") + if "attention.self" in key: + key = key.replace("self", "attention") + if "opt_proj" in key: + key = key.replace("opt_proj", "language_projection") + if "t5_proj" in key: + key = key.replace("t5_proj", "language_projection") + if key.startswith("opt"): + key = key.replace("opt", "language") + if key.startswith("t5"): + key = key.replace("t5", "language") + state_dict[key] = val + + # read in qv biases + read_in_q_v_bias(state_dict, config) + + missing_keys, unexpected_keys = hf_model.load_state_dict(state_dict, strict=False) + assert len(missing_keys) == 0 + + if "itm" in model_name: + unexpected_keys = list(filter(lambda x: not x.startswith("Qformer.cls"), unexpected_keys)) + assert unexpected_keys == ["temp", "qformer.embeddings.position_ids"] + else: + assert unexpected_keys == ["qformer.embeddings.position_ids"] + + image = load_demo_image() + original_pixel_values = vis_processors["eval"](image).unsqueeze(0).to(lavis_device) + + # create processor + image_processor = BlipImageProcessor( + size={"height": image_size, "width": image_size}, image_mean=OPENAI_CLIP_MEAN, image_std=OPENAI_CLIP_STD + ) + processor = Blip2Processor(image_processor=image_processor, tokenizer=tokenizer) + pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(hf_model_device) + + # make sure processor creates exact same pixel values + assert torch.allclose(pixel_values, original_pixel_values.to(pixel_values.device)) + + original_model.to(lavis_device) + hf_model.to(hf_model_device) + + if "itm" in model_name: + caption = "a large fountain spewing water into the air" + input_ids = tokenizer([caption], return_tensors="pt").input_ids.to(hf_model_device) + attention_mask = processor(text=caption, return_tensors="pt").attention_mask.to(hf_model_device) + + with torch.no_grad(): + original_logits = original_model( + {"image": original_pixel_values, "text_input": [caption]}, match_head="itm" + ) + logits = hf_model( + pixel_values=pixel_values, + input_ids=input_ids, + attention_mask=attention_mask, + use_image_text_matching_head=True, + ) + + assert original_logits.shape == logits.logits_per_image.shape + print("First values of original logits:", original_logits[0, :3]) + print("First values of HF logits:", logits.logits_per_image[0, :3]) + + # assert values + # cast to same type + target_dtype = logits.logits_per_image.dtype + assert torch.allclose(original_logits.to(target_dtype), logits.logits_per_image, atol=1e-4) + + original_itm_scores = torch.nn.functional.softmax(original_logits, dim=1) + itm_scores = torch.nn.functional.softmax(logits.logits_per_image, dim=1) + assert torch.allclose(original_itm_scores.to(target_dtype), itm_scores, atol=1e-4) + print("Looks ok!") + + with torch.no_grad(): + original_logits = original_model( + {"image": original_pixel_values, "text_input": [caption]}, match_head="itc" + ) + logits = hf_model( + pixel_values=pixel_values, + input_ids=input_ids, + attention_mask=attention_mask, + use_image_text_matching_head=False, + ) + + assert original_logits.shape == logits.logits_per_image.shape + print("First values of original logits:", original_logits[0, :3]) + print("First values of HF logits:", logits.logits_per_image[0, :3]) + + # assert values + # cast to same type + target_dtype = logits.logits_per_image.dtype + assert torch.allclose(original_logits.to(target_dtype), logits.logits_per_image, atol=1e-4) + print("Looks ok!") + + else: + input_ids = tokenizer(["\n"], return_tensors="pt").input_ids.to(hf_model_device) + + with torch.no_grad(): + if "opt" in model_name: + original_logits = original_model({"image": original_pixel_values, "text_input": [""]}).logits + logits = hf_model(pixel_values, input_ids).logits + else: + original_logits = original_model( + {"image": original_pixel_values, "text_input": ["\n"], "text_output": ["\n"]} + ).logits + labels = input_ids.masked_fill(input_ids == tokenizer.pad_token_id, -100) + logits = hf_model(pixel_values, input_ids, labels=labels).logits + + assert original_logits.shape == logits.shape + print("First values of original logits:", original_logits[0, :3, :3]) + print("First values of HF logits:", logits[0, :3, :3]) + + # assert values + assert torch.allclose(original_logits.to(logits.device), logits, atol=1e-4) + print("Looks ok!") + + print("Generating a caption...") + prompt = "Question: what object is in this image? Answer:" + input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(hf_model_device) + + set_seed(42) + + original_outputs = original_model.generate( + {"image": original_pixel_values, "prompt": prompt}, use_nucleus_sampling=True, max_length=50 + ) + outputs = hf_model.generate( + pixel_values, + input_ids, + do_sample=True, + num_beams=5, + max_length=30, + min_length=1, + top_p=0.9, + repetition_penalty=1.0, + length_penalty=1.0, + temperature=1, + ) + output_text = processor.batch_decode(outputs, skip_special_tokens=True) + output_text = [text.strip() for text in output_text] + print("Original generation:", original_outputs) + print("HF generation:", output_text) + + if pytorch_dump_folder_path is not None: + processor.save_pretrained(pytorch_dump_folder_path) + hf_model.save_pretrained(pytorch_dump_folder_path) + + if push_to_hub: + processor.push_to_hub(f"nielsr/{model_name}") + hf_model.push_to_hub(f"nielsr/{model_name}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + choices = [ + "blip2-opt-2.7b", + "blip2-opt-6.7b", + "blip2-opt-2.7b-coco", + "blip2-opt-6.7b-coco", + "blip2-flan-t5-xl", + "blip2-flan-t5-xl-coco", + "blip2-flan-t5-xxl", + "blip2-itm-vit-g", + "blip2-itm-vit-g-coco", + ] + parser.add_argument( + "--model_name", + default="blip2-opt-2.7b", + choices=choices, + type=str, + help="Path to hf config.json of model to convert", + ) + parser.add_argument("--pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.") + parser.add_argument( + "--push_to_hub", + action="store_true", + help="Whether to push the model and processor to the hub after converting", + ) + # note: this script is tested on 2 GPUs, as models are compared in float32, + # which requires quite some memory. Hence loading both on a + # separate device is the easiest to compare + parser.add_argument( + "--lavis_device", default="cpu", type=str, help="Torch device to run the conversion, either cpu or cuda." + ) + parser.add_argument( + "--hf_model_device", default="cpu", type=str, help="Torch device to run the conversion, either cpu or cuda." + ) + + args = parser.parse_args() + + convert_blip2_checkpoint( + args.model_name, args.pytorch_dump_folder_path, args.push_to_hub, args.lavis_device, args.hf_model_device + ) diff --git a/docs/transformers/build/lib/transformers/models/blip_2/modeling_blip_2.py b/docs/transformers/build/lib/transformers/models/blip_2/modeling_blip_2.py new file mode 100644 index 0000000000000000000000000000000000000000..d6ec49505bc9ed4326433881e6c9f390060a768e --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/blip_2/modeling_blip_2.py @@ -0,0 +1,2643 @@ +# coding=utf-8 +# Copyright 2023 The Salesforce Authors and The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch BLIP-2 model.""" + +import math +from dataclasses import dataclass +from typing import Any, Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import CrossEntropyLoss + +from ...activations import ACT2FN +from ...generation import GenerationMixin +from ...modeling_outputs import ( + BaseModelOutput, + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPooling, + BaseModelOutputWithPoolingAndCrossAttentions, +) +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer +from ...utils import ( + ModelOutput, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, + torch_int, +) +from ..auto import AutoModelForCausalLM, AutoModelForSeq2SeqLM +from .configuration_blip_2 import Blip2Config, Blip2QFormerConfig, Blip2VisionConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "Salesforce/blip2-opt-2.7b" + + +@dataclass +class Blip2ForConditionalGenerationModelOutput(ModelOutput): + """ + Class defining the outputs of [`Blip2ForConditionalGeneration`]. + + Args: + loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`): + Language modeling loss from the language model. + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head of the language model. + vision_outputs (`BaseModelOutputWithPooling`): + Outputs of the vision encoder. + qformer_outputs (`BaseModelOutputWithPoolingAndCrossAttentions`): + Outputs of the Q-Former (Querying Transformer). + language_model_outputs (`CausalLMOutputWithPast` or `Seq2SeqLMOutput`): + Outputs of the language model. + """ + + loss: Optional[Tuple[torch.FloatTensor]] = None + logits: Optional[Tuple[torch.FloatTensor]] = None + vision_outputs: Optional[torch.FloatTensor] = None + qformer_outputs: Optional[Tuple[torch.FloatTensor]] = None + language_model_outputs: Optional[Tuple[torch.FloatTensor]] = None + + def to_tuple(self) -> Tuple[Any]: + return tuple( + self[k] + if k not in ["vision_outputs", "qformer_outputs", "language_model_outputs"] + else getattr(self, k).to_tuple() + for k in self.keys() + ) + + +@dataclass +class Blip2ImageTextMatchingModelOutput(ModelOutput): + """ + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`): + Contrastive loss for image-text similarity. + logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`): + The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text + similarity scores. + logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`): + The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image + similarity scores. + text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`): + The text embeddings obtained by applying the projection layer to the pooled output. + image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`): + The image embeddings obtained by applying the projection layer to the pooled output. + text_model_output (`BaseModelOutputWithPooling`): + The output of the [`Blip2QFormerModel`]. + vision_model_output (`BaseModelOutputWithPooling`): + The output of the [`Blip2VisionModel`]. + """ + + loss: Optional[torch.FloatTensor] = None + logits_per_image: Optional[torch.FloatTensor] = None + logits_per_text: Optional[torch.FloatTensor] = None + text_embeds: Optional[torch.FloatTensor] = None + image_embeds: Optional[torch.FloatTensor] = None + text_model_output: BaseModelOutputWithPooling = None + vision_model_output: BaseModelOutputWithPooling = None + + def to_tuple(self) -> Tuple[Any]: + return tuple( + self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple() + for k in self.keys() + ) + + +@dataclass +# Copied from transformers.models.clip.modeling_clip.CLIPTextModelOutput with CLIP->Blip2 +class Blip2TextModelOutput(ModelOutput): + """ + Base class for text model's outputs that also contains a pooling of the last hidden states. + + Args: + text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`): + The text embeddings obtained by applying the projection layer to the pooler_output. + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + text_embeds: Optional[torch.FloatTensor] = None + last_hidden_state: Optional[torch.FloatTensor] = None + hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + + +@dataclass +# Copied from transformers.models.clip.modeling_clip.CLIPVisionModelOutput with CLIP->Blip2 +class Blip2VisionModelOutput(ModelOutput): + """ + Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states. + + Args: + image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`): + The image embeddings obtained by applying the projection layer to the pooler_output. + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + image_embeds: Optional[torch.FloatTensor] = None + last_hidden_state: Optional[torch.FloatTensor] = None + hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None + attentions: Optional[Tuple[torch.FloatTensor, ...]] = None + + +# Copied from transformers.models.blip.modeling_blip.BlipVisionEmbeddings with Blip->Blip2 +class Blip2VisionEmbeddings(nn.Module): + def __init__(self, config: Blip2VisionConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.class_embedding = nn.Parameter(torch.randn(1, 1, self.embed_dim)) + + self.patch_embedding = nn.Conv2d( + in_channels=3, out_channels=self.embed_dim, kernel_size=self.patch_size, stride=self.patch_size + ) + + self.num_patches = (self.image_size // self.patch_size) ** 2 + self.num_positions = self.num_patches + 1 + + self.position_embedding = nn.Parameter(torch.randn(1, self.num_positions, self.embed_dim)) + + def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: + """ + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution + images. This method is also adapted to support torch.jit tracing. + + Adapted from: + - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and + - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211 + """ + + num_patches = embeddings.shape[1] - 1 + num_positions = self.position_embedding.shape[1] - 1 + + # always interpolate when tracing to ensure the exported model works for dynamic input shapes + if not torch.jit.is_tracing() and num_patches == num_positions and height == width: + return self.position_embedding + + class_pos_embed = self.position_embedding[:, :1] + patch_pos_embed = self.position_embedding[:, 1:] + + dim = embeddings.shape[-1] + + new_height = height // self.patch_size + new_width = width // self.patch_size + + sqrt_num_positions = torch_int(num_positions**0.5) + patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) + patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed, + size=(new_height, new_width), + mode="bicubic", + align_corners=False, + ) + + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + + return torch.cat((class_pos_embed, patch_pos_embed), dim=1) + + def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding: bool = False) -> torch.Tensor: + batch_size, _, height, width = pixel_values.shape + target_dtype = self.patch_embedding.weight.dtype + patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid] + patch_embeds = patch_embeds.flatten(2).transpose(1, 2) + class_embeds = self.class_embedding.expand(batch_size, 1, -1).to(target_dtype) + embeddings = torch.cat([class_embeds, patch_embeds], dim=1) + if interpolate_pos_encoding: + position_embedding = self.interpolate_pos_encoding(embeddings, height, width) + else: + position_embedding = self.position_embedding + embeddings = embeddings + position_embedding[:, : embeddings.size(1), :].to(target_dtype) + return embeddings + + +class Blip2Attention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." + ) + self.scale = self.head_dim**-0.5 + self.dropout = nn.Dropout(config.attention_dropout) + + # small tweak here compared to CLIP, no bias here + self.qkv = nn.Linear(self.embed_dim, 3 * self.embed_dim, bias=False) + + if config.qkv_bias: + q_bias = nn.Parameter(torch.zeros(self.embed_dim)) + v_bias = nn.Parameter(torch.zeros(self.embed_dim)) + else: + q_bias = None + v_bias = None + + if q_bias is not None: + qkv_bias = torch.cat((q_bias, torch.zeros_like(v_bias, requires_grad=False), v_bias)) + self.qkv.bias = nn.Parameter(qkv_bias) + + self.projection = nn.Linear(self.embed_dim, self.embed_dim) + + def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): + return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() + + def forward( + self, + hidden_states: torch.Tensor, + head_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + bsz, tgt_len, embed_dim = hidden_states.size() + + mixed_qkv = self.qkv(hidden_states) + + mixed_qkv = mixed_qkv.reshape(bsz, tgt_len, 3, self.num_heads, embed_dim // self.num_heads).permute( + 2, 0, 3, 1, 4 + ) + query_states, key_states, value_states = mixed_qkv[0], mixed_qkv[1], mixed_qkv[2] + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_states, key_states.transpose(-1, -2)) + + attention_scores = attention_scores * self.scale + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_states).permute(0, 2, 1, 3) + + new_context_layer_shape = context_layer.size()[:-2] + (self.embed_dim,) + context_layer = context_layer.reshape(new_context_layer_shape) + + output = self.projection(context_layer) + + outputs = (output, attention_probs) if output_attentions else (output, None) + + return outputs + + +# Copied from transformers.models.blip.modeling_blip.BlipMLP +class Blip2MLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.activation_fn = ACT2FN[config.hidden_act] + self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) + self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +# Copied from transformers.models.blip.modeling_blip.BlipEncoderLayer with Blip->Blip2 +class Blip2EncoderLayer(nn.Module): + def __init__(self, config: Blip2Config): + super().__init__() + self.embed_dim = config.hidden_size + self.self_attn = Blip2Attention(config) + self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) + self.mlp = Blip2MLP(config) + self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.FloatTensor]: + """ + Args: + hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` + attention_mask (`torch.FloatTensor`): attention mask of size + `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. + `(config.encoder_attention_heads,)`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + hidden_states = self.layer_norm1(hidden_states) + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + head_mask=attention_mask, + output_attentions=output_attentions, + ) + hidden_states = hidden_states + residual + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + + hidden_states = hidden_states + residual + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class Blip2PreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = Blip2Config + base_model_prefix = "blip" + supports_gradient_checkpointing = True + + _no_split_modules = [ + "Blip2Attention", + "Blip2QFormerMultiHeadAttention", + "Blip2TextEmbeddings", + "T5Block", + "OPTDecoderLayer", + ] + _skip_keys_device_placement = "past_key_values" + + def _init_weights(self, module): + """Initialize the weights""" + factor = self.config.initializer_range + + if isinstance(module, (nn.Linear, nn.Conv2d)): + module.weight.data.normal_(mean=0.0, std=factor) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=factor) + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, Blip2VisionEmbeddings): + nn.init.trunc_normal_(module.position_embedding, mean=0.0, std=factor) + nn.init.trunc_normal_(module.class_embedding, mean=0.0, std=factor) + elif isinstance( + module, + ( + Blip2Model, + Blip2TextModelWithProjection, + Blip2VisionModelWithProjection, + Blip2ForConditionalGeneration, + Blip2ForImageTextRetrieval, + ), + ): + module.query_tokens.data.zero_() + + +BLIP_2_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`Blip2Config`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +BLIP_2_QFORMER_START_DOCSTRING = r""" + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`Blip2QFormerConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +BLIP_2_VISION_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`Blip2Processor`]. See [`Blip2Processor.__call__`] for + details. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + interpolate_pos_encoding (`bool`, *optional*, defaults to `False`): + Whether to interpolate the pre-trained position encodings. +""" + +BLIP_2_TEXT_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + [What are attention masks?](../glossary#attention-mask) + decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Indices of decoder input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are decoder input IDs?](../glossary#decoder-input-ids) + + T5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values` + is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`). + + To know more on how to prepare `decoder_input_ids` for pretraining take a look at [T5 + Training](./t5#training). + decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also + be used by default. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + +BLIP_2_TEXT_WITH_PROJECTION_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): + Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide + it. Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + [What are attention masks?](../glossary#attention-mask) + position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + +BLIP_2_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`Blip2Processor`]. See [`Blip2Processor.__call__`] for + details. + + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of input sequence tokens in the vocabulary of the language model. Input tokens can optionally be + provided to serve as text prompt, which the language model can continue. + + Indices can be obtained using [`Blip2Processor`]. See [`Blip2Processor.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Indices of decoder input sequence tokens in the vocabulary of the language model. Only relevant in case an + encoder-decoder language model (like T5) is used. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. [What are decoder input IDs?](../glossary#decoder-input-ids) + + decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*): + Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also + be used by default. + + Only relevant in case an encoder-decoder language model (like T5) is used. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + interpolate_pos_encoding (`bool`, *optional*, defaults to `False`): + Whether to interpolate the pre-trained position encodings. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). +""" + +BLIP2_IMAGE_TEXT_RETRIEVAL_INPUTS_DOCSTRING = r""" + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`Blip2Processor`]. See [`Blip2Processor.__call__`] for + details. + + input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Indices of input sequence tokens in the vocabulary of the language model. Input tokens can optionally be + provided to serve as text prompt, which the language model can continue. + + Indices can be obtained using [`Blip2Processor`]. See [`Blip2Processor.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + use_image_text_matching_head (`bool`, *optional*): + Whether to return the Image-Text Matching or Contrastive scores. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +BLIP2_QFORMER_INPUTS_DOCSTRING = r""" + Args: + query_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Hidden states to be used in the attention computation. If cross-attention, + will be used for the query (i.e., key and value will use the encoder_hidden_states). + + query_length (`int`, *optional*): + Length of the query, usually based on the number of query tokens. + If no value is provided, query_length will be inferred by the query_embeds. + + attention_mask (`torch.FloatTensor`, *optional*): + Attention mask of size `(batch, sequence_length)` where padding elements + are indicated by 0. + + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, `optional`): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, `optional`): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of: + shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): Contains precomputed key and + value hidden states of the attention blocks. Can be used to speed up decoding. If `past_key_values` are + used, the user can optionally input only the last `decoder_input_ids` (those that don't have their past key + value states given to this model) of shape `(batch_size, 1)` instead of all `decoder_input_ids` of shape + `(batch_size, sequence_length)`. + + use_cache (`bool`, `optional`): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +# Copied from transformers.models.blip.modeling_blip.BlipEncoder with Blip->Blip2 +class Blip2Encoder(nn.Module): + """ + Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a + [`Blip2EncoderLayer`]. + + Args: + config (`Blip2Config`): + The corresponding vision configuration for the `Blip2Encoder`. + """ + + def __init__(self, config: Blip2Config): + super().__init__() + self.config = config + self.layers = nn.ModuleList([Blip2EncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + inputs_embeds, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Embedded representation of the inputs. Should be float, not int tokens. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + hidden_states = inputs_embeds + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + encoder_layer.__call__, + hidden_states, + attention_mask, + output_attentions, + ) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +# Copied from transformers.models.blip.modeling_blip.BlipVisionModel with Blip->Blip2, BLIP->BLIP_2 +class Blip2VisionModel(Blip2PreTrainedModel): + main_input_name = "pixel_values" + config_class = Blip2VisionConfig + + def __init__(self, config: Blip2VisionConfig): + super().__init__(config) + self.config = config + embed_dim = config.hidden_size + + self.embeddings = Blip2VisionEmbeddings(config) + self.encoder = Blip2Encoder(config) + self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) + + self.post_init() + + @add_start_docstrings_to_model_forward(BLIP_2_VISION_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=Blip2VisionConfig) + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + r""" + Returns: + + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if pixel_values is None: + raise ValueError("You have to specify pixel_values") + + hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = encoder_outputs[0] + last_hidden_state = self.post_layernorm(last_hidden_state) + + pooled_output = last_hidden_state[:, 0, :] + pooled_output = self.post_layernorm(pooled_output) + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + def get_input_embeddings(self): + return self.embeddings + + +class Blip2QFormerMultiHeadAttention(nn.Module): + def __init__(self, config, is_cross_attention=False): + super().__init__() + self.config = config + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + "The hidden size (%d) is not a multiple of the number of attention heads (%d)" + % (config.hidden_size, config.num_attention_heads) + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + if is_cross_attention: + self.key = nn.Linear(config.encoder_hidden_size, self.all_head_size) + self.value = nn.Linear(config.encoder_hidden_size, self.all_head_size) + else: + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + self.save_attention = False + + def save_attn_gradients(self, attn_gradients): + self.attn_gradients = attn_gradients + + def get_attn_gradients(self): + return self.attn_gradients + + def save_attention_map(self, attention_map): + self.attention_map = attention_map + + def get_attention_map(self): + return self.attention_map + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + mixed_query_layer = self.query(hidden_states) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + seq_length = hidden_states.size()[1] + position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores) + + if is_cross_attention and self.save_attention: + self.save_attention_map(attention_probs) + attention_probs.register_hook(self.save_attn_gradients) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs_dropped = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs_dropped = attention_probs_dropped * head_mask + + context_layer = torch.matmul(attention_probs_dropped, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + outputs = outputs + (past_key_value,) + return outputs + + +# Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert->Blip2QFormer +class Blip2QFormerSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class Blip2QFormerAttention(nn.Module): + def __init__(self, config, is_cross_attention=False): + super().__init__() + self.attention = Blip2QFormerMultiHeadAttention(config, is_cross_attention) + self.output = Blip2QFormerSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.attention.query = prune_linear_layer(self.attention.query, index) + self.attention.key = prune_linear_layer(self.attention.key, index) + self.attention.value = prune_linear_layer(self.attention.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads) + self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + self_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->Blip2QFormer +class Blip2QFormerIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->Blip2QFormer +class Blip2QFormerOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class Blip2QFormerLayer(nn.Module): + def __init__(self, config, layer_idx): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = Blip2QFormerAttention(config) + + self.layer_idx = layer_idx + + if layer_idx % config.cross_attention_frequency == 0: + self.crossattention = Blip2QFormerAttention(config, is_cross_attention=True) + self.has_cross_attention = True + else: + self.has_cross_attention = False + + if config.use_qformer_text_input: + self.intermediate = Blip2QFormerIntermediate(config) + self.output = Blip2QFormerOutput(config) + + self.intermediate_query = Blip2QFormerIntermediate(config) + self.output_query = Blip2QFormerOutput(config) + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + query_length=0, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + outputs = self_attention_outputs[1:-1] + + present_key_value = self_attention_outputs[-1] + + if query_length > 0: + query_attention_output = attention_output[:, :query_length, :] + + if self.has_cross_attention: + if encoder_hidden_states is None: + raise ValueError("encoder_hidden_states must be given for cross-attention layers") + cross_attention_outputs = self.crossattention( + query_attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + output_attentions=output_attentions, + ) + query_attention_output = cross_attention_outputs[0] + # add cross attentions if we output attention weights + outputs = outputs + cross_attention_outputs[1:-1] + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk_query, + self.chunk_size_feed_forward, + self.seq_len_dim, + query_attention_output, + ) + + if attention_output.shape[1] > query_length: + layer_output_text = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output[:, query_length:, :], + ) + layer_output = torch.cat([layer_output, layer_output_text], dim=1) + else: + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output, + ) + outputs = (layer_output,) + outputs + + outputs = outputs + (present_key_value,) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + def feed_forward_chunk_query(self, attention_output): + intermediate_output = self.intermediate_query(attention_output) + layer_output = self.output_query(intermediate_output, attention_output) + return layer_output + + +class Blip2QFormerEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList( + [Blip2QFormerLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)] + ) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states, + attention_mask=None, + head_mask=None, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=False, + output_hidden_states=False, + return_dict=True, + query_length=0, + ): + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions else None + + next_decoder_cache = () if use_cache else None + + for i in range(self.config.num_hidden_layers): + layer_module = self.layer[i] + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + if getattr(self.config, "gradient_checkpointing", False) and self.training: + if use_cache: + logger.warning( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + layer_outputs = self._gradient_checkpointing_func( + layer_module.__call__, + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + query_length, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if layer_module.has_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +class Blip2TextEmbeddings(nn.Module): + """Construct the embeddings from word and position embeddings.""" + + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + + def forward( + self, + input_ids: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + query_embeds: Optional[torch.FloatTensor] = None, + ) -> torch.Tensor: + if input_ids is not None: + seq_length = input_ids.size()[1] + else: + seq_length = 0 + + if position_ids is None: + position_ids = self.position_ids[:, :seq_length] + + if input_ids is not None: + input_ids = input_ids.to(self.word_embeddings.weight.device) + embeddings = self.word_embeddings(input_ids) + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + + if query_embeds is not None: + # `query_embeds` are kept in fp32 when we use it with Qformer + if query_embeds.dtype != embeddings.dtype: + query_embeds = query_embeds.to(embeddings.dtype) + embeddings = torch.cat((query_embeds, embeddings), dim=1) + else: + embeddings = query_embeds + + return embeddings + + +@add_start_docstrings( + """ + BLIP-2 Querying Transformer (Q-Former). + """, + BLIP_2_QFORMER_START_DOCSTRING, +) +class Blip2QFormerModel(Blip2PreTrainedModel): + def __init__(self, config: Blip2QFormerConfig): + super().__init__(config) + self.config = config + + self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + self.encoder = Blip2QFormerEncoder(config) + + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def get_extended_attention_mask( + self, + attention_mask: torch.Tensor, + input_shape: Tuple[int], + device: torch.device, + has_query: bool = False, + ) -> torch.Tensor: + """ + Makes broadcastable attention and causal masks so that future and masked tokens are ignored. + + Arguments: + attention_mask (`torch.Tensor`): + Mask with ones indicating tokens to attend to, zeros for tokens to ignore. + input_shape (`Tuple[int]`): + The shape of the input to the model. + device (`torch.device`): + The device of the input to the model. + + Returns: + `torch.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`. + """ + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + if attention_mask.dim() == 3: + extended_attention_mask = attention_mask[:, None, :, :] + elif attention_mask.dim() == 2: + # Provided a padding mask of dimensions [batch_size, seq_length] + # - the model is an encoder, so make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length] + extended_attention_mask = attention_mask[:, None, None, :] + else: + raise ValueError( + "Wrong shape for input_ids (shape {}) or attention_mask (shape {})".format( + input_shape, attention_mask.shape + ) + ) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + extended_attention_mask = extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility + extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 + return extended_attention_mask + + @add_start_docstrings_to_model_forward(BLIP2_QFORMER_INPUTS_DOCSTRING) + @replace_return_docstrings( + output_type=BaseModelOutputWithPoolingAndCrossAttentions, config_class=Blip2QFormerConfig + ) + def forward( + self, + query_embeds: torch.FloatTensor, + query_length: Optional[int] = None, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: + r""" + Returns: + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # past_key_values_length + past_key_values_length = ( + past_key_values[0][0].shape[2] - self.config.query_length if past_key_values is not None else 0 + ) + + query_length = ( + query_length if query_length is not None else query_embeds.shape[1] if query_embeds is not None else 0 + ) + + embedding_output = self.layernorm(query_embeds) + embedding_output = self.dropout(embedding_output) + + input_shape = embedding_output.size()[:-1] + batch_size, seq_length = input_shape + device = embedding_output.device + + if attention_mask is None: + attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape, device) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if encoder_hidden_states is not None: + # Qformer and latent query tokens are kept in fp32. We cast `encoder_hidden_states` if not fp32 already + if encoder_hidden_states.dtype != query_embeds.dtype: + encoder_hidden_states = encoder_hidden_states.to(query_embeds.dtype) + + if isinstance(encoder_hidden_states, list): + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[0].size() + else: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + + if isinstance(encoder_attention_mask, list): + encoder_extended_attention_mask = [self.invert_attention_mask(mask) for mask in encoder_attention_mask] + elif encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + query_length=query_length, + ) + sequence_output = encoder_outputs[0] + pooled_output = sequence_output[:, 0, :] + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + +@add_start_docstrings( + """ + BLIP-2 Model for generating text and image features. The model consists of a vision encoder, Querying Transformer + (Q-Former) and a language model. + """, + BLIP_2_START_DOCSTRING, +) +class Blip2Model(Blip2PreTrainedModel): + config_class = Blip2Config + main_input_name = "pixel_values" + _keep_in_fp32_modules = ["query_tokens", "qformer"] + + def __init__(self, config: Blip2Config): + super().__init__(config) + + self.vision_model = Blip2VisionModel(config.vision_config) + + self.query_tokens = nn.Parameter(torch.zeros(1, config.num_query_tokens, config.qformer_config.hidden_size)) + self.qformer = Blip2QFormerModel(config.qformer_config) + + self.language_projection = nn.Linear(config.qformer_config.hidden_size, config.text_config.hidden_size) + if config.use_decoder_only_language_model: + language_model = AutoModelForCausalLM.from_config(config.text_config) + else: + language_model = AutoModelForSeq2SeqLM.from_config(config.text_config) + + # Update _tied_weights_keys using the base model used. + if language_model._tied_weights_keys is not None: + self._tied_weights_keys = [f"language_model.{k}" for k in language_model._tied_weights_keys] + + self.language_model = language_model + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.language_model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.language_model.set_input_embeddings(value) + + def set_output_embeddings(self, new_embeddings): + self.language_model.set_output_embeddings(new_embeddings) + + def get_output_embeddings(self) -> nn.Module: + return self.language_model.get_output_embeddings() + + def get_encoder(self): + return self.language_model.get_encoder() + + def get_decoder(self): + return self.language_model.get_decoder() + + def _tie_weights(self): + if not self.config.use_decoder_only_language_model: + self.language_model.encoder.embed_tokens = self.language_model.shared + self.language_model.decoder.embed_tokens = self.language_model.shared + + @add_start_docstrings_to_model_forward(BLIP_2_TEXT_INPUTS_DOCSTRING) + def get_text_features( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + decoder_input_ids: Optional[torch.Tensor] = None, + decoder_attention_mask: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + r""" + Returns: + text_outputs (`CausalLMOutputWithPast`, or `tuple(torch.FloatTensor)` if `return_dict=False`): + The language model outputs. If `return_dict=True`, the output is a [`CausalLMOutputWithPast`] that + contains the language model logits, the past key values and the hidden states if + `output_hidden_states=True`. + Examples: + ```python + >>> import torch + >>> from transformers import AutoTokenizer, Blip2Model + + >>> model = Blip2Model.from_pretrained("Salesforce/blip2-opt-2.7b") + + >>> tokenizer = AutoTokenizer.from_pretrained("Salesforce/blip2-opt-2.7b") + >>> inputs = tokenizer(["a photo of a cat"], padding=True, return_tensors="pt") + >>> text_features = model.get_text_features(**inputs) + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.config.use_decoder_only_language_model: + text_outputs = self.language_model( + input_ids=input_ids, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + else: + inputs_embeds = self.language_model.get_input_embeddings()(input_ids) + + text_outputs = self.language_model( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + labels=labels, + ) + + return text_outputs + + @add_start_docstrings_to_model_forward(BLIP_2_VISION_INPUTS_DOCSTRING) + def get_image_features( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + ): + r""" + Returns: + vision_outputs (`BaseModelOutputWithPooling` or tuple of `torch.FloatTensor`): + The vision model outputs. If `return_dict=True`, the output is a [`BaseModelOutputWithPooling`] that + contains the image features, the pooled image features and the hidden states if + `output_hidden_states=True`. + Examples: + ```python + >>> import torch + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, Blip2Model + + >>> model = Blip2Model.from_pretrained("Salesforce/blip2-opt-2.7b") + + >>> processor = AutoProcessor.from_pretrained("Salesforce/blip2-opt-2.7b") + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> inputs = processor(images=image, return_tensors="pt") + >>> image_outputs = model.get_image_features(**inputs) + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + interpolate_pos_encoding=interpolate_pos_encoding, + ) + + return vision_outputs + + @add_start_docstrings_to_model_forward(BLIP_2_INPUTS_DOCSTRING) + def get_qformer_features( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + ): + r""" + Returns: + vision_outputs (`BaseModelOutputWithPooling` or tuple of `torch.FloatTensor`): + The vision model outputs. If `return_dict=True`, the output is a [`BaseModelOutputWithPooling`] that + contains the image features, the pooled image features and the hidden states if + `output_hidden_states=True`. + Examples: + ```python + >>> import torch + >>> from PIL import Image + >>> import requests + >>> from transformers import Blip2Processor, Blip2Model + + >>> processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b") + >>> model = Blip2Model.from_pretrained("Salesforce/blip2-opt-2.7b") + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> inputs = processor(images=image, return_tensors="pt") + >>> qformer_outputs = model.get_qformer_features(**inputs) + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + interpolate_pos_encoding=interpolate_pos_encoding, + ) + + image_embeds = vision_outputs[0] + + # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention + image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device) + + query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1) + query_outputs = self.qformer( + query_embeds=query_tokens, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + return query_outputs + + @add_start_docstrings_to_model_forward(BLIP_2_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=Blip2ForConditionalGenerationModelOutput, config_class=Blip2VisionConfig) + def forward( + self, + pixel_values: torch.FloatTensor, + input_ids: torch.FloatTensor, + attention_mask: Optional[torch.LongTensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + labels: Optional[torch.LongTensor] = None, + return_dict: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + ) -> Union[Tuple, Blip2ForConditionalGenerationModelOutput]: + r""" + Returns: + + Examples: + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import Blip2Processor, Blip2Model + >>> import torch + + >>> device = "cuda" if torch.cuda.is_available() else "cpu" + + >>> processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b") + >>> model = Blip2Model.from_pretrained("Salesforce/blip2-opt-2.7b", torch_dtype=torch.float16) + >>> model.to(device) # doctest: +IGNORE_RESULT + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> prompt = "Question: how many cats are there? Answer:" + >>> inputs = processor(images=image, text=prompt, return_tensors="pt").to(device, torch.float16) + + >>> outputs = model(**inputs) + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # step 1: forward the images through the vision encoder, + # to get image embeddings of shape (batch_size, seq_len, hidden_size) + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + interpolate_pos_encoding=interpolate_pos_encoding, + ) + image_embeds = vision_outputs[0] + + # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention + image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device) + + query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1) + query_outputs = self.qformer( + query_embeds=query_tokens, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + query_output = query_outputs[0] + + # Qformer is kept in fp32, we downcast the output back if needed + if query_output.dtype != image_embeds.dtype: + query_output = query_output.to(image_embeds.dtype) + + # step 3: use the language model, conditioned on the query outputs and the prompt + language_model_inputs = self.language_projection(query_output) + language_model_attention_mask = torch.ones( + language_model_inputs.size()[:-1], dtype=torch.long, device=language_model_inputs.device + ) + inputs_embeds = self.language_model.get_input_embeddings()(input_ids) + inputs_embeds = torch.cat([language_model_inputs, inputs_embeds], dim=1) + + if attention_mask is None: + attention_mask = torch.ones_like(input_ids) + expected_device = language_model_attention_mask.device + attention_mask = torch.cat([language_model_attention_mask, attention_mask.to(expected_device)], dim=1) + + if self.config.use_decoder_only_language_model: + outputs = self.language_model( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + logits = outputs.logits if return_dict else outputs[0] + loss = None + # we compute the loss here since we need to take into account the sequence length of the query embeds + if labels is not None: + labels = labels.to(logits.device) + logits = logits[:, -labels.size(1) :, :] + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous().to(logits.device) + + # Flatten the tokens + loss_fct = CrossEntropyLoss(reduction="mean") + + loss = loss_fct(shift_logits.view(-1, self.config.text_config.vocab_size), shift_labels.view(-1)) + else: + outputs = self.language_model( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, # toggle for easier access to loss/logits below + labels=labels, + ) + loss = outputs.loss + logits = outputs.logits + outputs = outputs.to_tuple() if not return_dict else outputs + + if not return_dict: + output = (logits, vision_outputs, query_outputs, outputs) + return ((loss,) + output) if loss is not None else output + + return Blip2ForConditionalGenerationModelOutput( + loss=loss, + logits=logits, + vision_outputs=vision_outputs, + qformer_outputs=query_outputs, + language_model_outputs=outputs, + ) + + +@add_start_docstrings( + """ + BLIP-2 Text Model with a projection layer on top (a linear layer on top of the pooled output). + """, + BLIP_2_START_DOCSTRING, +) +class Blip2TextModelWithProjection(Blip2PreTrainedModel): + supports_gradient_checkpointing = False + _keep_in_fp32_modules = ["query_tokens", "qformer"] + + def __init__(self, config: Blip2Config): + super().__init__(config) + + self.query_tokens = nn.Parameter(torch.zeros(1, config.num_query_tokens, config.qformer_config.hidden_size)) + self.embeddings = Blip2TextEmbeddings(config.qformer_config) + self.qformer = Blip2QFormerModel(config.qformer_config) + + # text projection layer + self.text_projection = nn.Linear(config.qformer_config.hidden_size, config.image_text_hidden_size) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + @add_start_docstrings_to_model_forward(BLIP_2_TEXT_WITH_PROJECTION_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=Blip2TextModelOutput, config_class=Blip2Config) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, Blip2TextModelOutput]: + r""" + Returns: + + Examples: + + ```python + >>> import torch + >>> from transformers import AutoProcessor, Blip2TextModelWithProjection + + >>> device = "cuda" if torch.cuda.is_available() else "cpu" + + >>> model = Blip2TextModelWithProjection.from_pretrained( + ... "Salesforce/blip2-itm-vit-g", torch_dtype=torch.float16 + ... ) + + >>> model.to(device) # doctest: +IGNORE_RESULT + + >>> processor = AutoProcessor.from_pretrained("Salesforce/blip2-itm-vit-g") + + >>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], return_tensors="pt").to(device) + + >>> outputs = model(**inputs) + >>> text_embeds = outputs.text_embeds + >>> print(text_embeds.shape) + torch.Size([2, 7, 256]) + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + query_embeds = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + ) + + text_outputs = self.qformer( + query_embeds=query_embeds, + query_length=0, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = text_outputs[0] if not return_dict else text_outputs.last_hidden_state + + text_embeds = self.text_projection(pooled_output) + text_embeds = nn.functional.normalize(text_embeds, dim=-1) + + if not return_dict: + outputs = (text_embeds, text_outputs[0]) + text_outputs[2:] + return tuple(output for output in outputs if output is not None) + + return Blip2TextModelOutput( + text_embeds=text_embeds, + last_hidden_state=text_outputs.last_hidden_state, + hidden_states=text_outputs.hidden_states, + attentions=text_outputs.attentions, + ) + + +@add_start_docstrings( + """ + BLIP-2 Vision Model with a projection layer on top (a linear layer on top of the pooled output). + """, + BLIP_2_START_DOCSTRING, +) +class Blip2VisionModelWithProjection(Blip2PreTrainedModel): + main_input_name = "pixel_values" + _keep_in_fp32_modules = ["query_tokens", "qformer"] + + def __init__(self, config: Blip2Config): + super().__init__(config) + + self.vision_model = Blip2VisionModel(config.vision_config) + + self.query_tokens = nn.Parameter(torch.zeros(1, config.num_query_tokens, config.qformer_config.hidden_size)) + self.qformer = Blip2QFormerModel(config.qformer_config) + + # vision projection layer + self.vision_projection = nn.Linear(config.qformer_config.hidden_size, config.image_text_hidden_size) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> nn.Module: + return self.vision_model.embeddings.patch_embedding + + @add_start_docstrings_to_model_forward(BLIP_2_VISION_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=Blip2VisionModelOutput, config_class=Blip2Config) + def forward( + self, + pixel_values: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, Blip2VisionModelOutput]: + r""" + Returns: + + Examples: + + ```python + >>> import torch + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, Blip2VisionModelWithProjection + + >>> device = "cuda" if torch.cuda.is_available() else "cpu" + + >>> processor = AutoProcessor.from_pretrained("Salesforce/blip2-itm-vit-g") + >>> model = Blip2VisionModelWithProjection.from_pretrained( + ... "Salesforce/blip2-itm-vit-g", torch_dtype=torch.float16 + ... ) + >>> model.to(device) # doctest: +IGNORE_RESULT + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + + >>> inputs = processor(images=image, return_tensors="pt").to(device, torch.float16) + + >>> outputs = model(**inputs) + >>> image_embeds = outputs.image_embeds + >>> print(image_embeds.shape) + torch.Size([1, 32, 256]) + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = vision_outputs[0] if not return_dict else vision_outputs.last_hidden_state + + image_attention_mask = torch.ones(pooled_output.size()[:-1], dtype=torch.long, device=pooled_output.device) + + query_tokens = self.query_tokens.expand(pooled_output.shape[0], -1, -1) + + query_outputs = self.qformer( + query_embeds=query_tokens, + encoder_hidden_states=pooled_output, + encoder_attention_mask=image_attention_mask, + return_dict=return_dict, + ) + + embeds = query_outputs[0] if not return_dict else query_outputs.last_hidden_state + image_embeds = self.vision_projection(embeds) + image_embeds = nn.functional.normalize(image_embeds, dim=-1) + + if not return_dict: + outputs = (image_embeds, vision_outputs[0]) + vision_outputs[2:] + return tuple(output for output in outputs if output is not None) + + return Blip2VisionModelOutput( + image_embeds=image_embeds, + last_hidden_state=vision_outputs.last_hidden_state, + hidden_states=vision_outputs.hidden_states, + attentions=vision_outputs.attentions, + ) + + +@add_start_docstrings( + """ + BLIP-2 Model for generating text given an image and an optional text prompt. The model consists of a vision + encoder, Querying Transformer (Q-Former) and a language model. + + One can optionally pass `input_ids` to the model, which serve as a text prompt, to make the language model continue + the prompt. Otherwise, the language model starts generating text from the [BOS] (beginning-of-sequence) token. + + + + Note that Flan-T5 checkpoints cannot be cast to float16. They are pre-trained using bfloat16. + + + """, + BLIP_2_START_DOCSTRING, +) +class Blip2ForConditionalGeneration(Blip2PreTrainedModel, GenerationMixin): + config_class = Blip2Config + main_input_name = "pixel_values" + _supports_cache_class = True + _supports_static_cache = True + _supports_quantized_cache = False # not all LM bacbones support (e.g. T5) + _keep_in_fp32_modules = ["query_tokens", "qformer"] + + def __init__(self, config: Blip2Config): + super().__init__(config) + + self.vision_model = Blip2VisionModel(config.vision_config) + + self.query_tokens = nn.Parameter(torch.zeros(1, config.num_query_tokens, config.qformer_config.hidden_size)) + self.qformer = Blip2QFormerModel(config.qformer_config) + + self.language_projection = nn.Linear(config.qformer_config.hidden_size, config.text_config.hidden_size) + if config.use_decoder_only_language_model: + language_model = AutoModelForCausalLM.from_config(config.text_config) + else: + language_model = AutoModelForSeq2SeqLM.from_config(config.text_config) + + # Update _tied_weights_keys using the base model used. + if language_model._tied_weights_keys is not None: + self._tied_weights_keys = [f"language_model.{k}" for k in language_model._tied_weights_keys] + + self.language_model = language_model + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.language_model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.language_model.set_input_embeddings(value) + + def set_output_embeddings(self, new_embeddings): + self.language_model.set_output_embeddings(new_embeddings) + + def get_output_embeddings(self) -> nn.Module: + return self.language_model.get_output_embeddings() + + def get_encoder(self): + return self.language_model.get_encoder() + + def get_decoder(self): + return self.language_model.get_decoder() + + def _tie_weights(self): + if not self.config.use_decoder_only_language_model: + self.language_model.encoder.embed_tokens = self.language_model.shared + self.language_model.decoder.embed_tokens = self.language_model.shared + + def _preprocess_accelerate(self): + r""" + Some pre-processing hacks to make the model `accelerate` compatible. Check + https://github.com/huggingface/transformers/pull/21707 for more details. + """ + hf_device_map = self.hf_device_map + + if len(hf_device_map) > 1 and "language_model" not in hf_device_map and torch.cuda.device_count() > 1: + # warn users about unexpected behavior when using multi-GPU + BLIP-2 + `accelerate`. + logger.warning( + "The `language_model` is not in the `hf_device_map` dictionary and you are running your script" + " in a multi-GPU environment. this may lead to unexpected behavior when using `accelerate`." + " Please pass a `device_map` that contains `language_model` to remove this warning." + " Please refer to https://github.com/huggingface/blog/blob/main/accelerate-large-models.md for" + " more details on creating a `device_map` for large models.", + ) + + if hasattr(self.language_model, "_hf_hook"): + self.language_model._hf_hook.io_same_device = True # For `generate` compatibility + + @add_start_docstrings_to_model_forward(BLIP_2_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=Blip2ForConditionalGenerationModelOutput, config_class=Blip2VisionConfig) + def forward( + self, + pixel_values: torch.FloatTensor, + input_ids: torch.FloatTensor, + attention_mask: Optional[torch.LongTensor] = None, + decoder_input_ids: Optional[torch.LongTensor] = None, + decoder_attention_mask: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + labels: Optional[torch.LongTensor] = None, + return_dict: Optional[bool] = None, + interpolate_pos_encoding: bool = False, + use_cache: Optional[bool] = None, + ) -> Union[Tuple, Blip2ForConditionalGenerationModelOutput]: + r""" + Returns: + + Examples: + + Prepare processor, model and image input + + ```python + >>> from PIL import Image + >>> import requests + >>> from transformers import Blip2Processor, Blip2ForConditionalGeneration + >>> import torch + + >>> device = "cuda" if torch.cuda.is_available() else "cpu" + + >>> processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b") + >>> model = Blip2ForConditionalGeneration.from_pretrained( + ... "Salesforce/blip2-opt-2.7b", load_in_8bit=True, device_map={"": 0}, torch_dtype=torch.float16 + ... ) # doctest: +IGNORE_RESULT + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + ``` + + Image captioning (without providing a text prompt): + + ```python + >>> inputs = processor(images=image, return_tensors="pt").to(device, torch.float16) + + >>> generated_ids = model.generate(**inputs) + >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip() + >>> print(generated_text) + two cats laying on a couch + ``` + + Visual question answering (prompt = question): + + ```python + >>> prompt = "Question: how many cats are there? Answer:" + >>> inputs = processor(images=image, text=prompt, return_tensors="pt").to(device="cuda", dtype=torch.float16) + + >>> generated_ids = model.generate(**inputs) + >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip() + >>> print(generated_text) + two + ``` + + Note that int8 inference is also supported through [bitsandbytes](https://github.com/TimDettmers/bitsandbytes). + This greatly reduces the amount of memory used by the model while maintaining the same performance. + + ```python + >>> model = Blip2ForConditionalGeneration.from_pretrained( + ... "Salesforce/blip2-opt-2.7b", load_in_8bit=True, device_map={"": 0}, torch_dtype=torch.bfloat16 + ... ) # doctest: +IGNORE_RESULT + + >>> inputs = processor(images=image, text=prompt, return_tensors="pt").to(device="cuda", dtype=torch.bfloat16) + + >>> generated_ids = model.generate(**inputs) + >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip() + >>> print(generated_text) + two + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # step 1: forward the images through the vision encoder, + # to get image embeddings of shape (batch_size, seq_len, hidden_size) + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + interpolate_pos_encoding=interpolate_pos_encoding, + ) + image_embeds = vision_outputs[0] + + # step 2: forward the query tokens through the QFormer, using the image embeddings for cross-attention + image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device) + + query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1) + query_outputs = self.qformer( + query_embeds=query_tokens, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + query_output = query_outputs[0] + + # Qformer is kept in fp32, we downcast the output back if needed + if query_output.dtype != image_embeds.dtype: + query_output = query_output.to(image_embeds.dtype) + + # step 3: use the language model, conditioned on the query outputs and the prompt + language_model_inputs = self.language_projection(query_output) + language_model_attention_mask = torch.ones( + language_model_inputs.size()[:-1], dtype=torch.long, device=language_model_inputs.device + ) + inputs_embeds = self.language_model.get_input_embeddings()(input_ids) + if attention_mask is None: + attention_mask = torch.ones_like(input_ids) + + # if the model already has "image_token_id" then the input is expanded to account for image embeds + # otherwise we expand manually by concating + if getattr(self.config, "image_token_id", None) is not None: + special_image_mask = (input_ids == self.config.image_token_id).unsqueeze(-1).expand_as(inputs_embeds) + language_model_inputs = language_model_inputs.to(inputs_embeds.device, inputs_embeds.dtype) + inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, language_model_inputs) + else: + logger.warning_once( + "Expanding inputs for image tokens in BLIP-2 should be done in processing. " + "Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your BLIP-2 model. " + "Using processors without these attributes in the config is deprecated and will throw an error in v4.50." + ) + inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1) + attention_mask = torch.cat( + [language_model_attention_mask, attention_mask.to(language_model_attention_mask.device)], dim=1 + ) + + if self.config.use_decoder_only_language_model: + outputs = self.language_model( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + use_cache=use_cache, + ) + logits = outputs.logits if return_dict else outputs[0] + loss = None + # we compute the loss here since we need to take into account the sequence length of the query embeds + if labels is not None: + labels = labels.to(logits.device) + logits = logits[:, -labels.size(1) :, :] + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous().to(logits.device) + + # Flatten the tokens + loss_fct = CrossEntropyLoss(reduction="mean") + + loss = loss_fct(shift_logits.view(-1, self.config.text_config.vocab_size), shift_labels.view(-1)) + else: + outputs = self.language_model( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + decoder_input_ids=decoder_input_ids, + decoder_attention_mask=decoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, # toggle for easier access to loss/logits below + labels=labels, + use_cache=use_cache, + ) + loss = outputs.loss + logits = outputs.logits + outputs = outputs.to_tuple() if not return_dict else outputs + + if not return_dict: + output = (logits, vision_outputs, query_outputs, outputs) + return ((loss,) + output) if loss is not None else output + + return Blip2ForConditionalGenerationModelOutput( + loss=loss, + logits=logits, + vision_outputs=vision_outputs, + qformer_outputs=query_outputs, + language_model_outputs=outputs, + ) + + @torch.no_grad() + def generate( + self, + pixel_values: torch.FloatTensor, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.LongTensor] = None, + interpolate_pos_encoding: bool = False, + **generate_kwargs, + ) -> torch.LongTensor: + """ + Overrides `generate` function to be able to use the model as a conditional generator. + + Args: + pixel_values (`torch.FloatTensor` of shape (batch_size, num_channels, height, width)): + Input images to be processed. + input_ids (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*): + The sequence used as a prompt for the generation. + attention_mask (`torch.LongTensor` of shape (batch_size, sequence_length), *optional*): + Mask to avoid performing attention on padding token indices + + Returns: + captions (list): A list of strings of length batch_size * num_captions. + """ + if hasattr(self, "hf_device_map"): + # preprocess for `accelerate` + self._preprocess_accelerate() + + batch_size = pixel_values.shape[0] + image_embeds = self.vision_model( + pixel_values, + return_dict=True, + interpolate_pos_encoding=interpolate_pos_encoding, + ).last_hidden_state + image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device) + + query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1) + query_outputs = self.qformer( + query_embeds=query_tokens, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + return_dict=True, + ) + query_output = query_outputs.last_hidden_state + + # Qformer is kept in fp32, we downcast the output back if needed + if query_output.dtype != image_embeds.dtype: + query_output = query_output.to(image_embeds.dtype) + + language_model_inputs = self.language_projection(query_output) + language_attention_mask = torch.ones( + language_model_inputs.size()[:-1], dtype=torch.long, device=language_model_inputs.device + ) + + if input_ids is None: + start_tokens = [self.config.text_config.bos_token_id] + if getattr(self.config, "image_token_id", None) is not None: + start_tokens = [self.config.image_token_id] * self.config.num_query_tokens + start_tokens + input_ids = torch.tensor([start_tokens], dtype=torch.long, device=image_embeds.device) + input_ids = input_ids.repeat(batch_size, 1) + + inputs_embeds = self.get_input_embeddings()(input_ids) + if attention_mask is None: + attention_mask = torch.ones_like(input_ids) + + # if the model already has "image_token_id" then the input is expanded to account for image embeds + # otherwise we expand manually by concatenating + if getattr(self.config, "image_token_id", None) is not None: + special_image_mask = (input_ids == self.config.image_token_id).unsqueeze(-1).expand_as(inputs_embeds) + inputs_embeds[special_image_mask] = language_model_inputs.flatten() + else: + logger.warning_once( + "Expanding inputs for image tokens in BLIP-2 should be done in processing. " + "Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your BLIP-2 model. " + "Using processors without these attributes in the config is deprecated and will throw an error in v4.50." + ) + inputs_embeds = torch.cat([language_model_inputs, inputs_embeds.to(language_model_inputs.device)], dim=1) + attention_mask = torch.cat( + [language_attention_mask, attention_mask.to(language_attention_mask.device)], dim=1 + ) + + # add image_embeds length to max_length, so that the final max_length in counted only on token embeds + # -1 is to account for the prepended BOS after `generate.` + # TODO (joao, raushan): refactor `generate` to avoid these operations with VLMs + if not self.language_model.config.is_encoder_decoder: + generate_kwargs["max_length"] = ( + generate_kwargs.get("max_length", 20) + language_model_inputs.shape[1] - 1 + ) + generate_kwargs["min_length"] = generate_kwargs.get("min_length", 0) + language_model_inputs.shape[1] + + inputs = {"inputs_embeds": inputs_embeds, "attention_mask": attention_mask} + if not self.language_model.config.is_encoder_decoder: + inputs["input_ids"] = input_ids + + outputs = self.language_model.generate(**inputs, **generate_kwargs) + return outputs + + +@add_start_docstrings( + """ + BLIP-2 Model with a vision and text projector, and a classification head on top. The model is used in the context + of image-text retrieval. Given an image and a text, the model returns the probability of the text being relevant to + the image. + """, + BLIP_2_START_DOCSTRING, +) +class Blip2ForImageTextRetrieval(Blip2PreTrainedModel): + main_input_name = "pixel_values" + _keep_in_fp32_modules = ["query_tokens", "qformer"] + + def __init__(self, config: Blip2Config): + super().__init__(config) + + self.vision_model = Blip2VisionModel(config.vision_config) + + self.query_tokens = nn.Parameter(torch.zeros(1, config.num_query_tokens, config.qformer_config.hidden_size)) + + self.embeddings = Blip2TextEmbeddings(config.qformer_config) + self.qformer = Blip2QFormerModel(config.qformer_config) + + # vision projection layer + self.vision_projection = nn.Linear(config.qformer_config.hidden_size, config.image_text_hidden_size) + + # text projection layer + self.text_projection = nn.Linear(config.qformer_config.hidden_size, config.image_text_hidden_size) + + # image text matching head + self.itm_head = nn.Linear(config.qformer_config.hidden_size, 2) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + @add_start_docstrings_to_model_forward(BLIP2_IMAGE_TEXT_RETRIEVAL_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=Blip2ImageTextMatchingModelOutput, config_class=Blip2Config) + def forward( + self, + pixel_values: torch.FloatTensor, + input_ids: torch.LongTensor, + attention_mask: Optional[torch.LongTensor] = None, + use_image_text_matching_head: Optional[bool] = False, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, Blip2ImageTextMatchingModelOutput]: + r""" + Returns: + + Examples: + + ```python + >>> import torch + >>> from PIL import Image + >>> import requests + >>> from transformers import AutoProcessor, Blip2ForImageTextRetrieval + + >>> device = "cuda" if torch.cuda.is_available() else "cpu" + + >>> model = Blip2ForImageTextRetrieval.from_pretrained("Salesforce/blip2-itm-vit-g", torch_dtype=torch.float16) + >>> processor = AutoProcessor.from_pretrained("Salesforce/blip2-itm-vit-g") + + >>> model.to(device) # doctest: +IGNORE_RESULT + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> text = "two cats laying on a pink blanket" + + >>> inputs = processor(images=image, text=text, return_tensors="pt").to(device, torch.float16) + >>> itm_out = model(**inputs, use_image_text_matching_head=True) + >>> logits_per_image = torch.nn.functional.softmax(itm_out.logits_per_image, dim=1) + >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities + + >>> print(f"{probs[0][0]:.1%} that image 0 is not '{text}'") + 26.9% that image 0 is not 'two cats laying on a pink blanket' + + >>> print(f"{probs[0][1]:.1%} that image 0 is '{text}'") + 73.0% that image 0 is 'two cats laying on a pink blanket' + + >>> texts = ["a photo of a cat", "a photo of a dog"] + + >>> inputs = processor(images=image, text=texts, return_tensors="pt").to(device, torch.float16) + >>> itc_out = model(**inputs, use_image_text_matching_head=False) + >>> logits_per_image = itc_out.logits_per_image # this is the image-text similarity score + >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities + + >>> print(f"{probs[0][0]:.1%} that image 0 is '{texts[0]}'") + 55.3% that image 0 is 'a photo of a cat' + + >>> print(f"{probs[0][1]:.1%} that image 0 is '{texts[1]}'") + 44.7% that image 0 is 'a photo of a dog' + ``` + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + + vision_outputs = self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + image_embeds = vision_outputs[0] + image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device) + + if use_image_text_matching_head: + query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1) + query_attention_mask = torch.ones(query_tokens.size()[:-1], dtype=torch.long, device=query_tokens.device) + attention_mask = torch.cat([query_attention_mask, attention_mask], dim=1) + + query_embeds = self.embeddings( + input_ids=input_ids, + query_embeds=query_tokens, + ) + + text_outputs = self.qformer( + query_embeds=query_embeds, + query_length=query_tokens.shape[1], + attention_mask=attention_mask, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + return_dict=return_dict, + ) + text_embeds = text_outputs[0] if not return_dict else text_outputs.last_hidden_state + + output = self.itm_head(text_embeds[:, : query_tokens.size(1), :]) + logits_per_image = output.mean(dim=1) + logits_per_text = logits_per_image.t() + else: + query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1) + query_outputs = self.qformer( + query_embeds=query_tokens, + encoder_hidden_states=image_embeds, + encoder_attention_mask=image_attention_mask, + return_dict=return_dict, + ) + image_embeds = query_outputs[0] if not return_dict else query_outputs.last_hidden_state + + query_embeds = self.embeddings( + input_ids=input_ids, + ) + text_outputs = self.qformer( + query_embeds=query_embeds, + query_length=0, + attention_mask=attention_mask, + return_dict=return_dict, + ) + question_embeds = text_outputs[0] if not return_dict else text_outputs.last_hidden_state + + # normalized features + image_embeds = nn.functional.normalize(self.vision_projection(image_embeds), dim=-1) + text_embeds = nn.functional.normalize(self.text_projection(question_embeds[:, 0, :]), dim=-1) + + # cosine similarity as logits + logits_per_image = torch.matmul(image_embeds, text_embeds.t()) + logits_per_image, _ = logits_per_image.max(dim=1) + + logits_per_text = logits_per_image.t() + + if not return_dict: + output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs) + return output + + return Blip2ImageTextMatchingModelOutput( + logits_per_image=logits_per_image, + logits_per_text=logits_per_text, + text_embeds=text_embeds, + image_embeds=image_embeds, + text_model_output=text_outputs, + vision_model_output=vision_outputs, + ) + + +__all__ = [ + "Blip2Model", + "Blip2VisionModelWithProjection", + "Blip2QFormerModel", + "Blip2PreTrainedModel", + "Blip2ForConditionalGeneration", + "Blip2ForImageTextRetrieval", + "Blip2VisionModel", + "Blip2TextModelWithProjection", +] diff --git a/docs/transformers/build/lib/transformers/models/blip_2/processing_blip_2.py b/docs/transformers/build/lib/transformers/models/blip_2/processing_blip_2.py new file mode 100644 index 0000000000000000000000000000000000000000..36b663dccb7651aba9320948e2594638cff5bb5e --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/blip_2/processing_blip_2.py @@ -0,0 +1,193 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Processor class for BLIP-2. +""" + +from typing import List, Optional, Union + +from ...image_processing_utils import BatchFeature +from ...image_utils import ImageInput +from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack +from ...tokenization_utils_base import ( + AddedToken, + BatchEncoding, + PreTokenizedInput, + TextInput, +) +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class Blip2ProcessorKwargs(ProcessingKwargs, total=False): + _defaults = { + "text_kwargs": { + "add_special_tokens": True, + "padding": False, + "stride": 0, + "return_overflowing_tokens": False, + "return_special_tokens_mask": False, + "return_offsets_mapping": False, + "return_token_type_ids": False, + "return_length": False, + "verbose": True, + }, + "images_kwargs": {}, + } + + +class Blip2Processor(ProcessorMixin): + r""" + Constructs a BLIP-2 processor which wraps a BLIP image processor and an OPT/T5 tokenizer into a single processor. + + [`BlipProcessor`] offers all the functionalities of [`BlipImageProcessor`] and [`AutoTokenizer`]. See the docstring + of [`~BlipProcessor.__call__`] and [`~BlipProcessor.decode`] for more information. + + Args: + image_processor (`BlipImageProcessor`): + An instance of [`BlipImageProcessor`]. The image processor is a required input. + tokenizer (`AutoTokenizer`): + An instance of ['PreTrainedTokenizer`]. The tokenizer is a required input. + num_query_tokens (`int`, *optional*): + Number of tokens used by the Qformer as queries, should be same as in model's config. + """ + + attributes = ["image_processor", "tokenizer"] + valid_kwargs = ["num_query_tokens"] + image_processor_class = ("BlipImageProcessor", "BlipImageProcessorFast") + tokenizer_class = "AutoTokenizer" + + def __init__(self, image_processor, tokenizer, num_query_tokens=None, **kwargs): + tokenizer.return_token_type_ids = False + self.current_processor = image_processor + if not hasattr(tokenizer, "image_token"): + self.image_token = AddedToken("", normalized=False, special=True) + tokenizer.add_tokens([self.image_token], special_tokens=True) + else: + self.image_token = tokenizer.image_token + self.num_query_tokens = num_query_tokens + + super().__init__(image_processor, tokenizer) + + def __call__( + self, + images: ImageInput = None, + text: Optional[Union[str, List[str], TextInput, PreTokenizedInput]] = None, + audio=None, + videos=None, + **kwargs: Unpack[Blip2ProcessorKwargs], + ) -> BatchEncoding: + """ + This method uses [`BlipImageProcessor.__call__`] method to prepare image(s) for the model, and + [`BertTokenizerFast.__call__`] to prepare text for the model. + + Please refer to the docstring of the above two methods for more information. + Args: + images (`ImageInput`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. Both channels-first and channels-last formats are supported. + text (`TextInput`, `PreTokenizedInput`, `List[TextInput]`, `List[PreTokenizedInput]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Acceptable values are: + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. + """ + if images is None and text is None: + raise ValueError("You have to specify either images or text.") + output_kwargs = self._merge_kwargs( + Blip2ProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + # BC for explicit return_tensors + if "return_tensors" in output_kwargs["common_kwargs"]: + return_tensors = output_kwargs["common_kwargs"].pop("return_tensors", None) + else: + return_tensors = None + encoding = BatchFeature(tensor_type=return_tensors) + if text is not None: + if isinstance(text, str): + text = [text] + elif not isinstance(text, list) and not isinstance(text[0], str): + raise ValueError("Invalid input text. Please provide a string, or a list of strings") + + text_encoding = {} + + return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None) + _text_encoding = self.tokenizer(text, **output_kwargs["text_kwargs"], return_tensors=None) + output_kwargs["text_kwargs"]["return_tensors"] = return_tensors + + # if we know how many query tokens, expand text inside processor. We need this hacky manipulation + # because BLIP expects image tokens to be at the beginning even before BOS token + if self.num_query_tokens is not None: + image_tokens = self.image_token.content * self.num_query_tokens + image_token_encoding = self.tokenizer( + [image_tokens] * len(text), add_special_tokens=False, return_tensors=None + ) + for k in _text_encoding: + text_encoding[k] = [ + img_encoding + txt_encoding + for img_encoding, txt_encoding in zip(image_token_encoding[k], _text_encoding[k]) + ] + else: + text_encoding = _text_encoding + logger.warning_once( + "Expanding inputs for image tokens in BLIP-2 should be done in processing. " + "Please follow instruction here (https://gist.github.com/zucchini-nlp/e9f20b054fa322f84ac9311d9ab67042) to update your BLIP-2 model. " + "Using processors without these attributes in the config is deprecated and will throw an error in v4.50." + ) + + # cast to desired return tensors type + encoding.update(BatchEncoding(text_encoding, tensor_type=return_tensors)) + # add pixel_values encoding. If we also have text_encoding, update image encoding and return it. + # else, return the text encoding. + + if images is not None: + image_encoding = self.image_processor(images, **output_kwargs["images_kwargs"]) + encoding.update(image_encoding) + return encoding + + # Copied from transformers.models.blip.processing_blip.BlipProcessor.batch_decode with BertTokenizerFast->PreTrainedTokenizer + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. + """ + return self.tokenizer.batch_decode(*args, **kwargs) + + # Copied from transformers.models.blip.processing_blip.BlipProcessor.decode with BertTokenizerFast->PreTrainedTokenizer + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to + the docstring of this method for more information. + """ + return self.tokenizer.decode(*args, **kwargs) + + @property + # Copied from transformers.models.blip.processing_blip.BlipProcessor.model_input_names + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + image_processor_input_names = self.image_processor.model_input_names + return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) + + +__all__ = ["Blip2Processor"] diff --git a/docs/transformers/build/lib/transformers/models/bloom/__init__.py b/docs/transformers/build/lib/transformers/models/bloom/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..72d1d6e6ca4724235ce46978e5c11e84583d4033 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bloom/__init__.py @@ -0,0 +1,29 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_bloom import * + from .modeling_bloom import * + from .modeling_flax_bloom import * + from .tokenization_bloom_fast import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/docs/transformers/build/lib/transformers/models/bloom/configuration_bloom.py b/docs/transformers/build/lib/transformers/models/bloom/configuration_bloom.py new file mode 100644 index 0000000000000000000000000000000000000000..ca10c7ce7ed4ef795f8aefad01bcf93bc88099ee --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bloom/configuration_bloom.py @@ -0,0 +1,237 @@ +# coding=utf-8 +# Copyright 2022 the Big Science Workshop and HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Bloom configuration""" + +from collections import OrderedDict +from typing import TYPE_CHECKING, Any, List, Mapping, Optional + +from packaging import version + + +if TYPE_CHECKING: + from ... import PreTrainedTokenizer, TensorType + +from ...configuration_utils import PretrainedConfig +from ...onnx import OnnxConfigWithPast, PatchingSpec +from ...utils import is_torch_available, logging + + +logger = logging.get_logger(__name__) + + +class BloomConfig(PretrainedConfig): + """ + This is the configuration class to store the configuration of a [`BloomModel`]. It is used to instantiate a Bloom + model according to the specified arguments, defining the model architecture. Instantiating a configuration with the + defaults will yield a similar configuration to the Bloom architecture + [bigscience/bloom](https://huggingface.co/bigscience/bloom). + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 250880): + Vocabulary size of the Bloom model. Defines the maximum number of different tokens that can be represented + by the `inputs_ids` passed when calling [`BloomModel`]. Check [this + discussion](https://huggingface.co/bigscience/bloom/discussions/120#633d28389addb8530b406c2a) on how the + `vocab_size` has been defined. + hidden_size (`int`, *optional*, defaults to 64): + Dimensionality of the embeddings and hidden states. + n_layer (`int`, *optional*, defaults to 2): + Number of hidden layers in the Transformer encoder. + n_head (`int`, *optional*, defaults to 8): + Number of attention heads for each attention layer in the Transformer encoder. + layer_norm_epsilon (`float`, *optional*, defaults to 1e-5): + The epsilon to use in the layer normalization layers. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + apply_residual_connection_post_layernorm (`bool`, *optional*, defaults to `False`): + If enabled, use the layer norm of the hidden states as the residual in the transformer blocks + hidden_dropout (`float`, *optional*, defaults to 0.1): + Dropout rate of the dropout function on the bias dropout. + attention_dropout (`float`, *optional*, defaults to 0.1): + Dropout rate applied to the attention probs + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). + pretraining_tp (`int`, *optional*, defaults to `1`): + Experimental feature. Tensor parallelism rank used during pretraining with Megatron. Please refer to [this + document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is + necessary to ensure exact reproducibility of the pretraining results. Please refer to [this + issue](https://github.com/pytorch/pytorch/issues/76232). Note also that this is enabled only when + `slow_but_exact=True`. + slow_but_exact (`bool`, *optional*, defaults to `False`): + Experimental feature. Whether to use slow but exact implementation of the attention mechanism. While + merging the TP rank tensors, due to slicing operations the results may be slightly different between the + model trained on Megatron and our model. Please refer to [this + issue](https://github.com/pytorch/pytorch/issues/76232). A solution to obtain more accurate results is to + enable this feature. Enabling this will hurt the computational time of the inference. Will be probably + resolved in the future once the main model has been fine-tuned with TP_rank=1. + + Example: + + ```python + >>> from transformers import BloomConfig, BloomModel + + >>> # Initializing a Bloom configuration + >>> configuration = BloomConfig() + + >>> # Initializing a model (with random weights) from the configuration + >>> model = BloomModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "bloom" + keys_to_ignore_at_inference = ["past_key_values"] + attribute_map = { + "num_hidden_layers": "n_layer", + "num_attention_heads": "n_head", + } + + def __init__( + self, + vocab_size=250880, + hidden_size=64, + n_layer=2, + n_head=8, + layer_norm_epsilon=1e-5, + initializer_range=0.02, + use_cache=True, + bos_token_id=1, + eos_token_id=2, + apply_residual_connection_post_layernorm=False, + hidden_dropout=0.0, + attention_dropout=0.0, + pretraining_tp=1, # TP rank used when training with megatron + slow_but_exact=False, + **kwargs, + ): + self.vocab_size = vocab_size + # Backward compatibility with n_embed kwarg + n_embed = kwargs.pop("n_embed", None) + self.hidden_size = hidden_size if n_embed is None else n_embed + self.n_layer = n_layer + self.n_head = n_head + self.layer_norm_epsilon = layer_norm_epsilon + self.initializer_range = initializer_range + self.use_cache = use_cache + self.pretraining_tp = pretraining_tp + self.apply_residual_connection_post_layernorm = apply_residual_connection_post_layernorm + self.hidden_dropout = hidden_dropout + self.attention_dropout = attention_dropout + + self.bos_token_id = bos_token_id + self.eos_token_id = eos_token_id + self.slow_but_exact = slow_but_exact + + super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + + +class BloomOnnxConfig(OnnxConfigWithPast): + torch_onnx_minimum_version = version.parse("1.12") + + def __init__( + self, + config: PretrainedConfig, + task: str = "default", + patching_specs: List[PatchingSpec] = None, + use_past: bool = False, + ): + super().__init__(config, task=task, patching_specs=patching_specs, use_past=use_past) + if not getattr(self._config, "pad_token_id", None): + # TODO: how to do that better? + self._config.pad_token_id = 0 + + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + common_inputs = OrderedDict({"input_ids": {0: "batch", 1: "sequence"}}) + if self.use_past: + # BLOOM stores values on dynamic axis 2. For more details see: https://github.com/huggingface/transformers/pull/18344 + self.fill_with_past_key_values_(common_inputs, direction="inputs", inverted_values_shape=True) + common_inputs["attention_mask"] = {0: "batch", 1: "past_sequence + sequence"} + else: + common_inputs["attention_mask"] = {0: "batch", 1: "sequence"} + + return common_inputs + + @property + def num_layers(self) -> int: + return self._config.n_layer + + @property + def num_attention_heads(self) -> int: + return self._config.n_head + + @property + def atol_for_validation(self) -> float: + return 1e-3 + + def generate_dummy_inputs( + self, + tokenizer: "PreTrainedTokenizer", + batch_size: int = -1, + seq_length: int = -1, + is_pair: bool = False, + framework: Optional["TensorType"] = None, + ) -> Mapping[str, Any]: + common_inputs = super(OnnxConfigWithPast, self).generate_dummy_inputs( + tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework + ) + + # We need to order the input in the way they appears in the forward() + ordered_inputs = OrderedDict({"input_ids": common_inputs["input_ids"]}) + + # Need to add the past_keys + if self.use_past: + if not is_torch_available(): + raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.") + else: + import torch + + batch, seqlen = common_inputs["input_ids"].shape + # Not using the same length for past_key_values + past_key_values_length = seqlen + 2 + head_dim = self._config.hidden_size // self.num_attention_heads + past_key_shape = ( + batch * self.num_attention_heads, + head_dim, + past_key_values_length, + ) + past_value_shape = ( + batch * self.num_attention_heads, + past_key_values_length, + head_dim, + ) + ordered_inputs["past_key_values"] = [ + (torch.zeros(past_key_shape), torch.zeros(past_value_shape)) for _ in range(self.num_layers) + ] + + ordered_inputs["attention_mask"] = common_inputs["attention_mask"] + if self.use_past: + mask_dtype = ordered_inputs["attention_mask"].dtype + ordered_inputs["attention_mask"] = torch.cat( + [ordered_inputs["attention_mask"], torch.ones(batch, past_key_values_length, dtype=mask_dtype)], dim=1 + ) + + return ordered_inputs + + @property + def default_onnx_opset(self) -> int: + return 13 + + +__all__ = ["BloomConfig", "BloomOnnxConfig"] diff --git a/docs/transformers/build/lib/transformers/models/bloom/convert_bloom_original_checkpoint_to_pytorch.py b/docs/transformers/build/lib/transformers/models/bloom/convert_bloom_original_checkpoint_to_pytorch.py new file mode 100644 index 0000000000000000000000000000000000000000..73d251875dc2efc6bc31603aa472f3831b8a2f15 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bloom/convert_bloom_original_checkpoint_to_pytorch.py @@ -0,0 +1,254 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert BigScience BLOOM checkpoint.""" + +import argparse +import json +import os +import re + +import torch + +from transformers import BloomConfig, BloomModel +from transformers.file_utils import CONFIG_NAME, WEIGHTS_NAME +from transformers.utils import logging + + +logging.set_verbosity_info() + +WEIGHTS_TO_AVERAGE_ENDSWITH = [ + "word_embeddings_layernorm.weight", + "word_embeddings_layernorm.bias", + "input_layernorm.weight", + "input_layernorm.bias", + "post_attention_layernorm.weight", + "post_attention_layernorm.bias", + "self_attention.dense.bias", + "mlp.dense_4h_to_h.bias", + "ln_f.weight", + "ln_f.bias", +] + +WEIGHTS_WITH_ROW_PARALLELISM_CONTAIN = [ + "mlp.dense_4h_to_h.weight", + "self_attention.dense.weight", +] + + +def layer_name_mapping(key, file): + """Convert Megatron-DeepSpeed TP/PP weights mapping in transformers PP only""" + # Handle first and last layers + layer_rename_map = { + "word_embeddings.weight": "word_embeddings.weight", + "word_embeddings.norm.weight": "word_embeddings_layernorm.weight", + "word_embeddings.norm.bias": "word_embeddings_layernorm.bias", + "weight": "ln_f.weight", + "bias": "ln_f.bias", + } + + if key in layer_rename_map: + return layer_rename_map[key] + + # Handle transformer blocks + layer_number = int(re.match(r".*layer_(\d*).*", file)[1]) + layer_number -= 3 + return f"h.{layer_number}." + key + + +def get_dtype_size(dtype): + if dtype == torch.bool: + return 1 / 8 + bit_search = re.search(r"[^\d](\d+)$", str(dtype)) + if bit_search is None: + raise ValueError(f"`dtype` is not a valid dtype: {dtype}.") + bit_size = int(bit_search.groups()[0]) + return bit_size // 8 + + +def convert_bloom_checkpoint_to_pytorch( + bloom_checkpoint_path, bloom_config_file, pytorch_dump_folder_path, shard_model, pretraining_tp +): + # Construct model + if bloom_config_file == "": + config = BloomConfig() + else: + config = BloomConfig.from_json_file(bloom_config_file) + + if shard_model: + file_names = os.listdir(bloom_checkpoint_path) + file_names = sorted(filter(lambda s: s.startswith("layer") and "model_00" in s, file_names)) + + index_dict = {"weight_map": {}, "metadata": {}} + total_size = 0 + + missing_keys = None + + config = BloomConfig() + + for j, file in enumerate(file_names): + print("Processing file: {}".format(file)) + tensors = None + + for i in range(pretraining_tp): + # load all TP files + f_name = file.replace("model_00", f"model_0{i}") + temp = torch.load(os.path.join(bloom_checkpoint_path, f_name), map_location="cpu", weights_only=True) + + # Rename keys in the transformers names + keys = list(temp.keys()) + for key in keys: + temp[layer_name_mapping(key, file)] = temp.pop(key) + + if tensors is None: + tensors = temp + else: + for key in tensors.keys(): + if any(key.endswith(end) for end in WEIGHTS_TO_AVERAGE_ENDSWITH): + # We average (sum and then divide) some weights accross TP ranks (see https://github.com/bigscience-workshop/Megatron-DeepSpeed/blob/olruwase/sync_layer_norms/megatron/training.py#L425) + tensors[key] += temp[key] + else: + # Some weights are RowParallelLinear in Megatron-Deepspeed, others are ColumnParallel + cat_dim = 1 if any(text in key for text in WEIGHTS_WITH_ROW_PARALLELISM_CONTAIN) else 0 + # We concatenate these weights accross TP ranks + tensors[key] = torch.cat([tensors[key], temp[key]], dim=cat_dim) + + # Divide by the number of TP the weights we want to average + for key in tensors.keys(): + if any(key.endswith(end) for end in WEIGHTS_TO_AVERAGE_ENDSWITH): + tensors[key] = tensors[key] / pretraining_tp + torch.save( + tensors, + os.path.join( + pytorch_dump_folder_path, + "pytorch_model_{}-of-{}.bin".format(str(j + 1).zfill(5), str(len(file_names)).zfill(5)), + ), + ) + + for key in tensors.keys(): + value = tensors[key] + total_size += value.numel() * get_dtype_size(value.dtype) + if key not in index_dict["weight_map"]: + index_dict["weight_map"][key] = "pytorch_model_{}-of-{}.bin".format( + str(j + 1).zfill(5), str(len(file_names)).zfill(5) + ) + + config = BloomConfig() + pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME + index_dict["metadata"]["total_size"] = total_size + with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: + f.write(config.to_json_string()) + with open(os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME + ".index.json"), "w", encoding="utf-8") as f: + json_config = json.dumps(index_dict, indent=2, sort_keys=True) + "\n" + f.write(json_config) + else: + model = BloomModel(config) + + file_names = os.listdir(bloom_checkpoint_path) + file_names = sorted(filter(lambda s: s.startswith("layer") and "model_00" in s, file_names)) + + missing_keys = None + for i, file in enumerate(file_names): + tensors = None + for i in range(pretraining_tp): + # load all TP files + f_name = file.replace("model_00", f"model_0{i}") + temp = torch.load(os.path.join(bloom_checkpoint_path, f_name), map_location="cpu", weights_only=True) + + # Rename keys in the transformers names + keys = list(temp.keys()) + for key in keys: + temp[layer_name_mapping(key, file)] = temp.pop(key) + + if tensors is None: + tensors = temp + else: + for key in tensors.keys(): + # We average (sum and then divide) some weights accross TP ranks (see https://github.com/bigscience-workshop/Megatron-DeepSpeed/blob/olruwase/sync_layer_norms/megatron/training.py#L425) + if any(key.endswith(end) for end in WEIGHTS_TO_AVERAGE_ENDSWITH): + tensors[key] += temp[key] + else: + # Some weights are RowParallelLinear in Megatron-Deepspeed, others are ColumnParallel + cat_dim = 1 if any(text in key for text in WEIGHTS_WITH_ROW_PARALLELISM_CONTAIN) else 0 + # We concatenate these weights accross TP ranks + tensors[key] = torch.cat([tensors[key], temp[key]], dim=cat_dim) + + # Divide by the number of TP the weights we want to average + for key in tensors.keys(): + if any(key.endswith(end) for end in WEIGHTS_TO_AVERAGE_ENDSWITH): + tensors[key] = tensors[key] / pretraining_tp + + other_keys = model.load_state_dict(tensors, strict=False) + assert not other_keys.unexpected_keys, f"The keys {other_keys.unexpected_keys} are unexpected" + if missing_keys is None: + missing_keys = set(other_keys.missing_keys) + else: + missing_keys = missing_keys.intersection(set(other_keys.missing_keys)) + + assert not missing_keys, f"The keys {missing_keys} are missing" + + # Save pytorch-model + os.makedirs(pytorch_dump_folder_path, exist_ok=True) + pytorch_weights_dump_path = pytorch_dump_folder_path + "/" + WEIGHTS_NAME + pytorch_config_dump_path = pytorch_dump_folder_path + "/" + CONFIG_NAME + print(f"Save PyTorch model to {pytorch_weights_dump_path} with dtype {config.torch_dtype}") + if config.torch_dtype is not None: + model = model.to(config.torch_dtype) + torch.save(model.state_dict(), pytorch_weights_dump_path) + print(f"Save configuration file to {pytorch_config_dump_path}") + with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: + f.write(config.to_json_string()) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument( + "--bloom_checkpoint_path", + default=None, + type=str, + required=True, + help="Path to the Megatron-LM checkpoint path.", + ) + parser.add_argument( + "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model." + ) + parser.add_argument( + "--bloom_config_file", + default="", + type=str, + help=( + "An optional config json file corresponding to the pre-trained model. \n" + "This specifies the model architecture." + ), + ) + parser.add_argument( + "--shard_model", + action="store_true", + help="An optional setting to shard the output model \nThis enables sharding the converted checkpoint", + ) + parser.add_argument( + "--pretraining_tp", + default=4, + type=int, + help="Pretraining TP rank that has been used when training the model in Megatron-LM \n", + ) + args = parser.parse_args() + convert_bloom_checkpoint_to_pytorch( + args.bloom_checkpoint_path, + args.bloom_config_file, + args.pytorch_dump_folder_path, + args.shard_model, + args.pretraining_tp, + ) diff --git a/docs/transformers/build/lib/transformers/models/bloom/modeling_bloom.py b/docs/transformers/build/lib/transformers/models/bloom/modeling_bloom.py new file mode 100644 index 0000000000000000000000000000000000000000..f9aee9362db12f61f6a82847801e833cdbb2d160 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bloom/modeling_bloom.py @@ -0,0 +1,1397 @@ +# coding=utf-8 +# Copyright 2022 HuggingFace Inc. team and BigScience workshop. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch BLOOM model.""" + +import math +import warnings +from typing import Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, LayerNorm, MSELoss +from torch.nn import functional as F + +from ...cache_utils import Cache, DynamicCache, StaticCache +from ...file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward +from ...generation import GenerationMixin +from ...modeling_attn_mask_utils import AttentionMaskConverter +from ...modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + CausalLMOutputWithCrossAttentions, + QuestionAnsweringModelOutput, + SequenceClassifierOutputWithPast, + TokenClassifierOutput, +) +from ...modeling_utils import PreTrainedModel +from ...utils import ( + is_torch_flex_attn_available, + logging, +) +from .configuration_bloom import BloomConfig + + +if is_torch_flex_attn_available(): + from torch.nn.attention.flex_attention import BlockMask + + from ...integrations.flex_attention import make_flex_block_causal_mask + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "bigscience/bloom-560m" +_CONFIG_FOR_DOC = "BloomConfig" + + +def build_alibi_tensor(attention_mask: torch.Tensor, num_heads: int, dtype: torch.dtype) -> torch.Tensor: + """ + Link to paper: https://arxiv.org/abs/2108.12409 Alibi tensor is not causal as the original paper mentions, it + relies on a translation invariance of softmax for quick implementation: with l being a tensor, and a fixed value + `softmax(l+a) = softmax(l)`. Based on + https://github.com/ofirpress/attention_with_linear_biases/blob/a35aaca144e0eb6b789dfcb46784c4b8e31b7983/fairseq/models/transformer.py#L742 + TODO @thomasw21 this doesn't work as nicely due to the masking strategy, and so masking varies slightly. + + Args: + Returns tensor shaped (batch_size * num_heads, 1, max_seq_len) + attention_mask (`torch.Tensor`): + Token-wise attention mask, this should be of shape (batch_size, max_seq_len). + num_heads (`int`): + number of heads + dtype (`torch.dtype`, *optional*, default=`torch.bfloat16`): + dtype of the output tensor + """ + batch_size, seq_length = attention_mask.shape + closest_power_of_2 = 2 ** math.floor(math.log2(num_heads)) + base = torch.tensor( + 2 ** (-(2 ** -(math.log2(closest_power_of_2) - 3))), device=attention_mask.device, dtype=torch.float32 + ) + powers = torch.arange(1, 1 + closest_power_of_2, device=attention_mask.device, dtype=torch.int32) + slopes = torch.pow(base, powers) + + if closest_power_of_2 != num_heads: + extra_base = torch.tensor( + 2 ** (-(2 ** -(math.log2(2 * closest_power_of_2) - 3))), device=attention_mask.device, dtype=torch.float32 + ) + num_remaining_heads = min(closest_power_of_2, num_heads - closest_power_of_2) + extra_powers = torch.arange(1, 1 + 2 * num_remaining_heads, 2, device=attention_mask.device, dtype=torch.int32) + slopes = torch.cat([slopes, torch.pow(extra_base, extra_powers)], dim=0) + + # Note: alibi will added to the attention bias that will be applied to the query, key product of attention + # => therefore alibi will have to be of shape (batch_size, num_heads, query_length, key_length) + # => here we set (batch_size=1, num_heads=num_heads, query_length=1, key_length=max_length) + # => the query_length dimension will then be broadcasted correctly + # This is more or less identical to T5's relative position bias: + # https://github.com/huggingface/transformers/blob/f681437203baa7671de3174b0fa583c349d9d5e1/src/transformers/models/t5/modeling_t5.py#L527 + arange_tensor = ((attention_mask.cumsum(dim=-1) - 1) * attention_mask)[:, None, :] + alibi = slopes[..., None] * arange_tensor + return alibi.reshape(batch_size * num_heads, 1, seq_length).to(dtype) + + +def dropout_add(x: torch.Tensor, residual: torch.Tensor, prob: float, training: bool) -> torch.Tensor: + """ + Dropout add function + + Args: + x (`torch.tensor`): + input tensor + residual (`torch.tensor`): + residual tensor + prob (`float`): + dropout probability + training (`bool`): + training mode + """ + out = F.dropout(x, p=prob, training=training) + out = residual + out + return out + + +def bloom_gelu_forward(x: torch.Tensor) -> torch.Tensor: + """ + Custom bias GELU function. Adapted from Megatron-DeepSpeed code. Here we use a simple implementation (inference) to + make the model jitable. + + Args: + x (`torch.tensor`): + input hidden states + """ + return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))) + + +def bloom_gelu_back(g: torch.Tensor, x: torch.Tensor) -> torch.Tensor: + """ + gradient of tanh approximation of gelu gradient of actual gelu is: 0.5 * (1. + torch.erf(x * 0.70710678)) + + 0.3989423 * x * torch.exp(-0.5 * x * x) + + Args: + g (`torch.tensor`): + gradient output tensor + x (`torch.tensor`): + input tensor + """ + x = x[0] # x is a tuple of 1 element, needs to unpack it first + tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)) + # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243 + ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out) + return ff * g + + +class GeLUFunction(torch.autograd.Function): + @staticmethod + def forward(ctx, input: torch.Tensor) -> torch.Tensor: + ctx.save_for_backward(input) + return bloom_gelu_forward(input) + + @staticmethod + def backward(ctx, grad_output: torch.Tensor) -> torch.Tensor: + input = ctx.saved_tensors + tmp = bloom_gelu_back(grad_output, input) + return tmp + + +class BloomGelu(nn.Module): + """ + BloomBiasGelu wrapper function that make use of the simple function on inference mode to make the model + torchscriptable and use the autograd function in training mode to get the accurate results of the gradients Partly + copied from Megatron-DeepSpeed code and adapted for our needs + + See here why autograd functions are not torchscriptable: https://github.com/pytorch/pytorch/issues/22329 + """ + + def __init__(self): + super().__init__() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if self.training: + return GeLUFunction.apply(x) + else: + return bloom_gelu_forward(x) + + +class BloomAttention(nn.Module): + def __init__(self, config: BloomConfig, layer_idx: Optional[int] = None): + super().__init__() + + self.pretraining_tp = config.pretraining_tp + self.slow_but_exact = config.slow_but_exact + + self.hidden_size = config.hidden_size + self.num_heads = config.n_head + self.head_dim = self.hidden_size // self.num_heads + self.split_size = self.hidden_size + self.hidden_dropout = config.hidden_dropout + + if self.head_dim * self.num_heads != self.hidden_size: + raise ValueError( + f"`hidden_size` must be divisible by num_heads (got `hidden_size`: {self.hidden_size} and `num_heads`:" + f" {self.num_heads})." + ) + + # Layer-wise attention scaling + self.inv_norm_factor = 1.0 / math.sqrt(self.head_dim) + self.beta = 1.0 + self.layer_idx = layer_idx + if layer_idx is None: + logger.warning_once( + f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will " + "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` " + "when creating this class." + ) + + self.query_key_value = nn.Linear(self.hidden_size, 3 * self.hidden_size, bias=True) + self.dense = nn.Linear(self.hidden_size, self.hidden_size) + self.attention_dropout = nn.Dropout(config.attention_dropout) + + def _reshape(self, fused_qkv: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Split the last dimension into (num_heads, head_dim) and reshapes to (bs, heads, len, dim) shape + without making any copies, results share same memory storage as `fused_qkv` + + Args: + fused_qkv (`torch.tensor`): [batch_size, seq_length, num_heads * 3 * head_dim] + + Returns: + query: [batch_size, num_heads, seq_length, head_dim] + key: [batch_size, num_heads, seq_length, head_dim] + value: [batch_size, num_heads, seq_length, head_dim] + """ + batch_size, seq_length, three_times_hidden_size = fused_qkv.shape + fused_qkv = fused_qkv.view(batch_size, seq_length, self.num_heads, 3, self.head_dim) + query_layer = fused_qkv[..., 0, :].transpose(1, 2) + key_layer = fused_qkv[..., 1, :].transpose(1, 2) + value_layer = fused_qkv[..., 2, :].transpose(1, 2) + return query_layer, key_layer, value_layer + + def _merge_heads(self, x: torch.Tensor) -> torch.Tensor: + """ + Merge heads together over the last dimension + + Args: + x (`torch.tensor`): [batch_size * num_heads, seq_length, head_dim] + + Returns: + torch.tensor: [batch_size, seq_length, num_heads * head_dim] + """ + # What we want to achieve is: + # batch_size * num_heads, seq_length, head_dim -> batch_size, seq_length, num_heads * head_dim + batch_size_and_num_heads, seq_length, _ = x.shape + batch_size = batch_size_and_num_heads // self.num_heads + + # First view to decompose the batch size + # batch_size * num_heads, seq_length, head_dim -> batch_size, num_heads, seq_length, head_dim + x = x.view(batch_size, self.num_heads, seq_length, self.head_dim) + + # batch_size, num_heads, seq_length, head_dim -> batch_size, seq_length, num_heads, head_dim + x = x.permute(0, 2, 1, 3) + + # batch_size, seq_length, num_heads, head_dim -> batch_size, seq_length, num_heads * head_dim + return x.reshape(batch_size, seq_length, self.num_heads * self.head_dim) + + def forward( + self, + hidden_states: torch.Tensor, + residual: torch.Tensor, + alibi: torch.Tensor, + attention_mask: torch.Tensor, + layer_past: Optional[Cache] = None, + head_mask: Optional[torch.Tensor] = None, + use_cache: bool = False, + output_attentions: bool = False, + cache_position: Optional[torch.LongTensor] = None, + ): + batch_size, q_length, _ = hidden_states.shape + fused_qkv = self.query_key_value(hidden_states) # [batch_size, seq_length, 3 x hidden_size] + # 3 x [batch_size, num_heads, seq_length, head_dim] + query_layer, key_layer, value_layer = self._reshape(fused_qkv) + + if layer_past is not None: + cache_kwargs = {"cache_position": cache_position} + key_layer, value_layer = layer_past.update(key_layer, value_layer, self.layer_idx, cache_kwargs) + + # reshape qkv for further computations + query_layer = query_layer.reshape(batch_size * self.num_heads, -1, self.head_dim) + key_layer = key_layer.reshape(batch_size * self.num_heads, -1, self.head_dim).transpose(-1, -2) + value_layer = value_layer.reshape(batch_size * self.num_heads, -1, self.head_dim) + + # [batch_size * num_heads, q_length, kv_length] + attention_scores = alibi.baddbmm( + batch1=query_layer, + batch2=key_layer, + beta=self.beta, + alpha=self.inv_norm_factor, + ) + + # change view to [batch_size, num_heads, q_length, kv_length] + attn_weights = attention_scores.view(batch_size, self.num_heads, q_length, -1) + if attention_mask is not None: # no matter the length, we just slice it + causal_mask = attention_mask[:, :, :, : key_layer.shape[-1]] + attn_weights = attn_weights + causal_mask + + # cast attention scores to fp32, compute scaled softmax and cast back to initial dtype + attention_probs = F.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_layer.dtype) + + # [batch_size, num_heads, q_length, kv_length] + attention_probs = self.attention_dropout(attention_probs) + + if head_mask is not None: + attention_probs = attention_probs * head_mask + + # change view [batch_size x num_heads, q_length, kv_length] + attention_probs_reshaped = attention_probs.view(batch_size * self.num_heads, q_length, -1) + + # matmul: [batch_size * num_heads, q_length, head_dim] + context_layer = torch.bmm(attention_probs_reshaped, value_layer) + + # change view [batch_size, q_length, num_heads * head_dim] + context_layer = self._merge_heads(context_layer) + + # aggregate results across tp ranks. See here: https://github.com/pytorch/pytorch/issues/76232 + if self.pretraining_tp > 1 and self.slow_but_exact: + slices = self.hidden_size / self.pretraining_tp + output_tensor = torch.zeros_like(context_layer) + for i in range(self.pretraining_tp): + output_tensor = output_tensor + F.linear( + context_layer[:, :, int(i * slices) : int((i + 1) * slices)], + self.dense.weight[:, int(i * slices) : int((i + 1) * slices)], + ) + else: + output_tensor = self.dense(context_layer) + + output_tensor = dropout_add(output_tensor, residual, self.hidden_dropout, self.training) + + outputs = (output_tensor, layer_past) + if output_attentions: + outputs += (attention_probs,) + + return outputs + + +class BloomMLP(nn.Module): + def __init__(self, config: BloomConfig): + super().__init__() + hidden_size = config.hidden_size + + self.pretraining_tp = config.pretraining_tp + self.slow_but_exact = config.slow_but_exact + self.dense_h_to_4h = nn.Linear(hidden_size, 4 * hidden_size) + self.gelu_impl = BloomGelu() + self.dense_4h_to_h = nn.Linear(4 * hidden_size, hidden_size) + self.hidden_dropout = config.hidden_dropout + + def forward(self, hidden_states: torch.Tensor, residual: torch.Tensor) -> torch.Tensor: + hidden_states = self.gelu_impl(self.dense_h_to_4h(hidden_states)) + + if self.pretraining_tp > 1 and self.slow_but_exact: + intermediate_output = torch.zeros_like(residual) + slices = self.dense_4h_to_h.weight.shape[-1] / self.pretraining_tp + for i in range(self.pretraining_tp): + intermediate_output = intermediate_output + F.linear( + hidden_states[:, :, int(i * slices) : int((i + 1) * slices)], + self.dense_4h_to_h.weight[:, int(i * slices) : int((i + 1) * slices)], + ) + else: + intermediate_output = self.dense_4h_to_h(hidden_states) + + output = dropout_add(intermediate_output, residual, self.hidden_dropout, self.training) + + return output + + +class BloomBlock(nn.Module): + def __init__(self, config: BloomConfig, layer_idx: Optional[int] = None): + super().__init__() + hidden_size = config.hidden_size + + self.input_layernorm = LayerNorm(hidden_size, eps=config.layer_norm_epsilon) + self.num_heads = config.n_head + self.self_attention = BloomAttention(config, layer_idx) + self.post_attention_layernorm = LayerNorm(hidden_size, eps=config.layer_norm_epsilon) + + self.mlp = BloomMLP(config) + + self.apply_residual_connection_post_layernorm = config.apply_residual_connection_post_layernorm + self.hidden_dropout = config.hidden_dropout + + def forward( + self, + hidden_states: torch.Tensor, + alibi: torch.Tensor, + attention_mask: torch.Tensor, + layer_past: Optional[Cache] = None, + head_mask: Optional[torch.Tensor] = None, + use_cache: bool = False, + output_attentions: bool = False, + cache_position: Optional[torch.LongTensor] = None, + ): + # hidden_states: [batch_size, seq_length, hidden_size] + + # Layer norm at the beginning of the transformer layer. + layernorm_output = self.input_layernorm(hidden_states) + + # Layer norm post the self attention. + if self.apply_residual_connection_post_layernorm: + residual = layernorm_output + else: + residual = hidden_states + + # Self attention. + attn_outputs = self.self_attention( + layernorm_output, + residual, + layer_past=layer_past, + attention_mask=attention_mask, + alibi=alibi, + head_mask=head_mask, + use_cache=use_cache, + output_attentions=output_attentions, + cache_position=cache_position, + ) + + attention_output = attn_outputs[0] + + outputs = attn_outputs[1:] + + layernorm_output = self.post_attention_layernorm(attention_output) + + # Get residual + if self.apply_residual_connection_post_layernorm: + residual = layernorm_output + else: + residual = attention_output + + # MLP. + output = self.mlp(layernorm_output, residual) + + if use_cache: + outputs = (output,) + outputs + else: + outputs = (output,) + outputs[1:] + + return outputs # hidden_states, past_kv, attentions + + +class BloomPreTrainedModel(PreTrainedModel): + config_class = BloomConfig + base_model_prefix = "transformer" + supports_gradient_checkpointing = True + _no_split_modules = ["BloomBlock"] + _skip_keys_device_placement = "past_key_values" + _supports_cache_class = True + _supports_static_cache = True + _supports_quantized_cache = True + + def __init__(self, *inputs, **kwargs): + super().__init__(*inputs, **kwargs) + + def _init_weights(self, module: nn.Module): + """Initialize the weights.""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +BLOOM_START_DOCSTRING = r""" + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`BloomConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +BLOOM_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`): + `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values[0][0].shape[2]` + (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary. + + If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as + `input_ids`. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*): + Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention + blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values` + returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`. + + Two formats are allowed: + - a [`~cache_utils.Cache`] instance, see our + [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache); + - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of + shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy + cache format. + + The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the + legacy cache format will be returned. + + If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't + have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids` + of shape `(batch_size, sequence_length)`. + attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + + If `past_key_values` is used, optionally only the last `inputs_embeds` have to be input (see + `past_key_values`). + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. + cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*): + Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`, + this tensor is not affected by padding. It is used to update the cache in the correct position and to infer + the complete sequence length. +""" + + +@add_start_docstrings( + "The bare Bloom Model transformer outputting raw hidden-states without any specific head on top.", + BLOOM_START_DOCSTRING, +) +class BloomModel(BloomPreTrainedModel): + def __init__(self, config: BloomConfig): + super().__init__(config) + + self.embed_dim = config.hidden_size + self.num_heads = config.n_head + + # Embedding + LN Embedding + self.word_embeddings = nn.Embedding(config.vocab_size, self.embed_dim) + self.word_embeddings_layernorm = LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon) + + # Transformer blocks + self.h = nn.ModuleList([BloomBlock(config, layer_idx=i) for i in range(config.num_hidden_layers)]) + + # Final Layer Norm + self.ln_f = LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon) + + self.gradient_checkpointing = False + + # Initialize weights and apply final processing + self.post_init() + + def build_alibi_tensor(self, attention_mask: torch.Tensor, num_heads: int, dtype: torch.dtype) -> torch.Tensor: + return build_alibi_tensor(attention_mask, num_heads, dtype) + + def get_input_embeddings(self): + return self.word_embeddings + + def set_input_embeddings(self, new_embeddings: torch.Tensor): + self.word_embeddings = new_embeddings + + @add_start_docstrings_to_model_forward(BLOOM_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutputWithPastAndCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Union[Cache, Tuple[Tuple[torch.Tensor, torch.Tensor], ...]]] = None, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + **deprecated_arguments, + ) -> Union[Tuple[torch.Tensor, ...], BaseModelOutputWithPastAndCrossAttentions]: + if deprecated_arguments.pop("position_ids", False) is not False: + # `position_ids` could have been `torch.Tensor` or `None` so defaulting pop to `False` allows to detect if users were passing explicitly `None` + warnings.warn( + "`position_ids` have no functionality in BLOOM and will be removed in v5.0.0. You can safely ignore" + " passing `position_ids`.", + FutureWarning, + ) + if len(deprecated_arguments) > 0: + raise ValueError(f"Got unexpected arguments: {deprecated_arguments}") + + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + if self.gradient_checkpointing and self.training and use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + + # kept for BC (non `Cache` `past_key_values` inputs) + return_legacy_cache = False + if use_cache and not isinstance(past_key_values, Cache): + return_legacy_cache = True + if past_key_values is None: + past_key_values = DynamicCache() + else: + past_key_values = DynamicCache.from_legacy_cache(past_key_values) + logger.warning_once( + "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and " + "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class " + "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)" + ) + + batch_size, seq_length, _ = inputs_embeds.shape + past_length = past_key_values.get_seq_length() if past_key_values is not None else 0 + seq_length_with_past = seq_length + past_length + if cache_position is None: + cache_position = torch.arange(past_length, past_length + seq_length, device=inputs_embeds.device) + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape batch_size x num_heads x N x N + # head_mask has shape n_layer x batch x num_heads x N x N + head_mask = self.get_head_mask(head_mask, self.config.n_layer) + hidden_states = self.word_embeddings_layernorm(inputs_embeds) + + next_decoder_cache = None + all_self_attentions = () if output_attentions else None + all_hidden_states = () if output_hidden_states else None + + # Compute alibi tensor: check build_alibi_tensor documentation + if attention_mask is None: + attention_mask = torch.ones((batch_size, seq_length_with_past), device=hidden_states.device) + else: + attention_mask = attention_mask.to(hidden_states.device) + + alibi = self.build_alibi_tensor(attention_mask, self.num_heads, dtype=hidden_states.dtype) + causal_mask = self._update_causal_mask( + attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions + ) + + for i, block in enumerate(self.h): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if self.gradient_checkpointing and self.training: + outputs = self._gradient_checkpointing_func( + block.__call__, + hidden_states, + alibi, + causal_mask, + past_key_values, + head_mask[i], + use_cache, + output_attentions, + cache_position, + ) + else: + outputs = block( + hidden_states, + layer_past=past_key_values, + attention_mask=causal_mask, + head_mask=head_mask[i], + use_cache=use_cache, + output_attentions=output_attentions, + alibi=alibi, + cache_position=cache_position, + ) + + hidden_states = outputs[0] + if use_cache: + next_decoder_cache = outputs[1] + + if output_attentions: + all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],) + + # Add last hidden state + hidden_states = self.ln_f(hidden_states) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if return_legacy_cache: + next_cache = next_cache.to_legacy_cache() + + if not return_dict: + return tuple( + v for v in [hidden_states, next_cache, all_hidden_states, all_self_attentions] if v is not None + ) + + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + # Copied from transformers.models.llama.modeling_llama.LlamaModel._update_causal_mask + def _update_causal_mask( + self, + attention_mask: Union[torch.Tensor, "BlockMask"], + input_tensor: torch.Tensor, + cache_position: torch.Tensor, + past_key_values: Cache, + output_attentions: bool = False, + ): + if self.config._attn_implementation == "flash_attention_2": + if attention_mask is not None and (attention_mask == 0.0).any(): + return attention_mask + return None + if self.config._attn_implementation == "flex_attention": + if isinstance(attention_mask, torch.Tensor): + attention_mask = make_flex_block_causal_mask(attention_mask) + return attention_mask + + # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in + # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail + # to infer the attention mask. + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + using_static_cache = isinstance(past_key_values, StaticCache) + + # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward + if self.config._attn_implementation == "sdpa" and not using_static_cache and not output_attentions: + if AttentionMaskConverter._ignore_causal_mask_sdpa( + attention_mask, + inputs_embeds=input_tensor, + past_key_values_length=past_seen_tokens, + is_training=self.training, + ): + return None + + dtype, device = input_tensor.dtype, input_tensor.device + sequence_length = input_tensor.shape[1] + if using_static_cache: + target_length = past_key_values.get_max_cache_shape() + else: + target_length = ( + attention_mask.shape[-1] + if isinstance(attention_mask, torch.Tensor) + else past_seen_tokens + sequence_length + 1 + ) + + # In case the provided `attention` mask is 2D, we generate a causal mask here (4D). + causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position( + attention_mask, + sequence_length=sequence_length, + target_length=target_length, + dtype=dtype, + device=device, + cache_position=cache_position, + batch_size=input_tensor.shape[0], + ) + + if ( + self.config._attn_implementation == "sdpa" + and attention_mask is not None + and attention_mask.device.type in ["cuda", "xpu", "npu"] + and not output_attentions + ): + # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when + # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path. + # Details: https://github.com/pytorch/pytorch/issues/110213 + min_dtype = torch.finfo(dtype).min + causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) + + return causal_mask + + @staticmethod + # Copied from transformers.models.llama.modeling_llama.LlamaModel._prepare_4d_causal_attention_mask_with_cache_position + def _prepare_4d_causal_attention_mask_with_cache_position( + attention_mask: torch.Tensor, + sequence_length: int, + target_length: int, + dtype: torch.dtype, + device: torch.device, + cache_position: torch.Tensor, + batch_size: int, + **kwargs, + ): + """ + Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape + `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing. + + Args: + attention_mask (`torch.Tensor`): + A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape + `(batch_size, 1, query_length, key_value_length)`. + sequence_length (`int`): + The sequence length being processed. + target_length (`int`): + The target length: when generating with static cache, the mask should be as long as the static cache, + to account for the 0 padding, the part of the cache that is not filled yet. + dtype (`torch.dtype`): + The dtype to use for the 4D attention mask. + device (`torch.device`): + The device to place the 4D attention mask on. + cache_position (`torch.Tensor`): + Indices depicting the position of the input sequence tokens in the sequence. + batch_size (`torch.Tensor`): + Batch size. + """ + if attention_mask is not None and attention_mask.dim() == 4: + # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing. + causal_mask = attention_mask + else: + min_dtype = torch.finfo(dtype).min + causal_mask = torch.full( + (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device + ) + if sequence_length != 1: + causal_mask = torch.triu(causal_mask, diagonal=1) + causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1) + causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1) + if attention_mask is not None: + causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit + mask_length = attention_mask.shape[-1] + padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to( + causal_mask.device + ) + padding_mask = padding_mask == 0 + causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill( + padding_mask, min_dtype + ) + + return causal_mask + + +@add_start_docstrings( + """ + The Bloom Model transformer with a language modeling head on top (linear layer with weights tied to the input + embeddings). + """, + BLOOM_START_DOCSTRING, +) +class BloomForCausalLM(BloomPreTrainedModel, GenerationMixin): + _tied_weights_keys = ["lm_head.weight"] + + def __init__(self, config: BloomConfig): + super().__init__(config) + self.transformer = BloomModel(config) + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings: torch.Tensor): + self.lm_head = new_embeddings + + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + attention_mask=None, + inputs_embeds=None, + cache_position=None, + use_cache=True, + **kwargs, + ): + # Overwriten because of the fixed-shape attention mask creation + + # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens + # Exception 1: when passing input_embeds, input_ids may be missing entries + # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here + # Exception 3: with synced GPUs cache_position may go out of bounds, but we only want dummy token in that case. + # (we can't check exception 3 while compiling) + # Exception 4: If input_embeds are passed then slice it through `cache_position`, to keep only the unprocessed tokens and + # generate the first token for each sequence. Later use the generated Input ids for continuation. + if past_key_values is not None: + if inputs_embeds is not None and input_ids.shape[1] == 0: # Exception 4 + inputs_embeds = inputs_embeds[:, -cache_position.shape[0] :] + elif ( + inputs_embeds is not None # Exception 1 + or cache_position[-1] >= input_ids.shape[1] # Exception 3 + ): + input_ids = input_ids[:, -cache_position.shape[0] :] + elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2) + input_ids = input_ids[:, cache_position] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and len(cache_position) == inputs_embeds.shape[1]: + model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None} + else: + # This `clone` call is needed to avoid recapturing cuda graphs with `torch.compile`'s `mode="reduce-overhead`, as otherwise the + # input `position_ids` would have various stride during the decoding. Here, simply using `.contiguous()` is not sufficient as in + # the batch size = 1 case, `position_ids` is already contiguous but with varying stride which retriggers a capture. + model_inputs = {"input_ids": input_ids.clone(memory_format=torch.contiguous_format), "inputs_embeds": None} + + # This part differs from other models because BLOOM needs a 2D mask to construct alibi tensor + # The only difference is the usage of 2D instead of 4D mask, but the shape will be static + if isinstance(past_key_values, StaticCache) and attention_mask is not None: + target_length = past_key_values.get_max_cache_shape() + batch_size, seq_length = attention_mask.shape + diff = target_length - seq_length + + new_attn_mask = torch.zeros(batch_size, diff, device=attention_mask.device, dtype=attention_mask.dtype) + attention_mask = torch.cat( + [attention_mask, new_attn_mask], + dim=-1, + ) + + model_inputs.update( + { + "cache_position": cache_position, + "past_key_values": past_key_values, + "use_cache": use_cache, + "attention_mask": attention_mask, + } + ) + return model_inputs + + @add_start_docstrings_to_model_forward(BLOOM_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=CausalLMOutputWithCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Union[Cache, Tuple[Tuple[torch.Tensor, torch.Tensor], ...]]] = None, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + **deprecated_arguments, + ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set + `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100` + are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]` + """ + # Bloom has deprecated kwargs, so we need to pop num_items_in_batch explicitly + num_items_in_batch = deprecated_arguments.pop("num_items_in_batch", None) + if deprecated_arguments.pop("position_ids", False) is not False: + # `position_ids` could have been `torch.Tensor` or `None` so defaulting pop to `False` allows to detect if users were passing explicitly `None` + warnings.warn( + "`position_ids` have no functionality in BLOOM and will be removed in v5.0.0. You can safely ignore" + " passing `position_ids`.", + FutureWarning, + ) + if len(deprecated_arguments) > 0: + raise ValueError(f"Got unexpected arguments: {deprecated_arguments}") + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + cache_position=cache_position, + ) + hidden_states = transformer_outputs[0] + + lm_logits = self.lm_head(hidden_states) + + loss = None + if labels is not None: + # move labels to correct device to enable model parallelism + labels = labels.to(lm_logits.device) + # Flatten the tokens + loss = self.loss_function( + lm_logits, + labels, + vocab_size=self.config.vocab_size, + num_items_in_batch=num_items_in_batch, + ) + + if not return_dict: + output = (lm_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=loss, + logits=lm_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + def _reorder_cache( + self, past: Tuple[Tuple[torch.Tensor, torch.Tensor], ...], beam_idx: torch.LongTensor + ) -> Tuple[Tuple[torch.Tensor, torch.Tensor], ...]: + """ + This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or + [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct + beam_idx at every generation step. + + Output shares the same memory storage as `past`. + """ + # Get a copy of `beam_idx` on all the devices where we need those indices. + device_to_beam_idx = { + past_state.device: beam_idx.to(past_state.device) for layer_past in past for past_state in layer_past + } + reordered_past = tuple( + ( + layer_past[0].index_select(0, device_to_beam_idx[layer_past[0].device]), + layer_past[1].index_select(0, device_to_beam_idx[layer_past[0].device]), + ) + for layer_past in past + ) + return reordered_past + + +@add_start_docstrings( + """ + The Bloom Model transformer with a sequence classification head on top (linear layer). + + [`BloomForSequenceClassification`] uses the last token in order to do the classification, as other causal models + (e.g. GPT-1) do. + + Since it does classification on the last token, it requires to know the position of the last token. If a + `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If + no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the + padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in + each row of the batch). + """, + BLOOM_START_DOCSTRING, +) +class BloomForSequenceClassification(BloomPreTrainedModel): + def __init__(self, config: BloomConfig): + super().__init__(config) + self.num_labels = config.num_labels + self.transformer = BloomModel(config) + self.score = nn.Linear(config.hidden_size, config.num_labels, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(BLOOM_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=SequenceClassifierOutputWithPast, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Union[Cache, Tuple[Tuple[torch.Tensor, torch.Tensor], ...]]] = None, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **deprecated_arguments, + ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutputWithPast]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + if deprecated_arguments.pop("position_ids", False) is not False: + # `position_ids` could have been `torch.Tensor` or `None` so defaulting pop to `False` allows to detect if users were passing explicitly `None` + warnings.warn( + "`position_ids` have no functionality in BLOOM and will be removed in v5.0.0. You can safely ignore" + " passing `position_ids`.", + FutureWarning, + ) + if len(deprecated_arguments) > 0: + raise ValueError(f"Got unexpected arguments: {deprecated_arguments}") + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = transformer_outputs[0] + logits = self.score(hidden_states) + + if input_ids is not None: + batch_size = input_ids.shape[0] + else: + batch_size = inputs_embeds.shape[0] + + if self.config.pad_token_id is None and batch_size != 1: + raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.") + if self.config.pad_token_id is None: + last_non_pad_token = -1 + elif input_ids is not None: + # To handle both left- and right- padding, we take the rightmost token that is not equal to pad_token_id + non_pad_mask = (input_ids != self.config.pad_token_id).to(logits.device, torch.int32) + token_indices = torch.arange(input_ids.shape[-1], device=logits.device, dtype=torch.int32) + last_non_pad_token = (token_indices * non_pad_mask).argmax(-1) + else: + last_non_pad_token = -1 + logger.warning_once( + f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be " + "unexpected if using padding tokens in conjunction with `inputs_embeds.`" + ) + + pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token] + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(pooled_logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(pooled_logits, labels) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(pooled_logits, labels) + if not return_dict: + output = (pooled_logits,) + transformer_outputs[1:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutputWithPast( + loss=loss, + logits=pooled_logits, + past_key_values=transformer_outputs.past_key_values, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + +@add_start_docstrings( + """ + Bloom Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for + Named-Entity-Recognition (NER) tasks. + """, + BLOOM_START_DOCSTRING, +) +class BloomForTokenClassification(BloomPreTrainedModel): + def __init__(self, config: BloomConfig): + super().__init__(config) + self.num_labels = config.num_labels + + self.transformer = BloomModel(config) + if hasattr(config, "classifier_dropout") and config.classifier_dropout is not None: + classifier_dropout = config.classifier_dropout + elif hasattr(config, "hidden_dropout") and config.hidden_dropout is not None: + classifier_dropout = config.hidden_dropout + else: + classifier_dropout = 0.1 + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(BLOOM_INPUTS_DOCSTRING) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Union[Cache, Tuple[Tuple[torch.Tensor, torch.Tensor], ...]]] = None, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **deprecated_arguments, + ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + if deprecated_arguments.pop("position_ids", False) is not False: + # `position_ids` could have been `torch.Tensor` or `None` so defaulting pop to `False` allows to detect if users were passing explicitly `None` + warnings.warn( + "`position_ids` have no functionality in BLOOM and will be removed in v5.0.0. You can safely ignore" + " passing `position_ids`.", + FutureWarning, + ) + if len(deprecated_arguments) > 0: + raise ValueError(f"Got unexpected arguments: {deprecated_arguments}") + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + transformer_outputs = self.transformer( + input_ids, + past_key_values=past_key_values, + attention_mask=attention_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = transformer_outputs[0] + hidden_states = self.dropout(hidden_states) + logits = self.classifier(hidden_states) + + loss = None + if labels is not None: + # move labels to correct device to enable model parallelism + labels = labels.to(logits.device) + batch_size, seq_length = labels.shape + loss_fct = CrossEntropyLoss() + loss = loss_fct( + logits.view(batch_size * seq_length, self.num_labels), labels.view(batch_size * seq_length) + ) + + if not return_dict: + output = (logits,) + transformer_outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=transformer_outputs.hidden_states, + attentions=transformer_outputs.attentions, + ) + + +@add_start_docstrings( + """ + The BLOOM Model transformer with a span classification head on top for extractive question-answering tasks like + SQuAD (a linear layers on top of the hidden-states output to compute `span start logits` and `span end logits`). + """, + BLOOM_START_DOCSTRING, +) +class BloomForQuestionAnswering(BloomPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.transformer = BloomModel(config) + self.qa_outputs = nn.Linear(config.hidden_size, 2) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(BLOOM_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + start_positions: Optional[torch.LongTensor] = None, + end_positions: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, QuestionAnsweringModelOutput]: + r""" + start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.transformer( + input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +__all__ = [ + "BloomForCausalLM", + "BloomModel", + "BloomPreTrainedModel", + "BloomForSequenceClassification", + "BloomForTokenClassification", + "BloomForQuestionAnswering", +] diff --git a/docs/transformers/build/lib/transformers/models/bloom/modeling_flax_bloom.py b/docs/transformers/build/lib/transformers/models/bloom/modeling_flax_bloom.py new file mode 100644 index 0000000000000000000000000000000000000000..51ccb4c3625cde37d049e0703347ff58888b4f7c --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bloom/modeling_flax_bloom.py @@ -0,0 +1,737 @@ +# coding=utf-8 +# Copyright 2023 HuggingFace Inc. Team and Bigscience Workshop. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Flax BLOOM model.""" + +import math +from functools import partial +from typing import Optional, Tuple + +import flax.linen as nn +import jax +import jax.numpy as jnp +from flax.core.frozen_dict import FrozenDict, freeze, unfreeze +from flax.linen import combine_masks, dot_product_attention_weights, make_causal_mask +from flax.linen.activation import tanh +from flax.traverse_util import flatten_dict, unflatten_dict +from jax import lax + +from ...modeling_flax_outputs import ( + FlaxBaseModelOutput, + FlaxBaseModelOutputWithPastAndCrossAttentions, + FlaxCausalLMOutput, +) +from ...modeling_flax_utils import FlaxPreTrainedModel, append_call_sample_docstring +from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging +from .configuration_bloom import BloomConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "bigscience/bloom" +_CONFIG_FOR_DOC = "BloomConfig" + + +BLOOM_START_DOCSTRING = r""" + + This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a Flax Linen + [flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a + regular Flax Module and refer to the Flax documentation for all matter related to general usage and behavior. + + Finally, this model supports inherent JAX features such as: + + - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit) + - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation) + - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap) + - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap) + + Parameters: + config ([`BloomConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights. + dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`): + The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and + `jax.numpy.bfloat16` (on TPUs). + + This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If + specified all the computation will be performed with the given `dtype`. + + **Note that this only specifies the dtype of the computation and does not influence the dtype of model + parameters.** + + If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and + [`~FlaxPreTrainedModel.to_bf16`]. +""" + +BLOOM_INPUTS_DOCSTRING = r""" + Args: + input_ids (`numpy.ndarray` of shape `(batch_size, input_ids_length)`): + `input_ids_length` = `sequence_length`. Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`BloomTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + past_key_values (`Dict[str, np.ndarray]`, *optional*, returned by `init_cache` or when passing previous `past_key_values`): + Dictionary of pre-computed hidden-states (key and values in the attention blocks) that can be used for fast + auto-regressive decoding. Pre-computed key and value hidden-states are of shape *[batch_size, max_length]*. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +def build_alibi_tensor(attention_mask: jnp.ndarray, num_heads: int, dtype: Optional[jnp.dtype] = jnp.float32): + """ + Flax implementation of the BLOOM Alibi tensor. BLOOM Alibi tensor is not causal as the original paper mentions, it + relies on a translation invariance of softmax for quick implementation: with l being a tensor, and a fixed value + `softmax(l+a) = softmax(l)`. Based on + https://github.com/ofirpress/attention_with_linear_biases/blob/a35aaca144e0eb6b789dfcb46784c4b8e31b7983/fairseq/models/transformer.py#L742 + Link to paper: https://arxiv.org/abs/2108.12409 + + Args: + attention_mask (`jnp.ndarray`): + Token-wise attention mask, this should be of shape `(batch_size, max_seq_len)`. + num_heads (`int`): + Number of attention heads. + dtype (`jnp.dtype`, *optional*, defaults to `jnp.float32`): + The data type (dtype) of the output tensor. + + Returns: Alibi tensor of shape `(batch_size * num_heads, 1, max_seq_len)`. + """ + batch_size, seq_length = attention_mask.shape + closest_power_of_2 = 2 ** math.floor(math.log2(num_heads)) + base = jnp.array(2 ** (-(2 ** -(math.log2(closest_power_of_2) - 3))), dtype=jnp.float32) + powers = jnp.arange(1, 1 + closest_power_of_2, dtype=jnp.float32) + slopes = jax.lax.pow(base, powers) + + if closest_power_of_2 != num_heads: + extra_base = jnp.array(2 ** (-(2 ** -(math.log2(2 * closest_power_of_2) - 3))), dtype=jnp.float32) + num_remaining_heads = min(closest_power_of_2, num_heads - closest_power_of_2) + extra_powers = jnp.arange(1, 1 + 2 * num_remaining_heads, 2, dtype=jnp.float32) + slopes = jnp.cat([slopes, jax.lax.pow(extra_base, extra_powers)], axis=0) + + # Note: the Alibi tensor will added to the attention bias that will be applied to the query, key product of attention + # therefore, Alibi will have to be of shape (batch_size, num_heads, query_length, key_length) + # => here we set (batch_size=1, num_heads=num_heads, query_length=1, key_length=max_length) + # so that the query_length dimension will then be broadcast correctly. + # This is more or less identical to T5's relative position bias: + # https://github.com/huggingface/transformers/blob/f681437203baa7671de3174b0fa583c349d9d5e1/src/transformers/models/t5/modeling_t5.py#L527 + arange_tensor = ((attention_mask.cumsum(axis=-1) - 1) * attention_mask)[:, None, :] + alibi = slopes[..., None] * arange_tensor + alibi = jnp.expand_dims(alibi, axis=2) + return jnp.asarray(alibi, dtype) + + +class FlaxBloomAttention(nn.Module): + config: BloomConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.hidden_size = self.config.hidden_size + self.num_heads = self.config.n_head + self.head_dim = self.hidden_size // self.num_heads + self.attention_softmax_in_fp32 = self.dtype is not jnp.float32 + + if self.head_dim * self.num_heads != self.hidden_size: + raise ValueError( + f"`hidden_size` must be divisible by `num_heads` (got `hidden_size`: {self.hidden_size} and " + f"`num_heads`: {self.num_heads})." + ) + + dense = partial( + nn.Dense, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(self.config.initializer_range), + ) + + self.query_key_value = dense(self.hidden_size * 3) + self.dense = dense(self.hidden_size) + self.resid_dropout = nn.Dropout(rate=self.config.hidden_dropout) + + def _split_heads(self, hidden_states): + return hidden_states.reshape(hidden_states.shape[:-1] + (self.num_heads, self.head_dim * 3)) + + def _merge_heads(self, hidden_states): + return hidden_states.reshape(hidden_states.shape[:2] + (self.hidden_size,)) + + @nn.compact + # Copied from transformers.models.gptj.modeling_flax_gptj.FlaxGPTJAttention._concatenate_to_cache + def _concatenate_to_cache(self, key, value, query, attention_mask): + """ + This function takes projected key, value states from a single input token and concatenates the states to cached + states from previous steps. This function is slightly adapted from the official Flax repository: + https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252 + """ + # detect if we're initializing by absence of existing cache data. + is_initialized = self.has_variable("cache", "cached_key") + cached_key = self.variable("cache", "cached_key", jnp.zeros, key.shape, key.dtype) + cached_value = self.variable("cache", "cached_value", jnp.zeros, value.shape, value.dtype) + cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32)) + + if is_initialized: + *batch_dims, max_length, num_heads, depth_per_head = cached_key.value.shape + # update key, value caches with our new 1d spatial slices + cur_index = cache_index.value + indices = (0,) * len(batch_dims) + (cur_index, 0, 0) + key = lax.dynamic_update_slice(cached_key.value, key, indices) + value = lax.dynamic_update_slice(cached_value.value, value, indices) + cached_key.value = key + cached_value.value = value + num_updated_cache_vectors = query.shape[1] + cache_index.value = cache_index.value + num_updated_cache_vectors + # causal mask for cached decoder self-attention: our single query position should only attend to those key + # positions that have already been generated and cached, not the remaining zero elements. + pad_mask = jnp.broadcast_to( + jnp.arange(max_length) < cur_index + num_updated_cache_vectors, + tuple(batch_dims) + (1, num_updated_cache_vectors, max_length), + ) + attention_mask = combine_masks(pad_mask, attention_mask) + return key, value, attention_mask + + def __call__( + self, + hidden_states, + residual, + alibi, + attention_mask=None, + deterministic: bool = True, + init_cache: bool = False, + output_attentions: bool = False, + ): + batch_size, seq_length = hidden_states.shape[:2] + + # proj q, k, v + fused_qkv = self.query_key_value(hidden_states) + fused_qkv = self._split_heads(fused_qkv) + query, key, value = jnp.split(fused_qkv, 3, axis=-1) + + causal_attention_mask = make_causal_mask(attention_mask, dtype="bool") + + # for fast decoding causal attention mask should be shifted + causal_attention_mask_shift = ( + self.variables["cache"]["cache_index"] if self.has_variable("cache", "cached_key") else 0 + ) + + # fast decoding for generate requires special attention_mask + if self.has_variable("cache", "cached_key"): + max_decoder_length = self.variables["cache"]["cached_key"].shape[1] + causal_attention_mask = jax.lax.dynamic_slice( + causal_attention_mask, + (0, 0, causal_attention_mask_shift, 0), + (1, 1, seq_length, max_decoder_length), + ) + + # broadcast causal attention mask & attention mask to fit for merge + causal_attention_mask = jnp.broadcast_to( + causal_attention_mask, (batch_size,) + causal_attention_mask.shape[1:] + ) + attention_mask = jnp.broadcast_to(jnp.expand_dims(attention_mask, axis=(-3, -2)), causal_attention_mask.shape) + attention_mask = combine_masks(attention_mask, causal_attention_mask) + + dropout_rng = None + if not deterministic and self.config.attention_dropout > 0.0: + dropout_rng = self.make_rng("dropout") + + # During fast autoregressive decoding, we feed one position at a time, + # and cache the keys and values step by step. + if self.has_variable("cache", "cached_key") or init_cache: + key, value, attention_mask = self._concatenate_to_cache(key, value, query, attention_mask) + + # transform boolean mask into float mask + mask_value = jnp.finfo(self.dtype).min + attention_bias = lax.select( + attention_mask > 0, + jnp.full(attention_mask.shape, 0.0).astype(self.dtype), + jnp.full(attention_mask.shape, mask_value).astype(self.dtype), + ) + + attention_bias = attention_bias + alibi + + # Cast in fp32 if the original dtype is different from fp32 + attention_dtype = jnp.float32 if self.attention_softmax_in_fp32 else self.dtype + + attn_weights = dot_product_attention_weights( + query, + key, + bias=attention_bias, + dropout_rng=dropout_rng, + dropout_rate=self.config.attention_dropout, + deterministic=deterministic, + dtype=attention_dtype, + ) + + # Cast back in the original dtype if the native dtype is not fp32 + if self.attention_softmax_in_fp32: + attn_weights = attn_weights.astype(self.dtype) + + attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, value) + attn_output = self._merge_heads(attn_output) + attn_output = self.dense(attn_output) + attn_output = self.resid_dropout(attn_output, deterministic=deterministic) + + attn_output = attn_output + residual + + outputs = (attn_output, attn_weights) if output_attentions else (attn_output,) + return outputs + + +class BloomGELU(nn.Module): + def setup(self): + self.dtype = jnp.float32 + + def __call__(self, x): + return x * 0.5 * (1.0 + tanh(0.79788456 * x * (1 + 0.044715 * x * x))) + + +class FlaxBloomMLP(nn.Module): + config: BloomConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + hidden_size = self.config.hidden_size + + kernel_init = jax.nn.initializers.normal(self.config.initializer_range) + + self.dense_h_to_4h = nn.Dense(4 * hidden_size, dtype=self.dtype, kernel_init=kernel_init) + self.dense_4h_to_h = nn.Dense(hidden_size, dtype=self.dtype, kernel_init=kernel_init) + self.hidden_dropout = nn.Dropout(self.config.hidden_dropout) + self.act = BloomGELU() + + def __call__(self, hidden_states, residual, deterministic: bool = True): + hidden_states = self.dense_h_to_4h(hidden_states) + hidden_states = self.act(hidden_states) + + intermediate_output = self.dense_4h_to_h(hidden_states) + + intermediate_output = intermediate_output + residual + hidden_states = self.hidden_dropout(intermediate_output, deterministic=deterministic) + + return hidden_states + + +class FlaxBloomBlock(nn.Module): + config: BloomConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.input_layernorm = nn.LayerNorm(epsilon=self.config.layer_norm_epsilon, dtype=self.dtype) + + self.self_attention = FlaxBloomAttention(self.config, dtype=self.dtype) + self.post_attention_layernorm = nn.LayerNorm(epsilon=self.config.layer_norm_epsilon, dtype=self.dtype) + + self.mlp = FlaxBloomMLP(self.config, dtype=self.dtype) + + self.apply_residual_connection_post_layernorm = self.config.apply_residual_connection_post_layernorm + self.hidden_dropout = self.config.hidden_dropout + + def __call__( + self, + hidden_states, + alibi, + attention_mask=None, + deterministic: bool = True, + init_cache: bool = False, + output_attentions: bool = False, + ): + layernorm_output = self.input_layernorm(hidden_states) + + # layer norm before saving residual if config calls for it + if self.apply_residual_connection_post_layernorm: + residual = layernorm_output + else: + residual = hidden_states + + # self-attention + attn_outputs = self.self_attention( + layernorm_output, + residual=residual, + alibi=alibi, + attention_mask=attention_mask, + deterministic=deterministic, + init_cache=init_cache, + output_attentions=output_attentions, + ) + + attention_output = attn_outputs[0] + + outputs = attn_outputs[1:] + + post_layernorm = self.post_attention_layernorm(attention_output) + + # set residual based on config + if self.apply_residual_connection_post_layernorm: + residual = post_layernorm + else: + residual = attention_output + + output = self.mlp(post_layernorm, residual, deterministic=deterministic) + + outputs = (output,) + outputs + + return outputs + + +class FlaxBloomPreTrainedModel(FlaxPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = BloomConfig + base_model_prefix = "transformer" + module_class: nn.Module = None + + def __init__( + self, + config: BloomConfig, + input_shape: Tuple = (1, 1), + seed: int = 0, + dtype: jnp.dtype = jnp.float32, + _do_init: bool = True, + **kwargs, + ): + module = self.module_class(config=config, dtype=dtype, **kwargs) + super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init) + + def init_weights(self, rng: jax.random.PRNGKey, input_shape: Tuple, params: FrozenDict = None) -> FrozenDict: + # init input tensors + input_ids = jnp.zeros(input_shape, dtype="i4") + attention_mask = jnp.ones_like(input_ids) + params_rng, dropout_rng = jax.random.split(rng) + rngs = {"params": params_rng, "dropout": dropout_rng} + + random_params = self.module.init(rngs, input_ids, attention_mask, return_dict=False)["params"] + + if params is not None: + random_params = flatten_dict(unfreeze(random_params)) + params = flatten_dict(unfreeze(params)) + for missing_key in self._missing_keys: + params[missing_key] = random_params[missing_key] + self._missing_keys = set() + return freeze(unflatten_dict(params)) + else: + return random_params + + def init_cache(self, batch_size, max_length): + r""" + Args: + batch_size (`int`): + batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache. + max_length (`int`): + maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized + cache. + """ + # init input variables to retrieve cache + input_ids = jnp.ones((batch_size, max_length), dtype="i4") + attention_mask = jnp.ones_like(input_ids) + + init_variables = self.module.init( + jax.random.PRNGKey(0), input_ids, attention_mask, return_dict=False, init_cache=True + ) + return unfreeze(init_variables["cache"]) + + @add_start_docstrings_to_model_forward(BLOOM_INPUTS_DOCSTRING) + def __call__( + self, + input_ids, + attention_mask=None, + past_key_values: dict = None, + params: dict = None, + dropout_rng: jax.random.PRNGKey = None, + train: bool = False, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ): + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + batch_size, sequence_length = input_ids.shape + + if attention_mask is None: + attention_mask = jnp.ones((batch_size, sequence_length)) + + # Handle any PRNG if needed + rngs = {} + if dropout_rng is not None: + rngs["dropout"] = dropout_rng + + inputs = {"params": params or self.params} + + # If past_key_values are passed then cache is already initialized a private flag init_cache has to be passed + # down to ensure cache is used. It has to be made sure that cache is marked as mutable so that it can be + # changed by FlaxBloomAttention module + if past_key_values: + inputs["cache"] = past_key_values + mutable = ["cache"] + else: + mutable = False + + outputs = self.module.apply( + inputs, + jnp.array(input_ids, dtype="i4"), + jnp.array(attention_mask, dtype="i4"), + not train, + False, + output_attentions, + output_hidden_states, + return_dict, + rngs=rngs, + mutable=mutable, + ) + + # add updated cache to model output + if past_key_values is not None and return_dict: + outputs, past_key_values = outputs + outputs["past_key_values"] = unfreeze(past_key_values["cache"]) + return outputs + elif past_key_values is not None and not return_dict: + outputs, past_key_values = outputs + outputs = outputs[:1] + (unfreeze(past_key_values["cache"]),) + outputs[1:] + + return outputs + + +class FlaxBloomBlockCollection(nn.Module): + config: BloomConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.layers = [ + FlaxBloomBlock(self.config, name=str(layer_number), dtype=self.dtype) + for layer_number in range(self.config.num_hidden_layers) + ] + + def __call__( + self, + hidden_states, + alibi, + attention_mask=None, + deterministic: bool = True, + init_cache: bool = False, + output_attentions: bool = False, + output_hidden_states: bool = False, + ): + all_attentions = () if output_attentions else None + all_hidden_states = () if output_hidden_states else None + + for layer_number in range(self.config.num_hidden_layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + + layer_outputs = self.layers[layer_number]( + hidden_states, + alibi=alibi, + attention_mask=attention_mask, + deterministic=deterministic, + init_cache=init_cache, + output_attentions=output_attentions, + ) + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions += (layer_outputs[1],) + + # this contains possible `None` values - `FlaxBloomModule` will filter them out + outputs = (hidden_states, all_hidden_states, all_attentions) + + return outputs + + +class FlaxBloomModule(nn.Module): + config: BloomConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.embed_dim = self.config.hidden_size + + # word embeddings (no positional embedding layer) + self.word_embeddings = nn.Embed( + self.config.vocab_size, + self.embed_dim, + embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range), + dtype=self.dtype, + ) + + # post-embedding layernorm + self.word_embeddings_layernorm = nn.LayerNorm(epsilon=self.config.layer_norm_epsilon, dtype=self.dtype) + + # transformer layers + self.h = FlaxBloomBlockCollection(self.config, dtype=self.dtype) + + # final layernorm + self.ln_f = nn.LayerNorm(epsilon=self.config.layer_norm_epsilon, dtype=self.dtype) + + def __call__( + self, + input_ids=None, + attention_mask=None, + deterministic=True, + init_cache: bool = False, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + inputs_embeds = self.word_embeddings(input_ids) + # do post-embedding layernorm + hidden_states = self.word_embeddings_layernorm(inputs_embeds) + + # build alibi depending on `attention_mask` + alibi = build_alibi_tensor(attention_mask, self.config.n_head, dtype=hidden_states.dtype) + + outputs = self.h( + hidden_states, + alibi=alibi, + attention_mask=attention_mask, + deterministic=deterministic, + init_cache=init_cache, + output_hidden_states=output_hidden_states, + output_attentions=output_attentions, + ) + + hidden_states = outputs[0] + hidden_states = self.ln_f(hidden_states) + + if output_hidden_states: + all_hidden_states = outputs[1] + (hidden_states,) + outputs = (hidden_states, all_hidden_states) + outputs[2:] + else: + outputs = (hidden_states,) + outputs[1:] + + if not return_dict: + return tuple(v for v in [outputs[0], outputs[-1]] if v is not None) + + return FlaxBaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + hidden_states=outputs[1], + attentions=outputs[-1], + ) + + +@add_start_docstrings( + "The bare Bloom Model transformer outputting raw hidden-states without any specific head on top.", + BLOOM_START_DOCSTRING, +) +# Copied from transformers.models.gpt_neo.modeling_flax_gpt_neo.FlaxGPTNeoModel with GPTNeo->Bloom +class FlaxBloomModel(FlaxBloomPreTrainedModel): + module_class = FlaxBloomModule + + +append_call_sample_docstring(FlaxBloomModel, _CHECKPOINT_FOR_DOC, FlaxBaseModelOutput, _CONFIG_FOR_DOC) + + +class FlaxBloomForCausalLMModule(nn.Module): + config: BloomConfig + dtype: jnp.dtype = jnp.float32 + + def setup(self): + self.transformer = FlaxBloomModule(self.config, dtype=self.dtype) + self.lm_head = nn.Dense( + self.config.vocab_size, + use_bias=False, + dtype=self.dtype, + kernel_init=jax.nn.initializers.normal(stddev=self.config.initializer_range), + ) + + def __call__( + self, + input_ids, + attention_mask, + deterministic: bool = True, + init_cache: bool = False, + output_attentions: bool = False, + output_hidden_states: bool = False, + return_dict: bool = True, + ): + outputs = self.transformer( + input_ids, + attention_mask=attention_mask, + deterministic=deterministic, + init_cache=init_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + + if self.config.tie_word_embeddings: + shared_kernel = self.transformer.variables["params"]["word_embeddings"]["embedding"].T + lm_logits = self.lm_head.apply({"params": {"kernel": shared_kernel}}, hidden_states) + else: + lm_logits = self.lm_head(hidden_states) + + if not return_dict: + return (lm_logits,) + outputs[1:] + + return FlaxCausalLMOutput(logits=lm_logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions) + + +@add_start_docstrings( + """ + The Bloom Model transformer with a language modeling head on top (linear layer with weights tied to the input + embeddings). + """, + BLOOM_START_DOCSTRING, +) +class FlaxBloomForCausalLM(FlaxBloomPreTrainedModel): + module_class = FlaxBloomForCausalLMModule + + def prepare_inputs_for_generation(self, input_ids, max_length, attention_mask: Optional[jax.Array] = None): + # initializing the cache + batch_size, seq_length = input_ids.shape + + past_key_values = self.init_cache(batch_size, max_length) + # Note that usually one would have to put 0's in the attention_mask for + # x > input_ids.shape[-1] and x < cache_length. But since Bloom uses a causal mask, + # those positions are masked anyway. Thus, we can create a single static attention_mask here, + # which is more efficient for compilation + extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4") + if attention_mask is not None: + extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, attention_mask, (0, 0)) + + return { + "past_key_values": past_key_values, + "attention_mask": extended_attention_mask, + } + + def update_inputs_for_generation(self, model_outputs, model_kwargs): + model_kwargs["past_key_values"] = model_outputs.past_key_values + return model_kwargs + + +append_call_sample_docstring(FlaxBloomForCausalLM, _CHECKPOINT_FOR_DOC, FlaxCausalLMOutput, _CONFIG_FOR_DOC) + + +__all__ = ["FlaxBloomForCausalLM", "FlaxBloomModel", "FlaxBloomPreTrainedModel"] diff --git a/docs/transformers/build/lib/transformers/models/bloom/tokenization_bloom_fast.py b/docs/transformers/build/lib/transformers/models/bloom/tokenization_bloom_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..c84322637cb7e879b2cf50fdbd81fdc79b08ed98 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bloom/tokenization_bloom_fast.py @@ -0,0 +1,152 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for Bloom.""" + +import pickle +from typing import Optional, Tuple + +from ...tokenization_utils_base import BatchEncoding +from ...tokenization_utils_fast import PreTrainedTokenizerFast +from ...utils import logging + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"tokenizer_file": "tokenizer.json"} + + +class BloomTokenizerFast(PreTrainedTokenizerFast): + """ + Construct a "fast" Bloom tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level + Byte-Pair-Encoding. + + This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will + be encoded differently whether it is at the beginning of the sentence (without space) or not: + + ```python + >>> from transformers import BloomTokenizerFast + + >>> tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloom") + >>> tokenizer("Hello world")["input_ids"] + [59414, 8876] + + >>> tokenizer(" Hello world")["input_ids"] + [86153, 8876] + ``` + + You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer, but since + the model was not pretrained this way, it might yield a decrease in performance. + + + + When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with `add_prefix_space=True`. + + + + This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should + refer to this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + Path to the vocabulary file. + merges_file (`str`): + Path to the merges file. + errors (`str`, *optional*, defaults to `"replace"`): + Paradigm to follow when decoding bytes to UTF-8. See + [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information. + unk_token (`str`, *optional*, defaults to `<|endoftext|>`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + bos_token (`str`, *optional*, defaults to `<|endoftext|>`): + The beginning of sequence token. + eos_token (`str`, *optional*, defaults to `<|endoftext|>`): + The end of sequence token. + add_prefix_space (`bool`, *optional*, defaults to `False`): + Whether or not to add an initial space to the input. This allows to treat the leading word just as any + other word. (Bloom tokenizer detect beginning of words by the preceding space). + trim_offsets (`bool`, *optional*, defaults to `True`): + Whether or not the post-processing step should trim offsets to avoid including whitespaces. + """ + + vocab_files_names = VOCAB_FILES_NAMES + model_input_names = ["input_ids", "attention_mask"] + slow_tokenizer_class = None + # No `max_model_input_sizes` as BLOOM uses ALiBi positional embeddings + + def __init__( + self, + vocab_file=None, + merges_file=None, + tokenizer_file=None, + unk_token="", + bos_token="", + eos_token="", + pad_token="", + add_prefix_space=False, + clean_up_tokenization_spaces=False, + **kwargs, + ): + super().__init__( + vocab_file=vocab_file, + merges_file=merges_file, + tokenizer_file=tokenizer_file, + unk_token=unk_token, + bos_token=bos_token, + eos_token=eos_token, + pad_token=pad_token, + add_prefix_space=add_prefix_space, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, + **kwargs, + ) + # TODO @ArthurZucker this can only work one way for now, to update later-on. Tests should also properly + # check this as they were green before. + pre_tok_state = pickle.dumps(self.backend_tokenizer.pre_tokenizer) + decoder_state = pickle.dumps(self.backend_tokenizer.decoder) + + if add_prefix_space: + pre_tok_state = pre_tok_state.replace(b'"add_prefix_space":false', b'"add_prefix_space": true') + decoder_state = decoder_state.replace(b'"add_prefix_space":false', b'"add_prefix_space": true') + self.backend_tokenizer.pre_tokenizer = pickle.loads(pre_tok_state) + self.backend_tokenizer.decoder = pickle.loads(decoder_state) + + self.add_prefix_space = add_prefix_space + + def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding: + is_split_into_words = kwargs.get("is_split_into_words", False) + if not (self.add_prefix_space or not is_split_into_words): + raise Exception( + f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True to use it with" + " pretokenized inputs." + ) + + return super()._batch_encode_plus(*args, **kwargs) + + def _encode_plus(self, *args, **kwargs) -> BatchEncoding: + is_split_into_words = kwargs.get("is_split_into_words", False) + + if not (self.add_prefix_space or not is_split_into_words): + raise Exception( + f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True to use it with" + " pretokenized inputs." + ) + + return super()._encode_plus(*args, **kwargs) + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + files = self._tokenizer.model.save(save_directory, name=filename_prefix) + return tuple(files) + + +__all__ = ["BloomTokenizerFast"] diff --git a/docs/transformers/build/lib/transformers/models/bridgetower/__init__.py b/docs/transformers/build/lib/transformers/models/bridgetower/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..8ca84a320fdc4aa1f4cf99aef78154849514c8a6 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bridgetower/__init__.py @@ -0,0 +1,30 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_bridgetower import * + from .image_processing_bridgetower import * + from .image_processing_bridgetower_fast import * + from .modeling_bridgetower import * + from .processing_bridgetower import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/docs/transformers/build/lib/transformers/models/bridgetower/configuration_bridgetower.py b/docs/transformers/build/lib/transformers/models/bridgetower/configuration_bridgetower.py new file mode 100644 index 0000000000000000000000000000000000000000..6a3d9072defadcc329ff8da060945f5945f59ba9 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bridgetower/configuration_bridgetower.py @@ -0,0 +1,319 @@ +# coding=utf-8 +# Copyright 2023 The Intel Labs Team Authors, The Microsoft Research Team Authors and HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License=, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing=, software +# distributed under the License is distributed on an "AS IS" BASIS=, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND=, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""BridgeTower model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class BridgeTowerVisionConfig(PretrainedConfig): + r""" + This is the configuration class to store the vision configuration of a [`BridgeTowerModel`]. Instantiating a + configuration with the defaults will yield a similar configuration to that of the bridgetower-base + [BridgeTower/bridgetower-base](https://huggingface.co/BridgeTower/bridgetower-base/) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in visual encoder model. + patch_size (`int`, *optional*, defaults to 16): + The size (resolution) of each patch. + image_size (`int`, *optional*, defaults to 288): + The size (resolution) of each image. + initializer_factor (`float`, *optional*, defaults to 1): + A factor for initializing all weight matrices (should be kept to 1, used internally for initialization + testing). + layer_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the layer normalization layers. + stop_gradient (`bool`, *optional*, defaults to `False`): + Whether to stop gradient for training. + share_layernorm (`bool`, *optional*, defaults to `True`): + Whether LayerNorm layers are shared. + remove_last_layer (`bool`, *optional*, defaults to `False`): + Whether to remove the last layer from the vision encoder. + + + Example: + + ```python + >>> from transformers import BridgeTowerVisionConfig + + >>> # Initializing a BridgeTower BridgeTower/bridgetower-base style configuration for the vision model + >>> configuration = BridgeTowerVisionConfig() + + >>> # Accessing the configuration + >>> configuration + ```""" + + model_type = "bridgetower_vision_model" + base_config_key = "vision_config" + + def __init__( + self, + hidden_size=768, + num_hidden_layers=12, + num_channels=3, + patch_size=16, + image_size=288, + initializer_factor=1, + layer_norm_eps=1e-05, + stop_gradient=False, + share_layernorm=True, + remove_last_layer=False, + **kwargs, + ): + super().__init__(**kwargs) + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_channels = num_channels + self.patch_size = patch_size + self.image_size = image_size + self.initializer_factor = initializer_factor + self.layer_norm_eps = layer_norm_eps + self.stop_gradient = stop_gradient + self.share_layernorm = share_layernorm + self.remove_last_layer = remove_last_layer + + +class BridgeTowerTextConfig(PretrainedConfig): + r""" + This is the configuration class to store the text configuration of a [`BridgeTowerModel`]. The default values here + are copied from RoBERTa. Instantiating a configuration with the defaults will yield a similar configuration to that + of the bridgetower-base [BridegTower/bridgetower-base](https://huggingface.co/BridgeTower/bridgetower-base/) + architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vocab_size (`int`, *optional*, defaults to 50265): + Vocabulary size of the text part of the model. Defines the number of different tokens that can be + represented by the `inputs_ids` passed when calling [`BridgeTowerModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 514): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the `token_type_ids`. + initializer_factor (`float`, *optional*, defaults to 1): + A factor for initializing all weight matrices (should be kept to 1, used internally for initialization + testing). + layer_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the layer normalization layers. + position_embedding_type (`str`, *optional*, defaults to `"absolute"`): + Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For + positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to + [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). + For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models + with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658). + is_decoder (`bool`, *optional*, defaults to `False`): + Whether the model is used as a decoder or not. If `False`, the model is used as an encoder. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + + Example: + + ```python + >>> from transformers import BridgeTowerTextConfig + + >>> # Initializing a BridgeTower BridgeTower/bridgetower-base style configuration for the text model + >>> configuration = BridgeTowerTextConfig() + + >>> # Accessing the configuration + >>> configuration + ```""" + + model_type = "bridgetower_text_model" + base_config_key = "text_config" + + def __init__( + self, + vocab_size=50265, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + initializer_factor=1, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=514, + type_vocab_size=1, + layer_norm_eps=1e-05, + pad_token_id=1, + bos_token_id=0, + eos_token_id=2, + position_embedding_type="absolute", + use_cache=True, + **kwargs, + ): + super().__init__(**kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.initializer_factor = initializer_factor + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.layer_norm_eps = layer_norm_eps + self.position_embedding_type = position_embedding_type + self.use_cache = use_cache + self.pad_token_id = pad_token_id + self.bos_token_id = bos_token_id + self.eos_token_id = eos_token_id + + +class BridgeTowerConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`BridgeTowerModel`]. It is used to instantiate a + BridgeTower model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the bridgetower-base + [BridgeTower/bridgetower-base](https://huggingface.co/BridgeTower/bridgetower-base/) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + share_cross_modal_transformer_layers (`bool`, *optional*, defaults to `True`): + Whether cross modal transformer layers are shared. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + initializer_factor (`float`, *optional*, defaults to 1): + A factor for initializing all weight matrices (should be kept to 1, used internally for initialization + testing). + layer_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the layer normalization layers. + share_link_tower_layers (`bool`, *optional*, defaults to `False`): + Whether the bride/link tower layers are shared. + link_tower_type (`str`, *optional*, defaults to `"add"`): + Type of the bridge/link layer. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + num_hidden_layers (`int`, *optional*, defaults to 6): + Number of hidden layers in the Transformer encoder. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether to tie input and output embeddings. + init_layernorm_from_vision_encoder (`bool`, *optional*, defaults to `False`): + Whether to init LayerNorm from the vision encoder. + text_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`BridgeTowerTextConfig`]. + vision_config (`dict`, *optional*): + Dictionary of configuration options used to initialize [`BridgeTowerVisionConfig`]. + + Example: + + ```python + >>> from transformers import BridgeTowerModel, BridgeTowerConfig + + >>> # Initializing a BridgeTower BridgeTower/bridgetower-base style configuration + >>> configuration = BridgeTowerConfig() + + >>> # Initializing a model from the BridgeTower/bridgetower-base style configuration + >>> model = BridgeTowerModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "bridgetower" + sub_configs = {"text_config": BridgeTowerTextConfig, "vision_config": BridgeTowerVisionConfig} + + def __init__( + self, + share_cross_modal_transformer_layers=True, + hidden_act="gelu", + hidden_size=768, + initializer_factor=1, + layer_norm_eps=1e-05, + share_link_tower_layers=False, + link_tower_type="add", + num_attention_heads=12, + num_hidden_layers=6, + tie_word_embeddings=False, + init_layernorm_from_vision_encoder=False, + text_config=None, + vision_config=None, + **kwargs, + ): + # TODO: remove this once the Hub files are updated. + _ = kwargs.pop("text_config_dict", None) + _ = kwargs.pop("vision_config_dict", None) + + super().__init__(**kwargs) + self.share_cross_modal_transformer_layers = share_cross_modal_transformer_layers + self.hidden_act = hidden_act + self.hidden_size = hidden_size + self.initializer_factor = initializer_factor + self.layer_norm_eps = layer_norm_eps + self.share_link_tower_layers = share_link_tower_layers + self.link_tower_type = link_tower_type + self.num_attention_heads = num_attention_heads + self.num_hidden_layers = num_hidden_layers + self.tie_word_embeddings = tie_word_embeddings + self.init_layernorm_from_vision_encoder = init_layernorm_from_vision_encoder + + if text_config is None: + text_config = {} + logger.info("`text_config` is `None`. Initializing the `BridgeTowerTextConfig` with default values.") + + if vision_config is None: + vision_config = {} + logger.info("`vision_config` is `None`. Initializing the `BridgeTowerVisionConfig` with default values.") + + self.text_config = BridgeTowerTextConfig(**text_config) + self.vision_config = BridgeTowerVisionConfig(**vision_config) + + @classmethod + def from_text_vision_configs( + cls, text_config: BridgeTowerTextConfig, vision_config: BridgeTowerVisionConfig, **kwargs + ): + r""" + Instantiate a [`BridgeTowerConfig`] (or a derived class) from BridgeTower text model configuration. Returns: + [`BridgeTowerConfig`]: An instance of a configuration object + """ + + return cls(text_config=text_config.to_dict(), vision_config=vision_config.to_dict(), **kwargs) + + +__all__ = ["BridgeTowerConfig", "BridgeTowerTextConfig", "BridgeTowerVisionConfig"] diff --git a/docs/transformers/build/lib/transformers/models/bridgetower/image_processing_bridgetower.py b/docs/transformers/build/lib/transformers/models/bridgetower/image_processing_bridgetower.py new file mode 100644 index 0000000000000000000000000000000000000000..95eaa9f88b9266fbcece143803d7fd4f3a9485e0 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bridgetower/image_processing_bridgetower.py @@ -0,0 +1,541 @@ +# coding=utf-8 +# Copyright 2023 The Intel Labs Team Authors, The Microsoft Research Team Authors and HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Image processor class for BridgeTower.""" + +from typing import Any, Dict, Iterable, List, Optional, Tuple, Union + +import numpy as np + +from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ...image_transforms import PaddingMode, center_crop, pad, resize, to_channel_dimension_format +from ...image_utils import ( + OPENAI_CLIP_MEAN, + OPENAI_CLIP_STD, + ChannelDimension, + ImageInput, + PILImageResampling, + get_image_size, + infer_channel_dimension_format, + is_scaled_image, + make_flat_list_of_images, + to_numpy_array, + valid_images, + validate_preprocess_arguments, +) +from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging + + +if is_vision_available(): + import PIL + +logger = logging.get_logger(__name__) + + +# Copied from transformers.models.vilt.image_processing_vilt.max_across_indices +def max_across_indices(values: Iterable[Any]) -> List[Any]: + """ + Return the maximum value across all indices of an iterable of values. + """ + return [max(values_i) for values_i in zip(*values)] + + +# Copied from transformers.models.vilt.image_processing_vilt.make_pixel_mask +def make_pixel_mask( + image: np.ndarray, output_size: Tuple[int, int], input_data_format: Optional[Union[str, ChannelDimension]] = None +) -> np.ndarray: + """ + Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding. + + Args: + image (`np.ndarray`): + Image to make the pixel mask for. + output_size (`Tuple[int, int]`): + Output size of the mask. + """ + input_height, input_width = get_image_size(image, channel_dim=input_data_format) + mask = np.zeros(output_size, dtype=np.int64) + mask[:input_height, :input_width] = 1 + return mask + + +# Copied from transformers.models.vilt.image_processing_vilt.get_max_height_width +def get_max_height_width( + images: List[np.ndarray], input_data_format: Optional[Union[str, ChannelDimension]] = None +) -> List[int]: + """ + Get the maximum height and width across all images in a batch. + """ + if input_data_format is None: + input_data_format = infer_channel_dimension_format(images[0]) + + if input_data_format == ChannelDimension.FIRST: + _, max_height, max_width = max_across_indices([img.shape for img in images]) + elif input_data_format == ChannelDimension.LAST: + max_height, max_width, _ = max_across_indices([img.shape for img in images]) + else: + raise ValueError(f"Invalid channel dimension format: {input_data_format}") + return (max_height, max_width) + + +# Copied from transformers.models.vilt.image_processing_vilt.get_resize_output_image_size +def get_resize_output_image_size( + input_image: np.ndarray, + shorter: int = 800, + longer: int = 1333, + size_divisor: int = 32, + input_data_format: Optional[Union[str, ChannelDimension]] = None, +) -> Tuple[int, int]: + input_height, input_width = get_image_size(input_image, input_data_format) + min_size, max_size = shorter, longer + + scale = min_size / min(input_height, input_width) + + if input_height < input_width: + new_height = min_size + new_width = scale * input_width + else: + new_height = scale * input_height + new_width = min_size + + if max(new_height, new_width) > max_size: + scale = max_size / max(new_height, new_width) + new_height = scale * new_height + new_width = scale * new_width + + new_height, new_width = int(new_height + 0.5), int(new_width + 0.5) + new_height = new_height // size_divisor * size_divisor + new_width = new_width // size_divisor * size_divisor + + return new_height, new_width + + +class BridgeTowerImageProcessor(BaseImageProcessor): + r""" + Constructs a BridgeTower image processor. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the + `do_resize` parameter in the `preprocess` method. + size (`Dict[str, int]` *optional*, defaults to `{'shortest_edge': 288}`): + Resize the shorter side of the input to `size["shortest_edge"]`. The longer side will be limited to under + `int((1333 / 800) * size["shortest_edge"])` while preserving the aspect ratio. Only has an effect if + `do_resize` is set to `True`. Can be overridden by the `size` parameter in the `preprocess` method. + size_divisor (`int`, *optional*, defaults to 32): + The size by which to make sure both the height and width can be divided. Only has an effect if `do_resize` + is set to `True`. Can be overridden by the `size_divisor` parameter in the `preprocess` method. + resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`): + Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be + overridden by the `resample` parameter in the `preprocess` method. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale` + parameter in the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to `1/255`): + Scale factor to use if rescaling the image. Only has an effect if `do_rescale` is set to `True`. Can be + overridden by the `rescale_factor` parameter in the `preprocess` method. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess` + method. Can be overridden by the `do_normalize` parameter in the `preprocess` method. + image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`): + Mean to use if normalizing the image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. Can be + overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`): + Standard deviation to use if normalizing the image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. + Can be overridden by the `image_std` parameter in the `preprocess` method. + do_center_crop (`bool`, *optional*, defaults to `True`): + Whether to center crop the image. Can be overridden by the `do_center_crop` parameter in the `preprocess` + method. + crop_size (`Dict[str, int]`, *optional*): + Desired output size when applying center-cropping. Only has an effect if `do_center_crop` is set to `True`. + Can be overridden by the `crop_size` parameter in the `preprocess` method. If unset defaults to `size`, + do_pad (`bool`, *optional*, defaults to `True`): + Whether to pad the image to the `(max_height, max_width)` of the images in the batch. Can be overridden by + the `do_pad` parameter in the `preprocess` method. + """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize: bool = True, + size: Dict[str, int] = None, + size_divisor: int = 32, + resample: PILImageResampling = PILImageResampling.BICUBIC, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 1 / 255, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_center_crop: bool = True, + crop_size: Dict[str, int] = None, + do_pad: bool = True, + **kwargs, + ) -> None: + if "pad_and_return_pixel_mask" in kwargs: + do_pad = kwargs.pop("pad_and_return_pixel_mask") + + super().__init__(**kwargs) + size = size if size is not None else {"shortest_edge": 288} + size = get_size_dict(size, default_to_square=False) + + self.do_resize = do_resize + self.size = size + self.size_divisor = size_divisor + self.resample = resample + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN + self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD + self.do_pad = do_pad + self.do_center_crop = do_center_crop + self.crop_size = crop_size + + # Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor.resize + def resize( + self, + image: np.ndarray, + size: Dict[str, int], + size_divisor: int = 32, + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Resize an image. + + Resizes the shorter side of the image to `size["shortest_edge"]` while preserving the aspect ratio. If the + longer side is larger than the max size `(int(`size["shortest_edge"]` * 1333 / 800))`, the longer side is then + resized to the max size while preserving the aspect ratio. + + Args: + image (`np.ndarray`): + Image to resize. + size (`Dict[str, int]`): + Controls the size of the output image. Should be of the form `{"shortest_edge": int}`. + size_divisor (`int`, *optional*, defaults to 32): + The image is resized to a size that is a multiple of this value. + resample (`PILImageResampling` filter, *optional*, defaults to `PILImageResampling.BICUBIC`): + Resampling filter to use when resiizing the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + input_data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred. + """ + size = get_size_dict(size, default_to_square=False) + if "shortest_edge" not in size: + raise ValueError(f"The `size` dictionary must contain the key `shortest_edge`. Got {size.keys()}") + shorter = size["shortest_edge"] + longer = int(1333 / 800 * shorter) + output_size = get_resize_output_image_size( + image, shorter=shorter, longer=longer, size_divisor=size_divisor, input_data_format=input_data_format + ) + return resize( + image, + size=output_size, + resample=resample, + data_format=data_format, + input_data_format=input_data_format, + **kwargs, + ) + + def center_crop( + self, + image: np.ndarray, + size: Dict[str, int], + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Center crop an image to `(size["height"], size["width"])`. If the input size is smaller than `crop_size` along + any edge, the image is padded with 0's and then center cropped. + + Args: + image (`np.ndarray`): + Image to center crop. + size (`Dict[str, int]`): + Size of the output image in the form `{"height": h, "width": w}`. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred from the input + image. + """ + output_size = size["shortest_edge"] + return center_crop( + image, + size=(output_size, output_size), + data_format=data_format, + input_data_format=input_data_format, + **kwargs, + ) + + # Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor._pad_image + def _pad_image( + self, + image: np.ndarray, + output_size: Tuple[int, int], + constant_values: Union[float, Iterable[float]] = 0, + data_format: Optional[ChannelDimension] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> np.ndarray: + """ + Pad an image with zeros to the given size. + """ + input_height, input_width = get_image_size(image, channel_dim=input_data_format) + output_height, output_width = output_size + + pad_bottom = output_height - input_height + pad_right = output_width - input_width + padding = ((0, pad_bottom), (0, pad_right)) + padded_image = pad( + image, + padding, + mode=PaddingMode.CONSTANT, + constant_values=constant_values, + data_format=data_format, + input_data_format=input_data_format, + ) + return padded_image + + # Copied from transformers.models.vilt.image_processing_vilt.ViltImageProcessor.pad + def pad( + self, + images: List[np.ndarray], + constant_values: Union[float, Iterable[float]] = 0, + return_pixel_mask: bool = True, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Optional[ChannelDimension] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> BatchFeature: + """ + Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width + in the batch and optionally returns their corresponding pixel mask. + + Args: + image (`np.ndarray`): + Image to pad. + constant_values (`float` or `Iterable[float]`, *optional*): + The value to use for the padding if `mode` is `"constant"`. + return_pixel_mask (`bool`, *optional*, defaults to `True`): + Whether to return a pixel mask. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred. + """ + pad_size = get_max_height_width(images, input_data_format=input_data_format) + + padded_images = [ + self._pad_image( + image, + pad_size, + constant_values=constant_values, + data_format=data_format, + input_data_format=input_data_format, + ) + for image in images + ] + data = {"pixel_values": padded_images} + + if return_pixel_mask: + masks = [ + make_pixel_mask(image=image, output_size=pad_size, input_data_format=input_data_format) + for image in images + ] + data["pixel_mask"] = masks + + return BatchFeature(data=data, tensor_type=return_tensors) + + @filter_out_non_signature_kwargs() + def preprocess( + self, + images: ImageInput, + do_resize: Optional[bool] = None, + size: Optional[Dict[str, int]] = None, + size_divisor: Optional[int] = None, + resample: PILImageResampling = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_pad: Optional[bool] = None, + do_center_crop: Optional[bool] = None, + crop_size: Dict[str, int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: ChannelDimension = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> PIL.Image.Image: + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If + passing in images with pixel values between 0 and 1, set `do_rescale=False`. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to `self.size`): + Controls the size of the image after `resize`. The shortest edge of the image is resized to + `size["shortest_edge"]` whilst preserving the aspect ratio. If the longest edge of this resized image + is > `int(size["shortest_edge"] * (1333 / 800))`, then the image is resized again to make the longest + edge equal to `int(size["shortest_edge"] * (1333 / 800))`. + size_divisor (`int`, *optional*, defaults to `self.size_divisor`): + The image is resized to a size that is a multiple of this value. + resample (`PILImageResampling`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image values between [0 - 1]. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): + Image mean to normalize the image by if `do_normalize` is set to `True`. + image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to normalize the image by if `do_normalize` is set to `True`. + do_pad (`bool`, *optional*, defaults to `self.do_pad`): + Whether to pad the image to the (max_height, max_width) in the batch. If `True`, a pixel mask is also + created and returned. + do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`): + Whether to center crop the image. If the input size is smaller than `crop_size` along any edge, the + image is padded with 0's and then center cropped. + crop_size (`Dict[str, int]`, *optional*, defaults to `self.crop_size`): + Size of the image after center crop. If one edge the image is smaller than `crop_size`, it will be + padded with zeros and then cropped + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + """ + do_resize = do_resize if do_resize is not None else self.do_resize + size_divisor = size_divisor if size_divisor is not None else self.size_divisor + resample = resample if resample is not None else self.resample + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + do_pad = do_pad if do_pad is not None else self.do_pad + do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop + # For backwards compatibility. Initial version of this processor was cropping to the "size" argument, which + # it should default to if crop_size is undefined. + crop_size = ( + crop_size if crop_size is not None else (self.crop_size if self.crop_size is not None else self.size) + ) + + size = size if size is not None else self.size + size = get_size_dict(size, default_to_square=False) + images = make_flat_list_of_images(images) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + # Here, crop_size is used only if it is set, else size will be used. + validate_preprocess_arguments( + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_pad=do_pad, + size_divisibility=size_divisor, + do_center_crop=do_center_crop, + crop_size=crop_size, + do_resize=do_resize, + size=size, + resample=resample, + ) + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + + if do_rescale and is_scaled_image(images[0]): + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + + if do_resize: + images = [ + self.resize( + image=image, + size=size, + size_divisor=size_divisor, + resample=resample, + input_data_format=input_data_format, + ) + for image in images + ] + + if do_center_crop: + images = [ + self.center_crop(image=image, size=crop_size, input_data_format=input_data_format) for image in images + ] + + if do_rescale: + images = [ + self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) + for image in images + ] + + if do_normalize: + images = [ + self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format) + for image in images + ] + + images = [ + to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images + ] + + if do_pad: + encoded_outputs = self.pad( + images, return_pixel_mask=True, return_tensors=return_tensors, input_data_format=data_format + ) + else: + encoded_outputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors) + + return encoded_outputs + + +__all__ = ["BridgeTowerImageProcessor"] diff --git a/docs/transformers/build/lib/transformers/models/bridgetower/image_processing_bridgetower_fast.py b/docs/transformers/build/lib/transformers/models/bridgetower/image_processing_bridgetower_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..7af3213854fd1795bfa59b35c612a6894b925d57 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bridgetower/image_processing_bridgetower_fast.py @@ -0,0 +1,345 @@ +# coding=utf-8 +# Copyright 2025 The Intel Labs Team Authors, The Microsoft Research Team Authors and HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Fast Image processor class for BridgeTower.""" + +from typing import Dict, Iterable, Optional, Tuple, Union + +from ...image_processing_utils_fast import ( + BASE_IMAGE_PROCESSOR_FAST_DOCSTRING, + BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS, + BaseImageProcessorFast, + BatchFeature, + DefaultFastImageProcessorKwargs, + ImageInput, + SizeDict, + TensorType, + Unpack, + get_max_height_width, + group_images_by_shape, + reorder_images, +) +from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, PILImageResampling +from ...utils import add_start_docstrings, is_torch_available, is_torchvision_available, is_torchvision_v2_available + + +if is_torch_available(): + import torch + +if is_torchvision_available(): + if is_torchvision_v2_available(): + from torchvision.transforms.v2 import functional as F + else: + from torchvision.transforms import functional as F + + +def make_pixel_mask( + image: "torch.Tensor", + output_size: Tuple[int, int], +) -> "torch.Tensor": + """ + Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding. + + Args: + image (`np.ndarray`): + Image to make the pixel mask for. + output_size (`Tuple[int, int]`): + Output size of the mask. + """ + input_height, input_width = image.shape[-2:] + batch_size = image.size(0) + mask = torch.zeros((batch_size, *output_size), dtype=torch.long) + mask[:input_height, :input_width] = 1 + return mask + + +def get_resize_output_image_size( + input_image: "torch.Tensor", + shorter: int = 800, + longer: int = 1333, + size_divisor: int = 32, +) -> Tuple[int, int]: + input_height, input_width = input_image.shape[-2:] + min_size, max_size = shorter, longer + + scale = min_size / min(input_height, input_width) + + if input_height < input_width: + new_height = min_size + new_width = scale * input_width + else: + new_height = scale * input_height + new_width = min_size + + if max(new_height, new_width) > max_size: + scale = max_size / max(new_height, new_width) + new_height = scale * new_height + new_width = scale * new_width + + new_height, new_width = int(new_height + 0.5), int(new_width + 0.5) + new_height = new_height // size_divisor * size_divisor + new_width = new_width // size_divisor * size_divisor + + return new_height, new_width + + +class BridgeTowerFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): + size_divisor: Optional[int] + do_pad: Optional[bool] + + +@add_start_docstrings( + "Constructs a fast BridgeTower image processor.", + BASE_IMAGE_PROCESSOR_FAST_DOCSTRING, + """ + size_divisor (`int`, *optional*, defaults to 32): + The size by which to make sure both the height and width can be divided. Only has an effect if `do_resize` + is set to `True`. Can be overridden by the `size_divisor` parameter in the `preprocess` method. + do_pad (`bool`, *optional*, defaults to `True`): + Whether to pad the image to the `(max_height, max_width)` of the images in the batch. Can be overridden by + the `do_pad` parameter in the `preprocess` method. + """, +) +class BridgeTowerImageProcessorFast(BaseImageProcessorFast): + resample = PILImageResampling.BICUBIC + image_mean = OPENAI_CLIP_MEAN + image_std = OPENAI_CLIP_STD + size = {"shortest_edge": 288} + default_to_square = False + crop_size = {"shortest_edge": 288} + do_resize = True + do_center_crop = True + do_rescale = True + do_normalize = True + do_pad = True + size_divisor = 32 + valid_kwargs = BridgeTowerFastImageProcessorKwargs + + def __init__(self, **kwargs: Unpack[BridgeTowerFastImageProcessorKwargs]): + super().__init__(**kwargs) + + @add_start_docstrings( + BASE_IMAGE_PROCESSOR_FAST_DOCSTRING_PREPROCESS, + """ + size_divisor (`int`, *optional*, defaults to 32): + The size by which to make sure both the height and width can be divided. Only has an effect if `do_resize` + is set to `True`. Can be overridden by the `size_divisor` parameter in the `preprocess` method. + do_pad (`bool`, *optional*, defaults to `True`): + Whether to pad the image to the `(max_height, max_width)` of the images in the batch. Can be overridden by + the `do_pad` parameter in the `preprocess` method. + """, + ) + def preprocess(self, images: ImageInput, **kwargs: Unpack[BridgeTowerFastImageProcessorKwargs]) -> BatchFeature: + return super().preprocess(images, **kwargs) + + def resize( + self, + image: "torch.Tensor", + size: SizeDict, + size_divisor: int = 32, + interpolation: "F.InterpolationMode" = None, + antialias: bool = True, + **kwargs, + ) -> "torch.Tensor": + """ + Resize an image. + + Resizes the shorter side of the image to `size["shortest_edge"]` while preserving the aspect ratio. If the + longer side is larger than the max size `(int(`size["shortest_edge"]` * 1333 / 800))`, the longer side is then + resized to the max size while preserving the aspect ratio. + + Args: + image (`torch.Tensor`): + Image to resize. + size (`SizeDict`): + Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image. + size_divisor (`int`, *optional*, defaults to 32): + The image is resized to a size that is a multiple of this value. + resample (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`): + `InterpolationMode` filter to use when resizing the image e.g. `InterpolationMode.BICUBIC`. + + Returns: + `torch.Tensor`: The resized image. + """ + interpolation = interpolation if interpolation is not None else F.InterpolationMode.BILINEAR + if not size.shortest_edge: + raise ValueError(f"The `size` dictionary must contain the key `shortest_edge`. Got {size.keys()}") + shorter = size.shortest_edge + longer = int(1333 / 800 * shorter) + output_size = get_resize_output_image_size( + image, + shorter=shorter, + longer=longer, + size_divisor=size_divisor, + ) + return F.resize(image, output_size, interpolation=interpolation, antialias=antialias) + + def center_crop( + self, + image: "torch.Tensor", + size: Dict[str, int], + **kwargs, + ) -> "torch.Tensor": + """ + Center crop an image to `(size["height"], size["width"])`. If the input size is smaller than `crop_size` along + any edge, the image is padded with 0's and then center cropped. + + Args: + image (`torch.Tensor`): + Image to center crop. + size (`Dict[str, int]`): + Size of the output image in the form `{"height": h, "width": w}`. + """ + output_size = size.shortest_edge + return F.center_crop( + image, + output_size=(output_size, output_size), + **kwargs, + ) + + def _pad_image( + self, + image: "torch.Tensor", + output_size: Tuple[int, int], + constant_values: Union[float, Iterable[float]] = 0, + ) -> "torch.Tensor": + """ + Pad an image with zeros to the given size. + """ + input_height, input_width = image.shape[-2:] + output_height, output_width = output_size + + pad_bottom = output_height - input_height + pad_right = output_width - input_width + padding = (0, 0, pad_right, pad_bottom) + padded_image = F.pad( + image, + padding, + fill=constant_values, + ) + return padded_image + + def pad( + self, + images: list["torch.Tensor"], + constant_values: Union[float, Iterable[float]] = 0, + return_pixel_mask: bool = True, + ) -> tuple: + """ + Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width + in the batch and optionally returns their corresponding pixel mask. + + Args: + image (`torch.Tensor`): + Image to pad. + constant_values (`float` or `Iterable[float]`, *optional*): + The value to use for the padding if `mode` is `"constant"`. + return_pixel_mask (`bool`, *optional*, defaults to `True`): + Whether to return a pixel mask. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + """ + pad_size = get_max_height_width(images) + + grouped_images, grouped_images_index = group_images_by_shape(images) + processed_images_grouped = {} + processed_masks_grouped = {} + for shape, stacked_images in grouped_images.items(): + stacked_images = self._pad_image( + stacked_images, + pad_size, + constant_values=constant_values, + ) + processed_images_grouped[shape] = stacked_images + + if return_pixel_mask: + stacked_masks = make_pixel_mask(image=stacked_images, output_size=pad_size) + processed_masks_grouped[shape] = stacked_masks + + processed_images = reorder_images(processed_images_grouped, grouped_images_index) + + processed_masks = None + if return_pixel_mask: + processed_masks = reorder_images(processed_masks_grouped, grouped_images_index) + + return processed_images, processed_masks + + def _preprocess( + self, + images: list["torch.Tensor"], + do_resize: bool, + size: SizeDict, + size_divisor: Optional[int], + interpolation: Optional["F.InterpolationMode"], + do_pad: bool, + do_center_crop: bool, + crop_size: SizeDict, + do_rescale: bool, + rescale_factor: float, + do_normalize: bool, + image_mean: Optional[Union[float, list[float]]], + image_std: Optional[Union[float, list[float]]], + return_tensors: Optional[Union[str, TensorType]], + **kwargs, + ) -> BatchFeature: + # Group images by size for batched resizing + grouped_images, grouped_images_index = group_images_by_shape(images) + resized_images_grouped = {} + for shape, stacked_images in grouped_images.items(): + if do_resize: + stacked_images = self.resize( + image=stacked_images, size=size, size_divisor=size_divisor, interpolation=interpolation + ) + resized_images_grouped[shape] = stacked_images + resized_images = reorder_images(resized_images_grouped, grouped_images_index) + + # Group images by size for further processing + # Needed in case do_resize is False, or resize returns images with different sizes + grouped_images, grouped_images_index = group_images_by_shape(resized_images) + processed_images_grouped = {} + for shape, stacked_images in grouped_images.items(): + if do_center_crop: + stacked_images = self.center_crop(stacked_images, crop_size) + # Fused rescale and normalize + stacked_images = self.rescale_and_normalize( + stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std + ) + processed_images_grouped[shape] = stacked_images + + processed_images = reorder_images(processed_images_grouped, grouped_images_index) + + data = {} + if do_pad: + processed_images, processed_masks = self.pad(processed_images, return_pixel_mask=True) + processed_masks = torch.stack(processed_masks, dim=0) if return_tensors else processed_masks + data["pixel_mask"] = processed_masks + + processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images + data["pixel_values"] = processed_images + + return BatchFeature(data=data, tensor_type=return_tensors) + + def to_dict(self): + encoder_dict = super().to_dict() + encoder_dict.pop("_valid_processor_keys", None) + encoder_dict.pop("crop_size", None) + return encoder_dict + + +__all__ = ["BridgeTowerImageProcessorFast"] diff --git a/docs/transformers/build/lib/transformers/models/bridgetower/modeling_bridgetower.py b/docs/transformers/build/lib/transformers/models/bridgetower/modeling_bridgetower.py new file mode 100644 index 0000000000000000000000000000000000000000..524b4caa7431002164bc651c6dd991e475dc5e0d --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bridgetower/modeling_bridgetower.py @@ -0,0 +1,1984 @@ +# coding=utf-8 +# Copyright 2023 The Intel Labs Team Authors, The Microsoft Research Team Authors and HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch BridgeTower Model""" + +import math +from collections import OrderedDict +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import CrossEntropyLoss + +from ...activations import ACT2FN, QuickGELUActivation +from ...modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, + MaskedLMOutput, + ModelOutput, + SequenceClassifierOutput, +) +from ...modeling_utils import PreTrainedModel, apply_chunking_to_forward +from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer +from ...utils import ( + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, + torch_int, +) +from .configuration_bridgetower import BridgeTowerConfig, BridgeTowerTextConfig, BridgeTowerVisionConfig + + +logger = logging.get_logger(__name__) + +_CONFIG_FOR_DOC = "BridgeTowerConfig" +_CHECKPOINT_FOR_DOC = "BridgeTower/bridgetower-base" +_TOKENIZER_FOR_DOC = "RobertaTokenizer" + + +BRIDGETOWER_START_DOCSTRING = r""" + This model is a PyTorch `torch.nn.Module `_ subclass. Use + it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`BridgeTowerConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +BRIDGETOWER_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See + [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input + IDs?](../glossary#input-ids) + + attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + [What are attention masks?](../glossary#attention-mask) + + token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + [What are token type IDs?](../glossary#token-type-ids) + + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): + Pixel values. Pixel values can be obtained using [`BridgeTowerImageProcessor`]. See + [`BridgeTowerImageProcessor.__call__`] for details. + + pixel_mask (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*): + Mask to avoid performing attention on padding pixel values. Mask values selected in `[0, 1]`: + + - 1 for pixels that are real (i.e. **not masked**), + - 0 for pixels that are padding (i.e. **masked**). + `What are attention masks? <../glossary.html#attention-mask>`__ + + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + + image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*): + Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `pixel_values` into patch embeddings. + + image_token_type_idx (`int`, *optional*): + - The token type ids for images. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + interpolate_pos_encoding (`bool`, defaults to `False`): + Whether to interpolate the pre-trained position encodings. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@dataclass +class BridgeTowerModelOutput(ModelOutput): + """ + Output type of [`BridgeTowerModel`]. + + Args: + text_features (`torch.FloatTensor` of shape `(batch_size, text_sequence_length, hidden_size)`): + Sequence of hidden-states at the text output of the last layer of the model. + image_features (`torch.FloatTensor` of shape `(batch_size, image_sequence_length, hidden_size)`): + Sequence of hidden-states at the image output of the last layer of the model. + pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size x 2)`): + Concatenation of last layer hidden-state of the first token of the text and image sequence (classification + token), respectively, after further processing through layers used for auxiliary pretraining tasks. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of + the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + text_features: Optional[torch.FloatTensor] = None + image_features: Optional[torch.FloatTensor] = None + pooler_output: Optional[torch.FloatTensor] = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +class BridgeTowerContrastiveOutput(ModelOutput): + """ + Output type of ['BridgeTowerForContrastiveLearning'] + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`: + Image-text contrastive loss. + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + text_embeds (`torch.FloatTensor)`, *optional*, returned when model is initialized with `with_projection=True`): + The text embeddings obtained by applying the projection layer to the pooler_output. + image_embeds (`torch.FloatTensor)`, *optional*, returned when model is initialized with `with_projection=True`): + The image embeddings obtained by applying the projection layer to the pooler_output. + cross_embeds (`torch.FloatTensor)`, *optional*, returned when model is initialized with `with_projection=True`): + The text-image cross-modal embeddings obtained by applying the projection layer to the pooler_output. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of + the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + """ + + loss: Optional[torch.FloatTensor] = None + logits: Optional[torch.FloatTensor] = None + text_embeds: Optional[Tuple[torch.FloatTensor]] = None + image_embeds: Optional[Tuple[torch.FloatTensor]] = None + cross_embeds: Optional[Tuple[torch.FloatTensor]] = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +class BridgeTowerResidualAttention(nn.Module): + def __init__(self, config): + super().__init__() + + self.attn = nn.MultiheadAttention(config.hidden_size, config.hidden_size // 64) + self.ln_1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.mlp = nn.ModuleDict( + OrderedDict( + [ + ("c_fc", nn.Linear(config.hidden_size, config.hidden_size * 4)), + ("gelu", QuickGELUActivation()), + ("c_proj", nn.Linear(config.hidden_size * 4, config.hidden_size)), + ] + ) + ) + self.ln_2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.attn_mask = None + + def attention(self, hidden_state: torch.Tensor, attention_mask: torch.Tensor): + if attention_mask is not None: + attention_mask = attention_mask.to(dtype=torch.bool, device=hidden_state.device) + self.attn_mask = ( + self.attn_mask.to(dtype=hidden_state.dtype, device=hidden_state.device) + if self.attn_mask is not None + else None + ) + return self.attn( + hidden_state, + hidden_state, + hidden_state, + need_weights=False, + attn_mask=self.attn_mask, + key_padding_mask=attention_mask, + )[0] + + def forward(self, hidden_state: torch.Tensor, attention_mask: Optional[torch.Tensor] = None): + residual_state = hidden_state + self.attention(self.ln_1(hidden_state), attention_mask) + hidden_state = self.ln_2(residual_state) + for _, layer in self.mlp.items(): + hidden_state = layer(hidden_state) + hidden_state = residual_state + hidden_state + return hidden_state + + +class BridgeTowerTransformer(nn.Module): + def __init__(self, config): + super().__init__() + self.hidden_size = config.hidden_size + self.num_hidden_layers = config.num_hidden_layers + if config.remove_last_layer: + self.resblocks = nn.ModuleList( + [BridgeTowerResidualAttention(config) for _ in range(self.num_hidden_layers - 1)] + ) + else: + self.resblocks = nn.ModuleList( + [BridgeTowerResidualAttention(config) for _ in range(self.num_hidden_layers)] + ) + self.stop_gradient = config.stop_gradient + + def forward(self, hidden_state: torch.Tensor, attention_mask: Optional[torch.Tensor] = None): + hidden_states = [] + for block in self.resblocks: + hidden_state = block(hidden_state, attention_mask) + if self.stop_gradient: + hidden_states.append(hidden_state.detach()) + else: + hidden_states.append(hidden_state) + return hidden_states + + +# Copied from transformers.models.clip.modeling_clip.CLIPVisionEmbeddings with CLIP->BridgeTower +class BridgeTowerVisionEmbeddings(nn.Module): + def __init__(self, config: BridgeTowerVisionConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.class_embedding = nn.Parameter(torch.randn(self.embed_dim)) + + self.patch_embedding = nn.Conv2d( + in_channels=config.num_channels, + out_channels=self.embed_dim, + kernel_size=self.patch_size, + stride=self.patch_size, + bias=False, + ) + + self.num_patches = (self.image_size // self.patch_size) ** 2 + self.num_positions = self.num_patches + 1 + self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) + self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False) + + def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor: + """ + This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution + images. This method is also adapted to support torch.jit tracing. + + Adapted from: + - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and + - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211 + """ + + num_patches = embeddings.shape[1] - 1 + position_embedding = self.position_embedding.weight.unsqueeze(0) + num_positions = position_embedding.shape[1] - 1 + + # always interpolate when tracing to ensure the exported model works for dynamic input shapes + if not torch.jit.is_tracing() and num_patches == num_positions and height == width: + return self.position_embedding(self.position_ids) + + class_pos_embed = position_embedding[:, :1] + patch_pos_embed = position_embedding[:, 1:] + + dim = embeddings.shape[-1] + + new_height = height // self.patch_size + new_width = width // self.patch_size + + sqrt_num_positions = torch_int(num_positions**0.5) + patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim) + patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2) + + patch_pos_embed = nn.functional.interpolate( + patch_pos_embed, + size=(new_height, new_width), + mode="bicubic", + align_corners=False, + ) + + patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim) + + return torch.cat((class_pos_embed, patch_pos_embed), dim=1) + + def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor: + batch_size, _, height, width = pixel_values.shape + if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size): + raise ValueError( + f"Input image size ({height}*{width}) doesn't match model ({self.image_size}*{self.image_size})." + ) + target_dtype = self.patch_embedding.weight.dtype + patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype)) # shape = [*, width, grid, grid] + patch_embeds = patch_embeds.flatten(2).transpose(1, 2) + + class_embeds = self.class_embedding.expand(batch_size, 1, -1) + embeddings = torch.cat([class_embeds, patch_embeds], dim=1) + if interpolate_pos_encoding: + embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width) + else: + embeddings = embeddings + self.position_embedding(self.position_ids) + return embeddings + + +class BridgeTowerVisionTransformer(nn.Module): + def __init__(self, config): + super().__init__() + + self.embeddings = BridgeTowerVisionEmbeddings(config) + self.ln_pre = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.transformer = BridgeTowerTransformer(config) + self.ln_post = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.share_layernorm = config.share_layernorm + if not config.share_layernorm: + self.ln_separate = nn.ModuleList( + [nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) for _ in range(config.num_hidden_layers)] + ) + + def forward( + self, + pixel_values: torch.Tensor, + attention_mask, + interpolate_pos_encoding: bool = False, + ): + hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding) + hidden_states = self.ln_pre(hidden_states) + # NLD -> LND + hidden_states = hidden_states.permute(1, 0, 2) + + hidden_states = self.transformer(hidden_states, attention_mask) + # shape = [num_hidden_layers, hidden_size, *, grid ** 2] + hidden_states = torch.stack(hidden_states, dim=0) + # shape = [num_hidden_layers, *, hidden_size, grid ** 2] + hidden_states = hidden_states.permute(0, 2, 1, 3) + if self.share_layernorm: + hidden_states = self.ln_post(hidden_states) + else: + hidden_states_stack = [] + for hidden_states, ln in zip(hidden_states, self.ln_separate): + hidden_states = ln(hidden_states) + hidden_states_stack.append(hidden_states) + # shape = [num_hidden_layers, *, hidden_size, grid ** 2] + hidden_states = torch.stack(hidden_states_stack, dim=0) + return hidden_states + + def forward_pre( + self, + pixel_values: torch.Tensor, + interpolate_pos_encoding: bool = False, + ): + hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding) + hidden_states = self.ln_pre(hidden_states) + # NLD -> LND + hidden_states = hidden_states.permute(1, 0, 2) + return hidden_states + + def forward_post(self, hidden_state: torch.Tensor): + visual_output_post = hidden_state.permute(1, 0, 2) + visual_output_post = self.ln_post(visual_output_post) + return visual_output_post + + +class BridgeTowerLinkTower(nn.Module): + def __init__(self, config): + super().__init__() + self.link_tower_type = config.link_tower_type + self.hidden_size = config.hidden_size + if config.link_tower_type in ["add", "scaled_add", "interpolate"]: + if config.link_tower_type == "scaled_add": + self.scaled_factor = nn.Parameter(torch.tensor(1.0)) + elif config.link_tower_type == "interpolate": + self.beta = nn.Parameter(torch.tensor(0.5)) + self.LayerNorm = nn.LayerNorm(self.hidden_size, eps=config.layer_norm_eps) + else: + raise NotImplementedError(f"link_tower_type {config.link_tower_type} is not implemented") + + def forward(self, hidden_states, cross_modal_hidden_states, attention_mask): + if self.link_tower_type == "add": + return self.LayerNorm(hidden_states + cross_modal_hidden_states) + elif self.link_tower_type == "scaled_add": + return self.LayerNorm(hidden_states * self.scaled_factor + cross_modal_hidden_states) + elif self.link_tower_type == "interpolate": + return self.LayerNorm(hidden_states * (1 - self.beta) + cross_modal_hidden_states * self.beta) + else: + raise NotImplementedError(f"link_tower_type {self.link_tower_type} is not implemented") + + +# Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert->BridgeTower +class BridgeTowerSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->BridgeTower +class BridgeTowerIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->BridgeTower +class BridgeTowerOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertPooler with Bert->BridgeTower +class BridgeTowerPooler(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaSelfAttention with Roberta->BridgeTower +class BridgeTowerSelfAttention(nn.Module): + def __init__(self, config, position_embedding_type=None): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = position_embedding_type or getattr( + config, "position_embedding_type", "absolute" + ) + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + + self.is_decoder = config.is_decoder + + def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + use_cache = past_key_value is not None + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + query_length, key_length = query_layer.shape[2], key_layer.shape[2] + if use_cache: + position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view( + -1, 1 + ) + else: + position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BridgeTowerModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + if self.is_decoder: + outputs = outputs + (past_key_value,) + return outputs + + +BRIDGE_TOWER_SELF_ATTENTION_CLASSES = { + "eager": BridgeTowerSelfAttention, +} + + +# Copied from transformers.models.bert.modeling_bert.BertAttention with Bert->BridgeTower,BERT->BRIDGE_TOWER +class BridgeTowerAttention(nn.Module): + def __init__(self, config, position_embedding_type=None): + super().__init__() + self.self = BRIDGE_TOWER_SELF_ATTENTION_CLASSES[config._attn_implementation]( + config, position_embedding_type=position_embedding_type + ) + self.output = BridgeTowerSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +class BridgeTowerBertCrossLayer(nn.Module): + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = BridgeTowerAttention(config) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + self.crossattention = BridgeTowerAttention(config) + self.intermediate = BridgeTowerIntermediate(config) + self.output = BridgeTowerOutput(config) + + def forward( + self, + hidden_states, + encoder_hidden_states, + attention_mask=None, + head_mask=None, + encoder_attention_mask=None, + past_key_value=None, + output_attentions=False, + ): + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attention_outputs = self.attention( + hidden_states, + attention_mask=attention_mask, + head_mask=None, + output_attentions=output_attentions, + past_key_value=None, + ) + attention_output = self_attention_outputs[0] + + # if decoder, the last output is tuple of self-attn cache + # add self attentions if we output attention weights + outputs = self_attention_outputs[1:] + + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask=attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_value=past_key_value, + output_attentions=output_attentions, + ) + attention_output = cross_attention_outputs[0] + # add cross attentions if we output attention weights + outputs = outputs + cross_attention_outputs[1:-1] + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output + ) + outputs = (layer_output,) + outputs + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +class BridgeTowerTextLayer(nn.Module): + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = BridgeTowerAttention(config) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + if not self.is_decoder: + raise ValueError(f"{self} should be used as a decoder model if cross attention is added") + self.crossattention = BridgeTowerAttention(config, position_embedding_type="absolute") + self.intermediate = BridgeTowerIntermediate(config) + self.output = BridgeTowerOutput(config) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + + # if decoder, the last output is tuple of self-attn cache + if self.is_decoder: + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + else: + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + cross_attn_present_key_value = None + if self.is_decoder and encoder_hidden_states is not None: + if not hasattr(self, "crossattention"): + raise ValueError( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers" + " by setting `config.add_cross_attention=True`" + ) + + # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + cross_attn_past_key_value, + output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights + + # add cross-attn cache to positions 3,4 of present_key_value tuple + cross_attn_present_key_value = cross_attention_outputs[-1] + present_key_value = present_key_value + cross_attn_present_key_value + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output + ) + outputs = (layer_output,) + outputs + + # if decoder, return the attn key/values as the last output + if self.is_decoder: + outputs = outputs + (present_key_value,) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaEncoder with Roberta->BridgeTowerText +class BridgeTowerTextEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList([BridgeTowerTextLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + return_dict: Optional[bool] = True, + ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + next_decoder_cache = () if use_cache else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + layer_module.__call__, + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaEmbeddings with Roberta->BridgeTowerText +class BridgeTowerTextEmbeddings(nn.Module): + """ + Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. + """ + + # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.__init__ + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) + self.register_buffer( + "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False + ) + + # End copy + self.padding_idx = config.pad_token_id + self.position_embeddings = nn.Embedding( + config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx + ) + + def forward( + self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0 + ): + if position_ids is None: + if input_ids is not None: + # Create the position ids from the input token ids. Any padded tokens remain padded. + position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length) + else: + position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds) + + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs + # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves + # issue #5664 + if token_type_ids is None: + if hasattr(self, "token_type_ids"): + buffered_token_type_ids = self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + def create_position_ids_from_inputs_embeds(self, inputs_embeds): + """ + We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids. + + Args: + inputs_embeds: torch.Tensor + + Returns: torch.Tensor + """ + input_shape = inputs_embeds.size()[:-1] + sequence_length = input_shape[1] + + position_ids = torch.arange( + self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device + ) + return position_ids.unsqueeze(0).expand(input_shape) + + +# Copied from transformers.models.roberta.modeling_roberta.create_position_ids_from_input_ids +def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0): + """ + Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols + are ignored. This is modified from fairseq's `utils.make_positions`. + + Args: + x: torch.Tensor x: + + Returns: torch.Tensor + """ + # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA. + mask = input_ids.ne(padding_idx).int() + incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask + return incremental_indices.long() + padding_idx + + +class BridgeTowerPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = BridgeTowerConfig + base_model_prefix = "bridgetower" + supports_gradient_checkpointing = False + _no_split_modules = ["BridgeTowerSelfAttention", "BridgeTowerResidualAttention"] + _skip_keys_device_placement = "past_key_values" + + def _init_weights(self, module): + if isinstance(module, BridgeTowerVisionModel): + proj_std = (module.visual.transformer.hidden_size**-0.5) * ( + (2 * module.visual.transformer.num_hidden_layers) ** -0.5 + ) + attn_std = module.visual.transformer.hidden_size**-0.5 + fc_std = (2 * module.visual.transformer.hidden_size) ** -0.5 + for block in module.visual.transformer.resblocks: + nn.init.normal_(block.attn.in_proj_weight, std=attn_std * self.config.initializer_factor) + nn.init.normal_(block.attn.out_proj.weight, std=proj_std * self.config.initializer_factor) + nn.init.normal_(block.mlp.c_fc.weight, std=fc_std * self.config.initializer_factor) + nn.init.normal_(block.mlp.c_proj.weight, std=proj_std * self.config.initializer_factor) + + nn.init.normal_(module.visual.embeddings.class_embedding, std=attn_std * self.config.initializer_factor) + nn.init.normal_( + module.visual.embeddings.position_embedding.weight, std=attn_std * self.config.initializer_factor + ) + elif isinstance(module, (nn.Linear, nn.Conv2d, nn.Embedding)): + module.weight.data.normal_(mean=0.0, std=0.05 * self.config.initializer_factor) + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + if isinstance(module, nn.Linear) and module.bias is not None: + module.bias.data.zero_() + + +class BridgeTowerVisionModel(BridgeTowerPreTrainedModel): + config_class = BridgeTowerVisionConfig + + def __init__(self, config): + super().__init__(config) + self.visual = BridgeTowerVisionTransformer(config) + + @property + def dtype(self): + return self.visual.embeddings.patch_embedding.weight.dtype + + def forward(self, image, image_mask=None, interpolate_pos_encoding=False): + return self.visual(image.type(self.dtype), image_mask, interpolate_pos_encoding) + + +class BridgeTowerTextModel(BridgeTowerPreTrainedModel): + """ + + The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of + cross-attention is added between the self-attention layers, following the architecture described in *Attention is + all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz + Kaiser and Illia Polosukhin. + + To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set + to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and + `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass. + + .. _*Attention is all you need*: https://arxiv.org/abs/1706.03762 + + """ + + config_class = BridgeTowerTextConfig + + def __init__(self, config, add_pooling_layer=True): + super().__init__(config) + self.config = config + + self.embeddings = BridgeTowerTextEmbeddings(config) + self.encoder = BridgeTowerTextEncoder(config) + + self.pooler = BridgeTowerPooler(config) if add_pooling_layer else None + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + # Copied from transformers.models.clap.modeling_clap.ClapTextModel.forward + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: + r""" + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.config.is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + batch_size, seq_length = input_shape + device = input_ids.device if input_ids is not None else inputs_embeds.device + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + + if attention_mask is None: + attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device) + + if token_type_ids is None: + if hasattr(self.embeddings, "token_type_ids"): + buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + +@add_start_docstrings( + "The bare BridgeTower Model transformer outputting BridgeTowerModelOutput object without any specific head on" + " top.", + BRIDGETOWER_START_DOCSTRING, +) +class BridgeTowerModel(BridgeTowerPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.config = config + vision_config = config.vision_config + text_config = config.text_config + + if config.share_cross_modal_transformer_layers: + self.cross_modal_text_transform = nn.Linear(text_config.hidden_size, config.hidden_size) + self.cross_modal_image_transform = nn.Linear(vision_config.hidden_size, config.hidden_size) + else: + self.cross_modal_text_transform = nn.ModuleList( + [nn.Linear(text_config.hidden_size, config.hidden_size) for _ in range(config.num_hidden_layers)] + ) + self.cross_modal_image_transform = nn.ModuleList( + [nn.Linear(vision_config.hidden_size, config.hidden_size) for _ in range(config.num_hidden_layers)] + ) + + self.token_type_embeddings = nn.Embedding(2, config.hidden_size) + + self.vision_model = BridgeTowerVisionModel(vision_config) + + self.text_model = BridgeTowerTextModel(text_config) + + if not vision_config.share_layernorm and config.init_layernorm_from_vision_encoder: + for ln in self.vision_model.visual.cross_modal_ln_separate: + ln.weight.data = self.vision_model.visual.ln_post.weight.data + ln.bias.data = self.vision_model.visual.ln_post.bias.data + + self.cross_modal_image_layers = nn.ModuleList( + [BridgeTowerBertCrossLayer(text_config) for _ in range(config.num_hidden_layers)] + ) + self.cross_modal_text_layers = nn.ModuleList( + [BridgeTowerBertCrossLayer(text_config) for _ in range(config.num_hidden_layers)] + ) + + # Class token => Linear => Tanh + self.cross_modal_image_pooler = BridgeTowerPooler(config) + self.cross_modal_text_pooler = BridgeTowerPooler(config) + + # Initialize BridgeTower Components + self.cross_modal_text_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.cross_modal_image_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + if config.share_link_tower_layers: + self.cross_modal_text_link_tower = BridgeTowerLinkTower(config) + self.cross_modal_image_link_tower = BridgeTowerLinkTower(config) + else: + self.cross_modal_text_link_tower = nn.ModuleList( + [BridgeTowerLinkTower(config) for _ in range(config.num_hidden_layers - 1)] + ) + self.cross_modal_image_link_tower = nn.ModuleList( + [BridgeTowerLinkTower(config) for _ in range(config.num_hidden_layers - 1)] + ) + + self.post_init() + + def get_input_embeddings(self): + return self.text_model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.text_model.set_input_embeddings(value) + + @add_start_docstrings_to_model_forward(BRIDGETOWER_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=BridgeTowerModelOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + pixel_mask: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + image_embeds: Optional[torch.FloatTensor] = None, + image_token_type_idx: Optional[int] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[torch.LongTensor] = None, + interpolate_pos_encoding: bool = False, + ) -> Union[Tuple[torch.Tensor], BridgeTowerModelOutput]: + r""" + output_hidden_states (`bool`, *optional*): + If set to `True`, hidden states are returned as a list containing the hidden states of text, image, and + cross-modal components respectively. i.e. `(hidden_states_text, hidden_states_image, + hidden_states_cross_modal)` where each element is a list of the hidden states of the corresponding + modality. `hidden_states_txt/img` are a list of tensors corresponding to unimodal hidden states and + `hidden_states_cross_modal` is a list of tuples containing `cross_modal_text_hidden_states` and + `cross_modal_image_hidden_states` of each brdige layer. + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels are currently not supported. + Returns: + + Examples: + + ```python + >>> from transformers import BridgeTowerProcessor, BridgeTowerModel + >>> from PIL import Image + >>> import requests + + >>> # prepare image and text + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> text = "hello world" + >>> processor = BridgeTowerProcessor.from_pretrained("BridgeTower/bridgetower-base") + >>> model = BridgeTowerModel.from_pretrained("BridgeTower/bridgetower-base") + + >>> inputs = processor(image, text, return_tensors="pt") + >>> outputs = model(**inputs) + >>> outputs.keys() + odict_keys(['text_features', 'image_features', 'pooler_output']) + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + all_hidden_states_text = () if output_hidden_states else None + all_hidden_states_image = () if output_hidden_states else None + all_hidden_states_cross = () if output_hidden_states else None + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + if inputs_embeds is not None and input_ids is None: + raise NotImplementedError( + "BridgeTowerModel does not use `inputs_embeds`. Make sure to pass in `input_ids` instead." + ) + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + image_token_type_idx = image_token_type_idx if image_token_type_idx else 1 + input_shape = input_ids.size() + text_embeds = self.text_model.embeddings(input_ids=input_ids) + + if output_hidden_states: + all_hidden_states_text += (text_embeds,) + + if attention_mask is None: + attention_mask = torch.ones(input_shape, dtype=torch.long, device=input_ids.device) + extend_text_masks = self.text_model.get_extended_attention_mask(attention_mask, input_shape).to( + input_ids.device + ) + + # The split_index determines how many layers of the uni-modal encoder are applied before the cross-modal encoder + split_index = len(self.text_model.encoder.layer) - self.config.num_hidden_layers + 1 + + # Run the first 'split_index' layers of the textual encoder + for layer in self.text_model.encoder.layer[:split_index]: + text_embeds = layer(text_embeds, extend_text_masks)[0] + + if output_hidden_states: + all_hidden_states_text += (text_embeds,) + + if image_embeds is None: + image_embeds = self.vision_model.visual.forward_pre( + pixel_values.type(self.vision_model.dtype), interpolate_pos_encoding=interpolate_pos_encoding + ) + else: + # Permute as BridgeTowerResidualAttention has batch_first=True + image_embeds = image_embeds.permute(1, 0, 2) + + if output_hidden_states: + all_hidden_states_image += (image_embeds,) + + # Run the first 'split_index' layers of the visual encoder + for block in self.vision_model.visual.transformer.resblocks[:split_index]: + image_embeds = block(image_embeds) + if output_hidden_states: + all_hidden_states_image += (image_embeds,) + + image_embeds_with_ln = self.vision_model.visual.forward_post(image_embeds.type(self.vision_model.dtype)) + + # first layer is a special case because we don't have the output from the cross-encoder yet + cross_modal_text = self.cross_modal_text_transform(text_embeds) + + text_token_type_embeddings = self.token_type_embeddings( + torch.zeros(1, dtype=torch.long, device=input_ids.device) + ).expand_as(cross_modal_text) + + cross_modal_text = self.cross_modal_text_layernorm(cross_modal_text + text_token_type_embeddings) + + image_embeds_with_ln = self.cross_modal_image_transform(image_embeds_with_ln) + image_token_type_embeddings = self.token_type_embeddings( + torch.full((1,), image_token_type_idx, dtype=torch.long, device=input_ids.device) + ).expand_as(image_embeds_with_ln) + + image_embeds_with_ln = image_embeds_with_ln + image_token_type_embeddings + cross_modal_image = self.cross_modal_image_layernorm(image_embeds_with_ln) + + pixel_mask = torch.ones( + (cross_modal_image.size(0), cross_modal_image.size(1)), + dtype=torch.long, + device=input_ids.device, + ) + extend_image_masks = self.text_model.get_extended_attention_mask(pixel_mask, pixel_mask.size()).to( + input_ids.device + ) + + layer_outputs_text = self.cross_modal_text_layers[0]( + cross_modal_text, + cross_modal_image, + attention_mask=extend_text_masks, + encoder_attention_mask=extend_image_masks, + output_attentions=output_attentions, + ) + cross_text_features = layer_outputs_text[0] + + layer_outputs_image = self.cross_modal_image_layers[0]( + cross_modal_image, + cross_modal_text, + attention_mask=extend_image_masks, + encoder_attention_mask=extend_text_masks, + output_attentions=output_attentions, + ) + cross_image_features = layer_outputs_image[0] + + if output_hidden_states: + all_hidden_states_cross += ((cross_text_features, cross_image_features),) + + if output_attentions: + all_self_attentions += ((layer_outputs_text[1], layer_outputs_image[1]),) + + link_layer_index = 0 + + # Each of the top 6 layers of the visual and textual encoders ([split_index:]) is connected to each layer of + # the cross-modal encoder via bridge layers, which brings bottom-up alignment and fusion to the cross-modal encoder. + for i in range(split_index, len(self.text_model.encoder.layer)): + text_embeds = self.text_model.encoder.layer[i](text_embeds, extend_text_masks)[0] + image_embeds = self.vision_model.visual.transformer.resblocks[i](image_embeds).type( + self.vision_model.dtype + ) + image_embeds_with_ln = ( + self.cross_modal_image_transform(self.vision_model.visual.forward_post(image_embeds)) + + image_token_type_embeddings + ) + + text_link_tower = self.cross_modal_text_link_tower[link_layer_index] + image_link_tower = self.cross_modal_image_link_tower[link_layer_index] + + # Bridge layers for textual and visual encoders + cross_text_features_ = text_link_tower( + self.cross_modal_text_transform(text_embeds) + text_token_type_embeddings, + cross_text_features, + extend_text_masks, + ) + cross_image_features_ = image_link_tower(image_embeds_with_ln, cross_image_features, extend_image_masks) + + # Cross-modal encoder via bridge layers of textual and visual encoders + layer_outputs_text = self.cross_modal_text_layers[link_layer_index + 1]( + cross_text_features_, + cross_image_features_, + attention_mask=extend_text_masks, + encoder_attention_mask=extend_image_masks, + output_attentions=output_attentions, + ) + cross_text_features = layer_outputs_text[0] + + layer_outputs_image = self.cross_modal_image_layers[link_layer_index + 1]( + cross_image_features_, + cross_text_features_, + attention_mask=extend_image_masks, + encoder_attention_mask=extend_text_masks, + output_attentions=output_attentions, + ) + cross_image_features = layer_outputs_image[0] + + link_layer_index += 1 + + if output_hidden_states: + all_hidden_states_text += (text_embeds,) + all_hidden_states_image += (image_embeds,) + all_hidden_states_cross += ((cross_text_features, cross_image_features),) + + if output_attentions: + all_self_attentions += ((layer_outputs_text[1], layer_outputs_image[1]),) + + # Concatenate the cls token of the text and image features to get the final represtation + text_features, image_features = cross_text_features, cross_image_features + cls_features = self.get_cls_features(text_features, image_features) + + if output_hidden_states: + all_hidden_states = (all_hidden_states_text, all_hidden_states_image, all_hidden_states_cross) + + if not return_dict: + return tuple( + v + for v in [text_features, image_features, cls_features, all_hidden_states, all_self_attentions] + if v is not None + ) + + return BridgeTowerModelOutput( + text_features=text_features, + image_features=image_features, + pooler_output=cls_features, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + def get_cls_features(self, text_features, image_features): + cls_features_text = self.cross_modal_text_pooler(text_features) + cls_features_image = self.cross_modal_image_pooler(image_features) + return torch.cat([cls_features_text, cls_features_image], dim=-1) + + +# Copied from transformers.models.vilt.modeling_vilt.ViltPredictionHeadTransform with Vilt->BridgeTower +class BridgeTowerPredictionHeadTransform(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class BridgeTowerMLMHead(nn.Module): + def __init__(self, config, weight=None): + super().__init__() + self.config = config + self.transform = BridgeTowerPredictionHeadTransform(config) + self.decoder = nn.Linear(config.hidden_size, config.text_config.vocab_size, bias=False) + self.bias = nn.Parameter(torch.zeros(config.text_config.vocab_size)) + if weight is not None: + self.decoder.weight = weight + + def forward(self, x): + mlm_score = self.transform(x) + mlm_score = self.decoder(mlm_score) + self.bias + return mlm_score + + +class BridgeTowerITMHead(nn.Module): + def __init__(self, hidden_size): + super().__init__() + self.fc = nn.Linear(hidden_size, 2) + + def forward(self, x): + itm_score = self.fc(x) + return itm_score + + +@add_start_docstrings( + """ + BridgeTower Model with a language modeling head on top as done during pretraining. + """, + BRIDGETOWER_START_DOCSTRING, +) +class BridgeTowerForMaskedLM(BridgeTowerPreTrainedModel): + _tied_weights_keys = ["mlm_score.decoder.weight"] + + def __init__(self, config): + super().__init__(config) + + self.bridgetower = BridgeTowerModel(config) + self.mlm_score = BridgeTowerMLMHead(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.mlm_score.decoder + + def set_output_embeddings(self, new_embeddings): + self.mlm_score.decoder = new_embeddings + + @add_start_docstrings_to_model_forward(BRIDGETOWER_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + pixel_mask: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + image_embeds: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[torch.LongTensor] = None, + ) -> Union[MaskedLMOutput, Tuple[torch.FloatTensor]]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + Returns: + + Examples: + + ```python + >>> from transformers import BridgeTowerProcessor, BridgeTowerForMaskedLM + >>> from PIL import Image + >>> import requests + + >>> url = "http://images.cocodataset.org/val2017/000000360943.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw).convert("RGB") + >>> text = "a looking out of the window" + + >>> processor = BridgeTowerProcessor.from_pretrained("BridgeTower/bridgetower-base-itm-mlm") + >>> model = BridgeTowerForMaskedLM.from_pretrained("BridgeTower/bridgetower-base-itm-mlm") + + >>> # prepare inputs + >>> encoding = processor(image, text, return_tensors="pt") + + >>> # forward pass + >>> outputs = model(**encoding) + + >>> results = processor.decode(outputs.logits.argmax(dim=-1).squeeze(0).tolist()) + + >>> print(results) + .a cat looking out of the window. + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + outputs = self.bridgetower( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + pixel_values=pixel_values, + pixel_mask=pixel_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + image_embeds=image_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + mlm_logits = self.mlm_score(outputs.text_features if return_dict else outputs[0]) + masked_lm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() # -100 index = padding token + + labels = labels.to(mlm_logits.device) + masked_lm_loss = loss_fct(mlm_logits.view(-1, self.config.text_config.vocab_size), labels.view(-1)) + + if not return_dict: + output = tuple(mlm_logits) + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return MaskedLMOutput( + loss=masked_lm_loss, + logits=mlm_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + BridgeTower Model transformer with a classifier head on top (a linear layer on top of the final hidden state of the + [CLS] token) for image-to-text matching. + """, + BRIDGETOWER_START_DOCSTRING, +) +class BridgeTowerForImageAndTextRetrieval(BridgeTowerPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.bridgetower = BridgeTowerModel(config) + + self.itm_score = BridgeTowerITMHead(config.hidden_size * 2) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(BRIDGETOWER_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + pixel_mask: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + image_embeds: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[torch.LongTensor] = None, + ) -> Union[SequenceClassifierOutput, Tuple[torch.FloatTensor]]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, 1)`, *optional*): + Labels for computing the image-text matching loss. 0 means the pairs don't match and 1 means they match. + The pairs with 0 will be skipped for calculation. + Returns: + + Examples: + + ```python + >>> from transformers import BridgeTowerProcessor, BridgeTowerForImageAndTextRetrieval + >>> import requests + >>> from PIL import Image + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> image = Image.open(requests.get(url, stream=True).raw) + >>> texts = ["An image of two cats chilling on a couch", "A football player scoring a goal"] + + >>> processor = BridgeTowerProcessor.from_pretrained("BridgeTower/bridgetower-base-itm-mlm") + >>> model = BridgeTowerForImageAndTextRetrieval.from_pretrained("BridgeTower/bridgetower-base-itm-mlm") + + >>> # forward pass + >>> scores = dict() + >>> for text in texts: + ... # prepare inputs + ... encoding = processor(image, text, return_tensors="pt") + ... outputs = model(**encoding) + ... scores[text] = outputs.logits[0, 1].item() + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bridgetower( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + pixel_values=pixel_values, + pixel_mask=pixel_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + image_embeds=image_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooler_output = outputs.pooler_output if return_dict else outputs[2] + + logits = self.itm_score(pooler_output) + + itm_loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + + labels = labels.to(logits.device) + itm_loss = loss_fct(logits, labels) + + if not return_dict: + output = tuple(logits) + return ((itm_loss,) + output) if itm_loss is not None else output + + return SequenceClassifierOutput( + loss=itm_loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +class BridgeTowerContrastiveHead(nn.Module): + def __init__(self, hidden_size, embed_size): + super().__init__() + self.fc = nn.Linear(hidden_size, embed_size) + + def forward(self, x): + x = self.fc(x) + return x + + +@add_start_docstrings( + """ + BridgeTower Model with a image-text contrastive head on top computing image-text contrastive loss. + """, + BRIDGETOWER_START_DOCSTRING, +) +class BridgeTowerForContrastiveLearning(BridgeTowerPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.bridgetower = BridgeTowerModel(config) + + self.itc_text_head = BridgeTowerContrastiveHead(config.hidden_size, config.contrastive_hidden_size) + self.itc_image_head = BridgeTowerContrastiveHead(config.hidden_size, config.contrastive_hidden_size) + self.itc_cross_modal_head = BridgeTowerContrastiveHead(config.hidden_size * 2, config.contrastive_hidden_size) + + self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value)) + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(BRIDGETOWER_INPUTS_DOCSTRING) + @replace_return_docstrings(output_type=BridgeTowerContrastiveOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + pixel_values: Optional[torch.FloatTensor] = None, + pixel_mask: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + image_embeds: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = True, + return_dict: Optional[bool] = None, + return_loss: Optional[bool] = None, + ) -> Union[BridgeTowerContrastiveOutput, Tuple[torch.FloatTensor]]: + r""" + return_loss (`bool`, *optional*): + Whether or not to return the contrastive loss. + Returns: + + Examples: + + ```python + >>> from transformers import BridgeTowerProcessor, BridgeTowerForContrastiveLearning + >>> import requests + >>> from PIL import Image + >>> import torch + + >>> image_urls = [ + ... "https://farm4.staticflickr.com/3395/3428278415_81c3e27f15_z.jpg", + ... "http://images.cocodataset.org/val2017/000000039769.jpg", + ... ] + >>> texts = ["two dogs in a car", "two cats sleeping on a couch"] + >>> images = [Image.open(requests.get(url, stream=True).raw) for url in image_urls] + + >>> processor = BridgeTowerProcessor.from_pretrained("BridgeTower/bridgetower-large-itm-mlm-itc") + >>> model = BridgeTowerForContrastiveLearning.from_pretrained("BridgeTower/bridgetower-large-itm-mlm-itc") + + >>> inputs = processor(images, texts, padding=True, return_tensors="pt") + >>> loss = model(**inputs, return_loss=True).loss + + >>> inputs = processor(images, texts[::-1], padding=True, return_tensors="pt") + >>> loss_swapped = model(**inputs, return_loss=True).loss + + >>> print("Loss", round(loss.item(), 4)) + Loss 0.0019 + + >>> print("Loss with swapped images", round(loss_swapped.item(), 4)) + Loss with swapped images 2.126 + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bridgetower( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + pixel_values=pixel_values, + pixel_mask=pixel_mask, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + image_embeds=image_embeds, + output_attentions=output_attentions, + output_hidden_states=True, + return_dict=return_dict, + ) + + pooler_output = outputs.pooler_output if return_dict else outputs[2] + hidden_states_txt, hidden_states_img, hidden_states_cross_modal = ( + outputs.hidden_states if return_dict else outputs[3] + ) + + text_embeds = hidden_states_txt[-1] + image_embeds = hidden_states_img[-1] + + image_embeds_with_ln = self.bridgetower.vision_model.visual.forward_post(image_embeds) + image_token_type_embeddings = self.bridgetower.token_type_embeddings( + torch.full((1,), 1, dtype=torch.long, device=self.bridgetower.token_type_embeddings.weight.device) + ).expand_as(image_embeds_with_ln) + + image_embeds = self.bridgetower.cross_modal_image_transform(image_embeds_with_ln) + image_token_type_embeddings + + # normalized features + text_embeds = nn.functional.normalize(self.itc_text_head(text_embeds[:, 0, :]), dim=-1, p=2) + image_embeds = nn.functional.normalize(self.itc_image_head(image_embeds[:, 0, :]), dim=-1, p=2).to( + device=text_embeds.device + ) + cross_embeds = nn.functional.normalize(self.itc_cross_modal_head(pooler_output), dim=-1, p=2).to( + device=text_embeds.device + ) + + logits = torch.stack([text_embeds, image_embeds, cross_embeds], dim=-2) + + logit_scale = self.logit_scale.exp().to(device=text_embeds.device) + logits_text_to_image = torch.matmul(text_embeds, image_embeds.t()) * logit_scale + logits_text_to_cross = torch.matmul(text_embeds, cross_embeds.t()) * logit_scale + logits_image_to_cross = torch.matmul(image_embeds, cross_embeds.t()) * logit_scale + + itc_loss = None + + if return_loss: + labels = torch.arange(len(logits), device=logits.device) + text_to_image_loss = nn.functional.cross_entropy(logits_text_to_image, labels) + text_to_cross_loss = nn.functional.cross_entropy(logits_text_to_cross, labels) + image_to_cross_loss = nn.functional.cross_entropy(logits_image_to_cross, labels) + itc_loss = (text_to_image_loss + text_to_cross_loss + image_to_cross_loss) / 3.0 + + if not return_dict: + output = (logits, text_embeds, image_embeds, cross_embeds) + outputs[3:] + return ((itc_loss,) + output) if itc_loss is not None else output + + return BridgeTowerContrastiveOutput( + loss=itc_loss, + logits=logits, + text_embeds=text_embeds, + image_embeds=image_embeds, + cross_embeds=cross_embeds, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +__all__ = [ + "BridgeTowerForContrastiveLearning", + "BridgeTowerForImageAndTextRetrieval", + "BridgeTowerForMaskedLM", + "BridgeTowerModel", + "BridgeTowerPreTrainedModel", +] diff --git a/docs/transformers/build/lib/transformers/models/bridgetower/processing_bridgetower.py b/docs/transformers/build/lib/transformers/models/bridgetower/processing_bridgetower.py new file mode 100644 index 0000000000000000000000000000000000000000..5519d0a34ce911b26f48b76399bc68cba798bdbd --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bridgetower/processing_bridgetower.py @@ -0,0 +1,114 @@ +# coding=utf-8 +# Copyright 2023 The Intel Labs Team Authors, The Microsoft Research Team Authors and HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Processor class for BridgeTower. +""" + +from typing import List, Union + +from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack +from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput + + +class BridgeTowerProcessorKwargs(ProcessingKwargs, total=False): + _defaults = { + "text_kwargs": { + "add_special_tokens": True, + "padding": False, + "stride": 0, + "return_overflowing_tokens": False, + "return_special_tokens_mask": False, + "return_offsets_mapping": False, + "return_length": False, + "verbose": True, + }, + "images_kwargs": { + "do_normalize": True, + "do_center_crop": True, + }, + } + + +class BridgeTowerProcessor(ProcessorMixin): + r""" + Constructs a BridgeTower processor which wraps a Roberta tokenizer and BridgeTower image processor into a single + processor. + + [`BridgeTowerProcessor`] offers all the functionalities of [`BridgeTowerImageProcessor`] and + [`RobertaTokenizerFast`]. See the docstring of [`~BridgeTowerProcessor.__call__`] and + [`~BridgeTowerProcessor.decode`] for more information. + + Args: + image_processor (`BridgeTowerImageProcessor`): + An instance of [`BridgeTowerImageProcessor`]. The image processor is a required input. + tokenizer (`RobertaTokenizerFast`): + An instance of ['RobertaTokenizerFast`]. The tokenizer is a required input. + """ + + attributes = ["image_processor", "tokenizer"] + image_processor_class = "BridgeTowerImageProcessor" + tokenizer_class = ("RobertaTokenizer", "RobertaTokenizerFast") + + def __init__(self, image_processor, tokenizer): + super().__init__(image_processor, tokenizer) + + def __call__( + self, + images, + text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, + audio=None, + videos=None, + **kwargs: Unpack[BridgeTowerProcessorKwargs], + ) -> BatchEncoding: + """ + This method uses [`BridgeTowerImageProcessor.__call__`] method to prepare image(s) for the model, and + [`RobertaTokenizerFast.__call__`] to prepare text for the model. + + Please refer to the docstring of the above two methods for more information. + """ + output_kwargs = self._merge_kwargs( + BridgeTowerProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + encoding = self.tokenizer(text=text, **output_kwargs["text_kwargs"]) + # add pixel_values + pixel_mask + encoding_image_processor = self.image_processor(images, **output_kwargs["images_kwargs"]) + encoding.update(encoding_image_processor) + + return encoding + + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to RobertaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. + """ + return self.tokenizer.batch_decode(*args, **kwargs) + + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to RobertaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer + to the docstring of this method for more information. + """ + return self.tokenizer.decode(*args, **kwargs) + + @property + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + image_processor_input_names = self.image_processor.model_input_names + return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) + + +__all__ = ["BridgeTowerProcessor"] diff --git a/docs/transformers/build/lib/transformers/models/bros/__init__.py b/docs/transformers/build/lib/transformers/models/bros/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..2178cfd03a40593bc9acb97111204aab0423860c --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bros/__init__.py @@ -0,0 +1,28 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_bros import * + from .modeling_bros import * + from .processing_bros import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/docs/transformers/build/lib/transformers/models/bros/configuration_bros.py b/docs/transformers/build/lib/transformers/models/bros/configuration_bros.py new file mode 100644 index 0000000000000000000000000000000000000000..84c9989f309fb782b47aed37f9728dd7a26ba6b8 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bros/configuration_bros.py @@ -0,0 +1,138 @@ +# coding=utf-8 +# Copyright 2023-present NAVER Corp, The Microsoft Research Asia LayoutLM Team Authors and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Bros model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class BrosConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`BrosModel`] or a [`TFBrosModel`]. It is used to + instantiate a Bros model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the Bros + [jinho8345/bros-base-uncased](https://huggingface.co/jinho8345/bros-base-uncased) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the Bros model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`BrosModel`] or [`TFBrosModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the `token_type_ids` passed when calling [`BrosModel`] or [`TFBrosModel`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + pad_token_id (`int`, *optional*, defaults to 0): + The index of the padding token in the token vocabulary. + dim_bbox (`int`, *optional*, defaults to 8): + The dimension of the bounding box coordinates. (x0, y1, x1, y0, x1, y1, x0, y1) + bbox_scale (`float`, *optional*, defaults to 100.0): + The scale factor of the bounding box coordinates. + n_relations (`int`, *optional*, defaults to 1): + The number of relations for SpadeEE(entity extraction), SpadeEL(entity linking) head. + classifier_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the classifier head. + + + Examples: + + ```python + >>> from transformers import BrosConfig, BrosModel + + >>> # Initializing a BROS jinho8345/bros-base-uncased style configuration + >>> configuration = BrosConfig() + + >>> # Initializing a model from the jinho8345/bros-base-uncased style configuration + >>> model = BrosModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "bros" + + def __init__( + self, + vocab_size=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=0, + dim_bbox=8, + bbox_scale=100.0, + n_relations=1, + classifier_dropout_prob=0.1, + **kwargs, + ): + super().__init__( + vocab_size=vocab_size, + hidden_size=hidden_size, + num_hidden_layers=num_hidden_layers, + num_attention_heads=num_attention_heads, + intermediate_size=intermediate_size, + hidden_act=hidden_act, + hidden_dropout_prob=hidden_dropout_prob, + attention_probs_dropout_prob=attention_probs_dropout_prob, + max_position_embeddings=max_position_embeddings, + type_vocab_size=type_vocab_size, + initializer_range=initializer_range, + layer_norm_eps=layer_norm_eps, + pad_token_id=pad_token_id, + **kwargs, + ) + + self.dim_bbox = dim_bbox + self.bbox_scale = bbox_scale + self.n_relations = n_relations + self.dim_bbox_sinusoid_emb_2d = self.hidden_size // 4 + self.dim_bbox_sinusoid_emb_1d = self.dim_bbox_sinusoid_emb_2d // self.dim_bbox + self.dim_bbox_projection = self.hidden_size // self.num_attention_heads + self.classifier_dropout_prob = classifier_dropout_prob + + +__all__ = ["BrosConfig"] diff --git a/docs/transformers/build/lib/transformers/models/bros/convert_bros_to_pytorch.py b/docs/transformers/build/lib/transformers/models/bros/convert_bros_to_pytorch.py new file mode 100644 index 0000000000000000000000000000000000000000..c0984f2c74b20cc61a02f616815d59b79d5a2afb --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bros/convert_bros_to_pytorch.py @@ -0,0 +1,145 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert Bros checkpoints.""" + +import argparse + +import bros # original repo +import torch + +from transformers import BrosConfig, BrosModel, BrosProcessor +from transformers.utils import logging + + +logging.set_verbosity_info() +logger = logging.get_logger(__name__) + + +def get_configs(model_name): + bros_config = BrosConfig.from_pretrained(model_name) + return bros_config + + +def remove_ignore_keys_(state_dict): + ignore_keys = [ + "embeddings.bbox_sinusoid_emb.inv_freq", + ] + for k in ignore_keys: + state_dict.pop(k, None) + + +def rename_key(name): + if name == "embeddings.bbox_projection.weight": + name = "bbox_embeddings.bbox_projection.weight" + + if name == "embeddings.bbox_sinusoid_emb.x_pos_emb.inv_freq": + name = "bbox_embeddings.bbox_sinusoid_emb.x_pos_emb.inv_freq" + + if name == "embeddings.bbox_sinusoid_emb.y_pos_emb.inv_freq": + name = "bbox_embeddings.bbox_sinusoid_emb.y_pos_emb.inv_freq" + + return name + + +def convert_state_dict(orig_state_dict, model): + # rename keys + for key in orig_state_dict.copy().keys(): + val = orig_state_dict.pop(key) + orig_state_dict[rename_key(key)] = val + + # remove ignore keys + remove_ignore_keys_(orig_state_dict) + + return orig_state_dict + + +def convert_bros_checkpoint(model_name, pytorch_dump_folder_path=None, push_to_hub=False): + # load original model + original_model = bros.BrosModel.from_pretrained(model_name).eval() + + # load HuggingFace Model + bros_config = get_configs(model_name) + model = BrosModel.from_pretrained(model_name, config=bros_config) + model.eval() + + state_dict = original_model.state_dict() + new_state_dict = convert_state_dict(state_dict, model) + model.load_state_dict(new_state_dict) + + # verify results + + # original BROS model require 4 points (8 float values) for each bbox, prepare bbox with [batch_size, seq_len, 8] shape + bbox = torch.tensor( + [ + [ + [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000], + [0.4396, 0.6720, 0.4659, 0.6720, 0.4659, 0.6850, 0.4396, 0.6850], + [0.4698, 0.6720, 0.4843, 0.6720, 0.4843, 0.6850, 0.4698, 0.6850], + [0.4698, 0.6720, 0.4843, 0.6720, 0.4843, 0.6850, 0.4698, 0.6850], + [0.2047, 0.6870, 0.2730, 0.6870, 0.2730, 0.7000, 0.2047, 0.7000], + [0.2047, 0.6870, 0.2730, 0.6870, 0.2730, 0.7000, 0.2047, 0.7000], + [1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000], + ] + ] + ) + + processor = BrosProcessor.from_pretrained(model_name) + + encoding = processor("His name is Rocco.", return_tensors="pt") + encoding["bbox"] = bbox + + original_hidden_states = original_model(**encoding).last_hidden_state + # pixel_values = processor(image, return_tensors="pt").pixel_values + + last_hidden_states = model(**encoding).last_hidden_state + + assert torch.allclose(original_hidden_states, last_hidden_states, atol=1e-4) + + if pytorch_dump_folder_path is not None: + print(f"Saving model and processor to {pytorch_dump_folder_path}") + model.save_pretrained(pytorch_dump_folder_path) + processor.save_pretrained(pytorch_dump_folder_path) + + if push_to_hub: + model.push_to_hub("jinho8345/" + model_name.split("/")[-1], commit_message="Update model") + processor.push_to_hub("jinho8345/" + model_name.split("/")[-1], commit_message="Update model") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + # Required parameters + parser.add_argument( + "--model_name", + default="jinho8345/bros-base-uncased", + required=False, + type=str, + help="Name of the original model you'd like to convert.", + ) + parser.add_argument( + "--pytorch_dump_folder_path", + default=None, + required=False, + type=str, + help="Path to the output PyTorch model directory.", + ) + parser.add_argument( + "--push_to_hub", + action="store_true", + help="Whether or not to push the converted model and processor to the 🤗 hub.", + ) + + args = parser.parse_args() + convert_bros_checkpoint(args.model_name, args.pytorch_dump_folder_path, args.push_to_hub) diff --git a/docs/transformers/build/lib/transformers/models/bros/modeling_bros.py b/docs/transformers/build/lib/transformers/models/bros/modeling_bros.py new file mode 100644 index 0000000000000000000000000000000000000000..df39d2a49b7df67d363f558c341da3ce5c8aebc1 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bros/modeling_bros.py @@ -0,0 +1,1323 @@ +# coding=utf-8 +# Copyright 2023-present NAVER Corp, The Microsoft Research Asia LayoutLM Team Authors and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch Bros model.""" + +import math +from dataclasses import dataclass +from typing import List, Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import CrossEntropyLoss + +from ...activations import ACT2FN +from ...modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, + TokenClassifierOutput, +) +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer +from ...utils import ( + ModelOutput, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from .configuration_bros import BrosConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "jinho8345/bros-base-uncased" +_CONFIG_FOR_DOC = "BrosConfig" + + +BROS_START_DOCSTRING = r""" + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`BrosConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +BROS_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`BrosProcessor`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + + bbox ('torch.FloatTensor' of shape '(batch_size, num_boxes, 4)'): + Bounding box coordinates for each token in the input sequence. Each bounding box is a list of four values + (x1, y1, x2, y2), where (x1, y1) is the top left corner, and (x2, y2) is the bottom right corner of the + bounding box. + + attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + + bbox_first_token_mask (`torch.FloatTensor` of shape `({0})`, *optional*): + Mask to indicate the first token of each bounding box. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + + position_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + + return_dict (`bool`, *optional*): + Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple. +""" + + +@dataclass +class BrosSpadeOutput(ModelOutput): + """ + Base class for outputs of token classification models. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided) : + Classification loss. + initial_token_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`): + Classification scores for entity initial tokens (before SoftMax). + subsequent_token_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, sequence_length+1)`): + Classification scores for entity sequence tokens (before SoftMax). + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + + loss: Optional[torch.FloatTensor] = None + initial_token_logits: Optional[torch.FloatTensor] = None + subsequent_token_logits: Optional[torch.FloatTensor] = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +class BrosPositionalEmbedding1D(nn.Module): + # Reference: https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/mem_transformer.py#L15 + + def __init__(self, config): + super(BrosPositionalEmbedding1D, self).__init__() + + self.dim_bbox_sinusoid_emb_1d = config.dim_bbox_sinusoid_emb_1d + + inv_freq = 1 / ( + 10000 ** (torch.arange(0.0, self.dim_bbox_sinusoid_emb_1d, 2.0) / self.dim_bbox_sinusoid_emb_1d) + ) + self.register_buffer("inv_freq", inv_freq) + + def forward(self, pos_seq: torch.Tensor) -> torch.Tensor: + seq_size = pos_seq.size() + b1, b2, b3 = seq_size + sinusoid_inp = pos_seq.view(b1, b2, b3, 1) * self.inv_freq.view(1, 1, 1, self.dim_bbox_sinusoid_emb_1d // 2) + pos_emb = torch.cat([sinusoid_inp.sin(), sinusoid_inp.cos()], dim=-1) + return pos_emb + + +class BrosPositionalEmbedding2D(nn.Module): + def __init__(self, config): + super(BrosPositionalEmbedding2D, self).__init__() + + self.dim_bbox = config.dim_bbox + self.x_pos_emb = BrosPositionalEmbedding1D(config) + self.y_pos_emb = BrosPositionalEmbedding1D(config) + + def forward(self, bbox: torch.Tensor) -> torch.Tensor: + stack = [] + for i in range(self.dim_bbox): + if i % 2 == 0: + stack.append(self.x_pos_emb(bbox[..., i])) + else: + stack.append(self.y_pos_emb(bbox[..., i])) + bbox_pos_emb = torch.cat(stack, dim=-1) + return bbox_pos_emb + + +class BrosBboxEmbeddings(nn.Module): + def __init__(self, config): + super(BrosBboxEmbeddings, self).__init__() + self.bbox_sinusoid_emb = BrosPositionalEmbedding2D(config) + self.bbox_projection = nn.Linear(config.dim_bbox_sinusoid_emb_2d, config.dim_bbox_projection, bias=False) + + def forward(self, bbox: torch.Tensor): + bbox_t = bbox.transpose(0, 1) + bbox_pos = bbox_t[None, :, :, :] - bbox_t[:, None, :, :] + bbox_pos_emb = self.bbox_sinusoid_emb(bbox_pos) + bbox_pos_emb = self.bbox_projection(bbox_pos_emb) + + return bbox_pos_emb + + +class BrosTextEmbeddings(nn.Module): + """Construct the embeddings from word, position and token_type embeddings.""" + + def __init__(self, config): + super().__init__() + + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + self.register_buffer("position_ids", torch.arange(config.max_position_embeddings).expand((1, -1))) + self.register_buffer( + "token_type_ids", + torch.zeros( + self.position_ids.size(), + dtype=torch.long, + device=self.position_ids.device, + ), + persistent=False, + ) + + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + past_key_values_length: int = 0, + ) -> torch.Tensor: + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length] + + if token_type_ids is None: + if hasattr(self, "token_type_ids"): + buffered_token_type_ids = self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class BrosSelfAttention(nn.Module): + def __init__(self, config): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + + self.is_decoder = config.is_decoder + + def transpose_for_scores(self, x: torch.Tensor): + new_x_shape = x.size()[:-1] + ( + self.num_attention_heads, + self.attention_head_size, + ) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states: torch.Tensor, + bbox_pos_emb: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[torch.Tensor] = False, + ) -> Tuple[torch.Tensor]: + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + seq_length = hidden_states.size()[1] + position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + # bbox positional encoding + batch_size, n_head, seq_length, d_head = query_layer.shape + bbox_pos_emb = bbox_pos_emb.view(seq_length, seq_length, batch_size, d_head) + bbox_pos_emb = bbox_pos_emb.permute([2, 0, 1, 3]) + bbox_pos_scores = torch.einsum("bnid,bijd->bnij", (query_layer, bbox_pos_emb)) + + attention_scores = attention_scores + bbox_pos_scores + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in BrosModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.Softmax(dim=-1)(attention_scores) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + if self.is_decoder: + outputs = outputs + (past_key_value,) + return outputs + + +# Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert->Bros +class BrosSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BrosAttention(nn.Module): + def __init__(self, config): + super().__init__() + self.self = BrosSelfAttention(config) + self.output = BrosSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, + self.self.num_attention_heads, + self.self.attention_head_size, + self.pruned_heads, + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + bbox_pos_emb: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + self_outputs = self.self( + hidden_states=hidden_states, + bbox_pos_emb=bbox_pos_emb, + attention_mask=attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_value=past_key_value, + output_attentions=output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->Bros +class BrosIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class BrosOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class BrosLayer(nn.Module): + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = BrosAttention(config) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + if not self.is_decoder: + raise Exception(f"{self} should be used as a decoder model if cross attention is added") + self.crossattention = BrosAttention(config) + self.intermediate = BrosIntermediate(config) + self.output = BrosOutput(config) + + def forward( + self, + hidden_states: torch.Tensor, + bbox_pos_emb: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + bbox_pos_emb=bbox_pos_emb, + attention_mask=attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + + # if decoder, the last output is tuple of self-attn cache + if self.is_decoder: + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + else: + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + cross_attn_present_key_value = None + if self.is_decoder and encoder_hidden_states is not None: + if hasattr(self, "crossattention"): + raise Exception( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`" + ) + + # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + cross_attn_past_key_value, + output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights + + # add cross-attn cache to positions 3,4 of present_key_value tuple + cross_attn_present_key_value = cross_attention_outputs[-1] + present_key_value = present_key_value + cross_attn_present_key_value + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, + self.chunk_size_feed_forward, + self.seq_len_dim, + attention_output, + ) + outputs = (layer_output,) + outputs + + # if decoder, return the attn key/values as the last output + if self.is_decoder: + outputs = outputs + (present_key_value,) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +class BrosEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList([BrosLayer(config) for _ in range(config.num_hidden_layers)]) + + def forward( + self, + hidden_states: torch.Tensor, + bbox_pos_emb: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + return_dict: Optional[bool] = True, + ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + + next_decoder_cache = () if use_cache else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + if getattr(self.config, "gradient_checkpointing", False) and self.training: + if use_cache: + logger.warning( + "`use_cache=True` is incompatible with `config.gradient_checkpointing=True`. Setting " + "`use_cache=False`..." + ) + use_cache = False + layer_outputs = self._gradient_checkpointing_func( + layer_module.__call__, + hidden_states, + bbox_pos_emb, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + output_attentions, + ) + else: + layer_outputs = layer_module( + hidden_states=hidden_states, + bbox_pos_emb=bbox_pos_emb, + attention_mask=attention_mask, + head_mask=layer_head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_value=past_key_value, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +# Copied from transformers.models.bert.modeling_bert.BertPooler with Bert->Bros +class BrosPooler(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class BrosRelationExtractor(nn.Module): + def __init__(self, config): + super().__init__() + self.n_relations = config.n_relations + self.backbone_hidden_size = config.hidden_size + self.head_hidden_size = config.hidden_size + self.classifier_dropout_prob = config.classifier_dropout_prob + + self.drop = nn.Dropout(self.classifier_dropout_prob) + self.query = nn.Linear(self.backbone_hidden_size, self.n_relations * self.head_hidden_size) + + self.key = nn.Linear(self.backbone_hidden_size, self.n_relations * self.head_hidden_size) + + self.dummy_node = nn.Parameter(torch.zeros(1, self.backbone_hidden_size)) + + def forward(self, query_layer: torch.Tensor, key_layer: torch.Tensor): + query_layer = self.query(self.drop(query_layer)) + + dummy_vec = self.dummy_node.unsqueeze(0).repeat(1, key_layer.size(1), 1) + key_layer = torch.cat([key_layer, dummy_vec], axis=0) + key_layer = self.key(self.drop(key_layer)) + + query_layer = query_layer.view( + query_layer.size(0), query_layer.size(1), self.n_relations, self.head_hidden_size + ) + key_layer = key_layer.view(key_layer.size(0), key_layer.size(1), self.n_relations, self.head_hidden_size) + + relation_score = torch.matmul( + query_layer.permute(2, 1, 0, 3), key_layer.permute(2, 1, 3, 0) + ) # equivalent to torch.einsum("ibnd,jbnd->nbij", (query_layer, key_layer)) + + return relation_score + + +class BrosPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = BrosConfig + base_model_prefix = "bros" + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +@add_start_docstrings( + "The bare Bros Model transformer outputting raw hidden-states without any specific head on top.", + BROS_START_DOCSTRING, +) +class BrosModel(BrosPreTrainedModel): + def __init__(self, config, add_pooling_layer=True): + super().__init__(config) + self.config = config + + self.embeddings = BrosTextEmbeddings(config) + self.bbox_embeddings = BrosBboxEmbeddings(config) + self.encoder = BrosEncoder(config) + + self.pooler = BrosPooler(config) if add_pooling_layer else None + + self.init_weights() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward(BROS_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=BaseModelOutputWithPoolingAndCrossAttentions, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + bbox: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: + r""" + Returns: + + Examples: + + ```python + >>> import torch + >>> from transformers import BrosProcessor, BrosModel + + >>> processor = BrosProcessor.from_pretrained("jinho8345/bros-base-uncased") + + >>> model = BrosModel.from_pretrained("jinho8345/bros-base-uncased") + + >>> encoding = processor("Hello, my dog is cute", add_special_tokens=False, return_tensors="pt") + >>> bbox = torch.tensor([[[0, 0, 1, 1]]]).repeat(1, encoding["input_ids"].shape[-1], 1) + >>> encoding["bbox"] = bbox + + >>> outputs = model(**encoding) + >>> last_hidden_states = outputs.last_hidden_state + ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.config.is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if bbox is None: + raise ValueError("You have to specify bbox") + + batch_size, seq_length = input_shape + device = input_ids.device if input_ids is not None else inputs_embeds.device + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + + if attention_mask is None: + attention_mask = torch.ones(input_shape, device=device) + + if token_type_ids is None: + if hasattr(self.embeddings, "token_type_ids"): + buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, device) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + + # if bbox has 2 points (4 float tensors) per token, convert it to 4 points (8 float tensors) per token + if bbox.shape[-1] == 4: + bbox = bbox[:, :, [0, 1, 2, 1, 2, 3, 0, 3]] + scaled_bbox = bbox * self.config.bbox_scale + bbox_position_embeddings = self.bbox_embeddings(scaled_bbox) + + encoder_outputs = self.encoder( + embedding_output, + bbox_pos_emb=bbox_position_embeddings, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + +@add_start_docstrings( + """ + Bros Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for + Named-Entity-Recognition (NER) tasks. + """, + BROS_START_DOCSTRING, +) +class BrosForTokenClassification(BrosPreTrainedModel): + _keys_to_ignore_on_load_unexpected = [r"pooler"] + + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.bros = BrosModel(config) + classifier_dropout = ( + config.classifier_dropout if hasattr(config, "classifier_dropout") else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + self.init_weights() + + @add_start_docstrings_to_model_forward(BROS_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=TokenClassifierOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + bbox: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + bbox_first_token_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]: + r""" + + Returns: + + Examples: + + ```python + >>> import torch + >>> from transformers import BrosProcessor, BrosForTokenClassification + + >>> processor = BrosProcessor.from_pretrained("jinho8345/bros-base-uncased") + + >>> model = BrosForTokenClassification.from_pretrained("jinho8345/bros-base-uncased") + + >>> encoding = processor("Hello, my dog is cute", add_special_tokens=False, return_tensors="pt") + >>> bbox = torch.tensor([[[0, 0, 1, 1]]]).repeat(1, encoding["input_ids"].shape[-1], 1) + >>> encoding["bbox"] = bbox + + >>> outputs = model(**encoding) + ```""" + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bros( + input_ids, + bbox=bbox, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + if bbox_first_token_mask is not None: + bbox_first_token_mask = bbox_first_token_mask.view(-1) + loss = loss_fct( + logits.view(-1, self.num_labels)[bbox_first_token_mask], labels.view(-1)[bbox_first_token_mask] + ) + else: + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Bros Model with a token classification head on top (initial_token_layers and subsequent_token_layer on top of the + hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. The initial_token_classifier is used to + predict the first token of each entity, and the subsequent_token_classifier is used to predict the subsequent + tokens within an entity. Compared to BrosForTokenClassification, this model is more robust to serialization errors + since it predicts next token from one token. + """, + BROS_START_DOCSTRING, +) +class BrosSpadeEEForTokenClassification(BrosPreTrainedModel): + _keys_to_ignore_on_load_unexpected = [r"pooler"] + + def __init__(self, config): + super().__init__(config) + self.config = config + self.num_labels = config.num_labels + self.n_relations = config.n_relations + self.backbone_hidden_size = config.hidden_size + + self.bros = BrosModel(config) + classifier_dropout = ( + config.classifier_dropout if hasattr(config, "classifier_dropout") else config.hidden_dropout_prob + ) + + # Initial token classification for Entity Extraction (NER) + self.initial_token_classifier = nn.Sequential( + nn.Dropout(classifier_dropout), + nn.Linear(config.hidden_size, config.hidden_size), + nn.Dropout(classifier_dropout), + nn.Linear(config.hidden_size, config.num_labels), + ) + + # Subsequent token classification for Entity Extraction (NER) + self.subsequent_token_classifier = BrosRelationExtractor(config) + + self.init_weights() + + @add_start_docstrings_to_model_forward(BROS_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=BrosSpadeOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + bbox: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + bbox_first_token_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + initial_token_labels: Optional[torch.Tensor] = None, + subsequent_token_labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], BrosSpadeOutput]: + r""" + Returns: + + Examples: + + ```python + >>> import torch + >>> from transformers import BrosProcessor, BrosSpadeEEForTokenClassification + + >>> processor = BrosProcessor.from_pretrained("jinho8345/bros-base-uncased") + + >>> model = BrosSpadeEEForTokenClassification.from_pretrained("jinho8345/bros-base-uncased") + + >>> encoding = processor("Hello, my dog is cute", add_special_tokens=False, return_tensors="pt") + >>> bbox = torch.tensor([[[0, 0, 1, 1]]]).repeat(1, encoding["input_ids"].shape[-1], 1) + >>> encoding["bbox"] = bbox + + >>> outputs = model(**encoding) + ```""" + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bros( + input_ids=input_ids, + bbox=bbox, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_states = outputs[0] + last_hidden_states = last_hidden_states.transpose(0, 1).contiguous() + initial_token_logits = self.initial_token_classifier(last_hidden_states).transpose(0, 1).contiguous() + subsequent_token_logits = self.subsequent_token_classifier(last_hidden_states, last_hidden_states).squeeze(0) + + # make subsequent token (sequence token classification) mask + inv_attention_mask = 1 - attention_mask + batch_size, max_seq_length = inv_attention_mask.shape + device = inv_attention_mask.device + invalid_token_mask = torch.cat([inv_attention_mask, torch.zeros([batch_size, 1]).to(device)], axis=1).bool() + subsequent_token_logits = subsequent_token_logits.masked_fill( + invalid_token_mask[:, None, :], torch.finfo(subsequent_token_logits.dtype).min + ) + self_token_mask = torch.eye(max_seq_length, max_seq_length + 1).to(device=device, dtype=torch.bool) + subsequent_token_logits = subsequent_token_logits.masked_fill( + self_token_mask[None, :, :], torch.finfo(subsequent_token_logits.dtype).min + ) + subsequent_token_mask = attention_mask.view(-1).bool() + + loss = None + if initial_token_labels is not None and subsequent_token_labels is not None: + loss_fct = CrossEntropyLoss() + + # get initial token loss + initial_token_labels = initial_token_labels.view(-1) + if bbox_first_token_mask is not None: + bbox_first_token_mask = bbox_first_token_mask.view(-1) + initial_token_loss = loss_fct( + initial_token_logits.view(-1, self.num_labels)[bbox_first_token_mask], + initial_token_labels[bbox_first_token_mask], + ) + else: + initial_token_loss = loss_fct(initial_token_logits.view(-1, self.num_labels), initial_token_labels) + + subsequent_token_labels = subsequent_token_labels.view(-1) + subsequent_token_loss = loss_fct( + subsequent_token_logits.view(-1, max_seq_length + 1)[subsequent_token_mask], + subsequent_token_labels[subsequent_token_mask], + ) + + loss = initial_token_loss + subsequent_token_loss + + if not return_dict: + output = (initial_token_logits, subsequent_token_logits) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return BrosSpadeOutput( + loss=loss, + initial_token_logits=initial_token_logits, + subsequent_token_logits=subsequent_token_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + Bros Model with a token classification head on top (a entity_linker layer on top of the hidden-states output) e.g. + for Entity-Linking. The entity_linker is used to predict intra-entity links (one entity to another entity). + """, + BROS_START_DOCSTRING, +) +class BrosSpadeELForTokenClassification(BrosPreTrainedModel): + _keys_to_ignore_on_load_unexpected = [r"pooler"] + + def __init__(self, config): + super().__init__(config) + self.config = config + self.num_labels = config.num_labels + self.n_relations = config.n_relations + self.backbone_hidden_size = config.hidden_size + + self.bros = BrosModel(config) + (config.classifier_dropout if hasattr(config, "classifier_dropout") else config.hidden_dropout_prob) + + self.entity_linker = BrosRelationExtractor(config) + + self.init_weights() + + @add_start_docstrings_to_model_forward(BROS_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=TokenClassifierOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + bbox: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + bbox_first_token_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + labels: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]: + r""" + Returns: + + Examples: + + ```python + >>> import torch + >>> from transformers import BrosProcessor, BrosSpadeELForTokenClassification + + >>> processor = BrosProcessor.from_pretrained("jinho8345/bros-base-uncased") + + >>> model = BrosSpadeELForTokenClassification.from_pretrained("jinho8345/bros-base-uncased") + + >>> encoding = processor("Hello, my dog is cute", add_special_tokens=False, return_tensors="pt") + >>> bbox = torch.tensor([[[0, 0, 1, 1]]]).repeat(1, encoding["input_ids"].shape[-1], 1) + >>> encoding["bbox"] = bbox + + >>> outputs = model(**encoding) + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.bros( + input_ids=input_ids, + bbox=bbox, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_states = outputs[0] + last_hidden_states = last_hidden_states.transpose(0, 1).contiguous() + + logits = self.entity_linker(last_hidden_states, last_hidden_states).squeeze(0) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + + batch_size, max_seq_length = attention_mask.shape + device = attention_mask.device + + self_token_mask = torch.eye(max_seq_length, max_seq_length + 1).to(device=device, dtype=torch.bool) + + mask = bbox_first_token_mask.view(-1) + bbox_first_token_mask = torch.cat( + [ + ~bbox_first_token_mask, + torch.zeros([batch_size, 1], dtype=torch.bool, device=device), + ], + axis=1, + ) + logits = logits.masked_fill(bbox_first_token_mask[:, None, :], torch.finfo(logits.dtype).min) + logits = logits.masked_fill(self_token_mask[None, :, :], torch.finfo(logits.dtype).min) + + loss = loss_fct(logits.view(-1, max_seq_length + 1)[mask], labels.view(-1)[mask]) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +__all__ = [ + "BrosPreTrainedModel", + "BrosModel", + "BrosForTokenClassification", + "BrosSpadeEEForTokenClassification", + "BrosSpadeELForTokenClassification", +] diff --git a/docs/transformers/build/lib/transformers/models/bros/processing_bros.py b/docs/transformers/build/lib/transformers/models/bros/processing_bros.py new file mode 100644 index 0000000000000000000000000000000000000000..4687e7f8a86ae5ab4d1339517622f52023c57499 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/bros/processing_bros.py @@ -0,0 +1,112 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Processor class for Bros. +""" + +from typing import List, Optional, Union + +from ...processing_utils import ProcessorMixin +from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy +from ...utils import TensorType + + +class BrosProcessor(ProcessorMixin): + r""" + Constructs a Bros processor which wraps a BERT tokenizer. + + [`BrosProcessor`] offers all the functionalities of [`BertTokenizerFast`]. See the docstring of + [`~BrosProcessor.__call__`] and [`~BrosProcessor.decode`] for more information. + + Args: + tokenizer (`BertTokenizerFast`, *optional*): + An instance of ['BertTokenizerFast`]. The tokenizer is a required input. + """ + + attributes = ["tokenizer"] + tokenizer_class = ("BertTokenizer", "BertTokenizerFast") + + def __init__(self, tokenizer=None, **kwargs): + if tokenizer is None: + raise ValueError("You need to specify a `tokenizer`.") + + super().__init__(tokenizer) + + def __call__( + self, + text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, + add_special_tokens: bool = True, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str, TruncationStrategy] = None, + max_length: Optional[int] = None, + stride: int = 0, + pad_to_multiple_of: Optional[int] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + return_tensors: Optional[Union[str, TensorType]] = None, + **kwargs, + ) -> BatchEncoding: + """ + This method uses [`BertTokenizerFast.__call__`] to prepare text for the model. + + Please refer to the docstring of the above two methods for more information. + """ + encoding = self.tokenizer( + text=text, + add_special_tokens=add_special_tokens, + padding=padding, + truncation=truncation, + max_length=max_length, + stride=stride, + pad_to_multiple_of=pad_to_multiple_of, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + return_tensors=return_tensors, + **kwargs, + ) + + return encoding + + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to BertTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. + """ + return self.tokenizer.batch_decode(*args, **kwargs) + + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to BertTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to + the docstring of this method for more information. + """ + return self.tokenizer.decode(*args, **kwargs) + + @property + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + return list(dict.fromkeys(tokenizer_input_names)) + + +__all__ = ["BrosProcessor"] diff --git a/docs/transformers/build/lib/transformers/models/byt5/__init__.py b/docs/transformers/build/lib/transformers/models/byt5/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..cb726942b0f16105f8a5a5f7c661485951d7ccc7 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/byt5/__init__.py @@ -0,0 +1,26 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .tokenization_byt5 import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/docs/transformers/build/lib/transformers/models/byt5/convert_byt5_original_tf_checkpoint_to_pytorch.py b/docs/transformers/build/lib/transformers/models/byt5/convert_byt5_original_tf_checkpoint_to_pytorch.py new file mode 100644 index 0000000000000000000000000000000000000000..9b1b15857ceaa1f523eca5e1e542fe48e63d6651 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/byt5/convert_byt5_original_tf_checkpoint_to_pytorch.py @@ -0,0 +1,59 @@ +# coding=utf-8 +# Copyright 2018 The T5 authors and HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert T5 checkpoint.""" + +import argparse + +from transformers import T5Config, T5ForConditionalGeneration, load_tf_weights_in_t5 +from transformers.utils import logging + + +logging.set_verbosity_info() + + +def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path): + # Initialise PyTorch model + config = T5Config.from_json_file(config_file) + print(f"Building PyTorch model from configuration: {config}") + model = T5ForConditionalGeneration(config) + + # Load weights from tf checkpoint + load_tf_weights_in_t5(model, config, tf_checkpoint_path) + + # Save pytorch-model + print(f"Save PyTorch model to {pytorch_dump_path}") + model.save_pretrained(pytorch_dump_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument( + "--tf_checkpoint_path", default=None, type=str, required=True, help="Path to the TensorFlow checkpoint path." + ) + parser.add_argument( + "--config_file", + default=None, + type=str, + required=True, + help=( + "The config json file corresponding to the pre-trained T5 model. \nThis specifies the model architecture." + ), + ) + parser.add_argument( + "--pytorch_dump_path", default=None, type=str, required=True, help="Path to the output PyTorch model." + ) + args = parser.parse_args() + convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.config_file, args.pytorch_dump_path) diff --git a/docs/transformers/build/lib/transformers/models/byt5/tokenization_byt5.py b/docs/transformers/build/lib/transformers/models/byt5/tokenization_byt5.py new file mode 100644 index 0000000000000000000000000000000000000000..b39ba254b38170e47dcbe0b8da0926fb2e849450 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/byt5/tokenization_byt5.py @@ -0,0 +1,236 @@ +# coding=utf-8 +# Copyright 2021 T5 Authors and HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization class for model ByT5.""" + +import warnings +from typing import List, Optional, Tuple + +from ...tokenization_utils import AddedToken, PreTrainedTokenizer +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class ByT5Tokenizer(PreTrainedTokenizer): + """ + Construct a ByT5 tokenizer. ByT5 simply uses raw bytes utf-8 encoding. + + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods. + + Args: + eos_token (`str`, *optional*, defaults to `""`): + The end of sequence token. + + + + When building a sequence using special tokens, this is not the token that is used for the end of sequence. + The token used is the `sep_token`. + + + + unk_token (`str`, *optional*, defaults to `""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + pad_token (`str`, *optional*, defaults to `""`): + The token used for padding, for example when batching sequences of different lengths. + extra_ids (`int`, *optional*, defaults to 125): + Add a number of extra ids added to the end of the vocabulary for use as sentinels. These tokens are + accessible as "" where "{%d}" is a number between 0 and extra_ids-1. Extra tokens are + indexed from the end of the vocabulary up to beginning ("" is the last token in the vocabulary + like in ByT5 preprocessing see + [here](https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117)). + additional_special_tokens (`List[str]`, *optional*): + Additional special tokens used by the tokenizer. + """ + + model_input_names = ["input_ids", "attention_mask"] + + def __init__( + self, + eos_token="", + unk_token="", + pad_token="", + extra_ids=125, + additional_special_tokens=None, + **kwargs, + ) -> None: + # Add extra_ids to the special token list + if extra_ids > 0 and additional_special_tokens is None: + additional_special_tokens = [f"" for i in range(extra_ids)] + elif extra_ids > 0 and additional_special_tokens is not None and len(additional_special_tokens) > 0: + # Check that we have the right number of extra_id special tokens + extra_tokens = len(set(filter(lambda x: bool("extra_id" in str(x)), additional_special_tokens))) + if extra_tokens != extra_ids: + raise ValueError( + f"Both extra_ids ({extra_ids}) and additional_special_tokens ({additional_special_tokens}) are" + " provided to ByT5Tokenizer. In this case the additional_special_tokens must include the" + " extra_ids tokens" + ) + + pad_token = AddedToken(pad_token, lstrip=True, rstrip=True) if isinstance(pad_token, str) else pad_token + # we force left and right stripping for backward compatibility. The byt5tests depend on this. + eos_token = AddedToken(eos_token, lstrip=True, rstrip=True) if isinstance(eos_token, str) else eos_token + unk_token = AddedToken(unk_token, lstrip=True, rstrip=True) if isinstance(unk_token, str) else unk_token + # unk token needs to be in the vocab with correct index + self._added_tokens_decoder = {0: pad_token, 1: eos_token, 2: unk_token} + self.offset = len(self._added_tokens_decoder) + self._utf_vocab_size = 2**8 # utf is 8 bits + super().__init__( + eos_token=eos_token, + unk_token=unk_token, + pad_token=pad_token, + extra_ids=0, + additional_special_tokens=additional_special_tokens, # TODO extra ids are not used :sweatywmile: + **kwargs, + ) + + @property + def vocab_size(self): + return self._utf_vocab_size + + def get_vocab(self): + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size + self.offset)} + vocab.update(self.added_tokens_encoder) + return vocab + + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + # normal case: some special tokens + if token_ids_1 is None: + return ([0] * len(token_ids_0)) + [1] + return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + + def _add_eos_if_not_present(self, token_ids: List[int]) -> List[int]: + """Do not add eos again if user already added it.""" + if len(token_ids) > 0 and token_ids[-1] == self.eos_token_id: + warnings.warn( + f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated" + " eos tokens being added." + ) + return token_ids + else: + return token_ids + [self.eos_token_id] + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. ByT5 does not + make use of token type ids, therefore a list of zeros is returned. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of zeros. + """ + eos = [self.eos_token_id] + + if token_ids_1 is None: + return len(token_ids_0 + eos) * [0] + return len(token_ids_0 + eos + token_ids_1 + eos) * [0] + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A sequence has the following format: + + - single sequence: `X ` + - pair of sequences: `A B ` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + token_ids_0 = self._add_eos_if_not_present(token_ids_0) + if token_ids_1 is None: + return token_ids_0 + else: + token_ids_1 = self._add_eos_if_not_present(token_ids_1) + return token_ids_0 + token_ids_1 + + def _tokenize(self, text: str) -> List[str]: + """Take as input a string and return a list of strings (tokens) for words/sub-words""" + tokens = [chr(i) for i in text.encode("utf-8")] + return tokens + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + + if len(token) != 1: + token_id = None + else: + token_id = ord(token) + self.offset + + return token_id + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + token = chr(index - self.offset) + return token + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + bstring = b"" + for token in tokens: + if token in self.added_tokens_decoder: + tok_string = self.added_tokens_decoder[token].encode("utf-8") + elif token in self.added_tokens_encoder: + tok_string = token.encode("utf-8") + else: + tok_string = bytes([ord(token)]) + bstring += tok_string + string = bstring.decode("utf-8", errors="ignore") + return string + + # ByT5Tokenizer has no vocab file + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + return () + + +__all__ = ["ByT5Tokenizer"] diff --git a/docs/transformers/build/lib/transformers/models/camembert/__init__.py b/docs/transformers/build/lib/transformers/models/camembert/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9d90f64de97f78ccaf1592c3c5cad40a9b2d5dcb --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/camembert/__init__.py @@ -0,0 +1,30 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_camembert import * + from .modeling_camembert import * + from .modeling_tf_camembert import * + from .tokenization_camembert import * + from .tokenization_camembert_fast import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/docs/transformers/build/lib/transformers/models/camembert/configuration_camembert.py b/docs/transformers/build/lib/transformers/models/camembert/configuration_camembert.py new file mode 100644 index 0000000000000000000000000000000000000000..eaf8c94b8914811e35210b6ebcf3fcdece6281c7 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/camembert/configuration_camembert.py @@ -0,0 +1,155 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""CamemBERT configuration""" + +from collections import OrderedDict +from typing import Mapping + +from ...configuration_utils import PretrainedConfig +from ...onnx import OnnxConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class CamembertConfig(PretrainedConfig): + """ + This is the configuration class to store the configuration of a [`CamembertModel`] or a [`TFCamembertModel`]. It is + used to instantiate a Camembert model according to the specified arguments, defining the model architecture. + Instantiating a configuration with the defaults will yield a similar configuration to that of the Camembert + [almanach/camembert-base](https://huggingface.co/almanach/camembert-base) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 30522): + Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`CamembertModel`] or [`TFCamembertModel`]. + hidden_size (`int`, *optional*, defaults to 768): + Dimensionality of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoder. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder. + hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"silu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoder, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 512): + The maximum sequence length that this model might ever be used with. Typically set this to something large + just in case (e.g., 512 or 1024 or 2048). + type_vocab_size (`int`, *optional*, defaults to 2): + The vocabulary size of the `token_type_ids` passed when calling [`CamembertModel`] or [`TFCamembertModel`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + position_embedding_type (`str`, *optional*, defaults to `"absolute"`): + Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For + positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to + [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155). + For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models + with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658). + is_decoder (`bool`, *optional*, defaults to `False`): + Whether the model is used as a decoder or not. If `False`, the model is used as an encoder. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + classifier_dropout (`float`, *optional*): + The dropout ratio for the classification head. + + Example: + + ```python + >>> from transformers import CamembertConfig, CamembertModel + + >>> # Initializing a Camembert almanach/camembert-base style configuration + >>> configuration = CamembertConfig() + + >>> # Initializing a model (with random weights) from the almanach/camembert-base style configuration + >>> model = CamembertModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "camembert" + + def __init__( + self, + vocab_size=30522, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=512, + type_vocab_size=2, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=1, + bos_token_id=0, + eos_token_id=2, + position_embedding_type="absolute", + use_cache=True, + classifier_dropout=None, + **kwargs, + ): + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.hidden_act = hidden_act + self.intermediate_size = intermediate_size + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.max_position_embeddings = max_position_embeddings + self.type_vocab_size = type_vocab_size + self.initializer_range = initializer_range + self.layer_norm_eps = layer_norm_eps + self.position_embedding_type = position_embedding_type + self.use_cache = use_cache + self.classifier_dropout = classifier_dropout + + +class CamembertOnnxConfig(OnnxConfig): + @property + def inputs(self) -> Mapping[str, Mapping[int, str]]: + if self.task == "multiple-choice": + dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"} + else: + dynamic_axis = {0: "batch", 1: "sequence"} + return OrderedDict( + [ + ("input_ids", dynamic_axis), + ("attention_mask", dynamic_axis), + ] + ) + + +__all__ = ["CamembertConfig", "CamembertOnnxConfig"] diff --git a/docs/transformers/build/lib/transformers/models/camembert/modeling_camembert.py b/docs/transformers/build/lib/transformers/models/camembert/modeling_camembert.py new file mode 100644 index 0000000000000000000000000000000000000000..b69590ae21a59702c18dd3c69772c65c4994a29c --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/camembert/modeling_camembert.py @@ -0,0 +1,1716 @@ +# coding=utf-8 +# Copyright 2019 Inria, Facebook AI Research and the HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch CamemBERT model.""" + +import math +from typing import List, Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from packaging import version +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ...activations import ACT2FN, gelu +from ...generation import GenerationMixin +from ...modeling_attn_mask_utils import ( + _prepare_4d_attention_mask_for_sdpa, + _prepare_4d_causal_attention_mask_for_sdpa, +) +from ...modeling_outputs import ( + BaseModelOutputWithPastAndCrossAttentions, + BaseModelOutputWithPoolingAndCrossAttentions, + CausalLMOutputWithCrossAttentions, + MaskedLMOutput, + MultipleChoiceModelOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer +from ...utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + get_torch_version, + logging, + replace_return_docstrings, +) +from .configuration_camembert import CamembertConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "almanach/camembert-base" +_CONFIG_FOR_DOC = "CamembertConfig" + + +CAMEMBERT_START_DOCSTRING = r""" + + This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. + Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage + and behavior. + + Parameters: + config ([`CamembertConfig`]): Model configuration class with all the parameters of the + model. Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaEmbeddings with Roberta->Camembert +class CamembertEmbeddings(nn.Module): + """ + Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. + """ + + # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.__init__ + def __init__(self, config): + super().__init__() + self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id) + self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) + self.register_buffer( + "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False + ) + + # End copy + self.padding_idx = config.pad_token_id + self.position_embeddings = nn.Embedding( + config.max_position_embeddings, config.hidden_size, padding_idx=self.padding_idx + ) + + def forward( + self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0 + ): + if position_ids is None: + if input_ids is not None: + # Create the position ids from the input token ids. Any padded tokens remain padded. + position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length) + else: + position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds) + + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs + # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves + # issue #5664 + if token_type_ids is None: + if hasattr(self, "token_type_ids"): + buffered_token_type_ids = self.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self.word_embeddings(input_ids) + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + if self.position_embedding_type == "absolute": + position_embeddings = self.position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + def create_position_ids_from_inputs_embeds(self, inputs_embeds): + """ + We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids. + + Args: + inputs_embeds: torch.Tensor + + Returns: torch.Tensor + """ + input_shape = inputs_embeds.size()[:-1] + sequence_length = input_shape[1] + + position_ids = torch.arange( + self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device + ) + return position_ids.unsqueeze(0).expand(input_shape) + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaSelfAttention with Roberta->Camembert +class CamembertSelfAttention(nn.Module): + def __init__(self, config, position_embedding_type=None): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = position_embedding_type or getattr( + config, "position_embedding_type", "absolute" + ) + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + + self.is_decoder = config.is_decoder + + def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor: + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + mixed_query_layer = self.query(hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores(self.key(encoder_hidden_states)) + value_layer = self.transpose_for_scores(self.value(encoder_hidden_states)) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(hidden_states)) + value_layer = self.transpose_for_scores(self.value(hidden_states)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + use_cache = past_key_value is not None + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + query_length, key_length = query_layer.shape[2], key_layer.shape[2] + if use_cache: + position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view( + -1, 1 + ) + else: + position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1) + position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1) + distance = position_ids_l - position_ids_r + + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in CamembertModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + if self.is_decoder: + outputs = outputs + (past_key_value,) + return outputs + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaSdpaSelfAttention with Roberta->Camembert +class CamembertSdpaSelfAttention(CamembertSelfAttention): + def __init__(self, config, position_embedding_type=None): + super().__init__(config, position_embedding_type=position_embedding_type) + self.dropout_prob = config.attention_probs_dropout_prob + self.require_contiguous_qkv = version.parse(get_torch_version()) < version.parse("2.2.0") + + # Adapted from CamembertSelfAttention + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + if self.position_embedding_type != "absolute" or output_attentions or head_mask is not None: + # TODO: Improve this warning with e.g. `model.config._attn_implementation = "manual"` once implemented. + logger.warning_once( + "CamembertSdpaSelfAttention is used but `torch.nn.functional.scaled_dot_product_attention` does not support " + "non-absolute `position_embedding_type` or `output_attentions=True` or `head_mask`. Falling back to " + "the manual attention implementation, but specifying the manual implementation will be required from " + "Transformers version v5.0.0 onwards. This warning can be removed using the argument " + '`attn_implementation="eager"` when loading the model.' + ) + return super().forward( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + + bsz, tgt_len, _ = hidden_states.size() + + query_layer = self.transpose_for_scores(self.query(hidden_states)) + + # If this is instantiated as a cross-attention module, the keys and values come from an encoder; the attention + # mask needs to be such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + current_states = encoder_hidden_states if is_cross_attention else hidden_states + attention_mask = encoder_attention_mask if is_cross_attention else attention_mask + + # Check `seq_length` of `past_key_value` == `len(current_states)` to support prefix tuning + if is_cross_attention and past_key_value and past_key_value[0].shape[2] == current_states.shape[1]: + key_layer, value_layer = past_key_value + else: + key_layer = self.transpose_for_scores(self.key(current_states)) + value_layer = self.transpose_for_scores(self.value(current_states)) + if past_key_value is not None and not is_cross_attention: + key_layer = torch.cat([past_key_value[0], key_layer], dim=2) + value_layer = torch.cat([past_key_value[1], value_layer], dim=2) + + if self.is_decoder: + # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) + + # SDPA with memory-efficient backend is broken in torch==2.1.2 when using non-contiguous inputs and a custom + # attn_mask, so we need to call `.contiguous()` here. This was fixed in torch==2.2.0. + # Reference: https://github.com/pytorch/pytorch/issues/112577 + if self.require_contiguous_qkv and query_layer.device.type == "cuda" and attention_mask is not None: + query_layer = query_layer.contiguous() + key_layer = key_layer.contiguous() + value_layer = value_layer.contiguous() + + # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment + # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling. + # The tgt_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create + # a causal mask in case tgt_len == 1. + is_causal = ( + True if self.is_decoder and not is_cross_attention and attention_mask is None and tgt_len > 1 else False + ) + + attn_output = torch.nn.functional.scaled_dot_product_attention( + query_layer, + key_layer, + value_layer, + attn_mask=attention_mask, + dropout_p=self.dropout_prob if self.training else 0.0, + is_causal=is_causal, + ) + + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(bsz, tgt_len, self.all_head_size) + + outputs = (attn_output,) + if self.is_decoder: + outputs = outputs + (past_key_value,) + return outputs + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaSelfOutput with Roberta->Camembert +class CamembertSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +CAMEMBERT_SELF_ATTENTION_CLASSES = { + "eager": CamembertSelfAttention, + "sdpa": CamembertSdpaSelfAttention, +} + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaAttention with Roberta->Camembert,ROBERTA->CAMEMBERT +class CamembertAttention(nn.Module): + def __init__(self, config, position_embedding_type=None): + super().__init__() + self.self = CAMEMBERT_SELF_ATTENTION_CLASSES[config._attn_implementation]( + config, position_embedding_type=position_embedding_type + ) + self.output = CamembertSelfOutput(config) + self.pruned_heads = set() + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + self_outputs = self.self( + hidden_states, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + attention_output = self.output(self_outputs[0], hidden_states) + outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them + return outputs + + +# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->Roberta->Camembert +class CamembertIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->Roberta->Camembert +class CamembertOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaLayer with Roberta->Camembert +class CamembertLayer(nn.Module): + def __init__(self, config): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = CamembertAttention(config) + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + if not self.is_decoder: + raise ValueError(f"{self} should be used as a decoder model if cross attention is added") + self.crossattention = CamembertAttention(config, position_embedding_type="absolute") + self.intermediate = CamembertIntermediate(config) + self.output = CamembertOutput(config) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor]: + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + past_key_value=self_attn_past_key_value, + ) + attention_output = self_attention_outputs[0] + + # if decoder, the last output is tuple of self-attn cache + if self.is_decoder: + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + else: + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + cross_attn_present_key_value = None + if self.is_decoder and encoder_hidden_states is not None: + if not hasattr(self, "crossattention"): + raise ValueError( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers" + " by setting `config.add_cross_attention=True`" + ) + + # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + cross_attention_outputs = self.crossattention( + attention_output, + attention_mask, + head_mask, + encoder_hidden_states, + encoder_attention_mask, + cross_attn_past_key_value, + output_attentions, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights + + # add cross-attn cache to positions 3,4 of present_key_value tuple + cross_attn_present_key_value = cross_attention_outputs[-1] + present_key_value = present_key_value + cross_attn_present_key_value + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output + ) + outputs = (layer_output,) + outputs + + # if decoder, return the attn key/values as the last output + if self.is_decoder: + outputs = outputs + (present_key_value,) + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaEncoder with Roberta->Camembert +class CamembertEncoder(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.layer = nn.ModuleList([CamembertLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + return_dict: Optional[bool] = True, + ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + + if self.gradient_checkpointing and self.training: + if use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..." + ) + use_cache = False + + next_decoder_cache = () if use_cache else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + past_key_value = past_key_values[i] if past_key_values is not None else None + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + layer_module.__call__, + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + else: + layer_outputs = layer_module( + hidden_states, + attention_mask, + layer_head_mask, + encoder_hidden_states, + encoder_attention_mask, + past_key_value, + output_attentions, + ) + + hidden_states = layer_outputs[0] + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + if self.config.add_cross_attention: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v + for v in [ + hidden_states, + next_decoder_cache, + all_hidden_states, + all_self_attentions, + all_cross_attentions, + ] + if v is not None + ) + return BaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + cross_attentions=all_cross_attentions, + ) + + +# Copied from transformers.models.bert.modeling_bert.BertPooler +class CamembertPooler(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class CamembertPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = CamembertConfig + base_model_prefix = "roberta" + supports_gradient_checkpointing = True + _supports_sdpa = True + + # Copied from transformers.models.bert.modeling_bert.BertPreTrainedModel._init_weights with BertLMPredictionHead->CamembertLMHead + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, nn.Linear): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + elif isinstance(module, CamembertLMHead): + module.bias.data.zero_() + + +CAMEMBERT_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + position_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaClassificationHead with Roberta->Camembert +class CamembertClassificationHead(nn.Module): + """Head for sentence-level classification tasks.""" + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.out_proj = nn.Linear(config.hidden_size, config.num_labels) + + def forward(self, features, **kwargs): + x = features[:, 0, :] # take token (equiv. to [CLS]) + x = self.dropout(x) + x = self.dense(x) + x = torch.tanh(x) + x = self.dropout(x) + x = self.out_proj(x) + return x + + +# Copied from transformers.models.roberta.modeling_roberta.RobertaLMHead with Roberta->Camembert +class CamembertLMHead(nn.Module): + """Camembert Head for masked language modeling.""" + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + self.decoder = nn.Linear(config.hidden_size, config.vocab_size) + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + self.decoder.bias = self.bias + + def forward(self, features, **kwargs): + x = self.dense(features) + x = gelu(x) + x = self.layer_norm(x) + + # project back to size of vocabulary with bias + x = self.decoder(x) + + return x + + def _tie_weights(self): + # To tie those two weights if they get disconnected (on TPU or when the bias is resized) + # For accelerate compatibility and to not break backward compatibility + if self.decoder.bias.device.type == "meta": + self.decoder.bias = self.bias + else: + self.bias = self.decoder.bias + + +@add_start_docstrings( + "The bare CamemBERT Model transformer outputting raw hidden-states without any specific head on top.", + CAMEMBERT_START_DOCSTRING, +) +class CamembertModel(CamembertPreTrainedModel): + """ + + The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of + cross-attention is added between the self-attention layers, following the architecture described in *Attention is + all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz + Kaiser and Illia Polosukhin. + + To behave as a decoder the model needs to be initialized with the `is_decoder` argument of the configuration set to + `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and + `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass. + + .. _*Attention is all you need*: https://arxiv.org/abs/1706.03762 + + """ + + _no_split_modules = [] + + # Copied from transformers.models.roberta.modeling_roberta.RobertaModel.__init__ with Roberta->Camembert + def __init__(self, config, add_pooling_layer=True): + super().__init__(config) + self.config = config + + self.embeddings = CamembertEmbeddings(config) + self.encoder = CamembertEncoder(config) + + self.pooler = CamembertPooler(config) if add_pooling_layer else None + + self.attn_implementation = config._attn_implementation + self.position_embedding_type = config.position_embedding_type + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.embeddings.word_embeddings + + def set_input_embeddings(self, value): + self.embeddings.word_embeddings = value + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + @add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=BaseModelOutputWithPoolingAndCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + # Copied from transformers.models.roberta.modeling_roberta.RobertaModel.forward + def forward( + self, + input_ids: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + token_type_ids: Optional[torch.Tensor] = None, + position_ids: Optional[torch.Tensor] = None, + head_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.Tensor] = None, + encoder_hidden_states: Optional[torch.Tensor] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]: + r""" + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)` or `(batch_size, sequence_length, target_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self.config.is_decoder: + use_cache = use_cache if use_cache is not None else self.config.use_cache + else: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + batch_size, seq_length = input_shape + device = input_ids.device if input_ids is not None else inputs_embeds.device + + # past_key_values_length + past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0 + + if token_type_ids is None: + if hasattr(self.embeddings, "token_type_ids"): + buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length] + buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length) + token_type_ids = buffered_token_type_ids_expanded + else: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + ) + + if attention_mask is None: + attention_mask = torch.ones((batch_size, seq_length + past_key_values_length), device=device) + + use_sdpa_attention_masks = ( + self.attn_implementation == "sdpa" + and self.position_embedding_type == "absolute" + and head_mask is None + and not output_attentions + ) + + # Expand the attention mask + if use_sdpa_attention_masks and attention_mask.dim() == 2: + # Expand the attention mask for SDPA. + # [bsz, seq_len] -> [bsz, 1, seq_len, seq_len] + if self.config.is_decoder: + extended_attention_mask = _prepare_4d_causal_attention_mask_for_sdpa( + attention_mask, + input_shape, + embedding_output, + past_key_values_length, + ) + else: + extended_attention_mask = _prepare_4d_attention_mask_for_sdpa( + attention_mask, embedding_output.dtype, tgt_len=seq_length + ) + else: + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape) + + # If a 2D or 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + if self.config.is_decoder and encoder_hidden_states is not None: + encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size() + encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length) + if encoder_attention_mask is None: + encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device) + + if use_sdpa_attention_masks and encoder_attention_mask.dim() == 2: + # Expand the attention mask for SDPA. + # [bsz, seq_len] -> [bsz, 1, seq_len, seq_len] + encoder_extended_attention_mask = _prepare_4d_attention_mask_for_sdpa( + encoder_attention_mask, embedding_output.dtype, tgt_len=seq_length + ) + else: + encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask) + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + encoder_outputs = self.encoder( + embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(sequence_output) if self.pooler is not None else None + + if not return_dict: + return (sequence_output, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + +@add_start_docstrings( + """CamemBERT Model with a `language modeling` head on top.""", + CAMEMBERT_START_DOCSTRING, +) +# Copied from transformers.models.roberta.modeling_roberta.RobertaForMaskedLM with Roberta->Camembert, ROBERTA->CAMEMBERT +class CamembertForMaskedLM(CamembertPreTrainedModel): + _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"] + + def __init__(self, config): + super().__init__(config) + + if config.is_decoder: + logger.warning( + "If you want to use `CamembertForMaskedLM` make sure `config.is_decoder=False` for " + "bi-directional self-attention." + ) + + self.roberta = CamembertModel(config, add_pooling_layer=False) + self.lm_head = CamembertLMHead(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.lm_head.decoder + + def set_output_embeddings(self, new_embeddings): + self.lm_head.decoder = new_embeddings + + @add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=MaskedLMOutput, + config_class=_CONFIG_FOR_DOC, + mask="", + expected_output="' Paris'", + expected_loss=0.1, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], MaskedLMOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + kwargs (`Dict[str, any]`, *optional*, defaults to `{}`): + Used to hide legacy arguments that have been deprecated. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.roberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = outputs[0] + prediction_scores = self.lm_head(sequence_output) + + masked_lm_loss = None + if labels is not None: + # move labels to correct device to enable model parallelism + labels = labels.to(prediction_scores.device) + loss_fct = CrossEntropyLoss() + masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1)) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output + + return MaskedLMOutput( + loss=masked_lm_loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + CamemBERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the + pooled output) e.g. for GLUE tasks. + """, + CAMEMBERT_START_DOCSTRING, +) +# Copied from transformers.models.roberta.modeling_roberta.RobertaForSequenceClassification with Roberta->Camembert, ROBERTA->CAMEMBERT +class CamembertForSequenceClassification(CamembertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.config = config + + self.roberta = CamembertModel(config, add_pooling_layer=False) + self.classifier = CamembertClassificationHead(config) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint="cardiffnlp/twitter-roberta-base-emotion", + output_type=SequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + expected_output="'optimism'", + expected_loss=0.08, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], SequenceClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.roberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + sequence_output = outputs[0] + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + # move labels to correct device to enable model parallelism + labels = labels.to(logits.device) + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + CamemBERT Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a + softmax) e.g. for RocStories/SWAG tasks. + """, + CAMEMBERT_START_DOCSTRING, +) +# Copied from transformers.models.roberta.modeling_roberta.RobertaForMultipleChoice with Roberta->Camembert, ROBERTA->CAMEMBERT +class CamembertForMultipleChoice(CamembertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.roberta = CamembertModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, 1) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward( + CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length") + ) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=MultipleChoiceModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], MultipleChoiceModelOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., + num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See + `input_ids` above) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] + + flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None + flat_position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None + flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None + flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None + flat_inputs_embeds = ( + inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) + if inputs_embeds is not None + else None + ) + + outputs = self.roberta( + flat_input_ids, + position_ids=flat_position_ids, + token_type_ids=flat_token_type_ids, + attention_mask=flat_attention_mask, + head_mask=head_mask, + inputs_embeds=flat_inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + reshaped_logits = logits.view(-1, num_choices) + + loss = None + if labels is not None: + # move labels to correct device to enable model parallelism + labels = labels.to(reshaped_logits.device) + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + + if not return_dict: + output = (reshaped_logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return MultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + CamemBERT Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. + for Named-Entity-Recognition (NER) tasks. + """, + CAMEMBERT_START_DOCSTRING, +) +# Copied from transformers.models.roberta.modeling_roberta.RobertaForTokenClassification with Roberta->Camembert, ROBERTA->CAMEMBERT +class CamembertForTokenClassification(CamembertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.roberta = CamembertModel(config, add_pooling_layer=False) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = nn.Dropout(classifier_dropout) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint="Jean-Baptiste/roberta-large-ner-english", + output_type=TokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + expected_output="['O', 'ORG', 'ORG', 'O', 'O', 'O', 'O', 'O', 'LOC', 'O', 'LOC', 'LOC']", + expected_loss=0.01, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.roberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + # move labels to correct device to enable model parallelism + labels = labels.to(logits.device) + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + CamemBERT Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layers on top of the hidden-states output to compute `span start logits` and `span end logits` + """, + CAMEMBERT_START_DOCSTRING, +) +# Copied from transformers.models.roberta.modeling_roberta.RobertaForQuestionAnswering with Roberta->Camembert, ROBERTA->CAMEMBERT +class CamembertForQuestionAnswering(CamembertPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.roberta = CamembertModel(config, add_pooling_layer=False) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint="deepset/roberta-base-squad2", + output_type=QuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + expected_output="' puppet'", + expected_loss=0.86, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + start_positions: Optional[torch.LongTensor] = None, + end_positions: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple[torch.Tensor], QuestionAnsweringModelOutput]: + r""" + start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.roberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1).contiguous() + end_logits = end_logits.squeeze(-1).contiguous() + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions = start_positions.clamp(0, ignored_index) + end_positions = end_positions.clamp(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """CamemBERT Model with a `language modeling` head on top for CLM fine-tuning.""", CAMEMBERT_START_DOCSTRING +) +# Copied from transformers.models.roberta.modeling_roberta.RobertaForCausalLM with Roberta->Camembert, ROBERTA->CAMEMBERT, FacebookAI/roberta-base->almanach/camembert-base +class CamembertForCausalLM(CamembertPreTrainedModel, GenerationMixin): + _tied_weights_keys = ["lm_head.decoder.weight", "lm_head.decoder.bias"] + + def __init__(self, config): + super().__init__(config) + + if not config.is_decoder: + logger.warning("If you want to use `CamembertLMHeadModel` as a standalone, add `is_decoder=True.`") + + self.roberta = CamembertModel(config, add_pooling_layer=False) + self.lm_head = CamembertLMHead(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_output_embeddings(self): + return self.lm_head.decoder + + def set_output_embeddings(self, new_embeddings): + self.lm_head.decoder = new_embeddings + + @add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + past_key_values: Tuple[Tuple[torch.FloatTensor]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, + ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]: + r""" + encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in + `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are + ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`): + Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, CamembertForCausalLM, AutoConfig + >>> import torch + + >>> tokenizer = AutoTokenizer.from_pretrained("almanach/camembert-base") + >>> config = AutoConfig.from_pretrained("almanach/camembert-base") + >>> config.is_decoder = True + >>> model = CamembertForCausalLM.from_pretrained("almanach/camembert-base", config=config) + + >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt") + >>> outputs = model(**inputs) + + >>> prediction_logits = outputs.logits + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + if labels is not None: + use_cache = False + + outputs = self.roberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + prediction_scores = self.lm_head(sequence_output) + + lm_loss = None + if labels is not None: + # move labels to correct device to enable model parallelism + labels = labels.to(prediction_scores.device) + lm_loss = self.loss_function( + prediction_scores, + labels, + vocab_size=self.config.vocab_size, + **kwargs, + ) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((lm_loss,) + output) if lm_loss is not None else output + + return CausalLMOutputWithCrossAttentions( + loss=lm_loss, + logits=prediction_scores, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + def _reorder_cache(self, past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past + + +# Copied from transformers.models.roberta.modeling_roberta.create_position_ids_from_input_ids +def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0): + """ + Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols + are ignored. This is modified from fairseq's `utils.make_positions`. + + Args: + x: torch.Tensor x: + + Returns: torch.Tensor + """ + # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA. + mask = input_ids.ne(padding_idx).int() + incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask + return incremental_indices.long() + padding_idx + + +__all__ = [ + "CamembertForCausalLM", + "CamembertForMaskedLM", + "CamembertForMultipleChoice", + "CamembertForQuestionAnswering", + "CamembertForSequenceClassification", + "CamembertForTokenClassification", + "CamembertModel", + "CamembertPreTrainedModel", +] diff --git a/docs/transformers/build/lib/transformers/models/camembert/modeling_tf_camembert.py b/docs/transformers/build/lib/transformers/models/camembert/modeling_tf_camembert.py new file mode 100644 index 0000000000000000000000000000000000000000..6f456723dea54aafe13a545ea85273835e9605b4 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/camembert/modeling_tf_camembert.py @@ -0,0 +1,1801 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""TF 2.0 CamemBERT model.""" + +from __future__ import annotations + +import math +import warnings +from typing import Optional, Tuple, Union + +import numpy as np +import tensorflow as tf + +from ...activations_tf import get_tf_activation +from ...modeling_tf_outputs import ( + TFBaseModelOutputWithPastAndCrossAttentions, + TFBaseModelOutputWithPoolingAndCrossAttentions, + TFCausalLMOutputWithCrossAttentions, + TFMaskedLMOutput, + TFMultipleChoiceModelOutput, + TFQuestionAnsweringModelOutput, + TFSequenceClassifierOutput, + TFTokenClassifierOutput, +) +from ...modeling_tf_utils import ( + TFCausalLanguageModelingLoss, + TFMaskedLanguageModelingLoss, + TFModelInputType, + TFMultipleChoiceLoss, + TFPreTrainedModel, + TFQuestionAnsweringLoss, + TFSequenceClassificationLoss, + TFTokenClassificationLoss, + get_initializer, + keras, + keras_serializable, + unpack_inputs, +) +from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax +from ...utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, +) +from .configuration_camembert import CamembertConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "almanach/camembert-base" +_CONFIG_FOR_DOC = "CamembertConfig" + + +CAMEMBERT_START_DOCSTRING = r""" + + This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the + library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads + etc.) + + This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it + as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and + behavior. + + + + TensorFlow models and layers in `transformers` accept two formats as input: + + - having all inputs as keyword arguments (like PyTorch models), or + - having all inputs as a list, tuple or dict in the first positional argument. + + The reason the second format is supported is that Keras methods prefer this format when passing inputs to models + and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just + pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second + format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with + the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first + positional argument: + + - a single Tensor with `input_ids` only and nothing else: `model(input_ids)` + - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: + `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])` + - a dictionary with one or several input Tensors associated to the input names given in the docstring: + `model({"input_ids": input_ids, "token_type_ids": token_type_ids})` + + Note that when creating models and layers with + [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry + about any of this, as you can just pass inputs like you would to any other Python function! + + + + Parameters: + config ([`CamembertConfig`]): Model configuration class with all the parameters of the + model. Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +CAMEMBERT_INPUTS_DOCSTRING = r""" + Args: + input_ids (`Numpy array` or `tf.Tensor` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and + [`PreTrainedTokenizer.encode`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + token_type_ids (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + position_ids (`Numpy array` or `tf.Tensor` of shape `({0})`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + head_mask (`Numpy array` or `tf.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`tf.Tensor` of shape `({0}, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert `input_ids` indices into associated vectors than the + model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the + config will be used instead. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. This argument can be used only in eager mode, in graph mode the value in the config will be + used instead. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in + eager mode, in graph mode the value will always be set to True. + training (`bool`, *optional*, defaults to `False`): + Whether or not to use the model in training mode (some modules like dropout modules have different + behaviors between training and evaluation). +""" + + +# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaEmbeddings +class TFCamembertEmbeddings(keras.layers.Layer): + """ + Same as BertEmbeddings with a tiny tweak for positional embeddings indexing. + """ + + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + + self.padding_idx = 1 + self.config = config + self.hidden_size = config.hidden_size + self.max_position_embeddings = config.max_position_embeddings + self.initializer_range = config.initializer_range + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) + + def build(self, input_shape=None): + with tf.name_scope("word_embeddings"): + self.weight = self.add_weight( + name="weight", + shape=[self.config.vocab_size, self.hidden_size], + initializer=get_initializer(self.initializer_range), + ) + + with tf.name_scope("token_type_embeddings"): + self.token_type_embeddings = self.add_weight( + name="embeddings", + shape=[self.config.type_vocab_size, self.hidden_size], + initializer=get_initializer(self.initializer_range), + ) + + with tf.name_scope("position_embeddings"): + self.position_embeddings = self.add_weight( + name="embeddings", + shape=[self.max_position_embeddings, self.hidden_size], + initializer=get_initializer(self.initializer_range), + ) + + if self.built: + return + self.built = True + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) + + def create_position_ids_from_input_ids(self, input_ids, past_key_values_length=0): + """ + Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding + symbols are ignored. This is modified from fairseq's `utils.make_positions`. + + Args: + input_ids: tf.Tensor + Returns: tf.Tensor + """ + mask = tf.cast(tf.math.not_equal(input_ids, self.padding_idx), dtype=input_ids.dtype) + incremental_indices = (tf.math.cumsum(mask, axis=1) + past_key_values_length) * mask + + return incremental_indices + self.padding_idx + + def call( + self, + input_ids=None, + position_ids=None, + token_type_ids=None, + inputs_embeds=None, + past_key_values_length=0, + training=False, + ): + """ + Applies embedding based on inputs tensor. + + Returns: + final_embeddings (`tf.Tensor`): output embedding tensor. + """ + assert not (input_ids is None and inputs_embeds is None) + + if input_ids is not None: + check_embeddings_within_bounds(input_ids, self.config.vocab_size) + inputs_embeds = tf.gather(params=self.weight, indices=input_ids) + + input_shape = shape_list(inputs_embeds)[:-1] + + if token_type_ids is None: + token_type_ids = tf.fill(dims=input_shape, value=0) + + if position_ids is None: + if input_ids is not None: + # Create the position ids from the input token ids. Any padded tokens remain padded. + position_ids = self.create_position_ids_from_input_ids( + input_ids=input_ids, past_key_values_length=past_key_values_length + ) + else: + position_ids = tf.expand_dims( + tf.range(start=self.padding_idx + 1, limit=input_shape[-1] + self.padding_idx + 1), axis=0 + ) + + position_embeds = tf.gather(params=self.position_embeddings, indices=position_ids) + token_type_embeds = tf.gather(params=self.token_type_embeddings, indices=token_type_ids) + final_embeddings = inputs_embeds + position_embeds + token_type_embeds + final_embeddings = self.LayerNorm(inputs=final_embeddings) + final_embeddings = self.dropout(inputs=final_embeddings, training=training) + + return final_embeddings + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->Camembert +class TFCamembertPooler(keras.layers.Layer): + def __init__(self, config: CamembertConfig, **kwargs): + super().__init__(**kwargs) + + self.dense = keras.layers.Dense( + units=config.hidden_size, + kernel_initializer=get_initializer(config.initializer_range), + activation="tanh", + name="dense", + ) + self.config = config + + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(inputs=first_token_tensor) + + return pooled_output + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.hidden_size]) + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->Camembert +class TFCamembertSelfAttention(keras.layers.Layer): + def __init__(self, config: CamembertConfig, **kwargs): + super().__init__(**kwargs) + + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number " + f"of attention heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.sqrt_att_head_size = math.sqrt(self.attention_head_size) + + self.query = keras.layers.Dense( + units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query" + ) + self.key = keras.layers.Dense( + units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key" + ) + self.value = keras.layers.Dense( + units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value" + ) + self.dropout = keras.layers.Dropout(rate=config.attention_probs_dropout_prob) + + self.is_decoder = config.is_decoder + self.config = config + + def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor: + # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size] + tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.num_attention_heads, self.attention_head_size)) + + # Transpose the tensor from [batch_size, seq_length, num_attention_heads, attention_head_size] to [batch_size, num_attention_heads, seq_length, attention_head_size] + return tf.transpose(tensor, perm=[0, 2, 1, 3]) + + def call( + self, + hidden_states: tf.Tensor, + attention_mask: tf.Tensor, + head_mask: tf.Tensor, + encoder_hidden_states: tf.Tensor, + encoder_attention_mask: tf.Tensor, + past_key_value: Tuple[tf.Tensor], + output_attentions: bool, + training: bool = False, + ) -> Tuple[tf.Tensor]: + batch_size = shape_list(hidden_states)[0] + mixed_query_layer = self.query(inputs=hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores(self.key(inputs=encoder_hidden_states), batch_size) + value_layer = self.transpose_for_scores(self.value(inputs=encoder_hidden_states), batch_size) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(inputs=hidden_states), batch_size) + value_layer = self.transpose_for_scores(self.value(inputs=hidden_states), batch_size) + key_layer = tf.concat([past_key_value[0], key_layer], axis=2) + value_layer = tf.concat([past_key_value[1], value_layer], axis=2) + else: + key_layer = self.transpose_for_scores(self.key(inputs=hidden_states), batch_size) + value_layer = self.transpose_for_scores(self.value(inputs=hidden_states), batch_size) + + query_layer = self.transpose_for_scores(mixed_query_layer, batch_size) + + if self.is_decoder: + # if cross_attention save Tuple(tf.Tensor, tf.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(tf.Tensor, tf.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + # (batch size, num_heads, seq_len_q, seq_len_k) + attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True) + dk = tf.cast(self.sqrt_att_head_size, dtype=attention_scores.dtype) + attention_scores = tf.divide(attention_scores, dk) + + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in TFCamembertModel call() function) + attention_scores = tf.add(attention_scores, attention_mask) + + # Normalize the attention scores to probabilities. + attention_probs = stable_softmax(logits=attention_scores, axis=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(inputs=attention_probs, training=training) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = tf.multiply(attention_probs, head_mask) + + attention_output = tf.matmul(attention_probs, value_layer) + attention_output = tf.transpose(attention_output, perm=[0, 2, 1, 3]) + + # (batch_size, seq_len_q, all_head_size) + attention_output = tf.reshape(tensor=attention_output, shape=(batch_size, -1, self.all_head_size)) + outputs = (attention_output, attention_probs) if output_attentions else (attention_output,) + + if self.is_decoder: + outputs = outputs + (past_key_value,) + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "query", None) is not None: + with tf.name_scope(self.query.name): + self.query.build([None, None, self.config.hidden_size]) + if getattr(self, "key", None) is not None: + with tf.name_scope(self.key.name): + self.key.build([None, None, self.config.hidden_size]) + if getattr(self, "value", None) is not None: + with tf.name_scope(self.value.name): + self.value.build([None, None, self.config.hidden_size]) + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->Camembert +class TFCamembertSelfOutput(keras.layers.Layer): + def __init__(self, config: CamembertConfig, **kwargs): + super().__init__(**kwargs) + + self.dense = keras.layers.Dense( + units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.config = config + + def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.dropout(inputs=hidden_states, training=training) + hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) + + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.hidden_size]) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->Camembert +class TFCamembertAttention(keras.layers.Layer): + def __init__(self, config: CamembertConfig, **kwargs): + super().__init__(**kwargs) + + self.self_attention = TFCamembertSelfAttention(config, name="self") + self.dense_output = TFCamembertSelfOutput(config, name="output") + + def prune_heads(self, heads): + raise NotImplementedError + + def call( + self, + input_tensor: tf.Tensor, + attention_mask: tf.Tensor, + head_mask: tf.Tensor, + encoder_hidden_states: tf.Tensor, + encoder_attention_mask: tf.Tensor, + past_key_value: Tuple[tf.Tensor], + output_attentions: bool, + training: bool = False, + ) -> Tuple[tf.Tensor]: + self_outputs = self.self_attention( + hidden_states=input_tensor, + attention_mask=attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_value=past_key_value, + output_attentions=output_attentions, + training=training, + ) + attention_output = self.dense_output( + hidden_states=self_outputs[0], input_tensor=input_tensor, training=training + ) + # add attentions (possibly with past_key_value) if we output them + outputs = (attention_output,) + self_outputs[1:] + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "self_attention", None) is not None: + with tf.name_scope(self.self_attention.name): + self.self_attention.build(None) + if getattr(self, "dense_output", None) is not None: + with tf.name_scope(self.dense_output.name): + self.dense_output.build(None) + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->Camembert +class TFCamembertIntermediate(keras.layers.Layer): + def __init__(self, config: CamembertConfig, **kwargs): + super().__init__(**kwargs) + + self.dense = keras.layers.Dense( + units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = get_tf_activation(config.hidden_act) + else: + self.intermediate_act_fn = config.hidden_act + self.config = config + + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.hidden_size]) + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->Camembert +class TFCamembertOutput(keras.layers.Layer): + def __init__(self, config: CamembertConfig, **kwargs): + super().__init__(**kwargs) + + self.dense = keras.layers.Dense( + units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob) + self.config = config + + def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.dropout(inputs=hidden_states, training=training) + hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) + + return hidden_states + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.intermediate_size]) + if getattr(self, "LayerNorm", None) is not None: + with tf.name_scope(self.LayerNorm.name): + self.LayerNorm.build([None, None, self.config.hidden_size]) + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->Camembert +class TFCamembertLayer(keras.layers.Layer): + def __init__(self, config: CamembertConfig, **kwargs): + super().__init__(**kwargs) + + self.attention = TFCamembertAttention(config, name="attention") + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + if not self.is_decoder: + raise ValueError(f"{self} should be used as a decoder model if cross attention is added") + self.crossattention = TFCamembertAttention(config, name="crossattention") + self.intermediate = TFCamembertIntermediate(config, name="intermediate") + self.bert_output = TFCamembertOutput(config, name="output") + + def call( + self, + hidden_states: tf.Tensor, + attention_mask: tf.Tensor, + head_mask: tf.Tensor, + encoder_hidden_states: tf.Tensor | None, + encoder_attention_mask: tf.Tensor | None, + past_key_value: Tuple[tf.Tensor] | None, + output_attentions: bool, + training: bool = False, + ) -> Tuple[tf.Tensor]: + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + input_tensor=hidden_states, + attention_mask=attention_mask, + head_mask=head_mask, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=self_attn_past_key_value, + output_attentions=output_attentions, + training=training, + ) + attention_output = self_attention_outputs[0] + + # if decoder, the last output is tuple of self-attn cache + if self.is_decoder: + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + else: + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + cross_attn_present_key_value = None + if self.is_decoder and encoder_hidden_states is not None: + if not hasattr(self, "crossattention"): + raise ValueError( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers" + " by setting `config.add_cross_attention=True`" + ) + + # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + cross_attention_outputs = self.crossattention( + input_tensor=attention_output, + attention_mask=attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_value=cross_attn_past_key_value, + output_attentions=output_attentions, + training=training, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights + + # add cross-attn cache to positions 3,4 of present_key_value tuple + cross_attn_present_key_value = cross_attention_outputs[-1] + present_key_value = present_key_value + cross_attn_present_key_value + + intermediate_output = self.intermediate(hidden_states=attention_output) + layer_output = self.bert_output( + hidden_states=intermediate_output, input_tensor=attention_output, training=training + ) + outputs = (layer_output,) + outputs # add attentions if we output them + + # if decoder, return the attn key/values as the last output + if self.is_decoder: + outputs = outputs + (present_key_value,) + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "attention", None) is not None: + with tf.name_scope(self.attention.name): + self.attention.build(None) + if getattr(self, "intermediate", None) is not None: + with tf.name_scope(self.intermediate.name): + self.intermediate.build(None) + if getattr(self, "bert_output", None) is not None: + with tf.name_scope(self.bert_output.name): + self.bert_output.build(None) + if getattr(self, "crossattention", None) is not None: + with tf.name_scope(self.crossattention.name): + self.crossattention.build(None) + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertEncoder with Bert->Camembert +class TFCamembertEncoder(keras.layers.Layer): + def __init__(self, config: CamembertConfig, **kwargs): + super().__init__(**kwargs) + self.config = config + self.layer = [TFCamembertLayer(config, name=f"layer_._{i}") for i in range(config.num_hidden_layers)] + + def call( + self, + hidden_states: tf.Tensor, + attention_mask: tf.Tensor, + head_mask: tf.Tensor, + encoder_hidden_states: tf.Tensor | None, + encoder_attention_mask: tf.Tensor | None, + past_key_values: Tuple[Tuple[tf.Tensor]] | None, + use_cache: Optional[bool], + output_attentions: bool, + output_hidden_states: bool, + return_dict: bool, + training: bool = False, + ) -> Union[TFBaseModelOutputWithPastAndCrossAttentions, Tuple[tf.Tensor]]: + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + + next_decoder_cache = () if use_cache else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + past_key_value = past_key_values[i] if past_key_values is not None else None + + layer_outputs = layer_module( + hidden_states=hidden_states, + attention_mask=attention_mask, + head_mask=head_mask[i], + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_value=past_key_value, + output_attentions=output_attentions, + training=training, + ) + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + if self.config.add_cross_attention and encoder_hidden_states is not None: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + # Add last layer + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v for v in [hidden_states, all_hidden_states, all_attentions, all_cross_attentions] if v is not None + ) + + return TFBaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_attentions, + cross_attentions=all_cross_attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "layer", None) is not None: + for layer in self.layer: + with tf.name_scope(layer.name): + layer.build(None) + + +@keras_serializable +# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaMainLayer with Roberta->Camembert +class TFCamembertMainLayer(keras.layers.Layer): + config_class = CamembertConfig + + def __init__(self, config, add_pooling_layer=True, **kwargs): + super().__init__(**kwargs) + + self.config = config + self.is_decoder = config.is_decoder + + self.num_hidden_layers = config.num_hidden_layers + self.initializer_range = config.initializer_range + self.output_attentions = config.output_attentions + self.output_hidden_states = config.output_hidden_states + self.return_dict = config.use_return_dict + self.encoder = TFCamembertEncoder(config, name="encoder") + self.pooler = TFCamembertPooler(config, name="pooler") if add_pooling_layer else None + # The embeddings must be the last declaration in order to follow the weights order + self.embeddings = TFCamembertEmbeddings(config, name="embeddings") + + # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.get_input_embeddings + def get_input_embeddings(self) -> keras.layers.Layer: + return self.embeddings + + # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.set_input_embeddings + def set_input_embeddings(self, value: tf.Variable): + self.embeddings.weight = value + self.embeddings.vocab_size = shape_list(value)[0] + + # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer._prune_heads + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + raise NotImplementedError + + @unpack_inputs + # Copied from transformers.models.bert.modeling_tf_bert.TFBertMainLayer.call + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + encoder_hidden_states: np.ndarray | tf.Tensor | None = None, + encoder_attention_mask: np.ndarray | tf.Tensor | None = None, + past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: bool = False, + ) -> Union[TFBaseModelOutputWithPoolingAndCrossAttentions, Tuple[tf.Tensor]]: + if not self.config.is_decoder: + use_cache = False + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + input_shape = shape_list(input_ids) + elif inputs_embeds is not None: + input_shape = shape_list(inputs_embeds)[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + batch_size, seq_length = input_shape + + if past_key_values is None: + past_key_values_length = 0 + past_key_values = [None] * len(self.encoder.layer) + else: + past_key_values_length = shape_list(past_key_values[0][0])[-2] + + if attention_mask is None: + attention_mask = tf.fill(dims=(batch_size, seq_length + past_key_values_length), value=1) + + if token_type_ids is None: + token_type_ids = tf.fill(dims=input_shape, value=0) + + embedding_output = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + past_key_values_length=past_key_values_length, + training=training, + ) + + # We create a 3D attention mask from a 2D tensor mask. + # Sizes are [batch_size, 1, 1, to_seq_length] + # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] + # this attention mask is more simple than the triangular masking of causal attention + # used in OpenAI GPT, we just need to prepare the broadcast dimension here. + attention_mask_shape = shape_list(attention_mask) + + mask_seq_length = seq_length + past_key_values_length + # Copied from `modeling_tf_t5.py` + # Provided a padding mask of dimensions [batch_size, mask_seq_length] + # - if the model is a decoder, apply a causal mask in addition to the padding mask + # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, mask_seq_length, mask_seq_length] + if self.is_decoder: + seq_ids = tf.range(mask_seq_length) + causal_mask = tf.less_equal( + tf.tile(seq_ids[None, None, :], (batch_size, mask_seq_length, 1)), + seq_ids[None, :, None], + ) + causal_mask = tf.cast(causal_mask, dtype=attention_mask.dtype) + extended_attention_mask = causal_mask * attention_mask[:, None, :] + attention_mask_shape = shape_list(extended_attention_mask) + extended_attention_mask = tf.reshape( + extended_attention_mask, (attention_mask_shape[0], 1, attention_mask_shape[1], attention_mask_shape[2]) + ) + if past_key_values[0] is not None: + # attention_mask needs to be sliced to the shape `[batch_size, 1, from_seq_length - cached_seq_length, to_seq_length] + extended_attention_mask = extended_attention_mask[:, :, -seq_length:, :] + else: + extended_attention_mask = tf.reshape( + attention_mask, (attention_mask_shape[0], 1, 1, attention_mask_shape[1]) + ) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + extended_attention_mask = tf.cast(extended_attention_mask, dtype=embedding_output.dtype) + one_cst = tf.constant(1.0, dtype=embedding_output.dtype) + ten_thousand_cst = tf.constant(-10000.0, dtype=embedding_output.dtype) + extended_attention_mask = tf.multiply(tf.subtract(one_cst, extended_attention_mask), ten_thousand_cst) + + # Copied from `modeling_tf_t5.py` with -1e9 -> -10000 + if self.is_decoder and encoder_attention_mask is not None: + # If a 2D ou 3D attention mask is provided for the cross-attention + # we need to make broadcastable to [batch_size, num_heads, mask_seq_length, mask_seq_length] + # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length] + encoder_attention_mask = tf.cast(encoder_attention_mask, dtype=extended_attention_mask.dtype) + num_dims_encoder_attention_mask = len(shape_list(encoder_attention_mask)) + if num_dims_encoder_attention_mask == 3: + encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :] + if num_dims_encoder_attention_mask == 2: + encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :] + + # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition + # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow/transformer/transformer_layers.py#L270 + # encoder_extended_attention_mask = tf.math.equal(encoder_extended_attention_mask, + # tf.transpose(encoder_extended_attention_mask, perm=(-1, -2))) + + encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -10000.0 + else: + encoder_extended_attention_mask = None + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + if head_mask is not None: + raise NotImplementedError + else: + head_mask = [None] * self.config.num_hidden_layers + + encoder_outputs = self.encoder( + hidden_states=embedding_output, + attention_mask=extended_attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_extended_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(hidden_states=sequence_output) if self.pooler is not None else None + + if not return_dict: + return ( + sequence_output, + pooled_output, + ) + encoder_outputs[1:] + + return TFBaseModelOutputWithPoolingAndCrossAttentions( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + past_key_values=encoder_outputs.past_key_values, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + cross_attentions=encoder_outputs.cross_attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "encoder", None) is not None: + with tf.name_scope(self.encoder.name): + self.encoder.build(None) + if getattr(self, "pooler", None) is not None: + with tf.name_scope(self.pooler.name): + self.pooler.build(None) + if getattr(self, "embeddings", None) is not None: + with tf.name_scope(self.embeddings.name): + self.embeddings.build(None) + + +class TFCamembertPreTrainedModel(TFPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = CamembertConfig + base_model_prefix = "roberta" + + +@add_start_docstrings( + "The bare CamemBERT Model transformer outputting raw hidden-states without any specific head on top.", + CAMEMBERT_START_DOCSTRING, +) +# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaModel with Roberta->Camembert, ROBERTA->CAMEMBERT +class TFCamembertModel(TFCamembertPreTrainedModel): + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.roberta = TFCamembertMainLayer(config, name="roberta") + + @unpack_inputs + @add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFBaseModelOutputWithPoolingAndCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + encoder_hidden_states: np.ndarray | tf.Tensor | None = None, + encoder_attention_mask: np.ndarray | tf.Tensor | None = None, + past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: Optional[bool] = False, + ) -> Union[Tuple, TFBaseModelOutputWithPoolingAndCrossAttentions]: + r""" + encoder_hidden_states (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + past_key_values (`Tuple[Tuple[tf.Tensor]]` of length `config.n_layers`) + contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*, defaults to `True`): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). Set to `False` during training, `True` during generation + """ + outputs = self.roberta( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + return outputs + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "roberta", None) is not None: + with tf.name_scope(self.roberta.name): + self.roberta.build(None) + + +# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaLMHead with Roberta->Camembert +class TFCamembertLMHead(keras.layers.Layer): + """Camembert Head for masked language modeling.""" + + def __init__(self, config, input_embeddings, **kwargs): + super().__init__(**kwargs) + + self.config = config + self.hidden_size = config.hidden_size + self.dense = keras.layers.Dense( + config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + self.layer_norm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm") + self.act = get_tf_activation("gelu") + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.decoder = input_embeddings + + def build(self, input_shape=None): + self.bias = self.add_weight(shape=(self.config.vocab_size,), initializer="zeros", trainable=True, name="bias") + + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.hidden_size]) + if getattr(self, "layer_norm", None) is not None: + with tf.name_scope(self.layer_norm.name): + self.layer_norm.build([None, None, self.config.hidden_size]) + + def get_output_embeddings(self): + return self.decoder + + def set_output_embeddings(self, value): + self.decoder.weight = value + self.decoder.vocab_size = shape_list(value)[0] + + def get_bias(self): + return {"bias": self.bias} + + def set_bias(self, value): + self.bias = value["bias"] + self.config.vocab_size = shape_list(value["bias"])[0] + + def call(self, hidden_states): + hidden_states = self.dense(hidden_states) + hidden_states = self.act(hidden_states) + hidden_states = self.layer_norm(hidden_states) + + # project back to size of vocabulary with bias + seq_length = shape_list(tensor=hidden_states)[1] + hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size]) + hidden_states = tf.matmul(a=hidden_states, b=self.decoder.weight, transpose_b=True) + hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.config.vocab_size]) + hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias) + + return hidden_states + + +@add_start_docstrings( + """CamemBERT Model with a `language modeling` head on top.""", + CAMEMBERT_START_DOCSTRING, +) +# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaForMaskedLM with Roberta->Camembert, ROBERTA->CAMEMBERT +class TFCamembertForMaskedLM(TFCamembertPreTrainedModel, TFMaskedLanguageModelingLoss): + # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model + _keys_to_ignore_on_load_unexpected = [r"pooler", r"lm_head.decoder.weight"] + + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.roberta = TFCamembertMainLayer(config, add_pooling_layer=False, name="roberta") + self.lm_head = TFCamembertLMHead(config, self.roberta.embeddings, name="lm_head") + + def get_lm_head(self): + return self.lm_head + + def get_prefix_bias_name(self): + warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning) + return self.name + "/" + self.lm_head.name + + @unpack_inputs + @add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFMaskedLMOutput, + config_class=_CONFIG_FOR_DOC, + mask="", + expected_output="' Paris'", + expected_loss=0.1, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: np.ndarray | tf.Tensor | None = None, + training: Optional[bool] = False, + ) -> Union[TFMaskedLMOutput, Tuple[tf.Tensor]]: + r""" + labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ..., + config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the + loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]` + """ + outputs = self.roberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + sequence_output = outputs[0] + prediction_scores = self.lm_head(sequence_output) + + loss = None if labels is None else self.hf_compute_loss(labels, prediction_scores) + + if not return_dict: + output = (prediction_scores,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TFMaskedLMOutput( + loss=loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "roberta", None) is not None: + with tf.name_scope(self.roberta.name): + self.roberta.build(None) + if getattr(self, "lm_head", None) is not None: + with tf.name_scope(self.lm_head.name): + self.lm_head.build(None) + + +# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaClassificationHead +class TFCamembertClassificationHead(keras.layers.Layer): + """Head for sentence-level classification tasks.""" + + def __init__(self, config, **kwargs): + super().__init__(**kwargs) + self.dense = keras.layers.Dense( + config.hidden_size, + kernel_initializer=get_initializer(config.initializer_range), + activation="tanh", + name="dense", + ) + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = keras.layers.Dropout(classifier_dropout) + self.out_proj = keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="out_proj" + ) + self.config = config + + def call(self, features, training=False): + x = features[:, 0, :] # take token (equiv. to [CLS]) + x = self.dropout(x, training=training) + x = self.dense(x) + x = self.dropout(x, training=training) + x = self.out_proj(x) + return x + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "dense", None) is not None: + with tf.name_scope(self.dense.name): + self.dense.build([None, None, self.config.hidden_size]) + if getattr(self, "out_proj", None) is not None: + with tf.name_scope(self.out_proj.name): + self.out_proj.build([None, None, self.config.hidden_size]) + + +@add_start_docstrings( + """ + CamemBERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the + pooled output) e.g. for GLUE tasks. + """, + CAMEMBERT_START_DOCSTRING, +) +# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaForSequenceClassification with Roberta->Camembert, ROBERTA->CAMEMBERT +class TFCamembertForSequenceClassification(TFCamembertPreTrainedModel, TFSequenceClassificationLoss): + # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model + _keys_to_ignore_on_load_unexpected = [r"pooler", r"lm_head"] + + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.num_labels = config.num_labels + + self.roberta = TFCamembertMainLayer(config, add_pooling_layer=False, name="roberta") + self.classifier = TFCamembertClassificationHead(config, name="classifier") + + @unpack_inputs + @add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint="cardiffnlp/twitter-roberta-base-emotion", + output_type=TFSequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + expected_output="'optimism'", + expected_loss=0.08, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: np.ndarray | tf.Tensor | None = None, + training: Optional[bool] = False, + ) -> Union[TFSequenceClassifierOutput, Tuple[tf.Tensor]]: + r""" + labels (`tf.Tensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + outputs = self.roberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + sequence_output = outputs[0] + logits = self.classifier(sequence_output, training=training) + + loss = None if labels is None else self.hf_compute_loss(labels, logits) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TFSequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "roberta", None) is not None: + with tf.name_scope(self.roberta.name): + self.roberta.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build(None) + + +@add_start_docstrings( + """ + CamemBERT Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. + for Named-Entity-Recognition (NER) tasks. + """, + CAMEMBERT_START_DOCSTRING, +) +# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaForTokenClassification with Roberta->Camembert, ROBERTA->CAMEMBERT +class TFCamembertForTokenClassification(TFCamembertPreTrainedModel, TFTokenClassificationLoss): + # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model + _keys_to_ignore_on_load_unexpected = [r"pooler", r"lm_head"] + _keys_to_ignore_on_load_missing = [r"dropout"] + + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.num_labels = config.num_labels + + self.roberta = TFCamembertMainLayer(config, add_pooling_layer=False, name="roberta") + classifier_dropout = ( + config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob + ) + self.dropout = keras.layers.Dropout(classifier_dropout) + self.classifier = keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + ) + self.config = config + + @unpack_inputs + @add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint="ydshieh/roberta-large-ner-english", + output_type=TFTokenClassifierOutput, + config_class=_CONFIG_FOR_DOC, + expected_output="['O', 'ORG', 'ORG', 'O', 'O', 'O', 'O', 'O', 'LOC', 'O', 'LOC', 'LOC']", + expected_loss=0.01, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: np.ndarray | tf.Tensor | None = None, + training: Optional[bool] = False, + ) -> Union[TFTokenClassifierOutput, Tuple[tf.Tensor]]: + r""" + labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. + """ + outputs = self.roberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output, training=training) + logits = self.classifier(sequence_output) + + loss = None if labels is None else self.hf_compute_loss(labels, logits) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TFTokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "roberta", None) is not None: + with tf.name_scope(self.roberta.name): + self.roberta.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build([None, None, self.config.hidden_size]) + + +@add_start_docstrings( + """ + CamemBERT Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a + softmax) e.g. for RocStories/SWAG tasks. + """, + CAMEMBERT_START_DOCSTRING, +) +# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaForMultipleChoice with Roberta->Camembert, ROBERTA->CAMEMBERT +class TFCamembertForMultipleChoice(TFCamembertPreTrainedModel, TFMultipleChoiceLoss): + # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model + _keys_to_ignore_on_load_unexpected = [r"lm_head"] + _keys_to_ignore_on_load_missing = [r"dropout"] + + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.roberta = TFCamembertMainLayer(config, name="roberta") + self.dropout = keras.layers.Dropout(config.hidden_dropout_prob) + self.classifier = keras.layers.Dense( + 1, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + ) + self.config = config + + @unpack_inputs + @add_start_docstrings_to_model_forward( + CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length") + ) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFMultipleChoiceModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: np.ndarray | tf.Tensor | None = None, + training: Optional[bool] = False, + ) -> Union[TFMultipleChoiceModelOutput, Tuple[tf.Tensor]]: + r""" + labels (`tf.Tensor` of shape `(batch_size,)`, *optional*): + Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., num_choices]` + where `num_choices` is the size of the second dimension of the input tensors. (See `input_ids` above) + """ + + if input_ids is not None: + num_choices = shape_list(input_ids)[1] + seq_length = shape_list(input_ids)[2] + else: + num_choices = shape_list(inputs_embeds)[1] + seq_length = shape_list(inputs_embeds)[2] + + flat_input_ids = tf.reshape(input_ids, (-1, seq_length)) if input_ids is not None else None + flat_attention_mask = tf.reshape(attention_mask, (-1, seq_length)) if attention_mask is not None else None + flat_token_type_ids = tf.reshape(token_type_ids, (-1, seq_length)) if token_type_ids is not None else None + flat_position_ids = tf.reshape(position_ids, (-1, seq_length)) if position_ids is not None else None + outputs = self.roberta( + flat_input_ids, + flat_attention_mask, + flat_token_type_ids, + flat_position_ids, + head_mask, + inputs_embeds, + output_attentions, + output_hidden_states, + return_dict=return_dict, + training=training, + ) + pooled_output = outputs[1] + pooled_output = self.dropout(pooled_output, training=training) + logits = self.classifier(pooled_output) + reshaped_logits = tf.reshape(logits, (-1, num_choices)) + + loss = None if labels is None else self.hf_compute_loss(labels, reshaped_logits) + + if not return_dict: + output = (reshaped_logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TFMultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "roberta", None) is not None: + with tf.name_scope(self.roberta.name): + self.roberta.build(None) + if getattr(self, "classifier", None) is not None: + with tf.name_scope(self.classifier.name): + self.classifier.build([None, None, self.config.hidden_size]) + + +@add_start_docstrings( + """ + CamemBERT Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layers on top of the hidden-states output to compute `span start logits` and `span end logits`). + """, + CAMEMBERT_START_DOCSTRING, +) +# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaForQuestionAnswering with Roberta->Camembert, ROBERTA->CAMEMBERT +class TFCamembertForQuestionAnswering(TFCamembertPreTrainedModel, TFQuestionAnsweringLoss): + # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model + _keys_to_ignore_on_load_unexpected = [r"pooler", r"lm_head"] + + def __init__(self, config, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.num_labels = config.num_labels + + self.roberta = TFCamembertMainLayer(config, add_pooling_layer=False, name="roberta") + self.qa_outputs = keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="qa_outputs" + ) + self.config = config + + @unpack_inputs + @add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint="ydshieh/roberta-base-squad2", + output_type=TFQuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + expected_output="' puppet'", + expected_loss=0.86, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + start_positions: np.ndarray | tf.Tensor | None = None, + end_positions: np.ndarray | tf.Tensor | None = None, + training: Optional[bool] = False, + ) -> Union[TFQuestionAnsweringModelOutput, Tuple[tf.Tensor]]: + r""" + start_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (`tf.Tensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + """ + outputs = self.roberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = tf.split(logits, 2, axis=-1) + start_logits = tf.squeeze(start_logits, axis=-1) + end_logits = tf.squeeze(end_logits, axis=-1) + + loss = None + if start_positions is not None and end_positions is not None: + labels = {"start_position": start_positions} + labels["end_position"] = end_positions + loss = self.hf_compute_loss(labels, (start_logits, end_logits)) + + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TFQuestionAnsweringModelOutput( + loss=loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "roberta", None) is not None: + with tf.name_scope(self.roberta.name): + self.roberta.build(None) + if getattr(self, "qa_outputs", None) is not None: + with tf.name_scope(self.qa_outputs.name): + self.qa_outputs.build([None, None, self.config.hidden_size]) + + +@add_start_docstrings( + """CamemBERT Model with a `language modeling` head on top for CLM fine-tuning.""", CAMEMBERT_START_DOCSTRING +) +# Copied from transformers.models.roberta.modeling_tf_roberta.TFRobertaForCausalLM with Roberta->Camembert, ROBERTA->CAMEMBERT +class TFCamembertForCausalLM(TFCamembertPreTrainedModel, TFCausalLanguageModelingLoss): + # names with a '.' represents the authorized unexpected/missing layers when a TF model is loaded from a PT model + _keys_to_ignore_on_load_unexpected = [r"pooler", r"lm_head.decoder.weight"] + + def __init__(self, config: CamembertConfig, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + if not config.is_decoder: + logger.warning("If you want to use `TFCamembertLMHeadModel` as a standalone, add `is_decoder=True.`") + + self.roberta = TFCamembertMainLayer(config, add_pooling_layer=False, name="roberta") + self.lm_head = TFCamembertLMHead(config, input_embeddings=self.roberta.embeddings, name="lm_head") + + def get_lm_head(self): + return self.lm_head + + def get_prefix_bias_name(self): + warnings.warn("The method get_prefix_bias_name is deprecated. Please use `get_bias` instead.", FutureWarning) + return self.name + "/" + self.lm_head.name + + # Copied from transformers.models.bert.modeling_tf_bert.TFBertLMHeadModel.prepare_inputs_for_generation + def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs): + input_shape = input_ids.shape + # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly + if attention_mask is None: + attention_mask = tf.ones(input_shape) + + # cut decoder_input_ids if past is used + if past_key_values is not None: + input_ids = input_ids[:, -1:] + + return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past_key_values} + + @unpack_inputs + @add_start_docstrings_to_model_forward(CAMEMBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFCausalLMOutputWithCrossAttentions, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: TFModelInputType | None = None, + attention_mask: np.ndarray | tf.Tensor | None = None, + token_type_ids: np.ndarray | tf.Tensor | None = None, + position_ids: np.ndarray | tf.Tensor | None = None, + head_mask: np.ndarray | tf.Tensor | None = None, + inputs_embeds: np.ndarray | tf.Tensor | None = None, + encoder_hidden_states: np.ndarray | tf.Tensor | None = None, + encoder_attention_mask: np.ndarray | tf.Tensor | None = None, + past_key_values: Optional[Tuple[Tuple[Union[np.ndarray, tf.Tensor]]]] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: np.ndarray | tf.Tensor | None = None, + training: Optional[bool] = False, + ) -> Union[TFCausalLMOutputWithCrossAttentions, Tuple[tf.Tensor]]: + r""" + encoder_hidden_states (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*): + Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if + the model is configured as a decoder. + encoder_attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in + the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + past_key_values (`Tuple[Tuple[tf.Tensor]]` of length `config.n_layers`) + contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding. + If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that + don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all + `decoder_input_ids` of shape `(batch_size, sequence_length)`. + use_cache (`bool`, *optional*, defaults to `True`): + If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see + `past_key_values`). Set to `False` during training, `True` during generation + labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the cross entropy classification loss. Indices should be in `[0, ..., + config.vocab_size - 1]`. + """ + outputs = self.roberta( + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + ) + + sequence_output = outputs[0] + logits = self.lm_head(hidden_states=sequence_output, training=training) + loss = None + + if labels is not None: + # shift labels to the left and cut last logit token + shifted_logits = logits[:, :-1] + labels = labels[:, 1:] + loss = self.hf_compute_loss(labels=labels, logits=shifted_logits) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TFCausalLMOutputWithCrossAttentions( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + cross_attentions=outputs.cross_attentions, + ) + + def build(self, input_shape=None): + if self.built: + return + self.built = True + if getattr(self, "roberta", None) is not None: + with tf.name_scope(self.roberta.name): + self.roberta.build(None) + if getattr(self, "lm_head", None) is not None: + with tf.name_scope(self.lm_head.name): + self.lm_head.build(None) + + +__all__ = [ + "TFCamembertForCausalLM", + "TFCamembertForMaskedLM", + "TFCamembertForMultipleChoice", + "TFCamembertForQuestionAnswering", + "TFCamembertForSequenceClassification", + "TFCamembertForTokenClassification", + "TFCamembertModel", + "TFCamembertPreTrainedModel", +] diff --git a/docs/transformers/build/lib/transformers/models/camembert/tokenization_camembert.py b/docs/transformers/build/lib/transformers/models/camembert/tokenization_camembert.py new file mode 100644 index 0000000000000000000000000000000000000000..23cc569d49fc1cd372ec8b56d5c933a0c2e1ad88 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/camembert/tokenization_camembert.py @@ -0,0 +1,323 @@ +# coding=utf-8 +# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License +"""Tokenization classes for Camembert model.""" + +import os +from shutil import copyfile +from typing import Any, Dict, List, Optional, Tuple + +import sentencepiece as spm + +from ...tokenization_utils import AddedToken, PreTrainedTokenizer +from ...utils import logging +from ...utils.import_utils import requires + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"} + + +SPIECE_UNDERLINE = "▁" + + +@requires(backends=("sentencepiece",)) +class CamembertTokenizer(PreTrainedTokenizer): + """ + Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Construct a CamemBERT tokenizer. Based on + [SentencePiece](https://github.com/google/sentencepiece). + + This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to + this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that + contains the vocabulary necessary to instantiate a tokenizer. + bos_token (`str`, *optional*, defaults to `""`): + The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. + + + + When building a sequence using special tokens, this is not the token that is used for the beginning of + sequence. The token used is the `cls_token`. + + + + eos_token (`str`, *optional*, defaults to `""`): + The end of sequence token. + + + + When building a sequence using special tokens, this is not the token that is used for the end of sequence. + The token used is the `sep_token`. + + + + sep_token (`str`, *optional*, defaults to `""`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + cls_token (`str`, *optional*, defaults to `""`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + unk_token (`str`, *optional*, defaults to `""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + pad_token (`str`, *optional*, defaults to `""`): + The token used for padding, for example when batching sequences of different lengths. + mask_token (`str`, *optional*, defaults to `""`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + additional_special_tokens (`List[str]`, *optional*, defaults to `['NOTUSED', 'NOTUSED', 'NOTUSED']`): + Additional special tokens used by the tokenizer. + sp_model_kwargs (`dict`, *optional*): + Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for + SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things, + to set: + + - `enable_sampling`: Enable subword regularization. + - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - `nbest_size = {0,1}`: No sampling is performed. + - `nbest_size > 1`: samples from the nbest_size results. + - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. + + - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. + + Attributes: + sp_model (`SentencePieceProcessor`): + The *SentencePiece* processor that is used for every conversion (string, tokens and IDs). + """ + + vocab_files_names = VOCAB_FILES_NAMES + model_input_names = ["input_ids", "attention_mask"] + + def __init__( + self, + vocab_file, + bos_token="", + eos_token="", + sep_token="", + cls_token="", + unk_token="", + pad_token="", + mask_token="", + additional_special_tokens=["NOTUSED", "NOTUSED", "NOTUSED"], + sp_model_kwargs: Optional[Dict[str, Any]] = None, + **kwargs, + ) -> None: + # Mask token behave like a normal word, i.e. include the space before it + mask_token = ( + AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False, special=True) + if isinstance(mask_token, str) + else mask_token + ) + + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(str(vocab_file)) + self.vocab_file = vocab_file + + # HACK: These tokens were added by the author for an obscure reason as they were already part of the + # sentencepiece vocabulary (this is the case for and and ). + # In this case it is recommended to properly set the tokens by hand. + self._added_tokens_decoder = { + 0: AddedToken("NOTUSED", special=True), + 1: AddedToken(pad_token, special=True) if isinstance(pad_token, str) else pad_token, + 2: AddedToken("NOTUSED", special=True), + 3: AddedToken(unk_token, special=True) if isinstance(unk_token, str) else unk_token, + 4: AddedToken("NOTUSED", special=True), + } + + self.fairseq_offset = 4 # 3 tokens are newly added, but the offset starts from 4 + + # legacy: camemebert is a particular case were we have to make sure `"NOTUSED"` is here + if "added_tokens_decoder" in kwargs: + # this is the only class that requires this unfortunately..... + # the reason is that the fast version has a whole. + kwargs["added_tokens_decoder"].update(self._added_tokens_decoder) + + super().__init__( + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + sep_token=sep_token, + cls_token=cls_token, + pad_token=pad_token, + mask_token=mask_token, + additional_special_tokens=additional_special_tokens, + sp_model_kwargs=self.sp_model_kwargs, + **kwargs, + ) + + @property + def vocab_size(self): + # The length of the vocabulary without added tokens is len(self.sp_model) but the added tokens are added at the beginning. + return len(self.sp_model) + + def get_vocab(self): + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size + self.fairseq_offset)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text: str) -> List[str]: + return self.sp_model.encode(text, out_type=str) + + def _convert_token_to_id(self, token): + """Converts a token (str) in an id using the vocab.""" + # specifi to camembert, both 3 and 4 point to the unk token. + if self.sp_model.PieceToId(token) == 0: + # Convert sentence piece unk token to fairseq unk token index + return self.unk_token_id + return self.fairseq_offset + self.sp_model.PieceToId(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.sp_model.IdToPiece(index - self.fairseq_offset) + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (string) in a single string.""" + # TODO decode outputs do not match between fast and slow + current_sub_tokens = [] + out_string = "" + prev_is_special = False + for token in tokens: + # make sure that special tokens are not decoded using sentencepiece model + if token in self.all_special_tokens: + if not prev_is_special: + out_string += " " + out_string += self.sp_model.decode(current_sub_tokens) + token + prev_is_special = True + current_sub_tokens = [] + else: + current_sub_tokens.append(token) + prev_is_special = False + out_string += self.sp_model.decode(current_sub_tokens) + return out_string.strip() + + def __getstate__(self): + state = self.__dict__.copy() + state["sp_model"] = None + return state + + def __setstate__(self, d): + self.__dict__ = d + + # for backward compatibility + if not hasattr(self, "sp_model_kwargs"): + self.sp_model_kwargs = {} + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(self.vocab_file) + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file): + copyfile(self.vocab_file, out_vocab_file) + elif not os.path.isfile(self.vocab_file): + with open(out_vocab_file, "wb") as fi: + content_spiece_model = self.sp_model.serialized_model_proto() + fi.write(content_spiece_model) + + return (out_vocab_file,) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. An CamemBERT sequence has the following format: + + - single sequence: ` X ` + - pair of sequences: ` A B ` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + sep + token_ids_1 + sep + + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + if token_ids_1 is None: + return [1] + ([0] * len(token_ids_0)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1] + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. CamemBERT, like + RoBERTa, does not make use of token type ids, therefore a list of zeros is returned. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of zeros. + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] + + +__all__ = ["CamembertTokenizer"] diff --git a/docs/transformers/build/lib/transformers/models/camembert/tokenization_camembert_fast.py b/docs/transformers/build/lib/transformers/models/camembert/tokenization_camembert_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..c04b5618390234f71a4d0f5890974ec1fd6916d0 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/camembert/tokenization_camembert_fast.py @@ -0,0 +1,201 @@ +# coding=utf-8 +# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License +"""Fast tokenization classes for Camembert model.""" + +import os +from shutil import copyfile +from typing import List, Optional, Tuple + +from ...tokenization_utils import AddedToken +from ...tokenization_utils_fast import PreTrainedTokenizerFast +from ...utils import is_sentencepiece_available, logging + + +if is_sentencepiece_available(): + from .tokenization_camembert import CamembertTokenizer +else: + CamembertTokenizer = None + + +logger = logging.get_logger(__name__) + +VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "tokenizer_file": "tokenizer.json"} + + +SPIECE_UNDERLINE = "▁" + + +class CamembertTokenizerFast(PreTrainedTokenizerFast): + """ + Construct a "fast" CamemBERT tokenizer (backed by HuggingFace's *tokenizers* library). Adapted from + [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on + [BPE](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models). + + This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should + refer to this superclass for more information regarding those methods. + + Args: + vocab_file (`str`): + [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that + contains the vocabulary necessary to instantiate a tokenizer. + bos_token (`str`, *optional*, defaults to `""`): + The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. + + + + When building a sequence using special tokens, this is not the token that is used for the beginning of + sequence. The token used is the `cls_token`. + + + + eos_token (`str`, *optional*, defaults to `""`): + The end of sequence token. + + + + When building a sequence using special tokens, this is not the token that is used for the end of sequence. + The token used is the `sep_token`. + + + + sep_token (`str`, *optional*, defaults to `""`): + The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for + sequence classification or for a text and a question for question answering. It is also used as the last + token of a sequence built with special tokens. + cls_token (`str`, *optional*, defaults to `""`): + The classifier token which is used when doing sequence classification (classification of the whole sequence + instead of per-token classification). It is the first token of the sequence when built with special tokens. + unk_token (`str`, *optional*, defaults to `""`): + The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this + token instead. + pad_token (`str`, *optional*, defaults to `""`): + The token used for padding, for example when batching sequences of different lengths. + mask_token (`str`, *optional*, defaults to `""`): + The token used for masking values. This is the token used when training this model with masked language + modeling. This is the token which the model will try to predict. + additional_special_tokens (`List[str]`, *optional*, defaults to `["NOTUSED", "NOTUSED"]`): + Additional special tokens used by the tokenizer. + """ + + vocab_files_names = VOCAB_FILES_NAMES + model_input_names = ["input_ids", "attention_mask"] + slow_tokenizer_class = CamembertTokenizer + + def __init__( + self, + vocab_file=None, + tokenizer_file=None, + bos_token="", + eos_token="", + sep_token="", + cls_token="", + unk_token="", + pad_token="", + mask_token="", + additional_special_tokens=["NOTUSED", "NOTUSED", "NOTUSED"], + **kwargs, + ): + # Mask token behave like a normal word, i.e. include the space before it. Will have normalized = False + mask_token = AddedToken(mask_token, lstrip=True, special=True) if isinstance(mask_token, str) else mask_token + super().__init__( + vocab_file, + tokenizer_file=tokenizer_file, + bos_token=bos_token, + eos_token=eos_token, + sep_token=sep_token, + cls_token=cls_token, + unk_token=unk_token, + pad_token=pad_token, + mask_token=mask_token, + additional_special_tokens=additional_special_tokens, + **kwargs, + ) + + self.vocab_file = vocab_file + + @property + def can_save_slow_tokenizer(self) -> bool: + return os.path.isfile(self.vocab_file) if self.vocab_file else False + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. An CamemBERT sequence has the following format: + + - single sequence: ` X ` + - pair of sequences: ` A B ` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + + if token_ids_1 is None: + return [self.cls_token_id] + token_ids_0 + [self.sep_token_id] + cls = [self.cls_token_id] + sep = [self.sep_token_id] + return cls + token_ids_0 + sep + sep + token_ids_1 + sep + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. CamemBERT, like + RoBERTa, does not make use of token type ids, therefore a list of zeros is returned. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of zeros. + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0] + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + if not self.can_save_slow_tokenizer: + raise ValueError( + "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow " + "tokenizer." + ) + + if not os.path.isdir(save_directory): + logger.error(f"Vocabulary path ({save_directory}) should be a directory") + return + out_vocab_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"] + ) + + if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file): + copyfile(self.vocab_file, out_vocab_file) + + return (out_vocab_file,) + + +__all__ = ["CamembertTokenizerFast"] diff --git a/docs/transformers/build/lib/transformers/models/canine/__init__.py b/docs/transformers/build/lib/transformers/models/canine/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..bb00d8fd8827035ff7b351db49d7128a54429c53 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/canine/__init__.py @@ -0,0 +1,28 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_canine import * + from .modeling_canine import * + from .tokenization_canine import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/docs/transformers/build/lib/transformers/models/canine/configuration_canine.py b/docs/transformers/build/lib/transformers/models/canine/configuration_canine.py new file mode 100644 index 0000000000000000000000000000000000000000..29e90327d08f02d490006a464cb566adbca52007 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/canine/configuration_canine.py @@ -0,0 +1,141 @@ +# coding=utf-8 +# Copyright Google AI and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""CANINE model configuration""" + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class CanineConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`CanineModel`]. It is used to instantiate an + CANINE model according to the specified arguments, defining the model architecture. Instantiating a configuration + with the defaults will yield a similar configuration to that of the CANINE + [google/canine-s](https://huggingface.co/google/canine-s) architecture. + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + hidden_size (`int`, *optional*, defaults to 768): + Dimension of the encoder layers and the pooler layer. + num_hidden_layers (`int`, *optional*, defaults to 12): + Number of hidden layers in the deep Transformer encoder. + num_attention_heads (`int`, *optional*, defaults to 12): + Number of attention heads for each attention layer in the Transformer encoders. + intermediate_size (`int`, *optional*, defaults to 3072): + Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoders. + hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`): + The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`, + `"relu"`, `"selu"` and `"gelu_new"` are supported. + hidden_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout probability for all fully connected layers in the embeddings, encoders, and pooler. + attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1): + The dropout ratio for the attention probabilities. + max_position_embeddings (`int`, *optional*, defaults to 16384): + The maximum sequence length that this model might ever be used with. + type_vocab_size (`int`, *optional*, defaults to 16): + The vocabulary size of the `token_type_ids` passed when calling [`CanineModel`]. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + layer_norm_eps (`float`, *optional*, defaults to 1e-12): + The epsilon used by the layer normalization layers. + pad_token_id (`int`, *optional*, defaults to 0): + Padding token id. + bos_token_id (`int`, *optional*, defaults to 57344): + Beginning of stream token id. + eos_token_id (`int`, *optional*, defaults to 57345): + End of stream token id. + downsampling_rate (`int`, *optional*, defaults to 4): + The rate at which to downsample the original character sequence length before applying the deep Transformer + encoder. + upsampling_kernel_size (`int`, *optional*, defaults to 4): + The kernel size (i.e. the number of characters in each window) of the convolutional projection layer when + projecting back from `hidden_size`*2 to `hidden_size`. + num_hash_functions (`int`, *optional*, defaults to 8): + The number of hash functions to use. Each hash function has its own embedding matrix. + num_hash_buckets (`int`, *optional*, defaults to 16384): + The number of hash buckets to use. + local_transformer_stride (`int`, *optional*, defaults to 128): + The stride of the local attention of the first shallow Transformer encoder. Defaults to 128 for good + TPU/XLA memory alignment. + + Example: + + ```python + >>> from transformers import CanineConfig, CanineModel + + >>> # Initializing a CANINE google/canine-s style configuration + >>> configuration = CanineConfig() + + >>> # Initializing a model (with random weights) from the google/canine-s style configuration + >>> model = CanineModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "canine" + + def __init__( + self, + hidden_size=768, + num_hidden_layers=12, + num_attention_heads=12, + intermediate_size=3072, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + max_position_embeddings=16384, + type_vocab_size=16, + initializer_range=0.02, + layer_norm_eps=1e-12, + pad_token_id=0, + bos_token_id=0xE000, + eos_token_id=0xE001, + downsampling_rate=4, + upsampling_kernel_size=4, + num_hash_functions=8, + num_hash_buckets=16384, + local_transformer_stride=128, # Good TPU/XLA memory alignment. + **kwargs, + ): + super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs) + + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.type_vocab_size = type_vocab_size + self.layer_norm_eps = layer_norm_eps + + # Character config: + self.downsampling_rate = downsampling_rate + self.upsampling_kernel_size = upsampling_kernel_size + self.num_hash_functions = num_hash_functions + self.num_hash_buckets = num_hash_buckets + self.local_transformer_stride = local_transformer_stride + + +__all__ = ["CanineConfig"] diff --git a/docs/transformers/build/lib/transformers/models/canine/convert_canine_original_tf_checkpoint_to_pytorch.py b/docs/transformers/build/lib/transformers/models/canine/convert_canine_original_tf_checkpoint_to_pytorch.py new file mode 100644 index 0000000000000000000000000000000000000000..45dcdb290333dc9eb122dfb5e2a882b65241ab49 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/canine/convert_canine_original_tf_checkpoint_to_pytorch.py @@ -0,0 +1,65 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Convert CANINE checkpoint.""" + +import argparse + +from transformers import CanineConfig, CanineModel, CanineTokenizer, load_tf_weights_in_canine +from transformers.utils import logging + + +logging.set_verbosity_info() + + +def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, pytorch_dump_path): + # Initialize PyTorch model + config = CanineConfig() + model = CanineModel(config) + model.eval() + + print(f"Building PyTorch model from configuration: {config}") + + # Load weights from tf checkpoint + load_tf_weights_in_canine(model, config, tf_checkpoint_path) + + # Save pytorch-model (weights and configuration) + print(f"Save PyTorch model to {pytorch_dump_path}") + model.save_pretrained(pytorch_dump_path) + + # Save tokenizer files + tokenizer = CanineTokenizer() + print(f"Save tokenizer files to {pytorch_dump_path}") + tokenizer.save_pretrained(pytorch_dump_path) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + # Required parameters + parser.add_argument( + "--tf_checkpoint_path", + default=None, + type=str, + required=True, + help="Path to the TensorFlow checkpoint. Should end with model.ckpt", + ) + parser.add_argument( + "--pytorch_dump_path", + default=None, + type=str, + required=True, + help="Path to a folder where the PyTorch model will be placed.", + ) + args = parser.parse_args() + convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, args.pytorch_dump_path) diff --git a/docs/transformers/build/lib/transformers/models/canine/modeling_canine.py b/docs/transformers/build/lib/transformers/models/canine/modeling_canine.py new file mode 100644 index 0000000000000000000000000000000000000000..7a699f6ca959ee513feac9e348e0c9d857c3e1d6 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/canine/modeling_canine.py @@ -0,0 +1,1653 @@ +# coding=utf-8 +# Copyright 2021 Google AI The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch CANINE model.""" + +import copy +import math +import os +from dataclasses import dataclass +from typing import Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from torch import nn +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +from ...activations import ACT2FN +from ...modeling_outputs import ( + BaseModelOutput, + ModelOutput, + MultipleChoiceModelOutput, + QuestionAnsweringModelOutput, + SequenceClassifierOutput, + TokenClassifierOutput, +) +from ...modeling_utils import PreTrainedModel +from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer +from ...utils import ( + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + logging, + replace_return_docstrings, +) +from .configuration_canine import CanineConfig + + +logger = logging.get_logger(__name__) + +_CHECKPOINT_FOR_DOC = "google/canine-s" +_CONFIG_FOR_DOC = "CanineConfig" + + +# Support up to 16 hash functions. +_PRIMES = [31, 43, 59, 61, 73, 97, 103, 113, 137, 149, 157, 173, 181, 193, 211, 223] + + +@dataclass +class CanineModelOutputWithPooling(ModelOutput): + """ + Output type of [`CanineModel`]. Based on [`~modeling_outputs.BaseModelOutputWithPooling`], but with slightly + different `hidden_states` and `attentions`, as these also include the hidden states and attentions of the shallow + Transformer encoders. + + Args: + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model (i.e. the output of the final + shallow Transformer encoder). + pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`): + Hidden-state of the first token of the sequence (classification token) at the last layer of the deep + Transformer encoder, further processed by a Linear layer and a Tanh activation function. The Linear layer + weights are trained from the next sentence prediction (classification) objective during pretraining. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the input to each encoder + one for the output of each layer of each + encoder) of shape `(batch_size, sequence_length, hidden_size)` and `(batch_size, sequence_length // + config.downsampling_rate, hidden_size)`. Hidden-states of the model at the output of each layer plus the + initial input to each Transformer encoder. The hidden states of the shallow encoders have length + `sequence_length`, but the hidden states of the deep encoder have length `sequence_length` // + `config.downsampling_rate`. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of the 3 Transformer encoders of shape `(batch_size, + num_heads, sequence_length, sequence_length)` and `(batch_size, num_heads, sequence_length // + config.downsampling_rate, sequence_length // config.downsampling_rate)`. Attentions weights after the + attention softmax, used to compute the weighted average in the self-attention heads. + """ + + last_hidden_state: Optional[torch.FloatTensor] = None + pooler_output: Optional[torch.FloatTensor] = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +def load_tf_weights_in_canine(model, config, tf_checkpoint_path): + """Load tf checkpoints in a pytorch model.""" + try: + import re + + import numpy as np + import tensorflow as tf + except ImportError: + logger.error( + "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see " + "https://www.tensorflow.org/install/ for installation instructions." + ) + raise + tf_path = os.path.abspath(tf_checkpoint_path) + logger.info(f"Converting TensorFlow checkpoint from {tf_path}") + # Load weights from TF model + init_vars = tf.train.list_variables(tf_path) + names = [] + arrays = [] + for name, shape in init_vars: + logger.info(f"Loading TF weight {name} with shape {shape}") + array = tf.train.load_variable(tf_path, name) + names.append(name) + arrays.append(array) + + for name, array in zip(names, arrays): + name = name.split("/") + # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v + # which are not required for using pretrained model + # also discard the cls weights (which were used for the next sentence prediction pre-training task) + if any( + n + in [ + "adam_v", + "adam_m", + "AdamWeightDecayOptimizer", + "AdamWeightDecayOptimizer_1", + "global_step", + "cls", + "autoregressive_decoder", + "char_output_weights", + ] + for n in name + ): + logger.info(f"Skipping {'/'.join(name)}") + continue + # if first scope name starts with "bert", change it to "encoder" + if name[0] == "bert": + name[0] = "encoder" + # remove "embeddings" middle name of HashBucketCodepointEmbedders + elif name[1] == "embeddings": + name.remove(name[1]) + # rename segment_embeddings to token_type_embeddings + elif name[1] == "segment_embeddings": + name[1] = "token_type_embeddings" + # rename initial convolutional projection layer + elif name[1] == "initial_char_encoder": + name = ["chars_to_molecules"] + name[-2:] + # rename final convolutional projection layer + elif name[0] == "final_char_encoder" and name[1] in ["LayerNorm", "conv"]: + name = ["projection"] + name[1:] + pointer = model + for m_name in name: + if (re.fullmatch(r"[A-Za-z]+_\d+", m_name)) and "Embedder" not in m_name: + scope_names = re.split(r"_(\d+)", m_name) + else: + scope_names = [m_name] + if scope_names[0] == "kernel" or scope_names[0] == "gamma": + pointer = getattr(pointer, "weight") + elif scope_names[0] == "output_bias" or scope_names[0] == "beta": + pointer = getattr(pointer, "bias") + elif scope_names[0] == "output_weights": + pointer = getattr(pointer, "weight") + else: + try: + pointer = getattr(pointer, scope_names[0]) + except AttributeError: + logger.info(f"Skipping {'/'.join(name)}") + continue + if len(scope_names) >= 2: + num = int(scope_names[1]) + pointer = pointer[num] + if m_name[-11:] == "_embeddings": + pointer = getattr(pointer, "weight") + elif m_name[-10:] in [f"Embedder_{i}" for i in range(8)]: + pointer = getattr(pointer, "weight") + elif m_name == "kernel": + array = np.transpose(array) + + if pointer.shape != array.shape: + raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched") + + logger.info(f"Initialize PyTorch weight {name}") + pointer.data = torch.from_numpy(array) + return model + + +class CanineEmbeddings(nn.Module): + """Construct the character, position and token_type embeddings.""" + + def __init__(self, config): + super().__init__() + + self.config = config + + # character embeddings + shard_embedding_size = config.hidden_size // config.num_hash_functions + for i in range(config.num_hash_functions): + name = f"HashBucketCodepointEmbedder_{i}" + setattr(self, name, nn.Embedding(config.num_hash_buckets, shard_embedding_size)) + self.char_position_embeddings = nn.Embedding(config.num_hash_buckets, config.hidden_size) + self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size) + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + # position_ids (1, len position emb) is contiguous in memory and exported when serialized + self.register_buffer( + "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False + ) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + + def _hash_bucket_tensors(self, input_ids, num_hashes: int, num_buckets: int): + """ + Converts ids to hash bucket ids via multiple hashing. + + Args: + input_ids: The codepoints or other IDs to be hashed. + num_hashes: The number of hash functions to use. + num_buckets: The number of hash buckets (i.e. embeddings in each table). + + Returns: + A list of tensors, each of which is the hash bucket IDs from one hash function. + """ + if num_hashes > len(_PRIMES): + raise ValueError(f"`num_hashes` must be <= {len(_PRIMES)}") + + primes = _PRIMES[:num_hashes] + + result_tensors = [] + for prime in primes: + hashed = ((input_ids + 1) * prime) % num_buckets + result_tensors.append(hashed) + return result_tensors + + def _embed_hash_buckets(self, input_ids, embedding_size: int, num_hashes: int, num_buckets: int): + """Converts IDs (e.g. codepoints) into embeddings via multiple hashing.""" + if embedding_size % num_hashes != 0: + raise ValueError(f"Expected `embedding_size` ({embedding_size}) % `num_hashes` ({num_hashes}) == 0") + + hash_bucket_tensors = self._hash_bucket_tensors(input_ids, num_hashes=num_hashes, num_buckets=num_buckets) + embedding_shards = [] + for i, hash_bucket_ids in enumerate(hash_bucket_tensors): + name = f"HashBucketCodepointEmbedder_{i}" + shard_embeddings = getattr(self, name)(hash_bucket_ids) + embedding_shards.append(shard_embeddings) + + return torch.cat(embedding_shards, dim=-1) + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + ) -> torch.FloatTensor: + if input_ids is not None: + input_shape = input_ids.size() + else: + input_shape = inputs_embeds.size()[:-1] + + seq_length = input_shape[1] + + if position_ids is None: + position_ids = self.position_ids[:, :seq_length] + + if token_type_ids is None: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device) + + if inputs_embeds is None: + inputs_embeds = self._embed_hash_buckets( + input_ids, self.config.hidden_size, self.config.num_hash_functions, self.config.num_hash_buckets + ) + + token_type_embeddings = self.token_type_embeddings(token_type_ids) + + embeddings = inputs_embeds + token_type_embeddings + + if self.position_embedding_type == "absolute": + position_embeddings = self.char_position_embeddings(position_ids) + embeddings += position_embeddings + embeddings = self.LayerNorm(embeddings) + embeddings = self.dropout(embeddings) + return embeddings + + +class CharactersToMolecules(nn.Module): + """Convert character sequence to initial molecule sequence (i.e. downsample) using strided convolutions.""" + + def __init__(self, config): + super().__init__() + + self.conv = nn.Conv1d( + in_channels=config.hidden_size, + out_channels=config.hidden_size, + kernel_size=config.downsampling_rate, + stride=config.downsampling_rate, + ) + self.activation = ACT2FN[config.hidden_act] + + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, char_encoding: torch.Tensor) -> torch.Tensor: + # `cls_encoding`: [batch, 1, hidden_size] + cls_encoding = char_encoding[:, 0:1, :] + + # char_encoding has shape [batch, char_seq, hidden_size] + # We transpose it to be [batch, hidden_size, char_seq] + char_encoding = torch.transpose(char_encoding, 1, 2) + downsampled = self.conv(char_encoding) + downsampled = torch.transpose(downsampled, 1, 2) + downsampled = self.activation(downsampled) + + # Truncate the last molecule in order to reserve a position for [CLS]. + # Often, the last position is never used (unless we completely fill the + # text buffer). This is important in order to maintain alignment on TPUs + # (i.e. a multiple of 128). + downsampled_truncated = downsampled[:, 0:-1, :] + + # We also keep [CLS] as a separate sequence position since we always + # want to reserve a position (and the model capacity that goes along + # with that) in the deep BERT stack. + # `result`: [batch, molecule_seq, molecule_dim] + result = torch.cat([cls_encoding, downsampled_truncated], dim=1) + + result = self.LayerNorm(result) + + return result + + +class ConvProjection(nn.Module): + """ + Project representations from hidden_size*2 back to hidden_size across a window of w = config.upsampling_kernel_size + characters. + """ + + def __init__(self, config): + super().__init__() + self.config = config + self.conv = nn.Conv1d( + in_channels=config.hidden_size * 2, + out_channels=config.hidden_size, + kernel_size=config.upsampling_kernel_size, + stride=1, + ) + self.activation = ACT2FN[config.hidden_act] + # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load + # any TensorFlow checkpoint file + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward( + self, + inputs: torch.Tensor, + final_seq_char_positions: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + # inputs has shape [batch, mol_seq, molecule_hidden_size+char_hidden_final] + # we transpose it to be [batch, molecule_hidden_size+char_hidden_final, mol_seq] + inputs = torch.transpose(inputs, 1, 2) + + # PyTorch < 1.9 does not support padding="same" (which is used in the original implementation), + # so we pad the tensor manually before passing it to the conv layer + # based on https://github.com/google-research/big_transfer/blob/49afe42338b62af9fbe18f0258197a33ee578a6b/bit_tf2/models.py#L36-L38 + pad_total = self.config.upsampling_kernel_size - 1 + pad_beg = pad_total // 2 + pad_end = pad_total - pad_beg + + pad = nn.ConstantPad1d((pad_beg, pad_end), 0) + # `result`: shape (batch_size, char_seq_len, hidden_size) + result = self.conv(pad(inputs)) + result = torch.transpose(result, 1, 2) + result = self.activation(result) + result = self.LayerNorm(result) + result = self.dropout(result) + final_char_seq = result + + if final_seq_char_positions is not None: + # Limit transformer query seq and attention mask to these character + # positions to greatly reduce the compute cost. Typically, this is just + # done for the MLM training task. + # TODO add support for MLM + raise NotImplementedError("CanineForMaskedLM is currently not supported") + else: + query_seq = final_char_seq + + return query_seq + + +class CanineSelfAttention(nn.Module): + def __init__(self, config): + super().__init__() + if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"): + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention " + f"heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + + self.query = nn.Linear(config.hidden_size, self.all_head_size) + self.key = nn.Linear(config.hidden_size, self.all_head_size) + self.value = nn.Linear(config.hidden_size, self.all_head_size) + + self.dropout = nn.Dropout(config.attention_probs_dropout_prob) + self.position_embedding_type = getattr(config, "position_embedding_type", "absolute") + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + self.max_position_embeddings = config.max_position_embeddings + self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size) + + def transpose_for_scores(self, x): + new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size) + x = x.view(*new_x_shape) + return x.permute(0, 2, 1, 3) + + def forward( + self, + from_tensor: torch.Tensor, + to_tensor: torch.Tensor, + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: + mixed_query_layer = self.query(from_tensor) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + + key_layer = self.transpose_for_scores(self.key(to_tensor)) + value_layer = self.transpose_for_scores(self.value(to_tensor)) + + query_layer = self.transpose_for_scores(mixed_query_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + + if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query": + seq_length = from_tensor.size()[1] + position_ids_l = torch.arange(seq_length, dtype=torch.long, device=from_tensor.device).view(-1, 1) + position_ids_r = torch.arange(seq_length, dtype=torch.long, device=from_tensor.device).view(1, -1) + distance = position_ids_l - position_ids_r + positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1) + positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility + + if self.position_embedding_type == "relative_key": + relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores + elif self.position_embedding_type == "relative_key_query": + relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding) + relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding) + attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key + + attention_scores = attention_scores / math.sqrt(self.attention_head_size) + if attention_mask is not None: + if attention_mask.ndim == 3: + # if attention_mask is 3D, do the following: + attention_mask = torch.unsqueeze(attention_mask, dim=1) + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and the dtype's smallest value for masked positions. + attention_mask = (1.0 - attention_mask.float()) * torch.finfo(attention_scores.dtype).min + # Apply the attention mask (precomputed for all layers in CanineModel forward() function) + attention_scores = attention_scores + attention_mask + + # Normalize the attention scores to probabilities. + attention_probs = nn.functional.softmax(attention_scores, dim=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(attention_probs) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = attention_probs * head_mask + + context_layer = torch.matmul(attention_probs, value_layer) + + context_layer = context_layer.permute(0, 2, 1, 3).contiguous() + new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,) + context_layer = context_layer.view(*new_context_layer_shape) + + outputs = (context_layer, attention_probs) if output_attentions else (context_layer,) + + return outputs + + +class CanineSelfOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward( + self, hidden_states: Tuple[torch.FloatTensor], input_tensor: torch.FloatTensor + ) -> Tuple[torch.FloatTensor, torch.FloatTensor]: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class CanineAttention(nn.Module): + """ + Additional arguments related to local attention: + + - **local** (`bool`, *optional*, defaults to `False`) -- Whether to apply local attention. + - **always_attend_to_first_position** (`bool`, *optional*, defaults to `False`) -- Should all blocks be able to + attend + to the `to_tensor`'s first position (e.g. a [CLS] position)? - **first_position_attends_to_all** (`bool`, + *optional*, defaults to `False`) -- Should the *from_tensor*'s first position be able to attend to all + positions within the *from_tensor*? - **attend_from_chunk_width** (`int`, *optional*, defaults to 128) -- The + width of each block-wise chunk in `from_tensor`. - **attend_from_chunk_stride** (`int`, *optional*, defaults to + 128) -- The number of elements to skip when moving to the next block in `from_tensor`. - + **attend_to_chunk_width** (`int`, *optional*, defaults to 128) -- The width of each block-wise chunk in + *to_tensor*. - **attend_to_chunk_stride** (`int`, *optional*, defaults to 128) -- The number of elements to + skip when moving to the next block in `to_tensor`. + """ + + def __init__( + self, + config, + local=False, + always_attend_to_first_position: bool = False, + first_position_attends_to_all: bool = False, + attend_from_chunk_width: int = 128, + attend_from_chunk_stride: int = 128, + attend_to_chunk_width: int = 128, + attend_to_chunk_stride: int = 128, + ): + super().__init__() + self.self = CanineSelfAttention(config) + self.output = CanineSelfOutput(config) + self.pruned_heads = set() + + # additional arguments related to local attention + self.local = local + if attend_from_chunk_width < attend_from_chunk_stride: + raise ValueError( + "`attend_from_chunk_width` < `attend_from_chunk_stride` would cause sequence positions to get skipped." + ) + if attend_to_chunk_width < attend_to_chunk_stride: + raise ValueError( + "`attend_to_chunk_width` < `attend_to_chunk_stride`would cause sequence positions to get skipped." + ) + self.always_attend_to_first_position = always_attend_to_first_position + self.first_position_attends_to_all = first_position_attends_to_all + self.attend_from_chunk_width = attend_from_chunk_width + self.attend_from_chunk_stride = attend_from_chunk_stride + self.attend_to_chunk_width = attend_to_chunk_width + self.attend_to_chunk_stride = attend_to_chunk_stride + + def prune_heads(self, heads): + if len(heads) == 0: + return + heads, index = find_pruneable_heads_and_indices( + heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads + ) + + # Prune linear layers + self.self.query = prune_linear_layer(self.self.query, index) + self.self.key = prune_linear_layer(self.self.key, index) + self.self.value = prune_linear_layer(self.self.value, index) + self.output.dense = prune_linear_layer(self.output.dense, index, dim=1) + + # Update hyper params and store pruned heads + self.self.num_attention_heads = self.self.num_attention_heads - len(heads) + self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads + self.pruned_heads = self.pruned_heads.union(heads) + + def forward( + self, + hidden_states: Tuple[torch.FloatTensor], + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]: + if not self.local: + self_outputs = self.self(hidden_states, hidden_states, attention_mask, head_mask, output_attentions) + attention_output = self_outputs[0] + else: + from_seq_length = to_seq_length = hidden_states.shape[1] + from_tensor = to_tensor = hidden_states + + # Create chunks (windows) that we will attend *from* and then concatenate them. + from_chunks = [] + if self.first_position_attends_to_all: + from_chunks.append((0, 1)) + # We must skip this first position so that our output sequence is the + # correct length (this matters in the *from* sequence only). + from_start = 1 + else: + from_start = 0 + for chunk_start in range(from_start, from_seq_length, self.attend_from_chunk_stride): + chunk_end = min(from_seq_length, chunk_start + self.attend_from_chunk_width) + from_chunks.append((chunk_start, chunk_end)) + + # Determine the chunks (windows) that will attend *to*. + to_chunks = [] + if self.first_position_attends_to_all: + to_chunks.append((0, to_seq_length)) + for chunk_start in range(0, to_seq_length, self.attend_to_chunk_stride): + chunk_end = min(to_seq_length, chunk_start + self.attend_to_chunk_width) + to_chunks.append((chunk_start, chunk_end)) + + if len(from_chunks) != len(to_chunks): + raise ValueError( + f"Expected to have same number of `from_chunks` ({from_chunks}) and " + f"`to_chunks` ({from_chunks}). Check strides." + ) + + # next, compute attention scores for each pair of windows and concatenate + attention_output_chunks = [] + attention_probs_chunks = [] + for (from_start, from_end), (to_start, to_end) in zip(from_chunks, to_chunks): + from_tensor_chunk = from_tensor[:, from_start:from_end, :] + to_tensor_chunk = to_tensor[:, to_start:to_end, :] + # `attention_mask`: [batch_size, from_seq, to_seq] + # `attention_mask_chunk`: [batch_size, from_seq_chunk, to_seq_chunk] + attention_mask_chunk = attention_mask[:, from_start:from_end, to_start:to_end] + if self.always_attend_to_first_position: + cls_attention_mask = attention_mask[:, from_start:from_end, 0:1] + attention_mask_chunk = torch.cat([cls_attention_mask, attention_mask_chunk], dim=2) + + cls_position = to_tensor[:, 0:1, :] + to_tensor_chunk = torch.cat([cls_position, to_tensor_chunk], dim=1) + + attention_outputs_chunk = self.self( + from_tensor_chunk, to_tensor_chunk, attention_mask_chunk, head_mask, output_attentions + ) + attention_output_chunks.append(attention_outputs_chunk[0]) + if output_attentions: + attention_probs_chunks.append(attention_outputs_chunk[1]) + + attention_output = torch.cat(attention_output_chunks, dim=1) + + attention_output = self.output(attention_output, hidden_states) + outputs = (attention_output,) + if not self.local: + outputs = outputs + self_outputs[1:] # add attentions if we output them + else: + outputs = outputs + tuple(attention_probs_chunks) # add attentions if we output them + return outputs + + +class CanineIntermediate(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.intermediate_size) + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = ACT2FN[config.hidden_act] + else: + self.intermediate_act_fn = config.hidden_act + + def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + return hidden_states + + +class CanineOutput(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.intermediate_size, config.hidden_size) + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + + def forward(self, hidden_states: Tuple[torch.FloatTensor], input_tensor: torch.FloatTensor) -> torch.FloatTensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.dropout(hidden_states) + hidden_states = self.LayerNorm(hidden_states + input_tensor) + return hidden_states + + +class CanineLayer(nn.Module): + def __init__( + self, + config, + local, + always_attend_to_first_position, + first_position_attends_to_all, + attend_from_chunk_width, + attend_from_chunk_stride, + attend_to_chunk_width, + attend_to_chunk_stride, + ): + super().__init__() + self.chunk_size_feed_forward = config.chunk_size_feed_forward + self.seq_len_dim = 1 + self.attention = CanineAttention( + config, + local, + always_attend_to_first_position, + first_position_attends_to_all, + attend_from_chunk_width, + attend_from_chunk_stride, + attend_to_chunk_width, + attend_to_chunk_stride, + ) + self.intermediate = CanineIntermediate(config) + self.output = CanineOutput(config) + + def forward( + self, + hidden_states: Tuple[torch.FloatTensor], + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.FloatTensor, Optional[torch.FloatTensor]]: + self_attention_outputs = self.attention( + hidden_states, + attention_mask, + head_mask, + output_attentions=output_attentions, + ) + attention_output = self_attention_outputs[0] + + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + layer_output = apply_chunking_to_forward( + self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output + ) + outputs = (layer_output,) + outputs + + return outputs + + def feed_forward_chunk(self, attention_output): + intermediate_output = self.intermediate(attention_output) + layer_output = self.output(intermediate_output, attention_output) + return layer_output + + +class CanineEncoder(nn.Module): + def __init__( + self, + config, + local=False, + always_attend_to_first_position=False, + first_position_attends_to_all=False, + attend_from_chunk_width=128, + attend_from_chunk_stride=128, + attend_to_chunk_width=128, + attend_to_chunk_stride=128, + ): + super().__init__() + self.config = config + self.layer = nn.ModuleList( + [ + CanineLayer( + config, + local, + always_attend_to_first_position, + first_position_attends_to_all, + attend_from_chunk_width, + attend_from_chunk_stride, + attend_to_chunk_width, + attend_to_chunk_stride, + ) + for _ in range(config.num_hidden_layers) + ] + ) + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: Tuple[torch.FloatTensor], + attention_mask: Optional[torch.FloatTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = False, + output_hidden_states: Optional[bool] = False, + return_dict: Optional[bool] = True, + ) -> Union[Tuple, BaseModelOutput]: + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + layer_head_mask = head_mask[i] if head_mask is not None else None + + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + layer_module.__call__, + hidden_states, + attention_mask, + layer_head_mask, + output_attentions, + ) + else: + layer_outputs = layer_module(hidden_states, attention_mask, layer_head_mask, output_attentions) + + hidden_states = layer_outputs[0] + if output_attentions: + all_self_attentions = all_self_attentions + (layer_outputs[1],) + + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +class CaninePooler(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.activation = nn.Tanh() + + def forward(self, hidden_states: Tuple[torch.FloatTensor]) -> torch.FloatTensor: + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(first_token_tensor) + pooled_output = self.activation(pooled_output) + return pooled_output + + +class CaninePredictionHeadTransform(nn.Module): + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + if isinstance(config.hidden_act, str): + self.transform_act_fn = ACT2FN[config.hidden_act] + else: + self.transform_act_fn = config.hidden_act + self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states: Tuple[torch.FloatTensor]) -> torch.FloatTensor: + hidden_states = self.dense(hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(hidden_states) + return hidden_states + + +class CanineLMPredictionHead(nn.Module): + def __init__(self, config): + super().__init__() + self.transform = CaninePredictionHeadTransform(config) + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + self.bias = nn.Parameter(torch.zeros(config.vocab_size)) + + # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings` + self.decoder.bias = self.bias + + def forward(self, hidden_states: Tuple[torch.FloatTensor]) -> torch.FloatTensor: + hidden_states = self.transform(hidden_states) + hidden_states = self.decoder(hidden_states) + return hidden_states + + +class CanineOnlyMLMHead(nn.Module): + def __init__(self, config): + super().__init__() + self.predictions = CanineLMPredictionHead(config) + + def forward( + self, + sequence_output: Tuple[torch.Tensor], + ) -> Tuple[torch.Tensor]: + prediction_scores = self.predictions(sequence_output) + return prediction_scores + + +class CaninePreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = CanineConfig + load_tf_weights = load_tf_weights_in_canine + base_model_prefix = "canine" + supports_gradient_checkpointing = True + + def _init_weights(self, module): + """Initialize the weights""" + if isinstance(module, (nn.Linear, nn.Conv1d)): + # Slightly different from the TF version which uses truncated_normal for initialization + # cf https://github.com/pytorch/pytorch/pull/5617 + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=self.config.initializer_range) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + elif isinstance(module, nn.LayerNorm): + module.bias.data.zero_() + module.weight.data.fill_(1.0) + + +CANINE_START_DOCSTRING = r""" + This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use + it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and + behavior. + + Parameters: + config ([`CanineConfig`]): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. +""" + +CANINE_INPUTS_DOCSTRING = r""" + Args: + input_ids (`torch.LongTensor` of shape `({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and + [`PreTrainedTokenizer.__call__`] for details. + + [What are input IDs?](../glossary#input-ids) + attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0, + 1]`: + + - 0 corresponds to a *sentence A* token, + - 1 corresponds to a *sentence B* token. + + [What are token type IDs?](../glossary#token-type-ids) + position_ids (`torch.LongTensor` of shape `({0})`, *optional*): + Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0, + config.max_position_embeddings - 1]`. + + [What are position IDs?](../glossary#position-ids) + head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*): + Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This + is useful if you want more control over how to convert *input_ids* indices into associated vectors than the + model's internal embedding lookup matrix. + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned + tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for + more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. +""" + + +@add_start_docstrings( + "The bare CANINE Model transformer outputting raw hidden-states without any specific head on top.", + CANINE_START_DOCSTRING, +) +class CanineModel(CaninePreTrainedModel): + def __init__(self, config, add_pooling_layer=True): + super().__init__(config) + self.config = config + shallow_config = copy.deepcopy(config) + shallow_config.num_hidden_layers = 1 + + self.char_embeddings = CanineEmbeddings(config) + # shallow/low-dim transformer encoder to get a initial character encoding + self.initial_char_encoder = CanineEncoder( + shallow_config, + local=True, + always_attend_to_first_position=False, + first_position_attends_to_all=False, + attend_from_chunk_width=config.local_transformer_stride, + attend_from_chunk_stride=config.local_transformer_stride, + attend_to_chunk_width=config.local_transformer_stride, + attend_to_chunk_stride=config.local_transformer_stride, + ) + self.chars_to_molecules = CharactersToMolecules(config) + # deep transformer encoder + self.encoder = CanineEncoder(config) + self.projection = ConvProjection(config) + # shallow/low-dim transformer encoder to get a final character encoding + self.final_char_encoder = CanineEncoder(shallow_config) + + self.pooler = CaninePooler(config) if add_pooling_layer else None + + # Initialize weights and apply final processing + self.post_init() + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + for layer, heads in heads_to_prune.items(): + self.encoder.layer[layer].attention.prune_heads(heads) + + def _create_3d_attention_mask_from_input_mask(self, from_tensor, to_mask): + """ + Create 3D attention mask from a 2D tensor mask. + + Args: + from_tensor: 2D or 3D Tensor of shape [batch_size, from_seq_length, ...]. + to_mask: int32 Tensor of shape [batch_size, to_seq_length]. + + Returns: + float Tensor of shape [batch_size, from_seq_length, to_seq_length]. + """ + batch_size, from_seq_length = from_tensor.shape[0], from_tensor.shape[1] + + to_seq_length = to_mask.shape[1] + + to_mask = torch.reshape(to_mask, (batch_size, 1, to_seq_length)).float() + + # We don't assume that `from_tensor` is a mask (although it could be). We + # don't actually care if we attend *from* padding tokens (only *to* padding) + # tokens so we create a tensor of all ones. + broadcast_ones = torch.ones(size=(batch_size, from_seq_length, 1), dtype=torch.float32, device=to_mask.device) + + # Here we broadcast along two dimensions to create the mask. + mask = broadcast_ones * to_mask + + return mask + + def _downsample_attention_mask(self, char_attention_mask: torch.Tensor, downsampling_rate: int): + """Downsample 2D character attention mask to 2D molecule attention mask using MaxPool1d layer.""" + + # first, make char_attention_mask 3D by adding a channel dim + batch_size, char_seq_len = char_attention_mask.shape + poolable_char_mask = torch.reshape(char_attention_mask, (batch_size, 1, char_seq_len)) + + # next, apply MaxPool1d to get pooled_molecule_mask of shape (batch_size, 1, mol_seq_len) + pooled_molecule_mask = torch.nn.MaxPool1d(kernel_size=downsampling_rate, stride=downsampling_rate)( + poolable_char_mask.float() + ) + + # finally, squeeze to get tensor of shape (batch_size, mol_seq_len) + molecule_attention_mask = torch.squeeze(pooled_molecule_mask, dim=-1) + + return molecule_attention_mask + + def _repeat_molecules(self, molecules: torch.Tensor, char_seq_length: int) -> torch.Tensor: + """Repeats molecules to make them the same length as the char sequence.""" + + rate = self.config.downsampling_rate + + molecules_without_extra_cls = molecules[:, 1:, :] + # `repeated`: [batch_size, almost_char_seq_len, molecule_hidden_size] + repeated = torch.repeat_interleave(molecules_without_extra_cls, repeats=rate, dim=-2) + + # So far, we've repeated the elements sufficient for any `char_seq_length` + # that's a multiple of `downsampling_rate`. Now we account for the last + # n elements (n < `downsampling_rate`), i.e. the remainder of floor + # division. We do this by repeating the last molecule a few extra times. + last_molecule = molecules[:, -1:, :] + remainder_length = char_seq_length % rate + remainder_repeated = torch.repeat_interleave( + last_molecule, + # +1 molecule to compensate for truncation. + repeats=remainder_length + rate, + dim=-2, + ) + + # `repeated`: [batch_size, char_seq_len, molecule_hidden_size] + return torch.cat([repeated, remainder_repeated], dim=-2) + + @add_start_docstrings_to_model_forward(CANINE_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=CanineModelOutputWithPooling, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, CanineModelOutputWithPooling]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + all_hidden_states = () if output_hidden_states else None + all_self_attentions = () if output_attentions else None + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif input_ids is not None: + self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask) + input_shape = input_ids.size() + elif inputs_embeds is not None: + input_shape = inputs_embeds.size()[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + batch_size, seq_length = input_shape + device = input_ids.device if input_ids is not None else inputs_embeds.device + + if attention_mask is None: + attention_mask = torch.ones(((batch_size, seq_length)), device=device) + if token_type_ids is None: + token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device) + + # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length] + # ourselves in which case we just need to make it broadcastable to all heads. + extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape) + molecule_attention_mask = self._downsample_attention_mask( + attention_mask, downsampling_rate=self.config.downsampling_rate + ) + extended_molecule_attention_mask: torch.Tensor = self.get_extended_attention_mask( + molecule_attention_mask, (batch_size, molecule_attention_mask.shape[-1]) + ) + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers) + + # `input_char_embeddings`: shape (batch_size, char_seq, char_dim) + input_char_embeddings = self.char_embeddings( + input_ids=input_ids, + position_ids=position_ids, + token_type_ids=token_type_ids, + inputs_embeds=inputs_embeds, + ) + + # Contextualize character embeddings using shallow Transformer. + # We use a 3D attention mask for the local attention. + # `input_char_encoding`: shape (batch_size, char_seq_len, char_dim) + char_attention_mask = self._create_3d_attention_mask_from_input_mask( + input_ids if input_ids is not None else inputs_embeds, attention_mask + ) + init_chars_encoder_outputs = self.initial_char_encoder( + input_char_embeddings, + attention_mask=char_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + input_char_encoding = init_chars_encoder_outputs.last_hidden_state + + # Downsample chars to molecules. + # The following lines have dimensions: [batch, molecule_seq, molecule_dim]. + # In this transformation, we change the dimensionality from `char_dim` to + # `molecule_dim`, but do *NOT* add a resnet connection. Instead, we rely on + # the resnet connections (a) from the final char transformer stack back into + # the original char transformer stack and (b) the resnet connections from + # the final char transformer stack back into the deep BERT stack of + # molecules. + # + # Empirically, it is critical to use a powerful enough transformation here: + # mean pooling causes training to diverge with huge gradient norms in this + # region of the model; using a convolution here resolves this issue. From + # this, it seems that molecules and characters require a very different + # feature space; intuitively, this makes sense. + init_molecule_encoding = self.chars_to_molecules(input_char_encoding) + + # Deep BERT encoder + # `molecule_sequence_output`: shape (batch_size, mol_seq_len, mol_dim) + encoder_outputs = self.encoder( + init_molecule_encoding, + attention_mask=extended_molecule_attention_mask, + head_mask=head_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + molecule_sequence_output = encoder_outputs[0] + pooled_output = self.pooler(molecule_sequence_output) if self.pooler is not None else None + + # Upsample molecules back to characters. + # `repeated_molecules`: shape (batch_size, char_seq_len, mol_hidden_size) + repeated_molecules = self._repeat_molecules(molecule_sequence_output, char_seq_length=input_shape[-1]) + + # Concatenate representations (contextualized char embeddings and repeated molecules): + # `concat`: shape [batch_size, char_seq_len, molecule_hidden_size+char_hidden_final] + concat = torch.cat([input_char_encoding, repeated_molecules], dim=-1) + + # Project representation dimension back to hidden_size + # `sequence_output`: shape (batch_size, char_seq_len, hidden_size]) + sequence_output = self.projection(concat) + + # Apply final shallow Transformer + # `sequence_output`: shape (batch_size, char_seq_len, hidden_size]) + final_chars_encoder_outputs = self.final_char_encoder( + sequence_output, + attention_mask=extended_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + ) + sequence_output = final_chars_encoder_outputs.last_hidden_state + + if output_hidden_states: + deep_encoder_hidden_states = encoder_outputs.hidden_states if return_dict else encoder_outputs[1] + all_hidden_states = ( + all_hidden_states + + init_chars_encoder_outputs.hidden_states + + deep_encoder_hidden_states + + final_chars_encoder_outputs.hidden_states + ) + + if output_attentions: + deep_encoder_self_attentions = encoder_outputs.attentions if return_dict else encoder_outputs[-1] + all_self_attentions = ( + all_self_attentions + + init_chars_encoder_outputs.attentions + + deep_encoder_self_attentions + + final_chars_encoder_outputs.attentions + ) + + if not return_dict: + output = (sequence_output, pooled_output) + output += tuple(v for v in [all_hidden_states, all_self_attentions] if v is not None) + return output + + return CanineModelOutputWithPooling( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + hidden_states=all_hidden_states, + attentions=all_self_attentions, + ) + + +@add_start_docstrings( + """ + CANINE Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled + output) e.g. for GLUE tasks. + """, + CANINE_START_DOCSTRING, +) +class CanineForSequenceClassification(CaninePreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.canine = CanineModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(CANINE_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=SequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, SequenceClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the sequence classification/regression loss. Indices should be in `[0, ..., + config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If + `config.num_labels > 1` a classification loss is computed (Cross-Entropy). + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.canine( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + if self.num_labels == 1: + loss = loss_fct(logits.squeeze(), labels.squeeze()) + else: + loss = loss_fct(logits, labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return SequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + CANINE Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a + softmax) e.g. for RocStories/SWAG tasks. + """, + CANINE_START_DOCSTRING, +) +class CanineForMultipleChoice(CaninePreTrainedModel): + def __init__(self, config): + super().__init__(config) + + self.canine = CanineModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, 1) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(CANINE_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) + @add_code_sample_docstrings( + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=MultipleChoiceModelOutput, + config_class=_CONFIG_FOR_DOC, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, MultipleChoiceModelOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for computing the multiple choice classification loss. Indices should be in `[0, ..., + num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See + `input_ids` above) + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1] + + input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None + attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None + token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None + position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None + inputs_embeds = ( + inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1)) + if inputs_embeds is not None + else None + ) + + outputs = self.canine( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + pooled_output = outputs[1] + + pooled_output = self.dropout(pooled_output) + logits = self.classifier(pooled_output) + reshaped_logits = logits.view(-1, num_choices) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(reshaped_logits, labels) + + if not return_dict: + output = (reshaped_logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return MultipleChoiceModelOutput( + loss=loss, + logits=reshaped_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + CANINE Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for + Named-Entity-Recognition (NER) tasks. + """, + CANINE_START_DOCSTRING, +) +class CanineForTokenClassification(CaninePreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.canine = CanineModel(config) + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(CANINE_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @replace_return_docstrings(output_type=TokenClassifierOutput, config_class=_CONFIG_FOR_DOC) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, TokenClassifierOutput]: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`. + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, CanineForTokenClassification + >>> import torch + + >>> tokenizer = AutoTokenizer.from_pretrained("google/canine-s") + >>> model = CanineForTokenClassification.from_pretrained("google/canine-s") + + >>> inputs = tokenizer( + ... "HuggingFace is a company based in Paris and New York", add_special_tokens=False, return_tensors="pt" + ... ) + + >>> with torch.no_grad(): + ... logits = model(**inputs).logits + + >>> predicted_token_class_ids = logits.argmax(-1) + + >>> # Note that tokens are classified rather then input words which means that + >>> # there might be more predicted token classes than words. + >>> # Multiple token classes might account for the same word + >>> predicted_tokens_classes = [model.config.id2label[t.item()] for t in predicted_token_class_ids[0]] + >>> predicted_tokens_classes # doctest: +SKIP + ``` + + ```python + >>> labels = predicted_token_class_ids + >>> loss = model(**inputs, labels=labels).loss + >>> round(loss.item(), 2) # doctest: +SKIP + ```""" + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.canine( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + sequence_output = self.dropout(sequence_output) + logits = self.classifier(sequence_output) + + loss = None + if labels is not None: + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TokenClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +@add_start_docstrings( + """ + CANINE Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear + layers on top of the hidden-states output to compute `span start logits` and `span end logits`). + """, + CANINE_START_DOCSTRING, +) +class CanineForQuestionAnswering(CaninePreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + + self.canine = CanineModel(config) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + + # Initialize weights and apply final processing + self.post_init() + + @add_start_docstrings_to_model_forward(CANINE_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + checkpoint="Splend1dchan/canine-c-squad", + output_type=QuestionAnsweringModelOutput, + config_class=_CONFIG_FOR_DOC, + expected_output="'nice puppet'", + expected_loss=8.81, + ) + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + token_type_ids: Optional[torch.LongTensor] = None, + position_ids: Optional[torch.LongTensor] = None, + head_mask: Optional[torch.FloatTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + start_positions: Optional[torch.LongTensor] = None, + end_positions: Optional[torch.LongTensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, QuestionAnsweringModelOutput]: + r""" + start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the start of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*): + Labels for position (index) of the end of the labelled span for computing the token classification loss. + Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence + are not taken into account for computing the loss. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + outputs = self.canine( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1) + end_logits = end_logits.squeeze(-1) + + total_loss = None + if start_positions is not None and end_positions is not None: + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + ignored_index = start_logits.size(1) + start_positions.clamp_(0, ignored_index) + end_positions.clamp_(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2 + + if not return_dict: + output = (start_logits, end_logits) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output + + return QuestionAnsweringModelOutput( + loss=total_loss, + start_logits=start_logits, + end_logits=end_logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + +__all__ = [ + "CanineForMultipleChoice", + "CanineForQuestionAnswering", + "CanineForSequenceClassification", + "CanineForTokenClassification", + "CanineLayer", + "CanineModel", + "CaninePreTrainedModel", + "load_tf_weights_in_canine", +] diff --git a/docs/transformers/build/lib/transformers/models/canine/tokenization_canine.py b/docs/transformers/build/lib/transformers/models/canine/tokenization_canine.py new file mode 100644 index 0000000000000000000000000000000000000000..fe2734712dca5b79170738993767977df2f79105 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/canine/tokenization_canine.py @@ -0,0 +1,244 @@ +# coding=utf-8 +# Copyright Google AI and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization classes for CANINE.""" + +from typing import Dict, List, Optional + +from ...tokenization_utils import AddedToken, PreTrainedTokenizer +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +# Unicode defines 1,114,112 total “codepoints” +UNICODE_VOCAB_SIZE = 1114112 + +# Below: Constants defining canonical codepoints for special, pseudo-characters. +# Copied from https://github.com/google-research/language/blob/master/language/canine/special_codepoints.py +PAD = 0 +CLS = 0xE000 +SEP = 0xE001 +BOS = 0xE002 +MASK = 0xE003 +RESERVED = 0xE004 + +# Maps special codepoints to human-readable names. +SPECIAL_CODEPOINTS: Dict[int, str] = { + # Special symbols are represented using codepoints values that are valid, + # but designated as "Private Use", meaning that they will never be assigned + # characters by the Unicode Consortium, and are thus safe for use here. + # + # NOTE: Do *NOT* add any sort of [UNK_CHAR] here. They are explicitly + # excluded and should fail with a hard error. + CLS: "[CLS]", + SEP: "[SEP]", + BOS: "[BOS]", + MASK: "[MASK]", + PAD: "[PAD]", + RESERVED: "[RESERVED]", +} + +# Maps special codepoint human-readable names to their codepoint values. +SPECIAL_CODEPOINTS_BY_NAME: Dict[str, int] = {name: codepoint for codepoint, name in SPECIAL_CODEPOINTS.items()} + + +class CanineTokenizer(PreTrainedTokenizer): + r""" + Construct a CANINE tokenizer (i.e. a character splitter). It turns text into a sequence of characters, and then + converts each character into its Unicode code point. + + [`CanineTokenizer`] inherits from [`PreTrainedTokenizer`]. + + Refer to superclass [`PreTrainedTokenizer`] for usage examples and documentation concerning parameters. + + Args: + model_max_length (`int`, *optional*, defaults to 2048): + The maximum sentence length the model accepts. + """ + + def __init__( + self, + bos_token=chr(CLS), + eos_token=chr(SEP), + sep_token=chr(SEP), + cls_token=chr(CLS), + pad_token=chr(PAD), + mask_token=chr(MASK), + add_prefix_space=False, + model_max_length=2048, + **kwargs, + ): + bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token + eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token + sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token + cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token + pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token + + # Mask token behave like a normal word, i.e. include the space before it + mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token + + # Creates a mapping for looking up the IDs of special symbols. + self._special_codepoints: Dict[str, int] = {} + for codepoint, name in SPECIAL_CODEPOINTS.items(): + self._special_codepoints[name] = codepoint + + # Creates a mapping for looking up the string forms of special symbol IDs. + self._special_codepoint_strings: Dict[int, str] = { + codepoint: name for name, codepoint in self._special_codepoints.items() + } + + self._unicode_vocab_size = UNICODE_VOCAB_SIZE + self._num_special_tokens = len(self._special_codepoints) + + super().__init__( + bos_token=bos_token, + eos_token=eos_token, + sep_token=sep_token, + cls_token=cls_token, + pad_token=pad_token, + mask_token=mask_token, + add_prefix_space=add_prefix_space, + model_max_length=model_max_length, + **kwargs, + ) + + @property + def vocab_size(self) -> int: + return self._unicode_vocab_size + + def get_vocab(self): + vocab = {chr(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text: str) -> List[str]: + """Tokenize a string (i.e. perform character splitting).""" + return list(text) + + def _convert_token_to_id(self, token: str) -> int: + """Converts a token (i.e. a Unicode character) in an id (i.e. its integer Unicode code point value).""" + try: + return ord(token) + except TypeError: + raise ValueError(f"invalid token: '{token}'") + + def _convert_id_to_token(self, index: int) -> str: + """ + Converts a Unicode code point (integer) in a token (str). In case it's a special code point, convert to + human-readable format. + """ + try: + if index in SPECIAL_CODEPOINTS: + return SPECIAL_CODEPOINTS[index] + return chr(index) + except TypeError: + raise ValueError(f"invalid id: {index}") + + def convert_tokens_to_string(self, tokens): + return "".join(tokens) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A CANINE sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + result = cls + token_ids_0 + sep + if token_ids_1 is not None: + result += token_ids_1 + sep + return result + + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """ + Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` method. + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + result = [1] + ([0] * len(token_ids_0)) + [1] + if token_ids_1 is not None: + result += ([0] * len(token_ids_1)) + [1] + return result + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create a mask from the two sequences passed to be used in a sequence-pair classification task. A CANINE + sequence pair mask has the following format: + + ``` + 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 + | first sequence | second sequence | + ``` + + If `token_ids_1` is `None`, this method only returns the first portion of the mask (0s). + + Args: + token_ids_0 (`List[int]`): + List of IDs. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s). + """ + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + result = len(cls + token_ids_0 + sep) * [0] + if token_ids_1 is not None: + result += len(token_ids_1 + sep) * [1] + return result + + # CanineTokenizer has no vocab file + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None): + return () + + +__all__ = ["CanineTokenizer"] diff --git a/docs/transformers/build/lib/transformers/models/chameleon/__init__.py b/docs/transformers/build/lib/transformers/models/chameleon/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..4332161036d516359b6a52f7df4a1a63c2c069ba --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/chameleon/__init__.py @@ -0,0 +1,29 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_chameleon import * + from .image_processing_chameleon import * + from .modeling_chameleon import * + from .processing_chameleon import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/docs/transformers/build/lib/transformers/models/chameleon/configuration_chameleon.py b/docs/transformers/build/lib/transformers/models/chameleon/configuration_chameleon.py new file mode 100644 index 0000000000000000000000000000000000000000..2cc9cdb29d46c513d61fe8931c35f45daf4ba555 --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/chameleon/configuration_chameleon.py @@ -0,0 +1,281 @@ +# coding=utf-8 +# Copyright 2024 Meta Inc. and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""chameleon model configuration""" + +from typing import List + +from ...configuration_utils import PretrainedConfig +from ...utils import logging + + +logger = logging.get_logger(__name__) + + +class ChameleonVQVAEConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`ChameleonVQModel`]. It is used to instantiate a + `ChameleonVQModel` according to the specified arguments, defining the model architecture. + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. Instantiating a + configuration with the defaults will yield a similar configuration to the VQModel of the + [meta/chameleon-7B](https://huggingface.co/meta/chameleon-7B). + + Args: + embed_dim (`int`, *optional*, defaults to 256): + Dimensionality of each embedding vector. + num_embeddings (`int`, *optional*, defaults to 8192): + Number of codebook embeddings. + double_latent (`bool`, *optional*, defaults to `False`): + Whether to use double z channels. + latent_channels (`int`, *optional*, defaults to 256): + Number of channels for the latent space. + resolution (`int`, *optional*, defaults to 512): + Resolution of the input images. + in_channels (`int`, *optional*, defaults to 3): + Number of input channels. + base_channels (`int`, *optional*, defaults to 128): + Base channel count. + channel_multiplier (`List[int]`, *optional*, defaults to `[1, 1, 2, 2, 4]`): + Channel multipliers for each resolution. + num_res_blocks (`int`, *optional*, defaults to 2): + Number of residual blocks. + attn_resolutions (`List[int]`, *optional*): + Resolutions to apply attention. + dropout (`float`, *optional*, defaults to 0.0): + Dropout rate. + attn_type (`str`, *optional*, defaults to `"vanilla"`): + Attention type used in VQ-GAN encoder. Can be "vanilla" or None. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + """ + + model_type = "chameleon_vqgan" + base_config_key = "vq_config" + + def __init__( + self, + embed_dim: int = 256, + num_embeddings: int = 8192, + double_latent: bool = False, + latent_channels: int = 256, + resolution: int = 512, + in_channels: int = 3, + base_channels: int = 128, + channel_multiplier: List[int] = [1, 1, 2, 2, 4], + num_res_blocks: int = 2, + attn_resolutions: List[int] = None, + dropout: float = 0.0, + attn_type: str = "vanilla", + initializer_range=0.02, + **kwargs, + ): + super().__init__(**kwargs) + self.embed_dim = embed_dim + self.num_embeddings = num_embeddings + self.double_latent = double_latent + self.latent_channels = latent_channels + self.resolution = resolution + self.in_channels = in_channels + self.base_channels = base_channels + self.channel_multiplier = channel_multiplier + self.num_res_blocks = num_res_blocks + self.attn_resolutions = attn_resolutions + self.dropout = dropout + self.attn_type = attn_type + self.initializer_range = initializer_range + + +class ChameleonConfig(PretrainedConfig): + r""" + This is the configuration class to store the configuration of a [`ChameleonModel`]. It is used to instantiate a + chameleon model according to the specified arguments, defining the model architecture. Instantiating a + configuration with the defaults will yield a similar configuration to that of the + [meta/chameleon-7B](https://huggingface.co/meta/chameleon-7B). + + Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the + documentation from [`PretrainedConfig`] for more information. + + + Args: + vocab_size (`int`, *optional*, defaults to 65536): + Vocabulary size of the chameleon model. Defines the number of different tokens that can be represented by the + `inputs_ids` passed when calling [`ChameleonModel`]; this includes text and image tokens. + hidden_size (`int`, *optional*, defaults to 4096): + Dimension of the hidden representations. + intermediate_size (`int`, *optional*, defaults to 11008): + Dimension of the MLP representations. + num_hidden_layers (`int`, *optional*, defaults to 32): + Number of hidden layers in the Transformer decoder. + num_attention_heads (`int`, *optional*, defaults to 32): + Number of attention heads for each attention layer in the Transformer decoder. + num_key_value_heads (`int`, *optional*, defaults to 32): + This is the number of key_value heads that should be used to implement Grouped Query Attention. If + `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if + `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When + converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed + by meanpooling all the original heads within that group. For more details checkout [this + paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to + `num_attention_heads`. + hidden_act (`str` or `function`, *optional*, defaults to `"silu"`): + The non-linear activation function (function or string) in the decoder. + max_position_embeddings (`int`, *optional*, defaults to 4096): + The maximum sequence length that this model might ever be used with. Chameleon supports up to 4096 tokens. + initializer_range (`float`, *optional*, defaults to 0.02): + The standard deviation of the truncated_normal_initializer for initializing all weight matrices. + rms_norm_eps (`float`, *optional*, defaults to 1e-05): + The epsilon used by the rms normalization layers. + use_cache (`bool`, *optional*, defaults to `True`): + Whether or not the model should return the last key/values attentions (not used by all models). Only + relevant if `config.is_decoder=True`. + pad_token_id (`int`, *optional*): + Padding token id. + bos_token_id (`int`, *optional*, defaults to 1): + Beginning of stream token id. + eos_token_id (`int`, *optional*, defaults to 2): + End of stream token id. + tie_word_embeddings (`bool`, *optional*, defaults to `False`): + Whether to tie weight embeddings + rope_theta (`float`, *optional*, defaults to 10000.0): + The base period of the RoPE embeddings. + rope_scaling (`Dict`, *optional*): + Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling + strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is + `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update + `max_position_embeddings` to the expected new maximum. See the following thread for more information on how + these scaling strategies behave: + https://www.reddit.com/r/Localchameleon/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an + experimental feature, subject to breaking API changes in future versions. + attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`): + Whether to use a bias in the query, key, value and output projection layers during self-attention. + attention_dropout (`float`, *optional*, defaults to 0.0): + The dropout ratio for the attention probabilities. + model_parallel_size (`int`, *optional*, defaults to 1): + Number of shards used when training the model. This will be used in qk layernorm because the original Chameleon inference + doesn't do reduction in those layers and each rank has its own biases. + swin_norm (`bool`, *optional*, defaults to `False`): + Use Swin Transformer normalization. + vq_config (`dict`, *optional*): + ChameleonVQConfig instance containing the configuration for the VQ-VAE model. + vocabulary_map (`dict`, *optional*): + A dictionary containing the vocabulary map from the tokenizer. Used to obtain tokens from the image inputs. + mlp_bias (`bool`, *optional*, defaults to `False`): + Whether to use a bias in up_proj, down_proj and gate_proj layers in the MLP layers. + + + ```python + >>> from transformers import ChameleonModel, ChameleonConfig + + >>> # Initializing a chameleon chameleon-7b style configuration + >>> configuration = ChameleonConfig() + + >>> # Initializing a model from the chameleon-7b style configuration + >>> model = ChameleonModel(configuration) + + >>> # Accessing the model configuration + >>> configuration = model.config + ```""" + + model_type = "chameleon" + sub_configs = {"vq_config": ChameleonVQVAEConfig} + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vocab_size=65536, + hidden_size=4096, + intermediate_size=11008, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=32, + hidden_act="silu", + max_position_embeddings=4096, + initializer_range=0.02, + rms_norm_eps=1e-05, + use_cache=True, + pad_token_id=None, + bos_token_id=1, + eos_token_id=2, + tie_word_embeddings=False, + rope_theta=10000.0, + rope_scaling=None, + attention_bias=False, + attention_dropout=0.0, + model_parallel_size=1, + swin_norm=False, + vq_config=None, + vocabulary_map=None, + mlp_bias=False, + **kwargs, + ): + self.vocab_size = vocab_size + self.max_position_embeddings = max_position_embeddings + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.mlp_bias = mlp_bias + + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.rope_scaling = rope_scaling + self._rope_scaling_validation() + self.attention_bias = attention_bias + self.attention_dropout = attention_dropout + self.model_parallel_size = model_parallel_size + self.swin_norm = swin_norm + + if vq_config is None: + vq_config = {} + logger.info("vq_config is None. initializing the ChameleonVQConfig with default values.") + + self.vq_config = ChameleonVQVAEConfig(**vq_config) + + self.vocabulary_map = vocabulary_map + + super().__init__( + pad_token_id=pad_token_id, + bos_token_id=bos_token_id, + eos_token_id=eos_token_id, + tie_word_embeddings=tie_word_embeddings, + **kwargs, + ) + + def _rope_scaling_validation(self): + """ + Validate the `rope_scaling` configuration. + """ + if self.rope_scaling is None: + return + + if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2: + raise ValueError( + "`rope_scaling` must be a dictionary with with two fields, `type` and `factor`, " + f"got {self.rope_scaling}" + ) + rope_scaling_type = self.rope_scaling.get("type", None) + rope_scaling_factor = self.rope_scaling.get("factor", None) + if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]: + raise ValueError( + f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}" + ) + if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0: + raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}") + + +__all__ = ["ChameleonConfig", "ChameleonVQVAEConfig"] diff --git a/docs/transformers/build/lib/transformers/models/chameleon/convert_chameleon_weights_to_hf.py b/docs/transformers/build/lib/transformers/models/chameleon/convert_chameleon_weights_to_hf.py new file mode 100644 index 0000000000000000000000000000000000000000..f74607f7b3c04cd9114847ff482d25aa7890e4ae --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/chameleon/convert_chameleon_weights_to_hf.py @@ -0,0 +1,478 @@ +# Copyright 2024 Meta Inc. and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import gc +import json +import os + +import requests +import torch +import yaml +from accelerate import init_empty_weights +from PIL import Image + +from transformers import ( + ChameleonConfig, + ChameleonForConditionalGeneration, + ChameleonImageProcessor, + ChameleonProcessor, +) + + +try: + from transformers import LlamaTokenizerFast +except ImportError: + raise ValueError( + "Chameleon conversion supports only FastTokenizer and LlamaTokenizerFast can't be imported! " + "Update your `tokenizers` library and re-run the tokenizer conversion." + ) + +""" +Sample usage: + +``` +python src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py \ + --input_dir /path/to/downloaded/chameleon/weights --model_size 7B --output_dir /output/path +``` + +Thereafter, models can be loaded via: + +```py +from transformers import ChameleonForConditionalGeneration, LlamaTokenizerFast + +model = ChameleonForConditionalGeneration.from_pretrained("/output/path") +tokenizer = LlamaTokenizerFast.from_pretrained("/output/path") +``` + +Important note: you need to be able to host the whole model in RAM to execute this script (even if the biggest versions +come in several checkpoints they each contain a part of each weight of the model, so we need to load them all in RAM). +""" + +NUM_SHARDS = { + "7B": 1, + "30B": 4, +} + +VOCAB_SIZE = 65536 + + +def compute_intermediate_size(n, ffn_dim_multiplier=1, multiple_of=256): + return multiple_of * ((int(ffn_dim_multiplier * int(8 * n / 3)) + multiple_of - 1) // multiple_of) + + +def read_json(path): + with open(path, "r") as f: + return json.load(f) + + +def write_json(text, path): + with open(path, "w") as f: + json.dump(text, f) + + +def write_model(model_path, input_base_path, model_size, chameleon_version=1): + os.makedirs(model_path, exist_ok=True) + input_model_path = os.path.join(input_base_path, "models", model_size.lower()) + params_path = os.path.join(input_model_path, "params.json") + consolidate_params_path = os.path.join(input_model_path, "consolidate_params.json") + + params = read_json(params_path) + if os.path.isfile(consolidate_params_path): + params = {**params, **read_json(consolidate_params_path)} + num_shards = NUM_SHARDS[model_size] + model_parallel_size = params["model_parallel_size"] + params = params.get("model", params) + n_layers = params["n_layers"] + n_heads = params["n_heads"] + n_heads_per_shard = n_heads // num_shards + dim = params["dim"] + dims_per_head = dim // n_heads + base = params.get("rope_theta", 10000.0) + swin_norm = params["swin_norm"] + if base > 10000.0: + max_position_embeddings = 16384 + else: + # Depending on the Chameleon version, the default max_position_embeddings has different values. + if chameleon_version == 1: + max_position_embeddings = 4096 + else: + raise NotImplementedError( + f"Version {chameleon_version} of chameleon is not supported yet. " + "Current supported versions of chameleon are [1]." + ) + + if params.get("n_kv_heads", None) is not None: + num_key_value_heads = params["n_kv_heads"] # for GQA / MQA + num_local_key_value_heads = n_heads_per_shard // num_key_value_heads + key_value_dim = dim // num_key_value_heads + else: # compatibility with other checkpoints + num_key_value_heads = n_heads + num_local_key_value_heads = n_heads_per_shard + key_value_dim = dim + + print(f"Fetching all parameters from the checkpoint at {input_model_path}.") + # Load weights + if num_shards == 1: + # Not sharded + # (The sharded implementation would also work, but this is simpler.) + loaded = None + for possible_name in ["consolidated.pth", "consolidated.00.pth"]: + possible_path = os.path.join(input_model_path, possible_name) + if os.path.exists(possible_path): + loaded = torch.load(possible_path, map_location="cpu", weights_only=True) + break + assert loaded is not None + else: + # Sharded + loaded = [ + torch.load( + os.path.join(input_model_path, f"consolidated.{i:02d}.pth"), map_location="cpu", weights_only=True + ) + for i in range(num_shards) + ] + + # permute for sliced rotary + def permute(w, n_heads, dim1=dim, dim2=dim): + return w.view(n_heads, dim1 // n_heads // 2, 2, dim2).transpose(1, 2).reshape(dim1, dim2) + + # Load weights to the state dict + state_dict = {} + for layer_i in range(n_layers): + if num_shards == 1: + # Unsharded + state_dict.update( + { + f"model.layers.{layer_i}.self_attn.q_proj.weight": permute( + loaded[f"layers.{layer_i}.attention.wq.weight"], n_heads=n_heads + ), + f"model.layers.{layer_i}.self_attn.k_proj.weight": permute( + loaded[f"layers.{layer_i}.attention.wk.weight"], + n_heads=num_key_value_heads, + dim1=key_value_dim, + ), + f"model.layers.{layer_i}.self_attn.v_proj.weight": loaded[f"layers.{layer_i}.attention.wv.weight"], + f"model.layers.{layer_i}.self_attn.o_proj.weight": loaded[f"layers.{layer_i}.attention.wo.weight"], + f"model.layers.{layer_i}.mlp.gate_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w1.weight"], + f"model.layers.{layer_i}.mlp.down_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w2.weight"], + f"model.layers.{layer_i}.mlp.up_proj.weight": loaded[f"layers.{layer_i}.feed_forward.w3.weight"], + f"model.layers.{layer_i}.input_layernorm.weight": loaded[ + f"layers.{layer_i}.attention_norm.weight" + ], + f"model.layers.{layer_i}.post_attention_layernorm.weight": loaded[ + f"layers.{layer_i}.ffn_norm.weight" + ], + } + ) + # qk_layernorm (see https://github.com/huggingface/transformers/pull/31534#issuecomment-2207354677) + state_dict[f"model.layers.{layer_i}.self_attn.q_norm.weight"] = ( + loaded[f"layers.{layer_i}.attention.q_normalization.weight"] + .view(dims_per_head // 2, 2) + .t() + .reshape(1, -1) + .repeat_interleave(n_heads, 0) + ) + state_dict[f"model.layers.{layer_i}.self_attn.q_norm.bias"] = ( + loaded[f"layers.{layer_i}.attention.q_normalization.bias"] + .view(dims_per_head // 2, 2) + .t() + .reshape(1, -1) + .repeat_interleave(n_heads, 0) + ) + state_dict[f"model.layers.{layer_i}.self_attn.k_norm.weight"] = ( + loaded[f"layers.{layer_i}.attention.k_normalization.weight"] + .view(dims_per_head // 2, 2) + .t() + .reshape(1, -1) + .repeat_interleave(num_key_value_heads, 0) + ) + state_dict[f"model.layers.{layer_i}.self_attn.k_norm.bias"] = ( + loaded[f"layers.{layer_i}.attention.k_normalization.bias"] + .view(dims_per_head // 2, 2) + .t() + .reshape(1, -1) + .repeat_interleave(num_key_value_heads, 0) + ) + + else: + # Sharded + state_dict.update( + { + f"model.layers.{layer_i}.input_layernorm.weight": torch.stack( + [l[f"layers.{layer_i}.attention_norm.weight"] for l in loaded] + ).mean(dim=0), + f"model.layers.{layer_i}.post_attention_layernorm.weight": torch.stack( + [l[f"layers.{layer_i}.ffn_norm.weight"] for l in loaded] + ).mean(dim=0), + } + ) + state_dict[f"model.layers.{layer_i}.self_attn.q_proj.weight"] = permute( + torch.cat( + [ + loaded[i][f"layers.{layer_i}.attention.wq.weight"].view(n_heads_per_shard, dims_per_head, dim) + for i in range(num_shards) + ], + dim=0, + ).reshape(dim, dim), + n_heads=n_heads, + ) + + state_dict[f"model.layers.{layer_i}.self_attn.k_proj.weight"] = permute( + torch.cat( + [ + loaded[i][f"layers.{layer_i}.attention.wk.weight"].view( + num_local_key_value_heads, dims_per_head, dim + ) + for i in range(num_shards) + ], + dim=0, + ).reshape(key_value_dim, dim), + n_heads=num_key_value_heads, + dim1=key_value_dim, + ) + + # qk_layernorm (see https://github.com/huggingface/transformers/pull/31534#issuecomment-2207354677) + state_dict[f"model.layers.{layer_i}.self_attn.q_norm.weight"] = ( + torch.cat([l[f"layers.{layer_i}.attention.q_normalization.weight"].unsqueeze(0) for l in loaded]) + .view(num_shards, dims_per_head // 2, 2) + .transpose(1, 2) + .reshape(num_shards, -1) + .repeat_interleave(n_heads // num_shards, 0) + ) + state_dict[f"model.layers.{layer_i}.self_attn.q_norm.bias"] = ( + torch.cat([l[f"layers.{layer_i}.attention.q_normalization.bias"].unsqueeze(0) for l in loaded]) + .view(num_shards, dims_per_head // 2, 2) + .transpose(1, 2) + .reshape(num_shards, -1) + .repeat_interleave(n_heads // num_shards, 0) + ) + state_dict[f"model.layers.{layer_i}.self_attn.k_norm.weight"] = ( + torch.cat([l[f"layers.{layer_i}.attention.k_normalization.weight"].unsqueeze(0) for l in loaded]) + .view(num_shards, dims_per_head // 2, 2) + .transpose(1, 2) + .reshape(num_shards, -1) + .repeat_interleave(num_key_value_heads // num_shards, 0) + ) + state_dict[f"model.layers.{layer_i}.self_attn.k_norm.bias"] = ( + torch.cat([l[f"layers.{layer_i}.attention.k_normalization.bias"].unsqueeze(0) for l in loaded]) + .view(num_shards, dims_per_head // 2, 2) + .transpose(1, 2) + .reshape(num_shards, -1) + .repeat_interleave(num_key_value_heads // num_shards, 0) + ) + + state_dict[f"model.layers.{layer_i}.self_attn.v_proj.weight"] = torch.cat( + [ + loaded[i][f"layers.{layer_i}.attention.wv.weight"].view( + num_local_key_value_heads, dims_per_head, dim + ) + for i in range(num_shards) + ], + dim=0, + ).reshape(key_value_dim, dim) + + state_dict[f"model.layers.{layer_i}.self_attn.o_proj.weight"] = torch.cat( + [loaded[i][f"layers.{layer_i}.attention.wo.weight"] for i in range(num_shards)], dim=1 + ) + state_dict[f"model.layers.{layer_i}.mlp.gate_proj.weight"] = torch.cat( + [loaded[i][f"layers.{layer_i}.feed_forward.w1.weight"] for i in range(num_shards)], dim=0 + ) + state_dict[f"model.layers.{layer_i}.mlp.down_proj.weight"] = torch.cat( + [loaded[i][f"layers.{layer_i}.feed_forward.w2.weight"] for i in range(num_shards)], dim=1 + ) + state_dict[f"model.layers.{layer_i}.mlp.up_proj.weight"] = torch.cat( + [loaded[i][f"layers.{layer_i}.feed_forward.w3.weight"] for i in range(num_shards)], dim=0 + ) + + if num_shards == 1: + # Unsharded + state_dict.update( + { + "model.embed_tokens.weight": loaded["tok_embeddings.weight"], + "model.norm.weight": loaded["norm.weight"], + "lm_head.weight": loaded["output.weight"], + } + ) + else: + state_dict.update( + { + "model.embed_tokens.weight": torch.cat( + [loaded[i]["tok_embeddings.weight"] for i in range(num_shards)], dim=1 + ), + "model.norm.weight": torch.stack([loaded[i]["norm.weight"] for i in range(num_shards)]).mean(dim=0), + "lm_head.weight": torch.cat([loaded[i]["output.weight"] for i in range(num_shards)], dim=0), + } + ) + + # Load VQGAN weights + vqgan_path = os.path.join(input_base_path, "tokenizer/vqgan.ckpt") + vqgan_state_dict = torch.load(vqgan_path, map_location="cpu", weights_only=True)["state_dict"] + for k, v in vqgan_state_dict.items(): + if "decoder" in k: + continue # we dont do image generation yet + state_dict[f"model.vqmodel.{k}"] = v + + # Write configs + ffn_dim_multiplier = params["ffn_dim_multiplier"] if "ffn_dim_multiplier" in params else 1 + multiple_of = params["multiple_of"] if "multiple_of" in params else 256 + + with open(os.path.join(input_base_path, "tokenizer/text_tokenizer.json")) as tokenizer_file: + tokenizer_config = json.load(tokenizer_file) + vocabulary_map = tokenizer_config["model"]["vocab"] + vocabulary_map[""] = vocabulary_map[ + "" + ] # use a reserved token instead of adding a new one + del vocabulary_map[""] + + for token in tokenizer_config["added_tokens"]: + if token["content"] == "": + token["content"] = "" + + with open(os.path.join(input_base_path, "tokenizer/text_tokenizer_modified.json"), "w") as f: + json.dump(tokenizer_config, f) # save the new file to init tokenizer later + + vq_keys_to_replace = [ + ("ch", "base_channels"), + ("out_ch", "out_channels"), + ("n_embed", "num_embeddings"), + ("ch_mult", "channel_multiplier"), + ("double_z", "double_latent"), + ("z_channels", "latent_channels"), + ] + with open(os.path.join(input_base_path, "tokenizer/vqgan.yaml")) as vqgan_cfg_file: + vq_config = yaml.safe_load(vqgan_cfg_file)["model"]["params"] + vq_config.update(**vq_config["ddconfig"]) + for old, new in vq_keys_to_replace: + vq_config[new] = vq_config[old] + del vq_config["ddconfig"] + del vq_config["ckpt_path"] + del vq_config["lossconfig"] + + config = ChameleonConfig( + hidden_size=dim, + intermediate_size=compute_intermediate_size(dim, ffn_dim_multiplier, multiple_of), + num_attention_heads=params["n_heads"], + num_hidden_layers=params["n_layers"], + rms_norm_eps=params["norm_eps"], + num_key_value_heads=num_key_value_heads, + vocab_size=VOCAB_SIZE, + rope_theta=base, + max_position_embeddings=max_position_embeddings, + model_parallel_size=model_parallel_size, + swin_norm=swin_norm, + vq_config=vq_config, + vocabulary_map=vocabulary_map, + ) + with init_empty_weights(): + model = ChameleonForConditionalGeneration(config) + + model.load_state_dict(state_dict, assign=True, strict=False) + model.save_pretrained(model_path, safe_serialization=True) + + # Load and save the processor + tokenizer = LlamaTokenizerFast( + tokenizer_file=os.path.join(input_base_path, "tokenizer/text_tokenizer_modified.json"), legacy=False + ) + tokenizer.sep_token_id = 8710 # assign to sep so that we can append it after input text + tokenizer.pad_token_id = 1 # assing to special pad_token + image_processor = ChameleonImageProcessor() + processor = ChameleonProcessor(image_processor=image_processor, tokenizer=tokenizer) + processor.save_pretrained(model_path) + + # Make space so we can load the model properly now. + del state_dict + del loaded + del vqgan_state_dict + gc.collect() + + # Short inference on a few examples to check if generation makes sense + # taken from https://github.com/facebookresearch/chameleon/blob/7a72f40aa5f462965c8374f25257f55b65b25ff4/data/prompts_for_human_evaluations.jsonl + print("Loading the checkpoint in a Chameleon model...") + print("*" * 100) + model = ChameleonForConditionalGeneration.from_pretrained( + model_path, attn_implementation="eager", torch_dtype=torch.bfloat16, device_map="auto" + ) + processor = ChameleonProcessor.from_pretrained(model_path) + + prompt = "I'm very intrigued by this work of art:Please tell me about the artist." + image = Image.open( + requests.get( + "https://uploads4.wikiart.org/images/paul-klee/death-for-the-idea-1915.jpg!Large.jpg", stream=True + ).raw + ) + inputs = processor(prompt, images=image, return_tensors="pt").to(model.device, torch.bfloat16) + length = inputs.input_ids.shape[1] + + out = model.generate(**inputs, max_new_tokens=40, do_sample=False) + generated_text = processor.batch_decode(out[:, length:], skip_special_tokens=True)[0] + + print(f"Generation for single-image: {generated_text}") + print("*" * 100) + + # Multi-image example + prompt = "I used to know a lot about constellations when I was younger, but as I grew older, I forgot most of what I knew. These are the only two constellations that I really remember now.I would like for you to tell me about 3 more constellations and give me a little bit of history about the constellation." + image = Image.open( + requests.get("https://nineplanets.org/wp-content/uploads/2020/12/the-big-dipper-1.jpg", stream=True).raw + ) + image_2 = Image.open( + requests.get("https://www.kxan.com/wp-content/uploads/sites/40/2020/10/ORION.jpg", stream=True).raw + ) + + inputs = processor(prompt, images=[image, image_2], return_tensors="pt").to(model.device, dtype=torch.bfloat16) + length = inputs.input_ids.shape[1] + out = model.generate(**inputs, max_new_tokens=50, do_sample=False) + generated_text = processor.batch_decode(out[:, length:], skip_special_tokens=True)[0] + + print(f"Generation for multi-image: {generated_text}") + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--input_dir", + help="Location of Chameleon weights", + ) + parser.add_argument( + "--model_size", + choices=["7B", "30B"], + help="" + " models correspond to the finetuned versions, and are specific to the Chameleon official release. For more details on Chameleon, checkout the original repo: https://github.com/facebookresearch/chameleon", + ) + parser.add_argument( + "--output_dir", + help="Location to write HF model", + ) + parser.add_argument( + "--test_inference", + action="store_true", + help="Whether to load the model for generation to test it's converted correctly.", + ) + # Different Chameleon versions used different default values for max_position_embeddings, hence the need to be able to specify which version is being used. + parser.add_argument( + "--chameleon_version", + choices=[1], + default=1, + type=int, + help="Version of the Chameleon model to convert", + ) + args = parser.parse_args() + write_model( + model_path=args.output_dir, + input_base_path=args.input_dir, + model_size=args.model_size, + chameleon_version=args.chameleon_version, + ) + + +if __name__ == "__main__": + main() diff --git a/docs/transformers/build/lib/transformers/models/chameleon/image_processing_chameleon.py b/docs/transformers/build/lib/transformers/models/chameleon/image_processing_chameleon.py new file mode 100644 index 0000000000000000000000000000000000000000..2d1417a8ee80a6aa8ade8ffd4b1b86e7c2dc448e --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/chameleon/image_processing_chameleon.py @@ -0,0 +1,344 @@ +# coding=utf-8 +# Copyright 2024 Meta Inc. and The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Image processor class for Chameleon.""" + +from typing import Dict, List, Optional, Union + +import numpy as np + +from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict +from ...image_transforms import ( + get_resize_output_image_size, + resize, + to_channel_dimension_format, +) +from ...image_utils import ( + ChannelDimension, + ImageInput, + PILImageResampling, + infer_channel_dimension_format, + is_scaled_image, + make_flat_list_of_images, + to_numpy_array, + valid_images, + validate_preprocess_arguments, +) +from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging + + +logger = logging.get_logger(__name__) + +if is_vision_available(): + import PIL + + +class ChameleonImageProcessor(BaseImageProcessor): + r""" + Constructs a Chameleon image processor. + + Args: + do_resize (`bool`, *optional*, defaults to `True`): + Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by + `do_resize` in the `preprocess` method. + size (`Dict[str, int]` *optional*, defaults to `{"shortest_edge": 512}`): + Size of the image after resizing. The shortest edge of the image is resized to size["shortest_edge"], with + the longest edge resized to keep the input aspect ratio. Can be overridden by `size` in the `preprocess` + method. + resample (`PILImageResampling`, *optional*, defaults to 1): + Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method. + do_center_crop (`bool`, *optional*, defaults to `True`): + Whether to center crop the image to the specified `crop_size`. Can be overridden by `do_center_crop` in the + `preprocess` method. + crop_size (`Dict[str, int]` *optional*, defaults to {"height": 512, "width": 512}): + Size of the output image after applying `center_crop`. Can be overridden by `crop_size` in the `preprocess` + method. + do_rescale (`bool`, *optional*, defaults to `True`): + Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in + the `preprocess` method. + rescale_factor (`int` or `float`, *optional*, defaults to 0.0078): + Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess` + method. + do_normalize (`bool`, *optional*, defaults to `True`): + Whether to normalize the image. Can be overridden by `do_normalize` in the `preprocess` method. + image_mean (`float` or `List[float]`, *optional*, defaults to `[1.0, 1.0, 1.0]`): + Mean to use if normalizing the image. This is a float or list of floats the length of the number of + channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. + image_std (`float` or `List[float]`, *optional*, defaults to `[1.0, 1.0, 1.0]`): + Standard deviation to use if normalizing the image. This is a float or list of floats the length of the + number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method. + Can be overridden by the `image_std` parameter in the `preprocess` method. + do_convert_rgb (`bool`, *optional*, defaults to `True`): + Whether to convert the image to RGB. + """ + + model_input_names = ["pixel_values"] + + def __init__( + self, + do_resize: bool = True, + size: Dict[str, int] = None, + resample: PILImageResampling = PIL.Image.LANCZOS, + do_center_crop: bool = True, + crop_size: Dict[str, int] = None, + do_rescale: bool = True, + rescale_factor: Union[int, float] = 0.0078, + do_normalize: bool = True, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: bool = True, + **kwargs, + ) -> None: + super().__init__(**kwargs) + size = size if size is not None else {"shortest_edge": 512} + size = get_size_dict(size, default_to_square=False) + crop_size = crop_size if crop_size is not None else {"height": 512, "width": 512} + crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size") + + self.do_resize = do_resize + self.size = size + self.resample = resample + self.do_center_crop = do_center_crop + self.crop_size = crop_size + self.do_rescale = do_rescale + self.rescale_factor = rescale_factor + self.do_normalize = do_normalize + self.image_mean = image_mean if image_mean is not None else [1.0, 1.0, 1.0] + self.image_std = image_std if image_std is not None else [1.0, 1.0, 1.0] + self.do_convert_rgb = do_convert_rgb + + # Copied from transformers.models.clip.image_processing_clip.CLIPImageProcessor.resize + def resize( + self, + image: np.ndarray, + size: Dict[str, int], + resample: PILImageResampling = PILImageResampling.BICUBIC, + data_format: Optional[Union[str, ChannelDimension]] = None, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + **kwargs, + ) -> np.ndarray: + """ + Resize an image. The shortest edge of the image is resized to size["shortest_edge"], with the longest edge + resized to keep the input aspect ratio. + + Args: + image (`np.ndarray`): + Image to resize. + size (`Dict[str, int]`): + Size of the output image. + resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`): + Resampling filter to use when resiizing the image. + data_format (`str` or `ChannelDimension`, *optional*): + The channel dimension format of the image. If not provided, it will be the same as the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format of the input image. If not provided, it will be inferred. + """ + default_to_square = True + if "shortest_edge" in size: + size = size["shortest_edge"] + default_to_square = False + elif "height" in size and "width" in size: + size = (size["height"], size["width"]) + else: + raise ValueError("Size must contain either 'shortest_edge' or 'height' and 'width'.") + + output_size = get_resize_output_image_size( + image, + size=size, + default_to_square=default_to_square, + input_data_format=input_data_format, + ) + return resize( + image, + size=output_size, + resample=resample, + data_format=data_format, + input_data_format=input_data_format, + **kwargs, + ) + + @filter_out_non_signature_kwargs() + def preprocess( + self, + images: ImageInput, + do_resize: Optional[bool] = None, + size: Dict[str, int] = None, + resample: PILImageResampling = None, + do_center_crop: Optional[bool] = None, + crop_size: Optional[int] = None, + do_rescale: Optional[bool] = None, + rescale_factor: Optional[float] = None, + do_normalize: Optional[bool] = None, + image_mean: Optional[Union[float, List[float]]] = None, + image_std: Optional[Union[float, List[float]]] = None, + do_convert_rgb: Optional[bool] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + data_format: Optional[ChannelDimension] = ChannelDimension.FIRST, + input_data_format: Optional[Union[str, ChannelDimension]] = None, + ) -> PIL.Image.Image: + """ + Preprocess an image or batch of images. + + Args: + images (`ImageInput`): + Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If + passing in images with pixel values between 0 and 1, set `do_rescale=False`. + do_resize (`bool`, *optional*, defaults to `self.do_resize`): + Whether to resize the image. + size (`Dict[str, int]`, *optional*, defaults to `self.size`): + Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with + the longest edge resized to keep the input aspect ratio. + resample (`int`, *optional*, defaults to `self.resample`): + Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only + has an effect if `do_resize` is set to `True`. + do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`): + Whether to center crop the image. + crop_size (`Dict[str, int]`, *optional*, defaults to `self.crop_size`): + Size of the center crop. Only has an effect if `do_center_crop` is set to `True`. + do_rescale (`bool`, *optional*, defaults to `self.do_rescale`): + Whether to rescale the image. + rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`): + Rescale factor to rescale the image by if `do_rescale` is set to `True`. + do_normalize (`bool`, *optional*, defaults to `self.do_normalize`): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*, defaults to `self.image_mean`): + Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`. + image_std (`float` or `List[float]`, *optional*, defaults to `self.image_std`): + Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to + `True`. + do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`): + Whether to convert the image to RGB. + return_tensors (`str` or `TensorType`, *optional*): + The type of tensors to return. Can be one of: + - Unset: Return a list of `np.ndarray`. + - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`. + - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`. + - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`. + - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`. + data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`): + The channel dimension format for the output image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - Unset: Use the channel dimension format of the input image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. If unset, the channel dimension format is inferred + from the input image. Can be one of: + - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format. + - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format. + - `"none"` or `ChannelDimension.NONE`: image in (height, width) format. + """ + do_resize = do_resize if do_resize is not None else self.do_resize + size = size if size is not None else self.size + size = get_size_dict(size, param_name="size", default_to_square=False) + resample = resample if resample is not None else self.resample + do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop + crop_size = crop_size if crop_size is not None else self.crop_size + crop_size = get_size_dict(crop_size, param_name="crop_size", default_to_square=True) + do_rescale = do_rescale if do_rescale is not None else self.do_rescale + rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor + do_normalize = do_normalize if do_normalize is not None else self.do_normalize + image_mean = image_mean if image_mean is not None else self.image_mean + image_std = image_std if image_std is not None else self.image_std + do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb + + images = make_flat_list_of_images(images) + + if not valid_images(images): + raise ValueError( + "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, " + "torch.Tensor, tf.Tensor or jax.ndarray." + ) + + validate_preprocess_arguments( + do_rescale=do_rescale, + rescale_factor=rescale_factor, + do_normalize=do_normalize, + image_mean=image_mean, + image_std=image_std, + do_center_crop=do_center_crop, + crop_size=crop_size, + do_resize=do_resize, + size=size, + resample=resample, + ) + + if do_convert_rgb: + images = [self.blend_rgba(image) for image in images] + + # All transformations expect numpy arrays. + images = [to_numpy_array(image) for image in images] + + if do_rescale and is_scaled_image(images[0]): + logger.warning_once( + "It looks like you are trying to rescale already rescaled images. If the input" + " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again." + ) + + if input_data_format is None: + # We assume that all images have the same channel dimension format. + input_data_format = infer_channel_dimension_format(images[0]) + all_images = [] + for image in images: + if do_resize: + image = self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format) + + if do_center_crop: + image = self.center_crop(image=image, size=crop_size, input_data_format=input_data_format) + + if do_rescale: + image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format) + + if do_normalize: + image = self.normalize( + image=image, mean=image_mean, std=image_std, input_data_format=input_data_format + ) + + all_images.append(image) + images = [ + to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) + for image in all_images + ] + + data = {"pixel_values": images} + return BatchFeature(data=data, tensor_type=return_tensors) + + def blend_rgba(self, image: ImageInput) -> ImageInput: + """ + Convert image to RGB by blending the transparency layer if it's in RGBA format. + If image is not `PIL.Image`, it si simply returned without modifications. + + Args: + image (`ImageInput`): + Image to convert. + """ + + if not isinstance(image, PIL.Image.Image): + return image + elif image.mode == "RGB": + return image + + img_rgba = np.array(image.convert("RGBA")) + + # If there is no transparency layer, simple convert and return. + if not (img_rgba[:, :, 3] < 255).any(): + return image.convert("RGB") + + # There is a transparency layer, blend it with a white background. + # Calculate the alpha proportion for blending. + alpha = img_rgba[:, :, 3] / 255.0 + img_rgb = (1 - alpha[:, :, np.newaxis]) * 255 + alpha[:, :, np.newaxis] * img_rgba[:, :, :3] + return PIL.Image.fromarray(img_rgb.astype("uint8"), "RGB") + + +__all__ = ["ChameleonImageProcessor"] diff --git a/docs/transformers/build/lib/transformers/models/chinese_clip/__init__.py b/docs/transformers/build/lib/transformers/models/chinese_clip/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..9c4476b9080ff514fe273fde33d96b5c0edaba2b --- /dev/null +++ b/docs/transformers/build/lib/transformers/models/chinese_clip/__init__.py @@ -0,0 +1,31 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import TYPE_CHECKING + +from ...utils import _LazyModule +from ...utils.import_utils import define_import_structure + + +if TYPE_CHECKING: + from .configuration_chinese_clip import * + from .feature_extraction_chinese_clip import * + from .image_processing_chinese_clip import * + from .image_processing_chinese_clip_fast import * + from .modeling_chinese_clip import * + from .processing_chinese_clip import * +else: + import sys + + _file = globals()["__file__"] + sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__) diff --git a/docs/transformers/build/lib/transformers/optimization.py b/docs/transformers/build/lib/transformers/optimization.py new file mode 100644 index 0000000000000000000000000000000000000000..98e07f78f715a539510dd1c131fb03be623103aa --- /dev/null +++ b/docs/transformers/build/lib/transformers/optimization.py @@ -0,0 +1,890 @@ +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""PyTorch optimization for BERT model.""" + +import math +import warnings +from functools import partial +from typing import Optional, Union + +import torch +from torch.optim import Optimizer +from torch.optim.lr_scheduler import LambdaLR, ReduceLROnPlateau + +from .trainer_pt_utils import LayerWiseDummyOptimizer, LayerWiseDummyScheduler +from .trainer_utils import SchedulerType +from .utils import logging + + +logger = logging.get_logger(__name__) + + +def _get_constant_lambda(_=None): + return 1 + + +def get_constant_schedule(optimizer: Optimizer, last_epoch: int = -1): + """ + Create a schedule with a constant learning rate, using the learning rate set in optimizer. + + Args: + optimizer ([`~torch.optim.Optimizer`]): + The optimizer for which to schedule the learning rate. + last_epoch (`int`, *optional*, defaults to -1): + The index of the last epoch when resuming training. + + Return: + `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule. + """ + + return LambdaLR(optimizer, _get_constant_lambda, last_epoch=last_epoch) + + +def get_reduce_on_plateau_schedule(optimizer: Optimizer, **kwargs): + """ + Create a schedule with a constant learning rate that decreases when a metric has stopped improving. + + Args: + optimizer ([`~torch.optim.Optimizer`]): + The optimizer for which to schedule the learning rate. + kwargs (`dict`, *optional*): + Extra parameters to be passed to the scheduler. See `torch.optim.lr_scheduler.ReduceLROnPlateau` + for possible parameters. + + Return: + `torch.optim.lr_scheduler.ReduceLROnPlateau` with the appropriate schedule. + """ + + return ReduceLROnPlateau(optimizer, **kwargs) + + +def _get_constant_schedule_with_warmup_lr_lambda(current_step: int, *, num_warmup_steps: int): + if current_step < num_warmup_steps: + return float(current_step) / float(max(1.0, num_warmup_steps)) + return 1.0 + + +def get_constant_schedule_with_warmup(optimizer: Optimizer, num_warmup_steps: int, last_epoch: int = -1): + """ + Create a schedule with a constant learning rate preceded by a warmup period during which the learning rate + increases linearly between 0 and the initial lr set in the optimizer. + + Args: + optimizer ([`~torch.optim.Optimizer`]): + The optimizer for which to schedule the learning rate. + num_warmup_steps (`int`): + The number of steps for the warmup phase. + last_epoch (`int`, *optional*, defaults to -1): + The index of the last epoch when resuming training. + + Return: + `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule. + """ + + lr_lambda = partial(_get_constant_schedule_with_warmup_lr_lambda, num_warmup_steps=num_warmup_steps) + return LambdaLR(optimizer, lr_lambda, last_epoch=last_epoch) + + +def _get_linear_schedule_with_warmup_lr_lambda(current_step: int, *, num_warmup_steps: int, num_training_steps: int): + if current_step < num_warmup_steps: + return float(current_step) / float(max(1, num_warmup_steps)) + return max(0.0, float(num_training_steps - current_step) / float(max(1, num_training_steps - num_warmup_steps))) + + +def get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_training_steps, last_epoch=-1): + """ + Create a schedule with a learning rate that decreases linearly from the initial lr set in the optimizer to 0, after + a warmup period during which it increases linearly from 0 to the initial lr set in the optimizer. + + Args: + optimizer ([`~torch.optim.Optimizer`]): + The optimizer for which to schedule the learning rate. + num_warmup_steps (`int`): + The number of steps for the warmup phase. + num_training_steps (`int`): + The total number of training steps. + last_epoch (`int`, *optional*, defaults to -1): + The index of the last epoch when resuming training. + + Return: + `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule. + """ + + lr_lambda = partial( + _get_linear_schedule_with_warmup_lr_lambda, + num_warmup_steps=num_warmup_steps, + num_training_steps=num_training_steps, + ) + return LambdaLR(optimizer, lr_lambda, last_epoch) + + +def _get_cosine_schedule_with_warmup_lr_lambda( + current_step: int, *, num_warmup_steps: int, num_training_steps: int, num_cycles: float +): + if current_step < num_warmup_steps: + return float(current_step) / float(max(1, num_warmup_steps)) + progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps)) + return max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress))) + + +def get_cosine_schedule_with_warmup( + optimizer: Optimizer, num_warmup_steps: int, num_training_steps: int, num_cycles: float = 0.5, last_epoch: int = -1 +): + """ + Create a schedule with a learning rate that decreases following the values of the cosine function between the + initial lr set in the optimizer to 0, after a warmup period during which it increases linearly between 0 and the + initial lr set in the optimizer. + + Args: + optimizer ([`~torch.optim.Optimizer`]): + The optimizer for which to schedule the learning rate. + num_warmup_steps (`int`): + The number of steps for the warmup phase. + num_training_steps (`int`): + The total number of training steps. + num_cycles (`float`, *optional*, defaults to 0.5): + The number of waves in the cosine schedule (the defaults is to just decrease from the max value to 0 + following a half-cosine). + last_epoch (`int`, *optional*, defaults to -1): + The index of the last epoch when resuming training. + + Return: + `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule. + """ + + lr_lambda = partial( + _get_cosine_schedule_with_warmup_lr_lambda, + num_warmup_steps=num_warmup_steps, + num_training_steps=num_training_steps, + num_cycles=num_cycles, + ) + return LambdaLR(optimizer, lr_lambda, last_epoch) + + +def _get_cosine_with_hard_restarts_schedule_with_warmup_lr_lambda( + current_step: int, *, num_warmup_steps: int, num_training_steps: int, num_cycles: int +): + if current_step < num_warmup_steps: + return float(current_step) / float(max(1, num_warmup_steps)) + progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps)) + if progress >= 1.0: + return 0.0 + return max(0.0, 0.5 * (1.0 + math.cos(math.pi * ((float(num_cycles) * progress) % 1.0)))) + + +def get_cosine_with_hard_restarts_schedule_with_warmup( + optimizer: Optimizer, num_warmup_steps: int, num_training_steps: int, num_cycles: int = 1, last_epoch: int = -1 +): + """ + Create a schedule with a learning rate that decreases following the values of the cosine function between the + initial lr set in the optimizer to 0, with several hard restarts, after a warmup period during which it increases + linearly between 0 and the initial lr set in the optimizer. + + Args: + optimizer ([`~torch.optim.Optimizer`]): + The optimizer for which to schedule the learning rate. + num_warmup_steps (`int`): + The number of steps for the warmup phase. + num_training_steps (`int`): + The total number of training steps. + num_cycles (`int`, *optional*, defaults to 1): + The number of hard restarts to use. + last_epoch (`int`, *optional*, defaults to -1): + The index of the last epoch when resuming training. + + Return: + `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule. + """ + + lr_lambda = partial( + _get_cosine_with_hard_restarts_schedule_with_warmup_lr_lambda, + num_warmup_steps=num_warmup_steps, + num_training_steps=num_training_steps, + num_cycles=num_cycles, + ) + return LambdaLR(optimizer, lr_lambda, last_epoch) + + +def _get_polynomial_decay_schedule_with_warmup_lr_lambda( + current_step: int, + *, + num_warmup_steps: int, + num_training_steps: int, + lr_end: float, + power: float, + lr_init: int, +): + if current_step < num_warmup_steps: + return float(current_step) / float(max(1, num_warmup_steps)) + elif current_step > num_training_steps: + return lr_end / lr_init # as LambdaLR multiplies by lr_init + else: + lr_range = lr_init - lr_end + decay_steps = num_training_steps - num_warmup_steps + pct_remaining = 1 - (current_step - num_warmup_steps) / decay_steps + decay = lr_range * pct_remaining**power + lr_end + return decay / lr_init # as LambdaLR multiplies by lr_init + + +def get_polynomial_decay_schedule_with_warmup( + optimizer, num_warmup_steps, num_training_steps, lr_end=1e-7, power=1.0, last_epoch=-1 +): + """ + Create a schedule with a learning rate that decreases as a polynomial decay from the initial lr set in the + optimizer to end lr defined by *lr_end*, after a warmup period during which it increases linearly from 0 to the + initial lr set in the optimizer. + + Args: + optimizer ([`~torch.optim.Optimizer`]): + The optimizer for which to schedule the learning rate. + num_warmup_steps (`int`): + The number of steps for the warmup phase. + num_training_steps (`int`): + The total number of training steps. + lr_end (`float`, *optional*, defaults to 1e-7): + The end LR. + power (`float`, *optional*, defaults to 1.0): + Power factor. + last_epoch (`int`, *optional*, defaults to -1): + The index of the last epoch when resuming training. + + Note: *power* defaults to 1.0 as in the fairseq implementation, which in turn is based on the original BERT + implementation at + https://github.com/google-research/bert/blob/f39e881b169b9d53bea03d2d341b31707a6c052b/optimization.py#L37 + + Return: + `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule. + + """ + + lr_init = optimizer.defaults["lr"] + if not (lr_init > lr_end): + raise ValueError(f"lr_end ({lr_end}) must be smaller than initial lr ({lr_init})") + + lr_lambda = partial( + _get_polynomial_decay_schedule_with_warmup_lr_lambda, + num_warmup_steps=num_warmup_steps, + num_training_steps=num_training_steps, + lr_end=lr_end, + power=power, + lr_init=lr_init, + ) + return LambdaLR(optimizer, lr_lambda, last_epoch) + + +def _get_inverse_sqrt_schedule_lr_lambda(current_step: int, *, num_warmup_steps: int, timescale: Optional[int] = None): + if current_step < num_warmup_steps: + return float(current_step) / float(max(1, num_warmup_steps)) + shift = timescale - num_warmup_steps + decay = 1.0 / math.sqrt((current_step + shift) / timescale) + return decay + + +def get_inverse_sqrt_schedule( + optimizer: Optimizer, num_warmup_steps: int, timescale: Optional[int] = None, last_epoch: int = -1 +): + """ + Create a schedule with an inverse square-root learning rate, from the initial lr set in the optimizer, after a + warmup period which increases lr linearly from 0 to the initial lr set in the optimizer. + + Args: + optimizer ([`~torch.optim.Optimizer`]): + The optimizer for which to schedule the learning rate. + num_warmup_steps (`int`): + The number of steps for the warmup phase. + timescale (`int`, *optional*, defaults to `num_warmup_steps`): + Time scale. + last_epoch (`int`, *optional*, defaults to -1): + The index of the last epoch when resuming training. + + Return: + `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule. + """ + # Note: this implementation is adapted from + # https://github.com/google-research/big_vision/blob/f071ce68852d56099437004fd70057597a95f6ef/big_vision/utils.py#L930 + + if timescale is None: + timescale = num_warmup_steps or 10_000 + + lr_lambda = partial(_get_inverse_sqrt_schedule_lr_lambda, num_warmup_steps=num_warmup_steps, timescale=timescale) + return LambdaLR(optimizer, lr_lambda, last_epoch=last_epoch) + + +def _get_cosine_schedule_with_warmup_lr_lambda( + current_step: int, *, num_warmup_steps: int, num_training_steps: int, num_cycles: float, min_lr_rate: float = 0.0 +): + if current_step < num_warmup_steps: + return float(current_step) / float(max(1, num_warmup_steps)) + progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps)) + factor = 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)) + factor = factor * (1 - min_lr_rate) + min_lr_rate + return max(0, factor) + + +def get_cosine_with_min_lr_schedule_with_warmup( + optimizer: Optimizer, + num_warmup_steps: int, + num_training_steps: int, + num_cycles: float = 0.5, + last_epoch: int = -1, + min_lr: Optional[float] = None, + min_lr_rate: Optional[float] = None, +): + """ + Create a schedule with a learning rate that decreases following the values of the cosine function between the + initial lr set in the optimizer to min_lr, after a warmup period during which it increases linearly between 0 and the + initial lr set in the optimizer. + + Args: + optimizer ([`~torch.optim.Optimizer`]): + The optimizer for which to schedule the learning rate. + num_warmup_steps (`int`): + The number of steps for the warmup phase. + num_training_steps (`int`): + The total number of training steps. + num_cycles (`float`, *optional*, defaults to 0.5): + The number of waves in the cosine schedule (the defaults is to just decrease from the max value to 0 + following a half-cosine). + last_epoch (`int`, *optional*, defaults to -1): + The index of the last epoch when resuming training. + min_lr (`float`, *optional*): + The minimum learning rate to reach after the cosine schedule. + min_lr_rate (`float`, *optional*): + The minimum learning rate as a ratio of the initial learning rate. If set, `min_lr` should not be set. + + Return: + `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule. + """ + + if min_lr is not None and min_lr_rate is not None: + raise ValueError("Only one of min_lr or min_lr_rate should be set") + elif min_lr is not None: + min_lr_rate = min_lr / optimizer.defaults["lr"] + elif min_lr_rate is None: + raise ValueError("One of min_lr or min_lr_rate should be set through the `lr_scheduler_kwargs`") + + lr_lambda = partial( + _get_cosine_schedule_with_warmup_lr_lambda, + num_warmup_steps=num_warmup_steps, + num_training_steps=num_training_steps, + num_cycles=num_cycles, + min_lr_rate=min_lr_rate, + ) + return LambdaLR(optimizer, lr_lambda, last_epoch) + + +def _get_wsd_scheduler_lambda( + current_step: int, + *, + num_warmup_steps: int, + num_stable_steps: int, + num_decay_steps: int, + warmup_type: str, + decay_type: str, + min_lr_ratio: float, + num_cycles: float, +): + if current_step < num_warmup_steps: + progress = float(current_step) / float(max(1, num_warmup_steps)) + if warmup_type == "linear": + factor = progress + elif warmup_type == "cosine": + factor = 0.5 * (1.0 - math.cos(math.pi * progress)) + elif warmup_type == "1-sqrt": + factor = 1.0 - math.sqrt(1.0 - progress) + factor = factor * (1.0 - min_lr_ratio) + min_lr_ratio + return max(0.0, factor) + + if current_step < num_warmup_steps + num_stable_steps: + return 1.0 + + if current_step < num_warmup_steps + num_stable_steps + num_decay_steps: + progress = float(current_step - num_warmup_steps - num_stable_steps) / float(max(1, num_decay_steps)) + if decay_type == "linear": + factor = 1.0 - progress + elif decay_type == "cosine": + factor = 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)) + elif decay_type == "1-sqrt": + factor = 1.0 - math.sqrt(progress) + factor = factor * (1.0 - min_lr_ratio) + min_lr_ratio + return max(0.0, factor) + return min_lr_ratio + + +def get_wsd_schedule( + optimizer: Optimizer, + num_warmup_steps: int, + num_decay_steps: int, + num_training_steps: Optional[int] = None, + num_stable_steps: Optional[int] = None, + warmup_type: str = "linear", + decay_type: str = "cosine", + min_lr_ratio: float = 0, + num_cycles: float = 0.5, + last_epoch: int = -1, +): + """ + Create a schedule with a learning rate that has three stages: + 1. warmup: increase from min_lr_ratio times the initial learning rate to the initial learning rate following a warmup_type. + 2. stable: constant learning rate. + 3. decay: decrease from the initial learning rate to min_lr_ratio times the initial learning rate following a decay_type. + + Args: + optimizer ([`~torch.optim.Optimizer`]): + The optimizer for which to schedule the learning rate. + num_warmup_steps (`int`): + The number of steps for the warmup phase. + num_decay_steps (`int`): + The number of steps for the decay phase. + num_training_steps (`int`, *optional*): + The total number of training steps. This is the sum of the warmup, stable and decay steps. If `num_stable_steps` is not provided, the stable phase will be `num_training_steps - num_warmup_steps - num_decay_steps`. + num_stable_steps (`int`, *optional*): + The number of steps for the stable phase. Please ensure that `num_warmup_steps + num_stable_steps + num_decay_steps` equals `num_training_steps`, otherwise the other steps will default to the minimum learning rate. + warmup_type (`str`, *optional*, defaults to "linear"): + The type of warmup to use. Can be 'linear', 'cosine' or '1-sqrt'. + decay_type (`str`, *optional*, defaults to "cosine"): + The type of decay to use. Can be 'linear', 'cosine' or '1-sqrt'. + min_lr_ratio (`float`, *optional*, defaults to 0): + The minimum learning rate as a ratio of the initial learning rate. + num_cycles (`float`, *optional*, defaults to 0.5): + The number of waves in the cosine schedule (the defaults is to just decrease from the max value to 0 + following a half-cosine). + last_epoch (`int`, *optional*, defaults to -1): + The index of the last epoch when resuming training. + + Return: + `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule. + """ + + if num_training_steps is None and num_stable_steps is None: + raise ValueError("Either num_training_steps or num_stable_steps must be specified.") + + if num_training_steps is not None and num_stable_steps is not None: + warnings.warn("Both num_training_steps and num_stable_steps are specified. num_stable_steps will be used.") + + if warmup_type not in ["linear", "cosine", "1-sqrt"]: + raise ValueError(f"Unknown warmup type: {warmup_type}, expected 'linear', 'cosine' or '1-sqrt'") + + if decay_type not in ["linear", "cosine", "1-sqrt"]: + raise ValueError(f"Unknown decay type: {decay_type}, expected 'linear', 'cosine' or '1-sqrt'") + + if num_stable_steps is None: + num_stable_steps = num_training_steps - num_warmup_steps - num_decay_steps + + lr_lambda = partial( + _get_wsd_scheduler_lambda, + num_warmup_steps=num_warmup_steps, + num_stable_steps=num_stable_steps, + num_decay_steps=num_decay_steps, + warmup_type=warmup_type, + decay_type=decay_type, + min_lr_ratio=min_lr_ratio, + num_cycles=num_cycles, + ) + return LambdaLR(optimizer, lr_lambda, last_epoch) + + +TYPE_TO_SCHEDULER_FUNCTION = { + SchedulerType.LINEAR: get_linear_schedule_with_warmup, + SchedulerType.COSINE: get_cosine_schedule_with_warmup, + SchedulerType.COSINE_WITH_RESTARTS: get_cosine_with_hard_restarts_schedule_with_warmup, + SchedulerType.POLYNOMIAL: get_polynomial_decay_schedule_with_warmup, + SchedulerType.CONSTANT: get_constant_schedule, + SchedulerType.CONSTANT_WITH_WARMUP: get_constant_schedule_with_warmup, + SchedulerType.INVERSE_SQRT: get_inverse_sqrt_schedule, + SchedulerType.REDUCE_ON_PLATEAU: get_reduce_on_plateau_schedule, + SchedulerType.COSINE_WITH_MIN_LR: get_cosine_with_min_lr_schedule_with_warmup, + SchedulerType.WARMUP_STABLE_DECAY: get_wsd_schedule, +} + + +def get_scheduler( + name: Union[str, SchedulerType], + optimizer: Optimizer, + num_warmup_steps: Optional[int] = None, + num_training_steps: Optional[int] = None, + scheduler_specific_kwargs: Optional[dict] = None, +): + """ + Unified API to get any scheduler from its name. + + Args: + name (`str` or `SchedulerType`): + The name of the scheduler to use. + optimizer (`torch.optim.Optimizer`): + The optimizer that will be used during training. + num_warmup_steps (`int`, *optional*): + The number of warmup steps to do. This is not required by all schedulers (hence the argument being + optional), the function will raise an error if it's unset and the scheduler type requires it. + num_training_steps (`int``, *optional*): + The number of training steps to do. This is not required by all schedulers (hence the argument being + optional), the function will raise an error if it's unset and the scheduler type requires it. + scheduler_specific_kwargs (`dict`, *optional*): + Extra parameters for schedulers such as cosine with restarts. Mismatched scheduler types and scheduler + parameters will cause the scheduler function to raise a TypeError. + """ + name = SchedulerType(name) + schedule_func = TYPE_TO_SCHEDULER_FUNCTION[name] + + # If a `LayerWiseDummyOptimizer` is passed we extract the optimizer dict and + # recursively call `get_scheduler` to get the proper schedulers on each parameter + if optimizer is not None and isinstance(optimizer, LayerWiseDummyOptimizer): + optimizer_dict = optimizer.optimizer_dict + scheduler_dict = {} + + for param in optimizer_dict.keys(): + scheduler_dict[param] = get_scheduler( + name, + optimizer=optimizer_dict[param], + num_warmup_steps=num_warmup_steps, + num_training_steps=num_training_steps, + ) + + def scheduler_hook(param): + # Since the optimizer hook has been already attached we only need to + # attach the scheduler hook, the gradients have been zeroed here + scheduler_dict[param].step() + + for param in optimizer_dict.keys(): + if param.requires_grad: + param.register_post_accumulate_grad_hook(scheduler_hook) + + return LayerWiseDummyScheduler(optimizer_dict=optimizer_dict, lr=optimizer.defaults["lr"]) + + if name == SchedulerType.CONSTANT: + return schedule_func(optimizer) + + if scheduler_specific_kwargs is None: + scheduler_specific_kwargs = {} + + if name == SchedulerType.REDUCE_ON_PLATEAU: + return schedule_func(optimizer, **scheduler_specific_kwargs) + + # All other schedulers require `num_warmup_steps` + if num_warmup_steps is None: + raise ValueError(f"{name} requires `num_warmup_steps`, please provide that argument.") + + if name == SchedulerType.CONSTANT_WITH_WARMUP: + return schedule_func(optimizer, num_warmup_steps=num_warmup_steps) + + if name == SchedulerType.INVERSE_SQRT: + return schedule_func(optimizer, num_warmup_steps=num_warmup_steps) + + # wsd scheduler requires either num_training_steps or num_stable_steps + if name == SchedulerType.WARMUP_STABLE_DECAY: + return schedule_func( + optimizer, + num_warmup_steps=num_warmup_steps, + num_training_steps=num_training_steps, + **scheduler_specific_kwargs, + ) + + # All other schedulers require `num_training_steps` + if num_training_steps is None: + raise ValueError(f"{name} requires `num_training_steps`, please provide that argument.") + + return schedule_func( + optimizer, + num_warmup_steps=num_warmup_steps, + num_training_steps=num_training_steps, + **scheduler_specific_kwargs, + ) + + +class Adafactor(Optimizer): + """ + AdaFactor pytorch implementation can be used as a drop in replacement for Adam original fairseq code: + https://github.com/pytorch/fairseq/blob/master/fairseq/optim/adafactor.py + + Paper: *Adafactor: Adaptive Learning Rates with Sublinear Memory Cost* https://arxiv.org/abs/1804.04235 Note that + this optimizer internally adjusts the learning rate depending on the `scale_parameter`, `relative_step` and + `warmup_init` options. To use a manual (external) learning rate schedule you should set `scale_parameter=False` and + `relative_step=False`. + + Arguments: + params (`Iterable[nn.parameter.Parameter]`): + Iterable of parameters to optimize or dictionaries defining parameter groups. + lr (`float`, *optional*): + The external learning rate. + eps (`Tuple[float, float]`, *optional*, defaults to `(1e-30, 0.001)`): + Regularization constants for square gradient and parameter scale respectively + clip_threshold (`float`, *optional*, defaults to 1.0): + Threshold of root mean square of final gradient update + decay_rate (`float`, *optional*, defaults to -0.8): + Coefficient used to compute running averages of square + beta1 (`float`, *optional*): + Coefficient used for computing running averages of gradient + weight_decay (`float`, *optional*, defaults to 0.0): + Weight decay (L2 penalty) + scale_parameter (`bool`, *optional*, defaults to `True`): + If True, learning rate is scaled by root mean square + relative_step (`bool`, *optional*, defaults to `True`): + If True, time-dependent learning rate is computed instead of external learning rate + warmup_init (`bool`, *optional*, defaults to `False`): + Time-dependent learning rate computation depends on whether warm-up initialization is being used + + This implementation handles low-precision (FP16, bfloat) values, but we have not thoroughly tested. + + Recommended T5 finetuning settings (https://discuss.huggingface.co/t/t5-finetuning-tips/684/3): + + - Training without LR warmup or clip_threshold is not recommended. + + - use scheduled LR warm-up to fixed LR + - use clip_threshold=1.0 (https://arxiv.org/abs/1804.04235) + - Disable relative updates + - Use scale_parameter=False + - Additional optimizer operations like gradient clipping should not be used alongside Adafactor + + Example: + + ```python + Adafactor(model.parameters(), scale_parameter=False, relative_step=False, warmup_init=False, lr=1e-3) + ``` + + Others reported the following combination to work well: + + ```python + Adafactor(model.parameters(), scale_parameter=True, relative_step=True, warmup_init=True, lr=None) + ``` + + When using `lr=None` with [`Trainer`] you will most likely need to use [`~optimization.AdafactorSchedule`] + scheduler as following: + + ```python + from transformers.optimization import Adafactor, AdafactorSchedule + + optimizer = Adafactor(model.parameters(), scale_parameter=True, relative_step=True, warmup_init=True, lr=None) + lr_scheduler = AdafactorSchedule(optimizer) + trainer = Trainer(..., optimizers=(optimizer, lr_scheduler)) + ``` + + Usage: + + ```python + # replace AdamW with Adafactor + optimizer = Adafactor( + model.parameters(), + lr=1e-3, + eps=(1e-30, 1e-3), + clip_threshold=1.0, + decay_rate=-0.8, + beta1=None, + weight_decay=0.0, + relative_step=False, + scale_parameter=False, + warmup_init=False, + ) + ```""" + + def __init__( + self, + params, + lr=None, + eps=(1e-30, 1e-3), + clip_threshold=1.0, + decay_rate=-0.8, + beta1=None, + weight_decay=0.0, + scale_parameter=True, + relative_step=True, + warmup_init=False, + ): + if lr is not None and relative_step: + raise ValueError("Cannot combine manual `lr` and `relative_step=True` options") + if warmup_init and not relative_step: + raise ValueError("`warmup_init=True` requires `relative_step=True`") + + defaults = { + "lr": lr, + "eps": eps, + "clip_threshold": clip_threshold, + "decay_rate": decay_rate, + "beta1": beta1, + "weight_decay": weight_decay, + "scale_parameter": scale_parameter, + "relative_step": relative_step, + "warmup_init": warmup_init, + } + super().__init__(params, defaults) + + @staticmethod + def _get_lr(param_group, param_state): + rel_step_sz = param_group["lr"] + if param_group["relative_step"]: + min_step = 1e-6 * param_state["step"] if param_group["warmup_init"] else 1e-2 + rel_step_sz = min(min_step, 1.0 / math.sqrt(param_state["step"])) + param_scale = 1.0 + if param_group["scale_parameter"]: + param_scale = max(param_group["eps"][1], param_state["RMS"]) + return param_scale * rel_step_sz + + @staticmethod + def _get_options(param_group, param_shape): + factored = len(param_shape) >= 2 + use_first_moment = param_group["beta1"] is not None + return factored, use_first_moment + + @staticmethod + def _rms(tensor): + return tensor.norm(2) / (tensor.numel() ** 0.5) + + @staticmethod + def _approx_sq_grad(exp_avg_sq_row, exp_avg_sq_col): + # copy from fairseq's adafactor implementation: + # https://github.com/huggingface/transformers/blob/8395f14de6068012787d83989c3627c3df6a252b/src/transformers/optimization.py#L505 + r_factor = (exp_avg_sq_row / exp_avg_sq_row.mean(dim=-1, keepdim=True)).rsqrt_().unsqueeze(-1) + c_factor = exp_avg_sq_col.unsqueeze(-2).rsqrt() + return torch.mul(r_factor, c_factor) + + @torch.no_grad() + def step(self, closure=None): + """ + Performs a single optimization step + + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + """ + loss = None + if closure is not None: + loss = closure() + + for group in self.param_groups: + for p in group["params"]: + if p.grad is None: + continue + grad = p.grad + if grad.dtype in {torch.float16, torch.bfloat16}: + grad = grad.float() + if grad.is_sparse: + raise RuntimeError("Adafactor does not support sparse gradients.") + + state = self.state[p] + grad_shape = grad.shape + + factored, use_first_moment = self._get_options(group, grad_shape) + # State Initialization + if len(state) == 0: + state["step"] = 0 + + if use_first_moment: + # Exponential moving average of gradient values + state["exp_avg"] = torch.zeros_like(grad) + if factored: + state["exp_avg_sq_row"] = torch.zeros(grad_shape[:-1]).to(grad) + state["exp_avg_sq_col"] = torch.zeros(grad_shape[:-2] + grad_shape[-1:]).to(grad) + else: + state["exp_avg_sq"] = torch.zeros_like(grad) + + state["RMS"] = 0 + else: + if use_first_moment: + state["exp_avg"] = state["exp_avg"].to(grad) + if factored: + state["exp_avg_sq_row"] = state["exp_avg_sq_row"].to(grad) + state["exp_avg_sq_col"] = state["exp_avg_sq_col"].to(grad) + else: + state["exp_avg_sq"] = state["exp_avg_sq"].to(grad) + + p_data_fp32 = p + if p.dtype in {torch.float16, torch.bfloat16}: + p_data_fp32 = p_data_fp32.float() + + state["step"] += 1 + state["RMS"] = self._rms(p_data_fp32) + lr = self._get_lr(group, state) + + beta2t = 1.0 - math.pow(state["step"], group["decay_rate"]) + update = (grad**2) + group["eps"][0] + if factored: + exp_avg_sq_row = state["exp_avg_sq_row"] + exp_avg_sq_col = state["exp_avg_sq_col"] + + exp_avg_sq_row.mul_(beta2t).add_(update.mean(dim=-1), alpha=(1.0 - beta2t)) + exp_avg_sq_col.mul_(beta2t).add_(update.mean(dim=-2), alpha=(1.0 - beta2t)) + + # Approximation of exponential moving average of square of gradient + update = self._approx_sq_grad(exp_avg_sq_row, exp_avg_sq_col) + update.mul_(grad) + else: + exp_avg_sq = state["exp_avg_sq"] + + exp_avg_sq.mul_(beta2t).add_(update, alpha=(1.0 - beta2t)) + update = exp_avg_sq.rsqrt().mul_(grad) + + update.div_((self._rms(update) / group["clip_threshold"]).clamp_(min=1.0)) + update.mul_(lr) + + if use_first_moment: + exp_avg = state["exp_avg"] + exp_avg.mul_(group["beta1"]).add_(update, alpha=(1 - group["beta1"])) + update = exp_avg + + if group["weight_decay"] != 0: + p_data_fp32.add_(p_data_fp32, alpha=(-group["weight_decay"] * lr)) + + p_data_fp32.add_(-update) + + if p.dtype in {torch.float16, torch.bfloat16}: + p.copy_(p_data_fp32) + + return loss + + +class AdafactorSchedule(LambdaLR): + """ + Since [`~optimization.Adafactor`] performs its own scheduling, if the training loop relies on a scheduler (e.g., + for logging), this class creates a proxy object that retrieves the current lr values from the optimizer. + + It returns `initial_lr` during startup and the actual `lr` during stepping. + """ + + def __init__(self, optimizer, initial_lr=0.0): + def lr_lambda(_): + return initial_lr + + for group in optimizer.param_groups: + group["initial_lr"] = initial_lr + super().__init__(optimizer, lr_lambda) + for group in optimizer.param_groups: + del group["initial_lr"] + + def get_lr(self): + opt = self.optimizer + lrs = [ + opt._get_lr(group, opt.state[group["params"][0]]) + for group in opt.param_groups + if group["params"][0].grad is not None + ] + if len(lrs) == 0: + lrs = self.base_lrs # if called before stepping + return lrs + + +def get_adafactor_schedule(optimizer, initial_lr=0.0): + """ + Get a proxy schedule for [`~optimization.Adafactor`] + + Args: + optimizer ([`~torch.optim.Optimizer`]): + The optimizer for which to schedule the learning rate. + initial_lr (`float`, *optional*, defaults to 0.0): + Initial lr + + Return: + [`~optimization.Adafactor`] proxy schedule object. + + + """ + return AdafactorSchedule(optimizer, initial_lr) diff --git a/docs/transformers/build/lib/transformers/optimization_tf.py b/docs/transformers/build/lib/transformers/optimization_tf.py new file mode 100644 index 0000000000000000000000000000000000000000..4da4ecc901914bc12a94c8ac3193309b85661865 --- /dev/null +++ b/docs/transformers/build/lib/transformers/optimization_tf.py @@ -0,0 +1,379 @@ +# Copyright 2019 The TensorFlow Authors, The Hugging Face Team. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Functions and classes related to optimization (weight updates).""" + +import re +from typing import Callable, Optional, Union + +import tensorflow as tf + + +try: + from tf_keras.optimizers.legacy import Adam +except (ImportError, ModuleNotFoundError): + from tensorflow.keras.optimizers.legacy import Adam + +from .modeling_tf_utils import keras + + +# This block because Keras loves randomly moving things to different places - this changed somewhere between 2.10 - 2.15 +if hasattr(keras.optimizers.schedules, "learning_rate_schedule"): + schedules = keras.optimizers.schedules.learning_rate_schedule +else: + schedules = keras.optimizers.schedules + + +class WarmUp(schedules.LearningRateSchedule): + """ + Applies a warmup schedule on a given learning rate decay schedule. + + Args: + initial_learning_rate (`float`): + The initial learning rate for the schedule after the warmup (so this will be the learning rate at the end + of the warmup). + decay_schedule_fn (`Callable`): + The schedule function to apply after the warmup for the rest of training. + warmup_steps (`int`): + The number of steps for the warmup part of training. + power (`float`, *optional*, defaults to 1.0): + The power to use for the polynomial warmup (defaults is a linear warmup). + name (`str`, *optional*): + Optional name prefix for the returned tensors during the schedule. + """ + + def __init__( + self, + initial_learning_rate: float, + decay_schedule_fn: Callable, + warmup_steps: int, + power: float = 1.0, + name: Optional[str] = None, + ): + super().__init__() + self.initial_learning_rate = initial_learning_rate + self.warmup_steps = warmup_steps + self.power = power + self.decay_schedule_fn = decay_schedule_fn + self.name = name + + def __call__(self, step): + with tf.name_scope(self.name or "WarmUp") as name: + # Implements polynomial warmup. i.e., if global_step < warmup_steps, the + # learning rate will be `global_step/num_warmup_steps * init_lr`. + global_step_float = tf.cast(step, tf.float32) + warmup_steps_float = tf.cast(self.warmup_steps, tf.float32) + warmup_percent_done = global_step_float / warmup_steps_float + warmup_learning_rate = self.initial_learning_rate * tf.math.pow(warmup_percent_done, self.power) + return tf.cond( + global_step_float < warmup_steps_float, + lambda: warmup_learning_rate, + lambda: self.decay_schedule_fn(step - self.warmup_steps), + name=name, + ) + + def get_config(self): + return { + "initial_learning_rate": self.initial_learning_rate, + "decay_schedule_fn": self.decay_schedule_fn, + "warmup_steps": self.warmup_steps, + "power": self.power, + "name": self.name, + } + + +def create_optimizer( + init_lr: float, + num_train_steps: int, + num_warmup_steps: int, + min_lr_ratio: float = 0.0, + adam_beta1: float = 0.9, + adam_beta2: float = 0.999, + adam_epsilon: float = 1e-8, + adam_clipnorm: Optional[float] = None, + adam_global_clipnorm: Optional[float] = None, + weight_decay_rate: float = 0.0, + power: float = 1.0, + include_in_weight_decay: Optional[list[str]] = None, +): + """ + Creates an optimizer with a learning rate schedule using a warmup phase followed by a linear decay. + + Args: + init_lr (`float`): + The desired learning rate at the end of the warmup phase. + num_train_steps (`int`): + The total number of training steps. + num_warmup_steps (`int`): + The number of warmup steps. + min_lr_ratio (`float`, *optional*, defaults to 0): + The final learning rate at the end of the linear decay will be `init_lr * min_lr_ratio`. + adam_beta1 (`float`, *optional*, defaults to 0.9): + The beta1 to use in Adam. + adam_beta2 (`float`, *optional*, defaults to 0.999): + The beta2 to use in Adam. + adam_epsilon (`float`, *optional*, defaults to 1e-8): + The epsilon to use in Adam. + adam_clipnorm (`float`, *optional*, defaults to `None`): + If not `None`, clip the gradient norm for each weight tensor to this value. + adam_global_clipnorm (`float`, *optional*, defaults to `None`) + If not `None`, clip gradient norm to this value. When using this argument, the norm is computed over all + weight tensors, as if they were concatenated into a single vector. + weight_decay_rate (`float`, *optional*, defaults to 0): + The weight decay to use. + power (`float`, *optional*, defaults to 1.0): + The power to use for PolynomialDecay. + include_in_weight_decay (`List[str]`, *optional*): + List of the parameter names (or re patterns) to apply weight decay to. If none is passed, weight decay is + applied to all parameters except bias and layer norm parameters. + """ + # Implements linear decay of the learning rate. + lr_schedule = schedules.PolynomialDecay( + initial_learning_rate=init_lr, + decay_steps=num_train_steps - num_warmup_steps, + end_learning_rate=init_lr * min_lr_ratio, + power=power, + ) + if num_warmup_steps: + lr_schedule = WarmUp( + initial_learning_rate=init_lr, + decay_schedule_fn=lr_schedule, + warmup_steps=num_warmup_steps, + ) + if weight_decay_rate > 0.0: + optimizer = AdamWeightDecay( + learning_rate=lr_schedule, + weight_decay_rate=weight_decay_rate, + beta_1=adam_beta1, + beta_2=adam_beta2, + epsilon=adam_epsilon, + clipnorm=adam_clipnorm, + global_clipnorm=adam_global_clipnorm, + exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"], + include_in_weight_decay=include_in_weight_decay, + ) + else: + optimizer = keras.optimizers.Adam( + learning_rate=lr_schedule, + beta_1=adam_beta1, + beta_2=adam_beta2, + epsilon=adam_epsilon, + clipnorm=adam_clipnorm, + global_clipnorm=adam_global_clipnorm, + ) + # We return the optimizer and the LR scheduler in order to better track the + # evolution of the LR independently of the optimizer. + return optimizer, lr_schedule + + +class AdamWeightDecay(Adam): + """ + Adam enables L2 weight decay and clip_by_global_norm on gradients. Just adding the square of the weights to the + loss function is *not* the correct way of using L2 regularization/weight decay with Adam, since that will interact + with the m and v parameters in strange ways as shown in [Decoupled Weight Decay + Regularization](https://arxiv.org/abs/1711.05101). + + Instead we want to decay the weights in a manner that doesn't interact with the m/v parameters. This is equivalent + to adding the square of the weights to the loss with plain (non-momentum) SGD. + + Args: + learning_rate (`Union[float, LearningRateSchedule]`, *optional*, defaults to 0.001): + The learning rate to use or a schedule. + beta_1 (`float`, *optional*, defaults to 0.9): + The beta1 parameter in Adam, which is the exponential decay rate for the 1st momentum estimates. + beta_2 (`float`, *optional*, defaults to 0.999): + The beta2 parameter in Adam, which is the exponential decay rate for the 2nd momentum estimates. + epsilon (`float`, *optional*, defaults to 1e-07): + The epsilon parameter in Adam, which is a small constant for numerical stability. + amsgrad (`bool`, *optional*, defaults to `False`): + Whether to apply AMSGrad variant of this algorithm or not, see [On the Convergence of Adam and + Beyond](https://arxiv.org/abs/1904.09237). + weight_decay_rate (`float`, *optional*, defaults to 0.0): + The weight decay to apply. + include_in_weight_decay (`List[str]`, *optional*): + List of the parameter names (or re patterns) to apply weight decay to. If none is passed, weight decay is + applied to all parameters by default (unless they are in `exclude_from_weight_decay`). + exclude_from_weight_decay (`List[str]`, *optional*): + List of the parameter names (or re patterns) to exclude from applying weight decay to. If a + `include_in_weight_decay` is passed, the names in it will supersede this list. + name (`str`, *optional*, defaults to `"AdamWeightDecay"`): + Optional name for the operations created when applying gradients. + kwargs (`Dict[str, Any]`, *optional*): + Keyword arguments. Allowed to be {`clipnorm`, `clipvalue`, `lr`, `decay`}. `clipnorm` is clip gradients by + norm; `clipvalue` is clip gradients by value, `decay` is included for backward compatibility to allow time + inverse decay of learning rate. `lr` is included for backward compatibility, recommended to use + `learning_rate` instead. + """ + + def __init__( + self, + learning_rate: Union[float, schedules.LearningRateSchedule] = 0.001, + beta_1: float = 0.9, + beta_2: float = 0.999, + epsilon: float = 1e-7, + amsgrad: bool = False, + weight_decay_rate: float = 0.0, + include_in_weight_decay: Optional[list[str]] = None, + exclude_from_weight_decay: Optional[list[str]] = None, + name: str = "AdamWeightDecay", + **kwargs, + ): + super().__init__(learning_rate, beta_1, beta_2, epsilon, amsgrad, name, **kwargs) + self.weight_decay_rate = weight_decay_rate + self._include_in_weight_decay = include_in_weight_decay + self._exclude_from_weight_decay = exclude_from_weight_decay + + @classmethod + def from_config(cls, config): + """Creates an optimizer from its config with WarmUp custom object.""" + custom_objects = {"WarmUp": WarmUp} + return super().from_config(config, custom_objects=custom_objects) + + def _prepare_local(self, var_device, var_dtype, apply_state): + super()._prepare_local(var_device, var_dtype, apply_state) + apply_state[(var_device, var_dtype)]["weight_decay_rate"] = tf.constant( + self.weight_decay_rate, name="adam_weight_decay_rate" + ) + + def _decay_weights_op(self, var, learning_rate, apply_state): + do_decay = self._do_use_weight_decay(var.name) + if do_decay: + return var.assign_sub( + learning_rate * var * apply_state[(var.device, var.dtype.base_dtype)]["weight_decay_rate"], + use_locking=self._use_locking, + ) + return tf.no_op() + + def apply_gradients(self, grads_and_vars, name=None, **kwargs): + grads, tvars = list(zip(*grads_and_vars)) + return super().apply_gradients(zip(grads, tvars), name=name, **kwargs) + + def _get_lr(self, var_device, var_dtype, apply_state): + """Retrieves the learning rate with the given state.""" + if apply_state is None: + return self._decayed_lr_t[var_dtype], {} + + apply_state = apply_state or {} + coefficients = apply_state.get((var_device, var_dtype)) + if coefficients is None: + coefficients = self._fallback_apply_state(var_device, var_dtype) + apply_state[(var_device, var_dtype)] = coefficients + + return coefficients["lr_t"], {"apply_state": apply_state} + + def _resource_apply_dense(self, grad, var, apply_state=None): + lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state) + decay = self._decay_weights_op(var, lr_t, apply_state) + with tf.control_dependencies([decay]): + return super()._resource_apply_dense(grad, var, **kwargs) + + def _resource_apply_sparse(self, grad, var, indices, apply_state=None): + lr_t, kwargs = self._get_lr(var.device, var.dtype.base_dtype, apply_state) + decay = self._decay_weights_op(var, lr_t, apply_state) + with tf.control_dependencies([decay]): + return super()._resource_apply_sparse(grad, var, indices, **kwargs) + + def get_config(self): + config = super().get_config() + config.update({"weight_decay_rate": self.weight_decay_rate}) + return config + + def _do_use_weight_decay(self, param_name): + """Whether to use L2 weight decay for `param_name`.""" + if self.weight_decay_rate == 0: + return False + + if self._include_in_weight_decay: + for r in self._include_in_weight_decay: + if re.search(r, param_name) is not None: + return True + + if self._exclude_from_weight_decay: + for r in self._exclude_from_weight_decay: + if re.search(r, param_name) is not None: + return False + return True + + +# Extracted from https://github.com/OpenNMT/OpenNMT-tf/blob/master/opennmt/optimizers/utils.py +class GradientAccumulator: + """ + Gradient accumulation utility. When used with a distribution strategy, the accumulator should be called in a + replica context. Gradients will be accumulated locally on each replica and without synchronization. Users should + then call `.gradients`, scale the gradients if required, and pass the result to `apply_gradients`. + """ + + # We use the ON_READ synchronization policy so that no synchronization is + # performed on assignment. To get the value, we call .value() which returns the + # value on the current replica without synchronization. + + def __init__(self): + """Initializes the accumulator.""" + self._gradients = [] + self._accum_steps = None + + @property + def step(self): + """Number of accumulated steps.""" + if self._accum_steps is None: + self._accum_steps = tf.Variable( + tf.constant(0, dtype=tf.int64), + trainable=False, + synchronization=tf.VariableSynchronization.ON_READ, + aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA, + ) + + return self._accum_steps.value() + + @property + def gradients(self): + """The accumulated gradients on the current replica.""" + if not self._gradients: + raise ValueError("The accumulator should be called first to initialize the gradients") + return [gradient.value() if gradient is not None else gradient for gradient in self._gradients] + + def __call__(self, gradients): + """Accumulates `gradients` on the current replica.""" + if not self._gradients: + _ = self.step # Create the step variable. + self._gradients.extend( + [ + tf.Variable( + tf.zeros_like(gradient), + trainable=False, + synchronization=tf.VariableSynchronization.ON_READ, + aggregation=tf.VariableAggregation.ONLY_FIRST_REPLICA, + ) + if gradient is not None + else gradient + for gradient in gradients + ] + ) + if len(gradients) != len(self._gradients): + raise ValueError(f"Expected {len(self._gradients)} gradients, but got {len(gradients)}") + + for accum_gradient, gradient in zip(self._gradients, gradients): + if accum_gradient is not None and gradient is not None: + accum_gradient.assign_add(gradient) + + self._accum_steps.assign_add(1) + + def reset(self): + """Resets the accumulated gradients on the current replica.""" + if not self._gradients: + return + self._accum_steps.assign(0) + for gradient in self._gradients: + if gradient is not None: + gradient.assign(tf.zeros_like(gradient)) diff --git a/docs/transformers/build/lib/transformers/processing_utils.py b/docs/transformers/build/lib/transformers/processing_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..dc6bd4bf1428dfb0bdbaa2f78e27e1c3283c0f7c --- /dev/null +++ b/docs/transformers/build/lib/transformers/processing_utils.py @@ -0,0 +1,1699 @@ +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Processing saving/loading class for common processors. +""" + +import copy +import inspect +import json +import os +import sys +import typing +import warnings +from pathlib import Path +from typing import Any, Dict, List, Optional, TypedDict, Union + +import numpy as np +import typing_extensions +from huggingface_hub.errors import EntryNotFoundError + +from .audio_utils import load_audio +from .dynamic_module_utils import custom_object_save +from .feature_extraction_utils import BatchFeature +from .image_utils import ( + ChannelDimension, + ImageInput, + VideoInput, + is_valid_image, + is_vision_available, + load_image, + load_video, +) + + +if is_vision_available(): + from .image_utils import PILImageResampling + +from .tokenization_utils_base import ( + PaddingStrategy, + PreTokenizedInput, + PreTrainedTokenizerBase, + TextInput, + TruncationStrategy, +) +from .utils import ( + CHAT_TEMPLATE_DIR, + CHAT_TEMPLATE_FILE, + LEGACY_PROCESSOR_CHAT_TEMPLATE_FILE, + PROCESSOR_NAME, + PushToHubMixin, + TensorType, + add_model_info_to_auto_map, + add_model_info_to_custom_pipelines, + cached_file, + copy_func, + direct_transformers_import, + download_url, + is_offline_mode, + is_remote_url, + list_repo_templates, + logging, +) + + +logger = logging.get_logger(__name__) + +# Dynamically import the Transformers module to grab the attribute classes of the processor from their names. +transformers_module = direct_transformers_import(Path(__file__).parent) + + +AUTO_TO_BASE_CLASS_MAPPING = { + "AutoTokenizer": "PreTrainedTokenizerBase", + "AutoFeatureExtractor": "FeatureExtractionMixin", + "AutoImageProcessor": "ImageProcessingMixin", +} + +if sys.version_info >= (3, 11): + Unpack = typing.Unpack +else: + Unpack = typing_extensions.Unpack + + +class TextKwargs(TypedDict, total=False): + """ + Keyword arguments for text processing. For extended documentation, check out tokenization_utils_base methods and + docstrings associated. + + Attributes: + add_special_tokens (`bool`, *optional*) + Whether or not to add special tokens when encoding the sequences. + padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*) + Activates and controls padding. + truncation (`bool`, `str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*): + Activates and controls truncation. + max_length (`int`, *optional*): + Controls the maximum length to use by one of the truncation/padding parameters. + stride (`int`, *optional*): + If set, the overflowing tokens will contain some tokens from the end of the truncated sequence. + is_split_into_words (`bool`, *optional*): + Whether or not the input is already pre-tokenized. + pad_to_multiple_of (`int`, *optional*): + If set, will pad the sequence to a multiple of the provided value. + return_token_type_ids (`bool`, *optional*): + Whether to return token type IDs. + return_attention_mask (`bool`, *optional*): + Whether to return the attention mask. + return_overflowing_tokens (`bool`, *optional*): + Whether or not to return overflowing token sequences. + return_special_tokens_mask (`bool`, *optional*): + Whether or not to return special tokens mask information. + return_offsets_mapping (`bool`, *optional*): + Whether or not to return `(char_start, char_end)` for each token. + return_length (`bool`, *optional*): + Whether or not to return the lengths of the encoded inputs. + verbose (`bool`, *optional*): + Whether or not to print more information and warnings. + padding_side (`str`, *optional*): + The side on which padding will be applied. + """ + + text_pair: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] + text_target: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] + text_pair_target: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] + add_special_tokens: Optional[bool] + padding: Union[bool, str, PaddingStrategy] + truncation: Union[bool, str, TruncationStrategy] + max_length: Optional[int] + stride: Optional[int] + is_split_into_words: Optional[bool] + pad_to_multiple_of: Optional[int] + return_token_type_ids: Optional[bool] + return_attention_mask: Optional[bool] + return_overflowing_tokens: Optional[bool] + return_special_tokens_mask: Optional[bool] + return_offsets_mapping: Optional[bool] + return_length: Optional[bool] + verbose: Optional[bool] + padding_side: Optional[str] + + +class ImagesKwargs(TypedDict, total=False): + """ + Keyword arguments for image processing. For extended documentation, check the appropriate ImageProcessor + class methods and docstrings. + + Attributes: + do_resize (`bool`, *optional*): + Whether to resize the image. + size (`Dict[str, int]`, *optional*): + Resize the shorter side of the input to `size["shortest_edge"]`. + size_divisor (`int`, *optional*): + The size by which to make sure both the height and width can be divided. + crop_size (`Dict[str, int]`, *optional*): + Desired output size when applying center-cropping. + resample (`PILImageResampling`, *optional*): + Resampling filter to use if resizing the image. + do_rescale (`bool`, *optional*): + Whether to rescale the image by the specified scale `rescale_factor`. + rescale_factor (`int` or `float`, *optional*): + Scale factor to use if rescaling the image. + do_normalize (`bool`, *optional*): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*): + Mean to use if normalizing the image. + image_std (`float` or `List[float]`, *optional*): + Standard deviation to use if normalizing the image. + do_pad (`bool`, *optional*): + Whether to pad the image to the `(max_height, max_width)` of the images in the batch. + pad_size (`Dict[str, int]`, *optional*): + The size `{"height": int, "width" int}` to pad the images to. + do_center_crop (`bool`, *optional*): + Whether to center crop the image. + data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the output image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. + device (`str`, *optional*): + The device to use for processing (e.g. "cpu", "cuda"), only relevant for fast image processing. + """ + + do_resize: Optional[bool] + size: Optional[dict[str, int]] + size_divisor: Optional[int] + crop_size: Optional[dict[str, int]] + resample: Optional[Union["PILImageResampling", int]] + do_rescale: Optional[bool] + rescale_factor: Optional[float] + do_normalize: Optional[bool] + image_mean: Optional[Union[float, list[float]]] + image_std: Optional[Union[float, list[float]]] + do_pad: Optional[bool] + pad_size: Optional[dict[str, int]] + do_center_crop: Optional[bool] + data_format: Optional[ChannelDimension] + input_data_format: Optional[Union[str, ChannelDimension]] + device: Optional[str] + + +class VideosKwargs(TypedDict, total=False): + """ + Keyword arguments for video processing. + + Attributes: + do_resize (`bool`): + Whether to resize the image. + size (`Dict[str, int]`, *optional*): + Resize the shorter side of the input to `size["shortest_edge"]`. + size_divisor (`int`, *optional*): + The size by which to make sure both the height and width can be divided. + resample (`PILImageResampling`, *optional*): + Resampling filter to use if resizing the image. + do_rescale (`bool`, *optional*): + Whether to rescale the image by the specified scale `rescale_factor`. + rescale_factor (`int` or `float`, *optional*): + Scale factor to use if rescaling the image. + do_normalize (`bool`, *optional*): + Whether to normalize the image. + image_mean (`float` or `List[float]`, *optional*): + Mean to use if normalizing the image. + image_std (`float` or `List[float]`, *optional*): + Standard deviation to use if normalizing the image. + do_pad (`bool`, *optional*): + Whether to pad the image to the `(max_height, max_width)` of the images in the batch. + do_center_crop (`bool`, *optional*): + Whether to center crop the image. + data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the output image. + input_data_format (`ChannelDimension` or `str`, *optional*): + The channel dimension format for the input image. + """ + + do_resize: Optional[bool] + size: Optional[dict[str, int]] + size_divisor: Optional[int] + resample: Optional["PILImageResampling"] + do_rescale: Optional[bool] + rescale_factor: Optional[float] + do_normalize: Optional[bool] + image_mean: Optional[Union[float, list[float]]] + image_std: Optional[Union[float, list[float]]] + do_pad: Optional[bool] + do_center_crop: Optional[bool] + data_format: Optional[ChannelDimension] + input_data_format: Optional[Union[str, ChannelDimension]] + + +class AudioKwargs(TypedDict, total=False): + """ + Keyword arguments for audio processing. + + Attributes: + sampling_rate (`int`, *optional*): + The sampling rate at which the `raw_speech` input was sampled. + raw_speech (`np.ndarray`, `List[float]`, `List[np.ndarray]`, `List[List[float]]`): + The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float + values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not + stereo, i.e. single float per timestep. + padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*): + Select a strategy to pad the returned sequences (according to the model's padding side and padding + index) among: + + - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single + sequence if provided). + - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum + acceptable input length for the model if that argument is not provided. + - `False` or `'do_not_pad'` + max_length (`int`, *optional*): + Maximum length of the returned list and optionally padding length (see above). + truncation (`bool`, *optional*): + Activates truncation to cut input sequences longer than *max_length* to *max_length*. + pad_to_multiple_of (`int`, *optional*): + If set, will pad the sequence to a multiple of the provided value. + return_attention_mask (`bool`, *optional*): + Whether or not [`~ASTFeatureExtractor.__call__`] should return `attention_mask`. + """ + + sampling_rate: Optional[int] + raw_speech: Optional[Union["np.ndarray", list[float], list["np.ndarray"], list[list[float]]]] + padding: Optional[Union[bool, str, PaddingStrategy]] + max_length: Optional[int] + truncation: Optional[bool] + pad_to_multiple_of: Optional[int] + return_attention_mask: Optional[bool] + + +class CommonKwargs(TypedDict, total=False): + return_tensors: Optional[Union[str, TensorType]] + + +class ProcessingKwargs(TextKwargs, ImagesKwargs, VideosKwargs, AudioKwargs, CommonKwargs, total=False): + """ + Base class for kwargs passing to processors. + A model should have its own `ModelProcessorKwargs` class that inherits from `ProcessingKwargs` to provide: + 1) Additional typed keys and that this model requires to process inputs. + 2) Default values for existing keys under a `_defaults` attribute. + New keys have to be defined as follows to ensure type hinting is done correctly. + + ```python + # adding a new image kwarg for this model + class ModelImagesKwargs(ImagesKwargs, total=False): + new_image_kwarg: Optional[bool] + + class ModelProcessorKwargs(ProcessingKwargs, total=False): + images_kwargs: ModelImagesKwargs + _defaults = { + "images_kwargs: { + "new_image_kwarg": False, + } + "text_kwargs": { + "padding": "max_length", + }, + } + + ``` + + For Python 3.8 compatibility, when inheriting from this class and overriding one of the kwargs, + you need to manually update the __annotations__ dictionary. This can be done as follows: + + ```python + class CustomProcessorKwargs(ProcessingKwargs, total=False): + images_kwargs: CustomImagesKwargs + + CustomProcessorKwargs.__annotations__["images_kwargs"] = CustomImagesKwargs # python 3.8 compatibility + ```python + + """ + + common_kwargs: CommonKwargs = { + **CommonKwargs.__annotations__, + } + text_kwargs: TextKwargs = { + **TextKwargs.__annotations__, + } + images_kwargs: ImagesKwargs = { + **ImagesKwargs.__annotations__, + } + videos_kwargs: VideosKwargs = { + **VideosKwargs.__annotations__, + } + audio_kwargs: AudioKwargs = { + **AudioKwargs.__annotations__, + } + + +class TokenizerChatTemplateKwargs(TypedDict, total=False): + """ + Keyword arguments for tokenizer's `apply_chat_template`, when it is called from within a processor. + + tools (`List[Dict]`, *optional*): + A list of tools (callable functions) that will be accessible to the model. If the template does not + support function calling, this argument will have no effect. Each tool should be passed as a JSON Schema, + giving the name, description and argument types for the tool. See our + [chat templating guide](https://huggingface.co/docs/transformers/main/en/chat_templating#automated-function-conversion-for-tool-use) + for more information. + documents (`List[Dict[str, str]]`, *optional*): + A list of dicts representing documents that will be accessible to the model if it is performing RAG + (retrieval-augmented generation). If the template does not support RAG, this argument will have no + effect. We recommend that each document should be a dict containing "title" and "text" keys. Please + see the RAG section of the [chat templating guide](https://huggingface.co/docs/transformers/main/en/chat_templating#arguments-for-RAG) + for examples of passing documents with chat templates. + add_generation_prompt (bool, *optional*): + If this is set, a prompt with the token(s) that indicate + the start of an assistant message will be appended to the formatted output. This is useful when you want to generate a response from the model. + Note that this argument will be passed to the chat template, and so it must be supported in the + template for this argument to have any effect. + continue_final_message (bool, *optional*): + If this is set, the chat will be formatted so that the final + message in the chat is open-ended, without any EOS tokens. The model will continue this message + rather than starting a new one. This allows you to "prefill" part of + the model's response for it. Cannot be used at the same time as `add_generation_prompt`. + return_assistant_tokens_mask (`bool`, defaults to `False`): + Whether to return a mask of the assistant generated tokens. For tokens generated by the assistant, + the mask will contain 1. For user and system tokens, the mask will contain 0. + This functionality is only available for chat templates that support it via the `{% generation %}` keyword. + """ + + tools: Optional[list[dict]] = None + documents: Optional[list[dict[str, str]]] = None + add_generation_prompt: Optional[bool] = False + continue_final_message: Optional[bool] = False + return_assistant_tokens_mask: Optional[bool] = False + + +class ChatTemplateLoadKwargs(TypedDict, total=False): + """ + Keyword arguments used to load multimodal data in processor chat templates. + + num_frames (`int`, *optional*): + Number of frames to sample uniformly. If not passed, the whole video is loaded. + video_load_backend (`str`, *optional*, defaults to `"pyav"`): + The backend to use when loading the video which will be used only when there are videos in the conversation. + Can be any of ["decord", "pyav", "opencv", "torchvision"]. Defaults to "pyav" because it is the only backend + that supports all types of sources to load from. + video_fps (`int`, *optional*): + Number of frames to sample per second. Should be passed only when `num_frames=None`. + If not specified and `num_frames==None`, all frames are sampled. + sample_indices_fn (`Callable`, *optional*): + A callable function that will return indices at which the video should be sampled. If the video has to be loaded using + by a different sampling technique than provided by `num_frames` or `fps` arguments, one should provide their own `sample_indices_fn`. + If not provided, simple uniformt sampling with fps is performed, otherwise `sample_indices_fn` has priority over other args. + The function expects at input the all args along with all kwargs passed to `load_video` and should output valid + indices at which the video should be sampled. For example: + + def sample_indices_fn(num_frames, fps, metadata, **kwargs): + # add you sampling logic here ... + return np.linspace(start_idx, end_idx, num_frames, dtype=int) + """ + + num_frames: Optional[int] = None + video_load_backend: Optional[str] = "pyav" + video_fps: Optional[int] = None + sampling_rate: Optional[int] = 16_000 + load_audio_from_video: Optional[bool] = False + + +class ProcessorChatTemplateKwargs(ChatTemplateLoadKwargs, TokenizerChatTemplateKwargs, total=False): + """ + Keyword arguments for processor's `apply_chat_template`. + + tokenize (`bool`, *optional*, defaults to `False`): + Whether to tokenize the output or not. + return_dict (`bool`, defaults to `False`): + Whether to return a dictionary with named outputs. Has no effect if tokenize is `False`. + """ + + tokenize: Optional[bool] = False + return_dict: Optional[bool] = False + + +class AllKwargsForChatTemplate( + TextKwargs, ImagesKwargs, VideosKwargs, AudioKwargs, CommonKwargs, ProcessorChatTemplateKwargs +): + processor_kwargs: ProcessingKwargs = { + **ProcessingKwargs.__annotations__, + } + mm_load_kwargs: ChatTemplateLoadKwargs = { + **TextKwargs.__annotations__, + } + template_kwargs: ProcessorChatTemplateKwargs = { + **ProcessorChatTemplateKwargs.__annotations__, + } + + +class ProcessorMixin(PushToHubMixin): + """ + This is a mixin used to provide saving/loading functionality for all processor classes. + """ + + attributes = ["feature_extractor", "tokenizer"] + optional_attributes = ["chat_template"] + optional_call_args: list[str] = [] + # Names need to be attr_class for attr in attributes + feature_extractor_class = None + tokenizer_class = None + _auto_class = None + valid_kwargs: list[str] = [] + + # args have to match the attributes class attribute + def __init__(self, *args, **kwargs): + # First, extract optional attributes from kwargs if present + # Optional attributes can never be positional arguments + for optional_attribute in self.optional_attributes: + setattr(self, optional_attribute, kwargs.pop(optional_attribute, None)) + # Sanitize args and kwargs + for key in kwargs: + if key not in self.attributes: + raise TypeError(f"Unexpected keyword argument {key}.") + for arg, attribute_name in zip(args, self.attributes): + if attribute_name in kwargs: + raise TypeError(f"Got multiple values for argument {attribute_name}.") + else: + kwargs[attribute_name] = arg + + if len(kwargs) != len(self.attributes): + raise ValueError( + f"This processor requires {len(self.attributes)} arguments: {', '.join(self.attributes)}. Got " + f"{len(args)} arguments instead." + ) + + # Check each arg is of the proper class (this will also catch a user initializing in the wrong order) + for attribute_name, arg in kwargs.items(): + class_name = getattr(self, f"{attribute_name}_class") + # Nothing is ever going to be an instance of "AutoXxx", in that case we check the base class. + class_name = AUTO_TO_BASE_CLASS_MAPPING.get(class_name, class_name) + if isinstance(class_name, tuple): + proper_class = tuple(self.get_possibly_dynamic_module(n) for n in class_name if n is not None) + else: + proper_class = self.get_possibly_dynamic_module(class_name) + + if not isinstance(arg, proper_class): + raise TypeError( + f"Received a {type(arg).__name__} for argument {attribute_name}, but a {class_name} was expected." + ) + + setattr(self, attribute_name, arg) + + def to_dict(self) -> dict[str, Any]: + """ + Serializes this instance to a Python dictionary. + + Returns: + `Dict[str, Any]`: Dictionary of all the attributes that make up this processor instance. + """ + output = copy.deepcopy(self.__dict__) + + # Get the kwargs in `__init__`. + sig = inspect.signature(self.__init__) + # Only save the attributes that are presented in the kwargs of `__init__`. + attrs_to_save = sig.parameters + # Don't save attributes like `tokenizer`, `image processor` etc. + attrs_to_save = [x for x in attrs_to_save if x not in self.__class__.attributes] + # extra attributes to be kept + attrs_to_save += ["auto_map"] + + output = {k: v for k, v in output.items() if k in attrs_to_save} + + output["processor_class"] = self.__class__.__name__ + + if "tokenizer" in output: + del output["tokenizer"] + if "image_processor" in output: + del output["image_processor"] + if "feature_extractor" in output: + del output["feature_extractor"] + if "chat_template" in output: + del output["chat_template"] + + # Some attributes have different names but containing objects that are not simple strings + output = { + k: v + for k, v in output.items() + if not (isinstance(v, PushToHubMixin) or v.__class__.__name__ == "BeamSearchDecoderCTC") + } + + return output + + def to_json_string(self) -> str: + """ + Serializes this instance to a JSON string. + + Returns: + `str`: String containing all the attributes that make up this feature_extractor instance in JSON format. + """ + dictionary = self.to_dict() + + return json.dumps(dictionary, indent=2, sort_keys=True) + "\n" + + def to_json_file(self, json_file_path: Union[str, os.PathLike]): + """ + Save this instance to a JSON file. + + Args: + json_file_path (`str` or `os.PathLike`): + Path to the JSON file in which this processor instance's parameters will be saved. + """ + with open(json_file_path, "w", encoding="utf-8") as writer: + writer.write(self.to_json_string()) + + def __repr__(self): + attributes_repr = [f"- {name}: {repr(getattr(self, name))}" for name in self.attributes] + attributes_repr = "\n".join(attributes_repr) + return f"{self.__class__.__name__}:\n{attributes_repr}\n\n{self.to_json_string()}" + + def save_pretrained(self, save_directory, push_to_hub: bool = False, **kwargs): + """ + Saves the attributes of this processor (feature extractor, tokenizer...) in the specified directory so that it + can be reloaded using the [`~ProcessorMixin.from_pretrained`] method. + + + + This class method is simply calling [`~feature_extraction_utils.FeatureExtractionMixin.save_pretrained`] and + [`~tokenization_utils_base.PreTrainedTokenizerBase.save_pretrained`]. Please refer to the docstrings of the + methods above for more information. + + + + Args: + save_directory (`str` or `os.PathLike`): + Directory where the feature extractor JSON file and the tokenizer files will be saved (directory will + be created if it does not exist). + push_to_hub (`bool`, *optional*, defaults to `False`): + Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the + repository you want to push to with `repo_id` (will default to the name of `save_directory` in your + namespace). + kwargs (`Dict[str, Any]`, *optional*): + Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method. + """ + use_auth_token = kwargs.pop("use_auth_token", None) + + if use_auth_token is not None: + warnings.warn( + "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.", + FutureWarning, + ) + if kwargs.get("token", None) is not None: + raise ValueError( + "`token` and `use_auth_token` are both specified. Please set only the argument `token`." + ) + kwargs["token"] = use_auth_token + + os.makedirs(save_directory, exist_ok=True) + + if push_to_hub: + commit_message = kwargs.pop("commit_message", None) + repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1]) + repo_id = self._create_repo(repo_id, **kwargs) + files_timestamps = self._get_files_timestamps(save_directory) + # If we have a custom config, we copy the file defining it in the folder and set the attributes so it can be + # loaded from the Hub. + if self._auto_class is not None: + attrs = [getattr(self, attribute_name) for attribute_name in self.attributes] + configs = [(a.init_kwargs if isinstance(a, PreTrainedTokenizerBase) else a) for a in attrs] + configs.append(self) + custom_object_save(self, save_directory, config=configs) + + save_jinja_files = kwargs.get("save_jinja_files", True) + + for attribute_name in self.attributes: + attribute = getattr(self, attribute_name) + # Include the processor class in the attribute config so this processor can then be reloaded with the + # `AutoProcessor` API. + if hasattr(attribute, "_set_processor_class"): + attribute._set_processor_class(self.__class__.__name__) + if attribute_name == "tokenizer": + # Propagate save_jinja_files to tokenizer to ensure we don't get conflicts + attribute.save_pretrained(save_directory, save_jinja_files=save_jinja_files) + else: + attribute.save_pretrained(save_directory) + + if self._auto_class is not None: + # We added an attribute to the init_kwargs of the tokenizers, which needs to be cleaned up. + for attribute_name in self.attributes: + attribute = getattr(self, attribute_name) + if isinstance(attribute, PreTrainedTokenizerBase): + del attribute.init_kwargs["auto_map"] + + # If we save using the predefined names, we can load using `from_pretrained` + # plus we save chat_template in its own file + output_processor_file = os.path.join(save_directory, PROCESSOR_NAME) + output_chat_template_file_jinja = os.path.join(save_directory, CHAT_TEMPLATE_FILE) + output_chat_template_file_legacy = os.path.join( + save_directory, LEGACY_PROCESSOR_CHAT_TEMPLATE_FILE + ) # Legacy filename + chat_template_dir = os.path.join(save_directory, CHAT_TEMPLATE_DIR) + + processor_dict = self.to_dict() + # Save `chat_template` in its own file. We can't get it from `processor_dict` as we popped it in `to_dict` + # to avoid serializing chat template in json config file. So let's get it from `self` directly + if self.chat_template is not None: + save_jinja_files = kwargs.get("save_jinja_files", True) + is_single_template = isinstance(self.chat_template, str) + if save_jinja_files and is_single_template: + # New format for single templates is to save them as chat_template.jinja + with open(output_chat_template_file_jinja, "w", encoding="utf-8") as f: + f.write(self.chat_template) + logger.info(f"chat template saved in {output_chat_template_file_jinja}") + elif save_jinja_files and not is_single_template: + # New format for multiple templates is to save the default as chat_template.jinja + # and the other templates in the chat_templates/ directory + for template_name, template in self.chat_template.items(): + if template_name == "default": + with open(output_chat_template_file_jinja, "w", encoding="utf-8") as f: + f.write(self.chat_template["default"]) + logger.info(f"chat template saved in {output_chat_template_file_jinja}") + else: + os.makedirs(chat_template_dir, exist_ok=True) + template_filepath = os.path.join(chat_template_dir, f"{template_name}.jinja") + with open(template_filepath, "w", encoding="utf-8") as f: + f.write(template) + logger.info(f"chat template saved in {template_filepath}") + elif is_single_template: + # Legacy format for single templates: Put them in chat_template.json + chat_template_json_string = ( + json.dumps({"chat_template": self.chat_template}, indent=2, sort_keys=True) + "\n" + ) + with open(output_chat_template_file_legacy, "w", encoding="utf-8") as writer: + writer.write(chat_template_json_string) + logger.info(f"chat template saved in {output_chat_template_file_legacy}") + elif self.chat_template is not None: + # At this point we have multiple templates in the legacy format, which is not supported + # chat template dicts are saved to chat_template.json as lists of dicts with fixed key names. + raise ValueError( + "Multiple chat templates are not supported in the legacy format. Please save them as " + "separate files using the `save_jinja_files` argument." + ) + + # For now, let's not save to `processor_config.json` if the processor doesn't have extra attributes and + # `auto_map` is not specified. + if set(processor_dict.keys()) != {"processor_class"}: + self.to_json_file(output_processor_file) + logger.info(f"processor saved in {output_processor_file}") + + if push_to_hub: + self._upload_modified_files( + save_directory, + repo_id, + files_timestamps, + commit_message=commit_message, + token=kwargs.get("token"), + ) + + if set(processor_dict.keys()) == {"processor_class"}: + return [] + return [output_processor_file] + + @classmethod + def get_processor_dict( + cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs + ) -> tuple[dict[str, Any], dict[str, Any]]: + """ + From a `pretrained_model_name_or_path`, resolve to a dictionary of parameters, to be used for instantiating a + processor of type [`~processing_utils.ProcessingMixin`] using `from_args_and_dict`. + + Parameters: + pretrained_model_name_or_path (`str` or `os.PathLike`): + The identifier of the pre-trained checkpoint from which we want the dictionary of parameters. + subfolder (`str`, *optional*, defaults to `""`): + In case the relevant files are located inside a subfolder of the model repo on huggingface.co, you can + specify the folder name here. + + Returns: + `Tuple[Dict, Dict]`: The dictionary(ies) that will be used to instantiate the processor object. + """ + cache_dir = kwargs.pop("cache_dir", None) + force_download = kwargs.pop("force_download", False) + resume_download = kwargs.pop("resume_download", None) + proxies = kwargs.pop("proxies", None) + token = kwargs.pop("token", None) + local_files_only = kwargs.pop("local_files_only", False) + revision = kwargs.pop("revision", None) + subfolder = kwargs.pop("subfolder", "") + + from_pipeline = kwargs.pop("_from_pipeline", None) + from_auto_class = kwargs.pop("_from_auto", False) + + user_agent = {"file_type": "processor", "from_auto_class": from_auto_class} + if from_pipeline is not None: + user_agent["using_pipeline"] = from_pipeline + + if is_offline_mode() and not local_files_only: + logger.info("Offline mode: forcing local_files_only=True") + local_files_only = True + + pretrained_model_name_or_path = str(pretrained_model_name_or_path) + is_local = os.path.isdir(pretrained_model_name_or_path) + if os.path.isdir(pretrained_model_name_or_path): + processor_file = os.path.join(pretrained_model_name_or_path, PROCESSOR_NAME) + + additional_chat_template_files = {} + resolved_additional_chat_template_files = {} + if os.path.isfile(pretrained_model_name_or_path): + resolved_processor_file = pretrained_model_name_or_path + # cant't load chat-template when given a file as pretrained_model_name_or_path + resolved_chat_template_file = None + resolved_raw_chat_template_file = None + is_local = True + elif is_remote_url(pretrained_model_name_or_path): + processor_file = pretrained_model_name_or_path + resolved_processor_file = download_url(pretrained_model_name_or_path) + # can't load chat-template when given a file url as pretrained_model_name_or_path + resolved_chat_template_file = None + resolved_raw_chat_template_file = None + else: + if is_local: + template_dir = Path(pretrained_model_name_or_path, CHAT_TEMPLATE_DIR) + if template_dir.is_dir(): + for template_file in template_dir.glob("*.jinja"): + template_name = template_file.stem + additional_chat_template_files[template_name] = f"{CHAT_TEMPLATE_DIR}/{template_file.name}" + else: + try: + for template in list_repo_templates( + pretrained_model_name_or_path, + local_files_only=local_files_only, + revision=revision, + cache_dir=cache_dir, + ): + additional_chat_template_files[template] = f"{CHAT_TEMPLATE_DIR}/{template}.jinja" + except EntryNotFoundError: + pass # No template dir means no template files + processor_file = PROCESSOR_NAME + + try: + # Load from local folder or from cache or download from model Hub and cache + resolved_processor_file = cached_file( + pretrained_model_name_or_path, + processor_file, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + resume_download=resume_download, + local_files_only=local_files_only, + token=token, + user_agent=user_agent, + revision=revision, + subfolder=subfolder, + _raise_exceptions_for_missing_entries=False, + ) + + # chat_template.json is a legacy file used by the processor class + # a raw chat_template.jinja is preferred in future + resolved_chat_template_file = cached_file( + pretrained_model_name_or_path, + LEGACY_PROCESSOR_CHAT_TEMPLATE_FILE, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + resume_download=resume_download, + local_files_only=local_files_only, + token=token, + user_agent=user_agent, + revision=revision, + subfolder=subfolder, + _raise_exceptions_for_missing_entries=False, + ) + + resolved_raw_chat_template_file = cached_file( + pretrained_model_name_or_path, + CHAT_TEMPLATE_FILE, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + resume_download=resume_download, + local_files_only=local_files_only, + token=token, + user_agent=user_agent, + revision=revision, + subfolder=subfolder, + _raise_exceptions_for_missing_entries=False, + ) + + resolved_additional_chat_template_files = { + template_name: cached_file( + pretrained_model_name_or_path, + template_file, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + resume_download=resume_download, + local_files_only=local_files_only, + token=token, + user_agent=user_agent, + revision=revision, + subfolder=subfolder, + _raise_exceptions_for_missing_entries=False, + ) + for template_name, template_file in additional_chat_template_files.items() + } + except OSError: + # Raise any environment error raise by `cached_file`. It will have a helpful error message adapted to + # the original exception. + raise + except Exception: + # For any other exception, we throw a generic error. + raise OSError( + f"Can't load processor for '{pretrained_model_name_or_path}'. If you were trying to load" + " it from 'https://huggingface.co/models', make sure you don't have a local directory with the" + f" same name. Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a" + f" directory containing a {PROCESSOR_NAME} file" + ) + + # Add chat template as kwarg before returning because most models don't have processor config + if resolved_chat_template_file is not None: + # This is the legacy path + with open(resolved_chat_template_file, encoding="utf-8") as reader: + chat_template_json = json.loads(reader.read()) + chat_templates = {"default": chat_template_json["chat_template"]} + if resolved_additional_chat_template_files: + raise ValueError( + "Cannot load chat template due to conflicting files - this checkpoint combines " + "a legacy chat_template.json file with separate template files, which is not " + "supported. To resolve this error, replace the legacy chat_template.json file " + "with a modern chat_template.jinja file." + ) + else: + chat_templates = { + template_name: open(template_file, "r", encoding="utf-8").read() + for template_name, template_file in resolved_additional_chat_template_files.items() + } + if resolved_raw_chat_template_file is not None: + with open(resolved_raw_chat_template_file, "r", encoding="utf-8") as reader: + chat_templates["default"] = reader.read() + if isinstance(chat_templates, dict) and "default" in chat_templates and len(chat_templates) == 1: + chat_templates = chat_templates["default"] # Flatten when we just have a single template/file + + if chat_templates: + kwargs["chat_template"] = chat_templates + + # Existing processors on the Hub created before #27761 being merged don't have `processor_config.json` (if not + # updated afterward), and we need to keep `from_pretrained` work. So here it fallbacks to the empty dict. + # (`cached_file` called using `_raise_exceptions_for_missing_entries=False` to avoid exception) + # However, for models added in the future, we won't get the expected error if this file is missing. + if resolved_processor_file is None: + # In any case we need to pass `chat_template` if it is available + processor_dict = {} + if "chat_template" in kwargs: + processor_dict = {"chat_template": kwargs.pop("chat_template")} + return processor_dict, kwargs + + try: + # Load processor dict + with open(resolved_processor_file, encoding="utf-8") as reader: + text = reader.read() + processor_dict = json.loads(text) + + except json.JSONDecodeError: + raise OSError(f"It looks like the config file at '{resolved_processor_file}' is not a valid JSON file.") + + if is_local: + logger.info(f"loading configuration file {resolved_processor_file}") + else: + logger.info(f"loading configuration file {processor_file} from cache at {resolved_processor_file}") + + if "chat_template" in processor_dict and processor_dict["chat_template"] is not None: + logger.warning_once( + "Chat templates should be in a 'chat_template.jinja' file but found key='chat_template' " + "in the processor's config. Make sure to move your template to its own file." + ) + + if "chat_template" in kwargs: + processor_dict["chat_template"] = kwargs.pop("chat_template") + + if not is_local: + if "auto_map" in processor_dict: + processor_dict["auto_map"] = add_model_info_to_auto_map( + processor_dict["auto_map"], pretrained_model_name_or_path + ) + if "custom_pipelines" in processor_dict: + processor_dict["custom_pipelines"] = add_model_info_to_custom_pipelines( + processor_dict["custom_pipelines"], pretrained_model_name_or_path + ) + + return processor_dict, kwargs + + @classmethod + def from_args_and_dict(cls, args, processor_dict: dict[str, Any], **kwargs): + """ + Instantiates a type of [`~processing_utils.ProcessingMixin`] from a Python dictionary of parameters. + + Args: + processor_dict (`Dict[str, Any]`): + Dictionary that will be used to instantiate the processor object. Such a dictionary can be + retrieved from a pretrained checkpoint by leveraging the + [`~processing_utils.ProcessingMixin.to_dict`] method. + kwargs (`Dict[str, Any]`): + Additional parameters from which to initialize the processor object. + + Returns: + [`~processing_utils.ProcessingMixin`]: The processor object instantiated from those + parameters. + """ + processor_dict = processor_dict.copy() + return_unused_kwargs = kwargs.pop("return_unused_kwargs", False) + + # We have to pop up some unused (but specific) kwargs and then validate that it doesn't contain unused kwargs + # If we don't pop, some specific kwargs will raise a warning + if "processor_class" in processor_dict: + del processor_dict["processor_class"] + + if "auto_map" in processor_dict: + del processor_dict["auto_map"] + + unused_kwargs = cls.validate_init_kwargs(processor_config=processor_dict, valid_kwargs=cls.valid_kwargs) + processor = cls(*args, **processor_dict) + + # Update processor with kwargs if needed + for key in set(kwargs.keys()): + if hasattr(processor, key): + setattr(processor, key, kwargs.pop(key)) + + kwargs.update(unused_kwargs) + logger.info(f"Processor {processor}") + if return_unused_kwargs: + return processor, kwargs + else: + return processor + + def _merge_kwargs( + self, + ModelProcessorKwargs: ProcessingKwargs, + tokenizer_init_kwargs: Optional[dict] = None, + **kwargs, + ) -> dict[str, dict]: + """ + Method to merge dictionaries of kwargs cleanly separated by modality within a Processor instance. + The order of operations is as follows: + 1) kwargs passed as before have highest priority to preserve BC. + ```python + high_priority_kwargs = {"crop_size" = {"height": 222, "width": 222}, "padding" = "max_length"} + processor(..., **high_priority_kwargs) + ``` + 2) kwargs passed as modality-specific kwargs have second priority. This is the recommended API. + ```python + processor(..., text_kwargs={"padding": "max_length"}, images_kwargs={"crop_size": {"height": 222, "width": 222}}}) + ``` + 3) kwargs passed during instantiation of a modality processor have fourth priority. + ```python + tokenizer = tokenizer_class(..., {"padding": "max_length"}) + image_processor = image_processor_class(...) + processor(tokenizer, image_processor) # will pass max_length unless overridden by kwargs at call + ``` + 4) defaults kwargs specified at processor level have lowest priority. + ```python + class MyProcessingKwargs(ProcessingKwargs, CommonKwargs, TextKwargs, ImagesKwargs, total=False): + _defaults = { + "text_kwargs": { + "padding": "max_length", + "max_length": 64, + }, + } + ``` + Args: + ModelProcessorKwargs (`ProcessingKwargs`): + Typed dictionary of kwargs specifically required by the model passed. + tokenizer_init_kwargs (`Dict`, *optional*): + Dictionary of kwargs the tokenizer was instantiated with and need to take precedence over defaults. + + Returns: + output_kwargs (`Dict`): + Dictionary of per-modality kwargs to be passed to each modality-specific processor. + + """ + # Initialize dictionaries + output_kwargs = { + "text_kwargs": {}, + "images_kwargs": {}, + "audio_kwargs": {}, + "videos_kwargs": {}, + "common_kwargs": {}, + } + + default_kwargs = { + "text_kwargs": {}, + "images_kwargs": {}, + "audio_kwargs": {}, + "videos_kwargs": {}, + "common_kwargs": {}, + } + + possible_modality_keywords = {"text", "audio", "videos", "images"} + used_keys = set() + + # get defaults from set model processor kwargs if they exist + for modality in default_kwargs: + default_kwargs[modality] = ModelProcessorKwargs._defaults.get(modality, {}).copy() + # update defaults with arguments from tokenizer init + for modality_key in ModelProcessorKwargs.__annotations__[modality].__annotations__.keys(): + # init with tokenizer init kwargs if necessary + if modality_key in tokenizer_init_kwargs: + value = ( + getattr(self.tokenizer, modality_key) + if hasattr(self.tokenizer, modality_key) + else tokenizer_init_kwargs[modality_key] + ) + default_kwargs[modality][modality_key] = value + # now defaults kwargs are updated with the tokenizers defaults. + # pass defaults to output dictionary + output_kwargs.update(default_kwargs) + + # update modality kwargs with passed kwargs + non_modality_kwargs = set(kwargs) - set(output_kwargs) + for modality in output_kwargs: + for modality_key in ModelProcessorKwargs.__annotations__[modality].__annotations__.keys(): + # check if we received a structured kwarg dict or not to handle it correctly + if modality in kwargs: + kwarg_value = kwargs[modality].pop(modality_key, "__empty__") + # check if this key was passed as a flat kwarg. + if kwarg_value != "__empty__" and modality_key in non_modality_kwargs: + raise ValueError( + f"Keyword argument {modality_key} was passed two times:\n" + f"in a dictionary for {modality} and as a **kwarg." + ) + elif modality_key in kwargs: + # we get a modality_key instead of popping it because modality-specific processors + # can have overlapping kwargs + kwarg_value = kwargs.get(modality_key, "__empty__") + else: + kwarg_value = "__empty__" + if not isinstance(kwarg_value, str) or kwarg_value != "__empty__": + output_kwargs[modality][modality_key] = kwarg_value + used_keys.add(modality_key) + + # Determine if kwargs is a flat dictionary or contains nested dictionaries + if any(key in default_kwargs for key in kwargs): + # kwargs is dictionary-based, and some keys match modality names + for modality, subdict in kwargs.items(): + if modality in default_kwargs: + for subkey, subvalue in subdict.items(): + if subkey not in used_keys: + output_kwargs[modality][subkey] = subvalue + used_keys.add(subkey) + else: + # kwargs is a flat dictionary + for key in kwargs: + if key not in used_keys: + if key in ModelProcessorKwargs.__annotations__["common_kwargs"].__annotations__.keys(): + output_kwargs["common_kwargs"][key] = kwargs[key] + elif key not in possible_modality_keywords: + logger.warning_once( + f"Keyword argument `{key}` is not a valid argument for this processor and will be ignored." + ) + + # all modality-specific kwargs are updated with common kwargs + for modality in output_kwargs: + output_kwargs[modality].update(output_kwargs["common_kwargs"]) + return output_kwargs + + @classmethod + def from_pretrained( + cls, + pretrained_model_name_or_path: Union[str, os.PathLike], + cache_dir: Optional[Union[str, os.PathLike]] = None, + force_download: bool = False, + local_files_only: bool = False, + token: Optional[Union[str, bool]] = None, + revision: str = "main", + **kwargs, + ): + r""" + Instantiate a processor associated with a pretrained model. + + + + This class method is simply calling the feature extractor + [`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`], image processor + [`~image_processing_utils.ImageProcessingMixin`] and the tokenizer + [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`] methods. Please refer to the docstrings of the + methods above for more information. + + + + Args: + pretrained_model_name_or_path (`str` or `os.PathLike`): + This can be either: + + - a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on + huggingface.co. + - a path to a *directory* containing a feature extractor file saved using the + [`~SequenceFeatureExtractor.save_pretrained`] method, e.g., `./my_model_directory/`. + - a path or url to a saved feature extractor JSON *file*, e.g., + `./my_model_directory/preprocessor_config.json`. + **kwargs + Additional keyword arguments passed along to both + [`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`] and + [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`]. + """ + kwargs["cache_dir"] = cache_dir + kwargs["force_download"] = force_download + kwargs["local_files_only"] = local_files_only + kwargs["revision"] = revision + + use_auth_token = kwargs.pop("use_auth_token", None) + if use_auth_token is not None: + warnings.warn( + "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.", + FutureWarning, + ) + if token is not None: + raise ValueError( + "`token` and `use_auth_token` are both specified. Please set only the argument `token`." + ) + token = use_auth_token + + if token is not None: + kwargs["token"] = token + + args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs) + processor_dict, kwargs = cls.get_processor_dict(pretrained_model_name_or_path, **kwargs) + return cls.from_args_and_dict(args, processor_dict, **kwargs) + + @classmethod + def register_for_auto_class(cls, auto_class="AutoProcessor"): + """ + Register this class with a given auto class. This should only be used for custom feature extractors as the ones + in the library are already mapped with `AutoProcessor`. + + + + This API is experimental and may have some slight breaking changes in the next releases. + + + + Args: + auto_class (`str` or `type`, *optional*, defaults to `"AutoProcessor"`): + The auto class to register this new feature extractor with. + """ + if not isinstance(auto_class, str): + auto_class = auto_class.__name__ + + import transformers.models.auto as auto_module + + if not hasattr(auto_module, auto_class): + raise ValueError(f"{auto_class} is not a valid auto class.") + + cls._auto_class = auto_class + + @classmethod + def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, **kwargs): + """ + Identify and instantiate the subcomponents of Processor classes, like image processors and + tokenizers. This method uses the Processor attributes like `tokenizer_class` to figure out what class those + subcomponents should be. Note that any subcomponents must either be library classes that are accessible in + the `transformers` root, or they must be custom code that has been registered with the relevant autoclass, + via methods like `AutoTokenizer.register()`. If neither of these conditions are fulfilled, this method + will be unable to find the relevant subcomponent class and will raise an error. + """ + args = [] + for attribute_name in cls.attributes: + class_name = getattr(cls, f"{attribute_name}_class") + if isinstance(class_name, tuple): + classes = tuple(cls.get_possibly_dynamic_module(n) if n is not None else None for n in class_name) + if attribute_name == "image_processor": + # TODO: @yoni, change logic in v4.52 (when use_fast set to True by default) + use_fast = kwargs.get("use_fast", None) + if use_fast is None: + logger.warning_once( + "Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. " + "`use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. " + "This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`." + ) + else: + use_fast = kwargs.get("use_fast", True) + if use_fast and classes[1] is not None: + attribute_class = classes[1] + else: + attribute_class = classes[0] + else: + attribute_class = cls.get_possibly_dynamic_module(class_name) + + args.append(attribute_class.from_pretrained(pretrained_model_name_or_path, **kwargs)) + return args + + @staticmethod + def get_possibly_dynamic_module(module_name): + if hasattr(transformers_module, module_name): + return getattr(transformers_module, module_name) + lookup_locations = [ + transformers_module.IMAGE_PROCESSOR_MAPPING, + transformers_module.TOKENIZER_MAPPING, + transformers_module.FEATURE_EXTRACTOR_MAPPING, + ] + for lookup_location in lookup_locations: + for custom_class in lookup_location._extra_content.values(): + if isinstance(custom_class, tuple): + for custom_subclass in custom_class: + if custom_subclass is not None and custom_subclass.__name__ == module_name: + return custom_subclass + elif custom_class is not None and custom_class.__name__ == module_name: + return custom_class + else: + raise ValueError( + f"Could not find module {module_name} in `transformers`. If this is a custom class, " + f"it should be registered using the relevant `AutoClass.register()` function so that " + f"other functions can find it!" + ) + + @property + def model_input_names(self): + first_attribute = getattr(self, self.attributes[0]) + return getattr(first_attribute, "model_input_names", None) + + @staticmethod + def validate_init_kwargs(processor_config, valid_kwargs): + kwargs_from_config = processor_config.keys() + unused_kwargs = {} + unused_keys = set(kwargs_from_config) - set(valid_kwargs) + if unused_keys: + unused_kwargs = {k: processor_config[k] for k in unused_keys} + return unused_kwargs + + def prepare_and_validate_optional_call_args(self, *args): + """ + Matches optional positional arguments to their corresponding names in `optional_call_args` + in the processor class in the order they are passed to the processor call. + + Note that this should only be used in the `__call__` method of the processors with special + arguments. Special arguments are arguments that aren't `text`, `images`, `audio`, nor `videos` + but also aren't passed to the tokenizer, image processor, etc. Examples of such processors are: + - `CLIPSegProcessor` + - `LayoutLMv2Processor` + - `OwlViTProcessor` + + Also note that passing by position to the processor call is now deprecated and will be disallowed + in future versions. We only have this for backward compatibility. + + Example: + Suppose that the processor class has `optional_call_args = ["arg_name_1", "arg_name_2"]`. + And we define the call method as: + ```python + def __call__( + self, + text: str, + images: Optional[ImageInput] = None, + *arg, + audio=None, + videos=None, + ) + ``` + + Then, if we call the processor as: + ```python + images = [...] + processor("What is common in these images?", images, arg_value_1, arg_value_2) + ``` + + Then, this method will return: + ```python + { + "arg_name_1": arg_value_1, + "arg_name_2": arg_value_2, + } + ``` + which we could then pass as kwargs to `self._merge_kwargs` + """ + if len(args): + warnings.warn( + "Passing positional arguments to the processor call is now deprecated and will be disallowed in v4.47. " + "Please pass all arguments as keyword arguments." + ) + if len(args) > len(self.optional_call_args): + raise ValueError( + f"Expected *at most* {len(self.optional_call_args)} optional positional arguments in processor call" + f"which will be matched with {' '.join(self.optional_call_args)} in the order they are passed." + f"However, got {len(args)} positional arguments instead." + "Please pass all arguments as keyword arguments instead (e.g. `processor(arg_name_1=..., arg_name_2=...))`." + ) + return {arg_name: arg_value for arg_value, arg_name in zip(args, self.optional_call_args)} + + def _process_messages_for_chat_template( + self, + conversation: List[List[Dict[str, str]]], + batch_images: List[ImageInput], + batch_videos: List[VideoInput], + batch_video_metadata: List[List[Dict[str, any]]], + **mm_load_kwargs: Unpack[ChatTemplateLoadKwargs], + ): + """ + Used within `apply_chat_template` when a model has a special way to process conversation history. For example, + video models might want to specify in the prompt the duration of video or which frame indices at which timestamps + were sampled. This information cannot be accessed before the video is loaded. + + For most models it is a no-op, and must be overridden by model processors which require special processing. + + Args: + conversation (`List[Dict, str, str]`): + The conversation to process. Always comes in batched format. + batch_images (`List[List[ImageInput]]`): + Batch of images that were loaded from url/path defined in the conversation. The images + are ordered in the same way as in the conversation. Comes in nested list format, one list of `PIL` images + per batch. + batch_videos (`List[List[ImageInput]]`): + Batch of videos that were loaded from url/path defined in the conversation. The videos + are ordered in the samm way as in the conversation. Comes in nested list format, one list of 4D video arrays + per batch. + batch_video_metadata (`List[List[Dict[[str, any]]]]`): + Batch of metadata returned from loading videos. That includes video fps, duration and total number of framer in original video. + Metadata are ordered in the same way as `batch_videos`. Comes in nested list format, one list of 4D video arrays + per batch. + + """ + return conversation + + def apply_chat_template( + self, + conversation: Union[list[dict[str, str]], list[list[dict[str, str]]]], + chat_template: Optional[str] = None, + **kwargs: Unpack[AllKwargsForChatTemplate], + ) -> str: + """ + Similar to the `apply_chat_template` method on tokenizers, this method applies a Jinja template to input + conversations to turn them into a single tokenizable string. + + The input is expected to be in the following format, where each message content is a list consisting of text and + optionally image or video inputs. One can also provide an image, video, URL or local path which will be used to form + `pixel_values` when `return_dict=True`. If not provided, one will get only the formatted text, optionally tokenized text. + + conversation = [ + { + "role": "user", + "content": [ + {"type": "image", "image": "https://www.ilankelman.org/stopsigns/australia.jpg"}, + {"type": "text", "text": "Please describe this image in detail."}, + ], + }, + ] + + Args: + conversation (`Union[List[Dict, [str, str]], List[List[Dict[str, str]]]]`): + The conversation to format. + chat_template (`Optional[str]`, *optional*): + The Jinja template to use for formatting the conversation. If not provided, the tokenizer's + chat template is used. + """ + + if chat_template is None: + if isinstance(self.chat_template, dict) and "default" in self.chat_template: + chat_template = self.chat_template["default"] + elif isinstance(self.chat_template, dict): + raise ValueError( + 'The processor has multiple chat templates but none of them are named "default". You need to specify' + " which one to use by passing the `chat_template` argument. Available templates are: " + f"{', '.join(self.chat_template.keys())}" + ) + elif self.chat_template is not None: + chat_template = self.chat_template + else: + raise ValueError( + "Cannot use apply_chat_template because this processor does not have a chat template." + ) + else: + if isinstance(self.chat_template, dict) and chat_template in self.chat_template: + # It's the name of a template, not a full template string + chat_template = self.chat_template[chat_template] + else: + # It's a template string, render it directly + chat_template = chat_template + + # Fill sets of kwargs that should be used by different parts of template + processed_kwargs = { + "mm_load_kwargs": {}, + "template_kwargs": {}, + } + + for kwarg_type in processed_kwargs: + for key in AllKwargsForChatTemplate.__annotations__[kwarg_type].__annotations__.keys(): + kwarg_type_defaults = AllKwargsForChatTemplate.__annotations__[kwarg_type] + default_value = getattr(kwarg_type_defaults, key, None) + value = kwargs.pop(key, default_value) + if value is not None and not isinstance(value, dict): + processed_kwargs[kwarg_type][key] = value + + if isinstance(conversation, (list, tuple)) and ( + isinstance(conversation[0], (list, tuple)) or hasattr(conversation[0], "content") + ): + is_batched = True + conversations = conversation + else: + is_batched = False + conversations = [conversation] + + tokenize = processed_kwargs["template_kwargs"].pop("tokenize", False) + return_dict = processed_kwargs["template_kwargs"].pop("return_dict", False) + mm_load_kwargs = processed_kwargs["mm_load_kwargs"] + + if tokenize: + batch_images, batch_videos = [], [] + batch_audios = [] + batch_video_metadata = [] + for conversation in conversations: + images, videos = [], [] + video_metadata = [] + for message in conversation: + visuals = [content for content in message["content"] if content["type"] in ["image", "video"]] + audio_fnames = [ + content[key] + for content in message["content"] + for key in ["audio", "url", "path"] + if key in content and content["type"] == "audio" + ] + image_fnames = [ + vision_info[key] + for vision_info in visuals + for key in ["image", "url", "path", "base64"] + if key in vision_info and vision_info["type"] == "image" + ] + video_fnames = [ + vision_info[key] + for vision_info in visuals + for key in ["video", "url", "path"] + if key in vision_info and vision_info["type"] == "video" + ] + + for fname in image_fnames: + images.append(load_image(fname)) + + # Audio models do not accept nested list of audios (yet!) so we construct a flat input audio list + if not mm_load_kwargs["load_audio_from_video"]: + for fname in audio_fnames: + batch_audios.append(load_audio(fname, sampling_rate=mm_load_kwargs["sampling_rate"])) + else: + for fname in video_fnames: + batch_audios.append(load_audio(fname, sampling_rate=mm_load_kwargs["sampling_rate"])) + + for fname in video_fnames: + if isinstance(fname, (list, tuple)) and isinstance(fname[0], str): + video = [np.array(load_image(image_fname)) for image_fname in fname] + # create a 4D video because `load_video` always returns a 4D array + video = np.stack(video) + metadata = None + logger.warning( + "When loading the video from list of images, we cannot infer metadata such as `fps` or `duration`. " + "If your model uses this metadata during processing, please load the whole video and let the model sample frames instead." + ) + else: + # TODO: raushan, should be `self.video_processor.load_video_for_model` when API is added + video, metadata = self._load_video_for_model( + fname, + num_frames=mm_load_kwargs.get("num_frames", None), + fps=mm_load_kwargs.get("video_fps", None), + backend=mm_load_kwargs["video_load_backend"], + **kwargs, + ) + videos.append(video) + video_metadata.append(metadata) + + # Currently all processors can accept nested list of batches, but not flat list of visuals + # So we'll make a batched list of images and let the processor handle it + if images: + batch_images.append(images) + if videos: + batch_videos.append(videos) + batch_video_metadata.append(video_metadata) + + # Process conversation with video/image information if needed. Then convert into a prompt using Jinja template + conversations = self._process_messages_for_chat_template( + conversations, + batch_images=batch_images, + batch_videos=batch_videos, + batch_video_metadata=batch_video_metadata, + **processed_kwargs["mm_load_kwargs"], + ) + + prompt = self.tokenizer.apply_chat_template( + conversations, + chat_template=chat_template, + tokenize=False, + return_dict=False, + **processed_kwargs["template_kwargs"], + ) + + if not is_batched: + prompt = prompt[0] + + if tokenize: + # Tokenizer's `apply_chat_template` never adds special tokens when tokenizing + # But processor's `apply_chat_template` didn't have an option to tokenize, so users had to format the prompt + # and pass it to the processor. Users thus never worried about special tokens relying on processor handling + # everything internally. The below line is to keep BC for that and be able to work with model that have + # special tokens in the template (consistent with tokenizers). We dont want to raise warning, it will flood command line + # without actionable solution for users + single_prompt = prompt[0] if is_batched else prompt + if self.tokenizer.bos_token is not None and single_prompt.startswith(self.tokenizer.bos_token): + kwargs["add_special_tokens"] = False + + out = self( + text=prompt, + images=batch_images if batch_images else None, + videos=batch_videos if batch_videos else None, + audio=batch_audios if batch_audios else None, + **kwargs, + ) + if return_dict: + return out + else: + return out["input_ids"] + return prompt + + # TODO: raushan, has to be public method under `VideoProcessorBase` when API is added + # Keep private so we can simply remove when needed + def _load_video_for_model( + self, + video: Union[str, "VideoInput"], + num_frames: Optional[int] = None, + fps: Optional[int] = None, + backend: str = "opencv", + **kwargs, + ) -> np.array: + """ + Loads `video` to a numpy array. + + Args: + video (`str` or `VideoInput`): + The video to convert to the numpy array format. Can be a link to video or local path. + num_frames (`int`, *optional*): + Number of frames to sample uniformly. If not passed, the whole video is loaded. + fps (`int`, *optional*): + Number of frames to sample per second. Should be passed only when `num_frames=None`. + If not specified and `num_frames==None`, all frames are sampled. + backend (`str`, *optional*, defaults to `"opencv"`): + The backend to use when loading the video. Can be any of ["decord", "pyav", "opencv", "torchvision"]. Defaults to "opencv". + + Returns: + Tuple[`np.array`, Dict]: A tuple containing: + - Numpy array of frames in RGB (shape: [num_frames, height, width, 3]). + - Metadata dictionary. + """ + video, metadata = load_video(video, num_frames, fps=fps, backend=backend) + return video, metadata + + def post_process_image_text_to_text(self, generated_outputs, skip_special_tokens=True, **kwargs): + """ + Post-process the output of a vlm to decode the text. + + Args: + generated_outputs (`torch.Tensor` or `np.ndarray`): + The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)` + or `(sequence_length,)`. + skip_special_tokens (`bool`, *optional*, defaults to `True`): + Whether or not to remove special tokens in the output. Argument passed to the tokenizer's `batch_decode` method. + **kwargs: + Additional arguments to be passed to the tokenizer's `batch_decode method`. + + Returns: + `List[str]`: The decoded text. + """ + return self.tokenizer.batch_decode(generated_outputs, skip_special_tokens=skip_special_tokens, **kwargs) + + def _check_special_mm_tokens(self, text: list[str], text_inputs: "BatchFeature", modalities: list[str]): + """ + Checks that number of special tokens in text and processed text is same. The count can be different + if tokenized text was truncated, leading to issues in model code. + """ + for modality in modalities: + token_str = getattr(self, f"{modality}_token") + token_id = getattr(self, f"{modality}_token_id") + ids_count = [list(ids).count(token_id) for ids in text_inputs["input_ids"]] + text_count = [sample.count(token_str) for sample in text] + + if ids_count != text_count: + raise ValueError( + f"Mismatch in `{modality}` token count between text and `input_ids`. Got ids={ids_count} and text={text_count}. " + "Likely due to `truncation='max_length'`. Please disable truncation or increase `max_length`." + ) + + +def _validate_images_text_input_order(images, text): + """ + For backward compatibility: reverse the order of `images` and `text` inputs if they are swapped. + This method should only be called for processors where `images` and `text` have been swapped for uniformization purposes. + Note that this method assumes that two `None` inputs are valid inputs. If this is not the case, it should be handled + in the processor's `__call__` method before calling this method. + """ + + def is_url(val) -> bool: + return isinstance(val, str) and val.startswith("http") + + def _is_valid_images_input_for_processor(imgs): + # If we have an list of images, make sure every image is valid + if isinstance(imgs, (list, tuple)): + for img in imgs: + if not _is_valid_images_input_for_processor(img): + return False + # If not a list or tuple, we have been given a single image or batched tensor of images + elif not (is_valid_image(imgs) or is_url(imgs)): + return False + return True + + def _is_valid_text_input_for_processor(t): + if isinstance(t, str): + # Strings are fine + return True + elif isinstance(t, (list, tuple)): + # List are fine as long as they are... + if len(t) == 0: + # ... not empty + return False + for t_s in t: + return _is_valid_text_input_for_processor(t_s) + return False + + def _is_valid(input, validator): + return validator(input) or input is None + + images_is_valid = _is_valid(images, _is_valid_images_input_for_processor) + images_is_text = _is_valid_text_input_for_processor(images) + + text_is_valid = _is_valid(text, _is_valid_text_input_for_processor) + text_is_images = _is_valid_images_input_for_processor(text) + # Handle cases where both inputs are valid + if images_is_valid and text_is_valid: + return images, text + + # Handle cases where inputs need to and can be swapped + if (images is None and text_is_images) or (text is None and images_is_text) or (images_is_text and text_is_images): + logger.warning_once( + "You may have used the wrong order for inputs. `images` should be passed before `text`. " + "The `images` and `text` inputs will be swapped. This behavior will be deprecated in transformers v4.47." + ) + return text, images + + raise ValueError("Invalid input type. Check that `images` and/or `text` are valid inputs.") + + +ProcessorMixin.push_to_hub = copy_func(ProcessorMixin.push_to_hub) +if ProcessorMixin.push_to_hub.__doc__ is not None: + ProcessorMixin.push_to_hub.__doc__ = ProcessorMixin.push_to_hub.__doc__.format( + object="processor", object_class="AutoProcessor", object_files="processor files" + ) diff --git a/docs/transformers/build/lib/transformers/py.typed b/docs/transformers/build/lib/transformers/py.typed new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/docs/transformers/build/lib/transformers/pytorch_utils.py b/docs/transformers/build/lib/transformers/pytorch_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..b2fe91253576724e52e5b4642137d59f0f588805 --- /dev/null +++ b/docs/transformers/build/lib/transformers/pytorch_utils.py @@ -0,0 +1,360 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from __future__ import annotations + +import inspect +from functools import lru_cache, wraps +from typing import Callable + +import torch +from safetensors.torch import storage_ptr, storage_size +from torch import nn + +from .utils import is_torch_greater_or_equal, is_torch_xla_available, is_torchdynamo_compiling, logging + + +ALL_LAYERNORM_LAYERS = [nn.LayerNorm] + +logger = logging.get_logger(__name__) + +is_torch_greater_or_equal_than_2_6 = is_torch_greater_or_equal("2.6", accept_dev=True) +is_torch_greater_or_equal_than_2_4 = is_torch_greater_or_equal("2.4", accept_dev=True) +is_torch_greater_or_equal_than_2_3 = is_torch_greater_or_equal("2.3", accept_dev=True) +is_torch_greater_or_equal_than_2_2 = is_torch_greater_or_equal("2.2", accept_dev=True) + +# For backwards compatibility (e.g. some remote codes on Hub using those variables). +is_torch_greater_or_equal_than_2_1 = is_torch_greater_or_equal("2.1", accept_dev=True) +is_torch_greater_or_equal_than_2_0 = is_torch_greater_or_equal("2.0", accept_dev=True) +is_torch_greater_or_equal_than_1_13 = is_torch_greater_or_equal("1.13", accept_dev=True) +is_torch_greater_or_equal_than_1_12 = is_torch_greater_or_equal("1.12", accept_dev=True) + +# Cache this result has it's a C FFI call which can be pretty time-consuming +_torch_distributed_available = torch.distributed.is_available() + +if is_torch_greater_or_equal("2.5") and _torch_distributed_available: + pass + + +def softmax_backward_data(parent, grad_output, output, dim, self): + """ + A function that calls the internal `_softmax_backward_data` PyTorch method and that adjusts the arguments according + to the torch version detected. + """ + + from torch import _softmax_backward_data + + return _softmax_backward_data(grad_output, output, parent.dim, self.dtype) + + +def prune_linear_layer(layer: nn.Linear, index: torch.LongTensor, dim: int = 0) -> nn.Linear: + """ + Prune a linear layer to keep only entries in index. + + Used to remove heads. + + Args: + layer (`torch.nn.Linear`): The layer to prune. + index (`torch.LongTensor`): The indices to keep in the layer. + dim (`int`, *optional*, defaults to 0): The dimension on which to keep the indices. + + Returns: + `torch.nn.Linear`: The pruned layer as a new layer with `requires_grad=True`. + """ + index = index.to(layer.weight.device) + W = layer.weight.index_select(dim, index).detach().clone() + if layer.bias is not None: + if dim == 1: + b = layer.bias.detach().clone() + else: + b = layer.bias[index].detach().clone() + new_size = list(layer.weight.size()) + new_size[dim] = len(index) + new_layer = nn.Linear(new_size[1], new_size[0], bias=layer.bias is not None).to(layer.weight.device) + new_layer.weight.requires_grad = False + new_layer.weight.copy_(W.contiguous()) + new_layer.weight.requires_grad = True + if layer.bias is not None: + new_layer.bias.requires_grad = False + new_layer.bias.copy_(b.contiguous()) + new_layer.bias.requires_grad = True + return new_layer + + +class Conv1D(nn.Module): + """ + 1D-convolutional layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2). + + Basically works like a linear layer but the weights are transposed. + + Args: + nf (`int`): The number of output features. + nx (`int`): The number of input features. + """ + + def __init__(self, nf, nx): + super().__init__() + self.nf = nf + self.nx = nx + self.weight = nn.Parameter(torch.empty(nx, nf)) + self.bias = nn.Parameter(torch.zeros(nf)) + nn.init.normal_(self.weight, std=0.02) + + def __repr__(self) -> str: + return "Conv1D(nf={nf}, nx={nx})".format(**self.__dict__) + + def forward(self, x): + size_out = x.size()[:-1] + (self.nf,) + x = torch.addmm(self.bias, x.view(-1, x.size(-1)), self.weight) + x = x.view(size_out) + return x + + +def prune_conv1d_layer(layer: Conv1D, index: torch.LongTensor, dim: int = 1) -> Conv1D: + """ + Prune a Conv1D layer to keep only entries in index. A Conv1D work as a Linear layer (see e.g. BERT) but the weights + are transposed. + + Used to remove heads. + + Args: + layer ([`~pytorch_utils.Conv1D`]): The layer to prune. + index (`torch.LongTensor`): The indices to keep in the layer. + dim (`int`, *optional*, defaults to 1): The dimension on which to keep the indices. + + Returns: + [`~pytorch_utils.Conv1D`]: The pruned layer as a new layer with `requires_grad=True`. + """ + index = index.to(layer.weight.device) + W = layer.weight.index_select(dim, index).detach().clone() + if dim == 0: + b = layer.bias.detach().clone() + else: + b = layer.bias[index].detach().clone() + new_size = list(layer.weight.size()) + new_size[dim] = len(index) + new_layer = Conv1D(new_size[1], new_size[0]).to(layer.weight.device) + new_layer.weight.requires_grad = False + new_layer.weight.copy_(W.contiguous()) + new_layer.weight.requires_grad = True + new_layer.bias.requires_grad = False + new_layer.bias.copy_(b.contiguous()) + new_layer.bias.requires_grad = True + return new_layer + + +def prune_layer(layer: nn.Linear | Conv1D, index: torch.LongTensor, dim: int | None = None) -> nn.Linear | Conv1D: + """ + Prune a Conv1D or linear layer to keep only entries in index. + + Used to remove heads. + + Args: + layer (`Union[torch.nn.Linear, Conv1D]`): The layer to prune. + index (`torch.LongTensor`): The indices to keep in the layer. + dim (`int`, *optional*): The dimension on which to keep the indices. + + Returns: + `torch.nn.Linear` or [`~pytorch_utils.Conv1D`]: The pruned layer as a new layer with `requires_grad=True`. + """ + if isinstance(layer, nn.Linear): + return prune_linear_layer(layer, index, dim=0 if dim is None else dim) + elif isinstance(layer, Conv1D): + return prune_conv1d_layer(layer, index, dim=1 if dim is None else dim) + else: + raise ValueError(f"Can't prune layer of class {layer.__class__}") + + +def apply_chunking_to_forward( + forward_fn: Callable[..., torch.Tensor], + chunk_size: int, + chunk_dim: int, + *input_tensors, +) -> torch.Tensor: + """ + This function chunks the `input_tensors` into smaller input tensor parts of size `chunk_size` over the dimension + `chunk_dim`. It then applies a layer `forward_fn` to each chunk independently to save memory. + + If the `forward_fn` is independent across the `chunk_dim` this function will yield the same result as directly + applying `forward_fn` to `input_tensors`. + + Args: + forward_fn (`Callable[..., torch.Tensor]`): + The forward function of the model. + chunk_size (`int`): + The chunk size of a chunked tensor: `num_chunks = len(input_tensors[0]) / chunk_size`. + chunk_dim (`int`): + The dimension over which the `input_tensors` should be chunked. + input_tensors (`Tuple[torch.Tensor]`): + The input tensors of `forward_fn` which will be chunked + + Returns: + `torch.Tensor`: A tensor with the same shape as the `forward_fn` would have given if applied`. + + + Examples: + + ```python + # rename the usual forward() fn to forward_chunk() + def forward_chunk(self, hidden_states): + hidden_states = self.decoder(hidden_states) + return hidden_states + + + # implement a chunked forward function + def forward(self, hidden_states): + return apply_chunking_to_forward(self.forward_chunk, self.chunk_size_lm_head, self.seq_len_dim, hidden_states) + ```""" + + assert len(input_tensors) > 0, f"{input_tensors} has to be a tuple/list of tensors" + + # inspect.signature exist since python 3.5 and is a python method -> no problem with backward compatibility + num_args_in_forward_chunk_fn = len(inspect.signature(forward_fn).parameters) + if num_args_in_forward_chunk_fn != len(input_tensors): + raise ValueError( + f"forward_chunk_fn expects {num_args_in_forward_chunk_fn} arguments, but only {len(input_tensors)} input " + "tensors are given" + ) + + if chunk_size > 0: + tensor_shape = input_tensors[0].shape[chunk_dim] + for input_tensor in input_tensors: + if input_tensor.shape[chunk_dim] != tensor_shape: + raise ValueError( + f"All input tenors have to be of the same shape: {tensor_shape}, " + f"found shape {input_tensor.shape[chunk_dim]}" + ) + + if input_tensors[0].shape[chunk_dim] % chunk_size != 0: + raise ValueError( + f"The dimension to be chunked {input_tensors[0].shape[chunk_dim]} has to be a multiple of the chunk " + f"size {chunk_size}" + ) + + num_chunks = input_tensors[0].shape[chunk_dim] // chunk_size + + # chunk input tensor into tuples + input_tensors_chunks = tuple(input_tensor.chunk(num_chunks, dim=chunk_dim) for input_tensor in input_tensors) + # apply forward fn to every tuple + output_chunks = tuple(forward_fn(*input_tensors_chunk) for input_tensors_chunk in zip(*input_tensors_chunks)) + # concatenate output at same dimension + return torch.cat(output_chunks, dim=chunk_dim) + + return forward_fn(*input_tensors) + + +def find_pruneable_heads_and_indices( + heads: list[int], n_heads: int, head_size: int, already_pruned_heads: set[int] +) -> tuple[set[int], torch.LongTensor]: + """ + Finds the heads and their indices taking `already_pruned_heads` into account. + + Args: + heads (`List[int]`): List of the indices of heads to prune. + n_heads (`int`): The number of heads in the model. + head_size (`int`): The size of each head. + already_pruned_heads (`Set[int]`): A set of already pruned heads. + + Returns: + `Tuple[Set[int], torch.LongTensor]`: A tuple with the indices of heads to prune taking `already_pruned_heads` + into account and the indices of rows/columns to keep in the layer weight. + """ + mask = torch.ones(n_heads, head_size) + heads = set(heads) - already_pruned_heads # Convert to set and remove already pruned heads + for head in heads: + # Compute how many pruned heads are before the head and move the index accordingly + head = head - sum(1 if h < head else 0 for h in already_pruned_heads) + mask[head] = 0 + mask = mask.view(-1).contiguous().eq(1) + index: torch.LongTensor = torch.arange(len(mask))[mask].long() + return heads, index + + +def meshgrid(*tensors: torch.Tensor | list[torch.Tensor], indexing: str | None = None) -> tuple[torch.Tensor, ...]: + """ + Wrapper around torch.meshgrid to avoid warning messages about the introduced `indexing` argument. + + Reference: https://pytorch.org/docs/1.13/generated/torch.meshgrid.html + """ + return torch.meshgrid(*tensors, indexing=indexing) + + +def id_tensor_storage(tensor: torch.Tensor) -> tuple[torch.device, int, int]: + """ + Unique identifier to a tensor storage. Multiple different tensors can share the same underlying storage. For + example, "meta" tensors all share the same storage, and thus their identifier will all be equal. This identifier is + guaranteed to be unique and constant for this tensor's storage during its lifetime. Two tensor storages with + non-overlapping lifetimes may have the same id. + """ + if tensor.device.type == "xla" and is_torch_xla_available(): + # NOTE: xla tensors dont have storage + # use some other unique id to distinguish. + # this is a XLA tensor, it must be created using torch_xla's + # device. So the following import is safe: + import torch_xla + + unique_id = torch_xla._XLAC._xla_get_tensor_id(tensor) + else: + unique_id = storage_ptr(tensor) + + return tensor.device, unique_id, storage_size(tensor) + + +def isin_mps_friendly(elements: torch.Tensor, test_elements: torch.Tensor | int) -> torch.Tensor: + """ + Same as `torch.isin` without flags, but MPS-friendly. We can remove this function when we stop supporting + torch <= 2.3. See https://github.com/pytorch/pytorch/issues/77764#issuecomment-2067838075 + + Args: + elements (`torch.Tensor`): Input elements + test_elements (`torch.Tensor` or `int`): The elements to check against. + + Returns: + `torch.Tensor`: A boolean tensor of the same shape as `elements` that is True for `elements` in `test_elements` + and False otherwise + """ + + if elements.device.type == "mps" and not is_torch_greater_or_equal_than_2_4: + test_elements = torch.tensor(test_elements) + if test_elements.ndim == 0: + test_elements = test_elements.unsqueeze(0) + return elements.tile(test_elements.shape[0], 1).eq(test_elements.unsqueeze(1)).sum(dim=0).bool().squeeze() + else: + # Note: don't use named arguments in `torch.isin`, see https://github.com/pytorch/pytorch/issues/126045 + return torch.isin(elements, test_elements) + + +def compile_compatible_method_lru_cache(*lru_args, **lru_kwargs): + """ + LRU cache decorator from standard functools library, but with a workaround to disable + caching when torchdynamo is compiling. Expected to work with class methods. + """ + + def decorator(func): + @wraps(func) + def wrapper(self, *args, **kwargs): + if not is_torchdynamo_compiling(): + # Cache the function only if the model is not being compiled + # check if the function is already cached, otherwise create it + if not hasattr(self, f"_cached_{func.__name__}"): + self.__setattr__( + f"_cached_{func.__name__}", lru_cache(*lru_args, **lru_kwargs)(func.__get__(self)) + ) + return self.__getattribute__(f"_cached_{func.__name__}")(*args, **kwargs) + else: + # Otherwise, just call the original function + return func(self, *args, **kwargs) + + return wrapper + + return decorator diff --git a/docs/transformers/build/lib/transformers/safetensors_conversion.py b/docs/transformers/build/lib/transformers/safetensors_conversion.py new file mode 100644 index 0000000000000000000000000000000000000000..f1612d3ea57c98fd1d383887cfbeb4e2882d3963 --- /dev/null +++ b/docs/transformers/build/lib/transformers/safetensors_conversion.py @@ -0,0 +1,105 @@ +from typing import Optional + +import requests +from huggingface_hub import Discussion, HfApi, get_repo_discussions + +from .utils import cached_file, http_user_agent, logging + + +logger = logging.get_logger(__name__) + + +def previous_pr(api: HfApi, model_id: str, pr_title: str, token: str) -> Optional["Discussion"]: + main_commit = api.list_repo_commits(model_id, token=token)[0].commit_id + for discussion in get_repo_discussions(repo_id=model_id, token=token): + if discussion.title == pr_title and discussion.status == "open" and discussion.is_pull_request: + commits = api.list_repo_commits(model_id, revision=discussion.git_reference, token=token) + + if main_commit == commits[1].commit_id: + return discussion + return None + + +def spawn_conversion(token: str, private: bool, model_id: str): + logger.info("Attempting to convert .bin model on the fly to safetensors.") + + safetensors_convert_space_url = "https://safetensors-convert.hf.space" + sse_url = f"{safetensors_convert_space_url}/call/run" + + def start(_sse_connection): + for line in _sse_connection.iter_lines(): + line = line.decode() + if line.startswith("event:"): + status = line[7:] + logger.debug(f"Safetensors conversion status: {status}") + + if status == "complete": + return + elif status == "heartbeat": + logger.debug("Heartbeat") + else: + logger.debug(f"Unknown status {status}") + else: + logger.debug(line) + + data = {"data": [model_id, private, token]} + + result = requests.post(sse_url, stream=True, json=data).json() + event_id = result["event_id"] + + with requests.get(f"{sse_url}/{event_id}", stream=True) as sse_connection: + try: + logger.debug("Spawning safetensors automatic conversion.") + start(sse_connection) + except Exception as e: + logger.warning(f"Error during conversion: {repr(e)}") + + +def get_conversion_pr_reference(api: HfApi, model_id: str, **kwargs): + private = api.model_info(model_id).private + + logger.info("Attempting to create safetensors variant") + pr_title = "Adding `safetensors` variant of this model" + token = kwargs.get("token") + + # This looks into the current repo's open PRs to see if a PR for safetensors was already open. If so, it + # returns it. It checks that the PR was opened by the bot and not by another user so as to prevent + # security breaches. + pr = previous_pr(api, model_id, pr_title, token=token) + + if pr is None or (not private and pr.author != "SFconvertbot"): + spawn_conversion(token, private, model_id) + pr = previous_pr(api, model_id, pr_title, token=token) + else: + logger.info("Safetensors PR exists") + + sha = f"refs/pr/{pr.num}" + + return sha + + +def auto_conversion(pretrained_model_name_or_path: str, ignore_errors_during_conversion=False, **cached_file_kwargs): + try: + api = HfApi(token=cached_file_kwargs.get("token"), headers={"user-agent": http_user_agent()}) + sha = get_conversion_pr_reference(api, pretrained_model_name_or_path, **cached_file_kwargs) + + if sha is None: + return None, None + cached_file_kwargs["revision"] = sha + del cached_file_kwargs["_commit_hash"] + + # This is an additional HEAD call that could be removed if we could infer sharded/non-sharded from the PR + # description. + sharded = api.file_exists( + pretrained_model_name_or_path, + "model.safetensors.index.json", + revision=sha, + token=cached_file_kwargs.get("token"), + ) + filename = "model.safetensors.index.json" if sharded else "model.safetensors" + + resolved_archive_file = cached_file(pretrained_model_name_or_path, filename, **cached_file_kwargs) + return resolved_archive_file, sha, sharded + except Exception as e: + if not ignore_errors_during_conversion: + raise e diff --git a/docs/transformers/build/lib/transformers/testing_utils.py b/docs/transformers/build/lib/transformers/testing_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..a2ff5330acf8e5f6064d5d4c35cd91fb1abf6b9f --- /dev/null +++ b/docs/transformers/build/lib/transformers/testing_utils.py @@ -0,0 +1,3182 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import collections +import contextlib +import copy +import doctest +import functools +import gc +import importlib +import inspect +import logging +import multiprocessing +import os +import re +import shlex +import shutil +import subprocess +import sys +import tempfile +import threading +import time +import unittest +from collections import UserDict, defaultdict +from collections.abc import Generator, Iterable, Iterator, Mapping +from dataclasses import MISSING, fields +from functools import cache, wraps +from io import StringIO +from pathlib import Path +from typing import Any, Callable, Optional, Union +from unittest import mock +from unittest.mock import patch + +import huggingface_hub.utils +import requests +import urllib3 +from huggingface_hub import delete_repo +from packaging import version + +from transformers import Trainer +from transformers import logging as transformers_logging + +from .integrations import ( + is_clearml_available, + is_optuna_available, + is_ray_available, + is_sigopt_available, + is_swanlab_available, + is_tensorboard_available, + is_wandb_available, +) +from .integrations.deepspeed import is_deepspeed_available +from .utils import ( + ACCELERATE_MIN_VERSION, + GGUF_MIN_VERSION, + is_accelerate_available, + is_apex_available, + is_apollo_torch_available, + is_aqlm_available, + is_auto_awq_available, + is_auto_gptq_available, + is_auto_round_available, + is_av_available, + is_bitsandbytes_available, + is_bitsandbytes_multi_backend_available, + is_bs4_available, + is_compressed_tensors_available, + is_cv2_available, + is_cython_available, + is_detectron2_available, + is_eetq_available, + is_essentia_available, + is_faiss_available, + is_fbgemm_gpu_available, + is_flash_attn_2_available, + is_flax_available, + is_flute_available, + is_fsdp_available, + is_ftfy_available, + is_g2p_en_available, + is_galore_torch_available, + is_gguf_available, + is_gptqmodel_available, + is_grokadamw_available, + is_hadamard_available, + is_hqq_available, + is_ipex_available, + is_jieba_available, + is_jinja_available, + is_jumanpp_available, + is_keras_nlp_available, + is_levenshtein_available, + is_librosa_available, + is_liger_kernel_available, + is_lomo_available, + is_natten_available, + is_nltk_available, + is_onnx_available, + is_optimum_available, + is_optimum_quanto_available, + is_pandas_available, + is_peft_available, + is_phonemizer_available, + is_pretty_midi_available, + is_pyctcdecode_available, + is_pytesseract_available, + is_pytest_available, + is_pytorch_quantization_available, + is_quark_available, + is_rjieba_available, + is_sacremoses_available, + is_safetensors_available, + is_schedulefree_available, + is_scipy_available, + is_sentencepiece_available, + is_seqio_available, + is_soundfile_available, + is_spacy_available, + is_spqr_available, + is_sudachi_available, + is_sudachi_projection_available, + is_tensorflow_probability_available, + is_tensorflow_text_available, + is_tf2onnx_available, + is_tf_available, + is_tiktoken_available, + is_timm_available, + is_tokenizers_available, + is_torch_available, + is_torch_bf16_available_on_device, + is_torch_bf16_gpu_available, + is_torch_fp16_available_on_device, + is_torch_greater_or_equal, + is_torch_hpu_available, + is_torch_mlu_available, + is_torch_neuroncore_available, + is_torch_npu_available, + is_torch_sdpa_available, + is_torch_tensorrt_fx_available, + is_torch_tf32_available, + is_torch_xla_available, + is_torch_xpu_available, + is_torchao_available, + is_torchaudio_available, + is_torchdynamo_available, + is_torchvision_available, + is_vision_available, + is_vptq_available, + strtobool, +) + + +if is_accelerate_available(): + from accelerate.state import AcceleratorState, PartialState + from accelerate.utils.imports import is_fp8_available + + +if is_pytest_available(): + from _pytest.doctest import ( + Module, + _get_checker, + _get_continue_on_failure, + _get_runner, + _is_mocked, + _patch_unwrap_mock_aware, + get_optionflags, + ) + from _pytest.outcomes import skip + from _pytest.pathlib import import_path + from pytest import DoctestItem +else: + Module = object + DoctestItem = object + + +SMALL_MODEL_IDENTIFIER = "julien-c/bert-xsmall-dummy" +DUMMY_UNKNOWN_IDENTIFIER = "julien-c/dummy-unknown" +DUMMY_DIFF_TOKENIZER_IDENTIFIER = "julien-c/dummy-diff-tokenizer" +# Used to test Auto{Config, Model, Tokenizer} model_type detection. + +# Used to test the hub +USER = "__DUMMY_TRANSFORMERS_USER__" +ENDPOINT_STAGING = "https://hub-ci.huggingface.co" + +# Not critical, only usable on the sandboxed CI instance. +TOKEN = "hf_94wBhPGp6KrrTH3KDchhKpRxZwd6dmHWLL" + +if is_torch_available(): + import torch + + IS_ROCM_SYSTEM = torch.version.hip is not None + IS_CUDA_SYSTEM = torch.version.cuda is not None + IS_XPU_SYSTEM = getattr(torch.version, "xpu", None) is not None +else: + IS_ROCM_SYSTEM = False + IS_CUDA_SYSTEM = False + IS_XPU_SYSTEM = False + +logger = transformers_logging.get_logger(__name__) + + +def parse_flag_from_env(key, default=False): + try: + value = os.environ[key] + except KeyError: + # KEY isn't set, default to `default`. + _value = default + else: + # KEY is set, convert it to True or False. + try: + _value = strtobool(value) + except ValueError: + # More values are supported, but let's keep the message simple. + raise ValueError(f"If set, {key} must be yes or no.") + return _value + + +def parse_int_from_env(key, default=None): + try: + value = os.environ[key] + except KeyError: + _value = default + else: + try: + _value = int(value) + except ValueError: + raise ValueError(f"If set, {key} must be a int.") + return _value + + +_run_slow_tests = parse_flag_from_env("RUN_SLOW", default=False) +_run_custom_tokenizers = parse_flag_from_env("RUN_CUSTOM_TOKENIZERS", default=False) +_run_staging = parse_flag_from_env("HUGGINGFACE_CO_STAGING", default=False) +_run_pipeline_tests = parse_flag_from_env("RUN_PIPELINE_TESTS", default=True) +_run_agent_tests = parse_flag_from_env("RUN_AGENT_TESTS", default=False) + + +def is_staging_test(test_case): + """ + Decorator marking a test as a staging test. + + Those tests will run using the staging environment of huggingface.co instead of the real model hub. + """ + if not _run_staging: + return unittest.skip(reason="test is staging test")(test_case) + else: + try: + import pytest # We don't need a hard dependency on pytest in the main library + except ImportError: + return test_case + else: + return pytest.mark.is_staging_test()(test_case) + + +def is_pipeline_test(test_case): + """ + Decorator marking a test as a pipeline test. If RUN_PIPELINE_TESTS is set to a falsy value, those tests will be + skipped. + """ + if not _run_pipeline_tests: + return unittest.skip(reason="test is pipeline test")(test_case) + else: + try: + import pytest # We don't need a hard dependency on pytest in the main library + except ImportError: + return test_case + else: + return pytest.mark.is_pipeline_test()(test_case) + + +def is_agent_test(test_case): + """ + Decorator marking a test as an agent test. If RUN_TOOL_TESTS is set to a falsy value, those tests will be skipped. + """ + if not _run_agent_tests: + return unittest.skip(reason="test is an agent test")(test_case) + else: + try: + import pytest # We don't need a hard dependency on pytest in the main library + except ImportError: + return test_case + else: + return pytest.mark.is_agent_test()(test_case) + + +def slow(test_case): + """ + Decorator marking a test as slow. + + Slow tests are skipped by default. Set the RUN_SLOW environment variable to a truthy value to run them. + + """ + return unittest.skipUnless(_run_slow_tests, "test is slow")(test_case) + + +def tooslow(test_case): + """ + Decorator marking a test as too slow. + + Slow tests are skipped while they're in the process of being fixed. No test should stay tagged as "tooslow" as + these will not be tested by the CI. + + """ + return unittest.skip(reason="test is too slow")(test_case) + + +def skip_if_not_implemented(test_func): + @functools.wraps(test_func) + def wrapper(*args, **kwargs): + try: + return test_func(*args, **kwargs) + except NotImplementedError as e: + raise unittest.SkipTest(f"Test skipped due to NotImplementedError: {e}") + + return wrapper + + +def apply_skip_if_not_implemented(cls): + """ + Class decorator to apply @skip_if_not_implemented to all test methods. + """ + for attr_name in dir(cls): + if attr_name.startswith("test_"): + attr = getattr(cls, attr_name) + if callable(attr): + setattr(cls, attr_name, skip_if_not_implemented(attr)) + return cls + + +def custom_tokenizers(test_case): + """ + Decorator marking a test for a custom tokenizer. + + Custom tokenizers require additional dependencies, and are skipped by default. Set the RUN_CUSTOM_TOKENIZERS + environment variable to a truthy value to run them. + """ + return unittest.skipUnless(_run_custom_tokenizers, "test of custom tokenizers")(test_case) + + +def require_bs4(test_case): + """ + Decorator marking a test that requires BeautifulSoup4. These tests are skipped when BeautifulSoup4 isn't installed. + """ + return unittest.skipUnless(is_bs4_available(), "test requires BeautifulSoup4")(test_case) + + +def require_galore_torch(test_case): + """ + Decorator marking a test that requires GaLore. These tests are skipped when GaLore isn't installed. + https://github.com/jiaweizzhao/GaLore + """ + return unittest.skipUnless(is_galore_torch_available(), "test requires GaLore")(test_case) + + +def require_apollo_torch(test_case): + """ + Decorator marking a test that requires GaLore. These tests are skipped when APOLLO isn't installed. + https://github.com/zhuhanqing/APOLLO + """ + return unittest.skipUnless(is_apollo_torch_available(), "test requires APOLLO")(test_case) + + +def require_lomo(test_case): + """ + Decorator marking a test that requires LOMO. These tests are skipped when LOMO-optim isn't installed. + https://github.com/OpenLMLab/LOMO + """ + return unittest.skipUnless(is_lomo_available(), "test requires LOMO")(test_case) + + +def require_grokadamw(test_case): + """ + Decorator marking a test that requires GrokAdamW. These tests are skipped when GrokAdamW isn't installed. + """ + return unittest.skipUnless(is_grokadamw_available(), "test requires GrokAdamW")(test_case) + + +def require_schedulefree(test_case): + """ + Decorator marking a test that requires schedulefree. These tests are skipped when schedulefree isn't installed. + https://github.com/facebookresearch/schedule_free + """ + return unittest.skipUnless(is_schedulefree_available(), "test requires schedulefree")(test_case) + + +def require_cv2(test_case): + """ + Decorator marking a test that requires OpenCV. + + These tests are skipped when OpenCV isn't installed. + + """ + return unittest.skipUnless(is_cv2_available(), "test requires OpenCV")(test_case) + + +def require_levenshtein(test_case): + """ + Decorator marking a test that requires Levenshtein. + + These tests are skipped when Levenshtein isn't installed. + + """ + return unittest.skipUnless(is_levenshtein_available(), "test requires Levenshtein")(test_case) + + +def require_nltk(test_case): + """ + Decorator marking a test that requires NLTK. + + These tests are skipped when NLTK isn't installed. + + """ + return unittest.skipUnless(is_nltk_available(), "test requires NLTK")(test_case) + + +def require_accelerate(test_case, min_version: str = ACCELERATE_MIN_VERSION): + """ + Decorator marking a test that requires accelerate. These tests are skipped when accelerate isn't installed. + """ + return unittest.skipUnless( + is_accelerate_available(min_version), f"test requires accelerate version >= {min_version}" + )(test_case) + + +def require_gguf(test_case, min_version: str = GGUF_MIN_VERSION): + """ + Decorator marking a test that requires ggguf. These tests are skipped when gguf isn't installed. + """ + return unittest.skipUnless(is_gguf_available(min_version), f"test requires gguf version >= {min_version}")( + test_case + ) + + +def require_fsdp(test_case, min_version: str = "1.12.0"): + """ + Decorator marking a test that requires fsdp. These tests are skipped when fsdp isn't installed. + """ + return unittest.skipUnless(is_fsdp_available(min_version), f"test requires torch version >= {min_version}")( + test_case + ) + + +def require_g2p_en(test_case): + """ + Decorator marking a test that requires g2p_en. These tests are skipped when SentencePiece isn't installed. + """ + return unittest.skipUnless(is_g2p_en_available(), "test requires g2p_en")(test_case) + + +def require_safetensors(test_case): + """ + Decorator marking a test that requires safetensors. These tests are skipped when safetensors isn't installed. + """ + return unittest.skipUnless(is_safetensors_available(), "test requires safetensors")(test_case) + + +def require_rjieba(test_case): + """ + Decorator marking a test that requires rjieba. These tests are skipped when rjieba isn't installed. + """ + return unittest.skipUnless(is_rjieba_available(), "test requires rjieba")(test_case) + + +def require_jieba(test_case): + """ + Decorator marking a test that requires jieba. These tests are skipped when jieba isn't installed. + """ + return unittest.skipUnless(is_jieba_available(), "test requires jieba")(test_case) + + +def require_jinja(test_case): + """ + Decorator marking a test that requires jinja. These tests are skipped when jinja isn't installed. + """ + return unittest.skipUnless(is_jinja_available(), "test requires jinja")(test_case) + + +def require_tf2onnx(test_case): + return unittest.skipUnless(is_tf2onnx_available(), "test requires tf2onnx")(test_case) + + +def require_onnx(test_case): + return unittest.skipUnless(is_onnx_available(), "test requires ONNX")(test_case) + + +def require_timm(test_case): + """ + Decorator marking a test that requires Timm. + + These tests are skipped when Timm isn't installed. + + """ + return unittest.skipUnless(is_timm_available(), "test requires Timm")(test_case) + + +def require_natten(test_case): + """ + Decorator marking a test that requires NATTEN. + + These tests are skipped when NATTEN isn't installed. + + """ + return unittest.skipUnless(is_natten_available(), "test requires natten")(test_case) + + +def require_torch(test_case): + """ + Decorator marking a test that requires PyTorch. + + These tests are skipped when PyTorch isn't installed. + + """ + return unittest.skipUnless(is_torch_available(), "test requires PyTorch")(test_case) + + +def require_torch_greater_or_equal(version: str): + """ + Decorator marking a test that requires PyTorch version >= `version`. + + These tests are skipped when PyTorch version is less than `version`. + """ + + def decorator(test_case): + return unittest.skipUnless(is_torch_greater_or_equal(version), f"test requires PyTorch version >= {version}")( + test_case + ) + + return decorator + + +def require_flash_attn(test_case): + """ + Decorator marking a test that requires Flash Attention. + + These tests are skipped when Flash Attention isn't installed. + + """ + return unittest.skipUnless(is_flash_attn_2_available(), "test requires Flash Attention")(test_case) + + +def require_torch_sdpa(test_case): + """ + Decorator marking a test that requires PyTorch's SDPA. + + These tests are skipped when requirements are not met (torch version). + """ + return unittest.skipUnless(is_torch_sdpa_available(), "test requires PyTorch SDPA")(test_case) + + +def require_read_token(fn): + """ + A decorator that loads the HF token for tests that require to load gated models. + """ + token = os.getenv("HF_HUB_READ_TOKEN") + + @wraps(fn) + def _inner(*args, **kwargs): + if token is not None: + with patch("huggingface_hub.utils._headers.get_token", return_value=token): + return fn(*args, **kwargs) + else: # Allow running locally with the default token env variable + return fn(*args, **kwargs) + + return _inner + + +def require_peft(test_case): + """ + Decorator marking a test that requires PEFT. + + These tests are skipped when PEFT isn't installed. + + """ + return unittest.skipUnless(is_peft_available(), "test requires PEFT")(test_case) + + +def require_torchvision(test_case): + """ + Decorator marking a test that requires Torchvision. + + These tests are skipped when Torchvision isn't installed. + + """ + return unittest.skipUnless(is_torchvision_available(), "test requires Torchvision")(test_case) + + +def require_torch_or_tf(test_case): + """ + Decorator marking a test that requires PyTorch or TensorFlow. + + These tests are skipped when neither PyTorch not TensorFlow is installed. + + """ + return unittest.skipUnless(is_torch_available() or is_tf_available(), "test requires PyTorch or TensorFlow")( + test_case + ) + + +def require_intel_extension_for_pytorch(test_case): + """ + Decorator marking a test that requires Intel Extension for PyTorch. + + These tests are skipped when Intel Extension for PyTorch isn't installed or it does not match current PyTorch + version. + + """ + return unittest.skipUnless( + is_ipex_available(), + "test requires Intel Extension for PyTorch to be installed and match current PyTorch version, see" + " https://github.com/intel/intel-extension-for-pytorch", + )(test_case) + + +def require_tensorflow_probability(test_case): + """ + Decorator marking a test that requires TensorFlow probability. + + These tests are skipped when TensorFlow probability isn't installed. + + """ + return unittest.skipUnless(is_tensorflow_probability_available(), "test requires TensorFlow probability")( + test_case + ) + + +def require_torchaudio(test_case): + """ + Decorator marking a test that requires torchaudio. These tests are skipped when torchaudio isn't installed. + """ + return unittest.skipUnless(is_torchaudio_available(), "test requires torchaudio")(test_case) + + +def require_tf(test_case): + """ + Decorator marking a test that requires TensorFlow. These tests are skipped when TensorFlow isn't installed. + """ + return unittest.skipUnless(is_tf_available(), "test requires TensorFlow")(test_case) + + +def require_flax(test_case): + """ + Decorator marking a test that requires JAX & Flax. These tests are skipped when one / both are not installed + """ + return unittest.skipUnless(is_flax_available(), "test requires JAX & Flax")(test_case) + + +def require_sentencepiece(test_case): + """ + Decorator marking a test that requires SentencePiece. These tests are skipped when SentencePiece isn't installed. + """ + return unittest.skipUnless(is_sentencepiece_available(), "test requires SentencePiece")(test_case) + + +def require_sacremoses(test_case): + """ + Decorator marking a test that requires Sacremoses. These tests are skipped when Sacremoses isn't installed. + """ + return unittest.skipUnless(is_sacremoses_available(), "test requires Sacremoses")(test_case) + + +def require_seqio(test_case): + """ + Decorator marking a test that requires SentencePiece. These tests are skipped when SentencePiece isn't installed. + """ + return unittest.skipUnless(is_seqio_available(), "test requires Seqio")(test_case) + + +def require_scipy(test_case): + """ + Decorator marking a test that requires Scipy. These tests are skipped when SentencePiece isn't installed. + """ + return unittest.skipUnless(is_scipy_available(), "test requires Scipy")(test_case) + + +def require_tokenizers(test_case): + """ + Decorator marking a test that requires 🤗 Tokenizers. These tests are skipped when 🤗 Tokenizers isn't installed. + """ + return unittest.skipUnless(is_tokenizers_available(), "test requires tokenizers")(test_case) + + +def require_tensorflow_text(test_case): + """ + Decorator marking a test that requires tensorflow_text. These tests are skipped when tensroflow_text isn't + installed. + """ + return unittest.skipUnless(is_tensorflow_text_available(), "test requires tensorflow_text")(test_case) + + +def require_keras_nlp(test_case): + """ + Decorator marking a test that requires keras_nlp. These tests are skipped when keras_nlp isn't installed. + """ + return unittest.skipUnless(is_keras_nlp_available(), "test requires keras_nlp")(test_case) + + +def require_pandas(test_case): + """ + Decorator marking a test that requires pandas. These tests are skipped when pandas isn't installed. + """ + return unittest.skipUnless(is_pandas_available(), "test requires pandas")(test_case) + + +def require_pytesseract(test_case): + """ + Decorator marking a test that requires PyTesseract. These tests are skipped when PyTesseract isn't installed. + """ + return unittest.skipUnless(is_pytesseract_available(), "test requires PyTesseract")(test_case) + + +def require_pytorch_quantization(test_case): + """ + Decorator marking a test that requires PyTorch Quantization Toolkit. These tests are skipped when PyTorch + Quantization Toolkit isn't installed. + """ + return unittest.skipUnless(is_pytorch_quantization_available(), "test requires PyTorch Quantization Toolkit")( + test_case + ) + + +def require_vision(test_case): + """ + Decorator marking a test that requires the vision dependencies. These tests are skipped when torchaudio isn't + installed. + """ + return unittest.skipUnless(is_vision_available(), "test requires vision")(test_case) + + +def require_ftfy(test_case): + """ + Decorator marking a test that requires ftfy. These tests are skipped when ftfy isn't installed. + """ + return unittest.skipUnless(is_ftfy_available(), "test requires ftfy")(test_case) + + +def require_spacy(test_case): + """ + Decorator marking a test that requires SpaCy. These tests are skipped when SpaCy isn't installed. + """ + return unittest.skipUnless(is_spacy_available(), "test requires spacy")(test_case) + + +def require_torch_multi_gpu(test_case): + """ + Decorator marking a test that requires a multi-GPU CUDA setup (in PyTorch). These tests are skipped on a machine without + multiple CUDA GPUs. + + To run *only* the multi_gpu tests, assuming all test names contain multi_gpu: $ pytest -sv ./tests -k "multi_gpu" + """ + if not is_torch_available(): + return unittest.skip(reason="test requires PyTorch")(test_case) + + import torch + + return unittest.skipUnless(torch.cuda.device_count() > 1, "test requires multiple CUDA GPUs")(test_case) + + +def require_torch_multi_accelerator(test_case): + """ + Decorator marking a test that requires a multi-accelerator (in PyTorch). These tests are skipped on a machine + without multiple accelerators. To run *only* the multi_accelerator tests, assuming all test names contain + multi_accelerator: $ pytest -sv ./tests -k "multi_accelerator" + """ + if not is_torch_available(): + return unittest.skip(reason="test requires PyTorch")(test_case) + + return unittest.skipUnless(backend_device_count(torch_device) > 1, "test requires multiple accelerators")( + test_case + ) + + +def require_torch_non_multi_gpu(test_case): + """ + Decorator marking a test that requires 0 or 1 GPU setup (in PyTorch). + """ + if not is_torch_available(): + return unittest.skip(reason="test requires PyTorch")(test_case) + + import torch + + return unittest.skipUnless(torch.cuda.device_count() < 2, "test requires 0 or 1 GPU")(test_case) + + +def require_torch_non_multi_accelerator(test_case): + """ + Decorator marking a test that requires 0 or 1 accelerator setup (in PyTorch). + """ + if not is_torch_available(): + return unittest.skip(reason="test requires PyTorch")(test_case) + + return unittest.skipUnless(backend_device_count(torch_device) < 2, "test requires 0 or 1 accelerator")(test_case) + + +def require_torch_up_to_2_gpus(test_case): + """ + Decorator marking a test that requires 0 or 1 or 2 GPU setup (in PyTorch). + """ + if not is_torch_available(): + return unittest.skip(reason="test requires PyTorch")(test_case) + + import torch + + return unittest.skipUnless(torch.cuda.device_count() < 3, "test requires 0 or 1 or 2 GPUs")(test_case) + + +def require_torch_up_to_2_accelerators(test_case): + """ + Decorator marking a test that requires 0 or 1 or 2 accelerator setup (in PyTorch). + """ + if not is_torch_available(): + return unittest.skip(reason="test requires PyTorch")(test_case) + + return unittest.skipUnless(backend_device_count(torch_device) < 3, "test requires 0 or 1 or 2 accelerators")( + test_case + ) + + +def require_torch_xla(test_case): + """ + Decorator marking a test that requires TorchXLA (in PyTorch). + """ + return unittest.skipUnless(is_torch_xla_available(), "test requires TorchXLA")(test_case) + + +def require_torch_neuroncore(test_case): + """ + Decorator marking a test that requires NeuronCore (in PyTorch). + """ + return unittest.skipUnless(is_torch_neuroncore_available(check_device=False), "test requires PyTorch NeuronCore")( + test_case + ) + + +def require_torch_npu(test_case): + """ + Decorator marking a test that requires NPU (in PyTorch). + """ + return unittest.skipUnless(is_torch_npu_available(), "test requires PyTorch NPU")(test_case) + + +def require_torch_multi_npu(test_case): + """ + Decorator marking a test that requires a multi-NPU setup (in PyTorch). These tests are skipped on a machine without + multiple NPUs. + + To run *only* the multi_npu tests, assuming all test names contain multi_npu: $ pytest -sv ./tests -k "multi_npu" + """ + if not is_torch_npu_available(): + return unittest.skip(reason="test requires PyTorch NPU")(test_case) + + return unittest.skipUnless(torch.npu.device_count() > 1, "test requires multiple NPUs")(test_case) + + +def require_non_hpu(test_case): + """ + Decorator marking a test that should be skipped for HPU. + """ + return unittest.skipUnless(torch_device != "hpu", "test requires a non-HPU")(test_case) + + +def require_torch_xpu(test_case): + """ + Decorator marking a test that requires XPU (in PyTorch). + + These tests are skipped when XPU backend is not available. XPU backend might be available either via stock + PyTorch (>=2.4) or via Intel Extension for PyTorch. In the latter case, if IPEX is installed, its version + must match match current PyTorch version. + """ + return unittest.skipUnless(is_torch_xpu_available(), "test requires XPU device")(test_case) + + +def require_non_xpu(test_case): + """ + Decorator marking a test that should be skipped for XPU. + """ + return unittest.skipUnless(torch_device != "xpu", "test requires a non-XPU")(test_case) + + +def require_torch_multi_xpu(test_case): + """ + Decorator marking a test that requires a multi-XPU setup (in PyTorch). These tests are skipped on a machine without + multiple XPUs. + + To run *only* the multi_xpu tests, assuming all test names contain multi_xpu: $ pytest -sv ./tests -k "multi_xpu" + """ + if not is_torch_xpu_available(): + return unittest.skip(reason="test requires PyTorch XPU")(test_case) + + return unittest.skipUnless(torch.xpu.device_count() > 1, "test requires multiple XPUs")(test_case) + + +def require_torch_multi_hpu(test_case): + """ + Decorator marking a test that requires a multi-HPU setup (in PyTorch). These tests are skipped on a machine without + multiple HPUs. + + To run *only* the multi_hpu tests, assuming all test names contain multi_hpu: $ pytest -sv ./tests -k "multi_hpu" + """ + if not is_torch_hpu_available(): + return unittest.skip(reason="test requires PyTorch HPU")(test_case) + + return unittest.skipUnless(torch.hpu.device_count() > 1, "test requires multiple HPUs")(test_case) + + +if is_torch_available(): + # Set env var CUDA_VISIBLE_DEVICES="" to force cpu-mode + import torch + + if "TRANSFORMERS_TEST_BACKEND" in os.environ: + backend = os.environ["TRANSFORMERS_TEST_BACKEND"] + try: + _ = importlib.import_module(backend) + except ModuleNotFoundError as e: + raise ModuleNotFoundError( + f"Failed to import `TRANSFORMERS_TEST_BACKEND` '{backend}'! This should be the name of an installed module. The original error (look up to see its" + f" traceback):\n{e}" + ) from e + + if "TRANSFORMERS_TEST_DEVICE" in os.environ: + torch_device = os.environ["TRANSFORMERS_TEST_DEVICE"] + if torch_device == "cuda" and not torch.cuda.is_available(): + raise ValueError( + f"TRANSFORMERS_TEST_DEVICE={torch_device}, but CUDA is unavailable. Please double-check your testing environment." + ) + if torch_device == "xpu" and not is_torch_xpu_available(): + raise ValueError( + f"TRANSFORMERS_TEST_DEVICE={torch_device}, but XPU is unavailable. Please double-check your testing environment." + ) + if torch_device == "npu" and not is_torch_npu_available(): + raise ValueError( + f"TRANSFORMERS_TEST_DEVICE={torch_device}, but NPU is unavailable. Please double-check your testing environment." + ) + if torch_device == "mlu" and not is_torch_mlu_available(): + raise ValueError( + f"TRANSFORMERS_TEST_DEVICE={torch_device}, but MLU is unavailable. Please double-check your testing environment." + ) + if torch_device == "hpu" and not is_torch_hpu_available(): + raise ValueError( + f"TRANSFORMERS_TEST_DEVICE={torch_device}, but HPU is unavailable. Please double-check your testing environment." + ) + + try: + # try creating device to see if provided device is valid + _ = torch.device(torch_device) + except RuntimeError as e: + raise RuntimeError( + f"Unknown testing device specified by environment variable `TRANSFORMERS_TEST_DEVICE`: {torch_device}" + ) from e + elif torch.cuda.is_available(): + torch_device = "cuda" + elif is_torch_npu_available(): + torch_device = "npu" + elif is_torch_mlu_available(): + torch_device = "mlu" + elif is_torch_hpu_available(): + torch_device = "hpu" + elif is_torch_xpu_available(): + torch_device = "xpu" + else: + torch_device = "cpu" +else: + torch_device = None + +if is_tf_available(): + import tensorflow as tf + +if is_flax_available(): + import jax + + jax_device = jax.default_backend() +else: + jax_device = None + + +def require_torchdynamo(test_case): + """Decorator marking a test that requires TorchDynamo""" + return unittest.skipUnless(is_torchdynamo_available(), "test requires TorchDynamo")(test_case) + + +def require_torchao(test_case): + """Decorator marking a test that requires torchao""" + return unittest.skipUnless(is_torchao_available(), "test requires torchao")(test_case) + + +def require_torchao_version_greater_or_equal(torchao_version): + def decorator(test_case): + correct_torchao_version = is_torchao_available() and version.parse( + version.parse(importlib.metadata.version("torchao")).base_version + ) >= version.parse(torchao_version) + return unittest.skipUnless( + correct_torchao_version, f"Test requires torchao with the version greater than {torchao_version}." + )(test_case) + + return decorator + + +def require_torch_tensorrt_fx(test_case): + """Decorator marking a test that requires Torch-TensorRT FX""" + return unittest.skipUnless(is_torch_tensorrt_fx_available(), "test requires Torch-TensorRT FX")(test_case) + + +def require_torch_gpu(test_case): + """Decorator marking a test that requires CUDA and PyTorch.""" + return unittest.skipUnless(torch_device == "cuda", "test requires CUDA")(test_case) + + +def require_torch_large_gpu(test_case, memory: float = 20): + """Decorator marking a test that requires a CUDA GPU with more than `memory` GiB of memory.""" + if torch_device != "cuda": + return unittest.skip(reason=f"test requires a CUDA GPU with more than {memory} GiB of memory")(test_case) + + return unittest.skipUnless( + torch.cuda.get_device_properties(0).total_memory / 1024**3 > memory, + f"test requires a GPU with more than {memory} GiB of memory", + )(test_case) + + +def require_torch_large_accelerator(test_case, memory: float = 20): + """Decorator marking a test that requires an accelerator with more than `memory` GiB of memory.""" + if torch_device != "cuda" and torch_device != "xpu": + return unittest.skip(reason=f"test requires a GPU or XPU with more than {memory} GiB of memory")(test_case) + + torch_accelerator_module = getattr(torch, torch_device) + + return unittest.skipUnless( + torch_accelerator_module.get_device_properties(0).total_memory / 1024**3 > memory, + f"test requires a GPU or XPU with more than {memory} GiB of memory", + )(test_case) + + +def require_torch_gpu_if_bnb_not_multi_backend_enabled(test_case): + """ + Decorator marking a test that requires a GPU if bitsandbytes multi-backend feature is not enabled. + """ + if is_bitsandbytes_available() and is_bitsandbytes_multi_backend_available(): + return test_case + return require_torch_gpu(test_case) + + +def require_torch_accelerator(test_case): + """Decorator marking a test that requires an accessible accelerator and PyTorch.""" + return unittest.skipUnless(torch_device is not None and torch_device != "cpu", "test requires accelerator")( + test_case + ) + + +def require_torch_fp16(test_case): + """Decorator marking a test that requires a device that supports fp16""" + return unittest.skipUnless( + is_torch_fp16_available_on_device(torch_device), "test requires device with fp16 support" + )(test_case) + + +def require_fp8(test_case): + """Decorator marking a test that requires supports for fp8""" + return unittest.skipUnless(is_accelerate_available() and is_fp8_available(), "test requires fp8 support")( + test_case + ) + + +def require_torch_bf16(test_case): + """Decorator marking a test that requires a device that supports bf16""" + return unittest.skipUnless( + is_torch_bf16_available_on_device(torch_device), "test requires device with bf16 support" + )(test_case) + + +def require_torch_bf16_gpu(test_case): + """Decorator marking a test that requires torch>=1.10, using Ampere GPU or newer arch with cuda>=11.0""" + return unittest.skipUnless( + is_torch_bf16_gpu_available(), + "test requires torch>=1.10, using Ampere GPU or newer arch with cuda>=11.0", + )(test_case) + + +def require_deterministic_for_xpu(test_case): + @wraps(test_case) + def wrapper(*args, **kwargs): + if is_torch_xpu_available(): + original_state = torch.are_deterministic_algorithms_enabled() + try: + torch.use_deterministic_algorithms(True) + return test_case(*args, **kwargs) + finally: + torch.use_deterministic_algorithms(original_state) + else: + return test_case(*args, **kwargs) + + return wrapper + + +def require_torch_tf32(test_case): + """Decorator marking a test that requires Ampere or a newer GPU arch, cuda>=11 and torch>=1.7.""" + return unittest.skipUnless( + is_torch_tf32_available(), "test requires Ampere or a newer GPU arch, cuda>=11 and torch>=1.7" + )(test_case) + + +def require_detectron2(test_case): + """Decorator marking a test that requires detectron2.""" + return unittest.skipUnless(is_detectron2_available(), "test requires `detectron2`")(test_case) + + +def require_faiss(test_case): + """Decorator marking a test that requires faiss.""" + return unittest.skipUnless(is_faiss_available(), "test requires `faiss`")(test_case) + + +def require_optuna(test_case): + """ + Decorator marking a test that requires optuna. + + These tests are skipped when optuna isn't installed. + + """ + return unittest.skipUnless(is_optuna_available(), "test requires optuna")(test_case) + + +def require_ray(test_case): + """ + Decorator marking a test that requires Ray/tune. + + These tests are skipped when Ray/tune isn't installed. + + """ + return unittest.skipUnless(is_ray_available(), "test requires Ray/tune")(test_case) + + +def require_sigopt(test_case): + """ + Decorator marking a test that requires SigOpt. + + These tests are skipped when SigOpt isn't installed. + + """ + return unittest.skipUnless(is_sigopt_available(), "test requires SigOpt")(test_case) + + +def require_swanlab(test_case): + """ + Decorator marking a test that requires swanlab. + + These tests are skipped when swanlab isn't installed. + + """ + return unittest.skipUnless(is_swanlab_available(), "test requires swanlab")(test_case) + + +def require_wandb(test_case): + """ + Decorator marking a test that requires wandb. + + These tests are skipped when wandb isn't installed. + + """ + return unittest.skipUnless(is_wandb_available(), "test requires wandb")(test_case) + + +def require_clearml(test_case): + """ + Decorator marking a test requires clearml. + + These tests are skipped when clearml isn't installed. + + """ + return unittest.skipUnless(is_clearml_available(), "test requires clearml")(test_case) + + +def require_soundfile(test_case): + """ + Decorator marking a test that requires soundfile + + These tests are skipped when soundfile isn't installed. + + """ + return unittest.skipUnless(is_soundfile_available(), "test requires soundfile")(test_case) + + +def require_deepspeed(test_case): + """ + Decorator marking a test that requires deepspeed + """ + return unittest.skipUnless(is_deepspeed_available(), "test requires deepspeed")(test_case) + + +def require_apex(test_case): + """ + Decorator marking a test that requires apex + """ + return unittest.skipUnless(is_apex_available(), "test requires apex")(test_case) + + +def require_aqlm(test_case): + """ + Decorator marking a test that requires aqlm + """ + return unittest.skipUnless(is_aqlm_available(), "test requires aqlm")(test_case) + + +def require_vptq(test_case): + """ + Decorator marking a test that requires vptq + """ + return unittest.skipUnless(is_vptq_available(), "test requires vptq")(test_case) + + +def require_spqr(test_case): + """ + Decorator marking a test that requires spqr + """ + return unittest.skipUnless(is_spqr_available(), "test requires spqr")(test_case) + + +def require_eetq(test_case): + """ + Decorator marking a test that requires eetq + """ + eetq_available = is_eetq_available() + if eetq_available: + try: + import eetq # noqa: F401 + except ImportError as exc: + if "shard_checkpoint" in str(exc): + # EETQ 1.0.0 is currently broken with the latest transformers because it tries to import the removed + # shard_checkpoint function, see https://github.com/NetEase-FuXi/EETQ/issues/34. + # TODO: Remove once eetq releases a fix and this release is used in CI + eetq_available = False + return unittest.skipUnless(eetq_available, "test requires eetq")(test_case) + + +def require_av(test_case): + """ + Decorator marking a test that requires av + """ + return unittest.skipUnless(is_av_available(), "test requires av")(test_case) + + +def require_bitsandbytes(test_case): + """ + Decorator marking a test that requires the bitsandbytes library. Will be skipped when the library or its hard dependency torch is not installed. + """ + if is_bitsandbytes_available() and is_torch_available(): + try: + import pytest + + return pytest.mark.bitsandbytes(test_case) + except ImportError: + return test_case + else: + return unittest.skip(reason="test requires bitsandbytes and torch")(test_case) + + +def require_optimum(test_case): + """ + Decorator for optimum dependency + """ + return unittest.skipUnless(is_optimum_available(), "test requires optimum")(test_case) + + +def require_tensorboard(test_case): + """ + Decorator for `tensorboard` dependency + """ + return unittest.skipUnless(is_tensorboard_available(), "test requires tensorboard") + + +def require_gptq(test_case): + """ + Decorator for auto_gptq dependency + """ + return unittest.skipUnless( + is_gptqmodel_available() or is_auto_gptq_available(), "test requires gptqmodel or auto-gptq" + )(test_case) + + +def require_hqq(test_case): + """ + Decorator for hqq dependency + """ + return unittest.skipUnless(is_hqq_available(), "test requires hqq")(test_case) + + +def require_auto_awq(test_case): + """ + Decorator for auto_awq dependency + """ + return unittest.skipUnless(is_auto_awq_available(), "test requires autoawq")(test_case) + + +def require_auto_round(test_case): + """ + Decorator for auto_round dependency + """ + return unittest.skipUnless(is_auto_round_available(), "test requires autoround")(test_case) + + +def require_optimum_quanto(test_case): + """ + Decorator for quanto dependency + """ + return unittest.skipUnless(is_optimum_quanto_available(), "test requires optimum-quanto")(test_case) + + +def require_compressed_tensors(test_case): + """ + Decorator for compressed_tensors dependency + """ + return unittest.skipUnless(is_compressed_tensors_available(), "test requires compressed_tensors")(test_case) + + +def require_fbgemm_gpu(test_case): + """ + Decorator for fbgemm_gpu dependency + """ + return unittest.skipUnless(is_fbgemm_gpu_available(), "test requires fbgemm-gpu")(test_case) + + +def require_quark(test_case): + """ + Decorator for quark dependency + """ + return unittest.skipUnless(is_quark_available(), "test requires quark")(test_case) + + +def require_flute_hadamard(test_case): + """ + Decorator marking a test that requires higgs and hadamard + """ + return unittest.skipUnless( + is_flute_available() and is_hadamard_available(), "test requires flute and fast_hadamard_transform" + )(test_case) + + +def require_phonemizer(test_case): + """ + Decorator marking a test that requires phonemizer + """ + return unittest.skipUnless(is_phonemizer_available(), "test requires phonemizer")(test_case) + + +def require_pyctcdecode(test_case): + """ + Decorator marking a test that requires pyctcdecode + """ + return unittest.skipUnless(is_pyctcdecode_available(), "test requires pyctcdecode")(test_case) + + +def require_librosa(test_case): + """ + Decorator marking a test that requires librosa + """ + return unittest.skipUnless(is_librosa_available(), "test requires librosa")(test_case) + + +def require_liger_kernel(test_case): + """ + Decorator marking a test that requires liger_kernel + """ + return unittest.skipUnless(is_liger_kernel_available(), "test requires liger_kernel")(test_case) + + +def require_essentia(test_case): + """ + Decorator marking a test that requires essentia + """ + return unittest.skipUnless(is_essentia_available(), "test requires essentia")(test_case) + + +def require_pretty_midi(test_case): + """ + Decorator marking a test that requires pretty_midi + """ + return unittest.skipUnless(is_pretty_midi_available(), "test requires pretty_midi")(test_case) + + +def cmd_exists(cmd): + return shutil.which(cmd) is not None + + +def require_usr_bin_time(test_case): + """ + Decorator marking a test that requires `/usr/bin/time` + """ + return unittest.skipUnless(cmd_exists("/usr/bin/time"), "test requires /usr/bin/time")(test_case) + + +def require_sudachi(test_case): + """ + Decorator marking a test that requires sudachi + """ + return unittest.skipUnless(is_sudachi_available(), "test requires sudachi")(test_case) + + +def require_sudachi_projection(test_case): + """ + Decorator marking a test that requires sudachi_projection + """ + return unittest.skipUnless(is_sudachi_projection_available(), "test requires sudachi which supports projection")( + test_case + ) + + +def require_jumanpp(test_case): + """ + Decorator marking a test that requires jumanpp + """ + return unittest.skipUnless(is_jumanpp_available(), "test requires jumanpp")(test_case) + + +def require_cython(test_case): + """ + Decorator marking a test that requires jumanpp + """ + return unittest.skipUnless(is_cython_available(), "test requires cython")(test_case) + + +def require_tiktoken(test_case): + """ + Decorator marking a test that requires TikToken. These tests are skipped when TikToken isn't installed. + """ + return unittest.skipUnless(is_tiktoken_available(), "test requires TikToken")(test_case) + + +def get_gpu_count(): + """ + Return the number of available gpus (regardless of whether torch, tf or jax is used) + """ + if is_torch_available(): + import torch + + return torch.cuda.device_count() + elif is_tf_available(): + import tensorflow as tf + + return len(tf.config.list_physical_devices("GPU")) + elif is_flax_available(): + import jax + + return jax.device_count() + else: + return 0 + + +def get_tests_dir(append_path=None): + """ + Args: + append_path: optional path to append to the tests dir path + + Return: + The full path to the `tests` dir, so that the tests can be invoked from anywhere. Optionally `append_path` is + joined after the `tests` dir the former is provided. + + """ + # this function caller's __file__ + caller__file__ = inspect.stack()[1][1] + tests_dir = os.path.abspath(os.path.dirname(caller__file__)) + + while not tests_dir.endswith("tests"): + tests_dir = os.path.dirname(tests_dir) + + if append_path: + return os.path.join(tests_dir, append_path) + else: + return tests_dir + + +def get_steps_per_epoch(trainer: Trainer) -> int: + training_args = trainer.args + train_dataloader = trainer.get_train_dataloader() + + initial_training_values = trainer.set_initial_training_values( + args=training_args, + dataloader=train_dataloader, + total_train_batch_size=training_args.per_device_train_batch_size, + ) + steps_per_epoch = initial_training_values[1] + + return steps_per_epoch + + +def evaluate_side_effect_factory( + side_effect_values: list[dict[str, float]], +) -> Generator[dict[str, float], None, None]: + """ + Function that returns side effects for the _evaluate method. + Used when we're unsure of exactly how many times _evaluate will be called. + """ + yield from side_effect_values + + while True: + yield side_effect_values[-1] + + +# +# Helper functions for dealing with testing text outputs +# The original code came from: +# https://github.com/fastai/fastai/blob/master/tests/utils/text.py + + +# When any function contains print() calls that get overwritten, like progress bars, +# a special care needs to be applied, since under pytest -s captured output (capsys +# or contextlib.redirect_stdout) contains any temporary printed strings, followed by +# \r's. This helper function ensures that the buffer will contain the same output +# with and without -s in pytest, by turning: +# foo bar\r tar mar\r final message +# into: +# final message +# it can handle a single string or a multiline buffer +def apply_print_resets(buf): + return re.sub(r"^.*\r", "", buf, 0, re.M) + + +def assert_screenout(out, what): + out_pr = apply_print_resets(out).lower() + match_str = out_pr.find(what.lower()) + assert match_str != -1, f"expecting to find {what} in output: f{out_pr}" + + +def set_model_tester_for_less_flaky_test(test_case): + target_num_hidden_layers = 1 + # TODO (if possible): Avoid exceptional cases + exceptional_classes = [ + "ZambaModelTester", + "Zamba2ModelTester", + "RwkvModelTester", + "AriaVisionText2TextModelTester", + "GPTNeoModelTester", + "DPTModelTester", + ] + if test_case.model_tester.__class__.__name__ in exceptional_classes: + target_num_hidden_layers = None + if hasattr(test_case.model_tester, "out_features") or hasattr(test_case.model_tester, "out_indices"): + target_num_hidden_layers = None + + if hasattr(test_case.model_tester, "num_hidden_layers") and target_num_hidden_layers is not None: + test_case.model_tester.num_hidden_layers = target_num_hidden_layers + if ( + hasattr(test_case.model_tester, "vision_config") + and "num_hidden_layers" in test_case.model_tester.vision_config + and target_num_hidden_layers is not None + ): + test_case.model_tester.vision_config = copy.deepcopy(test_case.model_tester.vision_config) + if isinstance(test_case.model_tester.vision_config, dict): + test_case.model_tester.vision_config["num_hidden_layers"] = 1 + else: + test_case.model_tester.vision_config.num_hidden_layers = 1 + if ( + hasattr(test_case.model_tester, "text_config") + and "num_hidden_layers" in test_case.model_tester.text_config + and target_num_hidden_layers is not None + ): + test_case.model_tester.text_config = copy.deepcopy(test_case.model_tester.text_config) + if isinstance(test_case.model_tester.text_config, dict): + test_case.model_tester.text_config["num_hidden_layers"] = 1 + else: + test_case.model_tester.text_config.num_hidden_layers = 1 + + # A few model class specific handling + + # For Albert + if hasattr(test_case.model_tester, "num_hidden_groups"): + test_case.model_tester.num_hidden_groups = test_case.model_tester.num_hidden_layers + + +def set_config_for_less_flaky_test(config): + target_attrs = [ + "rms_norm_eps", + "layer_norm_eps", + "norm_eps", + "norm_epsilon", + "layer_norm_epsilon", + "batch_norm_eps", + ] + for target_attr in target_attrs: + setattr(config, target_attr, 1.0) + + # norm layers (layer/group norm, etc.) could cause flaky tests when the tensors have very small variance. + # (We don't need the original epsilon values to check eager/sdpa matches) + attrs = ["text_config", "vision_config", "text_encoder", "audio_encoder", "decoder"] + for attr in attrs: + if hasattr(config, attr): + for target_attr in target_attrs: + setattr(getattr(config, attr), target_attr, 1.0) + + +def set_model_for_less_flaky_test(model): + # Another way to make sure norm layers have desired epsilon. (Some models don't set it from its config.) + target_names = ( + "LayerNorm", + "GroupNorm", + "BatchNorm", + "RMSNorm", + "BatchNorm2d", + "BatchNorm1d", + "BitGroupNormActivation", + "WeightStandardizedConv2d", + ) + target_attrs = ["eps", "epsilon", "variance_epsilon"] + if is_torch_available() and isinstance(model, torch.nn.Module): + for module in model.modules(): + if type(module).__name__.endswith(target_names): + for attr in target_attrs: + if hasattr(module, attr): + setattr(module, attr, 1.0) + + +class CaptureStd: + """ + Context manager to capture: + + - stdout: replay it, clean it up and make it available via `obj.out` + - stderr: replay it and make it available via `obj.err` + + Args: + out (`bool`, *optional*, defaults to `True`): Whether to capture stdout or not. + err (`bool`, *optional*, defaults to `True`): Whether to capture stderr or not. + replay (`bool`, *optional*, defaults to `True`): Whether to replay or not. + By default each captured stream gets replayed back on context's exit, so that one can see what the test was + doing. If this is a not wanted behavior and the captured data shouldn't be replayed, pass `replay=False` to + disable this feature. + + Examples: + + ```python + # to capture stdout only with auto-replay + with CaptureStdout() as cs: + print("Secret message") + assert "message" in cs.out + + # to capture stderr only with auto-replay + import sys + + with CaptureStderr() as cs: + print("Warning: ", file=sys.stderr) + assert "Warning" in cs.err + + # to capture both streams with auto-replay + with CaptureStd() as cs: + print("Secret message") + print("Warning: ", file=sys.stderr) + assert "message" in cs.out + assert "Warning" in cs.err + + # to capture just one of the streams, and not the other, with auto-replay + with CaptureStd(err=False) as cs: + print("Secret message") + assert "message" in cs.out + # but best use the stream-specific subclasses + + # to capture without auto-replay + with CaptureStd(replay=False) as cs: + print("Secret message") + assert "message" in cs.out + ```""" + + def __init__(self, out=True, err=True, replay=True): + self.replay = replay + + if out: + self.out_buf = StringIO() + self.out = "error: CaptureStd context is unfinished yet, called too early" + else: + self.out_buf = None + self.out = "not capturing stdout" + + if err: + self.err_buf = StringIO() + self.err = "error: CaptureStd context is unfinished yet, called too early" + else: + self.err_buf = None + self.err = "not capturing stderr" + + def __enter__(self): + if self.out_buf: + self.out_old = sys.stdout + sys.stdout = self.out_buf + + if self.err_buf: + self.err_old = sys.stderr + sys.stderr = self.err_buf + + return self + + def __exit__(self, *exc): + if self.out_buf: + sys.stdout = self.out_old + captured = self.out_buf.getvalue() + if self.replay: + sys.stdout.write(captured) + self.out = apply_print_resets(captured) + + if self.err_buf: + sys.stderr = self.err_old + captured = self.err_buf.getvalue() + if self.replay: + sys.stderr.write(captured) + self.err = captured + + def __repr__(self): + msg = "" + if self.out_buf: + msg += f"stdout: {self.out}\n" + if self.err_buf: + msg += f"stderr: {self.err}\n" + return msg + + +# in tests it's the best to capture only the stream that's wanted, otherwise +# it's easy to miss things, so unless you need to capture both streams, use the +# subclasses below (less typing). Or alternatively, configure `CaptureStd` to +# disable the stream you don't need to test. + + +class CaptureStdout(CaptureStd): + """Same as CaptureStd but captures only stdout""" + + def __init__(self, replay=True): + super().__init__(err=False, replay=replay) + + +class CaptureStderr(CaptureStd): + """Same as CaptureStd but captures only stderr""" + + def __init__(self, replay=True): + super().__init__(out=False, replay=replay) + + +class CaptureLogger: + """ + Context manager to capture `logging` streams + + Args: + logger: 'logging` logger object + + Returns: + The captured output is available via `self.out` + + Example: + + ```python + >>> from transformers import logging + >>> from transformers.testing_utils import CaptureLogger + + >>> msg = "Testing 1, 2, 3" + >>> logging.set_verbosity_info() + >>> logger = logging.get_logger("transformers.models.bart.tokenization_bart") + >>> with CaptureLogger(logger) as cl: + ... logger.info(msg) + >>> assert cl.out, msg + "\n" + ``` + """ + + def __init__(self, logger): + self.logger = logger + self.io = StringIO() + self.sh = logging.StreamHandler(self.io) + self.out = "" + + def __enter__(self): + self.logger.addHandler(self.sh) + return self + + def __exit__(self, *exc): + self.logger.removeHandler(self.sh) + self.out = self.io.getvalue() + + def __repr__(self): + return f"captured: {self.out}\n" + + +@contextlib.contextmanager +def LoggingLevel(level): + """ + This is a context manager to temporarily change transformers modules logging level to the desired value and have it + restored to the original setting at the end of the scope. + + Example: + + ```python + with LoggingLevel(logging.INFO): + AutoModel.from_pretrained("openai-community/gpt2") # calls logger.info() several times + ``` + """ + orig_level = transformers_logging.get_verbosity() + try: + transformers_logging.set_verbosity(level) + yield + finally: + transformers_logging.set_verbosity(orig_level) + + +class TemporaryHubRepo: + """Create a temporary Hub repository and return its `RepoUrl` object. This is similar to + `tempfile.TemporaryDirectory` and can be used as a context manager. For example: + + with TemporaryHubRepo(token=self._token) as temp_repo: + ... + + Upon exiting the context, the repository and everything contained in it are removed. + + Example: + + ```python + with TemporaryHubRepo(token=self._token) as temp_repo: + model.push_to_hub(tmp_repo.repo_id, token=self._token) + ``` + """ + + def __init__(self, namespace: Optional[str] = None, token: Optional[str] = None) -> None: + self.token = token + with tempfile.TemporaryDirectory() as tmp_dir: + repo_id = Path(tmp_dir).name + if namespace is not None: + repo_id = f"{namespace}/{repo_id}" + self.repo_url = huggingface_hub.create_repo(repo_id, token=self.token) + + def __enter__(self): + return self.repo_url + + def __exit__(self, exc, value, tb): + delete_repo(repo_id=self.repo_url.repo_id, token=self.token, missing_ok=True) + + +@contextlib.contextmanager +# adapted from https://stackoverflow.com/a/64789046/9201239 +def ExtendSysPath(path: Union[str, os.PathLike]) -> Iterator[None]: + """ + Temporary add given path to `sys.path`. + + Usage : + + ```python + with ExtendSysPath("/path/to/dir"): + mymodule = importlib.import_module("mymodule") + ``` + """ + + path = os.fspath(path) + try: + sys.path.insert(0, path) + yield + finally: + sys.path.remove(path) + + +class TestCasePlus(unittest.TestCase): + """ + This class extends *unittest.TestCase* with additional features. + + Feature 1: A set of fully resolved important file and dir path accessors. + + In tests often we need to know where things are relative to the current test file, and it's not trivial since the + test could be invoked from more than one directory or could reside in sub-directories with different depths. This + class solves this problem by sorting out all the basic paths and provides easy accessors to them: + + - `pathlib` objects (all fully resolved): + + - `test_file_path` - the current test file path (=`__file__`) + - `test_file_dir` - the directory containing the current test file + - `tests_dir` - the directory of the `tests` test suite + - `examples_dir` - the directory of the `examples` test suite + - `repo_root_dir` - the directory of the repository + - `src_dir` - the directory of `src` (i.e. where the `transformers` sub-dir resides) + + - stringified paths---same as above but these return paths as strings, rather than `pathlib` objects: + + - `test_file_path_str` + - `test_file_dir_str` + - `tests_dir_str` + - `examples_dir_str` + - `repo_root_dir_str` + - `src_dir_str` + + Feature 2: Flexible auto-removable temporary dirs which are guaranteed to get removed at the end of test. + + 1. Create a unique temporary dir: + + ```python + def test_whatever(self): + tmp_dir = self.get_auto_remove_tmp_dir() + ``` + + `tmp_dir` will contain the path to the created temporary dir. It will be automatically removed at the end of the + test. + + + 2. Create a temporary dir of my choice, ensure it's empty before the test starts and don't + empty it after the test. + + ```python + def test_whatever(self): + tmp_dir = self.get_auto_remove_tmp_dir("./xxx") + ``` + + This is useful for debug when you want to monitor a specific directory and want to make sure the previous tests + didn't leave any data in there. + + 3. You can override the first two options by directly overriding the `before` and `after` args, leading to the + following behavior: + + `before=True`: the temporary dir will always be cleared at the beginning of the test. + + `before=False`: if the temporary dir already existed, any existing files will remain there. + + `after=True`: the temporary dir will always be deleted at the end of the test. + + `after=False`: the temporary dir will always be left intact at the end of the test. + + Note 1: In order to run the equivalent of `rm -r` safely, only subdirs of the project repository checkout are + allowed if an explicit `tmp_dir` is used, so that by mistake no `/tmp` or similar important part of the filesystem + will get nuked. i.e. please always pass paths that start with `./` + + Note 2: Each test can register multiple temporary dirs and they all will get auto-removed, unless requested + otherwise. + + Feature 3: Get a copy of the `os.environ` object that sets up `PYTHONPATH` specific to the current test suite. This + is useful for invoking external programs from the test suite - e.g. distributed training. + + + ```python + def test_whatever(self): + env = self.get_env() + ```""" + + def setUp(self): + # get_auto_remove_tmp_dir feature: + self.teardown_tmp_dirs = [] + + # figure out the resolved paths for repo_root, tests, examples, etc. + self._test_file_path = inspect.getfile(self.__class__) + path = Path(self._test_file_path).resolve() + self._test_file_dir = path.parents[0] + for up in [1, 2, 3]: + tmp_dir = path.parents[up] + if (tmp_dir / "src").is_dir() and (tmp_dir / "tests").is_dir(): + break + if tmp_dir: + self._repo_root_dir = tmp_dir + else: + raise ValueError(f"can't figure out the root of the repo from {self._test_file_path}") + self._tests_dir = self._repo_root_dir / "tests" + self._examples_dir = self._repo_root_dir / "examples" + self._src_dir = self._repo_root_dir / "src" + + @property + def test_file_path(self): + return self._test_file_path + + @property + def test_file_path_str(self): + return str(self._test_file_path) + + @property + def test_file_dir(self): + return self._test_file_dir + + @property + def test_file_dir_str(self): + return str(self._test_file_dir) + + @property + def tests_dir(self): + return self._tests_dir + + @property + def tests_dir_str(self): + return str(self._tests_dir) + + @property + def examples_dir(self): + return self._examples_dir + + @property + def examples_dir_str(self): + return str(self._examples_dir) + + @property + def repo_root_dir(self): + return self._repo_root_dir + + @property + def repo_root_dir_str(self): + return str(self._repo_root_dir) + + @property + def src_dir(self): + return self._src_dir + + @property + def src_dir_str(self): + return str(self._src_dir) + + def get_env(self): + """ + Return a copy of the `os.environ` object that sets up `PYTHONPATH` correctly, depending on the test suite it's + invoked from. This is useful for invoking external programs from the test suite - e.g. distributed training. + + It always inserts `./src` first, then `./tests` or `./examples` depending on the test suite type and finally + the preset `PYTHONPATH` if any (all full resolved paths). + + """ + env = os.environ.copy() + paths = [self.src_dir_str] + if "/examples" in self.test_file_dir_str: + paths.append(self.examples_dir_str) + else: + paths.append(self.tests_dir_str) + paths.append(env.get("PYTHONPATH", "")) + + env["PYTHONPATH"] = ":".join(paths) + return env + + def get_auto_remove_tmp_dir(self, tmp_dir=None, before=None, after=None): + """ + Args: + tmp_dir (`string`, *optional*): + if `None`: + + - a unique temporary path will be created + - sets `before=True` if `before` is `None` + - sets `after=True` if `after` is `None` + else: + + - `tmp_dir` will be created + - sets `before=True` if `before` is `None` + - sets `after=False` if `after` is `None` + before (`bool`, *optional*): + If `True` and the `tmp_dir` already exists, make sure to empty it right away if `False` and the + `tmp_dir` already exists, any existing files will remain there. + after (`bool`, *optional*): + If `True`, delete the `tmp_dir` at the end of the test if `False`, leave the `tmp_dir` and its contents + intact at the end of the test. + + Returns: + tmp_dir(`string`): either the same value as passed via *tmp_dir* or the path to the auto-selected tmp dir + """ + if tmp_dir is not None: + # defining the most likely desired behavior for when a custom path is provided. + # this most likely indicates the debug mode where we want an easily locatable dir that: + # 1. gets cleared out before the test (if it already exists) + # 2. is left intact after the test + if before is None: + before = True + if after is None: + after = False + + # using provided path + path = Path(tmp_dir).resolve() + + # to avoid nuking parts of the filesystem, only relative paths are allowed + if not tmp_dir.startswith("./"): + raise ValueError( + f"`tmp_dir` can only be a relative path, i.e. `./some/path`, but received `{tmp_dir}`" + ) + + # ensure the dir is empty to start with + if before is True and path.exists(): + shutil.rmtree(tmp_dir, ignore_errors=True) + + path.mkdir(parents=True, exist_ok=True) + + else: + # defining the most likely desired behavior for when a unique tmp path is auto generated + # (not a debug mode), here we require a unique tmp dir that: + # 1. is empty before the test (it will be empty in this situation anyway) + # 2. gets fully removed after the test + if before is None: + before = True + if after is None: + after = True + + # using unique tmp dir (always empty, regardless of `before`) + tmp_dir = tempfile.mkdtemp() + + if after is True: + # register for deletion + self.teardown_tmp_dirs.append(tmp_dir) + + return tmp_dir + + def python_one_liner_max_rss(self, one_liner_str): + """ + Runs the passed python one liner (just the code) and returns how much max cpu memory was used to run the + program. + + Args: + one_liner_str (`string`): + a python one liner code that gets passed to `python -c` + + Returns: + max cpu memory bytes used to run the program. This value is likely to vary slightly from run to run. + + Requirements: + this helper needs `/usr/bin/time` to be installed (`apt install time`) + + Example: + + ``` + one_liner_str = 'from transformers import AutoModel; AutoModel.from_pretrained("google-t5/t5-large")' + max_rss = self.python_one_liner_max_rss(one_liner_str) + ``` + """ + + if not cmd_exists("/usr/bin/time"): + raise ValueError("/usr/bin/time is required, install with `apt install time`") + + cmd = shlex.split(f"/usr/bin/time -f %M python -c '{one_liner_str}'") + with CaptureStd() as cs: + execute_subprocess_async(cmd, env=self.get_env()) + # returned data is in KB so convert to bytes + max_rss = int(cs.err.split("\n")[-2].replace("stderr: ", "")) * 1024 + return max_rss + + def tearDown(self): + # get_auto_remove_tmp_dir feature: remove registered temp dirs + for path in self.teardown_tmp_dirs: + shutil.rmtree(path, ignore_errors=True) + self.teardown_tmp_dirs = [] + if is_accelerate_available(): + AcceleratorState._reset_state() + PartialState._reset_state() + + # delete all the env variables having `ACCELERATE` in them + for k in list(os.environ.keys()): + if "ACCELERATE" in k: + del os.environ[k] + + +def mockenv(**kwargs): + """ + this is a convenience wrapper, that allows this :: + + @mockenv(RUN_SLOW=True, USE_TF=False) def test_something(): + run_slow = os.getenv("RUN_SLOW", False) use_tf = os.getenv("USE_TF", False) + + """ + return mock.patch.dict(os.environ, kwargs) + + +# from https://stackoverflow.com/a/34333710/9201239 +@contextlib.contextmanager +def mockenv_context(*remove, **update): + """ + Temporarily updates the `os.environ` dictionary in-place. Similar to mockenv + + The `os.environ` dictionary is updated in-place so that the modification is sure to work in all situations. + + Args: + remove: Environment variables to remove. + update: Dictionary of environment variables and values to add/update. + """ + env = os.environ + update = update or {} + remove = remove or [] + + # List of environment variables being updated or removed. + stomped = (set(update.keys()) | set(remove)) & set(env.keys()) + # Environment variables and values to restore on exit. + update_after = {k: env[k] for k in stomped} + # Environment variables and values to remove on exit. + remove_after = frozenset(k for k in update if k not in env) + + try: + env.update(update) + [env.pop(k, None) for k in remove] + yield + finally: + env.update(update_after) + [env.pop(k) for k in remove_after] + + +# --- pytest conf functions --- # + +# to avoid multiple invocation from tests/conftest.py and examples/conftest.py - make sure it's called only once +pytest_opt_registered = {} + + +def pytest_addoption_shared(parser): + """ + This function is to be called from `conftest.py` via `pytest_addoption` wrapper that has to be defined there. + + It allows loading both `conftest.py` files at once without causing a failure due to adding the same `pytest` + option. + + """ + option = "--make-reports" + if option not in pytest_opt_registered: + parser.addoption( + option, + action="store", + default=False, + help="generate report files. The value of this option is used as a prefix to report names", + ) + pytest_opt_registered[option] = 1 + + +def pytest_terminal_summary_main(tr, id): + """ + Generate multiple reports at the end of test suite run - each report goes into a dedicated file in the current + directory. The report files are prefixed with the test suite name. + + This function emulates --duration and -rA pytest arguments. + + This function is to be called from `conftest.py` via `pytest_terminal_summary` wrapper that has to be defined + there. + + Args: + - tr: `terminalreporter` passed from `conftest.py` + - id: unique id like `tests` or `examples` that will be incorporated into the final reports filenames - this is + needed as some jobs have multiple runs of pytest, so we can't have them overwrite each other. + + NB: this functions taps into a private _pytest API and while unlikely, it could break should pytest do internal + changes - also it calls default internal methods of terminalreporter which can be hijacked by various `pytest-` + plugins and interfere. + + """ + from _pytest.config import create_terminal_writer + + if not len(id): + id = "tests" + + config = tr.config + orig_writer = config.get_terminal_writer() + orig_tbstyle = config.option.tbstyle + orig_reportchars = tr.reportchars + + dir = f"reports/{id}" + Path(dir).mkdir(parents=True, exist_ok=True) + report_files = { + k: f"{dir}/{k}.txt" + for k in [ + "durations", + "errors", + "failures_long", + "failures_short", + "failures_line", + "passes", + "stats", + "summary_short", + "warnings", + ] + } + + # custom durations report + # note: there is no need to call pytest --durations=XX to get this separate report + # adapted from https://github.com/pytest-dev/pytest/blob/897f151e/src/_pytest/runner.py#L66 + dlist = [] + for replist in tr.stats.values(): + for rep in replist: + if hasattr(rep, "duration"): + dlist.append(rep) + if dlist: + dlist.sort(key=lambda x: x.duration, reverse=True) + with open(report_files["durations"], "w") as f: + durations_min = 0.05 # sec + f.write("slowest durations\n") + for i, rep in enumerate(dlist): + if rep.duration < durations_min: + f.write(f"{len(dlist) - i} durations < {durations_min} secs were omitted") + break + f.write(f"{rep.duration:02.2f}s {rep.when:<8} {rep.nodeid}\n") + + def summary_failures_short(tr): + # expecting that the reports were --tb=long (default) so we chop them off here to the last frame + reports = tr.getreports("failed") + if not reports: + return + tr.write_sep("=", "FAILURES SHORT STACK") + for rep in reports: + msg = tr._getfailureheadline(rep) + tr.write_sep("_", msg, red=True, bold=True) + # chop off the optional leading extra frames, leaving only the last one + longrepr = re.sub(r".*_ _ _ (_ ){10,}_ _ ", "", rep.longreprtext, 0, re.M | re.S) + tr._tw.line(longrepr) + # note: not printing out any rep.sections to keep the report short + + # use ready-made report funcs, we are just hijacking the filehandle to log to a dedicated file each + # adapted from https://github.com/pytest-dev/pytest/blob/897f151e/src/_pytest/terminal.py#L814 + # note: some pytest plugins may interfere by hijacking the default `terminalreporter` (e.g. + # pytest-instafail does that) + + # report failures with line/short/long styles + config.option.tbstyle = "auto" # full tb + with open(report_files["failures_long"], "w") as f: + tr._tw = create_terminal_writer(config, f) + tr.summary_failures() + + # config.option.tbstyle = "short" # short tb + with open(report_files["failures_short"], "w") as f: + tr._tw = create_terminal_writer(config, f) + summary_failures_short(tr) + + config.option.tbstyle = "line" # one line per error + with open(report_files["failures_line"], "w") as f: + tr._tw = create_terminal_writer(config, f) + tr.summary_failures() + + with open(report_files["errors"], "w") as f: + tr._tw = create_terminal_writer(config, f) + tr.summary_errors() + + with open(report_files["warnings"], "w") as f: + tr._tw = create_terminal_writer(config, f) + tr.summary_warnings() # normal warnings + tr.summary_warnings() # final warnings + + tr.reportchars = "wPpsxXEf" # emulate -rA (used in summary_passes() and short_test_summary()) + + # Skip the `passes` report, as it starts to take more than 5 minutes, and sometimes it timeouts on CircleCI if it + # takes > 10 minutes (as this part doesn't generate any output on the terminal). + # (also, it seems there is no useful information in this report, and we rarely need to read it) + # with open(report_files["passes"], "w") as f: + # tr._tw = create_terminal_writer(config, f) + # tr.summary_passes() + + with open(report_files["summary_short"], "w") as f: + tr._tw = create_terminal_writer(config, f) + tr.short_test_summary() + + with open(report_files["stats"], "w") as f: + tr._tw = create_terminal_writer(config, f) + tr.summary_stats() + + # restore: + tr._tw = orig_writer + tr.reportchars = orig_reportchars + config.option.tbstyle = orig_tbstyle + + +# --- distributed testing functions --- # + +# adapted from https://stackoverflow.com/a/59041913/9201239 +import asyncio # noqa + + +class _RunOutput: + def __init__(self, returncode, stdout, stderr): + self.returncode = returncode + self.stdout = stdout + self.stderr = stderr + + +async def _read_stream(stream, callback): + while True: + line = await stream.readline() + if line: + callback(line) + else: + break + + +async def _stream_subprocess(cmd, env=None, stdin=None, timeout=None, quiet=False, echo=False) -> _RunOutput: + if echo: + print("\nRunning: ", " ".join(cmd)) + + p = await asyncio.create_subprocess_exec( + cmd[0], + *cmd[1:], + stdin=stdin, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + env=env, + ) + + # note: there is a warning for a possible deadlock when using `wait` with huge amounts of data in the pipe + # https://docs.python.org/3/library/asyncio-subprocess.html#asyncio.asyncio.subprocess.Process.wait + # + # If it starts hanging, will need to switch to the following code. The problem is that no data + # will be seen until it's done and if it hangs for example there will be no debug info. + # out, err = await p.communicate() + # return _RunOutput(p.returncode, out, err) + + out = [] + err = [] + + def tee(line, sink, pipe, label=""): + line = line.decode("utf-8").rstrip() + sink.append(line) + if not quiet: + print(label, line, file=pipe) + + # XXX: the timeout doesn't seem to make any difference here + await asyncio.wait( + [ + asyncio.create_task(_read_stream(p.stdout, lambda l: tee(l, out, sys.stdout, label="stdout:"))), + asyncio.create_task(_read_stream(p.stderr, lambda l: tee(l, err, sys.stderr, label="stderr:"))), + ], + timeout=timeout, + ) + return _RunOutput(await p.wait(), out, err) + + +def execute_subprocess_async(cmd, env=None, stdin=None, timeout=180, quiet=False, echo=True) -> _RunOutput: + loop = asyncio.get_event_loop() + result = loop.run_until_complete( + _stream_subprocess(cmd, env=env, stdin=stdin, timeout=timeout, quiet=quiet, echo=echo) + ) + + cmd_str = " ".join(cmd) + if result.returncode > 0: + stderr = "\n".join(result.stderr) + raise RuntimeError( + f"'{cmd_str}' failed with returncode {result.returncode}\n\n" + f"The combined stderr from workers follows:\n{stderr}" + ) + + # check that the subprocess actually did run and produced some output, should the test rely on + # the remote side to do the testing + if not result.stdout and not result.stderr: + raise RuntimeError(f"'{cmd_str}' produced no output.") + + return result + + +def pytest_xdist_worker_id(): + """ + Returns an int value of worker's numerical id under `pytest-xdist`'s concurrent workers `pytest -n N` regime, or 0 + if `-n 1` or `pytest-xdist` isn't being used. + """ + worker = os.environ.get("PYTEST_XDIST_WORKER", "gw0") + worker = re.sub(r"^gw", "", worker, 0, re.M) + return int(worker) + + +def get_torch_dist_unique_port(): + """ + Returns a port number that can be fed to `torch.distributed.launch`'s `--master_port` argument. + + Under `pytest-xdist` it adds a delta number based on a worker id so that concurrent tests don't try to use the same + port at once. + """ + port = 29500 + uniq_delta = pytest_xdist_worker_id() + return port + uniq_delta + + +def nested_simplify(obj, decimals=3): + """ + Simplifies an object by rounding float numbers, and downcasting tensors/numpy arrays to get simple equality test + within tests. + """ + import numpy as np + + if isinstance(obj, list): + return [nested_simplify(item, decimals) for item in obj] + if isinstance(obj, tuple): + return tuple([nested_simplify(item, decimals) for item in obj]) + elif isinstance(obj, np.ndarray): + return nested_simplify(obj.tolist()) + elif isinstance(obj, Mapping): + return {nested_simplify(k, decimals): nested_simplify(v, decimals) for k, v in obj.items()} + elif isinstance(obj, (str, int, np.int64)): + return obj + elif obj is None: + return obj + elif is_torch_available() and isinstance(obj, torch.Tensor): + return nested_simplify(obj.tolist(), decimals) + elif is_tf_available() and tf.is_tensor(obj): + return nested_simplify(obj.numpy().tolist()) + elif isinstance(obj, float): + return round(obj, decimals) + elif isinstance(obj, (np.int32, np.float32, np.float16)): + return nested_simplify(obj.item(), decimals) + else: + raise Exception(f"Not supported: {type(obj)}") + + +def check_json_file_has_correct_format(file_path): + with open(file_path) as f: + lines = f.readlines() + if len(lines) == 1: + # length can only be 1 if dict is empty + assert lines[0] == "{}" + else: + # otherwise make sure json has correct format (at least 3 lines) + assert len(lines) >= 3 + # each key one line, ident should be 2, min length is 3 + assert lines[0].strip() == "{" + for line in lines[1:-1]: + left_indent = len(lines[1]) - len(lines[1].lstrip()) + assert left_indent == 2 + assert lines[-1].strip() == "}" + + +def to_2tuple(x): + if isinstance(x, collections.abc.Iterable): + return x + return (x, x) + + +# These utils relate to ensuring the right error message is received when running scripts +class SubprocessCallException(Exception): + pass + + +def run_command(command: list[str], return_stdout=False): + """ + Runs `command` with `subprocess.check_output` and will potentially return the `stdout`. Will also properly capture + if an error occurred while running `command` + """ + try: + output = subprocess.check_output(command, stderr=subprocess.STDOUT) + if return_stdout: + if hasattr(output, "decode"): + output = output.decode("utf-8") + return output + except subprocess.CalledProcessError as e: + raise SubprocessCallException( + f"Command `{' '.join(command)}` failed with the following error:\n\n{e.output.decode()}" + ) from e + + +class RequestCounter: + """ + Helper class that will count all requests made online. + + Might not be robust if urllib3 changes its logging format but should be good enough for us. + + Usage: + ```py + with RequestCounter() as counter: + _ = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-bert") + assert counter["GET"] == 0 + assert counter["HEAD"] == 1 + assert counter.total_calls == 1 + ``` + """ + + def __enter__(self): + self._counter = defaultdict(int) + self._thread_id = threading.get_ident() + self._extra_info = [] + + def patched_with_thread_info(func): + def wrap(*args, **kwargs): + self._extra_info.append(threading.get_ident()) + return func(*args, **kwargs) + + return wrap + + self.patcher = patch.object( + urllib3.connectionpool.log, "debug", side_effect=patched_with_thread_info(urllib3.connectionpool.log.debug) + ) + self.mock = self.patcher.start() + return self + + def __exit__(self, *args, **kwargs) -> None: + assert len(self.mock.call_args_list) == len(self._extra_info) + for thread_id, call in zip(self._extra_info, self.mock.call_args_list): + if thread_id != self._thread_id: + continue + # code 307: the URL being requested by the user has moved to a temporary location + if call.args[-2] == 307: + continue + log = call.args[0] % call.args[1:] + for method in ("HEAD", "GET", "POST", "PUT", "DELETE", "CONNECT", "OPTIONS", "TRACE", "PATCH"): + if method in log: + self._counter[method] += 1 + break + self.patcher.stop() + + def __getitem__(self, key: str) -> int: + return self._counter[key] + + @property + def total_calls(self) -> int: + return sum(self._counter.values()) + + +def is_flaky(max_attempts: int = 5, wait_before_retry: Optional[float] = None, description: Optional[str] = None): + """ + To decorate flaky tests. They will be retried on failures. + + Please note that our push tests use `pytest-rerunfailures`, which prompts the CI to rerun certain types of + failed tests. More specifically, if the test exception contains any substring in `FLAKY_TEST_FAILURE_PATTERNS` + (in `.circleci/create_circleci_config.py`), it will be rerun. If you find a recurrent pattern of failures, + expand `FLAKY_TEST_FAILURE_PATTERNS` in our CI configuration instead of using `is_flaky`. + + Args: + max_attempts (`int`, *optional*, defaults to 5): + The maximum number of attempts to retry the flaky test. + wait_before_retry (`float`, *optional*): + If provided, will wait that number of seconds before retrying the test. + description (`str`, *optional*): + A string to describe the situation (what / where / why is flaky, link to GH issue/PR comments, errors, + etc.) + """ + + def decorator(test_func_ref): + @functools.wraps(test_func_ref) + def wrapper(*args, **kwargs): + retry_count = 1 + + while retry_count < max_attempts: + try: + return test_func_ref(*args, **kwargs) + + except Exception as err: + logger.error(f"Test failed with {err} at try {retry_count}/{max_attempts}.") + if wait_before_retry is not None: + time.sleep(wait_before_retry) + retry_count += 1 + + return test_func_ref(*args, **kwargs) + + return wrapper + + return decorator + + +def hub_retry(max_attempts: int = 5, wait_before_retry: Optional[float] = 2): + """ + To decorate tests that download from the Hub. They can fail due to a + variety of network issues such as timeouts, connection resets, etc. + + Args: + max_attempts (`int`, *optional*, defaults to 5): + The maximum number of attempts to retry the flaky test. + wait_before_retry (`float`, *optional*, defaults to 2): + If provided, will wait that number of seconds before retrying the test. + """ + + def decorator(test_func_ref): + @functools.wraps(test_func_ref) + def wrapper(*args, **kwargs): + retry_count = 1 + + while retry_count < max_attempts: + try: + return test_func_ref(*args, **kwargs) + # We catch all exceptions related to network issues from requests + except ( + requests.exceptions.ConnectionError, + requests.exceptions.Timeout, + requests.exceptions.ReadTimeout, + requests.exceptions.HTTPError, + requests.exceptions.RequestException, + ) as err: + logger.error( + f"Test failed with {err} at try {retry_count}/{max_attempts} as it couldn't connect to the specified Hub repository." + ) + if wait_before_retry is not None: + time.sleep(wait_before_retry) + retry_count += 1 + + return test_func_ref(*args, **kwargs) + + return wrapper + + return decorator + + +def run_first(test_case): + """ + Decorator marking a test with order(1). When pytest-order plugin is installed, tests marked with this decorator + are garanteed to run first. + + This is especially useful in some test settings like on a Gaudi instance where a Gaudi device can only be used by a + single process at a time. So we make sure all tests that run in a subprocess are launched first, to avoid device + allocation conflicts. + """ + import pytest + + return pytest.mark.order(1)(test_case) + + +def run_test_in_subprocess(test_case, target_func, inputs=None, timeout=None): + """ + To run a test in a subprocess. In particular, this can avoid (GPU) memory issue. + + Args: + test_case (`unittest.TestCase`): + The test that will run `target_func`. + target_func (`Callable`): + The function implementing the actual testing logic. + inputs (`dict`, *optional*, defaults to `None`): + The inputs that will be passed to `target_func` through an (input) queue. + timeout (`int`, *optional*, defaults to `None`): + The timeout (in seconds) that will be passed to the input and output queues. If not specified, the env. + variable `PYTEST_TIMEOUT` will be checked. If still `None`, its value will be set to `600`. + """ + if timeout is None: + timeout = int(os.environ.get("PYTEST_TIMEOUT", 600)) + + start_methohd = "spawn" + ctx = multiprocessing.get_context(start_methohd) + + input_queue = ctx.Queue(1) + output_queue = ctx.JoinableQueue(1) + + # We can't send `unittest.TestCase` to the child, otherwise we get issues regarding pickle. + input_queue.put(inputs, timeout=timeout) + + process = ctx.Process(target=target_func, args=(input_queue, output_queue, timeout)) + process.start() + # Kill the child process if we can't get outputs from it in time: otherwise, the hanging subprocess prevents + # the test to exit properly. + try: + results = output_queue.get(timeout=timeout) + output_queue.task_done() + except Exception as e: + process.terminate() + test_case.fail(e) + process.join(timeout=timeout) + + if results["error"] is not None: + test_case.fail(f"{results['error']}") + + +def run_test_using_subprocess(func): + """ + To decorate a test to run in a subprocess using the `subprocess` module. This could avoid potential GPU memory + issues (GPU OOM or a test that causes many subsequential failing with `CUDA error: device-side assert triggered`). + """ + import pytest + + @functools.wraps(func) + def wrapper(*args, **kwargs): + if os.getenv("_INSIDE_SUB_PROCESS", None) == "1": + func(*args, **kwargs) + else: + test = " ".join(os.environ.get("PYTEST_CURRENT_TEST").split(" ")[:-1]) + try: + import copy + + env = copy.deepcopy(os.environ) + env["_INSIDE_SUB_PROCESS"] = "1" + # This prevents the entries in `short test summary info` given by the subprocess being truncated. so the + # full information can be passed to the parent pytest process. + # See: https://docs.pytest.org/en/stable/explanation/ci.html + env["CI"] = "true" + + # If not subclass of `unitTest.TestCase` and `pytestconfig` is used: try to grab and use the arguments + if "pytestconfig" in kwargs: + command = list(kwargs["pytestconfig"].invocation_params.args) + for idx, x in enumerate(command): + if x in kwargs["pytestconfig"].args: + test = test.split("::")[1:] + command[idx] = "::".join([f"{func.__globals__['__file__']}"] + test) + command = [f"{sys.executable}", "-m", "pytest"] + command + command = [x for x in command if x not in ["--no-summary"]] + # Otherwise, simply run the test with no option at all + else: + command = [f"{sys.executable}", "-m", "pytest", f"{test}"] + + subprocess.run(command, env=env, check=True, capture_output=True) + except subprocess.CalledProcessError as e: + exception_message = e.stdout.decode() + lines = exception_message.split("\n") + # Add a first line with more informative information instead of just `= test session starts =`. + # This makes the `short test summary info` section more useful. + if "= test session starts =" in lines[0]: + text = "" + for line in lines[1:]: + if line.startswith("FAILED "): + text = line[len("FAILED ") :] + text = "".join(text.split(" - ")[1:]) + elif line.startswith("=") and line.endswith("=") and " failed in " in line: + break + elif len(text) > 0: + text += f"\n{line}" + text = "(subprocess) " + text + lines = [text] + lines + exception_message = "\n".join(lines) + raise pytest.fail(exception_message, pytrace=False) + + return wrapper + + +""" +The following contains utils to run the documentation tests without having to overwrite any files. + +The `preprocess_string` function adds `# doctest: +IGNORE_RESULT` markers on the fly anywhere a `load_dataset` call is +made as a print would otherwise fail the corresponding line. + +To skip cuda tests, make sure to call `SKIP_CUDA_DOCTEST=1 pytest --doctest-modules +""" + + +def preprocess_string(string, skip_cuda_tests): + """Prepare a docstring or a `.md` file to be run by doctest. + + The argument `string` would be the whole file content if it is a `.md` file. For a python file, it would be one of + its docstring. In each case, it may contain multiple python code examples. If `skip_cuda_tests` is `True` and a + cuda stuff is detective (with a heuristic), this method will return an empty string so no doctest will be run for + `string`. + """ + codeblock_pattern = r"(```(?:python|py)\s*\n\s*>>> )(.*?```)" + codeblocks = re.split(codeblock_pattern, string, flags=re.DOTALL) + is_cuda_found = False + for i, codeblock in enumerate(codeblocks): + if "load_dataset(" in codeblock and "# doctest: +IGNORE_RESULT" not in codeblock: + codeblocks[i] = re.sub(r"(>>> .*load_dataset\(.*)", r"\1 # doctest: +IGNORE_RESULT", codeblock) + if ( + (">>>" in codeblock or "..." in codeblock) + and re.search(r"cuda|to\(0\)|device=0", codeblock) + and skip_cuda_tests + ): + is_cuda_found = True + break + + modified_string = "" + if not is_cuda_found: + modified_string = "".join(codeblocks) + + return modified_string + + +class HfDocTestParser(doctest.DocTestParser): + """ + Overwrites the DocTestParser from doctest to properly parse the codeblocks that are formatted with black. This + means that there are no extra lines at the end of our snippets. The `# doctest: +IGNORE_RESULT` marker is also + added anywhere a `load_dataset` call is made as a print would otherwise fail the corresponding line. + + Tests involving cuda are skipped base on a naive pattern that should be updated if it is not enough. + """ + + # This regular expression is used to find doctest examples in a + # string. It defines three groups: `source` is the source code + # (including leading indentation and prompts); `indent` is the + # indentation of the first (PS1) line of the source code; and + # `want` is the expected output (including leading indentation). + # fmt: off + _EXAMPLE_RE = re.compile(r''' + # Source consists of a PS1 line followed by zero or more PS2 lines. + (?P + (?:^(?P [ ]*) >>> .*) # PS1 line + (?:\n [ ]* \.\.\. .*)*) # PS2 lines + \n? + # Want consists of any non-blank lines that do not start with PS1. + (?P (?:(?![ ]*$) # Not a blank line + (?![ ]*>>>) # Not a line starting with PS1 + # !!!!!!!!!!! HF Specific !!!!!!!!!!! + (?:(?!```).)* # Match any character except '`' until a '```' is found (this is specific to HF because black removes the last line) + # !!!!!!!!!!! HF Specific !!!!!!!!!!! + (?:\n|$) # Match a new line or end of string + )*) + ''', re.MULTILINE | re.VERBOSE + ) + # fmt: on + + # !!!!!!!!!!! HF Specific !!!!!!!!!!! + skip_cuda_tests: bool = bool(os.environ.get("SKIP_CUDA_DOCTEST", False)) + # !!!!!!!!!!! HF Specific !!!!!!!!!!! + + def parse(self, string, name=""): + """ + Overwrites the `parse` method to incorporate a skip for CUDA tests, and remove logs and dataset prints before + calling `super().parse` + """ + string = preprocess_string(string, self.skip_cuda_tests) + return super().parse(string, name) + + +class HfDoctestModule(Module): + """ + Overwrites the `DoctestModule` of the pytest package to make sure the HFDocTestParser is used when discovering + tests. + """ + + def collect(self) -> Iterable[DoctestItem]: + class MockAwareDocTestFinder(doctest.DocTestFinder): + """A hackish doctest finder that overrides stdlib internals to fix a stdlib bug. + + https://github.com/pytest-dev/pytest/issues/3456 https://bugs.python.org/issue25532 + """ + + def _find_lineno(self, obj, source_lines): + """Doctest code does not take into account `@property`, this + is a hackish way to fix it. https://bugs.python.org/issue17446 + + Wrapped Doctests will need to be unwrapped so the correct line number is returned. This will be + reported upstream. #8796 + """ + if isinstance(obj, property): + obj = getattr(obj, "fget", obj) + + if hasattr(obj, "__wrapped__"): + # Get the main obj in case of it being wrapped + obj = inspect.unwrap(obj) + + # Type ignored because this is a private function. + return super()._find_lineno( # type:ignore[misc] + obj, + source_lines, + ) + + def _find(self, tests, obj, name, module, source_lines, globs, seen) -> None: + if _is_mocked(obj): + return + with _patch_unwrap_mock_aware(): + # Type ignored because this is a private function. + super()._find( # type:ignore[misc] + tests, obj, name, module, source_lines, globs, seen + ) + + if self.path.name == "conftest.py": + module = self.config.pluginmanager._importconftest( + self.path, + self.config.getoption("importmode"), + rootpath=self.config.rootpath, + ) + else: + try: + module = import_path( + self.path, + root=self.config.rootpath, + mode=self.config.getoption("importmode"), + ) + except ImportError: + if self.config.getvalue("doctest_ignore_import_errors"): + skip("unable to import module %r" % self.path) + else: + raise + + # !!!!!!!!!!! HF Specific !!!!!!!!!!! + finder = MockAwareDocTestFinder(parser=HfDocTestParser()) + # !!!!!!!!!!! HF Specific !!!!!!!!!!! + optionflags = get_optionflags(self) + runner = _get_runner( + verbose=False, + optionflags=optionflags, + checker=_get_checker(), + continue_on_failure=_get_continue_on_failure(self.config), + ) + for test in finder.find(module, module.__name__): + if test.examples: # skip empty doctests and cuda + yield DoctestItem.from_parent(self, name=test.name, runner=runner, dtest=test) + + +def _device_agnostic_dispatch(device: str, dispatch_table: dict[str, Callable], *args, **kwargs): + if device not in dispatch_table: + return dispatch_table["default"](*args, **kwargs) + + fn = dispatch_table[device] + + # Some device agnostic functions return values. Need to guard against `None` + # instead at user level. + if fn is None: + return None + return fn(*args, **kwargs) + + +if is_torch_available(): + # Mappings from device names to callable functions to support device agnostic + # testing. + BACKEND_MANUAL_SEED = { + "cuda": torch.cuda.manual_seed, + "cpu": torch.manual_seed, + "default": torch.manual_seed, + } + BACKEND_EMPTY_CACHE = { + "cuda": torch.cuda.empty_cache, + "cpu": None, + "default": None, + } + BACKEND_DEVICE_COUNT = { + "cuda": torch.cuda.device_count, + "cpu": lambda: 0, + "default": lambda: 1, + } +else: + BACKEND_MANUAL_SEED = {"default": None} + BACKEND_EMPTY_CACHE = {"default": None} + BACKEND_DEVICE_COUNT = {"default": lambda: 0} + +if is_torch_hpu_available(): + BACKEND_MANUAL_SEED["hpu"] = torch.hpu.manual_seed + BACKEND_DEVICE_COUNT["hpu"] = torch.hpu.device_count + +if is_torch_mlu_available(): + BACKEND_EMPTY_CACHE["mlu"] = torch.mlu.empty_cache + BACKEND_MANUAL_SEED["mlu"] = torch.mlu.manual_seed + BACKEND_DEVICE_COUNT["mlu"] = torch.mlu.device_count + +if is_torch_npu_available(): + BACKEND_EMPTY_CACHE["npu"] = torch.npu.empty_cache + BACKEND_MANUAL_SEED["npu"] = torch.npu.manual_seed + BACKEND_DEVICE_COUNT["npu"] = torch.npu.device_count + +if is_torch_xpu_available(): + BACKEND_EMPTY_CACHE["xpu"] = torch.xpu.empty_cache + BACKEND_MANUAL_SEED["xpu"] = torch.xpu.manual_seed + BACKEND_DEVICE_COUNT["xpu"] = torch.xpu.device_count + +if is_torch_xla_available(): + BACKEND_EMPTY_CACHE["xla"] = torch.cuda.empty_cache + BACKEND_MANUAL_SEED["xla"] = torch.cuda.manual_seed + BACKEND_DEVICE_COUNT["xla"] = torch.cuda.device_count + + +def backend_manual_seed(device: str, seed: int): + return _device_agnostic_dispatch(device, BACKEND_MANUAL_SEED, seed) + + +def backend_empty_cache(device: str): + return _device_agnostic_dispatch(device, BACKEND_EMPTY_CACHE) + + +def backend_device_count(device: str): + return _device_agnostic_dispatch(device, BACKEND_DEVICE_COUNT) + + +if is_torch_available(): + # If `TRANSFORMERS_TEST_DEVICE_SPEC` is enabled we need to import extra entries + # into device to function mappings. + if "TRANSFORMERS_TEST_DEVICE_SPEC" in os.environ: + device_spec_path = os.environ["TRANSFORMERS_TEST_DEVICE_SPEC"] + if not Path(device_spec_path).is_file(): + raise ValueError( + f"Specified path to device spec file is not a file or not found. Received '{device_spec_path}" + ) + + # Try to strip extension for later import – also verifies we are importing a + # python file. + device_spec_dir, _ = os.path.split(os.path.realpath(device_spec_path)) + sys.path.append(device_spec_dir) + try: + import_name = device_spec_path[: device_spec_path.index(".py")] + except ValueError as e: + raise ValueError(f"Provided device spec file was not a Python file! Received '{device_spec_path}") from e + + device_spec_module = importlib.import_module(import_name) + + # Imported file must contain `DEVICE_NAME`. If it doesn't, terminate early. + try: + device_name = device_spec_module.DEVICE_NAME + except AttributeError as e: + raise AttributeError("Device spec file did not contain `DEVICE_NAME`") from e + + if "TRANSFORMERS_TEST_DEVICE" in os.environ and torch_device != device_name: + msg = f"Mismatch between environment variable `TRANSFORMERS_TEST_DEVICE` '{torch_device}' and device found in spec '{device_name}'\n" + msg += "Either unset `TRANSFORMERS_TEST_DEVICE` or ensure it matches device spec name." + raise ValueError(msg) + + torch_device = device_name + + def update_mapping_from_spec(device_fn_dict: dict[str, Callable], attribute_name: str): + try: + # Try to import the function directly + spec_fn = getattr(device_spec_module, attribute_name) + device_fn_dict[torch_device] = spec_fn + except AttributeError as e: + # If the function doesn't exist, and there is no default, throw an error + if "default" not in device_fn_dict: + raise AttributeError( + f"`{attribute_name}` not found in '{device_spec_path}' and no default fallback function found." + ) from e + + # Add one entry here for each `BACKEND_*` dictionary. + update_mapping_from_spec(BACKEND_MANUAL_SEED, "MANUAL_SEED_FN") + update_mapping_from_spec(BACKEND_EMPTY_CACHE, "EMPTY_CACHE_FN") + update_mapping_from_spec(BACKEND_DEVICE_COUNT, "DEVICE_COUNT_FN") + + +def compare_pipeline_output_to_hub_spec(output, hub_spec): + missing_keys = [] + unexpected_keys = [] + all_field_names = {field.name for field in fields(hub_spec)} + matching_keys = sorted([key for key in output.keys() if key in all_field_names]) + + # Fields with a MISSING default are required and must be in the output + for field in fields(hub_spec): + if field.default is MISSING and field.name not in output: + missing_keys.append(field.name) + + # All output keys must match either a required or optional field in the Hub spec + for output_key in output: + if output_key not in all_field_names: + unexpected_keys.append(output_key) + + if missing_keys or unexpected_keys: + error = ["Pipeline output does not match Hub spec!"] + if matching_keys: + error.append(f"Matching keys: {matching_keys}") + if missing_keys: + error.append(f"Missing required keys in pipeline output: {missing_keys}") + if unexpected_keys: + error.append(f"Keys in pipeline output that are not in Hub spec: {unexpected_keys}") + raise KeyError("\n".join(error)) + + +@require_torch +def cleanup(device: str, gc_collect=False): + if gc_collect: + gc.collect() + backend_empty_cache(device) + torch._dynamo.reset() + + +# Type definition of key used in `Expectations` class. +DeviceProperties = tuple[Union[str, None], Union[int, None]] + + +@cache +def get_device_properties() -> DeviceProperties: + """ + Get environment device properties. + """ + if IS_CUDA_SYSTEM or IS_ROCM_SYSTEM: + import torch + + major, _ = torch.cuda.get_device_capability() + if IS_ROCM_SYSTEM: + return ("rocm", major) + else: + return ("cuda", major) + elif IS_XPU_SYSTEM: + import torch + + # To get more info of the architecture meaning and bit allocation, refer to https://github.com/intel/llvm/blob/sycl/sycl/include/sycl/ext/oneapi/experimental/device_architecture.def + arch = torch.xpu.get_device_capability()["architecture"] + gen_mask = 0x000000FF00000000 + gen = (arch & gen_mask) >> 32 + return ("xpu", gen) + else: + return (torch_device, None) + + +class Expectations(UserDict[DeviceProperties, Any]): + def get_expectation(self) -> Any: + """ + Find best matching expectation based on environment device properties. + """ + return self.find_expectation(get_device_properties()) + + @staticmethod + def is_default(key: DeviceProperties) -> bool: + return all(p is None for p in key) + + @staticmethod + def score(key: DeviceProperties, other: DeviceProperties) -> int: + """ + Returns score indicating how similar two instances of the `Properties` tuple are. + Points are calculated using bits, but documented as int. + Rules are as follows: + * Matching `type` gives 8 points. + * Semi-matching `type`, for example cuda and rocm, gives 4 points. + * Matching `major` (compute capability major version) gives 2 points. + * Default expectation (if present) gives 1 points. + """ + (device_type, major) = key + (other_device_type, other_major) = other + + score = 0b0 + if device_type == other_device_type: + score |= 0b1000 + elif device_type in ["cuda", "rocm"] and other_device_type in ["cuda", "rocm"]: + score |= 0b100 + + if major == other_major and other_major is not None: + score |= 0b10 + + if Expectations.is_default(other): + score |= 0b1 + + return int(score) + + def find_expectation(self, key: DeviceProperties = (None, None)) -> Any: + """ + Find best matching expectation based on provided device properties. + """ + (result_key, result) = max(self.data.items(), key=lambda x: Expectations.score(key, x[0])) + + if Expectations.score(key, result_key) == 0: + raise ValueError(f"No matching expectation found for {key}") + + return result + + def __repr__(self): + return f"{self.data}" diff --git a/docs/transformers/build/lib/transformers/tf_utils.py b/docs/transformers/build/lib/transformers/tf_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..c3770cb1237f3c6ea3517bda3dd29b2b6f44f7de --- /dev/null +++ b/docs/transformers/build/lib/transformers/tf_utils.py @@ -0,0 +1,294 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Optional, Union + +import numpy as np +import tensorflow as tf + +from .feature_extraction_utils import BatchFeature +from .tokenization_utils_base import BatchEncoding +from .utils import logging + + +logger = logging.get_logger(__name__) + + +def shape_list(tensor: Union[tf.Tensor, np.ndarray]) -> list[int]: + """ + Deal with dynamic shape in tensorflow cleanly. + + Args: + tensor (`tf.Tensor` or `np.ndarray`): The tensor we want the shape of. + + Returns: + `List[int]`: The shape of the tensor as a list. + """ + if isinstance(tensor, np.ndarray): + return list(tensor.shape) + + dynamic = tf.shape(tensor) + + if tensor.shape == tf.TensorShape(None): + return dynamic + + static = tensor.shape.as_list() + + return [dynamic[i] if s is None else s for i, s in enumerate(static)] + + +def stable_softmax(logits: tf.Tensor, axis: Optional[int] = None, name: Optional[str] = None) -> tf.Tensor: + """ + Stable wrapper that returns the same output as `tf.nn.softmax`, but that works reliably with XLA on CPU. It is + meant as a workaround for the [following issue](https://github.com/tensorflow/tensorflow/issues/55682), and will be + removed after it gets fixed. The arguments and outputs are the same as `tf.nn.softmax`, and relies on the fact that + `softmax(x) = softmax(x + c)` (see https://ogunlao.github.io/2020/04/26/you_dont_really_know_softmax.html). + + Args: + logits (`tf.Tensor`): + Must be one of the following types: half, float32, float64. + axis (`int`, *optional*): + The dimension softmax would be performed on. The default is -1 which indicates the last dimension. + name (`str`, *optional*): + A name for the operation. + + Returns: + `tf.Tensor`: + A Tensor. Has the same type and shape as logits. + """ + # TODO: When the issue linked above gets sorted, add a check on TF version here and use the original function if + # it has the fix. After we drop the support for unfixed versions, remove this function. + return tf.nn.softmax(logits=logits + 1e-9, axis=axis, name=name) + + +def functional_layernorm(inputs, weight, bias, epsilon=1e-5, axis=-1): + # This is a very simplified functional layernorm, designed to duplicate + # the functionality of PyTorch nn.functional.layer_norm when this is needed to port + # models in Transformers. + + if weight.shape.rank != 1 or bias.shape.rank != 1 or not isinstance(axis, int): + raise NotImplementedError("Only 1D weight and bias tensors are supported for now, with only a single axis.") + + # Get mean and variance on the axis to be normalized + mean, variance = tf.nn.moments(inputs, axes=[axis], keepdims=True) + + if axis != -1: + # Reshape scale and weight to have the same rank as inputs, but with 1 dimensions + # on every dimension except axis + shape = [1] * inputs.shape.rank + shape[axis] = shape_list(inputs)[axis] + weight = tf.reshape(weight, shape) + bias = tf.reshape(bias, shape) + + # Compute layer normalization using the batch_normalization + # function. + outputs = tf.nn.batch_normalization( + inputs, + mean, + variance, + offset=bias, + scale=weight, + variance_epsilon=epsilon, + ) + return outputs + + +def scaled_dot_product_attention( + query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False, scale: Optional[float] = None +): + """TF equivalent for torch's nn.functional.scaled_dot_product_attention""" + if dropout_p != 0.0: + raise ValueError( + "Dropout is not supported in this implementation - file an issue " + "with Transformers and ping @Rocketknight1 if you need it for a port!" + ) + if is_causal and attn_mask is not None: + raise ValueError("You cannot specify an attn_mask and is_causal at the same time!") + if is_causal: + attn_mask = tf.ones((tf.shape(query)[-2], tf.shape(key)[-2]), dtype=tf.int32) + attn_mask = tf.experimental.numpy.tril(attn_mask, k=0) + if attn_mask is not None and (attn_mask.dtype.is_integer or attn_mask.dtype.is_bool): + # Convert boolean mask to a negative logit bias + attn_mask = tf.where(attn_mask > 0, tf.cast(0.0, query.dtype), tf.cast(-1000.0, query.dtype)) + logits = tf.einsum("...qd, ...kd -> ...qk", query, key) + if scale is None: + scale = tf.cast(tf.shape(key)[-1], logits.dtype) ** -0.5 + logits *= scale # scale by 1/sqrt(key_dim) + if attn_mask is not None: + logits += attn_mask + probs = tf.nn.softmax(logits) + return probs @ value + + +def flatten(input, start_dim=0, end_dim=-1): + # Replicates the behavior of torch.flatten in TF + + # If end_dim or start_dim is negative, count them from the end + if end_dim < 0: + end_dim += input.shape.rank + if start_dim < 0: + start_dim += input.shape.rank + + if start_dim == end_dim: + return input + + in_shape = tf.shape(input) + flattened_dim = tf.math.reduce_prod(in_shape[start_dim : end_dim + 1]) + out_shape = tf.concat([in_shape[:start_dim], [flattened_dim], in_shape[end_dim + 1 :]], axis=0) + return tf.reshape(input, out_shape) + + +def invert_attention_mask(encoder_attention_mask: tf.Tensor) -> tf.Tensor: + """ + Invert an attention mask (e.g., switches 0. and 1.). + + Args: + encoder_attention_mask (`torch.Tensor`): An attention mask. + + Returns: + `tf.Tensor`: The inverted attention mask. + """ + if not isinstance(encoder_attention_mask, tf.Tensor): + encoder_attention_mask = tf.convert_to_tensor(encoder_attention_mask) # Catches stray NumPy inputs + if encoder_attention_mask.shape.rank == 3: + encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :] + if encoder_attention_mask.shape.rank == 2: + encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :] + # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition + # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow + # /transformer/transformer_layers.py#L270 + # encoder_extended_attention_mask = (encoder_extended_attention_mask == + # encoder_extended_attention_mask.transpose(-1, -2)) + encoder_extended_attention_mask = ( + tf.cast(1, encoder_attention_mask.dtype) - encoder_extended_attention_mask + ) * encoder_extended_attention_mask.dtype.min + + return encoder_extended_attention_mask + + +def check_embeddings_within_bounds(tensor: tf.Tensor, embed_dim: int, tensor_name: str = "input_ids") -> None: + """ + `tf.gather`, on which TF embedding layers are based, won't check positive out of bound indices on GPU, returning + zeros instead. This function adds a check against that dangerous silent behavior. + + Args: + tensor (`tf.Tensor`): The tensor of indices to check. + embed_dim (`int`): The embedding dimension. + tensor_name (`str`, *optional*): The name of the tensor to use in the error message. + """ + tf.debugging.assert_less( + tensor, + tf.cast(embed_dim, dtype=tensor.dtype), + message=( + f"The maximum value of {tensor_name} ({tf.math.reduce_max(tensor)}) must be smaller than the embedding " + f"layer's input dimension ({embed_dim}). The likely cause is some problem at tokenization time." + ), + ) + + +def save_attributes_to_hdf5_group(group, name, data): + """Saves attributes (data) of the specified name into the HDF5 group. + + This method deals with an inherent problem of HDF5 file which is not able to store data larger than + HDF5_OBJECT_HEADER_LIMIT bytes. + + Args: + group: A pointer to a HDF5 group. + name: A name of the attributes to save. + data: Attributes data to store. + + Raises: + RuntimeError: If any single attribute is too large to be saved. + + Copied from Keras to Transformers to avoid versioning issues. + """ + HDF5_OBJECT_HEADER_LIMIT = 64512 + # Check that no item in `data` is larger than `HDF5_OBJECT_HEADER_LIMIT` + # because in that case even chunking the array would not make the saving + # possible. + bad_attributes = [x for x in data if len(x) > HDF5_OBJECT_HEADER_LIMIT] + + # Expecting this to never be true. + if bad_attributes: + raise RuntimeError( + "The following attributes cannot be saved to HDF5 file because " + f"they are larger than {HDF5_OBJECT_HEADER_LIMIT} " + f"bytes: {bad_attributes}" + ) + + data_npy = np.asarray(data) + + num_chunks = 1 + chunked_data = np.array_split(data_npy, num_chunks) + + # This will never loop forever thanks to the test above. + while any(x.nbytes > HDF5_OBJECT_HEADER_LIMIT for x in chunked_data): + num_chunks += 1 + chunked_data = np.array_split(data_npy, num_chunks) + + if num_chunks > 1: + for chunk_id, chunk_data in enumerate(chunked_data): + group.attrs["%s%d" % (name, chunk_id)] = chunk_data + else: + group.attrs[name] = data + + +def load_attributes_from_hdf5_group(group, name): + """Loads attributes of the specified name from the HDF5 group. + + This method deals with an inherent problem of HDF5 file which is not able to store data larger than + HDF5_OBJECT_HEADER_LIMIT bytes. + + Args: + group: A pointer to a HDF5 group. + name: A name of the attributes to load. + + Returns: + data: Attributes data. + + Copied from Keras to Transformers to avoid versioning issues. + """ + if name in group.attrs: + data = [n.decode("utf8") if hasattr(n, "decode") else n for n in group.attrs[name]] + else: + data = [] + chunk_id = 0 + while "%s%d" % (name, chunk_id) in group.attrs: + data.extend( + [n.decode("utf8") if hasattr(n, "decode") else n for n in group.attrs["%s%d" % (name, chunk_id)]] + ) + chunk_id += 1 + return data + + +def expand_1d(data): + """Expands 1-dimensional `Tensor`s into 2-dimensional `Tensor`s. + Copied from Keras to here to avoid versioning issues.""" + + def _expand_single_1d_tensor(t): + if isinstance(t, tf.Tensor) and t.shape.rank == 1: + return tf.expand_dims(t, axis=-1) + return t + + return tf.nest.map_structure(_expand_single_1d_tensor, data) + + +def convert_batch_encoding(*args, **kwargs): + # Convert HF BatchEncoding/BatchFeature objects in the inputs to dicts that Keras understands + if args and isinstance(args[0], (BatchEncoding, BatchFeature)): + args = list(args) + args[0] = dict(args[0]) + elif "x" in kwargs and isinstance(kwargs["x"], (BatchEncoding, BatchFeature)): + kwargs["x"] = dict(kwargs["x"]) + return args, kwargs diff --git a/docs/transformers/build/lib/transformers/time_series_utils.py b/docs/transformers/build/lib/transformers/time_series_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..3a5cf4f2f4d8f636e623c07ae3400ca5e17b5891 --- /dev/null +++ b/docs/transformers/build/lib/transformers/time_series_utils.py @@ -0,0 +1,225 @@ +# Copyright 2023 The HuggingFace Inc. team. +# Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Time series distributional output classes and utilities. +""" + +from typing import Callable, Optional + +import torch +from torch import nn +from torch.distributions import ( + AffineTransform, + Distribution, + Independent, + NegativeBinomial, + Normal, + StudentT, + TransformedDistribution, +) + + +class AffineTransformed(TransformedDistribution): + def __init__(self, base_distribution: Distribution, loc=None, scale=None, event_dim=0): + self.scale = 1.0 if scale is None else scale + self.loc = 0.0 if loc is None else loc + + super().__init__(base_distribution, [AffineTransform(loc=self.loc, scale=self.scale, event_dim=event_dim)]) + + @property + def mean(self): + """ + Returns the mean of the distribution. + """ + return self.base_dist.mean * self.scale + self.loc + + @property + def variance(self): + """ + Returns the variance of the distribution. + """ + return self.base_dist.variance * self.scale**2 + + @property + def stddev(self): + """ + Returns the standard deviation of the distribution. + """ + return self.variance.sqrt() + + +class ParameterProjection(nn.Module): + def __init__( + self, in_features: int, args_dim: dict[str, int], domain_map: Callable[..., tuple[torch.Tensor]], **kwargs + ) -> None: + super().__init__(**kwargs) + self.args_dim = args_dim + self.proj = nn.ModuleList([nn.Linear(in_features, dim) for dim in args_dim.values()]) + self.domain_map = domain_map + + def forward(self, x: torch.Tensor) -> tuple[torch.Tensor]: + params_unbounded = [proj(x) for proj in self.proj] + + return self.domain_map(*params_unbounded) + + +class LambdaLayer(nn.Module): + def __init__(self, function): + super().__init__() + self.function = function + + def forward(self, x, *args): + return self.function(x, *args) + + +class DistributionOutput: + distribution_class: type + in_features: int + args_dim: dict[str, int] + + def __init__(self, dim: int = 1) -> None: + self.dim = dim + self.args_dim = {k: dim * self.args_dim[k] for k in self.args_dim} + + def _base_distribution(self, distr_args): + if self.dim == 1: + return self.distribution_class(*distr_args) + else: + return Independent(self.distribution_class(*distr_args), 1) + + def distribution( + self, + distr_args, + loc: Optional[torch.Tensor] = None, + scale: Optional[torch.Tensor] = None, + ) -> Distribution: + distr = self._base_distribution(distr_args) + if loc is None and scale is None: + return distr + else: + return AffineTransformed(distr, loc=loc, scale=scale, event_dim=self.event_dim) + + @property + def event_shape(self) -> tuple: + r""" + Shape of each individual event contemplated by the distributions that this object constructs. + """ + return () if self.dim == 1 else (self.dim,) + + @property + def event_dim(self) -> int: + r""" + Number of event dimensions, i.e., length of the `event_shape` tuple, of the distributions that this object + constructs. + """ + return len(self.event_shape) + + @property + def value_in_support(self) -> float: + r""" + A float that will have a valid numeric value when computing the log-loss of the corresponding distribution. By + default 0.0. This value will be used when padding data series. + """ + return 0.0 + + def get_parameter_projection(self, in_features: int) -> nn.Module: + r""" + Return the parameter projection layer that maps the input to the appropriate parameters of the distribution. + """ + return ParameterProjection( + in_features=in_features, + args_dim=self.args_dim, + domain_map=LambdaLayer(self.domain_map), + ) + + def domain_map(self, *args: torch.Tensor): + r""" + Converts arguments to the right shape and domain. The domain depends on the type of distribution, while the + correct shape is obtained by reshaping the trailing axis in such a way that the returned tensors define a + distribution of the right event_shape. + """ + raise NotImplementedError() + + @staticmethod + def squareplus(x: torch.Tensor) -> torch.Tensor: + r""" + Helper to map inputs to the positive orthant by applying the square-plus operation. Reference: + https://twitter.com/jon_barron/status/1387167648669048833 + """ + return (x + torch.sqrt(torch.square(x) + 4.0)) / 2.0 + + +class StudentTOutput(DistributionOutput): + """ + Student-T distribution output class. + """ + + args_dim: dict[str, int] = {"df": 1, "loc": 1, "scale": 1} + distribution_class: type = StudentT + + @classmethod + def domain_map(cls, df: torch.Tensor, loc: torch.Tensor, scale: torch.Tensor): + scale = cls.squareplus(scale).clamp_min(torch.finfo(scale.dtype).eps) + df = 2.0 + cls.squareplus(df) + return df.squeeze(-1), loc.squeeze(-1), scale.squeeze(-1) + + +class NormalOutput(DistributionOutput): + """ + Normal distribution output class. + """ + + args_dim: dict[str, int] = {"loc": 1, "scale": 1} + distribution_class: type = Normal + + @classmethod + def domain_map(cls, loc: torch.Tensor, scale: torch.Tensor): + scale = cls.squareplus(scale).clamp_min(torch.finfo(scale.dtype).eps) + return loc.squeeze(-1), scale.squeeze(-1) + + +class NegativeBinomialOutput(DistributionOutput): + """ + Negative Binomial distribution output class. + """ + + args_dim: dict[str, int] = {"total_count": 1, "logits": 1} + distribution_class: type = NegativeBinomial + + @classmethod + def domain_map(cls, total_count: torch.Tensor, logits: torch.Tensor): + total_count = cls.squareplus(total_count) + return total_count.squeeze(-1), logits.squeeze(-1) + + def _base_distribution(self, distr_args) -> Distribution: + total_count, logits = distr_args + if self.dim == 1: + return self.distribution_class(total_count=total_count, logits=logits) + else: + return Independent(self.distribution_class(total_count=total_count, logits=logits), 1) + + # Overwrites the parent class method. We cannot scale using the affine + # transformation since negative binomial should return integers. Instead + # we scale the parameters. + def distribution( + self, distr_args, loc: Optional[torch.Tensor] = None, scale: Optional[torch.Tensor] = None + ) -> Distribution: + total_count, logits = distr_args + + if scale is not None: + # See scaling property of Gamma. + logits += scale.log() + + return self._base_distribution((total_count, logits)) diff --git a/docs/transformers/build/lib/transformers/tokenization_utils.py b/docs/transformers/build/lib/transformers/tokenization_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..51382d776f0643b7526d9a634fb98d3b43380031 --- /dev/null +++ b/docs/transformers/build/lib/transformers/tokenization_utils.py @@ -0,0 +1,1133 @@ +# Copyright 2020 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Tokenization classes for python tokenizers. For fast tokenizers (provided by HuggingFace's tokenizers library) see +tokenization_utils_fast.py +""" + +import bisect +import itertools +import re +import unicodedata +from collections import OrderedDict +from typing import Any, Optional, Union, overload + +from .tokenization_utils_base import ( + ENCODE_KWARGS_DOCSTRING, + ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING, + INIT_TOKENIZER_DOCSTRING, + AddedToken, + BatchEncoding, + EncodedInput, + EncodedInputPair, + PreTokenizedInput, + PreTokenizedInputPair, + PreTrainedTokenizerBase, + TextInput, + TextInputPair, + TruncationStrategy, +) +from .utils import PaddingStrategy, TensorType, add_end_docstrings, logging + + +logger = logging.get_logger(__name__) + +# Slow tokenizers are saved in a vocabulary plus three separated files +SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json" +ADDED_TOKENS_FILE = "added_tokens.json" +TOKENIZER_CONFIG_FILE = "tokenizer_config.json" + + +class Trie: + """ + Trie in Python. Creates a Trie out of a list of words. The trie is used to split on `added_tokens` in one pass + Loose reference https://en.wikipedia.org/wiki/Trie + """ + + def __init__(self, *args): + self.data = {} + self._tokens = set() + self._termination_char = "" + self.update(*args) + + def update(self, *args): + """ + Updates the Trie with new tokens provided as arguments. + + Args: + *args: Variable number of words to be added to the Trie. + """ + for token in tuple(*args): + self.add(token) + + def add(self, word: str): + """ + Passes over every char (utf-8 char) on word and recursively adds it to the internal `data` trie representation. + The special key `""` in `self._termination_char` is used to represent termination. + + This function is idempotent, adding twice the same word will leave the trie unchanged + + Example: + + ```python + >>> trie = Trie() + >>> trie.add("Hello 友達") + >>> trie.data + {"H": {"e": {"l": {"l": {"o": {" ": {"友": {"達": {"": 1}}}}}}}}} + + >>> trie.add("Hello") + >>> trie.data + {"H": {"e": {"l": {"l": {"o": {"": 1, " ": {"友": {"達": {"": 1}}}}}}}}} + ``` + """ + if not word: + # Prevent empty string + return + + self._tokens.add(word) + ref = self.data + for char in word: + ref[char] = ref.setdefault(char, {}) + ref = ref[char] + ref[self._termination_char] = 1 + + def split(self, text: str) -> list[str]: + """ + Will look for the words added to the trie within `text`. Output is the original string splitted along the + boundaries of the words found. + + This trie will match the longest possible word first ! + + Example: + + ```python + >>> trie = Trie() + >>> trie.split("[CLS] This is a extra_id_100") + ["[CLS] This is a extra_id_100"] + + >>> trie.add("[CLS]") + >>> trie.add("extra_id_1") + >>> trie.add("extra_id_100") + >>> trie.split("[CLS] This is a extra_id_100") + ["[CLS]", " This is a ", "extra_id_100"] + ``` + """ + # indexes are counted left of the chars index. + # "hello", index 0, is left of h, index 1 is between h and e. + # index 5 is right of the "o". + + # States are going to capture every possible start (indexes as above) + # as keys, and have as values, a pointer to the position in the trie + # where we're at. This is a partial match for now. + # This enables to keep track of multiple matches while we're iterating + # the string + # If the trie contains, "blowing", and "lower" and we encounter the + # string "blower", we need to split into ["b", "lower"]. + # This is where we need to keep track of multiple possible starts. + states = OrderedDict() + + # This will contain every indices where we need + # to cut. + # We force to cut at offset 0 and len(text) (added later) + offsets = [0] + + # This is used by the lookahead which needs to skip over + # some text where the full match exceeded the place in the initial + # for loop + skip = 0 + # Main loop, Giving this algorithm O(n) complexity + for current, current_char in enumerate(text): + if skip and current < skip: + # Prevents the lookahead for matching twice + # like extra_id_100 and id_100 + continue + + # This will track every state + # that stop matching, we need to stop tracking them. + # If we look at "lowball", we're going to match "l" (add it to states), "o", "w", then + # fail on "b", we need to remove 0 from the valid states. + to_remove = set() + # Whenever we found a match, we need to drop everything + # this is a greedy algorithm, it will match on the first found token + reset = False + + # In this case, we already have partial matches (But unfinished) + for start, trie_pointer in states.items(): + if "" in trie_pointer: + # This is a final match, we need to reset and + # store the results in `offsets`. + + # Lookahead to match longest first + # Important in case of extra_id_1 vs extra_id_100 + # Here we are also actively looking for other earlier partial + # matches + # "[CLS]", "L", we need to match CLS even if L is special + for lookstart, looktrie_pointer in states.items(): + if lookstart > start: + # This partial match is later, we can stop looking + break + elif lookstart < start: + # This partial match is earlier, the trie pointer + # was already updated, so index is + 1 + lookahead_index = current + 1 + end = current + 1 + else: + # Here lookstart == start and + # looktrie_pointer == trie_pointer + # It wasn't updated yet so indices are current ones + lookahead_index = current + end = current + next_char = text[lookahead_index] if lookahead_index < len(text) else None + if "" in looktrie_pointer: + start = lookstart + end = lookahead_index + skip = lookahead_index + + while next_char in looktrie_pointer: + looktrie_pointer = looktrie_pointer[next_char] + lookahead_index += 1 + if "" in looktrie_pointer: + start = lookstart + end = lookahead_index + skip = lookahead_index + + if lookahead_index == len(text): + # End of string + break + next_char = text[lookahead_index] + # End lookahead + + # Storing and resetting + offsets.append(start) + offsets.append(end) + reset = True + break + elif current_char in trie_pointer: + # The current character being looked at has a match within the trie + # update the pointer (it will be stored back into states later). + trie_pointer = trie_pointer[current_char] + + # Storing back the new pointer into the states. + # Partial matches got longer by one. + states[start] = trie_pointer + else: + # The new character has not match in the trie, we need + # to stop keeping track of this partial match. + # We can't do it directly within the loop because of how + # python iteration works + to_remove.add(start) + + # Either clearing the full start (we found a real match) + # Or clearing only the partial matches that didn't work. + if reset: + states = {} + else: + for start in to_remove: + del states[start] + + # If this character is a starting character within the trie + # start keeping track of this partial match. + if current >= skip and current_char in self.data: + states[current] = self.data[current_char] + + # We have a cut at the end with states. + for start, trie_pointer in states.items(): + if "" in trie_pointer: + # This is a final match, we need to reset and + # store the results in `offsets`. + end = len(text) + offsets.append(start) + offsets.append(end) + # Longest cut is always the one with lower start so the first + # item so we need to break. + break + + return self.cut_text(text, offsets) + + def cut_text(self, text, offsets): + # We have all the offsets now, we just need to do the actual splitting. + # We need to eventually add the first part of the string and the eventual + # last part. + offsets.append(len(text)) + tokens = [] + start = 0 + for end in offsets: + if start > end: + logger.error( + "There was a bug in Trie algorithm in tokenization. Attempting to recover. Please report it" + " anyway." + ) + continue + elif start == end: + # This might happen if there's a match at index 0 + # we're also preventing zero-width cuts in case of two + # consecutive matches + continue + tokens.append(text[start:end]) + start = end + + return tokens + + +class ExtensionsTrie(Trie): + def __init__(self, *args): + super().__init__(*args) + + def extensions(self, prefix: str): + """ + Generates all extensions of a given prefix token in the Trie. + + Example: + + ```python + >>> trie = Trie() + >>> trie.add("apple") + >>> trie.add("app") + >>> trie.add("application") + >>> trie.extensions("app") + ['app', 'apple', 'application'] + ``` + """ + prefix_node = self._get_node(prefix) + ret = self._collect_tokens(prefix_node) + return [prefix + token for token in ret] + + def _get_node(self, token: str) -> dict: + """ + Retrieves the node corresponding to the given token in the Trie. + + Args: + token (str): The token for which the corresponding node needs to be retrieved. + + Returns: + dict: The node in the Trie corresponding to the given token. + """ + node = self.data + for char in token: + if char not in node: + break + + node = node[char] + return node + + def _collect_tokens(self, node: dict) -> list: + """ + Generates all tokens in the Trie starting from a given node. + + Args: + node (dict): The node in the Trie from which tokens need to be generated. + + Returns: + list: List of tokens generated from the given node. + """ + tokens = [self._termination_char] if self._termination_char in node else [] + for token, subtrie_head in node.items(): + if token != self._termination_char: + subtokens = self._collect_tokens(subtrie_head) + tokens.extend([token + subtoken for subtoken in subtokens]) + return tokens + + +def _is_whitespace(char): + """Checks whether `char` is a whitespace character.""" + # \t, \n, and \r are technically control characters but we treat them + # as whitespace since they are generally considered as such. + if char == " " or char == "\t" or char == "\n" or char == "\r": + return True + cat = unicodedata.category(char) + if cat == "Zs": + return True + return False + + +def _is_control(char): + """Checks whether `char` is a control character.""" + # These are technically control characters but we count them as whitespace + # characters. + if char == "\t" or char == "\n" or char == "\r": + return False + cat = unicodedata.category(char) + if cat.startswith("C"): + return True + return False + + +def _is_punctuation(char): + """Checks whether `char` is a punctuation character.""" + cp = ord(char) + # We treat all non-letter/number ASCII as punctuation. + # Characters such as "^", "$", and "`" are not in the Unicode + # Punctuation class but we treat them as punctuation anyways, for + # consistency. + if (cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126): + return True + cat = unicodedata.category(char) + if cat.startswith("P"): + return True + return False + + +def _is_end_of_word(text): + """Checks whether the last character in text is one of a punctuation, control or whitespace character.""" + last_char = text[-1] + return bool(_is_control(last_char) | _is_punctuation(last_char) | _is_whitespace(last_char)) + + +def _is_start_of_word(text): + """Checks whether the first character in text is one of a punctuation, control or whitespace character.""" + first_char = text[0] + return bool(_is_control(first_char) | _is_punctuation(first_char) | _is_whitespace(first_char)) + + +def _insert_one_token_to_ordered_list(token_list: list[str], new_token: str): + """ + Inserts one token to an ordered list if it does not already exist. Note: token_list must be sorted. + """ + insertion_idx = bisect.bisect_left(token_list, new_token) + # Checks if new_token is already in the ordered token_list + if insertion_idx < len(token_list) and token_list[insertion_idx] == new_token: + # new_token is in token_list, don't add + return + else: + token_list.insert(insertion_idx, new_token) + + +@add_end_docstrings(INIT_TOKENIZER_DOCSTRING) +class PreTrainedTokenizer(PreTrainedTokenizerBase): + """ + Base class for all slow tokenizers. + + Inherits from [`~tokenization_utils_base.PreTrainedTokenizerBase`]. + + Handle all the shared methods for tokenization and special tokens as well as methods downloading/caching/loading + pretrained tokenizers as well as adding tokens to the vocabulary. + + This class also contain the added tokens in a unified way on top of all tokenizers so we don't have to handle the + specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...). + """ + + def __init__(self, **kwargs): + # 1. Init the parent class + + self.tokens_trie = Trie() + + # 2. init `_added_tokens_decoder` if child class did not + if not hasattr(self, "_added_tokens_decoder"): + self._added_tokens_decoder: dict[int, AddedToken] = {} + + # 3. if a `added_tokens_decoder` is passed, we are loading from a saved tokenizer, we overwrite + self._added_tokens_decoder.update(kwargs.pop("added_tokens_decoder", {})) + self._added_tokens_encoder: dict[str, int] = {k.content: v for v, k in self._added_tokens_decoder.items()} + + # 4 init the parent class + super().__init__(**kwargs) + + # 4. If some of the special tokens are not part of the vocab, we add them, at the end. + # the order of addition is the same as self.SPECIAL_TOKENS_ATTRIBUTES following `tokenizers` + self._add_tokens( + [token for token in self.all_special_tokens_extended if token not in self._added_tokens_encoder], + special_tokens=True, + ) + + self._decode_use_source_tokenizer = False + + @property + def is_fast(self) -> bool: + return False + + @property + def vocab_size(self) -> int: + """ + `int`: Size of the base vocabulary (without the added tokens). + """ + raise NotImplementedError + + @property + def added_tokens_encoder(self) -> dict[str, int]: + """ + Returns the sorted mapping from string to index. The added tokens encoder is cached for performance + optimisation in `self._added_tokens_encoder` for the slow tokenizers. + """ + return {k.content: v for v, k in sorted(self._added_tokens_decoder.items(), key=lambda item: item[0])} + + @property + def added_tokens_decoder(self) -> dict[int, AddedToken]: + """ + Returns the added tokens in the vocabulary as a dictionary of index to AddedToken. + + Returns: + `Dict[str, int]`: The added tokens. + """ + return dict(sorted(self._added_tokens_decoder.items(), key=lambda item: item[0])) + + @added_tokens_decoder.setter + def added_tokens_decoder(self, value: dict[int, Union[AddedToken, str]]) -> dict[int, AddedToken]: + # Always raise an error if string because users should define the behavior + for index, token in value.items(): + if not isinstance(token, (str, AddedToken)) or not isinstance(index, int): + raise TypeError( + f"The provided `added_tokens_decoder` has an element of type {index.__class__, token.__class__}, should be a dict of {int, Union[AddedToken, str]}" + ) + + self._added_tokens_decoder[index] = AddedToken(token) if isinstance(token, str) else token + self._added_tokens_encoder[str(token)] = index + self._update_total_vocab_size() + + def get_added_vocab(self) -> dict[str, int]: + """ + Returns the added tokens in the vocabulary as a dictionary of token to index. Results might be different from + the fast call because for now we always add the tokens even if they are already in the vocabulary. This is + something we should change. + + Returns: + `Dict[str, int]`: The added tokens. + """ + return self._added_tokens_encoder + + def __len__(self): + """ + Size of the full vocabulary with the added tokens. + """ + return self.total_vocab_size + + def _update_total_vocab_size(self): + """ + Update the size of the full vocabulary with the added tokens. Counts the `keys` and not the `values` because + otherwise if there is a hole in the vocab, we will add tokenizers at a wrong index. This operation is slow and + is only updated when adding tokens. + """ + self.total_vocab_size = len(self.get_vocab()) + + def _add_tokens(self, new_tokens: Union[list[str], list[AddedToken]], special_tokens: bool = False) -> int: + """ + Add a list of new tokens to the tokenizer class. If the new tokens are not in the vocabulary, they are added to + it with indices starting from length of the current vocabulary. Special tokens are sometimes already in the + vocab which is why they have to be handled specifically. + + Args: + new_tokens (`List[str]`or `List[tokenizers.AddedToken]`): + Token(s) to add in vocabulary. A token is counted as added if it's not already in the vocabulary + (tested by checking if the tokenizer assign the index of the `unk_token` to them). If a token is part + of the vocabulary then we simply mark this token as an `AddedToken` which allows to control the + stripping and normalization of this token. This is NOT possible in `tokenizers`. + special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the tokens should be added as special tokens. + + Returns: + `int`: The number of tokens actually added to the vocabulary. + + Examples: + + ```python + # Let's see how to increase the vocabulary of Bert model and tokenizer + tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased") + model = BertModel.from_pretrained("google-bert/bert-base-uncased") + + num_added_toks = tokenizer.add_tokens(["new_tok1", "my_new-tok2"]) + print("We have added", num_added_toks, "tokens") + # Note: resize_token_embeddings expects to receive the full size of the new vocabulary, i.e. the length of the tokenizer. + model.resize_token_embeddings(len(tokenizer)) + ```""" + added_tokens = 0 + if new_tokens is None: + return added_tokens + # TODO this is fairly slow to improve! + current_vocab = self.get_vocab().copy() + new_idx = len(current_vocab) # only call this once, len gives the last index + 1 + for token in new_tokens: + if not isinstance(token, (str, AddedToken)): + raise TypeError(f"Token {token} is not a string but a {type(token)}.") + if str(token) == "": + continue + if isinstance(token, str): + if token in self._added_tokens_encoder: + continue + else: + # very important for fast and slow equivalence! + is_special = token in self.all_special_tokens or special_tokens + token = AddedToken( + token, rstrip=False, lstrip=False, normalized=not is_special, special=is_special + ) + elif special_tokens: + # doing token.special=True changes the normalization! will fix in rust + # this is important and the only reason why the AddedTokens in each class are normalized by default + token.__setstate__({"special": True, "normalized": token.normalized}) + if token in self._added_tokens_decoder: + continue + if not token.special and token.normalized and getattr(self, "do_lower_case", False): + # Normalize if requested + token.content = token.content.lower() + if token.content not in current_vocab: + token_index = new_idx + added_tokens + current_vocab[token.content] = token_index + added_tokens += 1 + else: + token_index = current_vocab[token.content] + + if token.special and str(token) not in self.all_special_tokens: + self._special_tokens_map["additional_special_tokens"].append(token) + # the setter automatically updates the reverse map + self._added_tokens_decoder[token_index] = token + self._added_tokens_encoder[token.content] = token_index + if self.verbose: + logger.info(f"Adding {token} to the vocabulary") + + self._update_trie() + self._update_total_vocab_size() + return added_tokens + + def _update_trie(self, unique_no_split_tokens: Optional[str] = []): + for token in self._added_tokens_decoder.values(): + if token not in self.tokens_trie._tokens: + self.tokens_trie.add(token.content) + for token in unique_no_split_tokens: + if token not in self.tokens_trie._tokens: + self.tokens_trie.add(token) + + def num_special_tokens_to_add(self, pair: bool = False) -> int: + """ + Returns the number of added tokens when encoding a sequence with special tokens. + + + + This encodes a dummy input and checks the number of added tokens, and is therefore not efficient. Do not put + this inside your training loop. + + + + Args: + pair (`bool`, *optional*, defaults to `False`): + Whether the number of added tokens should be computed in the case of a sequence pair or a single + sequence. + + Returns: + `int`: Number of special tokens added to sequences. + """ + token_ids_0 = [] + token_ids_1 = [] + return len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None)) + + def tokenize(self, text: TextInput, **kwargs) -> list[str]: + """ + Converts a string into a sequence of tokens, using the tokenizer. + + Split in words for word-based vocabulary or sub-words for sub-word-based vocabularies + (BPE/SentencePieces/WordPieces). Takes care of added tokens. + + Args: + text (`str`): + The sequence to be encoded. + **kwargs (additional keyword arguments): + Passed along to the model-specific `prepare_for_tokenization` preprocessing method. + + Returns: + `List[str]`: The list of tokens. + """ + split_special_tokens = kwargs.pop("split_special_tokens", self.split_special_tokens) + + text, kwargs = self.prepare_for_tokenization(text, **kwargs) + + if kwargs: + logger.warning(f"Keyword arguments {kwargs} not recognized.") + + if hasattr(self, "do_lower_case") and self.do_lower_case: + # convert non-special tokens to lowercase. Might be super slow as well? + escaped_special_toks = [re.escape(s_tok) for s_tok in (self.all_special_tokens)] + escaped_special_toks += [ + re.escape(s_tok.content) + for s_tok in (self._added_tokens_decoder.values()) + if not s_tok.special and s_tok.normalized + ] + pattern = r"(" + r"|".join(escaped_special_toks) + r")|" + r"(.+?)" + text = re.sub(pattern, lambda m: m.groups()[0] or m.groups()[1].lower(), text) + + if split_special_tokens: + no_split_token = [] + tokens = [text] + else: + no_split_token = self._added_tokens_encoder.keys() # don't split on any of the added tokens + # "This is something else" + tokens = self.tokens_trie.split(text) + + # ["This is something", "", " else"] + for i, token in enumerate(tokens): + if token in no_split_token: + tok_extended = self._added_tokens_decoder.get(self._added_tokens_encoder[token], None) + left = tokens[i - 1] if i > 0 else None + right = tokens[i + 1] if i < len(tokens) - 1 else None + if isinstance(tok_extended, AddedToken): + if tok_extended.rstrip and right: + # A bit counter-intuitive but we strip the left of the string + # since tok_extended.rstrip means the special token is eating all white spaces on its right + tokens[i + 1] = right.lstrip() + # Strip white spaces on the left + if tok_extended.lstrip and left: + tokens[i - 1] = left.rstrip() # Opposite here + if tok_extended.single_word and left and left[-1] != " ": + tokens[i - 1] += token + tokens[i] = "" + elif tok_extended.single_word and right and right[0] != " ": + tokens[i + 1] = token + tokens[i + 1] + tokens[i] = "" + else: + raise ValueError( + f"{tok_extended} cannot be tokenized because it was not properly added" + f" to the tokenizer. This means that it is not an `AddedToken` but a {type(tok_extended)}" + ) + # ["This is something", "", "else"] + tokenized_text = [] + for token in tokens: + # Need to skip eventual empty (fully stripped) tokens + if not token: + continue + if token in no_split_token: + tokenized_text.append(token) + else: + tokenized_text.extend(self._tokenize(token)) + # ["This", " is", " something", "", "else"] + return tokenized_text + + def _tokenize(self, text, **kwargs): + """ + Converts a string into a sequence of tokens (string), using the tokenizer. Split in words for word-based + vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces). + + Do NOT take care of added tokens. + """ + raise NotImplementedError + + def convert_tokens_to_ids(self, tokens: Union[str, list[str]]) -> Union[int, list[int]]: + """ + Converts a token string (or a sequence of tokens) in a single integer id (or a sequence of ids), using the + vocabulary. + + Args: + tokens (`str` or `List[str]`): One or several token(s) to convert to token id(s). + + Returns: + `int` or `List[int]`: The token id or list of token ids. + """ + if tokens is None: + return None + + if isinstance(tokens, str): + return self._convert_token_to_id_with_added_voc(tokens) + + ids = [] + for token in tokens: + ids.append(self._convert_token_to_id_with_added_voc(token)) + return ids + + def _convert_token_to_id_with_added_voc(self, token): + if token is None: + return None + + if token in self._added_tokens_encoder: + return self._added_tokens_encoder[token] + return self._convert_token_to_id(token) + + def _convert_token_to_id(self, token): + raise NotImplementedError + + def _encode_plus( + self, + text: Union[TextInput, PreTokenizedInput, EncodedInput], + text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None, + add_special_tokens: bool = True, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, + max_length: Optional[int] = None, + stride: int = 0, + is_split_into_words: bool = False, + pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[str] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs, + ) -> BatchEncoding: + def get_input_ids(text): + if isinstance(text, str): + tokens = self.tokenize(text, **kwargs) + return self.convert_tokens_to_ids(tokens) + elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str): + if is_split_into_words: + tokens = list( + itertools.chain(*(self.tokenize(t, is_split_into_words=True, **kwargs) for t in text)) + ) + return self.convert_tokens_to_ids(tokens) + else: + return self.convert_tokens_to_ids(text) + elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int): + return text + else: + if is_split_into_words: + raise ValueError( + f"Input {text} is not valid. Should be a string or a list/tuple of strings when" + " `is_split_into_words=True`." + ) + else: + raise ValueError( + f"Input {text} is not valid. Should be a string, a list/tuple of strings or a list/tuple of" + " integers." + ) + + if return_offsets_mapping: + raise NotImplementedError( + "return_offset_mapping is not available when using Python tokenizers. " + "To use this feature, change your tokenizer to one deriving from " + "transformers.PreTrainedTokenizerFast. " + "More information on available tokenizers at " + "https://github.com/huggingface/transformers/pull/2674" + ) + + first_ids = get_input_ids(text) + second_ids = get_input_ids(text_pair) if text_pair is not None else None + + return self.prepare_for_model( + first_ids, + pair_ids=second_ids, + add_special_tokens=add_special_tokens, + padding=padding_strategy.value, + truncation=truncation_strategy.value, + max_length=max_length, + stride=stride, + pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, + return_tensors=return_tensors, + prepend_batch_axis=True, + return_attention_mask=return_attention_mask, + return_token_type_ids=return_token_type_ids, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_length=return_length, + verbose=verbose, + ) + + def _batch_encode_plus( + self, + batch_text_or_text_pairs: Union[ + list[TextInput], + list[TextInputPair], + list[PreTokenizedInput], + list[PreTokenizedInputPair], + list[EncodedInput], + list[EncodedInputPair], + ], + add_special_tokens: bool = True, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, + max_length: Optional[int] = None, + stride: int = 0, + is_split_into_words: bool = False, + pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[str] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + split_special_tokens: bool = False, + **kwargs, + ) -> BatchEncoding: + def get_input_ids(text): + if isinstance(text, str): + tokens = self.tokenize(text, **kwargs) + return self.convert_tokens_to_ids(tokens) + elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str): + if is_split_into_words: + tokens = list( + itertools.chain(*(self.tokenize(t, is_split_into_words=True, **kwargs) for t in text)) + ) + return self.convert_tokens_to_ids(tokens) + else: + return self.convert_tokens_to_ids(text) + elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int): + return text + else: + raise ValueError( + "Input is not valid. Should be a string, a list/tuple of strings or a list/tuple of integers." + ) + + if return_offsets_mapping: + raise NotImplementedError( + "return_offset_mapping is not available when using Python tokenizers. " + "To use this feature, change your tokenizer to one deriving from " + "transformers.PreTrainedTokenizerFast." + ) + + input_ids = [] + for ids_or_pair_ids in batch_text_or_text_pairs: + if not isinstance(ids_or_pair_ids, (list, tuple)): + ids, pair_ids = ids_or_pair_ids, None + elif is_split_into_words and not isinstance(ids_or_pair_ids[0], (list, tuple)): + ids, pair_ids = ids_or_pair_ids, None + else: + ids, pair_ids = ids_or_pair_ids + + first_ids = get_input_ids(ids) + second_ids = get_input_ids(pair_ids) if pair_ids is not None else None + input_ids.append((first_ids, second_ids)) + + batch_outputs = self._batch_prepare_for_model( + input_ids, + add_special_tokens=add_special_tokens, + padding_strategy=padding_strategy, + truncation_strategy=truncation_strategy, + max_length=max_length, + stride=stride, + pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, + return_attention_mask=return_attention_mask, + return_token_type_ids=return_token_type_ids, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_length=return_length, + return_tensors=return_tensors, + verbose=verbose, + split_special_tokens=split_special_tokens, + ) + + return BatchEncoding(batch_outputs) + + @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING) + def _batch_prepare_for_model( + self, + batch_ids_pairs: list[Union[PreTokenizedInputPair, tuple[list[int], None]]], + add_special_tokens: bool = True, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, + max_length: Optional[int] = None, + stride: int = 0, + pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[str] = None, + return_tensors: Optional[str] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_length: bool = False, + verbose: bool = True, + split_special_tokens: bool = False, + ) -> BatchEncoding: + """ + Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It + adds special tokens, truncates sequences if overflowing while taking into account the special tokens and + manages a moving window (with user defined stride) for overflowing tokens + + Args: + batch_ids_pairs: list of tokenized input ids or input ids pairs + """ + + batch_outputs = {} + for first_ids, second_ids in batch_ids_pairs: + outputs = self.prepare_for_model( + first_ids, + second_ids, + add_special_tokens=add_special_tokens, + padding=PaddingStrategy.DO_NOT_PAD.value, # we pad in batch afterward + truncation=truncation_strategy.value, + max_length=max_length, + stride=stride, + pad_to_multiple_of=None, # we pad in batch afterward + padding_side=None, # we pad in batch afterward + return_attention_mask=False, # we pad in batch afterward + return_token_type_ids=return_token_type_ids, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_length=return_length, + return_tensors=None, # We convert the whole batch to tensors at the end + prepend_batch_axis=False, + verbose=verbose, + split_special_tokens=split_special_tokens, + ) + + for key, value in outputs.items(): + if key not in batch_outputs: + batch_outputs[key] = [] + batch_outputs[key].append(value) + + batch_outputs = self.pad( + batch_outputs, + padding=padding_strategy.value, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, + return_attention_mask=return_attention_mask, + ) + + batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors) + + return batch_outputs + + def prepare_for_tokenization( + self, text: str, is_split_into_words: bool = False, **kwargs + ) -> tuple[str, dict[str, Any]]: + """ + Performs any necessary transformations before tokenization. + + This method should pop the arguments from kwargs and return the remaining `kwargs` as well. We test the + `kwargs` at the end of the encoding process to be sure all the arguments have been used. + + Args: + text (`str`): + The text to prepare. + is_split_into_words (`bool`, *optional*, defaults to `False`): + Whether or not the input is already pre-tokenized (e.g., split into words). If set to `True`, the + tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace) + which it will tokenize. This is useful for NER or token classification. + kwargs (`Dict[str, Any]`, *optional*): + Keyword arguments to use for the tokenization. + + Returns: + `Tuple[str, Dict[str, Any]]`: The prepared text and the unused kwargs. + """ + return (text, kwargs) + + def get_special_tokens_mask( + self, token_ids_0: list, token_ids_1: Optional[list] = None, already_has_special_tokens: bool = False + ) -> list[int]: + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods. + + Args: + token_ids_0 (`List[int]`): + List of ids of the first sequence. + token_ids_1 (`List[int]`, *optional*): + List of ids of the second sequence. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formatted with special tokens for the model." + ) + + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + return [0] * ((len(token_ids_1) if token_ids_1 else 0) + len(token_ids_0)) + + @overload + def convert_ids_to_tokens(self, ids: int, skip_special_tokens: bool = False) -> str: ... + + @overload + def convert_ids_to_tokens(self, ids: list[int], skip_special_tokens: bool = False) -> list[str]: ... + + def convert_ids_to_tokens( + self, ids: Union[int, list[int]], skip_special_tokens: bool = False + ) -> Union[str, list[str]]: + """ + Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and + added tokens. + + Args: + ids (`int` or `List[int]`): + The token id (or token ids) to convert to tokens. + skip_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not to remove special tokens in the decoding. + + Returns: + `str` or `List[str]`: The decoded token(s). + """ + if isinstance(ids, int): + if ids in self._added_tokens_decoder: + return self._added_tokens_decoder[ids].content + else: + return self._convert_id_to_token(ids) + tokens = [] + for index in ids: + index = int(index) + if skip_special_tokens and index in self.all_special_ids: + continue + if index in self._added_tokens_decoder: + tokens.append(self._added_tokens_decoder[index].content) + else: + tokens.append(self._convert_id_to_token(index)) + return tokens + + def _convert_id_to_token(self, index: int) -> str: + raise NotImplementedError + + def convert_tokens_to_string(self, tokens: list[str]) -> str: + return " ".join(tokens) + + def _decode( + self, + token_ids: Union[int, list[int]], + skip_special_tokens: bool = False, + clean_up_tokenization_spaces: Optional[bool] = None, + spaces_between_special_tokens: bool = True, + **kwargs, + ) -> str: + self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False) + + filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens) + # If given is a single id, prevents splitting the string in upcoming loop + if isinstance(filtered_tokens, str): + filtered_tokens = [filtered_tokens] + + legacy_added_tokens = set(self._added_tokens_encoder.keys()) - set(self.all_special_tokens) | { + token for token in self.additional_special_tokens if self.convert_tokens_to_ids(token) >= self.vocab_size + } + # To avoid mixing byte-level and unicode for byte-level BPT + # we need to build string separately for added tokens and byte-level tokens + # cf. https://github.com/huggingface/transformers/issues/1133 + sub_texts = [] + current_sub_text = [] + # TODO @ArthurZ in version 5, special tokens should be handled in convert_tokens_to_string, while _convert_tokens_to_string + for token in filtered_tokens: + if skip_special_tokens and token in self.all_special_tokens: + continue + if token in legacy_added_tokens: + if current_sub_text: + string = self.convert_tokens_to_string(current_sub_text) + if len(string) > 0: + sub_texts.append(string) + current_sub_text = [] + sub_texts.append(token) + else: + current_sub_text.append(token) + if current_sub_text: + sub_texts.append(self.convert_tokens_to_string(current_sub_text)) + + if spaces_between_special_tokens: + text = " ".join(sub_texts) + else: + text = "".join(sub_texts) + + clean_up_tokenization_spaces = ( + clean_up_tokenization_spaces + if clean_up_tokenization_spaces is not None + else self.clean_up_tokenization_spaces + ) + if clean_up_tokenization_spaces: + clean_text = self.clean_up_tokenization(text) + return clean_text + else: + return text diff --git a/docs/transformers/build/lib/transformers/tokenization_utils_base.py b/docs/transformers/build/lib/transformers/tokenization_utils_base.py new file mode 100644 index 0000000000000000000000000000000000000000..7c2778e947621dc390725586b1c1dca1c3cba170 --- /dev/null +++ b/docs/transformers/build/lib/transformers/tokenization_utils_base.py @@ -0,0 +1,4260 @@ +# coding=utf-8 +# Copyright 2020 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Base classes common to both the slow and the fast tokenization classes: PreTrainedTokenizerBase (host all the user +fronting encoding methods) Special token mixing (host the special tokens logic) and BatchEncoding (wrap the dictionary +of output with special method for the Fast tokenizers) +""" + +import copy +import json +import os +import re +import warnings +from collections import UserDict +from collections.abc import Mapping, Sized +from contextlib import contextmanager +from dataclasses import dataclass +from inspect import isfunction +from pathlib import Path +from typing import TYPE_CHECKING, Any, Callable, Dict, List, NamedTuple, Optional, Sequence, Tuple, Union + +import numpy as np +from packaging import version + +from . import __version__ +from .dynamic_module_utils import custom_object_save +from .utils import ( + CHAT_TEMPLATE_DIR, + CHAT_TEMPLATE_FILE, + ExplicitEnum, + PaddingStrategy, + PushToHubMixin, + TensorType, + add_end_docstrings, + add_model_info_to_auto_map, + add_model_info_to_custom_pipelines, + cached_file, + copy_func, + download_url, + extract_commit_hash, + get_json_schema, + is_flax_available, + is_jax_tensor, + is_mlx_available, + is_numpy_array, + is_offline_mode, + is_protobuf_available, + is_remote_url, + is_tf_available, + is_tf_tensor, + is_tokenizers_available, + is_torch_available, + is_torch_device, + is_torch_tensor, + list_repo_templates, + logging, + requires_backends, + to_py_obj, +) +from .utils.chat_template_utils import _compile_jinja_template, _render_with_assistant_indices +from .utils.import_utils import PROTOBUF_IMPORT_ERROR + + +if TYPE_CHECKING: + if is_torch_available(): + import torch + if is_tf_available(): + import tensorflow as tf + if is_flax_available(): + import jax.numpy as jnp # noqa: F401 + + +def import_protobuf_decode_error(error_message=""): + if is_protobuf_available(): + from google.protobuf.message import DecodeError + + return DecodeError + else: + raise ImportError(PROTOBUF_IMPORT_ERROR.format(error_message)) + + +if is_tokenizers_available(): + from tokenizers import AddedToken + from tokenizers import Encoding as EncodingFast +else: + + @dataclass(frozen=False, eq=True) + class AddedToken: + """ + AddedToken represents a token to be added to a Tokenizer An AddedToken can have special options defining the + way it should behave. + + The `normalized` will default to `not special` if it is not specified, similarly to the definition in + `tokenizers`. + """ + + def __init__( + self, content: str, single_word=False, lstrip=False, rstrip=False, special=False, normalized=None + ): + self.content = content + self.single_word = single_word + self.lstrip = lstrip + self.rstrip = rstrip + self.special = special + self.normalized = normalized if normalized is not None else not special + + def __getstate__(self): + return self.__dict__ + + def __str__(self): + return self.content + + @dataclass + class EncodingFast: + """This is dummy class because without the `tokenizers` library we don't have these objects anyway""" + + pass + + +logger = logging.get_logger(__name__) + +VERY_LARGE_INTEGER = int(1e30) # This is used to set the max input length for a model with infinite size input +LARGE_INTEGER = int(1e20) # This is used when we need something big but slightly smaller than VERY_LARGE_INTEGER + +# Define type aliases and NamedTuples +TextInput = str +PreTokenizedInput = List[str] +EncodedInput = List[int] +TextInputPair = Tuple[str, str] +PreTokenizedInputPair = Tuple[List[str], List[str]] +EncodedInputPair = Tuple[List[int], List[int]] + +# Define type aliases for text-related non-text modalities +AudioInput = Union["np.ndarray", "torch.Tensor", List["np.ndarray"], List["torch.Tensor"]] + +# Slow tokenizers used to be saved in three separated files +SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json" +ADDED_TOKENS_FILE = "added_tokens.json" +TOKENIZER_CONFIG_FILE = "tokenizer_config.json" + +# Fast tokenizers (provided by HuggingFace tokenizer's library) can be saved in a single file +FULL_TOKENIZER_FILE = "tokenizer.json" +_re_tokenizer_file = re.compile(r"tokenizer\.(.*)\.json") + + +class TruncationStrategy(ExplicitEnum): + """ + Possible values for the `truncation` argument in [`PreTrainedTokenizerBase.__call__`]. Useful for tab-completion in + an IDE. + """ + + ONLY_FIRST = "only_first" + ONLY_SECOND = "only_second" + LONGEST_FIRST = "longest_first" + DO_NOT_TRUNCATE = "do_not_truncate" + + +class CharSpan(NamedTuple): + """ + Character span in the original string. + + Args: + start (`int`): Index of the first character in the original string. + end (`int`): Index of the character following the last character in the original string. + """ + + start: int + end: int + + +class TokenSpan(NamedTuple): + """ + Token span in an encoded string (list of tokens). + + Args: + start (`int`): Index of the first token in the span. + end (`int`): Index of the token following the last token in the span. + """ + + start: int + end: int + + +class BatchEncoding(UserDict): + """ + Holds the output of the [`~tokenization_utils_base.PreTrainedTokenizerBase.__call__`], + [`~tokenization_utils_base.PreTrainedTokenizerBase.encode_plus`] and + [`~tokenization_utils_base.PreTrainedTokenizerBase.batch_encode_plus`] methods (tokens, attention_masks, etc). + + This class is derived from a python dictionary and can be used as a dictionary. In addition, this class exposes + utility methods to map from word/character space to token space. + + Args: + data (`dict`, *optional*): + Dictionary of lists/arrays/tensors returned by the `__call__`/`encode_plus`/`batch_encode_plus` methods + ('input_ids', 'attention_mask', etc.). + encoding (`tokenizers.Encoding` or `Sequence[tokenizers.Encoding]`, *optional*): + If the tokenizer is a fast tokenizer which outputs additional information like mapping from word/character + space to token space the `tokenizers.Encoding` instance or list of instance (for batches) hold this + information. + tensor_type (`Union[None, str, TensorType]`, *optional*): + You can give a tensor_type here to convert the lists of integers in PyTorch/TensorFlow/Numpy Tensors at + initialization. + prepend_batch_axis (`bool`, *optional*, defaults to `False`): + Whether or not to add a batch axis when converting to tensors (see `tensor_type` above). Note that this + parameter has an effect if the parameter `tensor_type` is set, *otherwise has no effect*. + n_sequences (`Optional[int]`, *optional*): + You can give a tensor_type here to convert the lists of integers in PyTorch/TensorFlow/Numpy Tensors at + initialization. + """ + + def __init__( + self, + data: Optional[Dict[str, Any]] = None, + encoding: Optional[Union[EncodingFast, Sequence[EncodingFast]]] = None, + tensor_type: Union[None, str, TensorType] = None, + prepend_batch_axis: bool = False, + n_sequences: Optional[int] = None, + ): + super().__init__(data) + + if isinstance(encoding, EncodingFast): + encoding = [encoding] + + self._encodings = encoding + + if n_sequences is None and encoding is not None and len(encoding): + n_sequences = encoding[0].n_sequences + + self._n_sequences = n_sequences + + self.convert_to_tensors(tensor_type=tensor_type, prepend_batch_axis=prepend_batch_axis) + + @property + def n_sequences(self) -> Optional[int]: + """ + `Optional[int]`: The number of sequences used to generate each sample from the batch encoded in this + [`BatchEncoding`]. Currently can be one of `None` (unknown), `1` (a single sentence) or `2` (a pair of + sentences) + """ + return self._n_sequences + + @property + def is_fast(self) -> bool: + """ + `bool`: Indicate whether this [`BatchEncoding`] was generated from the result of a [`PreTrainedTokenizerFast`] + or not. + """ + return self._encodings is not None + + def __getitem__(self, item: Union[int, str]) -> Union[Any, EncodingFast]: + """ + If the key is a string, returns the value of the dict associated to `key` ('input_ids', 'attention_mask', + etc.). + + If the key is an integer, get the `tokenizers.Encoding` for batch item with index `key`. + + If the key is a slice, returns the value of the dict associated to `key` ('input_ids', 'attention_mask', etc.) + with the constraint of slice. + """ + if isinstance(item, str): + return self.data[item] + elif self._encodings is not None: + return self._encodings[item] + elif isinstance(item, slice): + return {key: self.data[key][item] for key in self.data.keys()} + else: + raise KeyError( + "Invalid key. Only three types of key are available: " + "(1) string, (2) integers for backend Encoding, and (3) slices for data subsetting." + ) + + def __getattr__(self, item: str): + try: + return self.data[item] + except KeyError: + raise AttributeError + + def __getstate__(self): + return {"data": self.data, "encodings": self._encodings} + + def __setstate__(self, state): + if "data" in state: + self.data = state["data"] + + if "encodings" in state: + self._encodings = state["encodings"] + + def keys(self): + return self.data.keys() + + def values(self): + return self.data.values() + + def items(self): + return self.data.items() + + # After this point: + # Extended properties and methods only available for fast (Rust-based) tokenizers + # provided by HuggingFace tokenizers library. + + @property + def encodings(self) -> Optional[List[EncodingFast]]: + """ + `Optional[List[tokenizers.Encoding]]`: The list all encodings from the tokenization process. Returns `None` if + the input was tokenized through Python (i.e., not a fast) tokenizer. + """ + return self._encodings + + def tokens(self, batch_index: int = 0) -> List[str]: + """ + Return the list of tokens (sub-parts of the input strings after word/subword splitting and before conversion to + integer indices) at a given batch index (only works for the output of a fast tokenizer). + + Args: + batch_index (`int`, *optional*, defaults to 0): The index to access in the batch. + + Returns: + `List[str]`: The list of tokens at that index. + """ + if not self._encodings: + raise ValueError( + "tokens() is not available when using non-fast tokenizers (e.g. instance of a `XxxTokenizerFast`" + " class)." + ) + return self._encodings[batch_index].tokens + + def sequence_ids(self, batch_index: int = 0) -> List[Optional[int]]: + """ + Return a list mapping the tokens to the id of their original sentences: + + - `None` for special tokens added around or between sequences, + - `0` for tokens corresponding to words in the first sequence, + - `1` for tokens corresponding to words in the second sequence when a pair of sequences was jointly + encoded. + + Args: + batch_index (`int`, *optional*, defaults to 0): The index to access in the batch. + + Returns: + `List[Optional[int]]`: A list indicating the sequence id corresponding to each token. Special tokens added + by the tokenizer are mapped to `None` and other tokens are mapped to the index of their corresponding + sequence. + """ + if not self._encodings: + raise ValueError( + "sequence_ids() is not available when using non-fast tokenizers (e.g. instance of a `XxxTokenizerFast`" + " class)." + ) + return self._encodings[batch_index].sequence_ids + + def words(self, batch_index: int = 0) -> List[Optional[int]]: + """ + Return a list mapping the tokens to their actual word in the initial sentence for a fast tokenizer. + + Args: + batch_index (`int`, *optional*, defaults to 0): The index to access in the batch. + + Returns: + `List[Optional[int]]`: A list indicating the word corresponding to each token. Special tokens added by the + tokenizer are mapped to `None` and other tokens are mapped to the index of their corresponding word + (several tokens will be mapped to the same word index if they are parts of that word). + """ + if not self._encodings: + raise ValueError( + "words() is not available when using non-fast tokenizers (e.g. instance of a `XxxTokenizerFast`" + " class)." + ) + warnings.warn( + "`BatchEncoding.words()` property is deprecated and should be replaced with the identical, " + "but more self-explanatory `BatchEncoding.word_ids()` property.", + FutureWarning, + ) + return self.word_ids(batch_index) + + def word_ids(self, batch_index: int = 0) -> List[Optional[int]]: + """ + Return a list mapping the tokens to their actual word in the initial sentence for a fast tokenizer. + + Args: + batch_index (`int`, *optional*, defaults to 0): The index to access in the batch. + + Returns: + `List[Optional[int]]`: A list indicating the word corresponding to each token. Special tokens added by the + tokenizer are mapped to `None` and other tokens are mapped to the index of their corresponding word + (several tokens will be mapped to the same word index if they are parts of that word). + """ + if not self._encodings: + raise ValueError( + "word_ids() is not available when using non-fast tokenizers (e.g. instance of a `XxxTokenizerFast`" + " class)." + ) + return self._encodings[batch_index].word_ids + + def token_to_sequence(self, batch_or_token_index: int, token_index: Optional[int] = None) -> int: + """ + Get the index of the sequence represented by the given token. In the general use case, this method returns `0` + for a single sequence or the first sequence of a pair, and `1` for the second sequence of a pair + + Can be called as: + + - `self.token_to_sequence(token_index)` if batch size is 1 + - `self.token_to_sequence(batch_index, token_index)` if batch size is greater than 1 + + This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e., + words are defined by the user). In this case it allows to easily associate encoded tokens with provided + tokenized words. + + Args: + batch_or_token_index (`int`): + Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of + the token in the sequence. + token_index (`int`, *optional*): + If a batch index is provided in *batch_or_token_index*, this can be the index of the token in the + sequence. + + Returns: + `int`: Index of the word in the input sequence. + """ + + if not self._encodings: + raise ValueError("token_to_sequence() is not available when using Python based tokenizers") + if token_index is not None: + batch_index = batch_or_token_index + else: + batch_index = 0 + token_index = batch_or_token_index + if batch_index < 0: + batch_index = self._batch_size + batch_index + if token_index < 0: + token_index = self._seq_len + token_index + return self._encodings[batch_index].token_to_sequence(token_index) + + def token_to_word(self, batch_or_token_index: int, token_index: Optional[int] = None) -> int: + """ + Get the index of the word corresponding (i.e. comprising) to an encoded token in a sequence of the batch. + + Can be called as: + + - `self.token_to_word(token_index)` if batch size is 1 + - `self.token_to_word(batch_index, token_index)` if batch size is greater than 1 + + This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e., + words are defined by the user). In this case it allows to easily associate encoded tokens with provided + tokenized words. + + Args: + batch_or_token_index (`int`): + Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of + the token in the sequence. + token_index (`int`, *optional*): + If a batch index is provided in *batch_or_token_index*, this can be the index of the token in the + sequence. + + Returns: + `int`: Index of the word in the input sequence. + """ + + if not self._encodings: + raise ValueError("token_to_word() is not available when using Python based tokenizers") + if token_index is not None: + batch_index = batch_or_token_index + else: + batch_index = 0 + token_index = batch_or_token_index + if batch_index < 0: + batch_index = self._batch_size + batch_index + if token_index < 0: + token_index = self._seq_len + token_index + return self._encodings[batch_index].token_to_word(token_index) + + def word_to_tokens( + self, batch_or_word_index: int, word_index: Optional[int] = None, sequence_index: int = 0 + ) -> Optional[TokenSpan]: + """ + Get the encoded token span corresponding to a word in a sequence of the batch. + + Token spans are returned as a [`~tokenization_utils_base.TokenSpan`] with: + + - **start** -- Index of the first token. + - **end** -- Index of the token following the last token. + + Can be called as: + + - `self.word_to_tokens(word_index, sequence_index: int = 0)` if batch size is 1 + - `self.word_to_tokens(batch_index, word_index, sequence_index: int = 0)` if batch size is greater or equal to + 1 + + This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words + are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized + words. + + Args: + batch_or_word_index (`int`): + Index of the sequence in the batch. If the batch only comprises one sequence, this can be the index of + the word in the sequence. + word_index (`int`, *optional*): + If a batch index is provided in *batch_or_token_index*, this can be the index of the word in the + sequence. + sequence_index (`int`, *optional*, defaults to 0): + If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair (0 + or 1) the provided word index belongs to. + + Returns: + ([`~tokenization_utils_base.TokenSpan`], *optional*): Span of tokens in the encoded sequence. Returns + `None` if no tokens correspond to the word. This can happen especially when the token is a special token + that has been used to format the tokenization. For example when we add a class token at the very beginning + of the tokenization. + """ + + if not self._encodings: + raise ValueError("word_to_tokens() is not available when using Python based tokenizers") + if word_index is not None: + batch_index = batch_or_word_index + else: + batch_index = 0 + word_index = batch_or_word_index + if batch_index < 0: + batch_index = self._batch_size + batch_index + if word_index < 0: + word_index = self._seq_len + word_index + span = self._encodings[batch_index].word_to_tokens(word_index, sequence_index) + return TokenSpan(*span) if span is not None else None + + def token_to_chars(self, batch_or_token_index: int, token_index: Optional[int] = None) -> Optional[CharSpan]: + """ + Get the character span corresponding to an encoded token in a sequence of the batch. + + Character spans are returned as a [`~tokenization_utils_base.CharSpan`] with: + + - **start** -- Index of the first character in the original string associated to the token. + - **end** -- Index of the character following the last character in the original string associated to the + token. + + Can be called as: + + - `self.token_to_chars(token_index)` if batch size is 1 + - `self.token_to_chars(batch_index, token_index)` if batch size is greater or equal to 1 + + Args: + batch_or_token_index (`int`): + Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of + the token in the sequence. + token_index (`int`, *optional*): + If a batch index is provided in *batch_or_token_index*, this can be the index of the token or tokens in + the sequence. + + Returns: + [`~tokenization_utils_base.CharSpan`]: Span of characters in the original string, or None, if the token + (e.g. , ) doesn't correspond to any chars in the origin string. + """ + + if not self._encodings: + raise ValueError("token_to_chars() is not available when using Python based tokenizers") + if token_index is not None: + batch_index = batch_or_token_index + else: + batch_index = 0 + token_index = batch_or_token_index + span_indices = self._encodings[batch_index].token_to_chars(token_index) + + return CharSpan(*span_indices) if span_indices is not None else None + + def char_to_token( + self, batch_or_char_index: int, char_index: Optional[int] = None, sequence_index: int = 0 + ) -> int: + """ + Get the index of the token in the encoded output comprising a character in the original string for a sequence + of the batch. + + Can be called as: + + - `self.char_to_token(char_index)` if batch size is 1 + - `self.char_to_token(batch_index, char_index)` if batch size is greater or equal to 1 + + This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words + are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized + words. + + Args: + batch_or_char_index (`int`): + Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of + the word in the sequence + char_index (`int`, *optional*): + If a batch index is provided in *batch_or_token_index*, this can be the index of the word in the + sequence. + sequence_index (`int`, *optional*, defaults to 0): + If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair (0 + or 1) the provided character index belongs to. + + + Returns: + `int`: Index of the token, or None if the char index refers to a whitespace only token and whitespace is + trimmed with `trim_offsets=True`. + """ + + if not self._encodings: + raise ValueError("char_to_token() is not available when using Python based tokenizers") + if char_index is not None: + batch_index = batch_or_char_index + else: + batch_index = 0 + char_index = batch_or_char_index + return self._encodings[batch_index].char_to_token(char_index, sequence_index) + + def word_to_chars( + self, batch_or_word_index: int, word_index: Optional[int] = None, sequence_index: int = 0 + ) -> CharSpan: + """ + Get the character span in the original string corresponding to given word in a sequence of the batch. + + Character spans are returned as a CharSpan NamedTuple with: + + - start: index of the first character in the original string + - end: index of the character following the last character in the original string + + Can be called as: + + - `self.word_to_chars(word_index)` if batch size is 1 + - `self.word_to_chars(batch_index, word_index)` if batch size is greater or equal to 1 + + Args: + batch_or_word_index (`int`): + Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of + the word in the sequence + word_index (`int`, *optional*): + If a batch index is provided in *batch_or_token_index*, this can be the index of the word in the + sequence. + sequence_index (`int`, *optional*, defaults to 0): + If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair (0 + or 1) the provided word index belongs to. + + Returns: + `CharSpan` or `List[CharSpan]`: Span(s) of the associated character or characters in the string. CharSpan + are NamedTuple with: + + - start: index of the first character associated to the token in the original string + - end: index of the character following the last character associated to the token in the original + string + """ + + if not self._encodings: + raise ValueError("word_to_chars() is not available when using Python based tokenizers") + if word_index is not None: + batch_index = batch_or_word_index + else: + batch_index = 0 + word_index = batch_or_word_index + return CharSpan(*(self._encodings[batch_index].word_to_chars(word_index, sequence_index))) + + def char_to_word(self, batch_or_char_index: int, char_index: Optional[int] = None, sequence_index: int = 0) -> int: + """ + Get the word in the original string corresponding to a character in the original string of a sequence of the + batch. + + Can be called as: + + - `self.char_to_word(char_index)` if batch size is 1 + - `self.char_to_word(batch_index, char_index)` if batch size is greater than 1 + + This method is particularly suited when the input sequences are provided as pre-tokenized sequences (i.e. words + are defined by the user). In this case it allows to easily associate encoded tokens with provided tokenized + words. + + Args: + batch_or_char_index (`int`): + Index of the sequence in the batch. If the batch only comprise one sequence, this can be the index of + the character in the original string. + char_index (`int`, *optional*): + If a batch index is provided in *batch_or_token_index*, this can be the index of the character in the + original string. + sequence_index (`int`, *optional*, defaults to 0): + If pair of sequences are encoded in the batch this can be used to specify which sequence in the pair (0 + or 1) the provided character index belongs to. + + + Returns: + `int` or `List[int]`: Index or indices of the associated encoded token(s). + """ + + if not self._encodings: + raise ValueError("char_to_word() is not available when using Python based tokenizers") + if char_index is not None: + batch_index = batch_or_char_index + else: + batch_index = 0 + char_index = batch_or_char_index + return self._encodings[batch_index].char_to_word(char_index, sequence_index) + + def convert_to_tensors( + self, tensor_type: Optional[Union[str, TensorType]] = None, prepend_batch_axis: bool = False + ): + """ + Convert the inner content to tensors. + + Args: + tensor_type (`str` or [`~utils.TensorType`], *optional*): + The type of tensors to use. If `str`, should be one of the values of the enum [`~utils.TensorType`]. If + `None`, no modification is done. + prepend_batch_axis (`int`, *optional*, defaults to `False`): + Whether or not to add the batch dimension during the conversion. + """ + if tensor_type is None: + return self + + # Convert to TensorType + if not isinstance(tensor_type, TensorType): + tensor_type = TensorType(tensor_type) + + # Get a function reference for the correct framework + if tensor_type == TensorType.TENSORFLOW: + if not is_tf_available(): + raise ImportError( + "Unable to convert output to TensorFlow tensors format, TensorFlow is not installed." + ) + import tensorflow as tf + + as_tensor = tf.constant + is_tensor = tf.is_tensor + elif tensor_type == TensorType.PYTORCH: + if not is_torch_available(): + raise ImportError("Unable to convert output to PyTorch tensors format, PyTorch is not installed.") + import torch + + is_tensor = torch.is_tensor + + def as_tensor(value, dtype=None): + if isinstance(value, list) and isinstance(value[0], np.ndarray): + return torch.from_numpy(np.array(value)) + return torch.tensor(value) + + elif tensor_type == TensorType.JAX: + if not is_flax_available(): + raise ImportError("Unable to convert output to JAX tensors format, JAX is not installed.") + import jax.numpy as jnp # noqa: F811 + + as_tensor = jnp.array + is_tensor = is_jax_tensor + + elif tensor_type == TensorType.MLX: + if not is_mlx_available(): + raise ImportError("Unable to convert output to MLX tensors format, MLX is not installed.") + import mlx.core as mx + + as_tensor = mx.array + + def is_tensor(obj): + return isinstance(obj, mx.array) + else: + + def as_tensor(value, dtype=None): + if isinstance(value, (list, tuple)) and isinstance(value[0], (list, tuple, np.ndarray)): + value_lens = [len(val) for val in value] + if len(set(value_lens)) > 1 and dtype is None: + # we have a ragged list so handle explicitly + value = as_tensor([np.asarray(val) for val in value], dtype=object) + return np.asarray(value, dtype=dtype) + + is_tensor = is_numpy_array + + # Do the tensor conversion in batch + for key, value in self.items(): + try: + if prepend_batch_axis: + value = [value] + + if not is_tensor(value): + tensor = as_tensor(value) + + # Removing this for now in favor of controlling the shape with `prepend_batch_axis` + # # at-least2d + # if tensor.ndim > 2: + # tensor = tensor.squeeze(0) + # elif tensor.ndim < 2: + # tensor = tensor[None, :] + + self[key] = tensor + except Exception as e: + if key == "overflowing_tokens": + raise ValueError( + "Unable to create tensor returning overflowing tokens of different lengths. " + "Please see if a fast version of this tokenizer is available to have this feature available." + ) from e + raise ValueError( + "Unable to create tensor, you should probably activate truncation and/or padding with" + " 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your" + f" features (`{key}` in this case) have excessive nesting (inputs type `list` where type `int` is" + " expected)." + ) from e + + return self + + def to(self, device: Union[str, "torch.device"], *, non_blocking: bool = False) -> "BatchEncoding": + """ + Send all values to device by calling `v.to(device, non_blocking=non_blocking)` (PyTorch only). + + Args: + device (`str` or `torch.device`): The device to put the tensors on. + non_blocking (`bool`): Whether to perform the copy asynchronously. + + Returns: + [`BatchEncoding`]: The same instance after modification. + """ + requires_backends(self, ["torch"]) + import torch + + # This check catches things like APEX blindly calling "to" on all inputs to a module + # Otherwise it passes the casts down and casts the LongTensor containing the token idxs + # into a HalfTensor + if isinstance(device, str) or is_torch_device(device) or isinstance(device, int): + self.data = { + k: v.to(device=device, non_blocking=non_blocking) if isinstance(v, torch.Tensor) else v + for k, v in self.data.items() + } + else: + logger.warning(f"Attempting to cast a BatchEncoding to type {str(device)}. This is not supported.") + return self + + +class SpecialTokensMixin: + """ + A mixin derived by [`PreTrainedTokenizer`] and [`PreTrainedTokenizerFast`] to handle specific behaviors related to + special tokens. In particular, this class hold the attributes which can be used to directly access these special + tokens in a model-independent manner and allow to set and update the special tokens. + + Args: + bos_token (`str` or `tokenizers.AddedToken`, *optional*): + A special token representing the beginning of a sentence. + eos_token (`str` or `tokenizers.AddedToken`, *optional*): + A special token representing the end of a sentence. + unk_token (`str` or `tokenizers.AddedToken`, *optional*): + A special token representing an out-of-vocabulary token. + sep_token (`str` or `tokenizers.AddedToken`, *optional*): + A special token separating two different sentences in the same input (used by BERT for instance). + pad_token (`str` or `tokenizers.AddedToken`, *optional*): + A special token used to make arrays of tokens the same size for batching purpose. Will then be ignored by + attention mechanisms or loss computation. + cls_token (`str` or `tokenizers.AddedToken`, *optional*): + A special token representing the class of the input (used by BERT for instance). + mask_token (`str` or `tokenizers.AddedToken`, *optional*): + A special token representing a masked token (used by masked-language modeling pretraining objectives, like + BERT). + additional_special_tokens (tuple or list of `str` or `tokenizers.AddedToken`, *optional*): + A tuple or a list of additional tokens, which will be marked as `special`, meaning that they will be + skipped when decoding if `skip_special_tokens` is set to `True`. + """ + + SPECIAL_TOKENS_ATTRIBUTES = [ + "bos_token", + "eos_token", + "unk_token", + "sep_token", + "pad_token", + "cls_token", + "mask_token", + "additional_special_tokens", + ] + + def __init__(self, verbose=False, **kwargs): + self._pad_token_type_id = 0 + self.verbose = verbose + self._special_tokens_map = dict.fromkeys(self.SPECIAL_TOKENS_ATTRIBUTES) + self._special_tokens_map["additional_special_tokens"] = [] # for BC where it defaults to empty list + + # We directly set the hidden value to allow initialization with special tokens + # which are not yet in the vocabulary. Necessary for serialization/de-serialization + # TODO clean this up at some point (probably by switching to fast tokenizers) + + for key, value in kwargs.items(): + if value is None: + continue + if key in self.SPECIAL_TOKENS_ATTRIBUTES: + if key == "additional_special_tokens": + assert isinstance(value, (list, tuple)), f"Value {value} is not a list or tuple" + assert all(isinstance(t, (str, AddedToken)) for t in value), ( + "One of the tokens is not a string or an AddedToken" + ) + setattr(self, key, value) + elif isinstance(value, (str, AddedToken)): + setattr(self, key, value) + else: + raise TypeError(f"Special token {key} has to be either str or AddedToken but got: {type(value)}") + + def sanitize_special_tokens(self) -> int: + """ + The `sanitize_special_tokens` is now deprecated kept for backward compatibility and will be removed in + transformers v5. + """ + logger.warning_once("The `sanitize_special_tokens` will be removed in transformers v5.") + return self.add_tokens(self.all_special_tokens_extended, special_tokens=True) + + def add_special_tokens( + self, special_tokens_dict: Dict[str, Union[str, AddedToken]], replace_additional_special_tokens=True + ) -> int: + """ + Add a dictionary of special tokens (eos, pad, cls, etc.) to the encoder and link them to class attributes. If + special tokens are NOT in the vocabulary, they are added to it (indexed starting from the last index of the + current vocabulary). + + When adding new tokens to the vocabulary, you should make sure to also resize the token embedding matrix of the + model so that its embedding matrix matches the tokenizer. + + In order to do that, please use the [`~PreTrainedModel.resize_token_embeddings`] method. + + Using `add_special_tokens` will ensure your special tokens can be used in several ways: + + - Special tokens can be skipped when decoding using `skip_special_tokens = True`. + - Special tokens are carefully handled by the tokenizer (they are never split), similar to `AddedTokens`. + - You can easily refer to special tokens using tokenizer class attributes like `tokenizer.cls_token`. This + makes it easy to develop model-agnostic training and fine-tuning scripts. + + When possible, special tokens are already registered for provided pretrained models (for instance + [`BertTokenizer`] `cls_token` is already registered to be :obj*'[CLS]'* and XLM's one is also registered to be + `''`). + + Args: + special_tokens_dict (dictionary *str* to *str* or `tokenizers.AddedToken`): + Keys should be in the list of predefined special attributes: [`bos_token`, `eos_token`, `unk_token`, + `sep_token`, `pad_token`, `cls_token`, `mask_token`, `additional_special_tokens`]. + + Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer + assign the index of the `unk_token` to them). + replace_additional_special_tokens (`bool`, *optional*,, defaults to `True`): + If `True`, the existing list of additional special tokens will be replaced by the list provided in + `special_tokens_dict`. Otherwise, `self._special_tokens_map["additional_special_tokens"]` is just extended. In the former + case, the tokens will NOT be removed from the tokenizer's full vocabulary - they are only being flagged + as non-special tokens. Remember, this only affects which tokens are skipped during decoding, not the + `added_tokens_encoder` and `added_tokens_decoder`. This means that the previous + `additional_special_tokens` are still added tokens, and will not be split by the model. + + Returns: + `int`: Number of tokens added to the vocabulary. + + Examples: + + ```python + # Let's see how to add a new classification token to GPT-2 + tokenizer = GPT2Tokenizer.from_pretrained("openai-community/gpt2") + model = GPT2Model.from_pretrained("openai-community/gpt2") + + special_tokens_dict = {"cls_token": ""} + + num_added_toks = tokenizer.add_special_tokens(special_tokens_dict) + print("We have added", num_added_toks, "tokens") + # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e., the length of the tokenizer. + model.resize_token_embeddings(len(tokenizer)) + + assert tokenizer.cls_token == "" + ```""" + if not special_tokens_dict: + return 0 + + added_tokens = [] + for key, value in special_tokens_dict.items(): + assert key in self.SPECIAL_TOKENS_ATTRIBUTES, f"Key {key} is not a special token" + + if self.verbose: + logger.info(f"Assigning {value} to the {key} key of the tokenizer") + + if key == "additional_special_tokens": + assert isinstance(value, (list, tuple)) and all(isinstance(t, (str, AddedToken)) for t in value), ( + f"Tokens {value} for key {key} should all be str or AddedToken instances" + ) + + to_add = [] + for token in value: + if isinstance(token, str): + # for legacy purpose we default to stripping. `test_add_tokens_tokenizer` depends on this + token = AddedToken(token, rstrip=False, lstrip=False, normalized=False, special=True) + if not replace_additional_special_tokens and str(token) in self.additional_special_tokens: + continue + to_add.append(token) + if replace_additional_special_tokens and len(to_add) > 0: + setattr(self, key, list(to_add)) + else: + self._special_tokens_map["additional_special_tokens"].extend(to_add) + added_tokens += to_add + + else: + if not isinstance(value, (str, AddedToken)): + raise ValueError(f"Token {value} for key {key} should be a str or an AddedToken instance") + if isinstance(value, (str)): + # for legacy purpose we default to stripping. `False` depends on this + value = AddedToken(value, rstrip=False, lstrip=False, normalized=False, special=True) + if isinstance(value, AddedToken): + setattr(self, key, value) + if value not in added_tokens: + added_tokens.append(value) + + # if we are adding tokens that were not part of the vocab, we ought to add them + added_tokens = self.add_tokens(added_tokens, special_tokens=True) + return added_tokens + + def add_tokens( + self, new_tokens: Union[str, AddedToken, List[Union[str, AddedToken]]], special_tokens: bool = False + ) -> int: + """ + Add a list of new tokens to the tokenizer class. If the new tokens are not in the vocabulary, they are added to + it with indices starting from length of the current vocabulary and will be isolated before the tokenization + algorithm is applied. Added tokens and tokens from the vocabulary of the tokenization algorithm are therefore + not treated in the same way. + + Note, when adding new tokens to the vocabulary, you should make sure to also resize the token embedding matrix + of the model so that its embedding matrix matches the tokenizer. + + In order to do that, please use the [`~PreTrainedModel.resize_token_embeddings`] method. + + Args: + new_tokens (`str`, `tokenizers.AddedToken` or a list of *str* or `tokenizers.AddedToken`): + Tokens are only added if they are not already in the vocabulary. `tokenizers.AddedToken` wraps a string + token to let you personalize its behavior: whether this token should only match against a single word, + whether this token should strip all potential whitespaces on the left side, whether this token should + strip all potential whitespaces on the right side, etc. + special_tokens (`bool`, *optional*, defaults to `False`): + Can be used to specify if the token is a special token. This mostly change the normalization behavior + (special tokens like CLS or [MASK] are usually not lower-cased for instance). + + See details for `tokenizers.AddedToken` in HuggingFace tokenizers library. + + Returns: + `int`: Number of tokens added to the vocabulary. + + Examples: + + ```python + # Let's see how to increase the vocabulary of Bert model and tokenizer + tokenizer = BertTokenizerFast.from_pretrained("google-bert/bert-base-uncased") + model = BertModel.from_pretrained("google-bert/bert-base-uncased") + + num_added_toks = tokenizer.add_tokens(["new_tok1", "my_new-tok2"]) + print("We have added", num_added_toks, "tokens") + # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e., the length of the tokenizer. + model.resize_token_embeddings(len(tokenizer)) + ```""" + if not new_tokens: + return 0 + + if not isinstance(new_tokens, (list, tuple)): + new_tokens = [new_tokens] + + return self._add_tokens(new_tokens, special_tokens=special_tokens) + + def _add_tokens(self, new_tokens: Union[List[str], List[AddedToken]], special_tokens: bool = False) -> int: + raise NotImplementedError + + @property + def pad_token_type_id(self) -> int: + """ + `int`: Id of the padding token type in the vocabulary. + """ + return self._pad_token_type_id + + def __setattr__(self, key, value): + key_without_id = key + key_is_special_id = key.endswith("_id") or key.endswith("_ids") + if key_is_special_id: + key_without_id = key[:-3] if not key.endswith("_ids") else key[:-4] + + if self.__dict__.get("_special_tokens_map", None) is not None and any( + name in self.__dict__["_special_tokens_map"] for name in [key, key_without_id] + ): + if key_is_special_id: + if value is not None: + value = ( + self.convert_ids_to_tokens(value) + if key != "additional_special_tokens" + else [self.convert_ids_to_tokens(val) for val in value] + ) + key = key_without_id + + if key != "additional_special_tokens" and not isinstance(value, (str, AddedToken)) and value is not None: + raise ValueError(f"Cannot set a non-string value as the {key}") + self._special_tokens_map[key] = value + else: + super().__setattr__(key, value) + + def __getattr__(self, key): + key_without_id = key + key_is_special_id = key.endswith("_id") or key.endswith("_ids") + if key_is_special_id: + key_without_id = key[:-3] if not key.endswith("_ids") else key[:-4] + + if self.__dict__.get("_special_tokens_map", None) is not None and any( + name in self.__dict__["_special_tokens_map"] for name in [key, key_without_id] + ): + _special_tokens_map = self.__dict__["_special_tokens_map"] + if not key_is_special_id: + if _special_tokens_map[key] is None: + if self.verbose: + logger.error(f"Using {key}, but it is not set yet.") + return None + value = _special_tokens_map[key] + return str(value) if key != "additional_special_tokens" else [str(tok) for tok in value] + else: + attr_as_tokens = getattr(self, key_without_id) + return self.convert_tokens_to_ids(attr_as_tokens) if attr_as_tokens is not None else None + + if key not in self.__dict__: + raise AttributeError(f"{self.__class__.__name__} has no attribute {key}") + else: + return super().__getattr__(key) + + @property + def special_tokens_map(self) -> Dict[str, Union[str, List[str]]]: + """ + `Dict[str, Union[str, List[str]]]`: A dictionary mapping special token class attributes (`cls_token`, + `unk_token`, etc.) to their values (`''`, `''`, etc.). + + Convert potential tokens of `tokenizers.AddedToken` type to string. + """ + set_attr = {} + for attr in self.SPECIAL_TOKENS_ATTRIBUTES: + attr_value = getattr(self, attr) + if attr_value: + set_attr[attr] = attr_value + return set_attr + + @property + def special_tokens_map_extended(self) -> Dict[str, Union[str, AddedToken, List[Union[str, AddedToken]]]]: + """ + `Dict[str, Union[str, tokenizers.AddedToken, List[Union[str, tokenizers.AddedToken]]]]`: A dictionary mapping + special token class attributes (`cls_token`, `unk_token`, etc.) to their values (`''`, `''`, etc.). + + Don't convert tokens of `tokenizers.AddedToken` type to string so they can be used to control more finely how + special tokens are tokenized. + """ + set_attr = {} + for attr in self.SPECIAL_TOKENS_ATTRIBUTES: + attr_value = self._special_tokens_map[attr] + if attr_value: + set_attr[attr] = attr_value + return set_attr + + @property + def all_special_tokens_extended(self) -> List[Union[str, AddedToken]]: + """ + `List[Union[str, tokenizers.AddedToken]]`: All the special tokens (`''`, `''`, etc.), the order has + nothing to do with the index of each tokens. If you want to know the correct indices, check + `self.added_tokens_encoder`. We can't create an order anymore as the keys are `AddedTokens` and not `Strings`. + + Don't convert tokens of `tokenizers.AddedToken` type to string so they can be used to control more finely how + special tokens are tokenized. + """ + all_tokens = [] + seen = set() + for value in self.special_tokens_map_extended.values(): + if isinstance(value, (list, tuple)): + tokens_to_add = [token for token in value if str(token) not in seen] + else: + tokens_to_add = [value] if str(value) not in seen else [] + seen.update(map(str, tokens_to_add)) + all_tokens.extend(tokens_to_add) + return all_tokens + + @property + def all_special_tokens(self) -> List[str]: + """ + `List[str]`: A list of the unique special tokens (`''`, `''`, ..., etc.). + + Convert tokens of `tokenizers.AddedToken` type to string. + """ + all_toks = [str(s) for s in self.all_special_tokens_extended] + return all_toks + + @property + def all_special_ids(self) -> List[int]: + """ + `List[int]`: List the ids of the special tokens(`''`, `''`, etc.) mapped to class attributes. + """ + all_toks = self.all_special_tokens + all_ids = self.convert_tokens_to_ids(all_toks) + return all_ids + + def _set_model_specific_special_tokens(self, special_tokens: List[str]): + """ + Adds new special tokens to the "SPECIAL_TOKENS_ATTRIBUTES" list which will be part + of "self.special_tokens" and saved as a special token in tokenizer's config. + This allows us to dynamically add new model-type specific tokens after initializing the tokenizer. + For example: if the model tokenizers is multimodal, we can support special image or audio tokens. + """ + self.SPECIAL_TOKENS_ATTRIBUTES = self.SPECIAL_TOKENS_ATTRIBUTES + list(special_tokens.keys()) + for key, value in special_tokens.items(): + if isinstance(value, (str, AddedToken)): + self._special_tokens_map[key] = value + else: + raise TypeError(f"Special token {key} has to be either str or AddedToken but got: {type(value)}") + + +ENCODE_KWARGS_DOCSTRING = r""" + add_special_tokens (`bool`, *optional*, defaults to `True`): + Whether or not to add special tokens when encoding the sequences. This will use the underlying + `PretrainedTokenizerBase.build_inputs_with_special_tokens` function, which defines which tokens are + automatically added to the input ids. This is useful if you want to add `bos` or `eos` tokens + automatically. + padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`): + Activates and controls padding. Accepts the following values: + + - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single + sequence if provided). + - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum + acceptable input length for the model if that argument is not provided. + - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different + lengths). + truncation (`bool`, `str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `False`): + Activates and controls truncation. Accepts the following values: + + - `True` or `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or + to the maximum acceptable input length for the model if that argument is not provided. This will + truncate token by token, removing a token from the longest sequence in the pair if a pair of + sequences (or a batch of pairs) is provided. + - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the + maximum acceptable input length for the model if that argument is not provided. This will only + truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided. + - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the + maximum acceptable input length for the model if that argument is not provided. This will only + truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided. + - `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths + greater than the model maximum admissible input size). + max_length (`int`, *optional*): + Controls the maximum length to use by one of the truncation/padding parameters. + + If left unset or set to `None`, this will use the predefined model maximum length if a maximum length + is required by one of the truncation/padding parameters. If the model has no specific maximum input + length (like XLNet) truncation/padding to a maximum length will be deactivated. + stride (`int`, *optional*, defaults to 0): + If set to a number along with `max_length`, the overflowing tokens returned when + `return_overflowing_tokens=True` will contain some tokens from the end of the truncated sequence + returned to provide some overlap between truncated and overflowing sequences. The value of this + argument defines the number of overlapping tokens. + is_split_into_words (`bool`, *optional*, defaults to `False`): + Whether or not the input is already pre-tokenized (e.g., split into words). If set to `True`, the + tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace) + which it will tokenize. This is useful for NER or token classification. + pad_to_multiple_of (`int`, *optional*): + If set will pad the sequence to a multiple of the provided value. Requires `padding` to be activated. + This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability + `>= 7.5` (Volta). + padding_side (`str`, *optional*): + The side on which the model should have padding applied. Should be selected between ['right', 'left']. + Default value is picked from the class attribute of the same name. + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors instead of list of python integers. Acceptable values are: + + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return Numpy `np.ndarray` objects. +""" + +ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r""" + return_token_type_ids (`bool`, *optional*): + Whether to return token type IDs. If left to the default, will return the token type IDs according to + the specific tokenizer's default, defined by the `return_outputs` attribute. + + [What are token type IDs?](../glossary#token-type-ids) + return_attention_mask (`bool`, *optional*): + Whether to return the attention mask. If left to the default, will return the attention mask according + to the specific tokenizer's default, defined by the `return_outputs` attribute. + + [What are attention masks?](../glossary#attention-mask) + return_overflowing_tokens (`bool`, *optional*, defaults to `False`): + Whether or not to return overflowing token sequences. If a pair of sequences of input ids (or a batch + of pairs) is provided with `truncation_strategy = longest_first` or `True`, an error is raised instead + of returning overflowing tokens. + return_special_tokens_mask (`bool`, *optional*, defaults to `False`): + Whether or not to return special tokens mask information. + return_offsets_mapping (`bool`, *optional*, defaults to `False`): + Whether or not to return `(char_start, char_end)` for each token. + + This is only available on fast tokenizers inheriting from [`PreTrainedTokenizerFast`], if using + Python's tokenizer, this method will raise `NotImplementedError`. + return_length (`bool`, *optional*, defaults to `False`): + Whether or not to return the lengths of the encoded inputs. + verbose (`bool`, *optional*, defaults to `True`): + Whether or not to print more information and warnings. + **kwargs: passed to the `self.tokenize()` method + + Return: + [`BatchEncoding`]: A [`BatchEncoding`] with the following fields: + + - **input_ids** -- List of token ids to be fed to a model. + + [What are input IDs?](../glossary#input-ids) + + - **token_type_ids** -- List of token type ids to be fed to a model (when `return_token_type_ids=True` or + if *"token_type_ids"* is in `self.model_input_names`). + + [What are token type IDs?](../glossary#token-type-ids) + + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names`). + + [What are attention masks?](../glossary#attention-mask) + + - **overflowing_tokens** -- List of overflowing tokens sequences (when a `max_length` is specified and + `return_overflowing_tokens=True`). + - **num_truncated_tokens** -- Number of tokens truncated (when a `max_length` is specified and + `return_overflowing_tokens=True`). + - **special_tokens_mask** -- List of 0s and 1s, with 1 specifying added special tokens and 0 specifying + regular sequence tokens (when `add_special_tokens=True` and `return_special_tokens_mask=True`). + - **length** -- The length of the inputs (when `return_length=True`) +""" + + +INIT_TOKENIZER_DOCSTRING = r""" + Class attributes (overridden by derived classes) + + - **vocab_files_names** (`Dict[str, str]`) -- A dictionary with, as keys, the `__init__` keyword name of each + vocabulary file required by the model, and as associated values, the filename for saving the associated file + (string). + - **pretrained_vocab_files_map** (`Dict[str, Dict[str, str]]`) -- A dictionary of dictionaries, with the + high-level keys being the `__init__` keyword name of each vocabulary file required by the model, the + low-level being the `short-cut-names` of the pretrained models with, as associated values, the `url` to the + associated pretrained vocabulary file. + - **model_input_names** (`List[str]`) -- A list of inputs expected in the forward pass of the model. + - **padding_side** (`str`) -- The default value for the side on which the model should have padding applied. + Should be `'right'` or `'left'`. + - **truncation_side** (`str`) -- The default value for the side on which the model should have truncation + applied. Should be `'right'` or `'left'`. + + Args: + model_max_length (`int`, *optional*): + The maximum length (in number of tokens) for the inputs to the transformer model. When the tokenizer is + loaded with [`~tokenization_utils_base.PreTrainedTokenizerBase.from_pretrained`], this will be set to the + value stored for the associated model in `max_model_input_sizes` (see above). If no value is provided, will + default to VERY_LARGE_INTEGER (`int(1e30)`). + padding_side (`str`, *optional*): + The side on which the model should have padding applied. Should be selected between ['right', 'left']. + Default value is picked from the class attribute of the same name. + truncation_side (`str`, *optional*): + The side on which the model should have truncation applied. Should be selected between ['right', 'left']. + Default value is picked from the class attribute of the same name. + chat_template (`str`, *optional*): + A Jinja template string that will be used to format lists of chat messages. See + https://huggingface.co/docs/transformers/chat_templating for a full description. + model_input_names (`List[string]`, *optional*): + The list of inputs accepted by the forward pass of the model (like `"token_type_ids"` or + `"attention_mask"`). Default value is picked from the class attribute of the same name. + bos_token (`str` or `tokenizers.AddedToken`, *optional*): + A special token representing the beginning of a sentence. Will be associated to `self.bos_token` and + `self.bos_token_id`. + eos_token (`str` or `tokenizers.AddedToken`, *optional*): + A special token representing the end of a sentence. Will be associated to `self.eos_token` and + `self.eos_token_id`. + unk_token (`str` or `tokenizers.AddedToken`, *optional*): + A special token representing an out-of-vocabulary token. Will be associated to `self.unk_token` and + `self.unk_token_id`. + sep_token (`str` or `tokenizers.AddedToken`, *optional*): + A special token separating two different sentences in the same input (used by BERT for instance). Will be + associated to `self.sep_token` and `self.sep_token_id`. + pad_token (`str` or `tokenizers.AddedToken`, *optional*): + A special token used to make arrays of tokens the same size for batching purpose. Will then be ignored by + attention mechanisms or loss computation. Will be associated to `self.pad_token` and `self.pad_token_id`. + cls_token (`str` or `tokenizers.AddedToken`, *optional*): + A special token representing the class of the input (used by BERT for instance). Will be associated to + `self.cls_token` and `self.cls_token_id`. + mask_token (`str` or `tokenizers.AddedToken`, *optional*): + A special token representing a masked token (used by masked-language modeling pretraining objectives, like + BERT). Will be associated to `self.mask_token` and `self.mask_token_id`. + additional_special_tokens (tuple or list of `str` or `tokenizers.AddedToken`, *optional*): + A tuple or a list of additional special tokens. Add them here to ensure they are skipped when decoding with + `skip_special_tokens` is set to True. If they are not part of the vocabulary, they will be added at the end + of the vocabulary. + clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`): + Whether or not the model should cleanup the spaces that were added when splitting the input text during the + tokenization process. + split_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the special tokens should be split during the tokenization process. Passing will affect the + internal state of the tokenizer. The default behavior is to not split special tokens. This means that if + `` is the `bos_token`, then `tokenizer.tokenize("") = ['`]. Otherwise, if + `split_special_tokens=True`, then `tokenizer.tokenize("")` will be give `['<','s', '>']`. +""" + + +@add_end_docstrings(INIT_TOKENIZER_DOCSTRING) +class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): + """ + Base class for [`PreTrainedTokenizer`] and [`PreTrainedTokenizerFast`]. + + Handles shared (mostly boiler plate) methods for those two classes. + """ + + vocab_files_names: Dict[str, str] = {} + pretrained_vocab_files_map: Dict[str, Dict[str, str]] = {} + _auto_class: Optional[str] = None + + # first name has to correspond to main model input name + # to make sure `tokenizer.pad(...)` works correctly + model_input_names: List[str] = ["input_ids", "token_type_ids", "attention_mask"] + padding_side: str = "right" + truncation_side: str = "right" + slow_tokenizer_class = None + + def __init__(self, **kwargs): + # inputs and kwargs for saving and re-loading (see ``from_pretrained`` and ``save_pretrained``) + self.init_inputs = () + for key in kwargs: + if hasattr(self, key) and callable(getattr(self, key)): + raise AttributeError(f"{key} conflicts with the method {key} in {self.__class__.__name__}") + + self.init_kwargs = copy.deepcopy(kwargs) + self.name_or_path = kwargs.pop("name_or_path", "") + self._processor_class = kwargs.pop("processor_class", None) + + # For backward compatibility we fallback to set model_max_length from max_len if provided + model_max_length = kwargs.pop("model_max_length", kwargs.pop("max_len", None)) + self.model_max_length = model_max_length if model_max_length is not None else VERY_LARGE_INTEGER + + # Padding and truncation side are right by default and overridden in subclasses. If specified in the kwargs, it + # is changed. + self.padding_side = kwargs.pop("padding_side", self.padding_side) + if self.padding_side not in ["right", "left"]: + raise ValueError( + f"Padding side should be selected between 'right' and 'left', current value: {self.padding_side}" + ) + + self.truncation_side = kwargs.pop("truncation_side", self.truncation_side) + if self.truncation_side not in ["right", "left"]: + raise ValueError( + f"Truncation side should be selected between 'right' and 'left', current value: {self.truncation_side}" + ) + + self.model_input_names = kwargs.pop("model_input_names", self.model_input_names) + + # By default, cleaning tokenization spaces for both fast and slow tokenizers + self.clean_up_tokenization_spaces = kwargs.pop("clean_up_tokenization_spaces", False) + + # By default, do not split special tokens for both fast and slow tokenizers + self.split_special_tokens = kwargs.pop("split_special_tokens", False) + + self.deprecation_warnings = {} # Use to store when we have already noticed a deprecation warning (avoid overlogging). + self._in_target_context_manager = False + + # Stores a Jinja template that formats chat histories into tokenizable strings + self.chat_template = kwargs.pop("chat_template", None) + if isinstance(self.chat_template, (list, tuple)): + # Chat templates are stored as lists of dicts with fixed key names, + # we reconstruct that into a single dict while loading them. + self.chat_template = {template["name"]: template["template"] for template in self.chat_template} + + super().__init__(**kwargs) + + self.extra_special_tokens = kwargs.pop("extra_special_tokens", {}) + self._set_model_specific_special_tokens(special_tokens=self.extra_special_tokens) + + @property + def max_len_single_sentence(self) -> int: + """ + `int`: The maximum length of a sentence that can be fed to the model. + """ + return self.model_max_length - self.num_special_tokens_to_add(pair=False) + + @property + def max_len_sentences_pair(self) -> int: + """ + `int`: The maximum combined length of a pair of sentences that can be fed to the model. + """ + return self.model_max_length - self.num_special_tokens_to_add(pair=True) + + @max_len_single_sentence.setter + def max_len_single_sentence(self, value) -> int: + # For backward compatibility, allow to try to setup 'max_len_single_sentence'. + if value == self.model_max_length - self.num_special_tokens_to_add(pair=False) and self.verbose: + if not self.deprecation_warnings.get("max_len_single_sentence", False): + logger.warning( + "Setting 'max_len_single_sentence' is now deprecated. This value is automatically set up." + ) + self.deprecation_warnings["max_len_single_sentence"] = True + else: + raise ValueError( + "Setting 'max_len_single_sentence' is now deprecated. This value is automatically set up." + ) + + @max_len_sentences_pair.setter + def max_len_sentences_pair(self, value) -> int: + # For backward compatibility, allow to try to setup 'max_len_sentences_pair'. + if value == self.model_max_length - self.num_special_tokens_to_add(pair=True) and self.verbose: + if not self.deprecation_warnings.get("max_len_sentences_pair", False): + logger.warning( + "Setting 'max_len_sentences_pair' is now deprecated. This value is automatically set up." + ) + self.deprecation_warnings["max_len_sentences_pair"] = True + else: + raise ValueError("Setting 'max_len_sentences_pair' is now deprecated. This value is automatically set up.") + + def _set_processor_class(self, processor_class: str): + """Sets processor class as an attribute.""" + self._processor_class = processor_class + + @property + def added_tokens_decoder(self) -> Dict[int, AddedToken]: + raise NotImplementedError() + + def __repr__(self) -> str: + added_tokens_decoder_rep = "\n\t".join([f"{k}: {v.__repr__()}," for k, v in self.added_tokens_decoder.items()]) + return ( + f"{self.__class__.__name__}(name_or_path='{self.name_or_path}'," + f" vocab_size={self.vocab_size}, model_max_length={self.model_max_length}, is_fast={self.is_fast}," + f" padding_side='{self.padding_side}', truncation_side='{self.truncation_side}'," + f" special_tokens={self.special_tokens_map}, clean_up_tokenization_spaces={self.clean_up_tokenization_spaces}," + " added_tokens_decoder={\n\t" + added_tokens_decoder_rep + "\n}\n)" + ) + + def __len__(self) -> int: + raise NotImplementedError() + + def get_vocab(self) -> Dict[str, int]: + """ + Returns the vocabulary as a dictionary of token to index. + + `tokenizer.get_vocab()[token]` is equivalent to `tokenizer.convert_tokens_to_ids(token)` when `token` is in the + vocab. + + Returns: + `Dict[str, int]`: The vocabulary. + """ + raise NotImplementedError() + + def apply_chat_template( + self, + conversation: Union[List[Dict[str, str]], List[List[Dict[str, str]]]], + tools: Optional[List[Union[Dict, Callable]]] = None, + documents: Optional[List[Dict[str, str]]] = None, + chat_template: Optional[str] = None, + add_generation_prompt: bool = False, + continue_final_message: bool = False, + tokenize: bool = True, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: bool = False, + max_length: Optional[int] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_dict: bool = False, + return_assistant_tokens_mask: bool = False, + tokenizer_kwargs: Optional[Dict[str, Any]] = None, + **kwargs, + ) -> Union[str, List[int], List[str], List[List[int]], BatchEncoding]: + """ + Converts a list of dictionaries with `"role"` and `"content"` keys to a list of token + ids. This method is intended for use with chat models, and will read the tokenizer's chat_template attribute to + determine the format and control tokens to use when converting. + + Args: + conversation (Union[List[Dict[str, str]], List[List[Dict[str, str]]]]): A list of dicts + with "role" and "content" keys, representing the chat history so far. + tools (`List[Dict]`, *optional*): + A list of tools (callable functions) that will be accessible to the model. If the template does not + support function calling, this argument will have no effect. Each tool should be passed as a JSON Schema, + giving the name, description and argument types for the tool. See our + [chat templating guide](https://huggingface.co/docs/transformers/main/en/chat_templating#automated-function-conversion-for-tool-use) + for more information. + documents (`List[Dict[str, str]]`, *optional*): + A list of dicts representing documents that will be accessible to the model if it is performing RAG + (retrieval-augmented generation). If the template does not support RAG, this argument will have no + effect. We recommend that each document should be a dict containing "title" and "text" keys. Please + see the RAG section of the [chat templating guide](https://huggingface.co/docs/transformers/main/en/chat_templating#arguments-for-RAG) + for examples of passing documents with chat templates. + chat_template (`str`, *optional*): + A Jinja template to use for this conversion. It is usually not necessary to pass anything to this + argument, as the model's template will be used by default. + add_generation_prompt (bool, *optional*): + If this is set, a prompt with the token(s) that indicate + the start of an assistant message will be appended to the formatted output. This is useful when you want to generate a response from the model. + Note that this argument will be passed to the chat template, and so it must be supported in the + template for this argument to have any effect. + continue_final_message (bool, *optional*): + If this is set, the chat will be formatted so that the final + message in the chat is open-ended, without any EOS tokens. The model will continue this message + rather than starting a new one. This allows you to "prefill" part of + the model's response for it. Cannot be used at the same time as `add_generation_prompt`. + tokenize (`bool`, defaults to `True`): + Whether to tokenize the output. If `False`, the output will be a string. + padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`): + Select a strategy to pad the returned sequences (according to the model's padding side and padding + index) among: + + - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single + sequence if provided). + - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum + acceptable input length for the model if that argument is not provided. + - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different + lengths). + truncation (`bool`, defaults to `False`): + Whether to truncate sequences at the maximum length. Has no effect if tokenize is `False`. + max_length (`int`, *optional*): + Maximum length (in tokens) to use for padding or truncation. Has no effect if tokenize is `False`. If + not specified, the tokenizer's `max_length` attribute will be used as a default. + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Has no effect if tokenize is `False`. Acceptable + values are: + - `'tf'`: Return TensorFlow `tf.Tensor` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. + return_dict (`bool`, defaults to `False`): + Whether to return a dictionary with named outputs. Has no effect if tokenize is `False`. + tokenizer_kwargs (`Dict[str: Any]`, *optional*): Additional kwargs to pass to the tokenizer. + return_assistant_tokens_mask (`bool`, defaults to `False`): + Whether to return a mask of the assistant generated tokens. For tokens generated by the assistant, + the mask will contain 1. For user and system tokens, the mask will contain 0. + This functionality is only available for chat templates that support it via the `{% generation %}` keyword. + **kwargs: Additional kwargs to pass to the template renderer. Will be accessible by the chat template. + + Returns: + `Union[List[int], Dict]`: A list of token ids representing the tokenized chat so far, including control tokens. This + output is ready to pass to the model, either directly or via methods like `generate()`. If `return_dict` is + set, will return a dict of tokenizer outputs instead. + """ + + if return_dict and not tokenize: + raise ValueError( + "`return_dict=True` is incompatible with `tokenize=False`, because there is no dict " + "of tokenizer outputs to return." + ) + + if return_assistant_tokens_mask and not return_dict: + raise ValueError("`return_assistant_tokens_mask=True` is incompatible with `return_dict=False`") + + if tokenizer_kwargs is None: + tokenizer_kwargs = {} + + chat_template = self.get_chat_template(chat_template, tools) + + if return_assistant_tokens_mask and not re.search(r"\{\%-?\s*generation\s*-?\%\}", chat_template): + logger.warning_once( + "return_assistant_tokens_mask==True but chat template does not contain `{% generation %}` keyword." + ) + + # Compilation function uses a cache to avoid recompiling the same template + compiled_template = _compile_jinja_template(chat_template) + + if isinstance(conversation, (list, tuple)) and ( + isinstance(conversation[0], (list, tuple)) or hasattr(conversation[0], "messages") + ): + conversations = conversation + is_batched = True + else: + conversations = [conversation] + is_batched = False + + if continue_final_message: + if add_generation_prompt: + raise ValueError( + "continue_final_message and add_generation_prompt are not compatible. Use continue_final_message when you want the model to continue the final message, and add_generation_prompt when you want to add a header that will prompt it to start a new assistant message instead." + ) + if return_assistant_tokens_mask: + raise ValueError("continue_final_message is not compatible with return_assistant_tokens_mask.") + + # We accept either JSON schemas or functions for tools. If we get functions, we convert them to schemas + if tools is not None: + tool_schemas = [] + for tool in tools: + if isinstance(tool, dict): + tool_schemas.append(tool) + elif isfunction(tool): + tool_schemas.append(get_json_schema(tool)) + else: + raise ValueError( + "Tools should either be a JSON schema, or a callable function with type hints " + "and a docstring suitable for auto-conversion to a schema." + ) + else: + tool_schemas = None + + if documents is not None: + for document in documents: + if not isinstance(document, dict): + raise TypeError("Documents should be a list of dicts with 'title' and 'text' keys!") + + rendered = [] + all_generation_indices = [] + template_kwargs = {**self.special_tokens_map, **kwargs} # kwargs overwrite special tokens if both are present + for chat in conversations: + if hasattr(chat, "messages"): + # Indicates it's a Conversation object + chat = chat.messages + if return_assistant_tokens_mask: + rendered_chat, generation_indices = _render_with_assistant_indices( + compiled_template=compiled_template, + messages=chat, + tools=tool_schemas, + documents=documents, + add_generation_prompt=add_generation_prompt, + **template_kwargs, + ) + all_generation_indices.append(generation_indices) + else: + rendered_chat = compiled_template.render( + messages=chat, + tools=tool_schemas, + documents=documents, + add_generation_prompt=add_generation_prompt, + **template_kwargs, + ) + if continue_final_message: + final_message = chat[-1]["content"] + if isinstance(final_message, (list, tuple)): + for content_block in reversed(final_message): + if "text" in content_block: + # Pick the last text block in the message (the first one we hit while iterating in reverse) + final_message = content_block["text"] + break + else: + raise ValueError( + "continue_final_message is set but we could not find any text to continue" + "in the final message!" + ) + if final_message.strip() not in rendered_chat: + raise ValueError( + "continue_final_message is set but the final message does not appear in the chat after " + "applying the chat template! This can happen if the chat template deletes portions of " + "the final message. Please verify the chat template and final message in your chat to " + "ensure they are compatible." + ) + final_msg_loc = rendered_chat.rindex(final_message.strip()) + if rendered_chat[final_msg_loc : final_msg_loc + len(final_message.lstrip())] == final_message: + # The template preserves spacing or the message doesn't have trailing spacing, so things are simple + rendered_chat = rendered_chat[: final_msg_loc + len(final_message.lstrip())] + else: + # The message has trailing spacing that was trimmed, so we must be more cautious + rendered_chat = rendered_chat[: final_msg_loc + len(final_message.strip())] + rendered.append(rendered_chat) + + if not is_batched: + rendered = rendered[0] + + if tokenize: + out = self( + rendered, + padding=padding, + truncation=truncation, + max_length=max_length, + add_special_tokens=False, + return_tensors=return_tensors, + **tokenizer_kwargs, + ) + if return_dict: + if return_assistant_tokens_mask: + assistant_masks = [] + if is_batched or return_tensors: + input_ids = out["input_ids"] + else: + input_ids = [out["input_ids"]] + for i in range(len(input_ids)): + current_mask = [0] * len(input_ids[i]) + for assistant_start_char, assistant_end_char in all_generation_indices[i]: + start_token = out.char_to_token(i, assistant_start_char) + end_token = out.char_to_token(i, assistant_end_char - 1) + if start_token is None: + # start_token is out of bounds maybe due to truncation. + break + for token_id in range(start_token, end_token + 1 if end_token else len(input_ids[i])): + current_mask[token_id] = 1 + assistant_masks.append(current_mask) + + if not is_batched and not return_tensors: + assistant_masks = assistant_masks[0] + + out["assistant_masks"] = assistant_masks + + if return_tensors: + out.convert_to_tensors(tensor_type=return_tensors) + + return out + else: + return out["input_ids"] + else: + return rendered + + def get_chat_template(self, chat_template: Optional[str] = None, tools: Optional[List[Dict]] = None) -> str: + """ + Retrieve the chat template string used for tokenizing chat messages. This template is used + internally by the `apply_chat_template` method and can also be used externally to retrieve the model's chat + template for better generation tracking. + + Args: + chat_template (`str`, *optional*): + A Jinja template or the name of a template to use for this conversion. + It is usually not necessary to pass anything to this argument, + as the model's template will be used by default. + tools (`List[Dict]`, *optional*): + A list of tools (callable functions) that will be accessible to the model. If the template does not + support function calling, this argument will have no effect. Each tool should be passed as a JSON Schema, + giving the name, description and argument types for the tool. See our + [chat templating guide](https://huggingface.co/docs/transformers/main/en/chat_templating#automated-function-conversion-for-tool-use) + for more information. + + Returns: + `str`: The chat template string. + """ + # First, handle the cases when the model has a dict of multiple templates + if isinstance(self.chat_template, dict): + template_dict = self.chat_template + if chat_template is not None and chat_template in template_dict: + # The user can pass the name of a template to the chat template argument instead of an entire template + chat_template = template_dict[chat_template] + elif chat_template is None: + if tools is not None and "tool_use" in template_dict: + chat_template = template_dict["tool_use"] + elif "default" in template_dict: + chat_template = template_dict["default"] + else: + raise ValueError( + "This model has multiple chat templates with no default specified! Please either pass a chat " + "template or the name of the template you wish to use to the `chat_template` argument. Available " + f"template names are {sorted(template_dict.keys())}." + ) + + elif chat_template is None: + # These are the cases when the model has a single template + # priority: `chat_template` argument > `tokenizer.chat_template` + if self.chat_template is not None: + chat_template = self.chat_template + else: + raise ValueError( + "Cannot use chat template functions because tokenizer.chat_template is not set and no template " + "argument was passed! For information about writing templates and setting the " + "tokenizer.chat_template attribute, please see the documentation at " + "https://huggingface.co/docs/transformers/main/en/chat_templating" + ) + + return chat_template + + @classmethod + def from_pretrained( + cls, + pretrained_model_name_or_path: Union[str, os.PathLike], + *init_inputs, + cache_dir: Optional[Union[str, os.PathLike]] = None, + force_download: bool = False, + local_files_only: bool = False, + token: Optional[Union[str, bool]] = None, + revision: str = "main", + trust_remote_code=False, + **kwargs, + ): + r""" + Instantiate a [`~tokenization_utils_base.PreTrainedTokenizerBase`] (or a derived class) from a predefined + tokenizer. + + Args: + pretrained_model_name_or_path (`str` or `os.PathLike`): + Can be either: + + - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co. + - A path to a *directory* containing vocabulary files required by the tokenizer, for instance saved + using the [`~tokenization_utils_base.PreTrainedTokenizerBase.save_pretrained`] method, e.g., + `./my_model_directory/`. + - (**Deprecated**, not applicable to all derived classes) A path or url to a single saved vocabulary + file (if and only if the tokenizer only requires a single vocabulary file like Bert or XLNet), e.g., + `./my_model_directory/vocab.txt`. + cache_dir (`str` or `os.PathLike`, *optional*): + Path to a directory in which a downloaded predefined tokenizer vocabulary files should be cached if the + standard cache should not be used. + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force the (re-)download the vocabulary files and override the cached versions if they + exist. + resume_download: + Deprecated and ignored. All downloads are now resumed by default when possible. + Will be removed in v5 of Transformers. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}`. The proxies are used on each request. + token (`str` or *bool*, *optional*): + The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated + when running `huggingface-cli login` (stored in `~/.huggingface`). + local_files_only (`bool`, *optional*, defaults to `False`): + Whether or not to only rely on local files and not to attempt to download any files. + revision (`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a + git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any + identifier allowed by git. + subfolder (`str`, *optional*): + In case the relevant files are located inside a subfolder of the model repo on huggingface.co (e.g. for + facebook/rag-token-base), specify it here. + inputs (additional positional arguments, *optional*): + Will be passed along to the Tokenizer `__init__` method. + trust_remote_code (`bool`, *optional*, defaults to `False`): + Whether or not to allow for custom models defined on the Hub in their own modeling files. This option + should only be set to `True` for repositories you trust and in which you have read the code, as it will + execute code present on the Hub on your local machine. + kwargs (additional keyword arguments, *optional*): + Will be passed to the Tokenizer `__init__` method. Can be used to set special tokens like `bos_token`, + `eos_token`, `unk_token`, `sep_token`, `pad_token`, `cls_token`, `mask_token`, + `additional_special_tokens`. See parameters in the `__init__` for more details. + + + + Passing `token=True` is required when you want to use a private model. + + + + Examples: + + ```python + # We can't instantiate directly the base class *PreTrainedTokenizerBase* so let's show our examples on a derived class: BertTokenizer + # Download vocabulary from huggingface.co and cache. + tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased") + + # Download vocabulary from huggingface.co (user-uploaded) and cache. + tokenizer = BertTokenizer.from_pretrained("dbmdz/bert-base-german-cased") + + # If vocabulary files are in a directory (e.g. tokenizer was saved using *save_pretrained('./test/saved_model/')*) + tokenizer = BertTokenizer.from_pretrained("./test/saved_model/") + + # If the tokenizer uses a single vocabulary file, you can point directly to this file + tokenizer = BertTokenizer.from_pretrained("./test/saved_model/my_vocab.txt") + + # You can link tokens to special vocabulary when instantiating + tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased", unk_token="") + # You should be sure '' is in the vocabulary when doing that. + # Otherwise use tokenizer.add_special_tokens({'unk_token': ''}) instead) + assert tokenizer.unk_token == "" + ```""" + resume_download = kwargs.pop("resume_download", None) + proxies = kwargs.pop("proxies", None) + use_auth_token = kwargs.pop("use_auth_token", None) + subfolder = kwargs.pop("subfolder", None) + from_pipeline = kwargs.pop("_from_pipeline", None) + from_auto_class = kwargs.pop("_from_auto", False) + commit_hash = kwargs.pop("_commit_hash", None) + gguf_file = kwargs.get("gguf_file", None) + + if use_auth_token is not None: + warnings.warn( + "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.", + FutureWarning, + ) + if token is not None: + raise ValueError( + "`token` and `use_auth_token` are both specified. Please set only the argument `token`." + ) + token = use_auth_token + + user_agent = {"file_type": "tokenizer", "from_auto_class": from_auto_class, "is_fast": "Fast" in cls.__name__} + if from_pipeline is not None: + user_agent["using_pipeline"] = from_pipeline + + if is_offline_mode() and not local_files_only: + logger.info("Offline mode: forcing local_files_only=True") + local_files_only = True + + pretrained_model_name_or_path = str(pretrained_model_name_or_path) + vocab_files = {} + init_configuration = {} + + is_local = os.path.isdir(pretrained_model_name_or_path) + single_file_id = None + if os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path): + if len(cls.vocab_files_names) > 1 and not gguf_file: + raise ValueError( + f"Calling {cls.__name__}.from_pretrained() with the path to a single file or url is not " + "supported for this tokenizer. Use a model identifier or the path to a directory instead." + ) + warnings.warn( + f"Calling {cls.__name__}.from_pretrained() with the path to a single file or url is deprecated and " + "won't be possible anymore in v5. Use a model identifier or the path to a directory instead.", + FutureWarning, + ) + file_id = list(cls.vocab_files_names.keys())[0] + + vocab_files[file_id] = pretrained_model_name_or_path + single_file_id = file_id + else: + if gguf_file: + vocab_files["vocab_file"] = gguf_file + else: + # At this point pretrained_model_name_or_path is either a directory or a model identifier name + additional_files_names = { + "added_tokens_file": ADDED_TOKENS_FILE, # kept only for legacy + "special_tokens_map_file": SPECIAL_TOKENS_MAP_FILE, # kept only for legacy + "tokenizer_config_file": TOKENIZER_CONFIG_FILE, + # tokenizer_file used to initialize a slow from a fast. Properly copy the `addedTokens` instead of adding in random orders + "tokenizer_file": FULL_TOKENIZER_FILE, + "chat_template_file": CHAT_TEMPLATE_FILE, + } + + vocab_files = {**cls.vocab_files_names, **additional_files_names} + if "tokenizer_file" in vocab_files: + # Try to get the tokenizer config to see if there are versioned tokenizer files. + fast_tokenizer_file = FULL_TOKENIZER_FILE + + try: + resolved_config_file = cached_file( + pretrained_model_name_or_path, + TOKENIZER_CONFIG_FILE, + cache_dir=cache_dir, + force_download=force_download, + resume_download=resume_download, + proxies=proxies, + token=token, + revision=revision, + local_files_only=local_files_only, + subfolder=subfolder, + user_agent=user_agent, + _raise_exceptions_for_missing_entries=False, + _commit_hash=commit_hash, + ) + except OSError: + # Re-raise any error raised by cached_file in order to get a helpful error message + raise + except Exception: + # For any other exception, we throw a generic error. + raise OSError( + f"Can't load tokenizer for '{pretrained_model_name_or_path}'. If you were trying to load it from " + "'https://huggingface.co/models', make sure you don't have a local directory with the same name. " + f"Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory " + f"containing all relevant files for a {cls.__name__} tokenizer." + ) + + commit_hash = extract_commit_hash(resolved_config_file, commit_hash) + if resolved_config_file is not None: + with open(resolved_config_file, encoding="utf-8") as reader: + tokenizer_config = json.load(reader) + if "fast_tokenizer_files" in tokenizer_config: + fast_tokenizer_file = get_fast_tokenizer_file(tokenizer_config["fast_tokenizer_files"]) + vocab_files["tokenizer_file"] = fast_tokenizer_file + + # This block looks for any extra chat template files + if is_local: + template_dir = Path(pretrained_model_name_or_path, CHAT_TEMPLATE_DIR) + if template_dir.is_dir(): + for template_file in template_dir.glob("*.jinja"): + template_name = template_file.name.removesuffix(".jinja") + vocab_files[f"chat_template_{template_name}"] = ( + f"{CHAT_TEMPLATE_DIR}/{template_file.name}" + ) + else: + for template in list_repo_templates( + pretrained_model_name_or_path, + local_files_only=local_files_only, + revision=revision, + cache_dir=cache_dir, + ): + vocab_files[f"chat_template_{template}"] = f"{CHAT_TEMPLATE_DIR}/{template}.jinja" + + # Get files from url, cache, or disk depending on the case + resolved_vocab_files = {} + for file_id, file_path in vocab_files.items(): + if file_path is None: + resolved_vocab_files[file_id] = None + elif single_file_id == file_id: + if os.path.isfile(file_path): + resolved_vocab_files[file_id] = file_path + elif is_remote_url(file_path): + resolved_vocab_files[file_id] = download_url(file_path, proxies=proxies) + else: + try: + resolved_vocab_files[file_id] = cached_file( + pretrained_model_name_or_path, + file_path, + cache_dir=cache_dir, + force_download=force_download, + proxies=proxies, + resume_download=resume_download, + local_files_only=local_files_only, + token=token, + user_agent=user_agent, + revision=revision, + subfolder=subfolder, + _raise_exceptions_for_missing_entries=False, + _commit_hash=commit_hash, + ) + except OSError: + # Re-raise any error raised by cached_file in order to get a helpful error message + raise + except Exception: + # For any other exception, we throw a generic error. + raise OSError( + f"Can't load tokenizer for '{pretrained_model_name_or_path}'. If you were trying to load it from " + "'https://huggingface.co/models', make sure you don't have a local directory with the same name. " + f"Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory " + f"containing all relevant files for a {cls.__name__} tokenizer." + ) + commit_hash = extract_commit_hash(resolved_vocab_files[file_id], commit_hash) + + for file_id, file_path in vocab_files.items(): + if file_id not in resolved_vocab_files: + continue + + if is_local: + logger.info(f"loading file {file_path}") + else: + logger.info(f"loading file {file_path} from cache at {resolved_vocab_files[file_id]}") + + return cls._from_pretrained( + resolved_vocab_files, + pretrained_model_name_or_path, + init_configuration, + *init_inputs, + token=token, + cache_dir=cache_dir, + local_files_only=local_files_only, + _commit_hash=commit_hash, + _is_local=is_local, + trust_remote_code=trust_remote_code, + **kwargs, + ) + + @classmethod + def _from_pretrained( + cls, + resolved_vocab_files, + pretrained_model_name_or_path, + init_configuration, + *init_inputs, + token=None, + cache_dir=None, + local_files_only=False, + _commit_hash=None, + _is_local=False, + trust_remote_code=False, + **kwargs, + ): + # We instantiate fast tokenizers based on a slow tokenizer if we don't have access to the tokenizer.json + # file or if `from_slow` is set to True. + from_slow = kwargs.get("from_slow", False) + gguf_file = kwargs.get("gguf_file", None) + has_tokenizer_file = resolved_vocab_files.get("tokenizer_file", None) is not None + + # If one passes a GGUF file path to `gguf_file` there is no need for this check as the tokenizer will be + # loaded directly from the GGUF file. + if (from_slow or not has_tokenizer_file) and cls.slow_tokenizer_class is not None and not gguf_file: + slow_tokenizer = (cls.slow_tokenizer_class)._from_pretrained( + copy.deepcopy(resolved_vocab_files), + pretrained_model_name_or_path, + copy.deepcopy(init_configuration), + *init_inputs, + token=token, + cache_dir=cache_dir, + local_files_only=local_files_only, + _commit_hash=_commit_hash, + **(copy.deepcopy(kwargs)), + ) + else: + slow_tokenizer = None + + # Prepare tokenizer initialization kwargs + # Did we saved some inputs and kwargs to reload ? + tokenizer_config_file = resolved_vocab_files.pop("tokenizer_config_file", None) + if tokenizer_config_file is not None: + with open(tokenizer_config_file, encoding="utf-8") as tokenizer_config_handle: + init_kwargs = json.load(tokenizer_config_handle) + # First attempt. We get tokenizer_class from tokenizer_config to check mismatch between tokenizers. + config_tokenizer_class = init_kwargs.get("tokenizer_class") + init_kwargs.pop("tokenizer_class", None) + if not has_tokenizer_file: + init_kwargs.pop("tokenizer_file", None) + saved_init_inputs = init_kwargs.pop("init_inputs", ()) + if not init_inputs: + init_inputs = saved_init_inputs + else: + config_tokenizer_class = None + init_kwargs = init_configuration + + # If independent chat template file(s) exist, they take priority over template entries in the tokenizer config + chat_templates = {} + chat_template_file = resolved_vocab_files.pop("chat_template_file", None) + extra_chat_templates = [key for key in resolved_vocab_files if key.startswith("chat_template_")] + if chat_template_file is not None: + with open(chat_template_file) as chat_template_handle: + chat_templates["default"] = chat_template_handle.read() + for extra_chat_template in extra_chat_templates: + template_file = resolved_vocab_files.pop(extra_chat_template, None) + if template_file is None: + continue # I think this should never happen, but just in case + template_name = extra_chat_template.removeprefix("chat_template_") + with open(template_file) as chat_template_handle: + chat_templates[template_name] = chat_template_handle.read() + if len(chat_templates) == 1 and "default" in chat_templates: + init_kwargs["chat_template"] = chat_templates["default"] + elif chat_templates: + init_kwargs["chat_template"] = chat_templates + + if not _is_local: + if "auto_map" in init_kwargs: + # For backward compatibility with odl format. + if isinstance(init_kwargs["auto_map"], (tuple, list)): + init_kwargs["auto_map"] = {"AutoTokenizer": init_kwargs["auto_map"]} + init_kwargs["auto_map"] = add_model_info_to_auto_map( + init_kwargs["auto_map"], pretrained_model_name_or_path + ) + if "custom_pipelines" in init_kwargs: + init_kwargs["custom_pipelines"] = add_model_info_to_custom_pipelines( + init_kwargs["custom_pipelines"], pretrained_model_name_or_path + ) + + if config_tokenizer_class is None: + # Matt: This entire block is only used to decide if the tokenizer class matches the class in the repo. + # If not, it raises a warning, but otherwise continues. Since we mostly load tokenizers with + # AutoTokenizer these days, it seems like a lot of work (and a source of bugs) for little gain. + # Maybe we can just remove this entirely? + from .models.auto.configuration_auto import AutoConfig # tests_ignore + + # Second attempt. If we have not yet found tokenizer_class, let's try to use the config. + try: + config = AutoConfig.from_pretrained( + pretrained_model_name_or_path, + token=token, + cache_dir=cache_dir, + local_files_only=local_files_only, + trust_remote_code=trust_remote_code, + _commit_hash=_commit_hash, + ) + config_tokenizer_class = config.tokenizer_class + except (OSError, ValueError, KeyError): + # skip if an error occurred. + config = None + if config_tokenizer_class is None: + # Third attempt. If we have not yet found the original type of the tokenizer, + # we are loading we see if we can infer it from the type of the configuration file + from .models.auto.tokenization_auto import TOKENIZER_MAPPING_NAMES # tests_ignore + + if hasattr(config, "model_type"): + model_type = config.model_type + else: + # Fallback: use pattern matching on the string. + model_type = None + for pattern in TOKENIZER_MAPPING_NAMES.keys(): + if pattern in str(pretrained_model_name_or_path): + model_type = pattern + break + + if model_type is not None: + config_tokenizer_class, config_tokenizer_class_fast = TOKENIZER_MAPPING_NAMES.get( + model_type, (None, None) + ) + if config_tokenizer_class is None: + config_tokenizer_class = config_tokenizer_class_fast + + if config_tokenizer_class is not None: + if cls.__name__.replace("Fast", "") != config_tokenizer_class.replace("Fast", ""): + logger.warning( + "The tokenizer class you load from this checkpoint is not the same type as the class this" + " function is called from. It may result in unexpected tokenization. \nThe tokenizer class you" + f" load from this checkpoint is '{config_tokenizer_class}'. \nThe class this function is called" + f" from is '{cls.__name__}'." + ) + + # Update with newly provided kwargs + init_kwargs.update(kwargs) + + # Merge resolved_vocab_files arguments in init_kwargs. + added_tokens_file = resolved_vocab_files.pop("added_tokens_file", None) + special_tokens_map_file = resolved_vocab_files.pop("special_tokens_map_file", None) + for args_name, file_path in resolved_vocab_files.items(): + if args_name not in init_kwargs: + init_kwargs[args_name] = file_path + tokenizer_file = resolved_vocab_files.pop("tokenizer_file", None) + + if slow_tokenizer is not None: + init_kwargs["__slow_tokenizer"] = slow_tokenizer + init_kwargs["name_or_path"] = pretrained_model_name_or_path + + #### Handle tokenizer serialization of added and special tokens + added_tokens_decoder: Dict[int, AddedToken] = {} + added_tokens_map: Dict[str, AddedToken] = {} + # if we have info on the slow added tokens + if "added_tokens_decoder" in init_kwargs: + for idx, token in init_kwargs["added_tokens_decoder"].items(): + if isinstance(token, dict): + token = AddedToken(**token) + if isinstance(token, AddedToken): + added_tokens_decoder[int(idx)] = token + added_tokens_map[str(token)] = token + else: + raise ValueError( + f"Found a {token.__class__} in the saved `added_tokens_decoder`, should be a dictionary or an AddedToken instance" + ) + else: + # begin legacy: read the added_tokens_file and update kwargs with special_tokens_map if modified + if special_tokens_map_file is not None: + with open(special_tokens_map_file, encoding="utf-8") as special_tokens_map_handle: + special_tokens_map = json.load(special_tokens_map_handle) + for key, value in special_tokens_map.items(): + if key in kwargs and kwargs[key]: + # This value has already been redefined by the kwargs + # We keep this new value and ignore the one stored in the special_tokens_map_file + continue + if isinstance(value, dict): + value["special"] = True + value = AddedToken(**value) + elif key == "additional_special_tokens" and isinstance(value, list): + additional_special_tokens = init_kwargs.pop("additional_special_tokens", []) or [] + for token in value: + if isinstance(token, dict): + token["special"] = True + token = AddedToken(**token) + if token not in additional_special_tokens: + additional_special_tokens.append(token) + value = additional_special_tokens + init_kwargs[key] = value + + # slow -> slow|fast, legacy: convert the `"added_tokens.json"` file to `added_tokens_decoder`. + # this is for legacy purpose. We don't add the tokens after init for efficiency. + if added_tokens_file is not None: + special_tokens = [] + for key in cls.SPECIAL_TOKENS_ATTRIBUTES & init_kwargs.keys(): + if init_kwargs[key] is not None: + if key == "additional_special_tokens": + special_tokens += [str(token) for token in init_kwargs[key]] + else: + special_tokens.append(str(init_kwargs[key])) + + with open(added_tokens_file, encoding="utf-8") as added_tokens_handle: + added_tok_encoder = json.load(added_tokens_handle) + for str_token, index in added_tok_encoder.items(): + # if index not in added_tokens_decoder and str_token not in added_tokens_map: + special = str_token in special_tokens + added_tokens_decoder[index] = AddedToken( + str_token, rstrip=False, lstrip=False, normalized=not special, special=special + ) + added_tokens_map[str(token)] = added_tokens_decoder[index] + + # allows converting a fast -> slow: add the `tokenizer.json`'s `"added_tokens"` to the slow tokenizer + # if `tokenizer_config.json` is `None` + if tokenizer_file is not None: + # This is for slow so can be done before + with open(tokenizer_file, encoding="utf-8") as tokenizer_file_handle: + tokenizer_file_handle = json.load(tokenizer_file_handle) + added_tokens = tokenizer_file_handle.pop("added_tokens") + for serialized_tokens in added_tokens: + idx = serialized_tokens.pop("id") + added_tokens_decoder[idx] = AddedToken(**serialized_tokens) + added_tokens_map[str(added_tokens_decoder[idx])] = added_tokens_decoder[idx] + # end legacy + + # Passing AddedTokens and not strings to the class to prevent it from casting the string to a different AddedToken + # convert {'__type': 'AddedToken', 'content': '', 'lstrip': False, 'normalized': True, ...} to AddedTokens + init_kwargs["added_tokens_decoder"] = added_tokens_decoder + init_kwargs = cls.convert_added_tokens(init_kwargs, save=False) + for key in cls.SPECIAL_TOKENS_ATTRIBUTES & init_kwargs.keys(): + if added_tokens_map != {} and init_kwargs[key] is not None: + if key != "additional_special_tokens": + init_kwargs[key] = added_tokens_map.get(str(init_kwargs[key]), init_kwargs[key]) + + # Instantiate the tokenizer. + try: + tokenizer = cls(*init_inputs, **init_kwargs) + except import_protobuf_decode_error(): + logger.info( + "Unable to load tokenizer model from SPM, loading from TikToken will be attempted instead." + "(Google protobuf error: Tried to load SPM model with non-SPM vocab file).", + ) + return False + except RuntimeError as e: + if "sentencepiece_processor.cc" in str(e): + logger.info( + "Unable to load tokenizer model from SPM, loading from TikToken will be attempted instead." + "(SentencePiece RuntimeError: Tried to load SPM model with non-SPM vocab file).", + ) + return False + except OSError: + raise OSError( + "Unable to load vocabulary from file. " + "Please check that the provided vocabulary is accessible and not corrupted." + ) + + if added_tokens_decoder != {} and max(list(added_tokens_decoder.keys())[-1], 0) > tokenizer.vocab_size: + logger.info( + "Special tokens have been added in the vocabulary, make sure the associated word embeddings are" + " fine-tuned or trained." + ) + return tokenizer + + @staticmethod + def _eventually_correct_t5_max_length(pretrained_model_name_or_path, max_model_length, init_max_model_length): + # This method should be deleted in Transformers v5 + # Its only purpose is to potentially throw a warning + # that incorrectly defined max lengths of T5's tokenizer are used + # which we will correct in Transformers v5. + return max_model_length + + @classmethod + def convert_added_tokens(cls, obj: Union[AddedToken, Any], save=False, add_type_field=True): + if isinstance(obj, dict) and "__type" in obj and obj["__type"] == "AddedToken": + obj.pop("__type") + return AddedToken(**obj) + if isinstance(obj, AddedToken) and save: + obj = obj.__getstate__() + if add_type_field: + obj["__type"] = "AddedToken" + else: + # Don't save "special" for previous tokenizers + obj.pop("special") + return obj + elif isinstance(obj, (list, tuple)): + return [cls.convert_added_tokens(o, save=save, add_type_field=add_type_field) for o in obj] + elif isinstance(obj, dict): + return {k: cls.convert_added_tokens(v, save=save, add_type_field=add_type_field) for k, v in obj.items()} + return obj + + def save_chat_templates( + self, + save_directory: Union[str, os.PathLike], + tokenizer_config: dict, + filename_prefix: Optional[str], + save_jinja_files: bool, + ): + """ + Writes chat templates out to the save directory if we're using the new format, and removes them from + the tokenizer config if present. If we're using the legacy format, it doesn't write any files, and instead + writes the templates to the tokenizer config in the correct format. + """ + chat_template_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + CHAT_TEMPLATE_FILE + ) + chat_template_dir = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + CHAT_TEMPLATE_DIR + ) + + saved_raw_chat_template_files = [] + if save_jinja_files and isinstance(self.chat_template, str): + # New format for single templates is to save them as chat_template.jinja + with open(chat_template_file, "w", encoding="utf-8") as f: + f.write(self.chat_template) + logger.info(f"chat template saved in {chat_template_file}") + saved_raw_chat_template_files.append(chat_template_file) + if "chat_template" in tokenizer_config: + tokenizer_config.pop("chat_template") # To ensure it doesn't somehow end up in the config too + elif save_jinja_files and isinstance(self.chat_template, dict): + # New format for multiple templates is to save the default as chat_template.jinja + # and the other templates in the chat_templates/ directory + for template_name, template in self.chat_template.items(): + if template_name == "default": + with open(chat_template_file, "w", encoding="utf-8") as f: + f.write(self.chat_template["default"]) + logger.info(f"chat template saved in {chat_template_file}") + saved_raw_chat_template_files.append(chat_template_file) + else: + Path(chat_template_dir).mkdir(exist_ok=True) + template_filepath = os.path.join(chat_template_dir, f"{template_name}.jinja") + with open(template_filepath, "w", encoding="utf-8") as f: + f.write(template) + logger.info(f"chat template saved in {template_filepath}") + saved_raw_chat_template_files.append(template_filepath) + if "chat_template" in tokenizer_config: + tokenizer_config.pop("chat_template") # To ensure it doesn't somehow end up in the config too + elif isinstance(self.chat_template, dict): + # Legacy format for multiple templates: + # chat template dicts are saved to the config as lists of dicts with fixed key names. + tokenizer_config["chat_template"] = [{"name": k, "template": v} for k, v in self.chat_template.items()] + elif self.chat_template is not None: + # Legacy format for single templates: Just make them a key in tokenizer_config.json + tokenizer_config["chat_template"] = self.chat_template + return tokenizer_config, saved_raw_chat_template_files + + def save_pretrained( + self, + save_directory: Union[str, os.PathLike], + legacy_format: Optional[bool] = None, + filename_prefix: Optional[str] = None, + push_to_hub: bool = False, + **kwargs, + ) -> Tuple[str]: + """ + Save the full tokenizer state. + + + This method make sure the full tokenizer can then be re-loaded using the + [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`] class method.. + + Warning,None This won't save modifications you may have applied to the tokenizer after the instantiation (for + instance, modifying `tokenizer.do_lower_case` after creation). + + Args: + save_directory (`str` or `os.PathLike`): The path to a directory where the tokenizer will be saved. + legacy_format (`bool`, *optional*): + Only applicable for a fast tokenizer. If unset (default), will save the tokenizer in the unified JSON + format as well as in legacy format if it exists, i.e. with tokenizer specific vocabulary and a separate + added_tokens files. + + If `False`, will only save the tokenizer in the unified JSON format. This format is incompatible with + "slow" tokenizers (not powered by the *tokenizers* library), so the tokenizer will not be able to be + loaded in the corresponding "slow" tokenizer. + + If `True`, will save the tokenizer in legacy format. If the "slow" tokenizer doesn't exits, a value + error is raised. + filename_prefix (`str`, *optional*): + A prefix to add to the names of the files saved by the tokenizer. + push_to_hub (`bool`, *optional*, defaults to `False`): + Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the + repository you want to push to with `repo_id` (will default to the name of `save_directory` in your + namespace). + kwargs (`Dict[str, Any]`, *optional*): + Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method. + + Returns: + A tuple of `str`: The files saved. + """ + use_auth_token = kwargs.pop("use_auth_token", None) + + if use_auth_token is not None: + warnings.warn( + "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.", + FutureWarning, + ) + if kwargs.get("token", None) is not None: + raise ValueError( + "`token` and `use_auth_token` are both specified. Please set only the argument `token`." + ) + kwargs["token"] = use_auth_token + + if os.path.isfile(save_directory): + logger.error(f"Provided path ({save_directory}) should be a directory, not a file") + return + + os.makedirs(save_directory, exist_ok=True) + + if push_to_hub: + commit_message = kwargs.pop("commit_message", None) + repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1]) + repo_id = self._create_repo(repo_id, **kwargs) + files_timestamps = self._get_files_timestamps(save_directory) + + special_tokens_map_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + SPECIAL_TOKENS_MAP_FILE + ) + tokenizer_config_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + TOKENIZER_CONFIG_FILE + ) + + tokenizer_config = copy.deepcopy(self.init_kwargs) + + # Let's save the init kwargs + target_keys = set(self.init_kwargs.keys()) + # Let's save the special tokens map (only the strings) + target_keys.update(["model_max_length", "clean_up_tokenization_spaces"]) + + for k in target_keys: + if hasattr(self, k): + tokenizer_config[k] = getattr(self, k) + + # Let's make sure we properly save the special tokens + tokenizer_config.update(self.special_tokens_map) + if "extra_special_tokens" not in tokenizer_config: + tokenizer_config["extra_special_tokens"] = self.extra_special_tokens + tokenizer_config.update(self.extra_special_tokens) + + save_jinja_files = kwargs.get("save_jinja_files", True) + tokenizer_config, saved_raw_chat_template_files = self.save_chat_templates( + save_directory, tokenizer_config, filename_prefix, save_jinja_files + ) + + if len(self.init_inputs) > 0: + tokenizer_config["init_inputs"] = copy.deepcopy(self.init_inputs) + for file_id in self.vocab_files_names.keys(): + tokenizer_config.pop(file_id, None) + + # no typefields, this way old fast and slow can load it + tokenizer_config = self.convert_added_tokens(tokenizer_config, add_type_field=True, save=True) + + # Process added tokens separately: allows previous versions to ignore it! + added_tokens = {} + for key, value in self.added_tokens_decoder.items(): + added_tokens[key] = value.__getstate__() + tokenizer_config["added_tokens_decoder"] = added_tokens + + # Add tokenizer class to the tokenizer config to be able to reload it with from_pretrained + tokenizer_class = self.__class__.__name__ + # Remove the Fast at the end if we can save the slow tokenizer + if tokenizer_class.endswith("Fast") and getattr(self, "can_save_slow_tokenizer", False): + tokenizer_class = tokenizer_class[:-4] + tokenizer_config["tokenizer_class"] = tokenizer_class + if getattr(self, "_auto_map", None) is not None: + tokenizer_config["auto_map"] = self._auto_map + if getattr(self, "_processor_class", None) is not None: + tokenizer_config["processor_class"] = self._processor_class + + # If we have a custom model, we copy the file defining it in the folder and set the attributes so it can be + # loaded from the Hub. + if self._auto_class is not None: + custom_object_save(self, save_directory, config=tokenizer_config) + + # remove private information + if "name_or_path" in tokenizer_config: + tokenizer_config.pop("name_or_path") + tokenizer_config.pop("special_tokens_map_file", None) + tokenizer_config.pop("tokenizer_file", None) + if "device_map" in tokenizer_config: + tokenizer_config.pop("device_map") + + with open(tokenizer_config_file, "w", encoding="utf-8") as f: + out_str = json.dumps(tokenizer_config, indent=2, sort_keys=True, ensure_ascii=False) + "\n" + f.write(out_str) + logger.info(f"tokenizer config file saved in {tokenizer_config_file}") + + # Sanitize AddedTokens in special_tokens_map + + # kept for forward compatibility, will be removed in transoformers 5. Typefields are not saved for FC, special should not be save either + write_dict = self.convert_added_tokens(self.special_tokens_map_extended, save=True, add_type_field=False) + with open(special_tokens_map_file, "w", encoding="utf-8") as f: + out_str = json.dumps(write_dict, indent=2, sort_keys=True, ensure_ascii=False) + "\n" + f.write(out_str) + logger.info(f"Special tokens file saved in {special_tokens_map_file}") + + file_names = (tokenizer_config_file, special_tokens_map_file, *saved_raw_chat_template_files) + + save_files = self._save_pretrained( + save_directory=save_directory, + file_names=file_names, + legacy_format=legacy_format, + filename_prefix=filename_prefix, + ) + + if push_to_hub: + self._upload_modified_files( + save_directory, + repo_id, + files_timestamps, + commit_message=commit_message, + token=kwargs.get("token"), + ) + + return save_files + + def _save_pretrained( + self, + save_directory: Union[str, os.PathLike], + file_names: Tuple[str], + legacy_format: Optional[bool] = None, + filename_prefix: Optional[str] = None, + ) -> Tuple[str]: + """ + Save a tokenizer using the slow-tokenizer/legacy format: vocabulary + added tokens. + + Fast tokenizers can also be saved in a unique JSON file containing {config + vocab + added-tokens} using the + specific [`~tokenization_utils_fast.PreTrainedTokenizerFast._save_pretrained`] + """ + if legacy_format is False: + raise ValueError( + "Only fast tokenizers (instances of PreTrainedTokenizerFast) can be saved in non legacy format." + ) + + save_directory = str(save_directory) + + added_tokens_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + ADDED_TOKENS_FILE + ) + # the new get_added_vocab() also returns special tokens and tokens that have an index < vocab_size + added_vocab = {tok: index for tok, index in self.added_tokens_encoder.items() if index >= self.vocab_size} + if added_vocab: + with open(added_tokens_file, "w", encoding="utf-8") as f: + out_str = json.dumps(added_vocab, indent=2, sort_keys=True, ensure_ascii=False) + "\n" + f.write(out_str) + logger.info(f"added tokens file saved in {added_tokens_file}") + + vocab_files = self.save_vocabulary(save_directory, filename_prefix=filename_prefix) + + return file_names + vocab_files + (added_tokens_file,) + + def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: + """ + Save only the vocabulary of the tokenizer (vocabulary + added tokens). + + This method won't save the configuration and special token mappings of the tokenizer. Use + [`~PreTrainedTokenizerFast._save_pretrained`] to save the whole state of the tokenizer. + + Args: + save_directory (`str`): + The directory in which to save the vocabulary. + filename_prefix (`str`, *optional*): + An optional prefix to add to the named of the saved files. + + Returns: + `Tuple(str)`: Paths to the files saved. + """ + raise NotImplementedError + + def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> List[str]: + """ + Converts a string into a sequence of tokens, replacing unknown tokens with the `unk_token`. + + Args: + text (`str`): + The sequence to be encoded. + pair (`str`, *optional*): + A second sequence to be encoded with the first. + add_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not to add the special tokens associated with the corresponding model. + kwargs (additional keyword arguments, *optional*): + Will be passed to the underlying model specific encode method. See details in + [`~PreTrainedTokenizerBase.__call__`] + + Returns: + `List[str]`: The list of tokens. + """ + raise NotImplementedError + + @add_end_docstrings( + ENCODE_KWARGS_DOCSTRING, + """ + **kwargs: Passed along to the `.tokenize()` method. + """, + """ + Returns: + `List[int]`, `torch.Tensor`, `tf.Tensor` or `np.ndarray`: The tokenized ids of the text. + """, + ) + def encode( + self, + text: Union[TextInput, PreTokenizedInput, EncodedInput], + text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None, + add_special_tokens: bool = True, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str, TruncationStrategy, None] = None, + max_length: Optional[int] = None, + stride: int = 0, + padding_side: Optional[str] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + **kwargs, + ) -> List[int]: + """ + Converts a string to a sequence of ids (integer), using the tokenizer and vocabulary. + + Same as doing `self.convert_tokens_to_ids(self.tokenize(text))`. + + Args: + text (`str`, `List[str]` or `List[int]`): + The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the + `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids` + method). + text_pair (`str`, `List[str]` or `List[int]`, *optional*): + Optional second sequence to be encoded. This can be a string, a list of strings (tokenized string using + the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids` + method). + """ + encoded_inputs = self.encode_plus( + text, + text_pair=text_pair, + add_special_tokens=add_special_tokens, + padding=padding, + truncation=truncation, + max_length=max_length, + stride=stride, + padding_side=padding_side, + return_tensors=return_tensors, + **kwargs, + ) + + return encoded_inputs["input_ids"] + + def num_special_tokens_to_add(self, pair: bool = False) -> int: + raise NotImplementedError + + def _get_padding_truncation_strategies( + self, padding=False, truncation=None, max_length=None, pad_to_multiple_of=None, verbose=True, **kwargs + ): + """ + Find the correct padding/truncation strategy with backward compatibility for old arguments (truncation_strategy + and pad_to_max_length) and behaviors. + """ + old_truncation_strategy = kwargs.pop("truncation_strategy", "do_not_truncate") + old_pad_to_max_length = kwargs.pop("pad_to_max_length", False) + + # Backward compatibility for previous behavior, maybe we should deprecate it: + # If you only set max_length, it activates truncation for max_length + if max_length is not None and padding is False and truncation is None: + if verbose: + if not self.deprecation_warnings.get("Truncation-not-explicitly-activated", False): + logger.warning( + "Truncation was not explicitly activated but `max_length` is provided a specific value, please" + " use `truncation=True` to explicitly truncate examples to max length. Defaulting to" + " 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the" + " tokenizer you can select this strategy more precisely by providing a specific strategy to" + " `truncation`." + ) + self.deprecation_warnings["Truncation-not-explicitly-activated"] = True + truncation = "longest_first" + + # Get padding strategy + if padding is False and old_pad_to_max_length: + if verbose: + warnings.warn( + "The `pad_to_max_length` argument is deprecated and will be removed in a future version, " + "use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or " + "use `padding='max_length'` to pad to a max length. In this case, you can give a specific " + "length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the " + "maximal input size of the model (e.g. 512 for Bert).", + FutureWarning, + ) + if max_length is None: + padding_strategy = PaddingStrategy.LONGEST + else: + padding_strategy = PaddingStrategy.MAX_LENGTH + elif padding is not False: + if padding is True: + if verbose: + if max_length is not None and ( + truncation is None or truncation is False or truncation == "do_not_truncate" + ): + warnings.warn( + "`max_length` is ignored when `padding`=`True` and there is no truncation strategy. " + "To pad to max length, use `padding='max_length'`." + ) + if old_pad_to_max_length is not False: + warnings.warn("Though `pad_to_max_length` = `True`, it is ignored because `padding`=`True`.") + padding_strategy = PaddingStrategy.LONGEST # Default to pad to the longest sequence in the batch + elif not isinstance(padding, PaddingStrategy): + padding_strategy = PaddingStrategy(padding) + elif isinstance(padding, PaddingStrategy): + padding_strategy = padding + else: + padding_strategy = PaddingStrategy.DO_NOT_PAD + + # Get truncation strategy + if truncation is None and old_truncation_strategy != "do_not_truncate": + if verbose: + warnings.warn( + "The `truncation_strategy` argument is deprecated and will be removed in a future version, use" + " `truncation=True` to truncate examples to a max length. You can give a specific length with" + " `max_length` (e.g. `max_length=45`) or leave max_length to None to truncate to the maximal input" + " size of the model (e.g. 512 for Bert). If you have pairs of inputs, you can give a specific" + " truncation strategy selected among `truncation='only_first'` (will only truncate the first" + " sentence in the pairs) `truncation='only_second'` (will only truncate the second sentence in the" + " pairs) or `truncation='longest_first'` (will iteratively remove tokens from the longest sentence" + " in the pairs).", + FutureWarning, + ) + truncation_strategy = TruncationStrategy(old_truncation_strategy) + elif truncation is not False and truncation is not None: + if truncation is True: + truncation_strategy = ( + TruncationStrategy.LONGEST_FIRST + ) # Default to truncate the longest sequences in pairs of inputs + elif not isinstance(truncation, TruncationStrategy): + truncation_strategy = TruncationStrategy(truncation) + elif isinstance(truncation, TruncationStrategy): + truncation_strategy = truncation + else: + truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE + + # Set max length if needed + if max_length is None: + if padding_strategy == PaddingStrategy.MAX_LENGTH: + if self.model_max_length > LARGE_INTEGER: + if verbose: + if not self.deprecation_warnings.get("Asking-to-pad-to-max_length", False): + logger.warning( + "Asking to pad to max_length but no maximum length is provided and the model has no" + " predefined maximum length. Default to no padding." + ) + self.deprecation_warnings["Asking-to-pad-to-max_length"] = True + padding_strategy = PaddingStrategy.DO_NOT_PAD + else: + max_length = self.model_max_length + + if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE: + if self.model_max_length > LARGE_INTEGER: + if verbose: + if not self.deprecation_warnings.get("Asking-to-truncate-to-max_length", False): + logger.warning( + "Asking to truncate to max_length but no maximum length is provided and the model has" + " no predefined maximum length. Default to no truncation." + ) + self.deprecation_warnings["Asking-to-truncate-to-max_length"] = True + truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE + else: + max_length = self.model_max_length + + # Test if we have a padding token + if padding_strategy != PaddingStrategy.DO_NOT_PAD and (self.pad_token is None or self.pad_token_id < 0): + raise ValueError( + "Asking to pad but the tokenizer does not have a padding token. " + "Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` " + "or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`." + ) + + # Check that we will truncate to a multiple of pad_to_multiple_of if both are provided + if ( + truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE + and padding_strategy != PaddingStrategy.DO_NOT_PAD + and pad_to_multiple_of is not None + and max_length is not None + and (max_length % pad_to_multiple_of != 0) + ): + raise ValueError( + "Truncation and padding are both activated but " + f"truncation length ({max_length}) is not a multiple of pad_to_multiple_of ({pad_to_multiple_of})." + ) + + return padding_strategy, truncation_strategy, max_length, kwargs + + @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING) + def __call__( + self, + text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput], None] = None, + text_pair: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, + text_target: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput], None] = None, + text_pair_target: Optional[ + Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] + ] = None, + add_special_tokens: bool = True, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str, TruncationStrategy, None] = None, + max_length: Optional[int] = None, + stride: int = 0, + is_split_into_words: bool = False, + pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[str] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs, + ) -> BatchEncoding: + """ + Main method to tokenize and prepare for the model one or several sequence(s) or one or several pair(s) of + sequences. + + Args: + text (`str`, `List[str]`, `List[List[str]]`, *optional*): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + text_pair (`str`, `List[str]`, `List[List[str]]`, *optional*): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + text_target (`str`, `List[str]`, `List[List[str]]`, *optional*): + The sequence or batch of sequences to be encoded as target texts. Each sequence can be a string or a + list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized), + you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + text_pair_target (`str`, `List[str]`, `List[List[str]]`, *optional*): + The sequence or batch of sequences to be encoded as target texts. Each sequence can be a string or a + list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized), + you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + """ + # To avoid duplicating + all_kwargs = { + "add_special_tokens": add_special_tokens, + "padding": padding, + "truncation": truncation, + "max_length": max_length, + "stride": stride, + "is_split_into_words": is_split_into_words, + "pad_to_multiple_of": pad_to_multiple_of, + "padding_side": padding_side, + "return_tensors": return_tensors, + "return_token_type_ids": return_token_type_ids, + "return_attention_mask": return_attention_mask, + "return_overflowing_tokens": return_overflowing_tokens, + "return_special_tokens_mask": return_special_tokens_mask, + "return_offsets_mapping": return_offsets_mapping, + "return_length": return_length, + "split_special_tokens": kwargs.pop("split_special_tokens", self.split_special_tokens), + "verbose": verbose, + } + all_kwargs.update(kwargs) + if text is None and text_target is None: + raise ValueError("You need to specify either `text` or `text_target`.") + if text is not None: + # The context manager will send the inputs as normal texts and not text_target, but we shouldn't change the + # input mode in this case. + if not self._in_target_context_manager: + self._switch_to_input_mode() + encodings = self._call_one(text=text, text_pair=text_pair, **all_kwargs) + if text_target is not None: + self._switch_to_target_mode() + target_encodings = self._call_one(text=text_target, text_pair=text_pair_target, **all_kwargs) + # Leave back tokenizer in input mode + self._switch_to_input_mode() + + if text_target is None: + return encodings + elif text is None: + return target_encodings + else: + encodings["labels"] = target_encodings["input_ids"] + return encodings + + def _call_one( + self, + text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]], + text_pair: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None, + add_special_tokens: bool = True, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str, TruncationStrategy, None] = None, + max_length: Optional[int] = None, + stride: int = 0, + is_split_into_words: bool = False, + pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[str] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + split_special_tokens: bool = False, + **kwargs, + ) -> BatchEncoding: + # Input type checking for clearer error + def _is_valid_text_input(t): + if isinstance(t, str): + # Strings are fine + return True + elif isinstance(t, (list, tuple)): + # List are fine as long as they are... + if len(t) == 0: + # ... empty + return True + elif isinstance(t[0], str): + # ... list of strings + return True + elif isinstance(t[0], (list, tuple)): + # ... list with an empty list or with a list of strings + return len(t[0]) == 0 or isinstance(t[0][0], str) + else: + return False + else: + return False + + if not _is_valid_text_input(text): + raise ValueError( + "text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) " + "or `List[List[str]]` (batch of pretokenized examples)." + ) + + if text_pair is not None and not _is_valid_text_input(text_pair): + raise ValueError( + "text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) " + "or `List[List[str]]` (batch of pretokenized examples)." + ) + + if is_split_into_words: + is_batched = isinstance(text, (list, tuple)) and text and isinstance(text[0], (list, tuple)) + else: + is_batched = isinstance(text, (list, tuple)) + + if is_batched: + if isinstance(text_pair, str): + raise TypeError( + "when tokenizing batches of text, `text_pair` must be a list or tuple with the same length as" + " `text`." + ) + if text_pair is not None and len(text) != len(text_pair): + raise ValueError( + f"batch length of `text`: {len(text)} does not match batch length of `text_pair`:" + f" {len(text_pair)}." + ) + batch_text_or_text_pairs = list(zip(text, text_pair)) if text_pair is not None else text + return self.batch_encode_plus( + batch_text_or_text_pairs=batch_text_or_text_pairs, + add_special_tokens=add_special_tokens, + padding=padding, + truncation=truncation, + max_length=max_length, + stride=stride, + is_split_into_words=is_split_into_words, + pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, + return_tensors=return_tensors, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + split_special_tokens=split_special_tokens, + **kwargs, + ) + else: + return self.encode_plus( + text=text, + text_pair=text_pair, + add_special_tokens=add_special_tokens, + padding=padding, + truncation=truncation, + max_length=max_length, + stride=stride, + is_split_into_words=is_split_into_words, + pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, + return_tensors=return_tensors, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + split_special_tokens=split_special_tokens, + **kwargs, + ) + + @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING) + def encode_plus( + self, + text: Union[TextInput, PreTokenizedInput, EncodedInput], + text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None, + add_special_tokens: bool = True, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str, TruncationStrategy, None] = None, + max_length: Optional[int] = None, + stride: int = 0, + is_split_into_words: bool = False, + pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[str] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + **kwargs, + ) -> BatchEncoding: + """ + Tokenize and prepare for the model a sequence or a pair of sequences. + + + + This method is deprecated, `__call__` should be used instead. + + + + Args: + text (`str`, `List[str]` or (for non-fast tokenizers) `List[int]`): + The first sequence to be encoded. This can be a string, a list of strings (tokenized string using the + `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids` + method). + text_pair (`str`, `List[str]` or `List[int]`, *optional*): + Optional second sequence to be encoded. This can be a string, a list of strings (tokenized string using + the `tokenize` method) or a list of integers (tokenized string ids using the `convert_tokens_to_ids` + method). + """ + + # Backward compatibility for 'truncation_strategy', 'pad_to_max_length' + padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( + padding=padding, + truncation=truncation, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + verbose=verbose, + **kwargs, + ) + + return self._encode_plus( + text=text, + text_pair=text_pair, + add_special_tokens=add_special_tokens, + padding_strategy=padding_strategy, + truncation_strategy=truncation_strategy, + max_length=max_length, + stride=stride, + is_split_into_words=is_split_into_words, + pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, + return_tensors=return_tensors, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + split_special_tokens=kwargs.pop("split_special_tokens", self.split_special_tokens), + **kwargs, + ) + + def _encode_plus( + self, + text: Union[TextInput, PreTokenizedInput, EncodedInput], + text_pair: Optional[Union[TextInput, PreTokenizedInput, EncodedInput]] = None, + add_special_tokens: bool = True, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, + max_length: Optional[int] = None, + stride: int = 0, + is_split_into_words: bool = False, + pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[str] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + split_special_tokens: bool = False, + **kwargs, + ) -> BatchEncoding: + raise NotImplementedError + + @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING) + def batch_encode_plus( + self, + batch_text_or_text_pairs: Union[ + List[TextInput], + List[TextInputPair], + List[PreTokenizedInput], + List[PreTokenizedInputPair], + List[EncodedInput], + List[EncodedInputPair], + ], + add_special_tokens: bool = True, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str, TruncationStrategy, None] = None, + max_length: Optional[int] = None, + stride: int = 0, + is_split_into_words: bool = False, + pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[str] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + split_special_tokens: bool = False, + **kwargs, + ) -> BatchEncoding: + """ + Tokenize and prepare for the model a list of sequences or a list of pairs of sequences. + + + + This method is deprecated, `__call__` should be used instead. + + + + Args: + batch_text_or_text_pairs (`List[str]`, `List[Tuple[str, str]]`, `List[List[str]]`, `List[Tuple[List[str], List[str]]]`, and for not-fast tokenizers, also `List[List[int]]`, `List[Tuple[List[int], List[int]]]`): + Batch of sequences or pair of sequences to be encoded. This can be a list of + string/string-sequences/int-sequences or a list of pair of string/string-sequences/int-sequence (see + details in `encode_plus`). + """ + + # Backward compatibility for 'truncation_strategy', 'pad_to_max_length' + padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( + padding=padding, + truncation=truncation, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + verbose=verbose, + **kwargs, + ) + + return self._batch_encode_plus( + batch_text_or_text_pairs=batch_text_or_text_pairs, + add_special_tokens=add_special_tokens, + padding_strategy=padding_strategy, + truncation_strategy=truncation_strategy, + max_length=max_length, + stride=stride, + is_split_into_words=is_split_into_words, + pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, + return_tensors=return_tensors, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + split_special_tokens=split_special_tokens, + **kwargs, + ) + + def _batch_encode_plus( + self, + batch_text_or_text_pairs: Union[ + List[TextInput], + List[TextInputPair], + List[PreTokenizedInput], + List[PreTokenizedInputPair], + List[EncodedInput], + List[EncodedInputPair], + ], + add_special_tokens: bool = True, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, + max_length: Optional[int] = None, + stride: int = 0, + is_split_into_words: bool = False, + pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[str] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + split_special_tokens: bool = False, + **kwargs, + ) -> BatchEncoding: + raise NotImplementedError + + def pad( + self, + encoded_inputs: Union[ + BatchEncoding, + List[BatchEncoding], + Dict[str, EncodedInput], + Dict[str, List[EncodedInput]], + List[Dict[str, EncodedInput]], + ], + padding: Union[bool, str, PaddingStrategy] = True, + max_length: Optional[int] = None, + pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[str] = None, + return_attention_mask: Optional[bool] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + verbose: bool = True, + ) -> BatchEncoding: + """ + Pad a single encoded input or a batch of encoded inputs up to predefined length or to the max sequence length + in the batch. + + Padding side (left/right) padding token ids are defined at the tokenizer level (with `self.padding_side`, + `self.pad_token_id` and `self.pad_token_type_id`). + + Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the + text followed by a call to the `pad` method to get a padded encoding. + + + + If the `encoded_inputs` passed are dictionary of numpy arrays, PyTorch tensors or TensorFlow tensors, the + result will use the same type unless you provide a different tensor type with `return_tensors`. In the case of + PyTorch tensors, you will lose the specific device of your tensors however. + + + + Args: + encoded_inputs ([`BatchEncoding`], list of [`BatchEncoding`], `Dict[str, List[int]]`, `Dict[str, List[List[int]]` or `List[Dict[str, List[int]]]`): + Tokenized inputs. Can represent one input ([`BatchEncoding`] or `Dict[str, List[int]]`) or a batch of + tokenized inputs (list of [`BatchEncoding`], *Dict[str, List[List[int]]]* or *List[Dict[str, + List[int]]]*) so you can use this method during preprocessing as well as in a PyTorch Dataloader + collate function. + + Instead of `List[int]` you can have tensors (numpy arrays, PyTorch tensors or TensorFlow tensors), see + the note above for the return type. + padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`): + Select a strategy to pad the returned sequences (according to the model's padding side and padding + index) among: + + - `True` or `'longest'` (default): Pad to the longest sequence in the batch (or no padding if only a single + sequence if provided). + - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum + acceptable input length for the model if that argument is not provided. + - `False` or `'do_not_pad'`: No padding (i.e., can output a batch with sequences of different + lengths). + max_length (`int`, *optional*): + Maximum length of the returned list and optionally padding length (see above). + pad_to_multiple_of (`int`, *optional*): + If set will pad the sequence to a multiple of the provided value. + + This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability + `>= 7.5` (Volta). + padding_side (`str`, *optional*): + The side on which the model should have padding applied. Should be selected between ['right', 'left']. + Default value is picked from the class attribute of the same name. + return_attention_mask (`bool`, *optional*): + Whether to return the attention mask. If left to the default, will return the attention mask according + to the specific tokenizer's default, defined by the `return_outputs` attribute. + + [What are attention masks?](../glossary#attention-mask) + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors instead of list of python integers. Acceptable values are: + + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return Numpy `np.ndarray` objects. + verbose (`bool`, *optional*, defaults to `True`): + Whether or not to print more information and warnings. + """ + if self.__class__.__name__.endswith("Fast"): + if not self.deprecation_warnings.get("Asking-to-pad-a-fast-tokenizer", False): + logger.warning_advice( + f"You're using a {self.__class__.__name__} tokenizer. Please note that with a fast tokenizer," + " using the `__call__` method is faster than using a method to encode the text followed by a call" + " to the `pad` method to get a padded encoding." + ) + self.deprecation_warnings["Asking-to-pad-a-fast-tokenizer"] = True + + # If we have a list of dicts, let's convert it in a dict of lists + # We do this to allow using this method as a collate_fn function in PyTorch Dataloader + if isinstance(encoded_inputs, (list, tuple)) and isinstance(encoded_inputs[0], Mapping): + encoded_inputs = {key: [example[key] for example in encoded_inputs] for key in encoded_inputs[0].keys()} + + # The model's main input name, usually `input_ids`, has been passed for padding + if self.model_input_names[0] not in encoded_inputs: + raise ValueError( + "You should supply an encoding or a list of encodings to this method " + f"that includes {self.model_input_names[0]}, but you provided {list(encoded_inputs.keys())}" + ) + + required_input = encoded_inputs[self.model_input_names[0]] + + if required_input is None or (isinstance(required_input, Sized) and len(required_input) == 0): + if return_attention_mask: + encoded_inputs["attention_mask"] = [] + return encoded_inputs + + # If we have PyTorch/TF/NumPy tensors/arrays as inputs, we cast them as python objects + # and rebuild them afterwards if no return_tensors is specified + # Note that we lose the specific device the tensor may be on for PyTorch + + first_element = required_input[0] + if isinstance(first_element, (list, tuple)): + # first_element might be an empty list/tuple in some edge cases so we grab the first non empty element. + for item in required_input: + if len(item) != 0: + first_element = item[0] + break + # At this state, if `first_element` is still a list/tuple, it's an empty one so there is nothing to do. + if not isinstance(first_element, (int, list, tuple)): + if is_tf_tensor(first_element): + return_tensors = "tf" if return_tensors is None else return_tensors + elif is_torch_tensor(first_element): + return_tensors = "pt" if return_tensors is None else return_tensors + elif isinstance(first_element, np.ndarray): + return_tensors = "np" if return_tensors is None else return_tensors + else: + raise ValueError( + f"type of {first_element} unknown: {type(first_element)}. " + "Should be one of a python, numpy, pytorch or tensorflow object." + ) + + for key, value in encoded_inputs.items(): + encoded_inputs[key] = to_py_obj(value) + + # Convert padding_strategy in PaddingStrategy + padding_strategy, _, max_length, _ = self._get_padding_truncation_strategies( + padding=padding, max_length=max_length, verbose=verbose + ) + + required_input = encoded_inputs[self.model_input_names[0]] + if required_input and not isinstance(required_input[0], (list, tuple)): + encoded_inputs = self._pad( + encoded_inputs, + max_length=max_length, + padding_strategy=padding_strategy, + pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, + return_attention_mask=return_attention_mask, + ) + return BatchEncoding(encoded_inputs, tensor_type=return_tensors) + + batch_size = len(required_input) + assert all(len(v) == batch_size for v in encoded_inputs.values()), ( + "Some items in the output dictionary have a different batch size than others." + ) + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = max(len(inputs) for inputs in required_input) + padding_strategy = PaddingStrategy.MAX_LENGTH + + batch_outputs = {} + for i in range(batch_size): + inputs = {k: v[i] for k, v in encoded_inputs.items()} + outputs = self._pad( + inputs, + max_length=max_length, + padding_strategy=padding_strategy, + pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, + return_attention_mask=return_attention_mask, + ) + + for key, value in outputs.items(): + if key not in batch_outputs: + batch_outputs[key] = [] + batch_outputs[key].append(value) + + return BatchEncoding(batch_outputs, tensor_type=return_tensors) + + def create_token_type_ids_from_sequences( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Create the token type IDs corresponding to the sequences passed. [What are token type + IDs?](../glossary#token-type-ids) + + Should be overridden in a subclass if the model has a special way of building those. + + Args: + token_ids_0 (`List[int]`): The first tokenized sequence. + token_ids_1 (`List[int]`, *optional*): The second tokenized sequence. + + Returns: + `List[int]`: The token type ids. + """ + if token_ids_1 is None: + return len(token_ids_0) * [0] + return [0] * len(token_ids_0) + [1] * len(token_ids_1) + + def build_inputs_with_special_tokens( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None + ) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. + + This implementation does not add special tokens and this method should be overridden in a subclass. + + Args: + token_ids_0 (`List[int]`): The first tokenized sequence. + token_ids_1 (`List[int]`, *optional*): The second tokenized sequence. + + Returns: + `List[int]`: The model input with special tokens. + """ + if token_ids_1 is None: + return token_ids_0 + return token_ids_0 + token_ids_1 + + @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING) + def prepare_for_model( + self, + ids: List[int], + pair_ids: Optional[List[int]] = None, + add_special_tokens: bool = True, + padding: Union[bool, str, PaddingStrategy] = False, + truncation: Union[bool, str, TruncationStrategy, None] = None, + max_length: Optional[int] = None, + stride: int = 0, + pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[str] = None, + return_tensors: Optional[Union[str, TensorType]] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + prepend_batch_axis: bool = False, + **kwargs, + ) -> BatchEncoding: + """ + Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It + adds special tokens, truncates sequences if overflowing while taking into account the special tokens and + manages a moving window (with user defined stride) for overflowing tokens. Please Note, for *pair_ids* + different than `None` and *truncation_strategy = longest_first* or `True`, it is not possible to return + overflowing tokens. Such a combination of arguments will raise an error. + + Args: + ids (`List[int]`): + Tokenized input ids of the first sequence. Can be obtained from a string by chaining the `tokenize` and + `convert_tokens_to_ids` methods. + pair_ids (`List[int]`, *optional*): + Tokenized input ids of the second sequence. Can be obtained from a string by chaining the `tokenize` + and `convert_tokens_to_ids` methods. + """ + + # Backward compatibility for 'truncation_strategy', 'pad_to_max_length' + padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies( + padding=padding, + truncation=truncation, + max_length=max_length, + pad_to_multiple_of=pad_to_multiple_of, + verbose=verbose, + **kwargs, + ) + + pair = bool(pair_ids is not None) + len_ids = len(ids) + len_pair_ids = len(pair_ids) if pair else 0 + + if return_token_type_ids and not add_special_tokens: + raise ValueError( + "Asking to return token_type_ids while setting add_special_tokens to False " + "results in an undefined behavior. Please set add_special_tokens to True or " + "set return_token_type_ids to None." + ) + + if ( + return_overflowing_tokens + and truncation_strategy == TruncationStrategy.LONGEST_FIRST + and pair_ids is not None + ): + raise ValueError( + "Not possible to return overflowing tokens for pair of sequences with the " + "`longest_first`. Please select another truncation strategy than `longest_first`, " + "for instance `only_second` or `only_first`." + ) + + # Load from model defaults + if return_token_type_ids is None: + return_token_type_ids = "token_type_ids" in self.model_input_names + if return_attention_mask is None: + return_attention_mask = "attention_mask" in self.model_input_names + + encoded_inputs = {} + + # Compute the total size of the returned encodings + total_len = len_ids + len_pair_ids + (self.num_special_tokens_to_add(pair=pair) if add_special_tokens else 0) + + # Truncation: Handle max sequence length + overflowing_tokens = [] + if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and max_length and total_len > max_length: + ids, pair_ids, overflowing_tokens = self.truncate_sequences( + ids, + pair_ids=pair_ids, + num_tokens_to_remove=total_len - max_length, + truncation_strategy=truncation_strategy, + stride=stride, + ) + + if return_overflowing_tokens: + encoded_inputs["overflowing_tokens"] = overflowing_tokens + encoded_inputs["num_truncated_tokens"] = total_len - max_length + + # Add special tokens + if add_special_tokens: + sequence = self.build_inputs_with_special_tokens(ids, pair_ids) + token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids) + else: + sequence = ids + pair_ids if pair else ids + token_type_ids = [0] * len(ids) + ([0] * len(pair_ids) if pair else []) + + # Build output dictionary + encoded_inputs["input_ids"] = sequence + if return_token_type_ids: + encoded_inputs["token_type_ids"] = token_type_ids + if return_special_tokens_mask: + if add_special_tokens: + encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids) + else: + encoded_inputs["special_tokens_mask"] = [0] * len(sequence) + + # Check lengths + self._eventual_warn_about_too_long_sequence(encoded_inputs["input_ids"], max_length, verbose) + + # Padding + if padding_strategy != PaddingStrategy.DO_NOT_PAD or return_attention_mask: + encoded_inputs = self.pad( + encoded_inputs, + max_length=max_length, + padding=padding_strategy.value, + pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, + return_attention_mask=return_attention_mask, + ) + + if return_length: + encoded_inputs["length"] = len(encoded_inputs["input_ids"]) + + batch_outputs = BatchEncoding( + encoded_inputs, tensor_type=return_tensors, prepend_batch_axis=prepend_batch_axis + ) + + return batch_outputs + + def truncate_sequences( + self, + ids: List[int], + pair_ids: Optional[List[int]] = None, + num_tokens_to_remove: int = 0, + truncation_strategy: Union[str, TruncationStrategy] = "longest_first", + stride: int = 0, + ) -> Tuple[List[int], List[int], List[int]]: + """ + Truncates a sequence pair in-place following the strategy. + + Args: + ids (`List[int]`): + Tokenized input ids of the first sequence. Can be obtained from a string by chaining the `tokenize` and + `convert_tokens_to_ids` methods. + pair_ids (`List[int]`, *optional*): + Tokenized input ids of the second sequence. Can be obtained from a string by chaining the `tokenize` + and `convert_tokens_to_ids` methods. + num_tokens_to_remove (`int`, *optional*, defaults to 0): + Number of tokens to remove using the truncation strategy. + truncation_strategy (`str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `'longest_first'`): + The strategy to follow for truncation. Can be: + + - `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or to the + maximum acceptable input length for the model if that argument is not provided. This will truncate + token by token, removing a token from the longest sequence in the pair if a pair of sequences (or a + batch of pairs) is provided. + - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the + maximum acceptable input length for the model if that argument is not provided. This will only + truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided. + - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the + maximum acceptable input length for the model if that argument is not provided. This will only + truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided. + - `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths greater + than the model maximum admissible input size). + stride (`int`, *optional*, defaults to 0): + If set to a positive number, the overflowing tokens returned will contain some tokens from the main + sequence returned. The value of this argument defines the number of additional tokens. + + Returns: + `Tuple[List[int], List[int], List[int]]`: The truncated `ids`, the truncated `pair_ids` and the list of + overflowing tokens. Note: The *longest_first* strategy returns empty list of overflowing tokens if a pair + of sequences (or a batch of pairs) is provided. + """ + if num_tokens_to_remove <= 0: + return ids, pair_ids, [] + + if not isinstance(truncation_strategy, TruncationStrategy): + truncation_strategy = TruncationStrategy(truncation_strategy) + + overflowing_tokens = [] + if truncation_strategy == TruncationStrategy.ONLY_FIRST or ( + truncation_strategy == TruncationStrategy.LONGEST_FIRST and pair_ids is None + ): + if len(ids) > num_tokens_to_remove: + window_len = min(len(ids), stride + num_tokens_to_remove) + if self.truncation_side == "left": + overflowing_tokens = ids[:window_len] + ids = ids[num_tokens_to_remove:] + elif self.truncation_side == "right": + overflowing_tokens = ids[-window_len:] + ids = ids[:-num_tokens_to_remove] + else: + raise ValueError(f"invalid truncation strategy: {self.truncation_side}, use 'left' or 'right'.") + + else: + error_msg = ( + f"We need to remove {num_tokens_to_remove} to truncate the input " + f"but the first sequence has a length {len(ids)}. " + ) + if truncation_strategy == TruncationStrategy.ONLY_FIRST: + error_msg = ( + error_msg + "Please select another truncation strategy than " + f"{truncation_strategy}, for instance 'longest_first' or 'only_second'." + ) + logger.error(error_msg) + elif truncation_strategy == TruncationStrategy.LONGEST_FIRST: + logger.warning( + "Be aware, overflowing tokens are not returned for the setting you have chosen," + f" i.e. sequence pairs with the '{TruncationStrategy.LONGEST_FIRST.value}' " + "truncation strategy. So the returned list will always be empty even if some " + "tokens have been removed." + ) + len_pair_ids = len(pair_ids) if pair_ids is not None else 0 + len_ids = len(ids) + first_remove = min(abs(len_pair_ids - len_ids), num_tokens_to_remove) + second_remove = num_tokens_to_remove - first_remove + if len_ids > len_pair_ids: + ids_to_move = first_remove + second_remove // 2 + pair_ids_to_move = second_remove - second_remove // 2 + else: + ids_to_move = second_remove // 2 + pair_ids_to_move = first_remove + second_remove - (second_remove // 2) + + if self.truncation_side == "right": + ids = ids[:-ids_to_move] if ids_to_move > 0 else ids + pair_ids = pair_ids[:-pair_ids_to_move] if pair_ids is not None and pair_ids_to_move > 0 else pair_ids + elif self.truncation_side == "left": + ids = ids[ids_to_move:] + pair_ids = pair_ids[pair_ids_to_move:] if pair_ids is not None else None + else: + raise ValueError(f"invalid truncation strategy:{self.truncation_side}") + + elif truncation_strategy == TruncationStrategy.ONLY_SECOND and pair_ids is not None: + if len(pair_ids) > num_tokens_to_remove: + window_len = min(len(pair_ids), stride + num_tokens_to_remove) + if self.truncation_side == "right": + overflowing_tokens = pair_ids[-window_len:] + pair_ids = pair_ids[:-num_tokens_to_remove] + elif self.truncation_side == "left": + overflowing_tokens = pair_ids[:window_len] + pair_ids = pair_ids[num_tokens_to_remove:] + else: + raise ValueError(f"invalid truncation strategy:{self.truncation_side}") + else: + logger.error( + f"We need to remove {num_tokens_to_remove} to truncate the input " + f"but the second sequence has a length {len(pair_ids)}. " + f"Please select another truncation strategy than {truncation_strategy}, " + "for instance 'longest_first' or 'only_first'." + ) + + return (ids, pair_ids, overflowing_tokens) + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[str] = None, + return_attention_mask: Optional[bool] = None, + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. + + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in `padding_side` argument: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. + This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + `>= 7.5` (Volta). + padding_side: + The side on which the model should have padding applied. Should be selected between ['right', 'left']. + Default value is picked from the class attribute of the same name. + return_attention_mask: + (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + if return_attention_mask is None: + return_attention_mask = "attention_mask" in self.model_input_names + + required_input = encoded_inputs[self.model_input_names[0]] + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = len(required_input) + + if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0): + max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length + + # Initialize attention mask if not present. + if return_attention_mask and "attention_mask" not in encoded_inputs: + encoded_inputs["attention_mask"] = [1] * len(required_input) + + if needs_to_be_padded: + difference = max_length - len(required_input) + padding_side = padding_side if padding_side is not None else self.padding_side + + if padding_side == "right": + if return_attention_mask: + encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference + if "token_type_ids" in encoded_inputs: + encoded_inputs["token_type_ids"] = ( + encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference + ) + if "special_tokens_mask" in encoded_inputs: + encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference + encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference + elif padding_side == "left": + if return_attention_mask: + encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"] + if "token_type_ids" in encoded_inputs: + encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[ + "token_type_ids" + ] + if "special_tokens_mask" in encoded_inputs: + encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"] + encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input + else: + raise ValueError(f"Invalid padding strategy:{padding_side}") + + return encoded_inputs + + def convert_tokens_to_string(self, tokens: List[str]) -> str: + """ + Converts a sequence of tokens in a single string. The most simple way to do it is `" ".join(tokens)` but we + often want to remove sub-word tokenization artifacts at the same time. + + Args: + tokens (`List[str]`): The token to join in a string. + + Returns: + `str`: The joined tokens. + """ + raise NotImplementedError + + def batch_decode( + self, + sequences: Union[List[int], List[List[int]], "np.ndarray", "torch.Tensor", "tf.Tensor"], + skip_special_tokens: bool = False, + clean_up_tokenization_spaces: Optional[bool] = None, + **kwargs, + ) -> List[str]: + """ + Convert a list of lists of token ids into a list of strings by calling decode. + + Args: + sequences (`Union[List[int], List[List[int]], np.ndarray, torch.Tensor, tf.Tensor]`): + List of tokenized input ids. Can be obtained using the `__call__` method. + skip_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not to remove special tokens in the decoding. + clean_up_tokenization_spaces (`bool`, *optional*): + Whether or not to clean up the tokenization spaces. If `None`, will default to + `self.clean_up_tokenization_spaces`. + kwargs (additional keyword arguments, *optional*): + Will be passed to the underlying model specific decode method. + + Returns: + `List[str]`: The list of decoded sentences. + """ + return [ + self.decode( + seq, + skip_special_tokens=skip_special_tokens, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, + **kwargs, + ) + for seq in sequences + ] + + def decode( + self, + token_ids: Union[int, List[int], "np.ndarray", "torch.Tensor", "tf.Tensor"], + skip_special_tokens: bool = False, + clean_up_tokenization_spaces: Optional[bool] = None, + **kwargs, + ) -> str: + """ + Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special + tokens and clean up tokenization spaces. + + Similar to doing `self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))`. + + Args: + token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`): + List of tokenized input ids. Can be obtained using the `__call__` method. + skip_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not to remove special tokens in the decoding. + clean_up_tokenization_spaces (`bool`, *optional*): + Whether or not to clean up the tokenization spaces. If `None`, will default to + `self.clean_up_tokenization_spaces`. + kwargs (additional keyword arguments, *optional*): + Will be passed to the underlying model specific decode method. + + Returns: + `str`: The decoded sentence. + """ + # Convert inputs to python lists + token_ids = to_py_obj(token_ids) + + return self._decode( + token_ids=token_ids, + skip_special_tokens=skip_special_tokens, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, + **kwargs, + ) + + def _decode( + self, + token_ids: Union[int, List[int]], + skip_special_tokens: bool = False, + clean_up_tokenization_spaces: Optional[bool] = None, + **kwargs, + ) -> str: + raise NotImplementedError + + def get_special_tokens_mask( + self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False + ) -> List[int]: + """ + Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding + special tokens using the tokenizer `prepare_for_model` or `encode_plus` methods. + + Args: + token_ids_0 (`List[int]`): + List of ids of the first sequence. + token_ids_1 (`List[int]`, *optional*): + List of ids of the second sequence. + already_has_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not the token list is already formatted with special tokens for the model. + + Returns: + A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token. + """ + assert already_has_special_tokens and token_ids_1 is None, ( + "You cannot use ``already_has_special_tokens=False`` with this tokenizer. " + "Please use a slow (full python) tokenizer to activate this argument. " + "Or set `return_special_tokens_mask=True` when calling the encoding method " + "to get the special tokens mask in any tokenizer. " + ) + + all_special_ids = self.all_special_ids # cache the property + + special_tokens_mask = [1 if token in all_special_ids else 0 for token in token_ids_0] + + return special_tokens_mask + + @staticmethod + def clean_up_tokenization(out_string: str) -> str: + """ + Clean up a list of simple English tokenization artifacts like spaces before punctuations and abbreviated forms. + + Args: + out_string (`str`): The text to clean up. + + Returns: + `str`: The cleaned-up string. + """ + out_string = ( + out_string.replace(" .", ".") + .replace(" ?", "?") + .replace(" !", "!") + .replace(" ,", ",") + .replace(" ' ", "'") + .replace(" n't", "n't") + .replace(" 'm", "'m") + .replace(" 's", "'s") + .replace(" 've", "'ve") + .replace(" 're", "'re") + ) + return out_string + + def _eventual_warn_about_too_long_sequence(self, ids: List[int], max_length: Optional[int], verbose: bool): + """ + Depending on the input and internal state we might trigger a warning about a sequence that is too long for its + corresponding model + + Args: + ids (`List[str]`): The ids produced by the tokenization + max_length (`int`, *optional*): The max_length desired (does not trigger a warning if it is set) + verbose (`bool`): Whether or not to print more information and warnings. + + """ + if max_length is None and len(ids) > self.model_max_length and verbose and self.model_max_length != 0: + if not self.deprecation_warnings.get("sequence-length-is-longer-than-the-specified-maximum", False): + logger.warning( + "Token indices sequence length is longer than the specified maximum sequence length " + f"for this model ({len(ids)} > {self.model_max_length}). Running this sequence through the model " + "will result in indexing errors" + ) + self.deprecation_warnings["sequence-length-is-longer-than-the-specified-maximum"] = True + + def _switch_to_input_mode(self): + """ + Private method to put the tokenizer in input mode (when it has different modes for input/outputs) + """ + pass + + def _switch_to_target_mode(self): + """ + Private method to put the tokenizer in target mode (when it has different modes for input/outputs) + """ + pass + + @contextmanager + def as_target_tokenizer(self): + """ + Temporarily sets the tokenizer for encoding the targets. Useful for tokenizer associated to + sequence-to-sequence models that need a slightly different processing for the labels. + """ + warnings.warn( + "`as_target_tokenizer` is deprecated and will be removed in v5 of Transformers. You can tokenize your " + "labels by using the argument `text_target` of the regular `__call__` method (either in the same call as " + "your input texts if you use the same keyword arguments, or in a separate call." + ) + self._switch_to_target_mode() + self._in_target_context_manager = True + yield + self._in_target_context_manager = False + self._switch_to_input_mode() + + @classmethod + def register_for_auto_class(cls, auto_class="AutoTokenizer"): + """ + Register this class with a given auto class. This should only be used for custom tokenizers as the ones in the + library are already mapped with `AutoTokenizer`. + + + + This API is experimental and may have some slight breaking changes in the next releases. + + + + Args: + auto_class (`str` or `type`, *optional*, defaults to `"AutoTokenizer"`): + The auto class to register this new tokenizer with. + """ + if not isinstance(auto_class, str): + auto_class = auto_class.__name__ + + import transformers.models.auto as auto_module + + if not hasattr(auto_module, auto_class): + raise ValueError(f"{auto_class} is not a valid auto class.") + + cls._auto_class = auto_class + + def prepare_seq2seq_batch( + self, + src_texts: List[str], + tgt_texts: Optional[List[str]] = None, + max_length: Optional[int] = None, + max_target_length: Optional[int] = None, + padding: str = "longest", + return_tensors: Optional[str] = None, + truncation: bool = True, + **kwargs, + ) -> BatchEncoding: + """ + Prepare model inputs for translation. For best performance, translate one sentence at a time. + + Arguments: + src_texts (`List[str]`): + List of documents to summarize or source language texts. + tgt_texts (`list`, *optional*): + List of summaries or target language texts. + max_length (`int`, *optional*): + Controls the maximum length for encoder inputs (documents to summarize or source language texts) If + left unset or set to `None`, this will use the predefined model maximum length if a maximum length is + required by one of the truncation/padding parameters. If the model has no specific maximum input length + (like XLNet) truncation/padding to a maximum length will be deactivated. + max_target_length (`int`, *optional*): + Controls the maximum length of decoder inputs (target language texts or summaries) If left unset or set + to `None`, this will use the max_length value. + padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`): + Activates and controls padding. Accepts the following values: + + - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single + sequence if provided). + - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum + acceptable input length for the model if that argument is not provided. + - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different + lengths). + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors instead of list of python integers. Acceptable values are: + + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return Numpy `np.ndarray` objects. + truncation (`bool`, `str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `True`): + Activates and controls truncation. Accepts the following values: + + - `True` or `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or + to the maximum acceptable input length for the model if that argument is not provided. This will + truncate token by token, removing a token from the longest sequence in the pair if a pair of + sequences (or a batch of pairs) is provided. + - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the + maximum acceptable input length for the model if that argument is not provided. This will only + truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided. + - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the + maximum acceptable input length for the model if that argument is not provided. This will only + truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided. + - `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths + greater than the model maximum admissible input size). + **kwargs: + Additional keyword arguments passed along to `self.__call__`. + + Return: + [`BatchEncoding`]: A [`BatchEncoding`] with the following fields: + + - **input_ids** -- List of token ids to be fed to the encoder. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model. + - **labels** -- List of token ids for tgt_texts. + + The full set of keys `[input_ids, attention_mask, labels]`, will only be returned if tgt_texts is passed. + Otherwise, input_ids, attention_mask will be the only keys. + """ + # docstyle-ignore + formatted_warning = """ +`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular +`__call__` method to prepare your inputs and targets. + +Here is a short example: + +model_inputs = tokenizer(src_texts, text_target=tgt_texts, ...) + +If you either need to use different keyword arguments for the source and target texts, you should do two calls like +this: + +model_inputs = tokenizer(src_texts, ...) +labels = tokenizer(text_target=tgt_texts, ...) +model_inputs["labels"] = labels["input_ids"] + +See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice. +For a more complete example, see the implementation of `prepare_seq2seq_batch`. +""" + warnings.warn(formatted_warning, FutureWarning) + # mBART-specific kwargs that should be ignored by other models. + kwargs.pop("src_lang", None) + kwargs.pop("tgt_lang", None) + if max_length is None: + max_length = self.model_max_length + model_inputs = self( + src_texts, + add_special_tokens=True, + return_tensors=return_tensors, + max_length=max_length, + padding=padding, + truncation=truncation, + **kwargs, + ) + if tgt_texts is None: + return model_inputs + # Process tgt_texts + if max_target_length is None: + max_target_length = max_length + with self.as_target_tokenizer(): + labels = self( + tgt_texts, + add_special_tokens=True, + return_tensors=return_tensors, + padding=padding, + max_length=max_target_length, + truncation=truncation, + **kwargs, + ) + model_inputs["labels"] = labels["input_ids"] + return model_inputs + + +def get_fast_tokenizer_file(tokenization_files: List[str]) -> str: + """ + Get the tokenization file to use for this version of transformers. + + Args: + tokenization_files (`List[str]`): The list of available configuration files. + + Returns: + `str`: The tokenization file to use. + """ + tokenizer_files_map = {} + for file_name in tokenization_files: + search = _re_tokenizer_file.search(file_name) + if search is not None: + v = search.groups()[0] + tokenizer_files_map[v] = file_name + available_versions = sorted(tokenizer_files_map.keys()) + + # Defaults to FULL_TOKENIZER_FILE and then try to look at some newer versions. + tokenizer_file = FULL_TOKENIZER_FILE + transformers_version = version.parse(__version__) + for v in available_versions: + if version.parse(v) <= transformers_version: + tokenizer_file = tokenizer_files_map[v] + else: + # No point going further since the versions are sorted. + break + + return tokenizer_file + + +# To update the docstring, we need to copy the method, otherwise we change the original docstring. +PreTrainedTokenizerBase.push_to_hub = copy_func(PreTrainedTokenizerBase.push_to_hub) +if PreTrainedTokenizerBase.push_to_hub.__doc__ is not None: + PreTrainedTokenizerBase.push_to_hub.__doc__ = PreTrainedTokenizerBase.push_to_hub.__doc__.format( + object="tokenizer", object_class="AutoTokenizer", object_files="tokenizer files" + ) diff --git a/docs/transformers/build/lib/transformers/tokenization_utils_fast.py b/docs/transformers/build/lib/transformers/tokenization_utils_fast.py new file mode 100644 index 0000000000000000000000000000000000000000..54605dbb02f63aabfc7efb1f957e10462c62e609 --- /dev/null +++ b/docs/transformers/build/lib/transformers/tokenization_utils_fast.py @@ -0,0 +1,908 @@ +# Copyright 2020 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Tokenization classes for fast tokenizers (provided by HuggingFace's tokenizers library). For slow (python) tokenizers +see tokenization_utils.py +""" + +import copy +import json +import os +from collections import defaultdict +from collections.abc import Iterable +from typing import Any, Optional, Union + +import tokenizers.pre_tokenizers as pre_tokenizers_fast +from tokenizers import Encoding as EncodingFast +from tokenizers import Tokenizer as TokenizerFast +from tokenizers.decoders import Decoder as DecoderFast +from tokenizers.trainers import BpeTrainer, UnigramTrainer, WordLevelTrainer, WordPieceTrainer + +from .convert_slow_tokenizer import convert_slow_tokenizer +from .integrations.ggml import convert_gguf_tokenizer +from .modeling_gguf_pytorch_utils import load_gguf_checkpoint +from .tokenization_utils import PreTrainedTokenizer +from .tokenization_utils_base import ( + INIT_TOKENIZER_DOCSTRING, + AddedToken, + BatchEncoding, + PreTokenizedInput, + PreTokenizedInputPair, + PreTrainedTokenizerBase, + SpecialTokensMixin, + TextInput, + TextInputPair, + TruncationStrategy, +) +from .utils import PaddingStrategy, add_end_docstrings, logging + + +logger = logging.get_logger(__name__) + +# Fast tokenizers (provided by HuggingFace tokenizer's library) can be saved in a single file +TOKENIZER_FILE = "tokenizer.json" +SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json" +TOKENIZER_CONFIG_FILE = "tokenizer_config.json" +TIKTOKEN_VOCAB_FILE = "tokenizer.model" + +# Slow tokenizers have an additional added tokens files +ADDED_TOKENS_FILE = "added_tokens.json" + +INIT_TOKENIZER_DOCSTRING += """ + tokenizer_object ([`tokenizers.Tokenizer`]): + A [`tokenizers.Tokenizer`] object from 🤗 tokenizers to instantiate from. See [Using tokenizers from 🤗 + tokenizers](../fast_tokenizers) for more information. + tokenizer_file ([`str`]): + A path to a local JSON file representing a previously serialized [`tokenizers.Tokenizer`] object from 🤗 + tokenizers. +""" + +MODEL_TO_TRAINER_MAPPING = { + "BPE": BpeTrainer, + "Unigram": UnigramTrainer, + "WordLevel": WordLevelTrainer, + "WordPiece": WordPieceTrainer, +} + +VOCAB_FILES_NAMES = {"tokenizer_file": TOKENIZER_FILE, "vocab_file": TIKTOKEN_VOCAB_FILE} + + +@add_end_docstrings(INIT_TOKENIZER_DOCSTRING) +class PreTrainedTokenizerFast(PreTrainedTokenizerBase): + """ + Base class for all fast tokenizers (wrapping HuggingFace tokenizers library). + + Inherits from [`~tokenization_utils_base.PreTrainedTokenizerBase`]. + + Handles all the shared methods for tokenization and special tokens, as well as methods for + downloading/caching/loading pretrained tokenizers, as well as adding tokens to the vocabulary. + + This class also contains the added tokens in a unified way on top of all tokenizers so we don't have to handle the + specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...). + """ + + vocab_files_names = VOCAB_FILES_NAMES + slow_tokenizer_class: PreTrainedTokenizer = None + + def __init__(self, *args, **kwargs): + tokenizer_object = kwargs.pop("tokenizer_object", None) + slow_tokenizer = kwargs.pop("__slow_tokenizer", None) + gguf_file = kwargs.pop("gguf_file", None) + fast_tokenizer_file = kwargs.pop("tokenizer_file", None) + from_slow = kwargs.pop("from_slow", False) + added_tokens_decoder = kwargs.pop("added_tokens_decoder", {}) + self.add_prefix_space = kwargs.get("add_prefix_space", False) + + if from_slow and slow_tokenizer is None and self.slow_tokenizer_class is None: + raise ValueError( + "Cannot instantiate this tokenizer from a slow version. If it's based on sentencepiece, make sure you " + "have sentencepiece installed." + ) + + if tokenizer_object is not None: + fast_tokenizer = copy.deepcopy(tokenizer_object) + elif fast_tokenizer_file is not None and not from_slow: + # We have a serialization from tokenizers which let us directly build the backend + fast_tokenizer = TokenizerFast.from_file(fast_tokenizer_file) + elif slow_tokenizer: + # We need to convert a slow tokenizer to build the backend + fast_tokenizer = convert_slow_tokenizer(slow_tokenizer) + elif gguf_file is not None: + # We need to convert a slow tokenizer to build the backend + gguf_param = load_gguf_checkpoint(kwargs.get("vocab_file")) + architecture = gguf_param["config"]["model_type"] + tokenizer_dict = gguf_param["tokenizer"] + tokenizer_config = gguf_param["tokenizer_config"] + fast_tokenizer, additional_kwargs = convert_gguf_tokenizer(architecture, tokenizer_dict) + kwargs.update(tokenizer_config) + if len(additional_kwargs) > 0: + kwargs.update(additional_kwargs) + elif self.slow_tokenizer_class is not None and slow_tokenizer is not False: + # We need to create and convert a slow tokenizer to build the backend + slow_tokenizer = self.slow_tokenizer_class(*args, **kwargs) + fast_tokenizer = convert_slow_tokenizer(slow_tokenizer) + elif not slow_tokenizer: + # We tried loading a slow_tokenizer with spm and failed, try to load with tiktoken + self.vocab_file = kwargs.get("vocab_file", None) + self.additional_special_tokens = kwargs.get("additional_special_tokens", []) + fast_tokenizer = convert_slow_tokenizer(self, from_tiktoken=True) + slow_tokenizer = None + else: + raise ValueError( + "Couldn't instantiate the backend tokenizer from one of: \n" + "(1) a `tokenizers` library serialization file, \n" + "(2) a slow tokenizer instance to convert or \n" + "(3) an equivalent slow tokenizer class to instantiate and convert. \n" + "You need to have sentencepiece or tiktoken installed to convert a slow tokenizer to a fast one." + ) + + self._tokenizer = fast_tokenizer + + if slow_tokenizer is not None: + kwargs.update(slow_tokenizer.init_kwargs) + + self._decode_use_source_tokenizer = False + + _truncation = self._tokenizer.truncation + + if _truncation is not None: + self._tokenizer.enable_truncation(**_truncation) + kwargs.setdefault("max_length", _truncation["max_length"]) + kwargs.setdefault("truncation_side", _truncation["direction"]) + kwargs.setdefault("stride", _truncation["stride"]) + kwargs.setdefault("truncation_strategy", _truncation["strategy"]) + else: + self._tokenizer.no_truncation() + + _padding = self._tokenizer.padding + if _padding is not None: + self._tokenizer.enable_padding(**_padding) + kwargs.setdefault("pad_token", _padding["pad_token"]) + kwargs.setdefault("pad_token_type_id", _padding["pad_type_id"]) + kwargs.setdefault("padding_side", _padding["direction"]) + kwargs.setdefault("max_length", _padding["length"]) + kwargs.setdefault("pad_to_multiple_of", _padding["pad_to_multiple_of"]) + + # We call this after having initialized the backend tokenizer because we update it. + super().__init__(**kwargs) + self._tokenizer.encode_special_tokens = self.split_special_tokens + + added_tokens_decoder_hash = {hash(repr(token)) for token in self.added_tokens_decoder} + tokens_to_add = [ + token + for index, token in sorted(added_tokens_decoder.items(), key=lambda x: x[0]) + if hash(repr(token)) not in added_tokens_decoder_hash + ] + encoder = list(self.added_tokens_encoder.keys()) + [str(token) for token in tokens_to_add] + # if some of the special tokens are strings, we check if we don't already have a token + tokens_to_add += [ + token for token in self.all_special_tokens_extended if token not in encoder and token not in tokens_to_add + ] + + if len(tokens_to_add) > 0: + tokens = [] + special_tokens = self.all_special_tokens + for token in tokens_to_add: + is_special = ( + (token.special or str(token) in special_tokens) + if isinstance(token, AddedToken) + else str(token) in special_tokens + ) + if isinstance(token, str): + token = AddedToken(token, special=is_special) + else: + token.special = is_special + tokens.append(token) + if tokens: + self.add_tokens(tokens) + + try: + pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__()) + if pre_tok_state.get("add_prefix_space", self.add_prefix_space) != self.add_prefix_space: + pre_tok_class = getattr(pre_tokenizers_fast, pre_tok_state.pop("type")) + pre_tok_state["add_prefix_space"] = self.add_prefix_space + self.backend_tokenizer.pre_tokenizer = pre_tok_class(**pre_tok_state) + except Exception: + # We'll get an error if there is no pre_tokenizer, or if it's a custom pre_tokenizer that can + # not be serialized. In those cases, we just ignore the error as there's no pre_tokenizer + # for which we need to update the `add_prefix_space` attribute. + pass + + @property + def is_fast(self) -> bool: + return True + + @property + def can_save_slow_tokenizer(self) -> bool: + """ + `bool`: Whether or not the slow tokenizer can be saved. Usually for sentencepiece based slow tokenizer, this + can only be `True` if the original `"sentencepiece.model"` was not deleted. + """ + return True + + @property + def vocab_size(self) -> int: + """ + `int`: Size of the base vocabulary (without the added tokens). + """ + return self._tokenizer.get_vocab_size(with_added_tokens=False) + + def get_vocab(self) -> dict[str, int]: + return self._tokenizer.get_vocab(with_added_tokens=True) + + @property + def vocab(self) -> dict[str, int]: + return self.get_vocab() + + @property + def added_tokens_encoder(self) -> dict[str, int]: + """ + Returns the sorted mapping from string to index. The added tokens encoder is cached for performance + optimisation in `self._added_tokens_encoder` for the slow tokenizers. + """ + return {k.content: v for v, k in sorted(self.added_tokens_decoder.items(), key=lambda item: item[0])} + + @property + def added_tokens_decoder(self) -> dict[int, AddedToken]: + """ + Returns the added tokens in the vocabulary as a dictionary of index to AddedToken. + + Returns: + `Dict[str, int]`: The added tokens. + """ + return self._tokenizer.get_added_tokens_decoder() + + def get_added_vocab(self) -> dict[str, int]: + """ + Returns the added tokens in the vocabulary as a dictionary of token to index. + + Returns: + `Dict[str, int]`: The added tokens. + """ + return {k.content: v for v, k in sorted(self.added_tokens_decoder.items(), key=lambda item: item[0])} + + def __len__(self) -> int: + """ + Size of the full vocabulary with the added tokens. + """ + return self._tokenizer.get_vocab_size(with_added_tokens=True) + + @property + def backend_tokenizer(self) -> TokenizerFast: + """ + `tokenizers.implementations.BaseTokenizer`: The Rust tokenizer used as a backend. + """ + return self._tokenizer + + @property + def decoder(self) -> DecoderFast: + """ + `tokenizers.decoders.Decoder`: The Rust decoder for this tokenizer. + """ + return self._tokenizer.decoder + + def _convert_encoding( + self, + encoding: EncodingFast, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + ) -> tuple[dict[str, Any], list[EncodingFast]]: + """ + Convert the encoding representation (from low-level HuggingFace tokenizer output) to a python Dict and a list + of encodings, take care of building a batch from overflowing tokens. + + Overflowing tokens are converted to additional examples (like batches) so the output values of the dict are + lists (overflows) of lists (tokens). + + Output shape: (overflows, sequence length) + """ + if return_token_type_ids is None: + return_token_type_ids = "token_type_ids" in self.model_input_names + if return_attention_mask is None: + return_attention_mask = "attention_mask" in self.model_input_names + + if return_overflowing_tokens and encoding.overflowing is not None: + encodings = [encoding] + encoding.overflowing + else: + encodings = [encoding] + + encoding_dict = defaultdict(list) + for e in encodings: + encoding_dict["input_ids"].append(e.ids) + + if return_token_type_ids: + encoding_dict["token_type_ids"].append(e.type_ids) + if return_attention_mask: + encoding_dict["attention_mask"].append(e.attention_mask) + if return_special_tokens_mask: + encoding_dict["special_tokens_mask"].append(e.special_tokens_mask) + if return_offsets_mapping: + encoding_dict["offset_mapping"].append(e.offsets) + if return_length: + encoding_dict["length"].append(len(e.ids)) + + return encoding_dict, encodings + + def convert_tokens_to_ids(self, tokens: Union[str, Iterable[str]]) -> Union[int, list[int]]: + """ + Converts a token string (or a sequence of tokens) in a single integer id (or a Iterable of ids), using the + vocabulary. + + Args: + tokens (`str` or `Iterable[str]`): One or several token(s) to convert to token id(s). + + Returns: + `int` or `List[int]`: The token id or list of token ids. + """ + if isinstance(tokens, str): + return self._convert_token_to_id_with_added_voc(tokens) + + return [self._convert_token_to_id_with_added_voc(token) for token in tokens] + + def _convert_token_to_id_with_added_voc(self, token: str) -> int: + index = self._tokenizer.token_to_id(token) + if index is None: + return self.unk_token_id + return index + + def _convert_id_to_token(self, index: int) -> Optional[str]: + return self._tokenizer.id_to_token(int(index)) + + def _add_tokens(self, new_tokens: list[Union[str, AddedToken]], special_tokens=False) -> int: + if special_tokens: + return self._tokenizer.add_special_tokens(new_tokens) + + return self._tokenizer.add_tokens(new_tokens) + + def num_special_tokens_to_add(self, pair: bool = False) -> int: + """ + Returns the number of added tokens when encoding a sequence with special tokens. + + + + This encodes a dummy input and checks the number of added tokens, and is therefore not efficient. Do not put + this inside your training loop. + + + + Args: + pair (`bool`, *optional*, defaults to `False`): + Whether the number of added tokens should be computed in the case of a sequence pair or a single + sequence. + + Returns: + `int`: Number of special tokens added to sequences. + """ + return self._tokenizer.num_special_tokens_to_add(pair) + + def convert_ids_to_tokens( + self, ids: Union[int, list[int]], skip_special_tokens: bool = False + ) -> Union[str, list[str]]: + """ + Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and + added tokens. + + Args: + ids (`int` or `List[int]`): + The token id (or token ids) to convert to tokens. + skip_special_tokens (`bool`, *optional*, defaults to `False`): + Whether or not to remove special tokens in the decoding. + + Returns: + `str` or `List[str]`: The decoded token(s). + """ + if isinstance(ids, int): + return self._tokenizer.id_to_token(ids) + tokens = [] + for index in ids: + index = int(index) + if skip_special_tokens and index in self.all_special_ids: + continue + tokens.append(self._tokenizer.id_to_token(index)) + return tokens + + def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> list[str]: + return self.encode_plus(text=text, text_pair=pair, add_special_tokens=add_special_tokens, **kwargs).tokens() + + def set_truncation_and_padding( + self, + padding_strategy: PaddingStrategy, + truncation_strategy: TruncationStrategy, + max_length: int, + stride: int, + pad_to_multiple_of: Optional[int], + padding_side: Optional[str], + ): + """ + Define the truncation and the padding strategies for fast tokenizers (provided by HuggingFace tokenizers + library) and restore the tokenizer settings afterwards. + + The provided tokenizer has no padding / truncation strategy before the managed section. If your tokenizer set a + padding / truncation strategy before, then it will be reset to no padding / truncation when exiting the managed + section. + + Args: + padding_strategy ([`~utils.PaddingStrategy`]): + The kind of padding that will be applied to the input + truncation_strategy ([`~tokenization_utils_base.TruncationStrategy`]): + The kind of truncation that will be applied to the input + max_length (`int`): + The maximum size of a sequence. + stride (`int`): + The stride to use when handling overflow. + pad_to_multiple_of (`int`, *optional*): + If set will pad the sequence to a multiple of the provided value. This is especially useful to enable + the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta). + padding_side (`str`, *optional*): + The side on which the model should have padding applied. Should be selected between ['right', 'left']. + Default value is picked from the class attribute of the same name. + """ + _truncation = self._tokenizer.truncation + _padding = self._tokenizer.padding + # Set truncation and padding on the backend tokenizer + if truncation_strategy == TruncationStrategy.DO_NOT_TRUNCATE: + if _truncation is not None: + self._tokenizer.no_truncation() + else: + target = { + "max_length": max_length, + "stride": stride, + "strategy": truncation_strategy.value, + "direction": self.truncation_side, + } + + # _truncation might contain more keys that the target `transformers` + # supports. Use only the target keys to trigger `enable_truncation`. + # This should enable this code to works on various `tokenizers` + # targets. + if _truncation is None: + current = None + else: + current = {k: _truncation.get(k, None) for k in target} + + if current != target: + self._tokenizer.enable_truncation(**target) + + if padding_strategy == PaddingStrategy.DO_NOT_PAD: + if _padding is not None: + self._tokenizer.no_padding() + else: + length = max_length if padding_strategy == PaddingStrategy.MAX_LENGTH else None + target = { + "length": length, + "direction": padding_side if padding_side is not None else self.padding_side, + "pad_id": self.pad_token_id, + "pad_token": self.pad_token, + "pad_type_id": self.pad_token_type_id, + "pad_to_multiple_of": pad_to_multiple_of, + } + if _padding != target: + self._tokenizer.enable_padding(**target) + + def _batch_encode_plus( + self, + batch_text_or_text_pairs: Union[ + list[TextInput], list[TextInputPair], list[PreTokenizedInput], list[PreTokenizedInputPair] + ], + add_special_tokens: bool = True, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, + max_length: Optional[int] = None, + stride: int = 0, + is_split_into_words: bool = False, + pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[str] = None, + return_tensors: Optional[str] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + split_special_tokens: bool = False, + ) -> BatchEncoding: + if not isinstance(batch_text_or_text_pairs, (tuple, list)): + raise TypeError( + f"batch_text_or_text_pairs has to be a list or a tuple (got {type(batch_text_or_text_pairs)})" + ) + + # Set the truncation and padding strategy and restore the initial configuration + self.set_truncation_and_padding( + padding_strategy=padding_strategy, + truncation_strategy=truncation_strategy, + max_length=max_length, + stride=stride, + pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, + ) + + if self._tokenizer.encode_special_tokens != split_special_tokens: + self._tokenizer.encode_special_tokens = split_special_tokens + + encodings = self._tokenizer.encode_batch( + batch_text_or_text_pairs, + add_special_tokens=add_special_tokens, + is_pretokenized=is_split_into_words, + ) + + # Convert encoding to dict + # `Tokens` has type: Tuple[ + # List[Dict[str, List[List[int]]]] or List[Dict[str, 2D-Tensor]], + # List[EncodingFast] + # ] + # with nested dimensions corresponding to batch, overflows, sequence length + tokens_and_encodings = [ + self._convert_encoding( + encoding=encoding, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + ) + for encoding in encodings + ] + + # Convert the output to have dict[list] from list[dict] and remove the additional overflows dimension + # From (variable) shape (batch, overflows, sequence length) to ~ (batch * overflows, sequence length) + # (we say ~ because the number of overflow varies with the example in the batch) + # + # To match each overflowing sample with the original sample in the batch + # we add an overflow_to_sample_mapping array (see below) + sanitized_tokens = {} + for key in tokens_and_encodings[0][0].keys(): + stack = [e for item, _ in tokens_and_encodings for e in item[key]] + sanitized_tokens[key] = stack + sanitized_encodings = [e for _, item in tokens_and_encodings for e in item] + + # If returning overflowing tokens, we need to return a mapping + # from the batch idx to the original sample + if return_overflowing_tokens: + overflow_to_sample_mapping = [] + for i, (toks, _) in enumerate(tokens_and_encodings): + overflow_to_sample_mapping += [i] * len(toks["input_ids"]) + sanitized_tokens["overflow_to_sample_mapping"] = overflow_to_sample_mapping + + for input_ids in sanitized_tokens["input_ids"]: + self._eventual_warn_about_too_long_sequence(input_ids, max_length, verbose) + return BatchEncoding(sanitized_tokens, sanitized_encodings, tensor_type=return_tensors) + + def _encode_plus( + self, + text: Union[TextInput, PreTokenizedInput], + text_pair: Optional[Union[TextInput, PreTokenizedInput]] = None, + add_special_tokens: bool = True, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE, + max_length: Optional[int] = None, + stride: int = 0, + is_split_into_words: bool = False, + pad_to_multiple_of: Optional[int] = None, + padding_side: Optional[str] = None, + return_tensors: Optional[bool] = None, + return_token_type_ids: Optional[bool] = None, + return_attention_mask: Optional[bool] = None, + return_overflowing_tokens: bool = False, + return_special_tokens_mask: bool = False, + return_offsets_mapping: bool = False, + return_length: bool = False, + verbose: bool = True, + split_special_tokens: bool = False, + **kwargs, + ) -> BatchEncoding: + batched_input = [(text, text_pair)] if text_pair else [text] + batched_output = self._batch_encode_plus( + batched_input, + is_split_into_words=is_split_into_words, + add_special_tokens=add_special_tokens, + padding_strategy=padding_strategy, + truncation_strategy=truncation_strategy, + max_length=max_length, + stride=stride, + pad_to_multiple_of=pad_to_multiple_of, + padding_side=padding_side, + return_tensors=return_tensors, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + return_offsets_mapping=return_offsets_mapping, + return_length=return_length, + verbose=verbose, + split_special_tokens=split_special_tokens, + **kwargs, + ) + + # Return tensor is None, then we can remove the leading batch axis + # Overflowing tokens are returned as a batch of output so we keep them in this case + if return_tensors is None and not return_overflowing_tokens: + batched_output = BatchEncoding( + { + key: (value[0] if len(value) > 0 and isinstance(value[0], list) else value) + for key, value in batched_output.items() + }, + batched_output.encodings, + ) + + self._eventual_warn_about_too_long_sequence(batched_output["input_ids"], max_length, verbose) + + return batched_output + + def convert_tokens_to_string(self, tokens: list[str]) -> str: + return ( + self.backend_tokenizer.decoder.decode(tokens) + if self.backend_tokenizer.decoder is not None + else " ".join(tokens) + ) + + def _decode( + self, + token_ids: Union[int, list[int]], + skip_special_tokens: bool = False, + clean_up_tokenization_spaces: Optional[bool] = None, + **kwargs, + ) -> str: + self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False) + + if isinstance(token_ids, int): + token_ids = [token_ids] + text = self._tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens) + + clean_up_tokenization_spaces = ( + clean_up_tokenization_spaces + if clean_up_tokenization_spaces is not None + else self.clean_up_tokenization_spaces + ) + if clean_up_tokenization_spaces: + clean_text = self.clean_up_tokenization(text) + return clean_text + else: + return text + + def _save_pretrained( + self, + save_directory: Union[str, os.PathLike], + file_names: tuple[str], + legacy_format: Optional[bool] = None, + filename_prefix: Optional[str] = None, + ) -> tuple[str]: + """ + Save a tokenizer using the slow-tokenizer/legacy format: vocabulary + added tokens as well as in a unique JSON + file containing {config + vocab + added-tokens}. + """ + save_directory = str(save_directory) + + if self.slow_tokenizer_class is None and legacy_format is True: + raise ValueError( + "Your tokenizer does not have a legacy version defined and therefore cannot register this version. You" + " might consider leaving the legacy_format at `None` or setting it to `False`." + ) + + save_slow = ( + (legacy_format is None or legacy_format is True) + and self.slow_tokenizer_class is not None + and self.can_save_slow_tokenizer + ) + save_fast = legacy_format is None or legacy_format is False + + if save_slow: + added_tokens_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + ADDED_TOKENS_FILE + ) + # make sure to be forward compatible + added_vocab = {tok: index for tok, index in self.added_tokens_encoder.items() if index >= self.vocab_size} + if added_vocab: + with open(added_tokens_file, "w", encoding="utf-8") as f: + out_str = json.dumps(added_vocab, indent=2, sort_keys=True, ensure_ascii=False) + "\n" + f.write(out_str) + + vocab_files = self.save_vocabulary(save_directory, filename_prefix=filename_prefix) + file_names = file_names + vocab_files + (added_tokens_file,) + + if save_fast: + tokenizer_file = os.path.join( + save_directory, (filename_prefix + "-" if filename_prefix else "") + TOKENIZER_FILE + ) + self.backend_tokenizer.save(tokenizer_file) + file_names = file_names + (tokenizer_file,) + + return file_names + + def train_new_from_iterator( + self, + text_iterator, + vocab_size, + length=None, + new_special_tokens=None, + special_tokens_map=None, + **kwargs, + ): + """ + Trains a tokenizer on a new corpus with the same defaults (in terms of special tokens or tokenization pipeline) + as the current one. + + Args: + text_iterator (generator of `List[str]`): + The training corpus. Should be a generator of batches of texts, for instance a list of lists of texts + if you have everything in memory. + vocab_size (`int`): + The size of the vocabulary you want for your tokenizer. + length (`int`, *optional*): + The total number of sequences in the iterator. This is used to provide meaningful progress tracking + new_special_tokens (list of `str` or `AddedToken`, *optional*): + A list of new special tokens to add to the tokenizer you are training. + special_tokens_map (`Dict[str, str]`, *optional*): + If you want to rename some of the special tokens this tokenizer uses, pass along a mapping old special + token name to new special token name in this argument. + kwargs (`Dict[str, Any]`, *optional*): + Additional keyword arguments passed along to the trainer from the 🤗 Tokenizers library. + + Returns: + [`PreTrainedTokenizerFast`]: A new tokenizer of the same type as the original one, trained on + `text_iterator`. + + """ + tokenizer_json = json.loads(self._tokenizer.to_str()) + # Remove added tokens for now (uses IDs of tokens) + added_tokens = tokenizer_json.pop("added_tokens") + # Remove post processor for now (uses IDs of tokens) + post_processor = tokenizer_json.pop("post_processor") + + unk_token = None + # Remove vocab + if tokenizer_json["model"]["type"] == "BPE": + tokenizer_json["model"]["vocab"] = {} + tokenizer_json["model"]["merges"] = [] + elif tokenizer_json["model"]["type"] == "Unigram": + if tokenizer_json["model"]["unk_id"] is not None: + unk_id = tokenizer_json["model"]["unk_id"] + unk_token = tokenizer_json["model"]["vocab"][unk_id][0] + if special_tokens_map is not None and unk_token in special_tokens_map: + unk_token = special_tokens_map[unk_token] + tokenizer_json["model"]["unk_id"] = 0 + tokenizer_json["model"]["vocab"] = [[unk_token, 0.0]] + elif tokenizer_json["model"]["type"] in ["WordLevel", "WordPiece"]: + tokenizer_json["model"]["vocab"] = {} + else: + raise ValueError( + f"This method does not support this type of tokenizer (found {tokenizer_json['model']['type']}) " + "only BPE, Unigram, WordLevel and WordPiece." + ) + + if ( + special_tokens_map is not None + and "unk_token" in tokenizer_json["model"] + and tokenizer_json["model"]["unk_token"] in special_tokens_map + ): + tokenizer_json["model"]["unk_token"] = special_tokens_map[tokenizer_json["model"]["unk_token"]] + + tokenizer = TokenizerFast.from_str(json.dumps(tokenizer_json)) + + # Get the special tokens from the current tokenizer if none are specified. + special_tokens = [] + for added_token in added_tokens: + special = added_token.pop("special", None) + _ = added_token.pop("id", None) + if tokenizer_json["model"]["type"] != "Unigram" and not special: + continue + if special_tokens_map is not None and added_token["content"] in special_tokens_map: + added_token["content"] = special_tokens_map[added_token["content"]] + special_tokens.append(AddedToken(**added_token)) + + if new_special_tokens is not None: + special_tokens.extend(new_special_tokens) + + # Trainer needs to know the end of word / continuing subword thingies in BPE + if ( + tokenizer_json["model"]["type"] == "BPE" + and "continuing_subword_prefix" not in kwargs + and tokenizer_json["model"]["continuing_subword_prefix"] is not None + ): + kwargs["continuing_subword_prefix"] = tokenizer_json["model"]["continuing_subword_prefix"] + if ( + tokenizer_json["model"]["type"] == "BPE" + and "end_of_word_suffix" not in kwargs + and tokenizer_json["model"]["end_of_word_suffix"] is not None + ): + kwargs["end_of_word_suffix"] = tokenizer_json["model"]["end_of_word_suffix"] + if tokenizer_json["model"]["type"] == "Unigram" and unk_token is not None: + kwargs["unk_token"] = unk_token + if tokenizer_json["pre_tokenizer"] is not None: + if ( + tokenizer_json["pre_tokenizer"]["type"] == "ByteLevel" + or tokenizer_json["pre_tokenizer"]["type"] == "Sequence" + and "pretokenizers" in tokenizer_json["pre_tokenizer"] + and any( + pretokenizer["type"] == "ByteLevel" + for pretokenizer in tokenizer_json["pre_tokenizer"]["pretokenizers"] + ) + ): + kwargs["initial_alphabet"] = pre_tokenizers_fast.ByteLevel.alphabet() + + trainer_class = MODEL_TO_TRAINER_MAPPING[tokenizer_json["model"]["type"]] + trainer = trainer_class(vocab_size=vocab_size, special_tokens=special_tokens, **kwargs) + tokenizer.train_from_iterator(text_iterator, length=length, trainer=trainer) + + if post_processor is not None: + trained_tokenizer_json = json.loads(tokenizer.to_str()) + # Almost done, we just have to adjust the token IDs in the post processor + if "special_tokens" in post_processor: + for key in post_processor["special_tokens"]: + tokens = post_processor["special_tokens"][key]["tokens"] + if special_tokens_map is not None: + tokens = [special_tokens_map.get(token, token) for token in tokens] + post_processor["special_tokens"][key]["tokens"] = tokens + for token in tokens: + token_id = tokenizer.token_to_id(token) + if token_id is None: + raise ValueError( + "Attempted to set a token in the post processor that does not exist in the mapping" + ) + + post_processor["special_tokens"][key]["ids"] = [tokenizer.token_to_id(token) for token in tokens] + + for special_token in ["cls", "sep"]: + if special_token in post_processor: + token, _ = post_processor[special_token] + if special_tokens_map is not None and token in special_tokens_map: + token = special_tokens_map[token] + token_id = tokenizer.token_to_id(token) + if token_id is None: + raise ValueError( + "Attempted to set a token in the post processor that does not exist in the mapping" + ) + post_processor[special_token] = [token, token_id] + + trained_tokenizer_json["post_processor"] = post_processor + tokenizer = TokenizerFast.from_str(json.dumps(trained_tokenizer_json)) + + kwargs = self.init_kwargs.copy() + # Map pad/cls/mask token at the Transformers level + special_tokens_list = SpecialTokensMixin.SPECIAL_TOKENS_ATTRIBUTES.copy() + special_tokens_list.remove("additional_special_tokens") + for token in special_tokens_list: + if getattr(self, token) is not None: + special_token = getattr(self, token) + if special_tokens_map is not None and special_token in special_tokens_map: + special_token = special_tokens_map[special_token] + + special_token_full = self._special_tokens_map.get(token, None) + if isinstance(special_token_full, AddedToken): + # Create an added token with the same parameters except the content + kwargs[token] = AddedToken( + special_token, + single_word=special_token_full.single_word, + lstrip=special_token_full.lstrip, + rstrip=special_token_full.rstrip, + normalized=special_token_full.normalized, + special=True, + ) + else: + kwargs[token] = special_token + + additional_special_tokens = self.additional_special_tokens + if new_special_tokens is not None: + additional_special_tokens.extend(new_special_tokens) + if len(additional_special_tokens) > 0: + kwargs["additional_special_tokens"] = additional_special_tokens + + return self.__class__(tokenizer_object=tokenizer, **kwargs) diff --git a/docs/transformers/build/lib/transformers/trainer.py b/docs/transformers/build/lib/transformers/trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..fb31e921cfaf8326f538f6a91c5a3b2cb45b4ad8 --- /dev/null +++ b/docs/transformers/build/lib/transformers/trainer.py @@ -0,0 +1,5344 @@ +# Copyright 2020-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +The Trainer class, to easily train a 🤗 Transformers from scratch or finetune it on a new task. +""" + +import contextlib +import copy +import functools +import glob +import importlib.metadata +import inspect +import json +import math +import os +import random +import re +import shutil +import sys +import tempfile +import time +import warnings +from collections.abc import Mapping +from pathlib import Path +from typing import TYPE_CHECKING, Any, Callable, Optional, Union + + +# Integrations must be imported before ML frameworks: +# isort: off +from .integrations import ( + get_reporting_integration_callbacks, +) + +# isort: on + +import huggingface_hub.utils as hf_hub_utils +import numpy as np +import torch +import torch.distributed as dist +from huggingface_hub import ModelCard, create_repo, upload_folder +from packaging import version +from torch import nn +from torch.utils.data import DataLoader, Dataset, IterableDataset, RandomSampler, SequentialSampler + +from . import __version__ +from .configuration_utils import PretrainedConfig +from .data.data_collator import DataCollator, DataCollatorWithPadding, default_data_collator +from .debug_utils import DebugOption, DebugUnderflowOverflow +from .feature_extraction_sequence_utils import SequenceFeatureExtractor +from .feature_extraction_utils import FeatureExtractionMixin +from .hyperparameter_search import ALL_HYPERPARAMETER_SEARCH_BACKENDS, default_hp_search_backend +from .image_processing_utils import BaseImageProcessor +from .integrations.deepspeed import deepspeed_init, deepspeed_load_checkpoint, is_deepspeed_available +from .integrations.tpu import tpu_spmd_dataloader +from .modelcard import TrainingSummary +from .modeling_utils import PreTrainedModel, load_sharded_checkpoint, unwrap_model +from .models.auto.modeling_auto import ( + MODEL_FOR_CAUSAL_LM_MAPPING_NAMES, + MODEL_MAPPING_NAMES, +) +from .optimization import Adafactor, get_scheduler +from .processing_utils import ProcessorMixin +from .pytorch_utils import ( + ALL_LAYERNORM_LAYERS, + is_torch_greater_or_equal_than_2_3, +) +from .tokenization_utils_base import PreTrainedTokenizerBase +from .trainer_callback import ( + CallbackHandler, + DefaultFlowCallback, + ExportableState, + PrinterCallback, + ProgressCallback, + TrainerCallback, + TrainerControl, + TrainerState, +) +from .trainer_pt_utils import ( + DistributedTensorGatherer, + EvalLoopContainer, + IterableDatasetShard, + LabelSmoother, + LayerWiseDummyOptimizer, + LengthGroupedSampler, + SequentialDistributedSampler, + distributed_broadcast_scalars, + distributed_concat, + find_batch_size, + get_model_param_count, + get_module_class_from_name, + get_parameter_names, + nested_concat, + nested_detach, + nested_numpify, + nested_xla_mesh_reduce, + reissue_pt_warnings, + remove_dummy_checkpoint, + set_rng_state_for_device, +) +from .trainer_utils import ( + PREFIX_CHECKPOINT_DIR, + BestRun, + EvalLoopOutput, + EvalPrediction, + HPSearchBackend, + HubStrategy, + PredictionOutput, + RemoveColumnsCollator, + SaveStrategy, + TrainerMemoryTracker, + TrainOutput, + check_target_module_exists, + default_compute_objective, + denumpify_detensorize, + enable_full_determinism, + find_executable_batch_size, + get_last_checkpoint, + has_length, + neftune_post_forward_hook, + number_of_arguments, + seed_worker, + set_seed, + speed_metrics, +) +from .training_args import OptimizerNames, ParallelMode, TrainingArguments +from .utils import ( + ADAPTER_CONFIG_NAME, + ADAPTER_SAFE_WEIGHTS_NAME, + ADAPTER_WEIGHTS_NAME, + CONFIG_NAME, + SAFE_WEIGHTS_INDEX_NAME, + SAFE_WEIGHTS_NAME, + WEIGHTS_INDEX_NAME, + WEIGHTS_NAME, + XLA_FSDPV2_MIN_VERSION, + PushInProgress, + PushToHubMixin, + can_return_loss, + find_labels, + is_accelerate_available, + is_apex_available, + is_apollo_torch_available, + is_bitsandbytes_available, + is_datasets_available, + is_galore_torch_available, + is_grokadamw_available, + is_in_notebook, + is_ipex_available, + is_liger_kernel_available, + is_lomo_available, + is_peft_available, + is_safetensors_available, + is_sagemaker_dp_enabled, + is_sagemaker_mp_enabled, + is_schedulefree_available, + is_torch_hpu_available, + is_torch_mlu_available, + is_torch_mps_available, + is_torch_musa_available, + is_torch_neuroncore_available, + is_torch_npu_available, + is_torch_xla_available, + is_torch_xpu_available, + is_torchao_available, + logging, + strtobool, +) +from .utils.deprecation import deprecate_kwarg +from .utils.import_utils import requires +from .utils.quantization_config import QuantizationMethod + + +DEFAULT_CALLBACKS = [DefaultFlowCallback] +DEFAULT_PROGRESS_CALLBACK = ProgressCallback + +if is_in_notebook(): + from .utils.notebook import NotebookProgressCallback + + DEFAULT_PROGRESS_CALLBACK = NotebookProgressCallback + +if is_apex_available(): + from apex import amp + +if is_datasets_available(): + import datasets + +if is_torch_xla_available(): + import torch_xla.core.xla_model as xm + import torch_xla.debug.metrics as met + from torch_xla import __version__ as XLA_VERSION + + IS_XLA_FSDPV2_POST_2_2 = version.parse(XLA_VERSION) >= version.parse(XLA_FSDPV2_MIN_VERSION) + if IS_XLA_FSDPV2_POST_2_2: + import torch_xla.distributed.spmd as xs + import torch_xla.runtime as xr +else: + IS_XLA_FSDPV2_POST_2_2 = False + + +if is_sagemaker_mp_enabled(): + import smdistributed.modelparallel.torch as smp + from smdistributed.modelparallel import __version__ as SMP_VERSION + + IS_SAGEMAKER_MP_POST_1_10 = version.parse(SMP_VERSION) >= version.parse("1.10") + + from .trainer_pt_utils import smp_forward_backward, smp_forward_only, smp_gather, smp_nested_concat +else: + IS_SAGEMAKER_MP_POST_1_10 = False + + +if is_safetensors_available(): + import safetensors.torch + +if is_peft_available(): + from peft import PeftModel + + +if is_accelerate_available(): + from accelerate import Accelerator, skip_first_batches + from accelerate import __version__ as accelerate_version + from accelerate.state import AcceleratorState + from accelerate.utils import ( + AutocastKwargs, + DistributedDataParallelKwargs, + DistributedType, + load_fsdp_model, + load_fsdp_optimizer, + save_fsdp_model, + save_fsdp_optimizer, + ) + + DATA_SAMPLERS = [RandomSampler] + if version.parse(accelerate_version) > version.parse("1.3.0"): + from accelerate.utils import TorchTensorParallelPlugin + if version.parse(accelerate_version) > version.parse("0.23.0"): + from accelerate.data_loader import SeedableRandomSampler + + DATA_SAMPLERS += [SeedableRandomSampler] + + if is_deepspeed_available(): + from accelerate.utils import DeepSpeedSchedulerWrapper + +if is_accelerate_available("0.28.0"): + from accelerate.utils import DataLoaderConfiguration + + +def _is_peft_model(model): + if is_peft_available(): + classes_to_check = (PeftModel,) + # Here we also check if the model is an instance of `PeftMixedModel` introduced in peft>=0.7.0: https://github.com/huggingface/transformers/pull/28321 + if version.parse(importlib.metadata.version("peft")) >= version.parse("0.7.0"): + from peft import PeftMixedModel + + classes_to_check = (*classes_to_check, PeftMixedModel) + return isinstance(model, classes_to_check) + return False + + +def _get_fsdp_ckpt_kwargs(): + # TODO: @AjayP13, @younesbelkada replace this check with version check at the next `accelerate` release + if is_accelerate_available() and "adapter_only" in list(inspect.signature(save_fsdp_model).parameters): + return {"adapter_only": True} + else: + return {} + + +def safe_globals(): + # Starting from version 2.4 PyTorch introduces a check for the objects loaded + # with torch.load(weights_only=True). Starting from 2.6 weights_only=True becomes + # a default and requires allowlisting of objects being loaded. + # See: https://github.com/pytorch/pytorch/pull/137602 + # See: https://pytorch.org/docs/stable/notes/serialization.html#torch.serialization.add_safe_globals + # See: https://github.com/huggingface/accelerate/pull/3036 + if version.parse(torch.__version__).release < version.parse("2.6").release: + return contextlib.nullcontext() + + np_core = np._core if version.parse(np.__version__) >= version.parse("2.0.0") else np.core + allowlist = [np_core.multiarray._reconstruct, np.ndarray, np.dtype] + # numpy >1.25 defines numpy.dtypes.UInt32DType, but below works for + # all versions of numpy + allowlist += [type(np.dtype(np.uint32))] + + return torch.serialization.safe_globals(allowlist) + + +if TYPE_CHECKING: + import optuna + + if is_datasets_available(): + import datasets + +logger = logging.get_logger(__name__) + + +# Name of the files used for checkpointing +TRAINING_ARGS_NAME = "training_args.bin" +TRAINER_STATE_NAME = "trainer_state.json" +OPTIMIZER_NAME = "optimizer.pt" +SCALER_NAME = "scaler.pt" +OPTIMIZER_NAME_BIN = "optimizer.bin" +SCHEDULER_NAME = "scheduler.pt" +FSDP_MODEL_NAME = "pytorch_model_fsdp" + + +@requires( + backends=( + "torch", + "accelerate", + ) +) +class Trainer: + """ + Trainer is a simple but feature-complete training and eval loop for PyTorch, optimized for 🤗 Transformers. + + Args: + model ([`PreTrainedModel`] or `torch.nn.Module`, *optional*): + The model to train, evaluate or use for predictions. If not provided, a `model_init` must be passed. + + + + [`Trainer`] is optimized to work with the [`PreTrainedModel`] provided by the library. You can still use + your own models defined as `torch.nn.Module` as long as they work the same way as the 🤗 Transformers + models. + + + + args ([`TrainingArguments`], *optional*): + The arguments to tweak for training. Will default to a basic instance of [`TrainingArguments`] with the + `output_dir` set to a directory named *tmp_trainer* in the current directory if not provided. + data_collator (`DataCollator`, *optional*): + The function to use to form a batch from a list of elements of `train_dataset` or `eval_dataset`. Will + default to [`default_data_collator`] if no `processing_class` is provided, an instance of + [`DataCollatorWithPadding`] otherwise if the processing_class is a feature extractor or tokenizer. + train_dataset (Union[`torch.utils.data.Dataset`, `torch.utils.data.IterableDataset`, `datasets.Dataset`], *optional*): + The dataset to use for training. If it is a [`~datasets.Dataset`], columns not accepted by the + `model.forward()` method are automatically removed. + + Note that if it's a `torch.utils.data.IterableDataset` with some randomization and you are training in a + distributed fashion, your iterable dataset should either use a internal attribute `generator` that is a + `torch.Generator` for the randomization that must be identical on all processes (and the Trainer will + manually set the seed of this `generator` at each epoch) or have a `set_epoch()` method that internally + sets the seed of the RNGs used. + eval_dataset (Union[`torch.utils.data.Dataset`, Dict[str, `torch.utils.data.Dataset`, `datasets.Dataset`]), *optional*): + The dataset to use for evaluation. If it is a [`~datasets.Dataset`], columns not accepted by the + `model.forward()` method are automatically removed. If it is a dictionary, it will evaluate on each + dataset prepending the dictionary key to the metric name. + processing_class (`PreTrainedTokenizerBase` or `BaseImageProcessor` or `FeatureExtractionMixin` or `ProcessorMixin`, *optional*): + Processing class used to process the data. If provided, will be used to automatically process the inputs + for the model, and it will be saved along the model to make it easier to rerun an interrupted training or + reuse the fine-tuned model. + This supercedes the `tokenizer` argument, which is now deprecated. + model_init (`Callable[[], PreTrainedModel]`, *optional*): + A function that instantiates the model to be used. If provided, each call to [`~Trainer.train`] will start + from a new instance of the model as given by this function. + + The function may have zero argument, or a single one containing the optuna/Ray Tune/SigOpt trial object, to + be able to choose different architectures according to hyper parameters (such as layer count, sizes of + inner layers, dropout probabilities etc). + compute_loss_func (`Callable`, *optional*): + A function that accepts the raw model outputs, labels, and the number of items in the entire accumulated + batch (batch_size * gradient_accumulation_steps) and returns the loss. For example, see the default [loss function](https://github.com/huggingface/transformers/blob/052e652d6d53c2b26ffde87e039b723949a53493/src/transformers/trainer.py#L3618) used by [`Trainer`]. + compute_metrics (`Callable[[EvalPrediction], Dict]`, *optional*): + The function that will be used to compute metrics at evaluation. Must take a [`EvalPrediction`] and return + a dictionary string to metric values. *Note* When passing TrainingArgs with `batch_eval_metrics` set to + `True`, your compute_metrics function must take a boolean `compute_result` argument. This will be triggered + after the last eval batch to signal that the function needs to calculate and return the global summary + statistics rather than accumulating the batch-level statistics + callbacks (List of [`TrainerCallback`], *optional*): + A list of callbacks to customize the training loop. Will add those to the list of default callbacks + detailed in [here](callback). + + If you want to remove one of the default callbacks used, use the [`Trainer.remove_callback`] method. + optimizers (`Tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR]`, *optional*, defaults to `(None, None)`): + A tuple containing the optimizer and the scheduler to use. Will default to an instance of [`AdamW`] on your + model and a scheduler given by [`get_linear_schedule_with_warmup`] controlled by `args`. + optimizer_cls_and_kwargs (`Tuple[Type[torch.optim.Optimizer], Dict[str, Any]]`, *optional*): + A tuple containing the optimizer class and keyword arguments to use. + Overrides `optim` and `optim_args` in `args`. Incompatible with the `optimizers` argument. + + Unlike `optimizers`, this argument avoids the need to place model parameters on the correct devices before initializing the Trainer. + preprocess_logits_for_metrics (`Callable[[torch.Tensor, torch.Tensor], torch.Tensor]`, *optional*): + A function that preprocess the logits right before caching them at each evaluation step. Must take two + tensors, the logits and the labels, and return the logits once processed as desired. The modifications made + by this function will be reflected in the predictions received by `compute_metrics`. + + Note that the labels (second parameter) will be `None` if the dataset does not have them. + + Important attributes: + + - **model** -- Always points to the core model. If using a transformers model, it will be a [`PreTrainedModel`] + subclass. + - **model_wrapped** -- Always points to the most external model in case one or more other modules wrap the + original model. This is the model that should be used for the forward pass. For example, under `DeepSpeed`, + the inner model is wrapped in `DeepSpeed` and then again in `torch.nn.DistributedDataParallel`. If the inner + model hasn't been wrapped, then `self.model_wrapped` is the same as `self.model`. + - **is_model_parallel** -- Whether or not a model has been switched to a model parallel mode (different from + data parallelism, this means some of the model layers are split on different GPUs). + - **place_model_on_device** -- Whether or not to automatically place the model on the device - it will be set + to `False` if model parallel or deepspeed is used, or if the default + `TrainingArguments.place_model_on_device` is overridden to return `False` . + - **is_in_train** -- Whether or not a model is currently running `train` (e.g. when `evaluate` is called while + in `train`) + + """ + + # Those are used as methods of the Trainer in examples. + from .trainer_pt_utils import _get_learning_rate, log_metrics, metrics_format, save_metrics, save_state + + @deprecate_kwarg("tokenizer", new_name="processing_class", version="5.0.0", raise_if_both_names=True) + def __init__( + self, + model: Union[PreTrainedModel, nn.Module, None] = None, + args: TrainingArguments = None, + data_collator: Optional[DataCollator] = None, + train_dataset: Optional[Union[Dataset, IterableDataset, "datasets.Dataset"]] = None, + eval_dataset: Optional[Union[Dataset, dict[str, Dataset], "datasets.Dataset"]] = None, + processing_class: Optional[ + Union[PreTrainedTokenizerBase, BaseImageProcessor, FeatureExtractionMixin, ProcessorMixin] + ] = None, + model_init: Optional[Callable[[], PreTrainedModel]] = None, + compute_loss_func: Optional[Callable] = None, + compute_metrics: Optional[Callable[[EvalPrediction], dict]] = None, + callbacks: Optional[list[TrainerCallback]] = None, + optimizers: tuple[Optional[torch.optim.Optimizer], Optional[torch.optim.lr_scheduler.LambdaLR]] = (None, None), + optimizer_cls_and_kwargs: Optional[tuple[type[torch.optim.Optimizer], dict[str, Any]]] = None, + preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, + ): + if args is None: + output_dir = "tmp_trainer" + logger.info(f"No `TrainingArguments` passed, using `output_dir={output_dir}`.") + args = TrainingArguments(output_dir=output_dir) + if args.batch_eval_metrics and compute_metrics is not None: + if "compute_result" not in inspect.signature(compute_metrics).parameters.keys(): + raise ValueError( + "When using `batch_eval_metrics`, your `compute_metrics` function must take a `compute_result`" + " boolean argument which will be triggered after the last batch of the eval set to signal that the" + " summary statistics should be returned by the function." + ) + if args.eval_strategy is not None and args.eval_strategy != "no" and eval_dataset is None: + raise ValueError( + f"You have set `args.eval_strategy` to {args.eval_strategy} but you didn't pass an `eval_dataset` to `Trainer`. Either set `args.eval_strategy` to `no` or pass an `eval_dataset`. " + ) + if args.save_strategy == SaveStrategy.BEST or args.load_best_model_at_end: + if args.metric_for_best_model is None: + raise ValueError( + "`args.metric_for_best_model` must be provided when using 'best' save_strategy or if `args.load_best_model_at_end` is set to `True`." + ) + + self.args = args + self.compute_loss_func = compute_loss_func + # Seed must be set before instantiating the model when using model + enable_full_determinism(self.args.seed) if self.args.full_determinism else set_seed(self.args.seed) + + self.hp_name = None + self.deepspeed = None + self.is_in_train = False + self.model = model + self.create_accelerator_and_postprocess() + + # memory metrics - must set up as early as possible + self._memory_tracker = TrainerMemoryTracker(self.args.skip_memory_metrics) + self._memory_tracker.start() + + # set the correct log level depending on the node + log_level = args.get_process_log_level() + logging.set_verbosity(log_level) + + # force device and distributed setup init explicitly + args._setup_devices + + if model is None: + if model_init is not None: + self.model_init = model_init + model = self.call_model_init() + else: + raise RuntimeError("`Trainer` requires either a `model` or `model_init` argument") + else: + if model_init is not None: + warnings.warn( + "`Trainer` requires either a `model` or `model_init` argument, but not both. `model_init` will" + " overwrite your model when calling the `train` method. This will become a fatal error in the next" + " release.", + FutureWarning, + ) + self.model_init = model_init + + if model.__class__.__name__ in MODEL_MAPPING_NAMES: + raise ValueError( + f"The model you have picked ({model.__class__.__name__}) cannot be used as is for training: it only " + "computes hidden states and does not accept any labels. You should choose a model with a head " + "suitable for your task like any of the `AutoModelForXxx` listed at " + "https://huggingface.co/docs/transformers/model_doc/auto" + ) + + if getattr(model, "is_parallelizable", False) and getattr(model, "model_parallel", False): + self.is_model_parallel = True + else: + self.is_model_parallel = False + + if getattr(model, "hf_device_map", None) is not None: + devices = [device for device in set(model.hf_device_map.values()) if device not in ["cpu", "disk"]] + if len(devices) > 1: + self.is_model_parallel = True + elif len(devices) == 1: + self.is_model_parallel = self.args.device != torch.device(devices[0]) + else: + self.is_model_parallel = False + + # warn users + if self.is_model_parallel: + logger.info( + "You have loaded a model on multiple GPUs. `is_model_parallel` attribute will be force-set" + " to `True` to avoid any unexpected behavior such as device placement mismatching." + ) + + if self.args.use_liger_kernel: + if is_liger_kernel_available(): + from liger_kernel.transformers import _apply_liger_kernel_to_instance + + if isinstance(model, PreTrainedModel): + # Patch the model with liger kernels. Use the default kernel configurations. + _apply_liger_kernel_to_instance(model=model) + elif hasattr(model, "get_base_model") and isinstance(model.get_base_model(), PreTrainedModel): + # Patch the base model with liger kernels where model is a PeftModel. Use the default kernel configurations. + _apply_liger_kernel_to_instance(model=model.get_base_model()) + else: + logger.warning( + "The model is not an instance of PreTrainedModel. No liger kernels will be applied." + ) + else: + raise ImportError( + "You have set `use_liger_kernel` to `True` but liger-kernel >= 0.3.0 is not available. " + "Please install it with `pip install liger-kernel`" + ) + + _is_quantized_and_base_model = getattr(model, "is_quantized", False) and not getattr( + model, "_hf_peft_config_loaded", False + ) + _quantization_method_supports_training = ( + getattr(model, "hf_quantizer", None) is not None and model.hf_quantizer.is_trainable + ) + + _is_model_quantized_and_qat_trainable = getattr(model, "hf_quantizer", None) is not None and getattr( + model.hf_quantizer, "is_qat_trainable", False + ) + + # Filter out quantized + compiled models + if _is_quantized_and_base_model and hasattr(model, "_orig_mod"): + raise ValueError( + "You cannot fine-tune quantized model with `torch.compile()` make sure to pass a non-compiled model when fine-tuning a quantized model with PEFT" + ) + + # At this stage the model is already loaded + if _is_quantized_and_base_model and not _is_peft_model(model) and not _is_model_quantized_and_qat_trainable: + raise ValueError( + "You cannot perform fine-tuning on purely quantized models. Please attach trainable adapters on top of" + " the quantized model to correctly perform fine-tuning. Please see: https://huggingface.co/docs/transformers/peft" + " for more details" + ) + elif _is_quantized_and_base_model and not _quantization_method_supports_training: + raise ValueError( + f"The model you are trying to fine-tune is quantized with {model.hf_quantizer.quantization_config.quant_method}" + " but that quantization method do not support training. Please open an issue on GitHub: https://github.com/huggingface/transformers" + f" to request the support for training support for {model.hf_quantizer.quantization_config.quant_method}" + ) + + self.is_fsdp_xla_enabled = args.fsdp_config["xla"] + if len(args.fsdp) > 0: + if self.is_deepspeed_enabled: + raise ValueError( + "Using --fsdp xxx together with --deepspeed is not possible, deactivate one of those flags." + ) + if not args.fsdp_config["xla"] and args.parallel_mode != ParallelMode.DISTRIBUTED: + raise ValueError("Using fsdp only works in distributed training.") + + # one place to sort out whether to place the model on device or not + # postpone switching model to cuda when: + # 1. MP - since we are trying to fit a much bigger than 1 gpu model + # 2. fp16-enabled DeepSpeed loads the model in half the size and it doesn't need .to() anyway, + # and we only use deepspeed for training at the moment + # 3. full bf16 or fp16 eval - since the model needs to be cast to the right dtype first + # 4. FSDP - same as MP + self.place_model_on_device = args.place_model_on_device + if ( + self.is_model_parallel + or self.is_deepspeed_enabled + or ((args.fp16_full_eval or args.bf16_full_eval) and not args.do_train) + or self.is_fsdp_xla_enabled + or self.is_fsdp_enabled + ): + self.place_model_on_device = False + + default_collator = ( + DataCollatorWithPadding(processing_class) + if processing_class is not None + and isinstance(processing_class, (PreTrainedTokenizerBase, SequenceFeatureExtractor)) + else default_data_collator + ) + self.data_collator = data_collator if data_collator is not None else default_collator + self.train_dataset = train_dataset + self.eval_dataset = eval_dataset + self.processing_class = processing_class + + # Bnb Quantized models doesn't support `.to` operation. + if ( + self.place_model_on_device + and not getattr(model, "quantization_method", None) == QuantizationMethod.BITS_AND_BYTES + ): + self._move_model_to_device(model, args.device) + + # Force n_gpu to 1 to avoid DataParallel as MP will manage the GPUs + if self.is_model_parallel: + self.args._n_gpu = 1 + + # later use `self.model is self.model_wrapped` to check if it's wrapped or not + self.model_wrapped = model + self.model = model + + # Just in case the model was wrapped outside of the `Trainer` + unwrapped_model = self.accelerator.unwrap_model(model) + model_forward = ( + unwrapped_model.forward + if not _is_peft_model(unwrapped_model) + else unwrapped_model.get_base_model().forward + ) + forward_params = inspect.signature(model_forward).parameters + + # Check if the model has explicit setup for loss kwargs, + # if not, check if `**kwargs` are in model.forward + if hasattr(model, "accepts_loss_kwargs"): + self.model_accepts_loss_kwargs = model.accepts_loss_kwargs + else: + self.model_accepts_loss_kwargs = any( + k.kind == inspect.Parameter.VAR_KEYWORD for k in forward_params.values() + ) + + self.neftune_noise_alpha = args.neftune_noise_alpha + + self.compute_metrics = compute_metrics + self.preprocess_logits_for_metrics = preprocess_logits_for_metrics + self.optimizer, self.lr_scheduler = optimizers + self.optimizer_cls_and_kwargs = optimizer_cls_and_kwargs + if self.optimizer_cls_and_kwargs is not None and self.optimizer is not None: + raise RuntimeError("Passing both `optimizers` and `optimizer_cls_and_kwargs` arguments is incompatible.") + if model_init is not None and (self.optimizer is not None or self.lr_scheduler is not None): + raise RuntimeError( + "Passing a `model_init` is incompatible with providing the `optimizers` argument. " + "You should subclass `Trainer` and override the `create_optimizer_and_scheduler` method." + ) + if is_torch_xla_available() and self.optimizer is not None: + for param in self.model.parameters(): + model_device = param.device + break + for param_group in self.optimizer.param_groups: + if len(param_group["params"]) > 0: + optimizer_device = param_group["params"][0].device + break + if model_device != optimizer_device: + raise ValueError( + "The model and the optimizer parameters are not on the same device, which probably means you" + " created an optimizer around your model **before** putting on the device and passing it to the" + " `Trainer`. Make sure the lines `import torch_xla.core.xla_model as xm` and" + " `model.to(xm.xla_device())` is performed before the optimizer creation in your script." + ) + if (self.is_fsdp_xla_enabled or self.is_fsdp_enabled) and ( + self.optimizer is not None or self.lr_scheduler is not None + ): + raise RuntimeError( + "Passing `optimizers` is not allowed if PyTorch FSDP is enabled. " + "You should subclass `Trainer` and override the `create_optimizer_and_scheduler` method." + ) + default_callbacks = DEFAULT_CALLBACKS + get_reporting_integration_callbacks(self.args.report_to) + callbacks = default_callbacks if callbacks is None else default_callbacks + callbacks + self.callback_handler = CallbackHandler( + callbacks, self.model, self.processing_class, self.optimizer, self.lr_scheduler + ) + self.add_callback(PrinterCallback if self.args.disable_tqdm else DEFAULT_PROGRESS_CALLBACK) + + # Will be set to True by `self._setup_loggers()` on first call to `self.log()`. + self._loggers_initialized = False + + # Create distant repo and output directory if needed + self.hub_model_id = None + if self.args.push_to_hub: + self.init_hf_repo() + if self.args.should_save: + os.makedirs(self.args.output_dir, exist_ok=True) + + if not callable(self.data_collator) and callable(getattr(self.data_collator, "collate_batch", None)): + raise ValueError("The `data_collator` should be a simple callable (function, class with `__call__`).") + + if args.max_steps > 0 and args.num_train_epochs > 0: + logger.info("max_steps is given, it will override any value given in num_train_epochs") + + if train_dataset is not None and not has_length(train_dataset) and args.max_steps <= 0: + raise ValueError( + "The train_dataset does not implement __len__, max_steps has to be specified. " + "The number of steps needs to be known in advance for the learning rate scheduler." + ) + + if ( + train_dataset is not None + and isinstance(train_dataset, torch.utils.data.IterableDataset) + and args.group_by_length + ): + raise ValueError("the `--group_by_length` option is only available for `Dataset`, not `IterableDataset") + + self._signature_columns = None + + # Mixed precision setup + self.use_apex = False + self.use_cpu_amp = False + + # Mixed precision setup for SageMaker Model Parallel + if is_sagemaker_mp_enabled(): + # BF16 + model parallelism in SageMaker: currently not supported, raise an error + if args.bf16: + raise ValueError("SageMaker Model Parallelism does not support BF16 yet. Please use FP16 instead ") + + if IS_SAGEMAKER_MP_POST_1_10: + # When there's mismatch between SMP config and trainer argument, use SMP config as truth + if args.fp16 != smp.state.cfg.fp16: + logger.warning( + f"FP16 provided in SM_HP_MP_PARAMETERS is {smp.state.cfg.fp16}, " + f"but FP16 provided in trainer argument is {args.fp16}, " + f"setting to {smp.state.cfg.fp16}" + ) + args.fp16 = smp.state.cfg.fp16 + else: + # smp < 1.10 does not support fp16 in trainer. + if hasattr(smp.state.cfg, "fp16"): + logger.warning( + f"FP16 provided in SM_HP_MP_PARAMETERS is {smp.state.cfg.fp16}, " + "but SageMaker Model Parallelism < 1.10 does not support FP16 in trainer." + ) + if (args.fp16 or args.bf16) and args.half_precision_backend == "auto": + if args.device == torch.device("cpu"): + if args.fp16: + if not is_torch_greater_or_equal_than_2_3: + raise ValueError("Tried to use `fp16` but it is not supported on cpu") + else: + args.half_precision_backend = "cpu_amp" + logger.info(f"Using {args.half_precision_backend} half precision backend") + + if (args.fp16 or args.bf16) and not (self.is_deepspeed_enabled or is_sagemaker_mp_enabled()): + # deepspeed and SageMaker Model Parallel manage their own half precision + if args.half_precision_backend == "cpu_amp": + self.use_cpu_amp = True + self.amp_dtype = torch.bfloat16 + elif args.half_precision_backend == "apex": + if not is_apex_available(): + raise ImportError( + "Using FP16 with APEX but APEX is not installed, please refer to" + " https://www.github.com/nvidia/apex." + ) + self.use_apex = True + + # Label smoothing + if self.args.label_smoothing_factor != 0: + self.label_smoother = LabelSmoother(epsilon=self.args.label_smoothing_factor) + else: + self.label_smoother = None + + self.control = TrainerControl() + + self.state = TrainerState( + is_local_process_zero=self.is_local_process_zero(), + is_world_process_zero=self.is_world_process_zero(), + stateful_callbacks=[ + cb for cb in self.callback_handler.callbacks + [self.control] if isinstance(cb, ExportableState) + ], + ) + # Internal variable to count flos in each process, will be accumulated in `self.state.total_flos` then + # returned to 0 every time flos need to be logged + self.current_flos = 0 + self.hp_search_backend = None + if _is_peft_model(self.model) and self.args.label_names is None: + logger.warning( + f"No label_names provided for model class `{self.model.__class__.__name__}`." + " Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`." + " Note that empty label_names list will be used instead." + ) + default_label_names = find_labels(self.model.__class__) + self.label_names = default_label_names if self.args.label_names is None else self.args.label_names + self.can_return_loss = can_return_loss(self.model.__class__) + self.control = self.callback_handler.on_init_end(self.args, self.state, self.control) + + # Internal variables to help with automatic batch size reduction + self._train_batch_size = args.train_batch_size + self._created_lr_scheduler = False + + # very last + self._memory_tracker.stop_and_update_metrics() + + self.is_fsdp_xla_v2_enabled = args.fsdp_config.get("xla_fsdp_v2", False) + if self.is_fsdp_xla_v2_enabled: + if not IS_XLA_FSDPV2_POST_2_2: + raise ValueError("FSDPv2 requires `torch_xla` 2.2 or higher.") + # Prepare the SPMD mesh that is going to be used by the data loader and the FSDPv2 wrapper. + # Tensor axis is just a placeholder where it will not be used in FSDPv2. + num_devices = xr.global_runtime_device_count() + xs.set_global_mesh(xs.Mesh(np.array(range(num_devices)), (num_devices, 1), axis_names=("fsdp", "tensor"))) + self.is_fsdp_xla_v1_enabled = self.is_fsdp_xla_enabled and not self.is_fsdp_xla_v2_enabled + + @property + def tokenizer(self) -> Optional[PreTrainedTokenizerBase]: + logger.warning("Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.") + return self.processing_class + + @tokenizer.setter + def tokenizer(self, processing_class) -> None: + logger.warning( + "Trainer.tokenizer is now deprecated. You should use `Trainer.processing_class = processing_class` instead." + ) + self.processing_class = processing_class + + def _activate_neftune(self, model): + r""" + Activates the neftune as presented in this code: https://github.com/neelsjain/NEFTune and paper: + https://arxiv.org/abs/2310.05914 + """ + unwrapped_model = self.accelerator.unwrap_model(model) + + if _is_peft_model(unwrapped_model): + embeddings = unwrapped_model.base_model.model.get_input_embeddings() + else: + embeddings = unwrapped_model.get_input_embeddings() + + del unwrapped_model + + embeddings.neftune_noise_alpha = self.neftune_noise_alpha + hook_handle = embeddings.register_forward_hook(neftune_post_forward_hook) + self.neftune_hook_handle = hook_handle + return model + + def _deactivate_neftune(self, model): + """ + Deactivates the neftune method. Make sure to call `_activate_neftune` first. + """ + if not hasattr(self, "neftune_hook_handle"): + raise ValueError("Neftune is not activated make sure to call `trainer._activate_neftune()` first") + + unwrapped_model = self.accelerator.unwrap_model(model) + + if _is_peft_model(unwrapped_model): + embeddings = unwrapped_model.base_model.model.get_input_embeddings() + else: + embeddings = unwrapped_model.get_input_embeddings() + + self.neftune_hook_handle.remove() + del embeddings.neftune_noise_alpha, unwrapped_model + + def add_callback(self, callback): + """ + Add a callback to the current list of [`~transformers.TrainerCallback`]. + + Args: + callback (`type` or [`~transformers.TrainerCallback]`): + A [`~transformers.TrainerCallback`] class or an instance of a [`~transformers.TrainerCallback`]. In the + first case, will instantiate a member of that class. + """ + self.callback_handler.add_callback(callback) + + def pop_callback(self, callback): + """ + Remove a callback from the current list of [`~transformers.TrainerCallback`] and returns it. + + If the callback is not found, returns `None` (and no error is raised). + + Args: + callback (`type` or [`~transformers.TrainerCallback]`): + A [`~transformers.TrainerCallback`] class or an instance of a [`~transformers.TrainerCallback`]. In the + first case, will pop the first member of that class found in the list of callbacks. + + Returns: + [`~transformers.TrainerCallback`]: The callback removed, if found. + """ + return self.callback_handler.pop_callback(callback) + + def remove_callback(self, callback): + """ + Remove a callback from the current list of [`~transformers.TrainerCallback`]. + + Args: + callback (`type` or [`~transformers.TrainerCallback]`): + A [`~transformers.TrainerCallback`] class or an instance of a [`~transformers.TrainerCallback`]. In the + first case, will remove the first member of that class found in the list of callbacks. + """ + self.callback_handler.remove_callback(callback) + + def _move_model_to_device(self, model, device): + model = model.to(device) + # Moving a model to an XLA device disconnects the tied weights, so we have to retie them. + if self.args.parallel_mode == ParallelMode.TPU and hasattr(model, "tie_weights"): + model.tie_weights() + + def _set_signature_columns_if_needed(self): + if self._signature_columns is None: + # Inspect model forward signature to keep only the arguments it accepts. + model_to_inspect = self.model + if _is_peft_model(self.model): + if hasattr(self.model, "get_base_model"): + model_to_inspect = self.model.get_base_model() + else: + # PeftMixedModel do not provide a `get_base_model` method + model_to_inspect = self.model.base_model.model + signature = inspect.signature(model_to_inspect.forward) + self._signature_columns = list(signature.parameters.keys()) + # Labels may be named label or label_ids, the default data collator handles that. + self._signature_columns += list(set(["label", "label_ids"] + self.label_names)) + + def _remove_unused_columns(self, dataset: "datasets.Dataset", description: Optional[str] = None): + if not self.args.remove_unused_columns: + return dataset + self._set_signature_columns_if_needed() + signature_columns = self._signature_columns + + ignored_columns = list(set(dataset.column_names) - set(signature_columns)) + if len(ignored_columns) > 0: + dset_description = "" if description is None else f"in the {description} set" + logger.info( + f"The following columns {dset_description} don't have a corresponding argument in " + f"`{self.model.__class__.__name__}.forward` and have been ignored: {', '.join(ignored_columns)}." + f" If {', '.join(ignored_columns)} are not expected by `{self.model.__class__.__name__}.forward`, " + " you can safely ignore this message." + ) + + columns = [k for k in signature_columns if k in dataset.column_names] + if len(columns) == 0: + raise ValueError( + "No columns in the dataset match the model's forward method signature: ({', '.join(signature_columns)}). " + f"The following columns have been ignored: [{', '.join(ignored_columns)}]. " + "Please check the dataset and model. You may need to set `remove_unused_columns=False` in `TrainingArguments`." + ) + + if version.parse(datasets.__version__) < version.parse("1.4.0"): + dataset.set_format( + type=dataset.format["type"], columns=columns, format_kwargs=dataset.format["format_kwargs"] + ) + return dataset + else: + return dataset.remove_columns(ignored_columns) + + def _get_collator_with_removed_columns( + self, data_collator: Callable, description: Optional[str] = None + ) -> Callable: + """Wrap the data collator in a callable removing unused columns.""" + if not self.args.remove_unused_columns: + return data_collator + self._set_signature_columns_if_needed() + signature_columns = self._signature_columns + + remove_columns_collator = RemoveColumnsCollator( + data_collator=data_collator, + signature_columns=signature_columns, + logger=logger, + description=description, + model_name=self.model.__class__.__name__, + ) + return remove_columns_collator + + def _get_train_sampler(self) -> Optional[torch.utils.data.Sampler]: + if self.train_dataset is None or not has_length(self.train_dataset): + return None + + # Build the sampler. + if self.args.group_by_length: + if is_datasets_available() and isinstance(self.train_dataset, datasets.Dataset): + lengths = ( + self.train_dataset[self.args.length_column_name] + if self.args.length_column_name in self.train_dataset.column_names + else None + ) + else: + lengths = None + model_input_name = ( + self.processing_class.model_input_names[0] if self.processing_class is not None else None + ) + return LengthGroupedSampler( + self.args.train_batch_size * self.args.gradient_accumulation_steps, + dataset=self.train_dataset, + lengths=lengths, + model_input_name=model_input_name, + ) + + else: + return RandomSampler(self.train_dataset) + + def get_train_dataloader(self) -> DataLoader: + """ + Returns the training [`~torch.utils.data.DataLoader`]. + + Will use no sampler if `train_dataset` does not implement `__len__`, a random sampler (adapted to distributed + training if necessary) otherwise. + + Subclass and override this method if you want to inject some custom behavior. + """ + if self.train_dataset is None: + raise ValueError("Trainer: training requires a train_dataset.") + + train_dataset = self.train_dataset + data_collator = self.data_collator + if is_datasets_available() and isinstance(train_dataset, datasets.Dataset): + train_dataset = self._remove_unused_columns(train_dataset, description="training") + else: + data_collator = self._get_collator_with_removed_columns(data_collator, description="training") + + dataloader_params = { + "batch_size": self._train_batch_size, + "collate_fn": data_collator, + "num_workers": self.args.dataloader_num_workers, + "pin_memory": self.args.dataloader_pin_memory, + "persistent_workers": self.args.dataloader_persistent_workers, + } + + if not isinstance(train_dataset, torch.utils.data.IterableDataset): + dataloader_params["sampler"] = self._get_train_sampler() + dataloader_params["drop_last"] = self.args.dataloader_drop_last + dataloader_params["worker_init_fn"] = seed_worker + dataloader_params["prefetch_factor"] = self.args.dataloader_prefetch_factor + + return self.accelerator.prepare(DataLoader(train_dataset, **dataloader_params)) + + def _get_eval_sampler(self, eval_dataset: Dataset) -> Optional[torch.utils.data.Sampler]: + if eval_dataset is None or not has_length(eval_dataset): + return None + # Build the sampler. + + # Deprecated code + if self.args.use_legacy_prediction_loop: + if is_torch_xla_available(): + return SequentialDistributedSampler( + eval_dataset, num_replicas=xm.xrt_world_size(), rank=xm.get_ordinal() + ) + elif is_sagemaker_mp_enabled(): + return SequentialDistributedSampler( + eval_dataset, + num_replicas=smp.dp_size(), + rank=smp.dp_rank(), + batch_size=self.args.per_device_eval_batch_size, + ) + else: + return SequentialSampler(eval_dataset) + + if self.args.group_by_length: + if is_datasets_available() and isinstance(eval_dataset, datasets.Dataset): + lengths = ( + eval_dataset[self.args.length_column_name] + if self.args.length_column_name in eval_dataset.column_names + else None + ) + else: + lengths = None + model_input_name = ( + self.processing_class.model_input_names[0] if self.processing_class is not None else None + ) + return LengthGroupedSampler( + self.args.eval_batch_size, + dataset=eval_dataset, + lengths=lengths, + model_input_name=model_input_name, + ) + + if self.args.world_size <= 1: + return SequentialSampler(eval_dataset) + else: + return None + + def get_eval_dataloader(self, eval_dataset: Optional[Union[str, Dataset]] = None) -> DataLoader: + """ + Returns the evaluation [`~torch.utils.data.DataLoader`]. + + Subclass and override this method if you want to inject some custom behavior. + + Args: + eval_dataset (`str` or `torch.utils.data.Dataset`, *optional*): + If a `str`, will use `self.eval_dataset[eval_dataset]` as the evaluation dataset. If a `Dataset`, will override `self.eval_dataset` and must implement `__len__`. If it is a [`~datasets.Dataset`], columns not accepted by the `model.forward()` method are automatically removed. + """ + if eval_dataset is None and self.eval_dataset is None: + raise ValueError("Trainer: evaluation requires an eval_dataset.") + + # If we have persistent workers, don't do a fork bomb especially as eval datasets + # don't change during training + dataloader_key = eval_dataset if isinstance(eval_dataset, str) else "eval" + if ( + hasattr(self, "_eval_dataloaders") + and dataloader_key in self._eval_dataloaders + and self.args.dataloader_persistent_workers + ): + return self.accelerator.prepare(self._eval_dataloaders[dataloader_key]) + + eval_dataset = ( + self.eval_dataset[eval_dataset] + if isinstance(eval_dataset, str) + else eval_dataset + if eval_dataset is not None + else self.eval_dataset + ) + data_collator = self.data_collator + + if is_datasets_available() and isinstance(eval_dataset, datasets.Dataset): + eval_dataset = self._remove_unused_columns(eval_dataset, description="evaluation") + else: + data_collator = self._get_collator_with_removed_columns(data_collator, description="evaluation") + + dataloader_params = { + "batch_size": self.args.eval_batch_size, + "collate_fn": data_collator, + "num_workers": self.args.dataloader_num_workers, + "pin_memory": self.args.dataloader_pin_memory, + "persistent_workers": self.args.dataloader_persistent_workers, + } + + if not isinstance(eval_dataset, torch.utils.data.IterableDataset): + dataloader_params["sampler"] = self._get_eval_sampler(eval_dataset) + dataloader_params["drop_last"] = self.args.dataloader_drop_last + dataloader_params["prefetch_factor"] = self.args.dataloader_prefetch_factor + + # accelerator.free_memory() will destroy the references, so + # we need to store the non-prepared version + eval_dataloader = DataLoader(eval_dataset, **dataloader_params) + if self.args.dataloader_persistent_workers: + if hasattr(self, "_eval_dataloaders"): + self._eval_dataloaders[dataloader_key] = eval_dataloader + else: + self._eval_dataloaders = {dataloader_key: eval_dataloader} + + return self.accelerator.prepare(eval_dataloader) + + def get_test_dataloader(self, test_dataset: Dataset) -> DataLoader: + """ + Returns the test [`~torch.utils.data.DataLoader`]. + + Subclass and override this method if you want to inject some custom behavior. + + Args: + test_dataset (`torch.utils.data.Dataset`, *optional*): + The test dataset to use. If it is a [`~datasets.Dataset`], columns not accepted by the + `model.forward()` method are automatically removed. It must implement `__len__`. + """ + data_collator = self.data_collator + + if is_datasets_available() and isinstance(test_dataset, datasets.Dataset): + test_dataset = self._remove_unused_columns(test_dataset, description="test") + else: + data_collator = self._get_collator_with_removed_columns(data_collator, description="test") + + dataloader_params = { + "batch_size": self.args.eval_batch_size, + "collate_fn": data_collator, + "num_workers": self.args.dataloader_num_workers, + "pin_memory": self.args.dataloader_pin_memory, + "persistent_workers": self.args.dataloader_persistent_workers, + } + + if not isinstance(test_dataset, torch.utils.data.IterableDataset): + dataloader_params["sampler"] = self._get_eval_sampler(test_dataset) + dataloader_params["drop_last"] = self.args.dataloader_drop_last + dataloader_params["prefetch_factor"] = self.args.dataloader_prefetch_factor + + # We use the same batch_size as for eval. + return self.accelerator.prepare(DataLoader(test_dataset, **dataloader_params)) + + def create_optimizer_and_scheduler(self, num_training_steps: int): + """ + Setup the optimizer and the learning rate scheduler. + + We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the + Trainer's init through `optimizers`, or subclass and override this method (or `create_optimizer` and/or + `create_scheduler`) in a subclass. + """ + self.create_optimizer() + if IS_SAGEMAKER_MP_POST_1_10 and smp.state.cfg.fp16: + # If smp >= 1.10 and fp16 is enabled, we unwrap the optimizer + optimizer = self.optimizer.optimizer + else: + optimizer = self.optimizer + self.create_scheduler(num_training_steps=num_training_steps, optimizer=optimizer) + + def get_decay_parameter_names(self, model) -> list[str]: + """ + Get all parameter names that weight decay will be applied to. + + This function filters out parameters in two ways: + 1. By layer type (instances of layers specified in ALL_LAYERNORM_LAYERS) + 2. By parameter name patterns (containing 'bias', 'layernorm', or 'rmsnorm') + """ + decay_parameters = get_parameter_names(model, ALL_LAYERNORM_LAYERS, ["bias", "layernorm", "rmsnorm"]) + return decay_parameters + + def create_optimizer(self): + """ + Setup the optimizer. + + We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the + Trainer's init through `optimizers`, or subclass and override this method in a subclass. + """ + opt_model = self.model_wrapped if is_sagemaker_mp_enabled() else self.model + + if self.optimizer is None: + decay_parameters = self.get_decay_parameter_names(opt_model) + optimizer_grouped_parameters = [ + { + "params": [ + p for n, p in opt_model.named_parameters() if (n in decay_parameters and p.requires_grad) + ], + "weight_decay": self.args.weight_decay, + }, + { + "params": [ + p for n, p in opt_model.named_parameters() if (n not in decay_parameters and p.requires_grad) + ], + "weight_decay": 0.0, + }, + ] + + if self.optimizer_cls_and_kwargs is not None: + optimizer_cls, optimizer_kwargs = self.optimizer_cls_and_kwargs + else: + optimizer_cls, optimizer_kwargs = self.get_optimizer_cls_and_kwargs(self.args, opt_model) + + # Overwrite `params` in case it's created by `get_optimizer_cls_and_kwargs` + # e.g. for GaLore optimizer. + if "params" in optimizer_kwargs: + optimizer_grouped_parameters = optimizer_kwargs.pop("params") + + # Overwrite `model` in case it's created by `get_optimizer_cls_and_kwargs` + # e.g. for LOMO optimizer. + if "model" in optimizer_kwargs: + optimizer_grouped_parameters = optimizer_kwargs.pop("model") + + # For layer-wise dummy optimizers we overwrite optimizer_grouped_parameters with `optimizer_dict` + # to avoid arguments conflicts. + if "optimizer_dict" in optimizer_kwargs: + optimizer_grouped_parameters = optimizer_kwargs.pop("optimizer_dict") + + self.optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs) + + if "bitsandbytes" in str(optimizer_cls) and optimizer_kwargs.get("optim_bits", None) == 8: + import bitsandbytes + + manager = bitsandbytes.optim.GlobalOptimManager.get_instance() + + skipped = 0 + for module in opt_model.modules(): + if isinstance(module, nn.Embedding): + skipped += sum({p.data_ptr(): p.numel() for p in module.parameters()}.values()) + logger.info(f"skipped {module}: {skipped / 2**20}M params") + manager.register_module_override(module, "weight", {"optim_bits": 32}) + logger.debug(f"bitsandbytes: will optimize {module} in fp32") + logger.info(f"skipped: {skipped / 2**20}M params") + + if is_sagemaker_mp_enabled(): + self.optimizer = smp.DistributedOptimizer(self.optimizer) + + return self.optimizer + + def get_num_trainable_parameters(self): + """ + Get the number of trainable parameters. + """ + return sum(p.numel() for p in self.model.parameters() if p.requires_grad) + + def get_learning_rates(self): + """ + Returns the learning rate of each parameter from self.optimizer. + """ + if self.optimizer is None: + raise ValueError("Trainer optimizer is None, please make sure you have setup the optimizer before.") + return [group["lr"] for group in self.optimizer.param_groups] + + def get_optimizer_group(self, param: Optional[Union[str, torch.nn.parameter.Parameter]] = None): + """ + Returns optimizer group for a parameter if given, else returns all optimizer groups for params. + + Args: + param (`str` or `torch.nn.parameter.Parameter`, *optional*): + The parameter for which optimizer group needs to be returned. + """ + if self.optimizer is None: + raise ValueError("Trainer optimizer is None, please make sure you have setup the optimizer before.") + if param is not None: + for group in self.optimizer.param_groups: + if param in group["params"]: + return group + return [group["params"] for group in self.optimizer.param_groups] + + @staticmethod + def get_optimizer_cls_and_kwargs( + args: TrainingArguments, model: Optional[PreTrainedModel] = None + ) -> tuple[Any, Any]: + """ + Returns the optimizer class and optimizer parameters based on the training arguments. + + Args: + args (`transformers.training_args.TrainingArguments`): + The training arguments for the training session. + + """ + + # parse args.optim_args + optim_args = {} + if args.optim_args: + for mapping in args.optim_args.replace(" ", "").split(","): + key, value = mapping.split("=") + optim_args[key] = value + + optimizer_kwargs = {"lr": args.learning_rate} + + adam_kwargs = { + "betas": (args.adam_beta1, args.adam_beta2), + "eps": args.adam_epsilon, + } + + def setup_low_rank_optimizer( + optimizer_name: str, + optimizer_mapping: dict[str, Any], + optim_kwargs: dict[str, Any], + is_layerwise_supported: bool = True, + ) -> tuple[Any, Any]: + """ + Helper function to set up low-rank optimizers like GaLore and Apollo. + + Args: + optimizer_name (str): Name of the optimizer. + optimizer_mapping (dict): Mapping of optimizer names to their classes. + optim_kwargs (dict): Keyword arguments for the optimizer. + is_layerwise_supported (bool): Whether layerwise optimization is supported. + + Returns: + Tuple[Any, Any]: Optimizer class and updated optimizer kwargs. + """ + is_layerwise = optimizer_name.lower().endswith("layerwise") + if is_layerwise and args.parallel_mode == ParallelMode.DISTRIBUTED and is_layerwise_supported: + raise NotImplementedError(f"Layer-wise {optimizer_name} does not support DDP at this time") + + optimizer_cls = optimizer_mapping[optimizer_name] + + if args.optim_target_modules is None: + raise ValueError(f"You need to define `optim_target_modules` to use {optimizer_name} optimizers") + + if not isinstance(args.optim_target_modules, (list, str)): + raise ValueError( + f"`optim_target_modules` must be a list of strings, a regex string, or 'all-linear'. Got: {args.optim_target_modules}" + ) + + if model is None: + raise ValueError(f"You need to pass a model to initialize {optimizer_name} optimizer.") + + all_linear = ( + isinstance(args.optim_target_modules, str) + and args.optim_target_modules.replace("_", "-") == "all-linear" + ) + + target_params_names = [] + for module_name, module in model.named_modules(): + target_module_exists, is_regex = check_target_module_exists( + args.optim_target_modules, module_name, return_is_regex=True + ) + + if not isinstance(module, nn.Linear): + if target_module_exists and not is_regex: + logger.warning( + f"{module_name} matched but ignored. {optimizer_name} only supports linear layers." + ) + continue + + if not target_module_exists and not all_linear: + continue + + target_params_names.append(module_name + ".weight") + + if len(target_params_names) == 0: + raise ValueError(f"No target modules found for {optimizer_name} ({args.optim_target_modules}).") + + target_params = [p for n, p in model.named_parameters() if n in target_params_names] + non_target_params = [p for n, p in model.named_parameters() if n not in target_params_names] + optim_kwargs.update(optim_args) + + param_groups = [ + {"params": non_target_params}, + {"params": target_params, **optim_kwargs}, + ] + + if is_layerwise: + if args.gradient_accumulation_steps != 1: + raise ValueError(f"Layerwise {optimizer_name} does not support gradient accumulation!") + + optimizer_dict = {} + for param in non_target_params: + optimizer_dict[param] = optimizer_cls([{"params": [param]}], **optimizer_kwargs) + for param in target_params: + optimizer_dict[param] = optimizer_cls([{"params": [param], **optim_kwargs}], **optimizer_kwargs) + + def optimizer_hook(param): + if param.grad is not None: + optimizer_dict[param].step() + optimizer_dict[param].zero_grad() + + for param in model.parameters(): + if param.requires_grad: + param.register_post_accumulate_grad_hook(optimizer_hook) + + optimizer_cls = LayerWiseDummyOptimizer + optimizer_kwargs.update({"optimizer_dict": optimizer_dict}) + + optimizer_kwargs.update({"params": param_groups}) + return optimizer_cls, optimizer_kwargs + + if args.optim == OptimizerNames.ADAFACTOR: + optimizer_cls = Adafactor + optimizer_kwargs.update({"scale_parameter": False, "relative_step": False}) + elif args.optim in [OptimizerNames.ADAMW_TORCH, OptimizerNames.ADAMW_TORCH_FUSED]: + from torch.optim import AdamW + + optimizer_cls = AdamW + optimizer_kwargs.update(adam_kwargs) + if args.optim == OptimizerNames.ADAMW_TORCH_FUSED: + optimizer_kwargs.update({"fused": True}) + elif args.optim == OptimizerNames.ADAMW_TORCH_XLA: + try: + from torch_xla.amp.syncfree import AdamW + + optimizer_cls = AdamW + optimizer_kwargs.update(adam_kwargs) + except ImportError: + raise ValueError("Trainer failed to import syncfree AdamW from torch_xla.") + elif args.optim == OptimizerNames.ADAMW_TORCH_NPU_FUSED: + try: + from torch_npu.optim import NpuFusedAdamW + + optimizer_cls = NpuFusedAdamW + optimizer_kwargs.update(adam_kwargs) + except ImportError: + raise ValueError("Trainer failed to import FusedAdamW from torch_npu.") + elif args.optim == OptimizerNames.ADAMW_APEX_FUSED: + try: + from apex.optimizers import FusedAdam + + optimizer_cls = FusedAdam + optimizer_kwargs.update(adam_kwargs) + except ImportError: + raise ValueError("Trainer tried to instantiate apex FusedAdam but apex is not installed!") + elif args.optim in [ + OptimizerNames.ADAMW_BNB, + OptimizerNames.ADAMW_8BIT, + OptimizerNames.PAGED_ADAMW, + OptimizerNames.PAGED_ADAMW_8BIT, + OptimizerNames.ADEMAMIX, + OptimizerNames.ADEMAMIX_8BIT, + OptimizerNames.PAGED_ADEMAMIX, + OptimizerNames.PAGED_ADEMAMIX_8BIT, + OptimizerNames.LION, + OptimizerNames.LION_8BIT, + OptimizerNames.PAGED_LION, + OptimizerNames.PAGED_LION_8BIT, + OptimizerNames.RMSPROP_BNB, + OptimizerNames.RMSPROP_8BIT, + OptimizerNames.RMSPROP_32BIT, + ]: + try: + from bitsandbytes.optim import AdamW, Lion, RMSprop + + is_paged = False + optim_bits = 32 + optimizer_cls = None + additional_optim_kwargs = adam_kwargs + if "paged" in args.optim: + is_paged = True + if "8bit" in args.optim: + optim_bits = 8 + if "adam" in args.optim: + optimizer_cls = AdamW + elif "lion" in args.optim: + optimizer_cls = Lion + additional_optim_kwargs = {"betas": (args.adam_beta1, args.adam_beta2)} + elif "rmsprop" in args.optim: + optimizer_cls = RMSprop + # Above we pass all `adam_kwargs` to the optimizer, here + # we only pass `optim_args` which can be passed by the user. + additional_optim_kwargs = optim_args + elif "ademamix" in args.optim: + if is_bitsandbytes_available() and version.parse( + importlib.metadata.version("bitsandbytes") + ) < version.parse("0.44.0"): + raise ValueError( + "The AdEMAMix optimizer is not supported by your current version of `bitsandbytes`. " + "Please install `bitsandbytes` >= 0.44.0." + ) + + from bitsandbytes.optim import AdEMAMix + + optimizer_cls = AdEMAMix + additional_optim_kwargs = { + "betas": ( + float(optim_args.get("beta1", args.adam_beta1)), + float(optim_args.get("beta2", args.adam_beta2)), + float(optim_args.get("beta3", 0.9999)), + ), + "alpha": float(optim_args.get("alpha", 5.0)), + "eps": float(optim_args.get("eps", args.adam_epsilon)), + } + + if "t_alpha" in optim_args: + additional_optim_kwargs["t_alpha"] = int(optim_args["t_alpha"]) + + if "t_beta3" in optim_args: + additional_optim_kwargs["t_beta3"] = int(optim_args["t_beta3"]) + + bnb_kwargs = {"optim_bits": optim_bits} + if "rmsprop" not in args.optim: + bnb_kwargs["is_paged"] = is_paged + + optimizer_kwargs.update(additional_optim_kwargs) + optimizer_kwargs.update(bnb_kwargs) + except ImportError: + raise ValueError("Trainer tried to instantiate bnb optimizer but `bitsandbytes` is not installed!") + if is_bitsandbytes_available() and version.parse( + importlib.metadata.version("bitsandbytes") + ) < version.parse("0.41.1"): + logger.warning( + "You are using 8-bit optimizers with a version of `bitsandbytes` < 0.41.1. " + "It is recommended to update your version as a major bug has been fixed in 8-bit optimizers." + ) + elif args.optim == OptimizerNames.ADAMW_ANYPRECISION: + try: + from torchdistx.optimizers import AnyPrecisionAdamW + + optimizer_cls = AnyPrecisionAdamW + optimizer_kwargs.update(adam_kwargs) + + # TODO Change dtypes back to M=FP32, Var = BF16, Kahan = False once they can be cast together in torchdistx. + optimizer_kwargs.update( + { + "use_kahan_summation": strtobool(optim_args.get("use_kahan_summation", "False")), + "momentum_dtype": getattr(torch, optim_args.get("momentum_dtype", "float32")), + "variance_dtype": getattr(torch, optim_args.get("variance_dtype", "float32")), + "compensation_buffer_dtype": getattr( + torch, optim_args.get("compensation_buffer_dtype", "bfloat16") + ), + } + ) + except ImportError: + raise ValueError("Please install https://github.com/pytorch/torchdistx") + elif args.optim == OptimizerNames.SGD: + optimizer_cls = torch.optim.SGD + elif args.optim == OptimizerNames.ADAGRAD: + optimizer_cls = torch.optim.Adagrad + elif args.optim == OptimizerNames.RMSPROP: + optimizer_cls = torch.optim.RMSprop + elif args.optim in [ + OptimizerNames.GALORE_ADAMW, + OptimizerNames.GALORE_ADAMW_8BIT, + OptimizerNames.GALORE_ADAFACTOR, + OptimizerNames.GALORE_ADAMW_LAYERWISE, + OptimizerNames.GALORE_ADAMW_8BIT_LAYERWISE, + OptimizerNames.GALORE_ADAFACTOR_LAYERWISE, + ]: + if not is_galore_torch_available(): + raise ImportError( + "You need to install `galore_torch` in order to use GaLore optimizers" + " install it with `pip install git+https://github.com/jiaweizzhao/GaLore`" + ) + from galore_torch import GaLoreAdafactor, GaLoreAdamW, GaLoreAdamW8bit + + optimizer_mapping = { + OptimizerNames.GALORE_ADAMW: GaLoreAdamW, + OptimizerNames.GALORE_ADAMW_8BIT: GaLoreAdamW8bit, + OptimizerNames.GALORE_ADAFACTOR: GaLoreAdafactor, + OptimizerNames.GALORE_ADAMW_LAYERWISE: GaLoreAdamW, + OptimizerNames.GALORE_ADAMW_8BIT_LAYERWISE: GaLoreAdamW8bit, + OptimizerNames.GALORE_ADAFACTOR_LAYERWISE: GaLoreAdafactor, + } + + galore_optim_kwargs = { + "rank": int(optim_args.pop("rank", 128)), + "update_proj_gap": int(optim_args.pop("update_proj_gap", 200)), + "scale": float(optim_args.pop("scale", 0.25)), + "proj_type": optim_args.pop("proj_type", "std"), + } + + optimizer_cls, optimizer_kwargs = setup_low_rank_optimizer( + args.optim, optimizer_mapping, galore_optim_kwargs + ) + if args.optim == OptimizerNames.GALORE_ADAFACTOR: + optimizer_kwargs.update({"scale_parameter": False, "relative_step": False}) + elif args.optim in [ + OptimizerNames.APOLLO_ADAMW, + OptimizerNames.APOLLO_ADAMW_LAYERWISE, + ]: + if not is_apollo_torch_available(): + raise ImportError( + "You need to install `apollo_torch` in order to use APOLLO optimizers" + " install it with `pip install git+https://github.com/zhuhanqing/APOLLO`" + ) + from apollo_torch import APOLLOAdamW + + optimizer_mapping = { + OptimizerNames.APOLLO_ADAMW: APOLLOAdamW, + OptimizerNames.APOLLO_ADAMW_LAYERWISE: APOLLOAdamW, + } + + apollo_optim_kwargs = { + "rank": int(optim_args.pop("rank", 128)), + "proj": optim_args.pop("proj", "random"), + "scale_type": optim_args.pop("scale_type", "channel"), + "update_proj_gap": int(optim_args.pop("update_proj_gap", 200)), + "scale": float(optim_args.pop("scale", 1.0)), + "proj_type": optim_args.pop("proj_type", "std"), + } + + optimizer_cls, optimizer_kwargs = setup_low_rank_optimizer( + args.optim, optimizer_mapping, apollo_optim_kwargs + ) + elif args.optim in [OptimizerNames.LOMO, OptimizerNames.ADALOMO]: + if not is_lomo_available(): + raise ImportError( + "You need to install `lomo_optim` in order to use LOMO optimizers" + " install it with `pip install lomo-optim`" + ) + if not is_accelerate_available("0.30.0"): + raise ImportError("You need to have `accelerate>=0.30.0` to be able to use LOMO optimizers") + + if model is None: + raise ValueError("You need to pass a `model` in order to correctly initialize a LOMO optimizer.") + + from lomo_optim import AdaLomo, Lomo + + if "ada" in args.optim: + optimizer_cls = AdaLomo + else: + optimizer_cls = Lomo + + optimizer_kwargs.update({"model": model}) + elif args.optim == OptimizerNames.GROKADAMW: + if not is_grokadamw_available(): + raise ValueError("Please install grokadamw with `pip install grokadamw`") + + from grokadamw import GrokAdamW + + optimizer_cls = GrokAdamW + optimizer_kwargs.update( + { + "alpha_init": float(optim_args.get("alpha_init", 0.98)), + "lamb": float(optim_args.get("lamb", 2.0)), + "gamma": float(optim_args.get("gamma", 0.1)), + "grokking_signal_decay_rate": float(optim_args.get("grokking_signal_decay_rate", 0.1)), + "gradient_clipping": float(optim_args.get("gradient_clipping", 1.0)), + } + ) + elif args.optim in [ + OptimizerNames.ADAMW_TORCH_4BIT, + OptimizerNames.ADAMW_TORCH_8BIT, + ]: + if not is_torchao_available() or version.parse(importlib.metadata.version("torchao")) < version.parse( + "0.4.0" + ): + raise ImportError( + "You need to have `torchao>=0.4.0` in order to use torch 4-bit optimizers." + "Install it with `pip install torchao` or follow the instructions here: https://github.com/pytorch/ao" + ) + if version.parse(importlib.metadata.version("torch")) <= version.parse("2.4"): + raise ImportError( + "You need to have `torch>2.4` in order to use torch 4-bit optimizers. " + "Install it with `pip install --upgrade torch` it is available on pipy. Otherwise, you need to install torch nightly." + ) + from torchao.prototype.low_bit_optim import AdamW4bit, AdamW8bit + + if args.optim == OptimizerNames.ADAMW_TORCH_4BIT: + optimizer_cls = AdamW4bit + elif args.optim == OptimizerNames.ADAMW_TORCH_8BIT: + optimizer_cls = AdamW8bit + else: + raise ValueError("Invalid optimizer") + optimizer_kwargs.update(adam_kwargs) + elif args.optim in [ + OptimizerNames.SCHEDULE_FREE_RADAM, + OptimizerNames.SCHEDULE_FREE_ADAMW, + OptimizerNames.SCHEDULE_FREE_SGD, + ]: + if not is_schedulefree_available(): + raise ImportError( + "You need to install `schedulefree` in order to use schedulefree optimizers. " + "Install it with `pip install schedulefree.`" + ) + if not is_accelerate_available("0.30.0"): + raise ImportError("You need to have `accelerate>=0.30.0` to be able to use schedulefree optimizers") + from schedulefree import AdamWScheduleFree, SGDScheduleFree + + additional_optim_kwargs = {} + require_warmup = True + + if args.optim == OptimizerNames.SCHEDULE_FREE_RADAM: + if not is_schedulefree_available("1.4.0"): + raise ImportError( + "You need to install `schedulefree>=1.4.0` in order to use RAdamScheduleFree optimizer. " + "Install it with `pip install schedulefree.`" + ) + from schedulefree import RAdamScheduleFree + + optimizer_cls = RAdamScheduleFree + additional_optim_kwargs = adam_kwargs + require_warmup = False + elif args.optim == OptimizerNames.SCHEDULE_FREE_ADAMW: + optimizer_cls = AdamWScheduleFree + additional_optim_kwargs = adam_kwargs + elif args.optim == OptimizerNames.SCHEDULE_FREE_SGD: + optimizer_cls = SGDScheduleFree + else: + raise ValueError("Invalid schedulefree optimizer") + + additional_optim_kwargs["weight_decay"] = args.weight_decay + if require_warmup: + additional_optim_kwargs["warmup_steps"] = args.warmup_steps + additional_optim_kwargs.update( + { + "weight_lr_power": float(optim_args.get("weight_lr_power", 2.0)), + "r": float(optim_args.get("r", 0.0)), + } + ) + optimizer_kwargs.update(additional_optim_kwargs) + else: + raise ValueError(f"Trainer cannot instantiate unsupported optimizer: {args.optim}") + return optimizer_cls, optimizer_kwargs + + def create_scheduler(self, num_training_steps: int, optimizer: torch.optim.Optimizer = None): + """ + Setup the scheduler. The optimizer of the trainer must have been set up either before this method is called or + passed as an argument. + + Args: + num_training_steps (int): The number of training steps to do. + """ + if self.lr_scheduler is None: + self.lr_scheduler = get_scheduler( + self.args.lr_scheduler_type, + optimizer=self.optimizer if optimizer is None else optimizer, + num_warmup_steps=self.args.get_warmup_steps(num_training_steps), + num_training_steps=num_training_steps, + scheduler_specific_kwargs=self.args.lr_scheduler_kwargs, + ) + self._created_lr_scheduler = True + return self.lr_scheduler + + def num_examples(self, dataloader: DataLoader) -> int: + """ + Helper to get number of samples in a [`~torch.utils.data.DataLoader`] by accessing its dataset. When + dataloader.dataset does not exist or has no length, estimates as best it can + """ + try: + dataset = dataloader.dataset + # Special case for IterableDatasetShard, we need to dig deeper + if isinstance(dataset, IterableDatasetShard): + return len(dataloader.dataset.dataset) + return len(dataloader.dataset) + except (NameError, AttributeError, TypeError): # no dataset or length, estimate by length of dataloader + return len(dataloader) * self.args.per_device_train_batch_size + + @staticmethod + def num_tokens(train_dl: DataLoader, max_steps: Optional[int] = None) -> int: + """ + Helper to get number of tokens in a [`~torch.utils.data.DataLoader`] by enumerating dataloader. + """ + train_tokens = 0 + try: + for batch in train_dl: + tokens = batch["input_ids"].numel() + if max_steps is not None: + return tokens * max_steps + train_tokens += tokens + except KeyError: + logger.warning("Cannot get num_tokens from dataloader") + return train_tokens + + def _hp_search_setup(self, trial: Union["optuna.Trial", dict[str, Any]]): + """HP search setup code""" + self._trial = trial + + if self.hp_search_backend is None or trial is None: + return + if self.hp_search_backend == HPSearchBackend.OPTUNA: + params = self.hp_space(trial) + elif self.hp_search_backend == HPSearchBackend.RAY: + params = trial + params.pop("wandb", None) + elif self.hp_search_backend == HPSearchBackend.SIGOPT: + params = {k: int(v) if isinstance(v, str) else v for k, v in trial.assignments.items()} + elif self.hp_search_backend == HPSearchBackend.WANDB: + params = trial + + for key, value in params.items(): + if not hasattr(self.args, key): + logger.warning( + f"Trying to set {key} in the hyperparameter search but there is no corresponding field in" + " `TrainingArguments`." + ) + continue + old_attr = getattr(self.args, key, None) + # Casting value to the proper type + if old_attr is not None: + value = type(old_attr)(value) + + setattr(self.args, key, value) + if self.hp_search_backend == HPSearchBackend.OPTUNA: + logger.info(f"Trial: {trial.params}") + if self.hp_search_backend == HPSearchBackend.SIGOPT: + logger.info(f"SigOpt Assignments: {trial.assignments}") + if self.hp_search_backend == HPSearchBackend.WANDB: + logger.info(f"W&B Sweep parameters: {trial}") + if self.is_deepspeed_enabled: + if self.args.deepspeed is None: + raise ValueError("For sweeps with deepspeed, `args.deepspeed` must be set") + + self.accelerator.free_memory() + + # Rebuild the deepspeed config to reflect the updated training parameters + from accelerate.utils import DeepSpeedPlugin + + from transformers.integrations.deepspeed import HfTrainerDeepSpeedConfig + + self.args.hf_deepspeed_config = HfTrainerDeepSpeedConfig(self.args.deepspeed) + self.args.hf_deepspeed_config.trainer_config_process(self.args) + self.args.deepspeed_plugin = DeepSpeedPlugin(hf_ds_config=self.args.hf_deepspeed_config) + + # From 1.0 on, we need to fully wipe the DS plugin when doing sweeps. + # Simply calling `_reset_state` is enough and doesn't need a version pin. + AcceleratorState()._reset_state() + + self.create_accelerator_and_postprocess() + + def _report_to_hp_search(self, trial: Union["optuna.Trial", dict[str, Any]], step: int, metrics: dict[str, float]): + if self.hp_search_backend is None or trial is None: + return + metrics = metrics.copy() + self.objective = self.compute_objective(metrics) + if self.hp_search_backend == HPSearchBackend.OPTUNA: + import optuna + + if hasattr(trial, "study") and not trial.study._is_multi_objective(): + trial.report(self.objective, step) + if trial.should_prune(): + self.callback_handler.on_train_end(self.args, self.state, self.control) + raise optuna.TrialPruned() + elif self.hp_search_backend == HPSearchBackend.RAY: + import ray.train + + with tempfile.TemporaryDirectory() as temp_checkpoint_dir: + checkpoint = None + if self.control.should_save: + self._tune_save_checkpoint(checkpoint_dir=temp_checkpoint_dir) + checkpoint = ray.train.Checkpoint.from_directory(temp_checkpoint_dir) + metrics["objective"] = self.objective + ray.train.report(metrics, checkpoint=checkpoint) + + def _tune_save_checkpoint(self, checkpoint_dir: str): + output_dir = os.path.join(checkpoint_dir, f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}") + self.save_model(output_dir, _internal_call=True) + if self.args.should_save: + # Update the `TrainerControl` state to where we are currently + self.state.stateful_callbacks["TrainerControl"] = self.control.state() + self.state.save_to_json(os.path.join(output_dir, TRAINER_STATE_NAME)) + torch.save(self.optimizer.state_dict(), os.path.join(output_dir, OPTIMIZER_NAME)) + torch.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, SCHEDULER_NAME)) + + def call_model_init(self, trial=None): + model_init_argcount = number_of_arguments(self.model_init) + if model_init_argcount == 0: + model = self.model_init() + elif model_init_argcount == 1: + model = self.model_init(trial) + else: + raise RuntimeError("model_init should have 0 or 1 argument.") + + if model is None: + raise RuntimeError("model_init should not return None.") + + return model + + def torch_jit_model_eval(self, model, dataloader, training=False): + if not training: + if dataloader is None: + logger.warning("failed to use PyTorch jit mode due to current dataloader is none.") + return model + example_batch = next(iter(dataloader)) + example_batch = self._prepare_inputs(example_batch) + try: + jit_model = copy.copy(model) + jit_model.eval() + original_forward = jit_model.__dict__.pop("_original_forward", None) + # remove mixed precision hooks from the model + if original_forward: + jit_model.forward = original_forward + autocast_handler = AutocastKwargs(cache_enabled=False) + with self.accelerator.autocast(autocast_handler=autocast_handler), torch.no_grad(): + if isinstance(example_batch, dict): + jit_model = torch.jit.trace(jit_model, example_kwarg_inputs=example_batch, strict=False) + else: + jit_model = torch.jit.trace( + jit_model, + example_kwarg_inputs={key: example_batch[key] for key in example_batch}, + strict=False, + ) + jit_model = torch.jit.freeze(jit_model) + with torch.no_grad(): + jit_model(**example_batch) + jit_model(**example_batch) + model = jit_model + self.use_cpu_amp = False + except (RuntimeError, TypeError, ValueError, NameError, IndexError) as e: + logger.warning(f"failed to use PyTorch jit mode due to: {e}.") + + return model + + def ipex_optimize_model(self, model, training=False, dtype=torch.float32): + if not is_ipex_available(): + raise ImportError( + "Using IPEX but IPEX is not installed or IPEX's version does not match current PyTorch, please refer" + " to https://github.com/intel/intel-extension-for-pytorch." + ) + + import intel_extension_for_pytorch as ipex + + if not training: + model.eval() + dtype = torch.bfloat16 if not self.is_in_train and self.args.bf16_full_eval else dtype + # conv_bn_folding is disabled as it fails in symbolic tracing, resulting in ipex warnings + model = ipex.optimize(model, dtype=dtype, level="O1", conv_bn_folding=False, inplace=not self.is_in_train) + else: + if not model.training: + model.train() + model, self.optimizer = ipex.optimize( + model, dtype=dtype, optimizer=self.optimizer, inplace=True, level="O1" + ) + + return model + + def compare_trainer_and_checkpoint_args(self, training_args, trainer_state): + attributes_map = { + "logging_steps": "logging_steps", + "eval_steps": "eval_steps", + "save_steps": "save_steps", + } + + has_warning = False + warning_str = "Warning: The following arguments do not match the ones in the `trainer_state.json` within the checkpoint directory: " + for arg_attr, state_attr in attributes_map.items(): + arg_value = getattr(training_args, arg_attr, None) + state_value = getattr(trainer_state, state_attr, None) + + if arg_value is not None and state_value is not None and arg_value != state_value: + warning_str += f"\n\t{arg_attr}: {arg_value} (from args) != {state_value} (from trainer_state.json)" + has_warning = True + + # train bs is special as we need to account for multi-GPU + train_bs_args = training_args.per_device_train_batch_size + train_bs_state = trainer_state.train_batch_size // max(1, training_args.n_gpu) + + if train_bs_args != train_bs_state: + warning_str += f"\n\tper_device_train_batch_size: {train_bs_args} (from args) != {train_bs_state} (from trainer_state.json)" + has_warning = True + + if has_warning: + logger.warning_once(warning_str) + + def _wrap_model(self, model, training=True, dataloader=None): + if self.args.use_ipex: + dtype = torch.bfloat16 if self.use_cpu_amp else torch.float32 + model = self.ipex_optimize_model(model, training, dtype=dtype) + + if is_sagemaker_mp_enabled(): + # Wrapping the base model twice in a DistributedModel will raise an error. + if isinstance(self.model_wrapped, smp.model.DistributedModel): + return self.model_wrapped + return smp.DistributedModel(model, backward_passes_per_step=self.args.gradient_accumulation_steps) + + # train/eval could be run multiple-times - if already wrapped, don't re-wrap it again + if self.accelerator.unwrap_model(model) is not model: + return model + + # Mixed precision training with apex + if self.use_apex and training: + model, self.optimizer = amp.initialize(model, self.optimizer, opt_level=self.args.fp16_opt_level) + + # Multi-gpu training (should be after apex fp16 initialization) / 8bit models does not support DDP + if self.args.n_gpu > 1 and not getattr(model, "is_loaded_in_8bit", False): + model = nn.DataParallel(model) + + if self.args.jit_mode_eval: + start_time = time.time() + model = self.torch_jit_model_eval(model, dataloader, training) + self.jit_compilation_time = round(time.time() - start_time, 4) + + # Note: in torch.distributed mode, there's no point in wrapping the model + # inside a DistributedDataParallel as we'll be under `no_grad` anyways. + if not training: + return model + + # Distributed training (should be after apex fp16 initialization) + # Distributed training using PyTorch FSDP + if self.is_fsdp_xla_enabled: + try: + from torch_xla.distributed.fsdp import XlaFullyShardedDataParallel as FSDP + from torch_xla.distributed.fsdp import checkpoint_module + from torch_xla.distributed.fsdp.wrap import ( + size_based_auto_wrap_policy, + transformer_auto_wrap_policy, + ) + + if self.is_fsdp_xla_v2_enabled: + from torch_xla.experimental.spmd_fully_sharded_data_parallel import ( + SpmdFullyShardedDataParallel as FSDPv2, + ) + except ImportError: + raise ImportError("Missing XLA FSDP related module; please make sure to use torch-xla >= 2.0.") + auto_wrap_policy = None + auto_wrapper_callable = None + default_transformer_cls_names_to_wrap = getattr(model, "_no_split_modules", None) + fsdp_transformer_layer_cls_to_wrap = self.args.fsdp_config.get( + "transformer_layer_cls_to_wrap", default_transformer_cls_names_to_wrap + ) + + if self.args.fsdp_config["min_num_params"] > 0: + auto_wrap_policy = functools.partial( + size_based_auto_wrap_policy, min_num_params=self.args.fsdp_config["min_num_params"] + ) + elif fsdp_transformer_layer_cls_to_wrap is not None: + transformer_cls_to_wrap = set() + for layer_class in fsdp_transformer_layer_cls_to_wrap: + transformer_cls = get_module_class_from_name(model, layer_class) + if transformer_cls is None: + raise Exception("Could not find the transformer layer class to wrap in the model.") + else: + transformer_cls_to_wrap.add(transformer_cls) + + auto_wrap_policy = functools.partial( + transformer_auto_wrap_policy, + # Transformer layer class to wrap + transformer_layer_cls=transformer_cls_to_wrap, + ) + fsdp_kwargs = self.args.xla_fsdp_config + if self.args.fsdp_config["xla_fsdp_grad_ckpt"]: + if model.config.use_cache: + logger.warning_once( + "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`." + ) + model.config.use_cache = False + + # Apply gradient checkpointing to auto-wrapped sub-modules if specified + def auto_wrapper_callable(m, *args, **kwargs): + target_cls = FSDP if not self.is_fsdp_xla_v2_enabled else FSDPv2 + return target_cls(checkpoint_module(m), *args, **kwargs) + + # Wrap the base model with an outer FSDP wrapper + if self.is_fsdp_xla_v2_enabled: + + def shard_output(output, mesh): + from .modeling_outputs import CausalLMOutputWithPast + + real_output = None + if isinstance(output, torch.Tensor): + real_output = output + elif isinstance(output, tuple): + real_output = output[0] + elif isinstance(output, CausalLMOutputWithPast): + real_output = output.logits + + if real_output is None: + raise ValueError("Something went wrong, the output of the model shouldn't be `None`") + xs.mark_sharding(real_output, mesh, ("fsdp", None, None)) + + self.model = model = FSDPv2( + model, + shard_output=shard_output, + auto_wrap_policy=auto_wrap_policy, + auto_wrapper_callable=auto_wrapper_callable, + ) + else: + self.model = model = FSDP( + model, + auto_wrap_policy=auto_wrap_policy, + auto_wrapper_callable=auto_wrapper_callable, + **fsdp_kwargs, + ) + + # Patch `xm.optimizer_step` should not reduce gradients in this case, + # as FSDP does not need gradient reduction over sharded parameters. + def patched_optimizer_step(optimizer, barrier=False, optimizer_args={}): + loss = optimizer.step(**optimizer_args) + if barrier: + xm.mark_step() + return loss + + xm.optimizer_step = patched_optimizer_step + elif is_sagemaker_dp_enabled(): + model = nn.parallel.DistributedDataParallel( + model, device_ids=[int(os.getenv("SMDATAPARALLEL_LOCAL_RANK"))] + ) + elif self.args.parallel_mode == ParallelMode.DISTRIBUTED: + if is_torch_neuroncore_available(): + return model + kwargs = {} + if self.args.ddp_find_unused_parameters is not None: + kwargs["find_unused_parameters"] = self.args.ddp_find_unused_parameters + elif isinstance(model, PreTrainedModel): + # find_unused_parameters breaks checkpointing as per + # https://github.com/huggingface/transformers/pull/4659#issuecomment-643356021 + kwargs["find_unused_parameters"] = not model.is_gradient_checkpointing + else: + kwargs["find_unused_parameters"] = True + + if self.args.ddp_bucket_cap_mb is not None: + kwargs["bucket_cap_mb"] = self.args.ddp_bucket_cap_mb + + if self.args.ddp_broadcast_buffers is not None: + kwargs["broadcast_buffers"] = self.args.ddp_broadcast_buffers + + self.accelerator.ddp_handler = DistributedDataParallelKwargs(**kwargs) + + return model + + def train( + self, + resume_from_checkpoint: Optional[Union[str, bool]] = None, + trial: Union["optuna.Trial", dict[str, Any], None] = None, + ignore_keys_for_eval: Optional[list[str]] = None, + **kwargs, + ): + """ + Main training entry point. + + Args: + resume_from_checkpoint (`str` or `bool`, *optional*): + If a `str`, local path to a saved checkpoint as saved by a previous instance of [`Trainer`]. If a + `bool` and equals `True`, load the last checkpoint in *args.output_dir* as saved by a previous instance + of [`Trainer`]. If present, training will resume from the model/optimizer/scheduler states loaded here. + trial (`optuna.Trial` or `Dict[str, Any]`, *optional*): + The trial run or the hyperparameter dictionary for hyperparameter search. + ignore_keys_for_eval (`List[str]`, *optional*) + A list of keys in the output of your model (if it is a dictionary) that should be ignored when + gathering predictions for evaluation during the training. + kwargs (`Dict[str, Any]`, *optional*): + Additional keyword arguments used to hide deprecated arguments + """ + if resume_from_checkpoint is False: + resume_from_checkpoint = None + + # memory metrics - must set up as early as possible + self._memory_tracker.start() + + args = self.args + + self.is_in_train = True + + # Attach NEFTune hooks if necessary + if self.neftune_noise_alpha is not None: + self.model = self._activate_neftune(self.model) + + # do_train is not a reliable argument, as it might not be set and .train() still called, so + # the following is a workaround: + if ( + (args.fp16_full_eval or args.bf16_full_eval) + and not args.do_train + and not self.is_model_parallel + and self.model_init is None + ): + self._move_model_to_device(self.model, args.device) + + if "model_path" in kwargs: + resume_from_checkpoint = kwargs.pop("model_path") + warnings.warn( + "`model_path` is deprecated and will be removed in a future version. Use `resume_from_checkpoint` " + "instead.", + FutureWarning, + ) + if len(kwargs) > 0: + raise TypeError(f"train() got unexpected keyword arguments: {', '.join(list(kwargs.keys()))}.") + # This might change the seed so needs to run first. + self._hp_search_setup(trial) + self._train_batch_size = self.args.train_batch_size + + # Model re-init + model_reloaded = False + if self.model_init is not None: + # Seed must be set before instantiating the model when using model_init. + enable_full_determinism(self.args.seed) if self.args.full_determinism else set_seed(self.args.seed) + self.model = self.call_model_init(trial) + model_reloaded = True + # Reinitializes optimizer and scheduler + self.optimizer, self.lr_scheduler = None, None + + # Load potential model checkpoint + if isinstance(resume_from_checkpoint, bool) and resume_from_checkpoint: + resume_from_checkpoint = get_last_checkpoint(args.output_dir) + if resume_from_checkpoint is None: + raise ValueError(f"No valid checkpoint found in output directory ({args.output_dir})") + + if resume_from_checkpoint is not None: + if not is_sagemaker_mp_enabled() and not self.is_deepspeed_enabled and not self.is_fsdp_enabled: + self._load_from_checkpoint(resume_from_checkpoint) + # In case of repeating the find_executable_batch_size, set `self._train_batch_size` properly + state = TrainerState.load_from_json(os.path.join(resume_from_checkpoint, TRAINER_STATE_NAME)) + if state.train_batch_size is not None: + self._train_batch_size = state.train_batch_size + + # If model was re-initialized, put it on the right device and update self.model_wrapped + if model_reloaded: + if self.place_model_on_device: + self._move_model_to_device(self.model, args.device) + self.model_wrapped = self.model + + inner_training_loop = find_executable_batch_size( + self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size + ) + if args.push_to_hub: + try: + # Disable progress bars when uploading models during checkpoints to avoid polluting stdout + hf_hub_utils.disable_progress_bars() + return inner_training_loop( + args=args, + resume_from_checkpoint=resume_from_checkpoint, + trial=trial, + ignore_keys_for_eval=ignore_keys_for_eval, + ) + finally: + hf_hub_utils.enable_progress_bars() + else: + return inner_training_loop( + args=args, + resume_from_checkpoint=resume_from_checkpoint, + trial=trial, + ignore_keys_for_eval=ignore_keys_for_eval, + ) + + def _inner_training_loop( + self, batch_size=None, args=None, resume_from_checkpoint=None, trial=None, ignore_keys_for_eval=None + ): + self.accelerator.free_memory() + self._train_batch_size = batch_size + if self.args.auto_find_batch_size: + if self.state.train_batch_size != self._train_batch_size: + from accelerate.utils import release_memory + + (self.model_wrapped,) = release_memory(self.model_wrapped) + self.model_wrapped = self.model + + # Check for DeepSpeed *after* the initial pass and modify the config + if self.is_deepspeed_enabled: + # Temporarily unset `self.args.train_batch_size` + original_bs = self.args.per_device_train_batch_size + self.args.per_device_train_batch_size = self._train_batch_size // max(1, self.args.n_gpu) + self.propagate_args_to_deepspeed(True) + self.args.per_device_train_batch_size = original_bs + self.state.train_batch_size = self._train_batch_size + logger.debug(f"Currently training with a batch size of: {self._train_batch_size}") + # Data loader and number of training steps + train_dataloader = self.get_train_dataloader() + if self.is_fsdp_xla_v2_enabled: + train_dataloader = tpu_spmd_dataloader(train_dataloader) + + # Setting up training control variables: + # number of training epochs: num_train_epochs + # number of training steps per epoch: num_update_steps_per_epoch + # total number of training steps to execute: max_steps + total_train_batch_size = self._train_batch_size * args.gradient_accumulation_steps * args.world_size + ( + num_train_epochs, + num_update_steps_per_epoch, + num_examples, + num_train_samples, + epoch_based, + len_dataloader, + max_steps, + ) = self.set_initial_training_values(args, train_dataloader, total_train_batch_size) + + num_train_tokens = None + if self.args.include_tokens_per_second: + num_train_tokens = self.num_tokens(train_dataloader, None if epoch_based else max_steps) + # If going by epochs, multiply tokens linearly + if len_dataloader is not None and epoch_based: + num_train_tokens *= args.num_train_epochs + # Otherwise since its steps, we just multiply by grad accum + else: + num_train_tokens *= args.gradient_accumulation_steps + + if DebugOption.UNDERFLOW_OVERFLOW in self.args.debug: + if self.args.n_gpu > 1: + # nn.DataParallel(model) replicates the model, creating new variables and module + # references registered here no longer work on other gpus, breaking the module + raise ValueError( + "Currently --debug underflow_overflow is not supported under DP. Please use DDP" + " (torchrun or torch.distributed.launch (deprecated))." + ) + else: + debug_overflow = DebugUnderflowOverflow(self.model) # noqa + + delay_optimizer_creation = is_sagemaker_mp_enabled() or self.is_fsdp_xla_enabled or self.is_fsdp_enabled + + # Can't delay optimizer creation when using FSDP2: https://github.com/huggingface/accelerate/blob/3f636d626063ffcf9a337c7d3624d61b7d187d59/src/accelerate/accelerator.py#L1404 + is_fsdp2 = self.is_fsdp_enabled and (getattr(self.accelerator.state.fsdp_plugin, "fsdp_version", 1) == 2) + if is_fsdp2: + delay_optimizer_creation = False + + # We need to reset the scheduler, as its parameters may be different on subsequent calls + if self._created_lr_scheduler: + self.lr_scheduler = None + self._created_lr_scheduler = False + + if self.is_deepspeed_enabled: + self.optimizer, self.lr_scheduler = deepspeed_init(self, num_training_steps=max_steps) + + if not delay_optimizer_creation: + self.create_optimizer_and_scheduler(num_training_steps=max_steps) + + self.state = TrainerState( + stateful_callbacks=[ + cb for cb in self.callback_handler.callbacks + [self.control] if isinstance(cb, ExportableState) + ] + ) + self.state.is_hyper_param_search = trial is not None + self.state.train_batch_size = self._train_batch_size + + # Compute absolute values for logging, eval, and save if given as ratio + self.state.compute_steps(args, max_steps) + + # Activate gradient checkpointing if needed + if args.gradient_checkpointing: + self.model.gradient_checkpointing_enable(gradient_checkpointing_kwargs=args.gradient_checkpointing_kwargs) + + model = self._wrap_model(self.model_wrapped) + + # as the model is wrapped, don't use `accelerator.prepare` + # this is for unhandled cases such as + # FSDP-XLA, SageMaker MP/DP, DataParallel, IPEX + use_accelerator_prepare = True if model is self.model else False + + if use_accelerator_prepare and self.is_fsdp_enabled: + # In case of auto_find_batch_size=True + # Remove FSDP wrapping from sub-models. + self.model = unwrap_model(self.model, recursive=True) + + if delay_optimizer_creation: + if use_accelerator_prepare: + # configure fsdp plugin for qlora if any + self._fsdp_qlora_plugin_updates() + if self.accelerator.mixed_precision != "fp8": + self.model = self.accelerator.prepare(self.model) + self.create_optimizer_and_scheduler(num_training_steps=max_steps) + + # prepare using `accelerator` prepare + if use_accelerator_prepare: + self.model.train() + if hasattr(self.lr_scheduler, "step"): + if self.use_apex: + model = self.accelerator.prepare(self.model) + else: + model, self.optimizer = self.accelerator.prepare(self.model, self.optimizer) + else: + # to handle cases wherein we pass "DummyScheduler" such as when it is specified in DeepSpeed config. + model, self.optimizer, self.lr_scheduler = self.accelerator.prepare( + self.model, self.optimizer, self.lr_scheduler + ) + elif self.args.optim in [OptimizerNames.LOMO, OptimizerNames.ADALOMO]: + # In this case we are in DDP + LOMO, which should be supported + self.optimizer = self.accelerator.prepare(self.optimizer) + + if self.is_fsdp_enabled: + self.model = self.model_wrapped = model + + # for the rest of this function `model` is the outside model, whether it was wrapped or not + if model is not self.model: + self.model_wrapped = model + + # backward compatibility + if self.is_deepspeed_enabled: + self.deepspeed = self.model_wrapped + + # ckpt loading + if resume_from_checkpoint is not None: + if self.is_deepspeed_enabled: + deepspeed_load_checkpoint( + self.model_wrapped, resume_from_checkpoint, load_module_strict=not _is_peft_model(self.model) + ) + elif is_sagemaker_mp_enabled() or self.is_fsdp_enabled: + self._load_from_checkpoint(resume_from_checkpoint, self.model_wrapped) + + # Check if saved optimizer or scheduler states exist + self._load_optimizer_and_scheduler(resume_from_checkpoint) + self._load_scaler(resume_from_checkpoint) + + # important: at this point: + # self.model is the Transformers Model + # self.model_wrapped is DDP(Transformers Model), Deepspeed(Transformers Model), + # FSDP(Transformers Model), Dynamo Optimized Module(Transformers Model) etc. + + # Train! + logger.info("***** Running training *****") + logger.info(f" Num examples = {num_examples:,}") + logger.info(f" Num Epochs = {num_train_epochs:,}") + logger.info(f" Instantaneous batch size per device = {self.args.per_device_train_batch_size:,}") + if self.args.per_device_train_batch_size != self._train_batch_size: + logger.info(f" Training with DataParallel so batch size has been adjusted to: {self._train_batch_size:,}") + logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_train_batch_size:,}") + logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}") + logger.info(f" Total optimization steps = {max_steps:,}") + logger.info(f" Number of trainable parameters = {get_model_param_count(model, trainable_only=True):,}") + + self.state.epoch = 0 + start_time = time.time() + epochs_trained = 0 + steps_trained_in_current_epoch = 0 + steps_trained_progress_bar = None + + # Check if continuing training from a checkpoint + if resume_from_checkpoint is not None and os.path.isfile( + os.path.join(resume_from_checkpoint, TRAINER_STATE_NAME) + ): + self.state = TrainerState.load_from_json(os.path.join(resume_from_checkpoint, TRAINER_STATE_NAME)) + self.compare_trainer_and_checkpoint_args(self.args, self.state) + self._load_callback_state() + epochs_trained = int(self.state.global_step // num_update_steps_per_epoch) + if not args.ignore_data_skip: + steps_trained_in_current_epoch = self.state.global_step % (num_update_steps_per_epoch) + steps_trained_in_current_epoch *= args.gradient_accumulation_steps + else: + steps_trained_in_current_epoch = 0 + + logger.info(" Continuing training from checkpoint, will skip to saved global_step") + logger.info(f" Continuing training from epoch {epochs_trained}") + logger.info(f" Continuing training from global step {self.state.global_step}") + if not args.ignore_data_skip: + logger.info( + f" Will skip the first {epochs_trained} epochs then the first" + f" {steps_trained_in_current_epoch} batches in the first epoch." + ) + + # Update the references + for attr in ("model", "optimizer", "lr_scheduler"): + setattr(self.callback_handler, attr, getattr(self, attr)) + self.callback_handler.train_dataloader = train_dataloader + + self.state.init_training_references(self, max_steps, num_train_epochs, trial) + + # tr_loss is a tensor to avoid synchronization of TPUs through .item() + tr_loss = torch.tensor(0.0, device=args.device) + # _total_loss_scalar is updated everytime .item() has to be called on tr_loss and stores the sum of all losses + self._total_loss_scalar = 0.0 + self._globalstep_last_logged = self.state.global_step + model.zero_grad() + grad_norm: Optional[float] = None + learning_rate = None + self.control = self.callback_handler.on_train_begin(args, self.state, self.control) + + if args.eval_on_start: + self._evaluate(trial, ignore_keys_for_eval, skip_scheduler=True) + + for epoch in range(epochs_trained, num_train_epochs): + epoch_dataloader = train_dataloader + if hasattr(epoch_dataloader, "set_epoch"): + epoch_dataloader.set_epoch(epoch) + + # Reset the past mems state at the beginning of each epoch if necessary. + if args.past_index >= 0: + self._past = None + + steps_in_epoch = ( + len(epoch_dataloader) + if len_dataloader is not None + else args.max_steps * args.gradient_accumulation_steps + ) + self.control = self.callback_handler.on_epoch_begin(args, self.state, self.control) + + if epoch == epochs_trained and resume_from_checkpoint is not None and steps_trained_in_current_epoch == 0: + self._load_rng_state(resume_from_checkpoint) + + rng_to_sync = False + steps_skipped = 0 + if steps_trained_in_current_epoch > 0: + epoch_dataloader = skip_first_batches(epoch_dataloader, steps_trained_in_current_epoch) + steps_skipped = steps_trained_in_current_epoch + steps_trained_in_current_epoch = 0 + rng_to_sync = True + + step = -1 + epoch_iterator = iter(epoch_dataloader) + # We chunkify the epoch iterator into gradient accumulation steps `n` batches + remainder = num_examples % args.gradient_accumulation_steps + if remainder == 0: + remainder = args.gradient_accumulation_steps + update_step = -1 + total_updates = steps_in_epoch // args.gradient_accumulation_steps + 1 + if args.gradient_accumulation_steps == 1: + total_updates -= 1 + for _ in range(total_updates): + update_step += 1 + num_batches = args.gradient_accumulation_steps if update_step != (total_updates - 1) else remainder + batch_samples, num_items_in_batch = self.get_batch_samples(epoch_iterator, num_batches, args.device) + for i, inputs in enumerate(batch_samples): + step += 1 + do_sync_step = (step + 1) % args.gradient_accumulation_steps == 0 or (step + 1) == steps_in_epoch + # Since we perform prefetching, we need to manually set sync_gradients + self.accelerator.gradient_state._set_sync_gradients(do_sync_step) + + if self.args.include_num_input_tokens_seen: + main_input_name = getattr(self.model, "main_input_name", "input_ids") + if main_input_name not in inputs: + logger.warning( + "Tried to track the number of tokens seen, however the current model is " + "not configured properly to know what item is the input. To fix this, add " + "a `main_input_name` attribute to the model class you are using." + ) + else: + input_tokens = inputs[main_input_name].numel() + input_tokens = torch.tensor(input_tokens, device=self.args.device, dtype=torch.int64) + self.state.num_input_tokens_seen += self.accelerator.gather(input_tokens).sum().item() + if rng_to_sync: + self._load_rng_state(resume_from_checkpoint) + rng_to_sync = False + + # Skip past any already trained steps if resuming training + if steps_trained_in_current_epoch > 0: + steps_trained_in_current_epoch -= 1 + if steps_trained_progress_bar is not None: + steps_trained_progress_bar.update(1) + if steps_trained_in_current_epoch == 0: + self._load_rng_state(resume_from_checkpoint) + continue + elif steps_trained_progress_bar is not None: + steps_trained_progress_bar.close() + steps_trained_progress_bar = None + + if step % args.gradient_accumulation_steps == 0: + self.control = self.callback_handler.on_step_begin(args, self.state, self.control) + + # We explicitly want to avoid relying on `accelerator.accumulate` for generation training + context = ( + functools.partial(self.accelerator.no_sync, model=model) + if i != len(batch_samples) - 1 + and self.accelerator.distributed_type != DistributedType.DEEPSPEED + else contextlib.nullcontext + ) + with context(): + tr_loss_step = self.training_step(model, inputs, num_items_in_batch) + + if ( + args.logging_nan_inf_filter + and not is_torch_xla_available() + and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step)) + ): + # if loss is nan or inf simply add the average of previous logged losses + tr_loss = tr_loss + tr_loss / (1 + self.state.global_step - self._globalstep_last_logged) + else: + if tr_loss.device != tr_loss_step.device: + raise ValueError( + f"Calculated loss must be on the original device: {tr_loss.device} but device in use is {tr_loss_step.device}" + ) + tr_loss = tr_loss + tr_loss_step + + self.current_flos += float(self.floating_point_ops(inputs)) + + if do_sync_step: + # Since we perform prefetching, we need to manually set sync_gradients to True + self.accelerator.gradient_state._set_sync_gradients(True) + + # Gradient clipping + if args.max_grad_norm is not None and args.max_grad_norm > 0: + if is_sagemaker_mp_enabled() and args.fp16: + _grad_norm = self.optimizer.clip_master_grads(args.max_grad_norm) + elif self.use_apex: + # Revert to normal clipping otherwise, handling Apex or full precision + _grad_norm = nn.utils.clip_grad_norm_( + amp.master_params(self.optimizer), + args.max_grad_norm, + ) + else: + _grad_norm = self.accelerator.clip_grad_norm_( + model.parameters(), + args.max_grad_norm, + ) + + if ( + is_accelerate_available() + and self.accelerator.distributed_type == DistributedType.DEEPSPEED + ): + grad_norm = model.get_global_grad_norm() + # In some cases the grad norm may not return a float + if hasattr(grad_norm, "item"): + grad_norm = grad_norm.item() + else: + grad_norm = _grad_norm + + self.control = self.callback_handler.on_pre_optimizer_step(args, self.state, self.control) + + self.optimizer.step() + + self.control = self.callback_handler.on_optimizer_step(args, self.state, self.control) + + # get leaning rate before update + learning_rate = self._get_learning_rate() + + if not self.accelerator.optimizer_step_was_skipped: + # Delay optimizer scheduling until metrics are generated + if not isinstance(self.lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau): + self.lr_scheduler.step() + + model.zero_grad() + self.state.global_step += 1 + self.state.epoch = epoch + (step + 1 + steps_skipped) / steps_in_epoch + self.control = self.callback_handler.on_step_end(args, self.state, self.control) + self._maybe_log_save_evaluate( + tr_loss, + grad_norm, + model, + trial, + epoch, + ignore_keys_for_eval, + start_time, + learning_rate=learning_rate, + ) + else: + self.control = self.callback_handler.on_substep_end(args, self.state, self.control) + + # PyTorch/XLA relies on the data loader to insert the mark_step for + # each step. Since we are breaking the loop early, we need to manually + # insert the mark_step here. + if self.control.should_epoch_stop or self.control.should_training_stop: + if is_torch_xla_available(): + xm.mark_step() + break + # We also need to break out of the nested loop + if self.control.should_epoch_stop or self.control.should_training_stop: + if is_torch_xla_available(): + xm.mark_step() + break + if step < 0: + logger.warning( + "There seems not to be a single sample in your epoch_iterator, stopping training at step" + f" {self.state.global_step}! This is expected if you're using an IterableDataset and set" + f" num_steps ({max_steps}) higher than the number of available samples." + ) + self.control.should_training_stop = True + + self.control = self.callback_handler.on_epoch_end(args, self.state, self.control) + self._maybe_log_save_evaluate( + tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval, start_time, learning_rate=learning_rate + ) + + if DebugOption.TPU_METRICS_DEBUG in self.args.debug: + if is_torch_xla_available(): + # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.) + xm.master_print(met.metrics_report()) + else: + logger.warning( + "You enabled PyTorch/XLA debug metrics but you don't have a TPU " + "configured. Check your training configuration if this is unexpected." + ) + if self.control.should_training_stop: + break + + if args.past_index and hasattr(self, "_past"): + # Clean the state at the end of training + delattr(self, "_past") + + logger.info("\n\nTraining completed. Do not forget to share your model on huggingface.co/models =)\n\n") + if args.load_best_model_at_end and self.state.best_model_checkpoint is not None: + # Wait for everyone to get here so we are sure the model has been saved by process 0. + if is_torch_xla_available(): + xm.rendezvous("load_best_model_at_end") + elif args.parallel_mode == ParallelMode.DISTRIBUTED: + dist.barrier() + elif is_sagemaker_mp_enabled(): + smp.barrier() + + self._load_best_model() + + # add remaining tr_loss + self._total_loss_scalar += tr_loss.item() + effective_global_step = max(self.state.global_step, 0.001) # Avoid ZeroDivisionError + train_loss = self._total_loss_scalar / effective_global_step + + metrics = speed_metrics( + "train", + start_time, + num_samples=num_train_samples, + num_steps=self.state.max_steps, + num_tokens=num_train_tokens, + ) + self.store_flos() + metrics["total_flos"] = self.state.total_flos + metrics["train_loss"] = train_loss + + self.is_in_train = False + + self._memory_tracker.stop_and_update_metrics(metrics) + + self.log(metrics) + + run_dir = self._get_output_dir(trial) + checkpoints_sorted = self._sorted_checkpoints(use_mtime=False, output_dir=run_dir) + + # Delete the last checkpoint when save_total_limit=1 if it's different from the best checkpoint and process allowed to save. + if self.args.should_save and self.state.best_model_checkpoint is not None and self.args.save_total_limit == 1: + for checkpoint in checkpoints_sorted: + if not os.path.samefile(checkpoint, self.state.best_model_checkpoint): + logger.info(f"Deleting older checkpoint [{checkpoint}] due to args.save_total_limit") + shutil.rmtree(checkpoint, ignore_errors=True) + + self.control = self.callback_handler.on_train_end(args, self.state, self.control) + + # Wait for the checkpoint to be uploaded. + self._finish_current_push() + + # After training we make sure to retrieve back the original forward pass method + # for the embedding layer by removing the forward post hook. + if self.neftune_noise_alpha is not None: + self._deactivate_neftune(self.model) + + return TrainOutput(self.state.global_step, train_loss, metrics) + + def _get_output_dir(self, trial): + if self.hp_search_backend is not None and trial is not None: + if self.hp_search_backend == HPSearchBackend.OPTUNA: + run_id = trial.number + elif self.hp_search_backend == HPSearchBackend.RAY: + import ray.train + + run_id = ray.train.get_context().get_trial_id() + elif self.hp_search_backend == HPSearchBackend.SIGOPT: + run_id = trial.id + elif self.hp_search_backend == HPSearchBackend.WANDB: + import wandb + + run_id = wandb.run.id + run_name = self.hp_name(trial) if self.hp_name is not None else f"run-{run_id}" + run_dir = os.path.join(self.args.output_dir, run_name) + else: + run_dir = self.args.output_dir + return run_dir + + def _load_from_checkpoint(self, resume_from_checkpoint, model=None): + if model is None: + model = self.model + + config_file = os.path.join(resume_from_checkpoint, CONFIG_NAME) + adapter_weights_file = os.path.join(resume_from_checkpoint, ADAPTER_WEIGHTS_NAME) + adapter_safe_weights_file = os.path.join(resume_from_checkpoint, ADAPTER_SAFE_WEIGHTS_NAME) + weights_file = os.path.join(resume_from_checkpoint, WEIGHTS_NAME) + weights_index_file = os.path.join(resume_from_checkpoint, WEIGHTS_INDEX_NAME) + safe_weights_file = os.path.join(resume_from_checkpoint, SAFE_WEIGHTS_NAME) + safe_weights_index_file = os.path.join(resume_from_checkpoint, SAFE_WEIGHTS_INDEX_NAME) + is_fsdp_ckpt = os.path.isdir(resume_from_checkpoint) and ( + # this checks the FSDP state dict when `SHARDED_STATE_DICT` is used + any( + FSDP_MODEL_NAME in folder_name + for folder_name in os.listdir(resume_from_checkpoint) + if os.path.isdir(os.path.join(resume_from_checkpoint, folder_name)) + ) + # this checks the FSDP state dict when `FULL_STATE_DICT` is used + or os.path.isfile(os.path.join(resume_from_checkpoint, f"{FSDP_MODEL_NAME}.bin")) + ) + # if multiple adapters exist, they get saved in sub directories + adapter_subdirs = ( + [ + folder_name + for folder_name in os.listdir(resume_from_checkpoint) + if os.path.isdir(os.path.join(resume_from_checkpoint, folder_name)) + and ( + os.path.isfile(os.path.join(resume_from_checkpoint, folder_name, ADAPTER_WEIGHTS_NAME)) + or os.path.isfile(os.path.join(resume_from_checkpoint, folder_name, ADAPTER_SAFE_WEIGHTS_NAME)) + ) + ] + if os.path.isdir(resume_from_checkpoint) + else [] + ) + + if is_fsdp_ckpt and not self.is_fsdp_enabled: + raise ValueError(f"Checkpoint found at {resume_from_checkpoint} is only supported when using PyTorch FSDP") + + if not ( + any( + os.path.isfile(f) + for f in [ + weights_file, + safe_weights_file, + weights_index_file, + safe_weights_index_file, + adapter_weights_file, + adapter_safe_weights_file, + ] + ) + or is_fsdp_ckpt + or adapter_subdirs + ): + raise ValueError(f"Can't find a valid checkpoint at {resume_from_checkpoint}") + + logger.info(f"Loading model from {resume_from_checkpoint}.") + + if os.path.isfile(config_file): + config = PretrainedConfig.from_json_file(config_file) + checkpoint_version = config.transformers_version + if checkpoint_version is not None and checkpoint_version != __version__: + logger.warning( + f"You are resuming training from a checkpoint trained with {checkpoint_version} of " + f"Transformers but your current version is {__version__}. This is not recommended and could " + "yield to errors or unwanted behaviors." + ) + + if os.path.isfile(weights_file) or os.path.isfile(safe_weights_file) or is_fsdp_ckpt: + # If the model is on the GPU, it still works! + if is_sagemaker_mp_enabled(): + if os.path.isfile(os.path.join(resume_from_checkpoint, "user_content.pt")): + # If the 'user_content.pt' file exists, load with the new smp api. + # Checkpoint must have been saved with the new smp api. + smp.resume_from_checkpoint( + path=resume_from_checkpoint, tag=WEIGHTS_NAME, partial=False, load_optimizer=False + ) + else: + # If the 'user_content.pt' file does NOT exist, load with the old smp api. + # Checkpoint must have been saved with the old smp api. + if hasattr(self.args, "fp16") and self.args.fp16 is True: + logger.warning( + "Enabling FP16 and loading from smp < 1.10 checkpoint together is not supported." + ) + state_dict = torch.load(weights_file, map_location="cpu", weights_only=True) + # Required for smp to not auto-translate state_dict from hf to smp (is already smp). + state_dict["_smp_is_partial"] = False + load_result = model.load_state_dict(state_dict, strict=True) + # release memory + del state_dict + elif self.is_fsdp_enabled: + load_fsdp_model( + self.accelerator.state.fsdp_plugin, + self.accelerator, + model, + resume_from_checkpoint, + **_get_fsdp_ckpt_kwargs(), + ) + else: + # We load the model state dict on the CPU to avoid an OOM error. + if self.args.save_safetensors and os.path.isfile(safe_weights_file): + state_dict = safetensors.torch.load_file(safe_weights_file, device="cpu") + else: + state_dict = torch.load(weights_file, map_location="cpu", weights_only=True) + + # workaround for FSDP bug https://github.com/pytorch/pytorch/issues/82963 + # which takes *args instead of **kwargs + load_result = model.load_state_dict(state_dict, False) + # release memory + del state_dict + self._issue_warnings_after_load(load_result) + + # Load adapters following PR # 24096 + elif _is_peft_model(model): + # If train a model using PEFT & LoRA, assume that adapter have been saved properly. + # TODO: in the future support only specific min PEFT versions + if (hasattr(model, "active_adapter") or hasattr(model, "active_adapters")) and hasattr( + model, "load_adapter" + ): + if os.path.exists(resume_from_checkpoint): + # For BC for older PEFT versions + if hasattr(model, "active_adapters"): + active_adapters = model.active_adapters + if len(active_adapters) > 1: + logger.warning("Multiple active adapters detected will only consider the first adapter") + active_adapter = active_adapters[0] + else: + active_adapter = model.active_adapter + + if adapter_subdirs: + for subdir_name in adapter_subdirs: + peft_id = os.path.join(resume_from_checkpoint, subdir_name) + model.load_adapter(peft_id, subdir_name, is_trainable=(subdir_name == active_adapter)) + model.set_adapter(active_adapter) + else: + model.load_adapter(resume_from_checkpoint, active_adapter, is_trainable=True) + else: + logger.warning( + "The intermediate checkpoints of PEFT may not be saved correctly, " + f"consider using a custom callback to save {ADAPTER_WEIGHTS_NAME} in corresponding saving folders. " + "Check some examples here: https://github.com/huggingface/peft/issues/96" + ) + else: + logger.warning("Could not load adapter model, make sure to have `peft>=0.3.0` installed") + else: + # We load the sharded checkpoint + load_result = load_sharded_checkpoint( + model, resume_from_checkpoint, strict=is_sagemaker_mp_enabled(), prefer_safe=self.args.save_safetensors + ) + if not is_sagemaker_mp_enabled(): + self._issue_warnings_after_load(load_result) + + def _load_best_model(self): + logger.info(f"Loading best model from {self.state.best_model_checkpoint} (score: {self.state.best_metric}).") + best_model_path = os.path.join(self.state.best_model_checkpoint, WEIGHTS_NAME) + best_safe_model_path = os.path.join(self.state.best_model_checkpoint, SAFE_WEIGHTS_NAME) + best_adapter_model_path = os.path.join(self.state.best_model_checkpoint, ADAPTER_WEIGHTS_NAME) + best_safe_adapter_model_path = os.path.join(self.state.best_model_checkpoint, ADAPTER_SAFE_WEIGHTS_NAME) + + model = self.model_wrapped if is_sagemaker_mp_enabled() else self.model + if self.is_deepspeed_enabled: + deepspeed_load_checkpoint( + self.model_wrapped, + self.state.best_model_checkpoint, + load_module_strict=not _is_peft_model(self.model), + ) + elif self.is_fsdp_enabled: + load_result = load_fsdp_model( + self.accelerator.state.fsdp_plugin, + self.accelerator, + model, + self.state.best_model_checkpoint, + **_get_fsdp_ckpt_kwargs(), + ) + elif ( + os.path.exists(best_model_path) + or os.path.exists(best_safe_model_path) + or os.path.exists(best_adapter_model_path) + or os.path.exists(best_safe_adapter_model_path) + ): + has_been_loaded = True + if is_sagemaker_mp_enabled(): + if os.path.isfile(os.path.join(self.state.best_model_checkpoint, "user_content.pt")): + # If the 'user_content.pt' file exists, load with the new smp api. + # Checkpoint must have been saved with the new smp api. + smp.resume_from_checkpoint( + path=self.state.best_model_checkpoint, + tag=WEIGHTS_NAME, + partial=False, + load_optimizer=False, + ) + else: + # If the 'user_content.pt' file does NOT exist, load with the old smp api. + # Checkpoint must have been saved with the old smp api. + if self.args.save_safetensors and os.path.isfile(best_safe_model_path): + state_dict = safetensors.torch.load_file(best_safe_model_path, device="cpu") + else: + state_dict = torch.load(best_model_path, map_location="cpu", weights_only=True) + + state_dict["_smp_is_partial"] = False + load_result = model.load_state_dict(state_dict, strict=True) + else: + if _is_peft_model(model): + # If train a model using PEFT & LoRA, assume that adapter have been saved properly. + # TODO: in the future support only specific min PEFT versions + if (hasattr(model, "active_adapter") or hasattr(model, "active_adapters")) and hasattr( + model, "load_adapter" + ): + # For BC for older PEFT versions + if hasattr(model, "active_adapters"): + active_adapter = model.active_adapters[0] + if len(model.active_adapters) > 1: + logger.warning("Detected multiple active adapters, will only consider the first one") + else: + active_adapter = model.active_adapter + + if os.path.exists(best_adapter_model_path) or os.path.exists(best_safe_adapter_model_path): + try: + model.load_adapter(self.state.best_model_checkpoint, active_adapter) + except RuntimeError as exc: + if model.peft_config[active_adapter].is_prompt_learning: + # for context: https://github.com/huggingface/peft/issues/2256 + msg = ( + "When using prompt learning PEFT methods such as " + f"{model.peft_config[active_adapter].peft_type.value}, setting " + "load_best_model_at_end=True can lead to errors, it is recommended " + "to set this to False and to load the model manually from the checkpoint " + "directory using PeftModel.from_pretrained(base_model, ) after training " + "has finished." + ) + raise RuntimeError(msg) from exc + else: + raise + # Load_adapter has no return value present, modify it when appropriate. + from torch.nn.modules.module import _IncompatibleKeys + + load_result = _IncompatibleKeys([], []) + else: + logger.warning( + "The intermediate checkpoints of PEFT may not be saved correctly, " + f"consider using a custom callback to save {ADAPTER_WEIGHTS_NAME} in corresponding saving folders. " + "Check some examples here: https://github.com/huggingface/peft/issues/96" + ) + has_been_loaded = False + else: + logger.warning("Could not load adapter model, make sure to have `peft>=0.3.0` installed") + has_been_loaded = False + else: + # We load the model state dict on the CPU to avoid an OOM error. + if self.args.save_safetensors and os.path.isfile(best_safe_model_path): + state_dict = safetensors.torch.load_file(best_safe_model_path, device="cpu") + else: + state_dict = torch.load(best_model_path, map_location="cpu", weights_only=True) + + # If the model is on the GPU, it still works! + # workaround for FSDP bug https://github.com/pytorch/pytorch/issues/82963 + # which takes *args instead of **kwargs + load_result = model.load_state_dict(state_dict, False) + if not is_sagemaker_mp_enabled() and has_been_loaded: + self._issue_warnings_after_load(load_result) + elif os.path.exists(os.path.join(self.state.best_model_checkpoint, SAFE_WEIGHTS_INDEX_NAME)) or os.path.exists( + os.path.join(self.state.best_model_checkpoint, WEIGHTS_INDEX_NAME) + ): + load_result = load_sharded_checkpoint( + model, self.state.best_model_checkpoint, strict=is_sagemaker_mp_enabled() + ) + if not is_sagemaker_mp_enabled(): + self._issue_warnings_after_load(load_result) + else: + logger.warning( + f"Could not locate the best model at {best_model_path}, if you are running a distributed training " + "on multiple nodes, you should activate `--save_on_each_node`." + ) + + def _issue_warnings_after_load(self, load_result): + if len(load_result.missing_keys) != 0: + if self.model._keys_to_ignore_on_save is not None and set(load_result.missing_keys) == set( + self.model._keys_to_ignore_on_save + ): + self.model.tie_weights() + else: + logger.warning(f"There were missing keys in the checkpoint model loaded: {load_result.missing_keys}.") + if len(load_result.unexpected_keys) != 0: + logger.warning( + f"There were unexpected keys in the checkpoint model loaded: {load_result.unexpected_keys}." + ) + + def _evaluate(self, trial, ignore_keys_for_eval, skip_scheduler=False): + metrics = self.evaluate(ignore_keys=ignore_keys_for_eval) + self._report_to_hp_search(trial, self.state.global_step, metrics) + + # Run delayed LR scheduler now that metrics are populated + if isinstance(self.lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau) and not skip_scheduler: + metric_to_check = self.args.metric_for_best_model + if not metric_to_check.startswith("eval_"): + metric_to_check = f"eval_{metric_to_check}" + try: + self.lr_scheduler.step(metrics[metric_to_check]) + except KeyError as exc: + raise KeyError( + f"The `metric_for_best_model` training argument is set to '{metric_to_check}', " + f"which is not found in the evaluation metrics. " + f"The available evaluation metrics are: {list(metrics.keys())}. " + f"Please ensure that the `compute_metrics` function returns a dictionary that includes '{metric_to_check}' or " + f"consider changing the `metric_for_best_model` via the TrainingArguments." + ) from exc + return metrics + + def _maybe_log_save_evaluate( + self, tr_loss, grad_norm, model, trial, epoch, ignore_keys_for_eval, start_time, learning_rate=None + ): + if self.control.should_log and self.state.global_step > self._globalstep_last_logged: + if is_torch_xla_available(): + xm.mark_step() + + logs: dict[str, float] = {} + + # all_gather + mean() to get average loss over all processes + tr_loss_scalar = self._nested_gather(tr_loss).mean().item() + + # reset tr_loss to zero + tr_loss -= tr_loss + + logs["loss"] = round(tr_loss_scalar / (self.state.global_step - self._globalstep_last_logged), 4) + if grad_norm is not None: + logs["grad_norm"] = grad_norm.item() if isinstance(grad_norm, torch.Tensor) else grad_norm + if learning_rate is not None: + logs["learning_rate"] = ( + learning_rate.item() if isinstance(learning_rate, torch.Tensor) else learning_rate + ) + else: + logs["learning_rate"] = self._get_learning_rate() + + self._total_loss_scalar += tr_loss_scalar + self._globalstep_last_logged = self.state.global_step + self.store_flos() + + self.log(logs, start_time) + + metrics = None + if self.control.should_evaluate: + metrics = self._evaluate(trial, ignore_keys_for_eval) + is_new_best_metric = self._determine_best_metric(metrics=metrics, trial=trial) + + if self.args.save_strategy == SaveStrategy.BEST: + self.control.should_save = is_new_best_metric + + if self.control.should_save: + self._save_checkpoint(model, trial) + self.control = self.callback_handler.on_save(self.args, self.state, self.control) + + def _load_rng_state(self, checkpoint): + # Load RNG states from `checkpoint` + if checkpoint is None: + return + + if self.args.world_size > 1: + process_index = self.args.process_index + rng_file = os.path.join(checkpoint, f"rng_state_{process_index}.pth") + if not os.path.isfile(rng_file): + logger.info( + f"Didn't find an RNG file for process {process_index}, if you are resuming a training that " + "wasn't launched in a distributed fashion, reproducibility is not guaranteed." + ) + return + else: + rng_file = os.path.join(checkpoint, "rng_state.pth") + if not os.path.isfile(rng_file): + logger.info( + "Didn't find an RNG file, if you are resuming a training that was launched in a distributed " + "fashion, reproducibility is not guaranteed." + ) + return + + with safe_globals(): + checkpoint_rng_state = torch.load(rng_file, weights_only=True) + random.setstate(checkpoint_rng_state["python"]) + np.random.set_state(checkpoint_rng_state["numpy"]) + torch.random.set_rng_state(checkpoint_rng_state["cpu"]) + if is_torch_xla_available(): + xm.set_rng_state(checkpoint_rng_state["xla"]) + + is_distributed = self.args.parallel_mode == ParallelMode.DISTRIBUTED + if torch.cuda.is_available(): + set_rng_state_for_device("CUDA", torch.cuda, checkpoint_rng_state, is_distributed) + if is_torch_npu_available(): + set_rng_state_for_device("NPU", torch.npu, checkpoint_rng_state, is_distributed) + if is_torch_hpu_available(): + set_rng_state_for_device("HPU", torch.hpu, checkpoint_rng_state, is_distributed) + if is_torch_mlu_available(): + set_rng_state_for_device("MLU", torch.mlu, checkpoint_rng_state, is_distributed) + if is_torch_musa_available(): + set_rng_state_for_device("MUSA", torch.musa, checkpoint_rng_state, is_distributed) + + def _determine_best_metric(self, metrics, trial): + """ + Determine if the model should be saved based on the evaluation metrics. + + Returns: + bool: True if a new best metric was found, else False + """ + is_new_best_metric = False + + if self.args.metric_for_best_model is not None: + metric_to_check = self.args.metric_for_best_model + + if not metric_to_check.startswith("eval_"): + metric_to_check = f"eval_{metric_to_check}" + + try: + metric_value = metrics[metric_to_check] + except KeyError as exc: + raise KeyError( + f"The `metric_for_best_model` training argument is set to '{metric_to_check}', which is not found in the evaluation metrics. " + f"The available evaluation metrics are: {list(metrics.keys())}. Consider changing the `metric_for_best_model` via the TrainingArguments." + ) from exc + + operator = np.greater if self.args.greater_is_better else np.less + + if self.state.best_metric is None: + self.state.best_metric = float("-inf") if self.args.greater_is_better else float("inf") + + if operator(metric_value, self.state.best_metric): + self.state.best_metric = metric_value + + if self.args.save_strategy in [SaveStrategy.STEPS, SaveStrategy.EPOCH]: + self.state.best_global_step = self.state.global_step + + is_new_best_metric = True + + return is_new_best_metric + + def _save_checkpoint(self, model, trial): + # In all cases, including ddp/dp/deepspeed, self.model is always a reference to the model we + # want to save except FullyShardedDDP. + # assert unwrap_model(model) is self.model, "internal model should be a reference to self.model" + + # Save model checkpoint + checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}" + + if self.hp_search_backend is None and trial is None: + self.store_flos() + + run_dir = self._get_output_dir(trial=trial) + output_dir = os.path.join(run_dir, checkpoint_folder) + self.save_model(output_dir, _internal_call=True) + + if self.args.save_strategy in [SaveStrategy.STEPS, SaveStrategy.EPOCH] and self.state.best_global_step: + best_checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.state.best_global_step}" + best_checkpoint_dir = os.path.join(run_dir, best_checkpoint_folder) + + if os.path.exists(best_checkpoint_dir): + self.state.best_model_checkpoint = best_checkpoint_dir + + if not self.args.save_only_model: + # Save optimizer and scheduler + self._save_optimizer_and_scheduler(output_dir) + self._save_scaler(output_dir) + # Save RNG state + self._save_rng_state(output_dir) + + # Save the Trainer state + if self.args.should_save: + # Update `ExportableState` callbacks and `TrainerControl` state to where we are currently + for cb in [ + cb for cb in self.callback_handler.callbacks + [self.control] if isinstance(cb, ExportableState) + ]: + cb_name = cb.__class__.__name__ + cb_state = cb.state() + if isinstance(self.state.stateful_callbacks[cb_name], list): + self.state.stateful_callbacks[cb_name].append(cb_state) + else: + self.state.stateful_callbacks[cb_name] = cb_state + self.state.save_to_json(os.path.join(output_dir, TRAINER_STATE_NAME)) + + if self.args.push_to_hub: + self._push_from_checkpoint(output_dir) + + # Maybe delete some older checkpoints. + if self.args.should_save: + # we use mtime as default, filesystems without mtime support will be detected in `_sorted_checkpoints` + self._rotate_checkpoints(use_mtime=True, output_dir=run_dir) + + def _save_rng_state(self, output_dir): + # Save RNG state in non-distributed training + rng_states = { + "python": random.getstate(), + "numpy": np.random.get_state(), + "cpu": torch.random.get_rng_state(), + } + if torch.cuda.is_available(): + if self.args.parallel_mode == ParallelMode.DISTRIBUTED: + # In non distributed, we save the global CUDA RNG state (will take care of DataParallel) + rng_states["cuda"] = torch.cuda.random.get_rng_state_all() + else: + rng_states["cuda"] = torch.cuda.random.get_rng_state() + + if is_torch_xla_available(): + rng_states["xla"] = xm.get_rng_state() + + if is_torch_npu_available(): + if self.args.parallel_mode == ParallelMode.DISTRIBUTED: + rng_states["npu"] = torch.npu.random.get_rng_state_all() + else: + rng_states["npu"] = torch.npu.random.get_rng_state() + + if is_torch_hpu_available(): + if self.args.parallel_mode == ParallelMode.DISTRIBUTED: + rng_states["hpu"] = torch.hpu.random.get_rng_state_all() + else: + rng_states["hpu"] = torch.hpu.random.get_rng_state() + + if is_torch_mlu_available(): + if self.args.parallel_mode == ParallelMode.DISTRIBUTED: + rng_states["mlu"] = torch.mlu.random.get_rng_state_all() + else: + rng_states["mlu"] = torch.mlu.random.get_rng_state() + + if is_torch_musa_available(): + if self.args.parallel_mode == ParallelMode.DISTRIBUTED: + rng_states["musa"] = torch.musa.get_rng_state_all() + else: + rng_states["musa"] = torch.musa.get_rng_state() + + # A process can arrive here before the process 0 has a chance to save the model, in which case output_dir may + # not yet exist. + os.makedirs(output_dir, exist_ok=True) + + if self.args.world_size <= 1: + torch.save(rng_states, os.path.join(output_dir, "rng_state.pth")) + else: + torch.save(rng_states, os.path.join(output_dir, f"rng_state_{self.args.process_index}.pth")) + + def _save_optimizer_and_scheduler(self, output_dir): + if is_torch_xla_available(): + xm.rendezvous("saving_optimizer_states") + if self.is_fsdp_xla_v1_enabled: + optm = { + "optimizer": self.optimizer.state_dict(), + "shard_metadata": self.model.get_shard_metadata(), + } + xm.save( + optm, + os.path.join( + output_dir, f"rank{self.args.process_index}-of-{self.args.world_size}-{OPTIMIZER_NAME}" + ), + master_only=False, + ) + else: + xm.save(self.optimizer.state_dict(), os.path.join(output_dir, OPTIMIZER_NAME)) + with warnings.catch_warnings(record=True) as caught_warnings: + xm.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, SCHEDULER_NAME)) + reissue_pt_warnings(caught_warnings) + elif is_sagemaker_mp_enabled(): + opt_state_dict = self.optimizer.local_state_dict(gather_if_shard=False) + smp.barrier() + if smp.rdp_rank() == 0 or smp.state.cfg.shard_optimizer_state: + smp.save( + opt_state_dict, + os.path.join(output_dir, OPTIMIZER_NAME), + partial=True, + v3=smp.state.cfg.shard_optimizer_state, + ) + elif self.is_deepspeed_enabled: + # under zero3 model file itself doesn't get saved since it's bogus! Unless deepspeed + # config `stage3_gather_16bit_weights_on_model_save` is True + accept_exclude_frozen_parameters = "exclude_frozen_parameters" in set( + inspect.signature(self.model_wrapped.save_checkpoint).parameters.keys() + ) + if accept_exclude_frozen_parameters and _is_peft_model(self.model): + self.model_wrapped.save_checkpoint(output_dir, exclude_frozen_parameters=True) + else: + self.model_wrapped.save_checkpoint(output_dir) + elif self.is_fsdp_enabled: + # save fsdp specific ckpt for resuming from ckpt + save_fsdp_model( + self.accelerator.state.fsdp_plugin, self.accelerator, self.model, output_dir, **_get_fsdp_ckpt_kwargs() + ) + save_fsdp_optimizer( + self.accelerator.state.fsdp_plugin, self.accelerator, self.optimizer, self.model, output_dir + ) + elif self.args.should_save: + # deepspeed.save_checkpoint above saves model/optim/sched + torch.save(self.optimizer.state_dict(), os.path.join(output_dir, OPTIMIZER_NAME)) + + # Save SCHEDULER & SCALER + is_deepspeed_custom_scheduler = self.is_deepspeed_enabled and not isinstance( + self.lr_scheduler, DeepSpeedSchedulerWrapper + ) + if ( + self.args.should_save + and (not self.is_deepspeed_enabled or is_deepspeed_custom_scheduler) + and not is_torch_xla_available() + ): + with warnings.catch_warnings(record=True) as caught_warnings: + torch.save(self.lr_scheduler.state_dict(), os.path.join(output_dir, SCHEDULER_NAME)) + reissue_pt_warnings(caught_warnings) + + def _load_optimizer_and_scheduler(self, checkpoint): + """If optimizer and scheduler states exist, load them.""" + if checkpoint is None: + return + + if self.is_deepspeed_enabled: + # deepspeed loads optimizer/lr_scheduler together with the model in deepspeed_init + if not isinstance(self.lr_scheduler, DeepSpeedSchedulerWrapper): + with warnings.catch_warnings(record=True) as caught_warnings: + self.lr_scheduler.load_state_dict( + torch.load(os.path.join(checkpoint, SCHEDULER_NAME), weights_only=True) + ) + reissue_pt_warnings(caught_warnings) + return + + checkpoint_file_exists = ( + glob.glob(os.path.join(checkpoint, OPTIMIZER_NAME) + "_*") + if is_sagemaker_mp_enabled() + else ( + os.path.isfile(os.path.join(checkpoint, OPTIMIZER_NAME)) + or os.path.isfile(os.path.join(checkpoint, OPTIMIZER_NAME_BIN)) + or ( + os.path.isdir(checkpoint) + and any( + OPTIMIZER_NAME_BIN.split(".")[0] in folder_name + for folder_name in os.listdir(checkpoint) + if os.path.isdir(os.path.join(checkpoint, folder_name)) + ) + ) + ) + ) + checkpoint_file_exists = ( + glob.glob(os.path.join(checkpoint, f"rank*-of-{self.args.world_size}-{OPTIMIZER_NAME}")) + if self.is_fsdp_xla_v1_enabled + else checkpoint_file_exists + ) + if checkpoint_file_exists and os.path.isfile(os.path.join(checkpoint, SCHEDULER_NAME)): + # Load in optimizer and scheduler states + if is_torch_xla_available(): + # On TPU we have to take some extra precautions to properly load the states on the right device. + if self.is_fsdp_xla_v1_enabled: + optimizer_state = torch.load( + os.path.join( + checkpoint, f"rank{self.args.process_index}-of-{self.args.world_size}-{OPTIMIZER_NAME}" + ), + map_location="cpu", + weights_only=True, + ) + # We only need `optimizer` when resuming from checkpoint + optimizer_state = optimizer_state["optimizer"] + else: + optimizer_state = torch.load( + os.path.join(checkpoint, OPTIMIZER_NAME), map_location="cpu", weights_only=True + ) + with warnings.catch_warnings(record=True) as caught_warnings: + lr_scheduler_state = torch.load( + os.path.join(checkpoint, SCHEDULER_NAME), map_location="cpu", weights_only=True + ) + reissue_pt_warnings(caught_warnings) + + xm.send_cpu_data_to_device(optimizer_state, self.args.device) + xm.send_cpu_data_to_device(lr_scheduler_state, self.args.device) + + self.optimizer.load_state_dict(optimizer_state) + self.lr_scheduler.load_state_dict(lr_scheduler_state) + else: + if is_sagemaker_mp_enabled(): + if os.path.isfile(os.path.join(checkpoint, "user_content.pt")): + # Optimizer checkpoint was saved with smp >= 1.10 + def opt_load_hook(mod, opt): + opt.load_state_dict(smp.load(os.path.join(checkpoint, OPTIMIZER_NAME), partial=True)) + + else: + # Optimizer checkpoint was saved with smp < 1.10 + def opt_load_hook(mod, opt): + if IS_SAGEMAKER_MP_POST_1_10: + opt.load_state_dict( + smp.load(os.path.join(checkpoint, OPTIMIZER_NAME), partial=True, back_compat=True) + ) + else: + opt.load_state_dict(smp.load(os.path.join(checkpoint, OPTIMIZER_NAME), partial=True)) + + self.model_wrapped.register_post_step_hook(opt_load_hook) + else: + # We use the CPU when training on one GPU to avoid OOM for GPU RAM when training big models. + # In distributed training however, we load directly on each GPU and risk the GPU OOM as it's more + # likely to get OOM on CPU (since we load num_gpu times the optimizer state + map_location = self.args.device if self.args.world_size > 1 else "cpu" + if self.is_fsdp_enabled: + load_fsdp_optimizer( + self.accelerator.state.fsdp_plugin, + self.accelerator, + self.optimizer, + self.model, + checkpoint, + **_get_fsdp_ckpt_kwargs(), + ) + else: + self.optimizer.load_state_dict( + torch.load( + os.path.join(checkpoint, OPTIMIZER_NAME), map_location=map_location, weights_only=True + ) + ) + with warnings.catch_warnings(record=True) as caught_warnings: + self.lr_scheduler.load_state_dict( + torch.load(os.path.join(checkpoint, SCHEDULER_NAME), weights_only=True) + ) + reissue_pt_warnings(caught_warnings) + + def _save_scaler(self, output_dir): + # See if there is a scaler attribute + try: + scaler = self.accelerator.scaler + except AttributeError: + return + if scaler is None: + return + if is_torch_xla_available(): + xm.rendezvous("saving_scaler_state") + with warnings.catch_warnings(record=True) as caught_warnings: + xm.save(self.accelerator.scaler.state_dict(), os.path.join(output_dir, SCALER_NAME)) + reissue_pt_warnings(caught_warnings) + + # Save SCALER + if self.args.should_save and not is_torch_xla_available(): + with warnings.catch_warnings(record=True) as caught_warnings: + torch.save(self.accelerator.scaler.state_dict(), os.path.join(output_dir, SCALER_NAME)) + reissue_pt_warnings(caught_warnings) + + def _load_scaler(self, checkpoint): + """If scaler state exists, load it.""" + if checkpoint is None: + return + + checkpoint_file_exists = os.path.isfile(os.path.join(checkpoint, SCALER_NAME)) + + if checkpoint_file_exists: + # On TPU we have to take some extra precautions to properly load the states on the right device. + # Load in scaler states + if is_torch_xla_available(): + with warnings.catch_warnings(record=True) as caught_warnings: + scaler_state = torch.load( + os.path.join(checkpoint, SCALER_NAME), map_location="cpu", weights_only=True + ) + reissue_pt_warnings(caught_warnings) + xm.send_cpu_data_to_device(scaler_state, self.args.device) + self.accelerator.scaler.load_state_dict(scaler_state) + else: + with warnings.catch_warnings(record=True) as caught_warnings: + self.accelerator.scaler.load_state_dict( + torch.load(os.path.join(checkpoint, SCALER_NAME), weights_only=True) + ) + reissue_pt_warnings(caught_warnings) + + def _load_callback_state(self): + """If callback states exist and were passed in, restore their states if enabled""" + if not self.args.restore_callback_states_from_checkpoint: + return + # Callback states are stored in stateful_callbacks + not_found = [] + new_callbacks = [] + original_callbacks = self.callback_handler.callbacks + [self.control] + for stored_callback, data in self.state.stateful_callbacks.items(): + if not isinstance(data, list): + data = [data] + if any(callback.__class__.__name__ == stored_callback for callback in original_callbacks): + # We can load/restore from multiple callbacks of the same type. + duplicates = [ + callback for callback in original_callbacks if callback.__class__.__name__ == stored_callback + ] + for callback, callback_data in zip(duplicates, data): + args = callback_data.get("args", {}) + attributes = callback_data.get("attributes", {}) + new_callback = type(callback)(**args) + for attribute, value in attributes.items(): + setattr(new_callback, attribute, value) + if isinstance(callback, TrainerControl): + # Specifically for restoring the `control` state + self.control = new_callback + else: + new_callbacks.append(new_callback) + # We remove the existing callback and add it to the list of new callbacks + self.callback_handler.remove_callback(type(new_callback)) + logger.info("Continuing training from checkpoint, restoring any callbacks that were passed in") + else: + not_found.append(stored_callback) + if len(not_found) > 0: + logger.warning( + f"Checkpoint included callbacks not included in current configuration. Ignoring. ({', '.join(not_found)})" + ) + for callback in new_callbacks: + self.callback_handler.add_callback(callback) + + def hyperparameter_search( + self, + hp_space: Optional[Callable[["optuna.Trial"], dict[str, float]]] = None, + compute_objective: Optional[Callable[[dict[str, float]], float]] = None, + n_trials: int = 20, + direction: Union[str, list[str]] = "minimize", + backend: Optional[Union["str", HPSearchBackend]] = None, + hp_name: Optional[Callable[["optuna.Trial"], str]] = None, + **kwargs, + ) -> Union[BestRun, list[BestRun]]: + """ + Launch an hyperparameter search using `optuna` or `Ray Tune` or `SigOpt`. The optimized quantity is determined + by `compute_objective`, which defaults to a function returning the evaluation loss when no metric is provided, + the sum of all metrics otherwise. + + + + To use this method, you need to have provided a `model_init` when initializing your [`Trainer`]: we need to + reinitialize the model at each new run. This is incompatible with the `optimizers` argument, so you need to + subclass [`Trainer`] and override the method [`~Trainer.create_optimizer_and_scheduler`] for custom + optimizer/scheduler. + + + + Args: + hp_space (`Callable[["optuna.Trial"], Dict[str, float]]`, *optional*): + A function that defines the hyperparameter search space. Will default to + [`~trainer_utils.default_hp_space_optuna`] or [`~trainer_utils.default_hp_space_ray`] or + [`~trainer_utils.default_hp_space_sigopt`] depending on your backend. + compute_objective (`Callable[[Dict[str, float]], float]`, *optional*): + A function computing the objective to minimize or maximize from the metrics returned by the `evaluate` + method. Will default to [`~trainer_utils.default_compute_objective`]. + n_trials (`int`, *optional*, defaults to 100): + The number of trial runs to test. + direction (`str` or `List[str]`, *optional*, defaults to `"minimize"`): + If it's single objective optimization, direction is `str`, can be `"minimize"` or `"maximize"`, you + should pick `"minimize"` when optimizing the validation loss, `"maximize"` when optimizing one or + several metrics. If it's multi objectives optimization, direction is `List[str]`, can be List of + `"minimize"` and `"maximize"`, you should pick `"minimize"` when optimizing the validation loss, + `"maximize"` when optimizing one or several metrics. + backend (`str` or [`~training_utils.HPSearchBackend`], *optional*): + The backend to use for hyperparameter search. Will default to optuna or Ray Tune or SigOpt, depending + on which one is installed. If all are installed, will default to optuna. + hp_name (`Callable[["optuna.Trial"], str]]`, *optional*): + A function that defines the trial/run name. Will default to None. + kwargs (`Dict[str, Any]`, *optional*): + Additional keyword arguments for each backend: + + - `optuna`: parameters from + [optuna.study.create_study](https://optuna.readthedocs.io/en/stable/reference/generated/optuna.study.create_study.html) + and also the parameters `timeout`, `n_jobs` and `gc_after_trial` from + [optuna.study.Study.optimize](https://optuna.readthedocs.io/en/stable/reference/generated/optuna.study.Study.html#optuna.study.Study.optimize) + - `ray`: parameters from [tune.run](https://docs.ray.io/en/latest/tune/api_docs/execution.html#tune-run). + If `resources_per_trial` is not set in the `kwargs`, it defaults to 1 CPU core and 1 GPU (if available). + If `progress_reporter` is not set in the `kwargs`, + [ray.tune.CLIReporter](https://docs.ray.io/en/latest/tune/api/doc/ray.tune.CLIReporter.html) is used. + - `sigopt`: the parameter `proxies` from + [sigopt.Connection.set_proxies](https://docs.sigopt.com/support/faq#how-do-i-use-sigopt-with-a-proxy). + + Returns: + [`trainer_utils.BestRun` or `List[trainer_utils.BestRun]`]: All the information about the best run or best + runs for multi-objective optimization. Experiment summary can be found in `run_summary` attribute for Ray + backend. + """ + if backend is None: + backend = default_hp_search_backend() + backend = HPSearchBackend(backend) + backend_obj = ALL_HYPERPARAMETER_SEARCH_BACKENDS[backend]() + backend_obj.ensure_available() + self.hp_search_backend = backend + if self.model_init is None: + raise RuntimeError( + "To use hyperparameter search, you need to pass your model through a model_init function." + ) + + self.hp_space = backend_obj.default_hp_space if hp_space is None else hp_space + self.hp_name = hp_name + self.compute_objective = default_compute_objective if compute_objective is None else compute_objective + + best_run = backend_obj.run(self, n_trials, direction, **kwargs) + + self.hp_search_backend = None + return best_run + + def log(self, logs: dict[str, float], start_time: Optional[float] = None) -> None: + """ + Log `logs` on the various objects watching training. + + Subclass and override this method to inject custom behavior. + + Args: + logs (`Dict[str, float]`): + The values to log. + start_time (`Optional[float]`): + The start of training. + """ + if self.state.epoch is not None: + logs["epoch"] = self.state.epoch + if self.args.include_num_input_tokens_seen: + logs["num_input_tokens_seen"] = self.state.num_input_tokens_seen + if start_time is not None: + speed_metrics("train", start_time, num_tokens=self.state.num_input_tokens_seen) + + output = {**logs, **{"step": self.state.global_step}} + self.state.log_history.append(output) + self.control = self.callback_handler.on_log(self.args, self.state, self.control, logs) + + def _prepare_input(self, data: Union[torch.Tensor, Any]) -> Union[torch.Tensor, Any]: + """ + Prepares one `data` before feeding it to the model, be it a tensor or a nested list/dictionary of tensors. + """ + if isinstance(data, Mapping): + return type(data)({k: self._prepare_input(v) for k, v in data.items()}) + elif isinstance(data, (tuple, list)): + return type(data)(self._prepare_input(v) for v in data) + elif isinstance(data, torch.Tensor): + kwargs = {"device": self.args.device} + if self.is_deepspeed_enabled and (torch.is_floating_point(data) or torch.is_complex(data)): + # NLP models inputs are int/uint and those get adjusted to the right dtype of the + # embedding. Other models such as wav2vec2's inputs are already float and thus + # may need special handling to match the dtypes of the model + kwargs.update({"dtype": self.accelerator.state.deepspeed_plugin.hf_ds_config.dtype()}) + return data.to(**kwargs) + return data + + def _prepare_inputs(self, inputs: dict[str, Union[torch.Tensor, Any]]) -> dict[str, Union[torch.Tensor, Any]]: + """ + Prepare `inputs` before feeding them to the model, converting them to tensors if they are not already and + handling potential state. + """ + inputs = self._prepare_input(inputs) + if len(inputs) == 0: + raise ValueError( + "The batch received was empty, your model won't be able to train on it. Double-check that your " + f"training dataset contains keys expected by the model: {','.join(self._signature_columns)}." + ) + if self.args.past_index >= 0 and self._past is not None: + inputs["mems"] = self._past + + return inputs + + def compute_loss_context_manager(self): + """ + A helper wrapper to group together context managers. + """ + return self.autocast_smart_context_manager() + + def autocast_smart_context_manager(self, cache_enabled: Optional[bool] = True): + """ + A helper wrapper that creates an appropriate context manager for `autocast` while feeding it the desired + arguments, depending on the situation. + """ + if self.use_cpu_amp: + ctx_manager = torch.amp.autocast("cpu", cache_enabled=cache_enabled, dtype=self.amp_dtype) + else: + ctx_manager = contextlib.nullcontext() + + return ctx_manager + + def training_step( + self, model: nn.Module, inputs: dict[str, Union[torch.Tensor, Any]], num_items_in_batch=None + ) -> torch.Tensor: + """ + Perform a training step on a batch of inputs. + + Subclass and override to inject custom behavior. + + Args: + model (`nn.Module`): + The model to train. + inputs (`Dict[str, Union[torch.Tensor, Any]]`): + The inputs and targets of the model. + + The dictionary will be unpacked before being fed to the model. Most models expect the targets under the + argument `labels`. Check your model's documentation for all accepted arguments. + + Return: + `torch.Tensor`: The tensor with training loss on this batch. + """ + model.train() + if hasattr(self.optimizer, "train") and callable(self.optimizer.train): + self.optimizer.train() + + inputs = self._prepare_inputs(inputs) + if is_sagemaker_mp_enabled(): + loss_mb = smp_forward_backward(model, inputs, self.args.gradient_accumulation_steps) + return loss_mb.reduce_mean().detach().to(self.args.device) + + with self.compute_loss_context_manager(): + loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch) + + del inputs + if ( + self.args.torch_empty_cache_steps is not None + and self.state.global_step % self.args.torch_empty_cache_steps == 0 + ): + if is_torch_xpu_available(): + torch.xpu.empty_cache() + elif is_torch_mlu_available(): + torch.mlu.empty_cache() + elif is_torch_musa_available(): + torch.musa.empty_cache() + elif is_torch_npu_available(): + torch.npu.empty_cache() + elif is_torch_mps_available(): + torch.mps.empty_cache() + elif is_torch_hpu_available(): + logger.warning( + "`torch_empty_cache_steps` is set but HPU device/backend does not support empty_cache()." + ) + else: + torch.cuda.empty_cache() + + kwargs = {} + + # For LOMO optimizers you need to explicitly use the learnign rate + if self.args.optim in [OptimizerNames.LOMO, OptimizerNames.ADALOMO]: + kwargs["learning_rate"] = self._get_learning_rate() + + if self.args.n_gpu > 1: + loss = loss.mean() # mean() to average on multi-gpu parallel training + + if self.use_apex: + with amp.scale_loss(loss, self.optimizer) as scaled_loss: + scaled_loss.backward() + else: + # Finally we need to normalize the loss for reporting + if not self.model_accepts_loss_kwargs and self.compute_loss_func is None: + loss = loss / self.args.gradient_accumulation_steps + + # Turning off loss scaling w.r.t. gradient accumulation when DeepSpeed is enabled + # https://github.com/huggingface/transformers/pull/35808 + if self.accelerator.distributed_type == DistributedType.DEEPSPEED: + kwargs["scale_wrt_gas"] = False + + self.accelerator.backward(loss, **kwargs) + + return loss.detach() + + def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None): + """ + How the loss is computed by Trainer. By default, all models return the loss in the first element. + + Subclass and override for custom behavior. + """ + if (self.label_smoother is not None or self.compute_loss_func is not None) and "labels" in inputs: + labels = inputs.pop("labels") + else: + labels = None + if self.model_accepts_loss_kwargs: + loss_kwargs = {} + if num_items_in_batch is not None: + loss_kwargs["num_items_in_batch"] = num_items_in_batch + inputs = {**inputs, **loss_kwargs} + outputs = model(**inputs) + # Save past state if it exists + # TODO: this needs to be fixed and made cleaner later. + if self.args.past_index >= 0: + self._past = outputs[self.args.past_index] + + if labels is not None: + unwrapped_model = self.accelerator.unwrap_model(model) + if _is_peft_model(unwrapped_model): + model_name = unwrapped_model.base_model.model._get_name() + else: + model_name = unwrapped_model._get_name() + # User-defined compute_loss function + if self.compute_loss_func is not None: + loss = self.compute_loss_func(outputs, labels, num_items_in_batch=num_items_in_batch) + elif model_name in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values(): + loss = self.label_smoother(outputs, labels, shift_labels=True) + else: + loss = self.label_smoother(outputs, labels) + else: + if isinstance(outputs, dict) and "loss" not in outputs: + raise ValueError( + "The model did not return a loss from the inputs, only the following keys: " + f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}." + ) + # We don't use .loss here since the model may return tuples instead of ModelOutput. + loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0] + + if ( + self.args.average_tokens_across_devices + and (self.model_accepts_loss_kwargs or self.compute_loss_func) + and num_items_in_batch is not None + ): + loss *= self.accelerator.num_processes + + return (loss, outputs) if return_outputs else loss + + def is_local_process_zero(self) -> bool: + """ + Whether or not this process is the local (e.g., on one machine if training in a distributed fashion on several + machines) main process. + """ + return self.args.local_process_index == 0 + + def is_world_process_zero(self) -> bool: + """ + Whether or not this process is the global main process (when training in a distributed fashion on several + machines, this is only going to be `True` for one process). + """ + # Special case for SageMaker ModelParallel since there process_index is dp_process_index, not the global + # process index. + if is_sagemaker_mp_enabled(): + return smp.rank() == 0 + else: + return self.args.process_index == 0 + + def save_model(self, output_dir: Optional[str] = None, _internal_call: bool = False): + """ + Will save the model, so you can reload it using `from_pretrained()`. + + Will only save from the main process. + """ + + if output_dir is None: + output_dir = self.args.output_dir + + if is_torch_xla_available(): + self._save_tpu(output_dir) + elif is_sagemaker_mp_enabled(): + # Calling the state_dict needs to be done on the wrapped model and on all processes. + os.makedirs(output_dir, exist_ok=True) + state_dict = self.model_wrapped.state_dict() + if self.args.should_save: + self._save(output_dir, state_dict=state_dict) + if IS_SAGEMAKER_MP_POST_1_10: + # 'user_content.pt' indicates model state_dict saved with smp >= 1.10 + Path(os.path.join(output_dir, "user_content.pt")).touch() + elif self.is_fsdp_enabled: + if ("FULL_STATE_DICT" in str(self.accelerator.state.fsdp_plugin.state_dict_type)) and ( + version.parse(accelerate_version) > version.parse("0.24.1") + ): + state_dict = self.accelerator.get_state_dict(self.model) + if self.args.should_save: + self._save(output_dir, state_dict=state_dict) + elif self.is_deepspeed_enabled: + try: + state_dict = self.accelerator.get_state_dict(self.deepspeed) + if self.args.should_save: + self._save(output_dir, state_dict=state_dict) + except ValueError: + logger.warning( + " stage3_gather_16bit_weights_on_model_save=false. Saving the full checkpoint instead, use" + " zero_to_fp32.py to recover weights" + ) + if self.args.should_save: + self._save(output_dir, state_dict={}) + # remove the dummy state_dict + remove_dummy_checkpoint(self.args.should_save, output_dir, [WEIGHTS_NAME, SAFE_WEIGHTS_NAME]) + self.model_wrapped.save_checkpoint(output_dir) + + elif self.args.should_save: + self._save(output_dir) + + # Push to the Hub when `save_model` is called by the user. + if self.args.push_to_hub and not _internal_call: + self.push_to_hub(commit_message="Model save") + + def _save_tpu(self, output_dir: Optional[str] = None): + output_dir = output_dir if output_dir is not None else self.args.output_dir + + logger.info(f"Saving model checkpoint to {output_dir}") + model = self.model + xm.mark_step() + + if xm.is_master_ordinal(local=False): + os.makedirs(output_dir, exist_ok=True) + torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME)) + + # Save a trained model and configuration using `save_pretrained()`. + # They can then be reloaded using `from_pretrained()` + supported_classes = (PushToHubMixin,) + xm.rendezvous("saving_checkpoint") + if self.is_fsdp_xla_v1_enabled: + ckpt = { + "model": model.state_dict(), + "shard_metadata": model.get_shard_metadata(), + } + ckpt_path = os.path.join( + output_dir, f"rank{self.args.process_index}-of-{self.args.world_size}-{WEIGHTS_NAME}" + ) + # All ranks save sharded checkpoint + xm.save(ckpt, ckpt_path, master_only=False) + # Make sure all ranks have saved checkpoints + xm.rendezvous("save_full_checkpoints") + # Master save full checkpoint + if self.args.should_save: + from torch_xla.distributed.fsdp import consolidate_sharded_model_checkpoints + + full_state_dict, _ = consolidate_sharded_model_checkpoints( + ckpt_prefix=os.path.join(output_dir, ""), + ckpt_suffix=f"rank*-of-*-{WEIGHTS_NAME}", + save_model=False, + ) + model = model.module.module + unwrapped_model = self.accelerator.unwrap_model(model) + if isinstance(unwrapped_model, supported_classes): + unwrapped_model.save_pretrained( + output_dir, + state_dict=full_state_dict, + save_function=xm.save, + safe_serialization=self.args.save_safetensors, + ) + else: + logger.info("Trainer.model is not a `PreTrainedModel`, only saving its state dict.") + xm.save(full_state_dict, os.path.join(output_dir, WEIGHTS_NAME)) + elif not isinstance(model, supported_classes): + if isinstance(self.accelerator.unwrap_model(model), supported_classes): + self.accelerator.unwrap_model(model).save_pretrained( + output_dir, + is_main_process=self.args.should_save, + state_dict=xm._maybe_convert_to_cpu(model.state_dict()), + save_function=xm.save, + safe_serialization=self.args.save_safetensors, + ) + else: + logger.info("Trainer.model is not a `PreTrainedModel`, only saving its state dict.") + state_dict = xm._maybe_convert_to_cpu(model.state_dict()) + xm.save(state_dict, os.path.join(output_dir, WEIGHTS_NAME)) + else: + model.save_pretrained( + output_dir, + is_main_process=self.args.should_save, + save_function=xm.save, + safe_serialization=self.args.save_safetensors, + state_dict=xm._maybe_convert_to_cpu(model.state_dict()), + ) + if self.processing_class is not None and self.args.should_save: + self.processing_class.save_pretrained(output_dir) + + def _save(self, output_dir: Optional[str] = None, state_dict=None): + # If we are executing this function, we are the process zero, so we don't check for that. + output_dir = output_dir if output_dir is not None else self.args.output_dir + os.makedirs(output_dir, exist_ok=True) + logger.info(f"Saving model checkpoint to {output_dir}") + + supported_classes = (PreTrainedModel,) if not is_peft_available() else (PreTrainedModel, PeftModel) + # Save a trained model and configuration using `save_pretrained()`. + # They can then be reloaded using `from_pretrained()` + if not isinstance(self.model, supported_classes): + if state_dict is None: + state_dict = self.model.state_dict() + + if isinstance(self.accelerator.unwrap_model(self.model), supported_classes): + self.accelerator.unwrap_model(self.model).save_pretrained( + output_dir, state_dict=state_dict, safe_serialization=self.args.save_safetensors + ) + else: + logger.info("Trainer.model is not a `PreTrainedModel`, only saving its state dict.") + if self.args.save_safetensors: + safetensors.torch.save_file( + state_dict, os.path.join(output_dir, SAFE_WEIGHTS_NAME), metadata={"format": "pt"} + ) + else: + torch.save(state_dict, os.path.join(output_dir, WEIGHTS_NAME)) + else: + self.model.save_pretrained( + output_dir, state_dict=state_dict, safe_serialization=self.args.save_safetensors + ) + + if self.processing_class is not None: + self.processing_class.save_pretrained(output_dir) + elif ( + self.data_collator is not None + and hasattr(self.data_collator, "tokenizer") + and self.data_collator.tokenizer is not None + ): + logger.info("Saving Trainer.data_collator.tokenizer by default as Trainer.processing_class is `None`") + self.data_collator.tokenizer.save_pretrained(output_dir) + + # Good practice: save your training arguments together with the trained model + torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME)) + + def store_flos(self): + # Storing the number of floating-point operations that went into the model + if self.args.parallel_mode == ParallelMode.DISTRIBUTED: + self.state.total_flos += ( + distributed_broadcast_scalars([self.current_flos], device=self.args.device).sum().item() + ) + self.current_flos = 0 + else: + self.state.total_flos += self.current_flos + self.current_flos = 0 + + def _sorted_checkpoints( + self, output_dir=None, checkpoint_prefix=PREFIX_CHECKPOINT_DIR, use_mtime=False + ) -> list[str]: + ordering_and_checkpoint_path = [] + + glob_checkpoints = [str(x) for x in Path(output_dir).glob(f"{checkpoint_prefix}-*") if os.path.isdir(x)] + + for path in glob_checkpoints: + if use_mtime: + ordering_and_checkpoint_path.append((os.path.getmtime(path), path)) + else: + regex_match = re.match(f".*{checkpoint_prefix}-([0-9]+)", path) + if regex_match is not None and regex_match.groups() is not None: + ordering_and_checkpoint_path.append((int(regex_match.groups()[0]), path)) + + checkpoints_sorted = sorted(ordering_and_checkpoint_path) + # mtime is not reliable on all filesystems, especially on some fuse fs in cloud environments + # so we check if the mtime is fake and fallback to numerical ordering if needed + if use_mtime and len(ordering_and_checkpoint_path) > 1: + mtime_diff = checkpoints_sorted[-1][0] - checkpoints_sorted[0][0] + if mtime_diff < 1.0: # less than 1 second, which is almost impossible when mtime works fine + warnings.warn("mtime may not be reliable on this filesystem, falling back to numerical ordering") + return self._sorted_checkpoints( + use_mtime=False, output_dir=output_dir, checkpoint_prefix=checkpoint_prefix + ) + checkpoints_sorted = [checkpoint[1] for checkpoint in checkpoints_sorted] + + # Make sure we don't delete the best model. + if ( + self.state.best_model_checkpoint is not None + and str(Path(self.state.best_model_checkpoint)) in checkpoints_sorted + ): + best_model_index = checkpoints_sorted.index(str(Path(self.state.best_model_checkpoint))) + for i in range(best_model_index, len(checkpoints_sorted) - 2): + checkpoints_sorted[i], checkpoints_sorted[i + 1] = checkpoints_sorted[i + 1], checkpoints_sorted[i] + return checkpoints_sorted + + def _rotate_checkpoints(self, use_mtime=False, output_dir=None) -> None: + if self.args.save_total_limit is None or self.args.save_total_limit <= 0: + return + + # Check if we should delete older checkpoint(s) + checkpoints_sorted = self._sorted_checkpoints(use_mtime=use_mtime, output_dir=output_dir) + if len(checkpoints_sorted) <= self.args.save_total_limit: + return + + # If save_total_limit=1 with load_best_model_at_end=True, we could end up deleting the last checkpoint, which + # we don't do to allow resuming. + save_total_limit = self.args.save_total_limit + if ( + self.state.best_model_checkpoint is not None + and self.args.save_total_limit == 1 + and checkpoints_sorted[-1] != self.state.best_model_checkpoint + ): + save_total_limit = 2 + + number_of_checkpoints_to_delete = max(0, len(checkpoints_sorted) - save_total_limit) + checkpoints_to_be_deleted = checkpoints_sorted[:number_of_checkpoints_to_delete] + for checkpoint in checkpoints_to_be_deleted: + logger.info(f"Deleting older checkpoint [{checkpoint}] due to args.save_total_limit") + shutil.rmtree(checkpoint, ignore_errors=True) + + def evaluate( + self, + eval_dataset: Optional[Union[Dataset, dict[str, Dataset]]] = None, + ignore_keys: Optional[list[str]] = None, + metric_key_prefix: str = "eval", + ) -> dict[str, float]: + """ + Run evaluation and returns metrics. + + The calling script will be responsible for providing a method to compute metrics, as they are task-dependent + (pass it to the init `compute_metrics` argument). + + You can also subclass and override this method to inject custom behavior. + + Args: + eval_dataset (Union[`Dataset`, Dict[str, `Dataset`]), *optional*): + Pass a dataset if you wish to override `self.eval_dataset`. If it is a [`~datasets.Dataset`], columns + not accepted by the `model.forward()` method are automatically removed. If it is a dictionary, it will + evaluate on each dataset, prepending the dictionary key to the metric name. Datasets must implement the + `__len__` method. + + + + If you pass a dictionary with names of datasets as keys and datasets as values, evaluate will run + separate evaluations on each dataset. This can be useful to monitor how training affects other + datasets or simply to get a more fine-grained evaluation. + When used with `load_best_model_at_end`, make sure `metric_for_best_model` references exactly one + of the datasets. If you, for example, pass in `{"data1": data1, "data2": data2}` for two datasets + `data1` and `data2`, you could specify `metric_for_best_model="eval_data1_loss"` for using the + loss on `data1` and `metric_for_best_model="eval_data2_loss"` for the loss on `data2`. + + + + ignore_keys (`List[str]`, *optional*): + A list of keys in the output of your model (if it is a dictionary) that should be ignored when + gathering predictions. + metric_key_prefix (`str`, *optional*, defaults to `"eval"`): + An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named + "eval_bleu" if the prefix is "eval" (default) + + Returns: + A dictionary containing the evaluation loss and the potential metrics computed from the predictions. The + dictionary also contains the epoch number which comes from the training state. + """ + # handle multiple eval datasets + override = eval_dataset is not None + eval_dataset = eval_dataset if override else self.eval_dataset + if isinstance(eval_dataset, dict): + metrics = {} + for eval_dataset_name, _eval_dataset in eval_dataset.items(): + dataset_metrics = self.evaluate( + eval_dataset=_eval_dataset if override else eval_dataset_name, + ignore_keys=ignore_keys, + metric_key_prefix=f"{metric_key_prefix}_{eval_dataset_name}", + ) + metrics.update(dataset_metrics) + return metrics + + # memory metrics - must set up as early as possible + self._memory_tracker.start() + + eval_dataloader = self.get_eval_dataloader(eval_dataset) + if self.is_fsdp_xla_v2_enabled: + eval_dataloader = tpu_spmd_dataloader(eval_dataloader) + + start_time = time.time() + + eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop + output = eval_loop( + eval_dataloader, + description="Evaluation", + # No point gathering the predictions if there are no metrics, otherwise we defer to + # self.args.prediction_loss_only + prediction_loss_only=True if self.compute_metrics is None else None, + ignore_keys=ignore_keys, + metric_key_prefix=metric_key_prefix, + ) + + total_batch_size = self.args.eval_batch_size * self.args.world_size + if f"{metric_key_prefix}_jit_compilation_time" in output.metrics: + start_time += output.metrics[f"{metric_key_prefix}_jit_compilation_time"] + if f"{metric_key_prefix}_model_preparation_time" in output.metrics: + start_time += output.metrics[f"{metric_key_prefix}_model_preparation_time"] + output.metrics.update( + speed_metrics( + metric_key_prefix, + start_time, + num_samples=output.num_samples, + num_steps=math.ceil(output.num_samples / total_batch_size), + ) + ) + + self.log(output.metrics) + + if DebugOption.TPU_METRICS_DEBUG in self.args.debug: + # tpu-comment: Logging debug metrics for PyTorch/XLA (compile, execute times, ops, etc.) + xm.master_print(met.metrics_report()) + + self.control = self.callback_handler.on_evaluate(self.args, self.state, self.control, output.metrics) + + self._memory_tracker.stop_and_update_metrics(output.metrics) + + return output.metrics + + def predict( + self, test_dataset: Dataset, ignore_keys: Optional[list[str]] = None, metric_key_prefix: str = "test" + ) -> PredictionOutput: + """ + Run prediction and returns predictions and potential metrics. + + Depending on the dataset and your use case, your test dataset may contain labels. In that case, this method + will also return metrics, like in `evaluate()`. + + Args: + test_dataset (`Dataset`): + Dataset to run the predictions on. If it is an `datasets.Dataset`, columns not accepted by the + `model.forward()` method are automatically removed. Has to implement the method `__len__` + ignore_keys (`List[str]`, *optional*): + A list of keys in the output of your model (if it is a dictionary) that should be ignored when + gathering predictions. + metric_key_prefix (`str`, *optional*, defaults to `"test"`): + An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named + "test_bleu" if the prefix is "test" (default) + + + + If your predictions or labels have different sequence length (for instance because you're doing dynamic padding + in a token classification task) the predictions will be padded (on the right) to allow for concatenation into + one array. The padding index is -100. + + + + Returns: *NamedTuple* A namedtuple with the following keys: + + - predictions (`np.ndarray`): The predictions on `test_dataset`. + - label_ids (`np.ndarray`, *optional*): The labels (if the dataset contained some). + - metrics (`Dict[str, float]`, *optional*): The potential dictionary of metrics (if the dataset contained + labels). + """ + # memory metrics - must set up as early as possible + self._memory_tracker.start() + + test_dataloader = self.get_test_dataloader(test_dataset) + start_time = time.time() + + eval_loop = self.prediction_loop if self.args.use_legacy_prediction_loop else self.evaluation_loop + output = eval_loop( + test_dataloader, description="Prediction", ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix + ) + total_batch_size = self.args.eval_batch_size * self.args.world_size + if f"{metric_key_prefix}_jit_compilation_time" in output.metrics: + start_time += output.metrics[f"{metric_key_prefix}_jit_compilation_time"] + if f"{metric_key_prefix}_model_preparation_time" in output.metrics: + start_time += output.metrics[f"{metric_key_prefix}_model_preparation_time"] + output.metrics.update( + speed_metrics( + metric_key_prefix, + start_time, + num_samples=output.num_samples, + num_steps=math.ceil(output.num_samples / total_batch_size), + ) + ) + + self.control = self.callback_handler.on_predict(self.args, self.state, self.control, output.metrics) + self._memory_tracker.stop_and_update_metrics(output.metrics) + + return PredictionOutput(predictions=output.predictions, label_ids=output.label_ids, metrics=output.metrics) + + def evaluation_loop( + self, + dataloader: DataLoader, + description: str, + prediction_loss_only: Optional[bool] = None, + ignore_keys: Optional[list[str]] = None, + metric_key_prefix: str = "eval", + ) -> EvalLoopOutput: + """ + Prediction/evaluation loop, shared by `Trainer.evaluate()` and `Trainer.predict()`. + + Works both with or without labels. + """ + args = self.args + + prediction_loss_only = prediction_loss_only if prediction_loss_only is not None else args.prediction_loss_only + + # if eval is called w/o train, handle model prep here + if self.is_deepspeed_enabled and self.deepspeed is None: + _, _ = deepspeed_init(self, num_training_steps=0, inference=True) + + model = self._wrap_model(self.model, training=False, dataloader=dataloader) + + if len(self.accelerator._models) == 0 and model is self.model: + start_time = time.time() + model = ( + self.accelerator.prepare(model) + if self.is_deepspeed_enabled or (self.is_fsdp_enabled and self.accelerator.mixed_precision != "fp8") + else self.accelerator.prepare_model(model, evaluation_mode=True) + ) + self.model_preparation_time = round(time.time() - start_time, 4) + + if self.is_fsdp_enabled: + self.model = model + + # for the rest of this function `model` is the outside model, whether it was wrapped or not + if model is not self.model: + self.model_wrapped = model + + # backward compatibility + if self.is_deepspeed_enabled: + self.deepspeed = self.model_wrapped + + # if full fp16 or bf16 eval is wanted and this ``evaluation`` or ``predict`` isn't called + # while ``train`` is running, cast it to the right dtype first and then put on device + if not self.is_in_train: + if args.fp16_full_eval: + model = model.to(dtype=torch.float16, device=args.device) + elif args.bf16_full_eval: + model = model.to(dtype=torch.bfloat16, device=args.device) + + batch_size = self.args.eval_batch_size + + logger.info(f"\n***** Running {description} *****") + if has_length(dataloader): + logger.info(f" Num examples = {self.num_examples(dataloader)}") + else: + logger.info(" Num examples: Unknown") + logger.info(f" Batch size = {batch_size}") + + model.eval() + if hasattr(self.optimizer, "eval") and callable(self.optimizer.eval): + self.optimizer.eval() + + self.callback_handler.eval_dataloader = dataloader + # Do this before wrapping. + eval_dataset = getattr(dataloader, "dataset", None) + + if args.past_index >= 0: + self._past = None + + # Initialize containers + all_losses = EvalLoopContainer(self.args.eval_do_concat_batches, padding_index=-100) + all_preds = EvalLoopContainer(self.args.eval_do_concat_batches, padding_index=-100) + all_labels = EvalLoopContainer(self.args.eval_do_concat_batches, padding_index=-100) + all_inputs = EvalLoopContainer(self.args.eval_do_concat_batches, padding_index=-100) + + metrics = None + eval_set_kwargs = {} + + # Will be useful when we have an iterable dataset so don't know its length. + observed_num_examples = 0 + + # Main evaluation loop + for step, inputs in enumerate(dataloader): + # Update the observed num examples + observed_batch_size = find_batch_size(inputs) + if observed_batch_size is not None: + observed_num_examples += observed_batch_size + # For batch samplers, batch_size is not known by the dataloader in advance. + if batch_size is None: + batch_size = observed_batch_size + + # Prediction step + losses, logits, labels = self.prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys) + main_input_name = getattr(self.model, "main_input_name", "input_ids") + inputs_decode = ( + self._prepare_input(inputs[main_input_name]) if "inputs" in args.include_for_metrics else None + ) + + if is_torch_xla_available(): + xm.mark_step() + + # Update containers + if losses is not None: + losses = self.gather_function(losses.repeat(batch_size)) + all_losses.add(losses) + if inputs_decode is not None: + inputs_decode = self.accelerator.pad_across_processes(inputs_decode, dim=1, pad_index=-100) + inputs_decode = self.gather_function(inputs_decode) + if not self.args.batch_eval_metrics or description == "Prediction": + all_inputs.add(inputs_decode) + if labels is not None: + # Pad labels here, preparing for preprocess_logits_for_metrics in next logits block. + labels = self.accelerator.pad_across_processes(labels, dim=1, pad_index=-100) + if logits is not None: + logits = self.accelerator.pad_across_processes(logits, dim=1, pad_index=-100) + if self.preprocess_logits_for_metrics is not None: + logits = self.preprocess_logits_for_metrics(logits, labels) + logits = self.gather_function(logits) + if not self.args.batch_eval_metrics or description == "Prediction": + all_preds.add(logits) + if labels is not None: + labels = self.gather_function(labels) + if not self.args.batch_eval_metrics or description == "Prediction": + all_labels.add(labels) + + self.control = self.callback_handler.on_prediction_step(args, self.state, self.control) + + if self.args.batch_eval_metrics: + if self.compute_metrics is not None and logits is not None and labels is not None: + is_last_step = self.accelerator.gradient_state.end_of_dataloader + batch_kwargs = {} + batch_kwargs["losses"] = losses if "loss" in args.include_for_metrics else None + batch_kwargs["inputs"] = inputs if "inputs" in args.include_for_metrics else None + metrics = self.compute_metrics( + EvalPrediction(predictions=logits, label_ids=labels, **batch_kwargs), + compute_result=is_last_step, + ) + + del losses, logits, labels, inputs + torch.cuda.empty_cache() + + # Gather all tensors and put them back on the CPU if we have done enough accumulation steps. + elif args.eval_accumulation_steps is not None and (step + 1) % args.eval_accumulation_steps == 0: + all_losses.to_cpu_and_numpy() + all_preds.to_cpu_and_numpy() + all_labels.to_cpu_and_numpy() + all_inputs.to_cpu_and_numpy() + + del losses, logits, labels, inputs + torch.cuda.empty_cache() + + # After all calls to `.gather_function`, reset to `gather_for_metrics`: + self.gather_function = self.accelerator.gather_for_metrics + if args.past_index and hasattr(self, "_past"): + # Clean the state at the end of the evaluation loop + delattr(self, "_past") + + # Gather all remaining tensors and put them back on the CPU + all_losses = all_losses.get_arrays() + all_preds = all_preds.get_arrays() + all_labels = all_labels.get_arrays() + all_inputs = all_inputs.get_arrays() + + # Number of samples + if has_length(eval_dataset): + num_samples = len(eval_dataset) + # The instance check is weird and does not actually check for the type, but whether the dataset has the right + # methods. Therefore we need to make sure it also has the attribute. + elif isinstance(eval_dataset, IterableDatasetShard) and getattr(eval_dataset, "num_examples", 0) > 0: + num_samples = eval_dataset.num_examples + else: + if has_length(dataloader): + num_samples = self.num_examples(dataloader) + else: # both len(dataloader.dataset) and len(dataloader) fail + num_samples = observed_num_examples + if num_samples == 0 and observed_num_examples > 0: + num_samples = observed_num_examples + + # Metrics! + if ( + self.compute_metrics is not None + and all_preds is not None + and all_labels is not None + and not self.args.batch_eval_metrics + ): + eval_set_kwargs["losses"] = all_losses if "loss" in args.include_for_metrics else None + eval_set_kwargs["inputs"] = all_inputs if "inputs" in args.include_for_metrics else None + metrics = self.compute_metrics( + EvalPrediction(predictions=all_preds, label_ids=all_labels, **eval_set_kwargs) + ) + elif metrics is None: + metrics = {} + + # To be JSON-serializable, we need to remove numpy types or zero-d tensors + metrics = denumpify_detensorize(metrics) + + if isinstance(all_losses, list) and all_losses: + metrics[f"{metric_key_prefix}_loss"] = np.concatenate(all_losses).mean().item() + elif isinstance(all_losses, np.ndarray): + metrics[f"{metric_key_prefix}_loss"] = all_losses.mean().item() + if hasattr(self, "jit_compilation_time"): + metrics[f"{metric_key_prefix}_jit_compilation_time"] = self.jit_compilation_time + if hasattr(self, "model_preparation_time"): + metrics[f"{metric_key_prefix}_model_preparation_time"] = self.model_preparation_time + + # Prefix all keys with metric_key_prefix + '_' + for key in list(metrics.keys()): + if not key.startswith(f"{metric_key_prefix}_"): + metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key) + + return EvalLoopOutput(predictions=all_preds, label_ids=all_labels, metrics=metrics, num_samples=num_samples) + + def _nested_gather(self, tensors, name=None): + """ + Gather value of `tensors` (tensor or list/tuple of nested tensors) and convert them to numpy before + concatenating them to `gathered` + """ + if tensors is None: + return + if is_torch_xla_available(): + if name is None: + name = "nested_gather" + tensors = nested_xla_mesh_reduce(tensors, name) + elif is_sagemaker_mp_enabled(): + tensors = smp_gather(tensors) + elif (self.args.distributed_state is not None and self.args.distributed_state.distributed_type != "NO") or ( + self.args.distributed_state is None and self.args.local_rank != -1 + ): + tensors = distributed_concat(tensors) + return tensors + + def prediction_step( + self, + model: nn.Module, + inputs: dict[str, Union[torch.Tensor, Any]], + prediction_loss_only: bool, + ignore_keys: Optional[list[str]] = None, + ) -> tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]: + """ + Perform an evaluation step on `model` using `inputs`. + + Subclass and override to inject custom behavior. + + Args: + model (`nn.Module`): + The model to evaluate. + inputs (`Dict[str, Union[torch.Tensor, Any]]`): + The inputs and targets of the model. + + The dictionary will be unpacked before being fed to the model. Most models expect the targets under the + argument `labels`. Check your model's documentation for all accepted arguments. + prediction_loss_only (`bool`): + Whether or not to return the loss only. + ignore_keys (`List[str]`, *optional*): + A list of keys in the output of your model (if it is a dictionary) that should be ignored when + gathering predictions. + + Return: + Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, + logits and labels (each being optional). + """ + has_labels = False if len(self.label_names) == 0 else all(inputs.get(k) is not None for k in self.label_names) + # For CLIP-like models capable of returning loss values. + # If `return_loss` is not specified or being `None` in `inputs`, we check if the default value of `return_loss` + # is `True` in `model.forward`. + return_loss = inputs.get("return_loss", None) + if return_loss is None: + return_loss = self.can_return_loss + loss_without_labels = True if len(self.label_names) == 0 and return_loss else False + + inputs = self._prepare_inputs(inputs) + if ignore_keys is None: + if hasattr(self.model, "config"): + ignore_keys = getattr(self.model.config, "keys_to_ignore_at_inference", ["past_key_values"]) + else: + ignore_keys = [] + + # labels may be popped when computing the loss (label smoothing for instance) so we grab them first. + if has_labels or loss_without_labels: + labels = nested_detach(tuple(inputs.get(name) for name in self.label_names)) + if len(labels) == 1: + labels = labels[0] + else: + labels = None + + with torch.no_grad(): + if is_sagemaker_mp_enabled(): + raw_outputs = smp_forward_only(model, inputs) + if has_labels or loss_without_labels: + if isinstance(raw_outputs, dict): + loss_mb = raw_outputs["loss"] + logits_mb = tuple(v for k, v in raw_outputs.items() if k not in ignore_keys + ["loss"]) + else: + loss_mb = raw_outputs[0] + logits_mb = raw_outputs[1:] + + loss = loss_mb.reduce_mean().detach().cpu() + logits = smp_nested_concat(logits_mb) + else: + loss = None + if isinstance(raw_outputs, dict): + logits_mb = tuple(v for k, v in raw_outputs.items() if k not in ignore_keys) + else: + logits_mb = raw_outputs + logits = smp_nested_concat(logits_mb) + else: + if has_labels or loss_without_labels: + with self.compute_loss_context_manager(): + loss, outputs = self.compute_loss(model, inputs, return_outputs=True) + loss = loss.detach().mean() + + if isinstance(outputs, dict): + logits = tuple(v for k, v in outputs.items() if k not in ignore_keys + ["loss"]) + else: + logits = outputs[1:] + else: + loss = None + with self.compute_loss_context_manager(): + outputs = model(**inputs) + if isinstance(outputs, dict): + logits = tuple(v for k, v in outputs.items() if k not in ignore_keys) + else: + logits = outputs + # TODO: this needs to be fixed and made cleaner later. + if self.args.past_index >= 0: + self._past = outputs[self.args.past_index - 1] + + if prediction_loss_only: + return (loss, None, None) + + logits = nested_detach(logits) + if len(logits) == 1: + logits = logits[0] + + return (loss, logits, labels) + + def floating_point_ops(self, inputs: dict[str, Union[torch.Tensor, Any]]): + """ + For models that inherit from [`PreTrainedModel`], uses that method to compute the number of floating point + operations for every backward + forward pass. If using another model, either implement such a method in the + model or subclass and override this method. + + Args: + inputs (`Dict[str, Union[torch.Tensor, Any]]`): + The inputs and targets of the model. + + Returns: + `int`: The number of floating-point operations. + """ + if hasattr(self.model, "floating_point_ops"): + return self.model.floating_point_ops(inputs) + else: + return 0 + + def init_hf_repo(self, token: Optional[str] = None): + """ + Initializes a git repo in `self.args.hub_model_id`. + """ + # Only on process zero + if not self.is_world_process_zero(): + return + + if self.args.hub_model_id is None: + repo_name = Path(self.args.output_dir).absolute().name + else: + repo_name = self.args.hub_model_id + + token = token if token is not None else self.args.hub_token + repo_url = create_repo(repo_name, token=token, private=self.args.hub_private_repo, exist_ok=True) + self.hub_model_id = repo_url.repo_id + self.push_in_progress = None + + def create_model_card( + self, + language: Optional[str] = None, + license: Optional[str] = None, + tags: Union[str, list[str], None] = None, + model_name: Optional[str] = None, + finetuned_from: Optional[str] = None, + tasks: Union[str, list[str], None] = None, + dataset_tags: Union[str, list[str], None] = None, + dataset: Union[str, list[str], None] = None, + dataset_args: Union[str, list[str], None] = None, + ): + """ + Creates a draft of a model card using the information available to the `Trainer`. + + Args: + language (`str`, *optional*): + The language of the model (if applicable) + license (`str`, *optional*): + The license of the model. Will default to the license of the pretrained model used, if the original + model given to the `Trainer` comes from a repo on the Hub. + tags (`str` or `List[str]`, *optional*): + Some tags to be included in the metadata of the model card. + model_name (`str`, *optional*): + The name of the model. + finetuned_from (`str`, *optional*): + The name of the model used to fine-tune this one (if applicable). Will default to the name of the repo + of the original model given to the `Trainer` (if it comes from the Hub). + tasks (`str` or `List[str]`, *optional*): + One or several task identifiers, to be included in the metadata of the model card. + dataset_tags (`str` or `List[str]`, *optional*): + One or several dataset tags, to be included in the metadata of the model card. + dataset (`str` or `List[str]`, *optional*): + One or several dataset identifiers, to be included in the metadata of the model card. + dataset_args (`str` or `List[str]`, *optional*): + One or several dataset arguments, to be included in the metadata of the model card. + """ + if not self.is_world_process_zero(): + return + + model_card_filepath = os.path.join(self.args.output_dir, "README.md") + is_peft_library = False + if os.path.exists(model_card_filepath): + library_name = ModelCard.load(model_card_filepath).data.get("library_name") + is_peft_library = library_name == "peft" + + # Append existing tags in `tags` + existing_tags = ModelCard.load(model_card_filepath).data.tags + if tags is not None and existing_tags is not None: + if isinstance(tags, str): + tags = [tags] + for tag in existing_tags: + if tag not in tags: + tags.append(tag) + + training_summary = TrainingSummary.from_trainer( + self, + language=language, + license=license, + tags=tags, + model_name=model_name, + finetuned_from=finetuned_from, + tasks=tasks, + dataset_tags=dataset_tags, + dataset=dataset, + dataset_args=dataset_args, + ) + model_card = training_summary.to_model_card() + with open(model_card_filepath, "w") as f: + f.write(model_card) + + if is_peft_library: + self.accelerator.unwrap_model(self.model).create_or_update_model_card(self.args.output_dir) + + def _push_from_checkpoint(self, checkpoint_folder): + # Only push from one node. + if not self.is_world_process_zero() or self.args.hub_strategy == HubStrategy.END: + return + # If we haven't finished the last push, we don't do this one unless args.hub_always_push=True. + if not self.args.hub_always_push and self.push_in_progress is not None and not self.push_in_progress.is_done(): + return + + output_dir = self.args.output_dir + # To avoid a new synchronization of all model weights, we just copy the file from the checkpoint folder + modeling_files = [CONFIG_NAME, WEIGHTS_NAME, SAFE_WEIGHTS_NAME] + # Add sharded checkpoints if we have an index + for index_file in [WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_INDEX_NAME]: + index_path = os.path.join(checkpoint_folder, index_file) + if os.path.isfile(index_path): + modeling_files.append(index_file) + with open(index_path) as f: + index = json.loads(f.read()) + shard_files = list(set(index["weight_map"].values())) + modeling_files.extend(shard_files) + if is_peft_available(): + modeling_files.extend([ADAPTER_CONFIG_NAME, ADAPTER_WEIGHTS_NAME, ADAPTER_SAFE_WEIGHTS_NAME]) + for modeling_file in modeling_files: + if os.path.isfile(os.path.join(checkpoint_folder, modeling_file)): + shutil.copy(os.path.join(checkpoint_folder, modeling_file), os.path.join(output_dir, modeling_file)) + # Saving the processing class is fast and we don't know how many files it may have spawned, so we resave it to be sure. + if self.processing_class is not None: + self.processing_class.save_pretrained(output_dir) + # Same for the training arguments + torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME)) + + if self.args.save_strategy == SaveStrategy.STEPS: + commit_message = f"Training in progress, step {self.state.global_step}" + else: + commit_message = f"Training in progress, epoch {int(self.state.epoch)}" + + model_push_job = upload_folder( + repo_id=self.hub_model_id, + folder_path=output_dir, + commit_message=commit_message, + token=self.args.hub_token, + run_as_future=True, + ignore_patterns=["_*", f"{PREFIX_CHECKPOINT_DIR}-*"], + ) + + push_jobs = [model_push_job] + + if self.args.hub_strategy in [HubStrategy.CHECKPOINT, HubStrategy.ALL_CHECKPOINTS]: + path_in_repo = ( + "last-checkpoint" if self.args.hub_strategy == HubStrategy.CHECKPOINT else Path(checkpoint_folder).name + ) + checkpoint_push = upload_folder( + repo_id=self.hub_model_id, + folder_path=checkpoint_folder, + path_in_repo=path_in_repo, + commit_message=commit_message + ", checkpoint", + token=self.args.hub_token, + run_as_future=True, + ) + push_jobs.append(checkpoint_push) + + if self.push_in_progress is None or self.push_in_progress.is_done(): + self.push_in_progress = PushInProgress(push_jobs) + else: + self.push_in_progress.jobs.extend(push_jobs) + + def _finish_current_push(self): + if not hasattr(self, "push_in_progress"): + return + if self.push_in_progress is not None and not self.push_in_progress.is_done(): + logger.info("Waiting for the current checkpoint push to be finished, this might take a couple of minutes.") + self.push_in_progress.wait_until_done() + + def push_to_hub( + self, + commit_message: Optional[str] = "End of training", + blocking: bool = True, + token: Optional[str] = None, + revision: Optional[str] = None, + **kwargs, + ) -> str: + """ + Upload `self.model` and `self.processing_class` to the 🤗 model hub on the repo `self.args.hub_model_id`. + + Parameters: + commit_message (`str`, *optional*, defaults to `"End of training"`): + Message to commit while pushing. + blocking (`bool`, *optional*, defaults to `True`): + Whether the function should return only when the `git push` has finished. + token (`str`, *optional*, defaults to `None`): + Token with write permission to overwrite Trainer's original args. + revision (`str`, *optional*): + The git revision to commit from. Defaults to the head of the "main" branch. + kwargs (`Dict[str, Any]`, *optional*): + Additional keyword arguments passed along to [`~Trainer.create_model_card`]. + + Returns: + The URL of the repository where the model was pushed if `blocking=False`, or a `Future` object tracking the + progress of the commit if `blocking=True`. + """ + model_name = kwargs.pop("model_name", None) + if model_name is None and self.args.should_save: + if self.args.hub_model_id is None: + model_name = Path(self.args.output_dir).name + else: + model_name = self.args.hub_model_id.split("/")[-1] + token = token if token is not None else self.args.hub_token + + # In case the user calls this method with args.push_to_hub = False + if self.hub_model_id is None: + self.init_hf_repo(token=token) + + # Needs to be executed on all processes for TPU training, but will only save on the processed determined by + # self.args.should_save. + self.save_model(_internal_call=True) + + # Only push from one node. + if not self.is_world_process_zero(): + return + + # Add additional tags in the case the model has already some tags and users pass + # "tags" argument to `push_to_hub` so that trainer automatically handles internal tags + # from all models since Trainer does not call `model.push_to_hub`. + if getattr(self.model, "model_tags", None) is not None: + if "tags" not in kwargs: + kwargs["tags"] = [] + + # If it is a string, convert it to a list + if isinstance(kwargs["tags"], str): + kwargs["tags"] = [kwargs["tags"]] + + for model_tag in self.model.model_tags: + if model_tag not in kwargs["tags"]: + kwargs["tags"].append(model_tag) + + self.create_model_card(model_name=model_name, **kwargs) + + # Wait for the current upload to be finished. + self._finish_current_push() + return upload_folder( + repo_id=self.hub_model_id, + folder_path=self.args.output_dir, + commit_message=commit_message, + token=token, + run_as_future=not blocking, + ignore_patterns=["_*", f"{PREFIX_CHECKPOINT_DIR}-*"], + revision=revision, + ) + + # + # Deprecated code + # + + def prediction_loop( + self, + dataloader: DataLoader, + description: str, + prediction_loss_only: Optional[bool] = None, + ignore_keys: Optional[list[str]] = None, + metric_key_prefix: str = "eval", + ) -> EvalLoopOutput: + """ + Prediction/evaluation loop, shared by `Trainer.evaluate()` and `Trainer.predict()`. + + Works both with or without labels. + """ + args = self.args + + if not has_length(dataloader): + raise ValueError("dataloader must implement a working __len__") + + prediction_loss_only = prediction_loss_only if prediction_loss_only is not None else args.prediction_loss_only + + # if eval is called w/o train, handle model prep here + if self.is_deepspeed_enabled and self.deepspeed is None: + _, _ = deepspeed_init(self, num_training_steps=0, inference=True) + + model = self._wrap_model(self.model, training=False, dataloader=dataloader) + + if len(self.accelerator._models) == 0 and model is self.model: + model = ( + self.accelerator.prepare(model) + if self.is_deepspeed_enabled or self.is_fsdp_enabled + else self.accelerator.prepare_model(model, evaluation_mode=True) + ) + + if self.is_fsdp_enabled: + self.model = model + + # for the rest of this function `model` is the outside model, whether it was wrapped or not + if model is not self.model: + self.model_wrapped = model + + # backward compatibility + if self.is_deepspeed_enabled: + self.deepspeed = self.model_wrapped + + # if full fp16 or bf16 eval is wanted and this ``evaluation`` or ``predict`` isn't called + # while ``train`` is running, cast it to the right dtype first and then put on device + if not self.is_in_train: + if args.fp16_full_eval: + model = model.to(dtype=torch.float16, device=args.device) + elif args.bf16_full_eval: + model = model.to(dtype=torch.bfloat16, device=args.device) + + batch_size = ( + dataloader.total_batch_size + if getattr(dataloader, "_is_accelerate_prepared", False) + else dataloader.batch_size + ) + + if batch_size is None: + raise ValueError( + "Batch size cannot be None. Ensure the dataloader has a valid batch_size or total_batch_size." + ) + + num_examples = self.num_examples(dataloader) + logger.info(f"\n***** Running {description} *****") + logger.info(f" Num examples = {num_examples}") + logger.info(f" Batch size = {batch_size}") + + losses_host: Optional[torch.Tensor] = None + preds_host: Union[torch.Tensor, list[torch.Tensor], None] = None + labels_host: Union[torch.Tensor, list[torch.Tensor], None] = None + inputs_host: Union[torch.Tensor, list[torch.Tensor], None] = None + metrics: Optional[dict] = None + eval_set_kwargs: dict = {} + + world_size = max(1, args.world_size) + + eval_losses_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=batch_size) + if not prediction_loss_only: + # The actual number of eval_sample can be greater than num_examples in distributed settings (when we pass + # a batch size to the sampler) + make_multiple_of = None + if hasattr(dataloader, "sampler") and isinstance(dataloader.sampler, SequentialDistributedSampler): + make_multiple_of = dataloader.sampler.batch_size + preds_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=make_multiple_of) + labels_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=make_multiple_of) + inputs_gatherer = DistributedTensorGatherer(world_size, num_examples, make_multiple_of=make_multiple_of) + + model.eval() + if hasattr(self.optimizer, "eval") and callable(self.optimizer.eval): + self.optimizer.eval() + + if args.past_index >= 0: + self._past = None + + self.callback_handler.eval_dataloader = dataloader + + for step, inputs in enumerate(dataloader): + loss, logits, labels = self.prediction_step(model, inputs, prediction_loss_only, ignore_keys=ignore_keys) + main_input_name = getattr(self.model, "main_input_name", "input_ids") + inputs_decode = ( + self._prepare_input(inputs[main_input_name]) if "inputs" in args.include_for_metrics else None + ) + + if loss is not None: + losses = loss.repeat(batch_size) + losses_host = losses if losses_host is None else torch.cat((losses_host, losses), dim=0) + if logits is not None: + preds_host = logits if preds_host is None else nested_concat(preds_host, logits, padding_index=-100) + if labels is not None: + labels_host = labels if labels_host is None else nested_concat(labels_host, labels, padding_index=-100) + if inputs_decode is not None: + inputs_host = ( + inputs_decode + if inputs_host is None + else nested_concat(inputs_host, inputs_decode, padding_index=-100) + ) + self.control = self.callback_handler.on_prediction_step(args, self.state, self.control) + + if self.args.batch_eval_metrics: + if self.compute_metrics is not None and preds_host is not None and labels_host is not None: + is_last_step = self.accelerator.gradient_state.end_of_dataloader + batch_kwargs = {} + batch_kwargs["losses"] = losses_host if "loss" in args.include_for_metrics else None + batch_kwargs["inputs"] = inputs_host if "inputs" in args.include_for_metrics else None + metrics = self.compute_metrics( + EvalPrediction(predictions=preds_host, label_ids=labels_host, **batch_kwargs), + compute_result=is_last_step, + ) + + if self.args.batch_eval_metrics or ( + args.eval_accumulation_steps is not None and (step + 1) % args.eval_accumulation_steps == 0 + ): + # Gather all tensors and put them back on the CPU if we have done enough accumulation steps. + eval_losses_gatherer.add_arrays(self._gather_and_numpify(losses_host, "eval_losses")) + if not prediction_loss_only: + preds_gatherer.add_arrays(self._gather_and_numpify(preds_host, "eval_preds")) + labels_gatherer.add_arrays(self._gather_and_numpify(labels_host, "eval_label_ids")) + inputs_gatherer.add_arrays(self._gather_and_numpify(inputs_host, "eval_inputs_ids")) + + # Set back to None to begin a new accumulation + del losses_host, preds_host, labels_host, inputs_host + torch.cuda.empty_cache() + losses_host, preds_host, labels_host, inputs_host = None, None, None, None + + if args.past_index and hasattr(self, "_past"): + # Clean the state at the end of the evaluation loop + delattr(self, "_past") + + # Gather all remaining tensors and put them back on the CPU + eval_losses_gatherer.add_arrays(self._gather_and_numpify(losses_host, "eval_losses")) + if not prediction_loss_only: + preds_gatherer.add_arrays(self._gather_and_numpify(preds_host, "eval_preds")) + labels_gatherer.add_arrays(self._gather_and_numpify(labels_host, "eval_label_ids")) + inputs_gatherer.add_arrays(self._gather_and_numpify(inputs_host, "eval_inputs_ids")) + + eval_loss = eval_losses_gatherer.finalize() + preds = preds_gatherer.finalize() if not prediction_loss_only else None + label_ids = labels_gatherer.finalize() if not prediction_loss_only else None + inputs_ids = inputs_gatherer.finalize() if not prediction_loss_only else None + + if ( + self.compute_metrics is not None + and preds is not None + and label_ids is not None + and not self.args.batch_eval_metrics + ): + eval_set_kwargs["losses"] = eval_loss if "loss" in args.include_for_metrics else None + eval_set_kwargs["inputs"] = inputs_ids if "inputs" in args.include_for_metrics else None + metrics = self.compute_metrics(EvalPrediction(predictions=preds, label_ids=label_ids, **eval_set_kwargs)) + elif metrics is None: + metrics = {} + + # To be JSON-serializable, we need to remove numpy types or zero-d tensors + metrics = denumpify_detensorize(metrics) + + if eval_loss is not None: + metrics[f"{metric_key_prefix}_loss"] = eval_loss.mean().item() + + # Prefix all keys with metric_key_prefix + '_' + for key in list(metrics.keys()): + if not key.startswith(f"{metric_key_prefix}_"): + metrics[f"{metric_key_prefix}_{key}"] = metrics.pop(key) + + return EvalLoopOutput(predictions=preds, label_ids=label_ids, metrics=metrics, num_samples=num_examples) + + def _gather_and_numpify(self, tensors, name): + """ + Gather value of `tensors` (tensor or list/tuple of nested tensors) and convert them to numpy before + concatenating them to `gathered` + """ + if tensors is None: + return + if is_torch_xla_available(): + tensors = nested_xla_mesh_reduce(tensors, name) + elif is_sagemaker_mp_enabled(): + tensors = smp_gather(tensors) + elif self.args.parallel_mode == ParallelMode.DISTRIBUTED: + tensors = distributed_concat(tensors) + + return nested_numpify(tensors) + + def _add_sm_patterns_to_gitignore(self) -> None: + """Add SageMaker Checkpointing patterns to .gitignore file.""" + # Make sure we only do this on the main process + if not self.is_world_process_zero(): + return + + patterns = ["*.sagemaker-uploading", "*.sagemaker-uploaded"] + + # Get current .gitignore content + if os.path.exists(os.path.join(self.repo.local_dir, ".gitignore")): + with open(os.path.join(self.repo.local_dir, ".gitignore")) as f: + current_content = f.read() + else: + current_content = "" + + # Add the patterns to .gitignore + content = current_content + for pattern in patterns: + if pattern not in content: + if content.endswith("\n"): + content += pattern + else: + content += f"\n{pattern}" + + # Write the .gitignore file if it has changed + if content != current_content: + with open(os.path.join(self.repo.local_dir, ".gitignore"), "w") as f: + logger.debug(f"Writing .gitignore file. Content: {content}") + f.write(content) + + self.repo.git_add(".gitignore") + + # avoid race condition with git status + time.sleep(0.5) + + if not self.repo.is_repo_clean(): + self.repo.git_commit("Add *.sagemaker patterns to .gitignore.") + self.repo.git_push() + + def create_accelerator_and_postprocess(self): + # We explicitly don't rely on the `Accelerator` to do gradient accumulation + grad_acc_kwargs = {} + if is_accelerate_available("0.28.0") and self.args.accelerator_config.gradient_accumulation_kwargs is not None: + grad_acc_kwargs = self.args.accelerator_config.gradient_accumulation_kwargs + + # check if num_steps is attempted to be passed in gradient_accumulation_kwargs + if "num_steps" in grad_acc_kwargs: + if self.args.gradient_accumulation_steps > 1: + # raise because we do not know which setting is intended. + raise ValueError( + "The `AcceleratorConfig`'s `num_steps` is set but `gradient_accumulation_steps` is greater than 1 in the passed `TrainingArguments`" + "If using the passed `AcceleratorConfig` is desired, do not set the `TrainingArguments` `gradient_accumulation_steps`." + ) + else: + self.args.gradient_accumulation_steps = grad_acc_kwargs["num_steps"] + + accelerator_config = self.args.accelerator_config.to_dict() + + if is_accelerate_available("0.28.0"): + # Extract dataloader config params from accelerator config + dataloader_params = ["split_batches", "dispatch_batches", "even_batches", "use_seedable_sampler"] + dataloader_config = DataLoaderConfiguration( + **{param: accelerator_config.pop(param) for param in dataloader_params} + ) + if is_accelerate_available("1.1.0"): + dataloader_config.data_seed = self.args.data_seed + + non_blocking = accelerator_config.pop("non_blocking") + if not is_accelerate_available("0.30.0"): + if non_blocking: + raise ImportError( + "`non_blocking` is only supported in accelerate v0.30.0 and above. Please upgrade accelerate to use this feature." + ) + else: + if non_blocking and not self.args.dataloader_pin_memory: + logger.warning( + "`non_blocking` is enabled but `dataloader_pin_memory` is not. For the best performance, it's recommended to enable both." + ) + dataloader_config.non_blocking = non_blocking + # this would have been updated above, no need for it anymore + accelerator_config.pop("gradient_accumulation_kwargs") + + args = { + "deepspeed_plugin": self.args.deepspeed_plugin, + } + if is_accelerate_available("0.28.0"): + args["dataloader_config"] = dataloader_config + else: + args.update(accelerator_config) + # tp is initialized at Accelerator init phase so + # args should be prepared here + if hasattr(self.model, "tp_size") and self.model.tp_size is not None and self.model.tp_size > 1: + self.is_tp_enabled = True + if version.parse(accelerate_version) > version.parse("1.3.0"): + args["torch_tp_plugin"] = TorchTensorParallelPlugin(tp_size=self.model.tp_size) + else: + raise ValueError("Requires accelerate>1.3.0 to use Tensor Parallelism.") + + # create accelerator object + self.accelerator = Accelerator(**args) + # some Trainer classes need to use `gather` instead of `gather_for_metrics`, thus we store a flag + self.gather_function = self.accelerator.gather_for_metrics + + if "use_gather_object" in inspect.signature(self.gather_function).parameters.keys(): + self.gather_function = functools.partial( + self.gather_function, use_gather_object=self.args.eval_use_gather_object + ) + + # deepspeed and accelerate flags covering both trainer args and accelerate launcher + self.is_deepspeed_enabled = getattr(self.accelerator.state, "deepspeed_plugin", None) is not None + self.is_fsdp_enabled = getattr(self.accelerator.state, "fsdp_plugin", None) is not None + self.is_tp_enabled = getattr(self.accelerator.state, "torch_tp_plugin", None) is not None + # post accelerator creation setup + if self.is_fsdp_enabled: + fsdp_plugin = self.accelerator.state.fsdp_plugin + for param in ["limit_all_gathers", "activation_checkpointing"]: + setattr(fsdp_plugin, param, self.args.fsdp_config.get(param, getattr(fsdp_plugin, param))) + if fsdp_plugin.activation_checkpointing and self.args.gradient_checkpointing: + raise ValueError( + "The activation_checkpointing in FSDP config and the gradient_checkpointing in training arg " + "can't be set to True simultaneously. Please use FSDP's activation_checkpointing logic " + "when using FSDP." + ) + + if self.is_deepspeed_enabled and getattr(self.args, "hf_deepspeed_config", None) is None: + self.propagate_args_to_deepspeed() + + # `save_only_model` can't be used with DeepSpeed/FSDP along with `load_best_model_at_end` + if ( + self.args.save_only_model + and (self.is_deepspeed_enabled or self.is_fsdp_enabled) + and self.args.load_best_model_at_end + ): + wrapper = "DeepSpeed" if self.is_deepspeed_enabled else "FSDP" + raise ValueError(f"{wrapper} can't be used with `save_only_model` along with `load_best_model_at_end`.") + + # `auto_find_batch_size` isn't supported yet with DeepSpeed Zero-3 + if ( + self.is_deepspeed_enabled + and self.accelerator.state.deepspeed_plugin.zero_stage == 3 + and self.args.auto_find_batch_size + ): + raise ValueError( + "`auto_find_batch_size` isn't supported yet with DeepSpeed Zero-3. Please consider using Zero-2, Zero-1, or FSDP" + ) + if ( + self.args.save_only_model + and self.is_fsdp_enabled + and "SHARDED_STATE_DICT" in str(self.accelerator.state.fsdp_plugin.state_dict_type) + ): + raise ValueError("save_only_model option is not compatible with FSDP state dict type 'SHARDED_STATE_DICT'") + + def propagate_args_to_deepspeed(self, auto_find_batch_size=False): + """ + Sets values in the deepspeed plugin based on the Trainer args + """ + from transformers.integrations.deepspeed import HfTrainerDeepSpeedConfig + + ds_plugin = self.accelerator.state.deepspeed_plugin + + ds_plugin.hf_ds_config = HfTrainerDeepSpeedConfig(ds_plugin.hf_ds_config.config) + ds_plugin.deepspeed_config = ds_plugin.hf_ds_config.config + ds_plugin.hf_ds_config.trainer_config_process(self.args, auto_find_batch_size) + + def _fsdp_qlora_plugin_updates(self): + if self.is_fsdp_enabled and _is_peft_model(self.model): + from peft import PeftConfig + from peft.utils.other import fsdp_auto_wrap_policy + + if isinstance(self.model.active_peft_config, PeftConfig): + self.accelerator.state.fsdp_plugin.auto_wrap_policy = fsdp_auto_wrap_policy(self.model) + if ( + getattr(self.model, "quantization_method", None) == QuantizationMethod.BITS_AND_BYTES + and self.model.hf_quantizer.quantization_config.bnb_4bit_quant_storage.is_floating_point + and version.parse(accelerate_version) > version.parse("0.27.0") + ): + self.accelerator.state.fsdp_plugin.set_mixed_precision( + self.model.hf_quantizer.quantization_config.bnb_4bit_quant_storage, override=True + ) + + def get_batch_samples(self, epoch_iterator, num_batches, device): + batch_samples = [] + num_items_in_batch = None + + for _ in range(num_batches): + try: + batch_samples.append(next(epoch_iterator)) + except StopIteration: + break + + count_num_items_in_batch = ( + len(batch_samples) > 0 + and "labels" in batch_samples[0] + and ( + # num_items_in_batch is passed to model forward + # https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/trainer.py#L3757 + self.model_accepts_loss_kwargs + # num_items_in_batch is passed to compute_loss_func + # https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/trainer.py#L3773 + or self.compute_loss_func is not None + # num_items_in_batch is also verified if (self.model_accepts_loss_kwargs or self.compute_loss_func) + # https://github.com/huggingface/transformers/blob/v4.49.0/src/transformers/trainer.py#L3790 + ) + ) + + if count_num_items_in_batch: + # For now we don't support object detection + try: + num_items_in_batch = sum([(batch["labels"].ne(-100)).sum() for batch in batch_samples]) + except (TypeError, AttributeError): + pass + + if num_items_in_batch is not None: + if self.args.average_tokens_across_devices: + num_items_in_batch = self.accelerator.gather(num_items_in_batch).sum() + + if torch.is_tensor(num_items_in_batch): + num_items_in_batch = num_items_in_batch.to(device) + + if self.args.n_gpu > 1 and num_items_in_batch.dim() == 0: + # In the DataParallel case, convert the scalar tensor into a 1-dim tensor + num_items_in_batch = num_items_in_batch.unsqueeze(0) + + return batch_samples, num_items_in_batch + + def set_initial_training_values( + self, args: TrainingArguments, dataloader: DataLoader, total_train_batch_size: int + ): + """ + Calculates and returns the following values: + - `num_train_epochs` + - `num_update_steps_per_epoch` + - `num_examples` + - `num_train_samples` + - `epoch_based` + - `len_dataloader` + - `max_steps` + """ + # Case 1: we rely on `args.max_steps` first + max_steps = args.max_steps + # If max_steps is negative, we use the number of epochs to determine the number of total steps later + epoch_based = max_steps < 0 + len_dataloader = len(dataloader) if has_length(dataloader) else None + + # Case 2: We have a dataloader length and can extrapolate + if len_dataloader is not None: + num_update_steps_per_epoch = max(len_dataloader // args.gradient_accumulation_steps, 1) + # Case 3: We have a length but are using epochs, we can extrapolate the number of steps + if epoch_based: + max_steps = math.ceil(args.num_train_epochs * num_update_steps_per_epoch) + + # Now we figure out `num_examples`, `num_train_epochs`, and `train_samples` + if len_dataloader: + num_examples = self.num_examples(dataloader) + if args.max_steps > 0: + num_train_epochs = max_steps // num_update_steps_per_epoch + int( + max_steps % num_update_steps_per_epoch > 0 + ) + # May be slightly incorrect if the last batch in the training dataloader has a smaller size but it's + # the best we can do. + num_train_samples = max_steps * total_train_batch_size + else: + num_train_epochs = math.ceil(args.num_train_epochs) + num_train_samples = self.num_examples(dataloader) * args.num_train_epochs + elif args.max_steps > 0: # Rely on max_steps when dataloader does not have a working size + # Setting a very large number of epochs so we go as many times as necessary over the iterator. + num_train_epochs = sys.maxsize + num_update_steps_per_epoch = max_steps + num_examples = total_train_batch_size * args.max_steps + num_train_samples = args.max_steps * total_train_batch_size + else: + raise ValueError( + "args.max_steps must be set to a positive value if dataloader does not have a length, was" + f" {args.max_steps}" + ) + return ( + num_train_epochs, + num_update_steps_per_epoch, + num_examples, + num_train_samples, + epoch_based, + len_dataloader, + max_steps, + ) diff --git a/docs/transformers/build/lib/transformers/trainer_callback.py b/docs/transformers/build/lib/transformers/trainer_callback.py new file mode 100644 index 0000000000000000000000000000000000000000..e4c465dfe558ff6a31accd5fdec2198ce32cd4a8 --- /dev/null +++ b/docs/transformers/build/lib/transformers/trainer_callback.py @@ -0,0 +1,785 @@ +# Copyright 2020-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Callbacks to use with the Trainer class and customize the training loop. +""" + +import dataclasses +import json +import math +from dataclasses import dataclass +from typing import Optional, Union + +import numpy as np +from tqdm.auto import tqdm + +from .trainer_utils import HPSearchBackend, IntervalStrategy, SaveStrategy, has_length +from .training_args import TrainingArguments +from .utils import logging + + +logger = logging.get_logger(__name__) + + +@dataclass +class TrainerState: + """ + A class containing the [`Trainer`] inner state that will be saved along the model and optimizer when checkpointing + and passed to the [`TrainerCallback`]. + + + + In all this class, one step is to be understood as one update step. When using gradient accumulation, one update + step may require several forward and backward passes: if you use `gradient_accumulation_steps=n`, then one update + step requires going through *n* batches. + + + + Args: + epoch (`float`, *optional*): + Only set during training, will represent the epoch the training is at (the decimal part being the + percentage of the current epoch completed). + global_step (`int`, *optional*, defaults to 0): + During training, represents the number of update steps completed. + max_steps (`int`, *optional*, defaults to 0): + The number of update steps to do during the current training. + logging_steps (`int`, *optional*, defaults to 500): + Log every X updates steps + eval_steps (`int`, *optional*): + Run an evaluation every X steps. + save_steps (`int`, *optional*, defaults to 500): + Save checkpoint every X updates steps. + train_batch_size (`int`, *optional*): + The batch size for the training dataloader. Only needed when + `auto_find_batch_size` has been used. + num_input_tokens_seen (`int`, *optional*, defaults to 0): + When tracking the inputs tokens, the number of tokens seen during training (number of input tokens, not the + number of prediction tokens). + total_flos (`float`, *optional*, defaults to 0): + The total number of floating operations done by the model since the beginning of training (stored as floats + to avoid overflow). + log_history (`List[Dict[str, float]]`, *optional*): + The list of logs done since the beginning of training. + best_metric (`float`, *optional*): + When tracking the best model, the value of the best metric encountered so far. + best_global_step (`int`, *optional*): + When tracking the best model, the step at which the best metric was encountered. + Used for setting `best_model_checkpoint`. + best_model_checkpoint (`str`, *optional*): + When tracking the best model, the value of the name of the checkpoint for the best model encountered so + far. + is_local_process_zero (`bool`, *optional*, defaults to `True`): + Whether or not this process is the local (e.g., on one machine if training in a distributed fashion on + several machines) main process. + is_world_process_zero (`bool`, *optional*, defaults to `True`): + Whether or not this process is the global main process (when training in a distributed fashion on several + machines, this is only going to be `True` for one process). + is_hyper_param_search (`bool`, *optional*, defaults to `False`): + Whether we are in the process of a hyper parameter search using Trainer.hyperparameter_search. This will + impact the way data will be logged in TensorBoard. + stateful_callbacks (`List[StatefulTrainerCallback]`, *optional*): + Callbacks attached to the `Trainer` that should have their states be saved or restored. + Relevant callbacks should implement a `state` and `from_state` function. + """ + + epoch: Optional[float] = None + global_step: int = 0 + max_steps: int = 0 + logging_steps: int = 500 + eval_steps: int = 500 + save_steps: int = 500 + train_batch_size: Optional[int] = None + num_train_epochs: int = 0 + num_input_tokens_seen: int = 0 + total_flos: float = 0 + log_history: list[dict[str, float]] = None + best_metric: Optional[float] = None + best_global_step: Optional[int] = None + best_model_checkpoint: Optional[str] = None + is_local_process_zero: bool = True + is_world_process_zero: bool = True + is_hyper_param_search: bool = False + trial_name: Optional[str] = None + trial_params: dict[str, Union[str, float, int, bool]] = None + stateful_callbacks: list["TrainerCallback"] = None + + def __post_init__(self): + if self.log_history is None: + self.log_history = [] + if self.stateful_callbacks is None: + self.stateful_callbacks = {} + elif isinstance(self.stateful_callbacks, dict): + # We are loading the callbacks in from the state file, no need to process them + pass + else: + # Saveable callbacks get stored as dict of kwargs + stateful_callbacks = {} + for callback in self.stateful_callbacks: + if not isinstance(callback, (ExportableState)): + raise TypeError( + f"All callbacks passed to be saved must inherit `ExportableState`, but received {type(callback)}" + ) + name = callback.__class__.__name__ + if name in stateful_callbacks: + # We can have multiple versions of the same callback + # if so, we store them as a list of states to restore + if not isinstance(stateful_callbacks[name], list): + stateful_callbacks[name] = [stateful_callbacks[name]] + stateful_callbacks[name].append(callback.state()) + else: + stateful_callbacks[name] = callback.state() + self.stateful_callbacks = stateful_callbacks + + def save_to_json(self, json_path: str): + """Save the content of this instance in JSON format inside `json_path`.""" + json_string = json.dumps(dataclasses.asdict(self), indent=2, sort_keys=True) + "\n" + with open(json_path, "w", encoding="utf-8") as f: + f.write(json_string) + + @classmethod + def load_from_json(cls, json_path: str): + """Create an instance from the content of `json_path`.""" + with open(json_path, encoding="utf-8") as f: + text = f.read() + return cls(**json.loads(text)) + + def compute_steps(self, args, max_steps): + """ + Calculates and stores the absolute value for logging, + eval, and save steps based on if it was a proportion + or not. + """ + for step_kind in ("logging", "eval", "save"): + num_steps = getattr(args, f"{step_kind}_steps") + if num_steps is not None: + if num_steps < 1: + num_steps = math.ceil(max_steps * num_steps) + setattr(self, f"{step_kind}_steps", num_steps) + + def init_training_references(self, trainer, max_steps, num_train_epochs, trial): + """ + Stores the initial training references needed in `self` + """ + if trainer.hp_name is not None and trainer._trial is not None: + # use self._trial because the SigOpt/Optuna hpo only call `_hp_search_setup(trial)` instead of passing trial + # parameter to Train when using DDP. + self.trial_name = trainer.hp_name(trainer._trial) + self.trial_params = None + if trial is not None: + from transformers.integrations import hp_params + + assignments = trial.assignments if trainer.hp_search_backend == HPSearchBackend.SIGOPT else trial + self.trial_params = hp_params(assignments) + + self.max_steps = max_steps + self.num_train_epochs = num_train_epochs + self.is_local_process_zero = trainer.is_local_process_zero() + self.is_world_process_zero = trainer.is_world_process_zero() + + +class ExportableState: + """ + A class for objects that include the ability to have its state + be saved during `Trainer._save_checkpoint` and loaded back in during + `Trainer._load_from_checkpoint`. + + These must implement a `state` function that gets called during the respective + Trainer function call. It should only include parameters and attributes needed to + recreate the state at a particular time, to avoid utilizing pickle/maintain standard + file IO writing. + + Example: + + ```python + class EarlyStoppingCallback(TrainerCallback, ExportableState): + def __init__(self, early_stopping_patience: int = 1, early_stopping_threshold: Optional[float] = 0.0): + self.early_stopping_patience = early_stopping_patience + self.early_stopping_threshold = early_stopping_threshold + # early_stopping_patience_counter denotes the number of times validation metrics failed to improve. + self.early_stopping_patience_counter = 0 + + def state(self) -> dict: + return { + "args": { + "early_stopping_patience": self.early_stopping_patience, + "early_stopping_threshold": self.early_stopping_threshold, + }, + "attributes": { + "early_stopping_patience_counter": self.early_stopping_patience_counter, + } + } + ```""" + + def state(self) -> dict: + raise NotImplementedError("You must implement a `state` function to utilize this class.") + + @classmethod + def from_state(cls, state): + instance = cls(**state["args"]) + for k, v in state["attributes"].items(): + setattr(instance, k, v) + return instance + + +@dataclass +class TrainerControl(ExportableState): + """ + A class that handles the [`Trainer`] control flow. This class is used by the [`TrainerCallback`] to activate some + switches in the training loop. + + Args: + should_training_stop (`bool`, *optional*, defaults to `False`): + Whether or not the training should be interrupted. + + If `True`, this variable will not be set back to `False`. The training will just stop. + should_epoch_stop (`bool`, *optional*, defaults to `False`): + Whether or not the current epoch should be interrupted. + + If `True`, this variable will be set back to `False` at the beginning of the next epoch. + should_save (`bool`, *optional*, defaults to `False`): + Whether or not the model should be saved at this step. + + If `True`, this variable will be set back to `False` at the beginning of the next step. + should_evaluate (`bool`, *optional*, defaults to `False`): + Whether or not the model should be evaluated at this step. + + If `True`, this variable will be set back to `False` at the beginning of the next step. + should_log (`bool`, *optional*, defaults to `False`): + Whether or not the logs should be reported at this step. + + If `True`, this variable will be set back to `False` at the beginning of the next step. + """ + + should_training_stop: bool = False + should_epoch_stop: bool = False + should_save: bool = False + should_evaluate: bool = False + should_log: bool = False + + def _new_training(self): + """Internal method that resets the variable for a new training.""" + self.should_training_stop = False + + def _new_epoch(self): + """Internal method that resets the variable for a new epoch.""" + self.should_epoch_stop = False + + def _new_step(self): + """Internal method that resets the variable for a new step.""" + self.should_save = False + self.should_evaluate = False + self.should_log = False + + def state(self) -> dict: + return { + "args": { + "should_training_stop": self.should_training_stop, + "should_epoch_stop": self.should_epoch_stop, + "should_save": self.should_save, + "should_evaluate": self.should_evaluate, + "should_log": self.should_log, + }, + "attributes": {}, + } + + +class TrainerCallback: + # no-format + """ + A class for objects that will inspect the state of the training loop at some events and take some decisions. At + each of those events the following arguments are available: + + Args: + args ([`TrainingArguments`]): + The training arguments used to instantiate the [`Trainer`]. + state ([`TrainerState`]): + The current state of the [`Trainer`]. + control ([`TrainerControl`]): + The object that is returned to the [`Trainer`] and can be used to make some decisions. + model ([`PreTrainedModel`] or `torch.nn.Module`): + The model being trained. + tokenizer ([`PreTrainedTokenizer`]): + The tokenizer used for encoding the data. This is deprecated in favour of `processing_class`. + processing_class ([`PreTrainedTokenizer` or `BaseImageProcessor` or `ProcessorMixin` or `FeatureExtractionMixin`]): + The processing class used for encoding the data. Can be a tokenizer, a processor, an image processor or a feature extractor. + optimizer (`torch.optim.Optimizer`): + The optimizer used for the training steps. + lr_scheduler (`torch.optim.lr_scheduler.LambdaLR`): + The scheduler used for setting the learning rate. + train_dataloader (`torch.utils.data.DataLoader`, *optional*): + The current dataloader used for training. + eval_dataloader (`torch.utils.data.DataLoader`, *optional*): + The current dataloader used for evaluation. + metrics (`Dict[str, float]`): + The metrics computed by the last evaluation phase. + + Those are only accessible in the event `on_evaluate`. + logs (`Dict[str, float]`): + The values to log. + + Those are only accessible in the event `on_log`. + + The `control` object is the only one that can be changed by the callback, in which case the event that changes it + should return the modified version. + + The argument `args`, `state` and `control` are positionals for all events, all the others are grouped in `kwargs`. + You can unpack the ones you need in the signature of the event using them. As an example, see the code of the + simple [`~transformers.PrinterCallback`]. + + Example: + + ```python + class PrinterCallback(TrainerCallback): + def on_log(self, args, state, control, logs=None, **kwargs): + _ = logs.pop("total_flos", None) + if state.is_local_process_zero: + print(logs) + ```""" + + def on_init_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): + """ + Event called at the end of the initialization of the [`Trainer`]. + """ + pass + + def on_train_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): + """ + Event called at the beginning of training. + """ + pass + + def on_train_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): + """ + Event called at the end of training. + """ + pass + + def on_epoch_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): + """ + Event called at the beginning of an epoch. + """ + pass + + def on_epoch_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): + """ + Event called at the end of an epoch. + """ + pass + + def on_step_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): + """ + Event called at the beginning of a training step. If using gradient accumulation, one training step might take + several inputs. + """ + pass + + def on_pre_optimizer_step(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): + """ + Event called before the optimizer step but after gradient clipping. Useful for monitoring gradients. + """ + pass + + def on_optimizer_step(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): + """ + Event called after the optimizer step but before gradients are zeroed out. Useful for monitoring gradients. + """ + pass + + def on_substep_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): + """ + Event called at the end of an substep during gradient accumulation. + """ + pass + + def on_step_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): + """ + Event called at the end of a training step. If using gradient accumulation, one training step might take + several inputs. + """ + pass + + def on_evaluate(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): + """ + Event called after an evaluation phase. + """ + pass + + def on_predict(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, metrics, **kwargs): + """ + Event called after a successful prediction. + """ + pass + + def on_save(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): + """ + Event called after a checkpoint save. + """ + pass + + def on_log(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): + """ + Event called after logging the last logs. + """ + pass + + def on_prediction_step(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): + """ + Event called after a prediction step. + """ + pass + + +class CallbackHandler(TrainerCallback): + """Internal class that just calls the list of callbacks in order.""" + + def __init__(self, callbacks, model, processing_class, optimizer, lr_scheduler): + self.callbacks = [] + for cb in callbacks: + self.add_callback(cb) + self.model = model + self.processing_class = processing_class + self.optimizer = optimizer + self.lr_scheduler = lr_scheduler + self.train_dataloader = None + self.eval_dataloader = None + + if not any(isinstance(cb, DefaultFlowCallback) for cb in self.callbacks): + logger.warning( + "The Trainer will not work properly if you don't have a `DefaultFlowCallback` in its callbacks. You\n" + + "should add one before training with `trainer.add_callback(DefaultFlowCallback). The current list of" + + "callbacks is\n:" + + self.callback_list + ) + + def add_callback(self, callback): + cb = callback() if isinstance(callback, type) else callback + cb_class = callback if isinstance(callback, type) else callback.__class__ + if cb_class in [c.__class__ for c in self.callbacks]: + logger.warning( + f"You are adding a {cb_class} to the callbacks of this Trainer, but there is already one. The current" + + "list of callbacks is\n:" + + self.callback_list + ) + self.callbacks.append(cb) + + def pop_callback(self, callback): + if isinstance(callback, type): + for cb in self.callbacks: + if isinstance(cb, callback): + self.callbacks.remove(cb) + return cb + else: + for cb in self.callbacks: + if cb == callback: + self.callbacks.remove(cb) + return cb + + def remove_callback(self, callback): + if isinstance(callback, type): + for cb in self.callbacks: + if isinstance(cb, callback): + self.callbacks.remove(cb) + return + else: + self.callbacks.remove(callback) + + @property + def callback_list(self): + return "\n".join(cb.__class__.__name__ for cb in self.callbacks) + + def on_init_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl): + return self.call_event("on_init_end", args, state, control) + + def on_train_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl): + control.should_training_stop = False + return self.call_event("on_train_begin", args, state, control) + + def on_train_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl): + return self.call_event("on_train_end", args, state, control) + + def on_epoch_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl): + control.should_epoch_stop = False + return self.call_event("on_epoch_begin", args, state, control) + + def on_epoch_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl): + return self.call_event("on_epoch_end", args, state, control) + + def on_step_begin(self, args: TrainingArguments, state: TrainerState, control: TrainerControl): + control.should_log = False + control.should_evaluate = False + control.should_save = False + return self.call_event("on_step_begin", args, state, control) + + def on_pre_optimizer_step(self, args: TrainingArguments, state: TrainerState, control: TrainerControl): + return self.call_event("on_pre_optimizer_step", args, state, control) + + def on_optimizer_step(self, args: TrainingArguments, state: TrainerState, control: TrainerControl): + return self.call_event("on_optimizer_step", args, state, control) + + def on_substep_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl): + return self.call_event("on_substep_end", args, state, control) + + def on_step_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl): + return self.call_event("on_step_end", args, state, control) + + def on_evaluate(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, metrics): + control.should_evaluate = False + return self.call_event("on_evaluate", args, state, control, metrics=metrics) + + def on_predict(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, metrics): + return self.call_event("on_predict", args, state, control, metrics=metrics) + + def on_save(self, args: TrainingArguments, state: TrainerState, control: TrainerControl): + control.should_save = False + return self.call_event("on_save", args, state, control) + + def on_log(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, logs): + control.should_log = False + return self.call_event("on_log", args, state, control, logs=logs) + + def on_prediction_step(self, args: TrainingArguments, state: TrainerState, control: TrainerControl): + return self.call_event("on_prediction_step", args, state, control) + + def call_event(self, event, args, state, control, **kwargs): + for callback in self.callbacks: + result = getattr(callback, event)( + args, + state, + control, + model=self.model, + processing_class=self.processing_class, + optimizer=self.optimizer, + lr_scheduler=self.lr_scheduler, + train_dataloader=self.train_dataloader, + eval_dataloader=self.eval_dataloader, + **kwargs, + ) + # A Callback can skip the return of `control` if it doesn't change it. + if result is not None: + control = result + return control + + +class DefaultFlowCallback(TrainerCallback): + """ + A [`TrainerCallback`] that handles the default flow of the training loop for logs, evaluation and checkpoints. + """ + + def on_step_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): + # Log + if state.global_step == 1 and args.logging_first_step: + control.should_log = True + if args.logging_strategy == IntervalStrategy.STEPS and state.global_step % state.logging_steps == 0: + control.should_log = True + + # Evaluate + if ( + args.eval_strategy == IntervalStrategy.STEPS + and state.global_step % state.eval_steps == 0 + and args.eval_delay <= state.global_step + ): + control.should_evaluate = True + + # Save + if ( + args.save_strategy == SaveStrategy.STEPS + and state.save_steps > 0 + and state.global_step % state.save_steps == 0 + ): + control.should_save = True + + # End training + if state.global_step >= state.max_steps: + control.should_training_stop = True + # Save the model at the end if we have a save strategy + if args.save_strategy == SaveStrategy.STEPS: + control.should_save = True + + return control + + def on_epoch_end(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs): + # Log + if args.logging_strategy == IntervalStrategy.EPOCH: + control.should_log = True + + # Evaluate + if args.eval_strategy == IntervalStrategy.EPOCH and args.eval_delay <= state.epoch: + control.should_evaluate = True + + # Save + if args.save_strategy == SaveStrategy.EPOCH: + control.should_save = True + + return control + + +class ProgressCallback(TrainerCallback): + """ + A [`TrainerCallback`] that displays the progress of training or evaluation. + You can modify `max_str_len` to control how long strings are truncated when logging. + """ + + def __init__(self, max_str_len: int = 100): + """ + Initialize the callback with optional max_str_len parameter to control string truncation length. + + Args: + max_str_len (`int`): + Maximum length of strings to display in logs. + Longer strings will be truncated with a message. + """ + self.training_bar = None + self.prediction_bar = None + self.max_str_len = max_str_len + + def on_train_begin(self, args, state, control, **kwargs): + if state.is_world_process_zero: + self.training_bar = tqdm(total=state.max_steps, dynamic_ncols=True) + self.current_step = 0 + + def on_step_end(self, args, state, control, **kwargs): + if state.is_world_process_zero: + self.training_bar.update(state.global_step - self.current_step) + self.current_step = state.global_step + + def on_prediction_step(self, args, state, control, eval_dataloader=None, **kwargs): + if state.is_world_process_zero and has_length(eval_dataloader): + if self.prediction_bar is None: + self.prediction_bar = tqdm( + total=len(eval_dataloader), leave=self.training_bar is None, dynamic_ncols=True + ) + self.prediction_bar.update(1) + + def on_evaluate(self, args, state, control, **kwargs): + if state.is_world_process_zero: + if self.prediction_bar is not None: + self.prediction_bar.close() + self.prediction_bar = None + + def on_predict(self, args, state, control, **kwargs): + if state.is_world_process_zero: + if self.prediction_bar is not None: + self.prediction_bar.close() + self.prediction_bar = None + + def on_log(self, args, state, control, logs=None, **kwargs): + if state.is_world_process_zero and self.training_bar is not None: + # make a shallow copy of logs so we can mutate the fields copied + # but avoid doing any value pickling. + shallow_logs = {} + for k, v in logs.items(): + if isinstance(v, str) and len(v) > self.max_str_len: + shallow_logs[k] = ( + f"[String too long to display, length: {len(v)} > {self.max_str_len}. " + "Consider increasing `max_str_len` if needed.]" + ) + else: + shallow_logs[k] = v + _ = shallow_logs.pop("total_flos", None) + # round numbers so that it looks better in console + if "epoch" in shallow_logs: + shallow_logs["epoch"] = round(shallow_logs["epoch"], 2) + self.training_bar.write(str(shallow_logs)) + + def on_train_end(self, args, state, control, **kwargs): + if state.is_world_process_zero: + self.training_bar.close() + self.training_bar = None + + +class PrinterCallback(TrainerCallback): + """ + A bare [`TrainerCallback`] that just prints the logs. + """ + + def on_log(self, args, state, control, logs=None, **kwargs): + _ = logs.pop("total_flos", None) + if state.is_local_process_zero: + print(logs) + + +class EarlyStoppingCallback(TrainerCallback, ExportableState): + """ + A [`TrainerCallback`] that handles early stopping. + + Args: + early_stopping_patience (`int`): + Use with `metric_for_best_model` to stop training when the specified metric worsens for + `early_stopping_patience` evaluation calls. + early_stopping_threshold(`float`, *optional*): + Use with TrainingArguments `metric_for_best_model` and `early_stopping_patience` to denote how much the + specified metric must improve to satisfy early stopping conditions. ` + + This callback depends on [`TrainingArguments`] argument *load_best_model_at_end* functionality to set best_metric + in [`TrainerState`]. Note that if the [`TrainingArguments`] argument *save_steps* differs from *eval_steps*, the + early stopping will not occur until the next save step. + """ + + def __init__(self, early_stopping_patience: int = 1, early_stopping_threshold: Optional[float] = 0.0): + self.early_stopping_patience = early_stopping_patience + self.early_stopping_threshold = early_stopping_threshold + # early_stopping_patience_counter denotes the number of times validation metrics failed to improve. + self.early_stopping_patience_counter = 0 + + def check_metric_value(self, args, state, control, metric_value): + # best_metric is set by code for load_best_model + operator = np.greater if args.greater_is_better else np.less + if state.best_metric is None or ( + operator(metric_value, state.best_metric) + and abs(metric_value - state.best_metric) > self.early_stopping_threshold + ): + self.early_stopping_patience_counter = 0 + else: + self.early_stopping_patience_counter += 1 + + def on_train_begin(self, args, state, control, **kwargs): + if not args.load_best_model_at_end: + logger.warning( + "Using EarlyStoppingCallback without load_best_model_at_end=True. " + "Once training is finished, the best model will not be loaded automatically." + ) + assert args.metric_for_best_model is not None, ( + "EarlyStoppingCallback requires metric_for_best_model to be defined" + ) + assert args.eval_strategy != IntervalStrategy.NO, ( + "EarlyStoppingCallback requires IntervalStrategy of steps or epoch" + ) + + def on_evaluate(self, args, state, control, metrics, **kwargs): + metric_to_check = args.metric_for_best_model + if not metric_to_check.startswith("eval_"): + metric_to_check = f"eval_{metric_to_check}" + metric_value = metrics.get(metric_to_check) + + if metric_value is None: + logger.warning( + f"early stopping required metric_for_best_model, but did not find {metric_to_check} so early stopping" + " is disabled" + ) + return + + self.check_metric_value(args, state, control, metric_value) + if self.early_stopping_patience_counter >= self.early_stopping_patience: + control.should_training_stop = True + + def state(self) -> dict: + return { + "args": { + "early_stopping_patience": self.early_stopping_patience, + "early_stopping_threshold": self.early_stopping_threshold, + }, + "attributes": { + "early_stopping_patience_counter": self.early_stopping_patience_counter, + }, + } diff --git a/docs/transformers/build/lib/transformers/trainer_pt_utils.py b/docs/transformers/build/lib/transformers/trainer_pt_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..30474daea6e435b6567b99d745f12cc8053f6e83 --- /dev/null +++ b/docs/transformers/build/lib/transformers/trainer_pt_utils.py @@ -0,0 +1,1411 @@ +# Copyright 2020-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Torch utilities for the Trainer class. +""" + +import copy +import datetime +import io +import json +import math +import os +import sys +import warnings +from collections.abc import Iterator, Mapping +from contextlib import contextmanager +from dataclasses import dataclass, field +from itertools import chain +from logging import StreamHandler +from typing import Any, Optional, Union + +import numpy as np +import torch +import torch.distributed as dist +from torch import nn +from torch.utils.data import Dataset, IterableDataset, RandomSampler, Sampler +from torch.utils.data.distributed import DistributedSampler + +from .integrations.deepspeed import is_deepspeed_zero3_enabled +from .tokenization_utils_base import BatchEncoding +from .utils import ( + is_sagemaker_mp_enabled, + is_torch_available, + is_torch_xla_available, + is_training_run_on_sagemaker, + logging, +) + + +if is_training_run_on_sagemaker(): + logging.add_handler(StreamHandler(sys.stdout)) + +if is_torch_xla_available(): + import torch_xla.core.xla_model as xm + +if is_torch_available(): + from torch.optim.lr_scheduler import LRScheduler + + +logger = logging.get_logger(__name__) + + +def get_dataloader_sampler(dataloader): + if hasattr(dataloader, "batch_sampler") and dataloader.batch_sampler is not None: + return get_dataloader_sampler(dataloader.batch_sampler) + elif hasattr(dataloader, "sampler"): + return dataloader.sampler + + +def atleast_1d(tensor_or_array: Union[torch.Tensor, np.ndarray]): + if isinstance(tensor_or_array, torch.Tensor): + if hasattr(torch, "atleast_1d"): + tensor_or_array = torch.atleast_1d(tensor_or_array) + elif tensor_or_array.ndim < 1: + tensor_or_array = tensor_or_array[None] + else: + tensor_or_array = np.atleast_1d(tensor_or_array) + return tensor_or_array + + +def torch_pad_and_concatenate(tensor1, tensor2, padding_index=-100): + """Concatenates `tensor1` and `tensor2` on first axis, applying padding on the second if necessary.""" + tensor1 = atleast_1d(tensor1) + tensor2 = atleast_1d(tensor2) + + if len(tensor1.shape) == 1 or tensor1.shape[1] == tensor2.shape[1]: + return torch.cat((tensor1, tensor2), dim=0) + + # Let's figure out the new shape + new_shape = (tensor1.shape[0] + tensor2.shape[0], max(tensor1.shape[1], tensor2.shape[1])) + tensor1.shape[2:] + + # Now let's fill the result tensor + result = tensor1.new_full(new_shape, padding_index) + result[: tensor1.shape[0], : tensor1.shape[1]] = tensor1 + result[tensor1.shape[0] :, : tensor2.shape[1]] = tensor2 + return result + + +def numpy_pad_and_concatenate(array1, array2, padding_index=-100): + """Concatenates `array1` and `array2` on first axis, applying padding on the second if necessary.""" + array1 = atleast_1d(array1) + array2 = atleast_1d(array2) + + if len(array1.shape) == 1 or array1.shape[1] == array2.shape[1]: + return np.concatenate((array1, array2), axis=0) + + # Let's figure out the new shape + new_shape = (array1.shape[0] + array2.shape[0], max(array1.shape[1], array2.shape[1])) + array1.shape[2:] + + # Now let's fill the result tensor + result = np.full_like(array1, padding_index, shape=new_shape) + result[: array1.shape[0], : array1.shape[1]] = array1 + result[array1.shape[0] :, : array2.shape[1]] = array2 + return result + + +def nested_concat(tensors, new_tensors, padding_index=-100): + """ + Concat the `new_tensors` to `tensors` on the first dim and pad them on the second if needed. Works for tensors or + nested list/tuples/dict of tensors. + """ + if not (isinstance(tensors, torch.Tensor) and isinstance(new_tensors, torch.Tensor)): + assert type(tensors) is type(new_tensors), ( + f"Expected `tensors` and `new_tensors` to have the same type but found {type(tensors)} and {type(new_tensors)}." + ) + if isinstance(tensors, (list, tuple)): + return type(tensors)(nested_concat(t, n, padding_index=padding_index) for t, n in zip(tensors, new_tensors)) + elif isinstance(tensors, torch.Tensor): + return torch_pad_and_concatenate(tensors, new_tensors, padding_index=padding_index) + elif isinstance(tensors, Mapping): + return type(tensors)( + {k: nested_concat(t, new_tensors[k], padding_index=padding_index) for k, t in tensors.items()} + ) + elif isinstance(tensors, np.ndarray): + return numpy_pad_and_concatenate(tensors, new_tensors, padding_index=padding_index) + else: + raise TypeError(f"Unsupported type for concatenation: got {type(tensors)}") + + +def find_batch_size(tensors): + """ + Find the first dimension of a tensor in a nested list/tuple/dict of tensors. + """ + if isinstance(tensors, (list, tuple)): + for t in tensors: + result = find_batch_size(t) + if result is not None: + return result + elif isinstance(tensors, Mapping): + for key, value in tensors.items(): + result = find_batch_size(value) + if result is not None: + return result + elif isinstance(tensors, torch.Tensor): + return tensors.shape[0] if len(tensors.shape) >= 1 else None + elif isinstance(tensors, np.ndarray): + return tensors.shape[0] if len(tensors.shape) >= 1 else None + + +def nested_numpify(tensors): + "Numpify `tensors` (even if it's a nested list/tuple/dict of tensors)." + if isinstance(tensors, (list, tuple)): + return type(tensors)(nested_numpify(t) for t in tensors) + if isinstance(tensors, Mapping): + return type(tensors)({k: nested_numpify(t) for k, t in tensors.items()}) + + t = tensors.cpu() + if t.dtype == torch.bfloat16: + # As of Numpy 1.21.4, NumPy does not support bfloat16 (see + # https://github.com/numpy/numpy/blob/a47ecdea856986cd60eabbd53265c2ca5916ad5d/doc/source/user/basics.types.rst ). + # Until Numpy adds bfloat16, we must convert float32. + t = t.to(torch.float32) + return t.numpy() + + +def nested_detach(tensors): + "Detach `tensors` (even if it's a nested list/tuple/dict of tensors)." + if isinstance(tensors, (list, tuple)): + return type(tensors)(nested_detach(t) for t in tensors) + elif isinstance(tensors, Mapping): + return type(tensors)({k: nested_detach(t) for k, t in tensors.items()}) + return tensors.detach() if isinstance(tensors, torch.Tensor) else tensors + + +def nested_xla_mesh_reduce(tensors, name): + if is_torch_xla_available(): + import torch_xla.core.xla_model as xm + + if isinstance(tensors, (list, tuple)): + return type(tensors)(nested_xla_mesh_reduce(t, f"{name}_{i}") for i, t in enumerate(tensors)) + if isinstance(tensors, Mapping): + return type(tensors)( + {k: nested_xla_mesh_reduce(t, f"{name}_{i}") for i, (k, t) in enumerate(tensors.items())} + ) + + tensors = atleast_1d(tensors) + return xm.mesh_reduce(name, tensors, torch.cat) + else: + raise ImportError("Torch xla must be installed to use `nested_xla_mesh_reduce`") + + +def distributed_concat(tensor: Any, num_total_examples: Optional[int] = None) -> Any: + try: + if isinstance(tensor, (tuple, list)): + return type(tensor)(distributed_concat(t, num_total_examples) for t in tensor) + if isinstance(tensor, Mapping): + return type(tensor)({k: distributed_concat(t, num_total_examples) for k, t in tensor.items()}) + tensor = atleast_1d(tensor).contiguous() + output_tensors = [tensor.clone() for _ in range(dist.get_world_size())] + dist.all_gather(output_tensors, tensor) + concat = torch.cat(output_tensors, dim=0) + + # truncate the dummy elements added by SequentialDistributedSampler + if num_total_examples is not None: + concat = concat[:num_total_examples] + return concat + except AssertionError: + raise AssertionError("Not currently using distributed training") + + +def distributed_broadcast_scalars( + scalars: list[Union[int, float]], + num_total_examples: Optional[int] = None, + device: Optional[torch.device] = torch.device("cuda"), +) -> torch.Tensor: + try: + tensorized_scalar = torch.tensor(scalars, device=device) + output_tensors = [tensorized_scalar.clone() for _ in range(dist.get_world_size())] + dist.all_gather(output_tensors, tensorized_scalar) + concat = torch.cat(output_tensors, dim=0) + + # truncate the dummy elements added by SequentialDistributedSampler + if num_total_examples is not None: + concat = concat[:num_total_examples] + return concat + except AssertionError: + raise AssertionError("Not currently using distributed training") + + +def reissue_pt_warnings(caught_warnings): + # Reissue warnings + if len(caught_warnings) > 1: + for w in caught_warnings: + if w.category is not UserWarning: + warnings.warn(w.message, w.category) + + +@contextmanager +def torch_distributed_zero_first(local_rank: int): + """ + Decorator to make all processes in distributed training wait for each local_master to do something. + + Args: + local_rank (`int`): The rank of the local process. + """ + if local_rank not in [-1, 0]: + dist.barrier() + yield + if local_rank == 0: + dist.barrier() + + +class DistributedSamplerWithLoop(DistributedSampler): + """ + Like a torch.utils.data.distributed.DistributedSampler` but loops at the end back to the beginning of the shuffled + samples to make each process have a round multiple of batch_size samples. + + Args: + dataset (`torch.utils.data.Dataset`): + Dataset used for sampling. + batch_size (`int`): + The batch size used with this sampler + kwargs (`Dict[str, Any]`, *optional*): + All other keyword arguments passed to `DistributedSampler`. + """ + + def __init__(self, dataset, batch_size, **kwargs): + super().__init__(dataset, **kwargs) + self.batch_size = batch_size + + def __iter__(self): + indices = list(super().__iter__()) + remainder = 0 if len(indices) % self.batch_size == 0 else self.batch_size - len(indices) % self.batch_size + # DistributedSampler already added samples from the beginning to make the number of samples a round multiple + # of the world size, so we skip those. + start_remainder = 1 if self.rank < len(self.dataset) % self.num_replicas else 0 + indices += indices[start_remainder : start_remainder + remainder] + return iter(indices) + + +class EvalLoopContainer: + """ + Container to store intermediate results of evaluation loop. + + Args: + do_nested_concat (`bool`, *optional*, defaults to `True`): + If set to `True`, each iteration will recursively concatenate a new object containing tensors to + the existing stored tensors, provided that the structure of the existing object and the new one + are identical. If set to `False`, all newly added tensors will be stored in a list. + padding_index (`int`, *optional*, defaults to -100): + Value used to pad tensors of different shapes when `do_nested_concat=True`. + """ + + def __init__(self, do_nested_concat: bool = True, padding_index: int = -100): + self.do_nested_concat = do_nested_concat + self.padding_index = padding_index + self.tensors = None + self.arrays = None + + def add(self, tensors) -> None: + """Add tensors to the stored objects. If `do_nested_concat=True`, the tensors will be concatenated recursively.""" + if self.tensors is None: + self.tensors = tensors if self.do_nested_concat else [tensors] + elif self.do_nested_concat: + self.tensors = nested_concat(self.tensors, tensors, padding_index=self.padding_index) + else: + self.tensors.append(tensors) + + def to_cpu_and_numpy(self) -> None: + """Move tensors in stored objects to CPU and convert them to numpy arrays.""" + + # Check if we have something to add, if not just return + if self.tensors is None: + return + + new_arrays = nested_numpify(self.tensors) + if self.arrays is None: + self.arrays = new_arrays + elif self.do_nested_concat: + self.arrays = nested_concat(self.arrays, new_arrays, padding_index=self.padding_index) + else: + self.arrays.extend(new_arrays) + + # reset device tensors after adding to cpu + self.tensors = None + + def get_arrays(self): + """Returns the numpified and moved to CPU stored objects.""" + self.to_cpu_and_numpy() + return self.arrays + + +class SequentialDistributedSampler(Sampler): + """ + Distributed Sampler that subsamples indices sequentially, making it easier to collate all results at the end. + + Even though we only use this sampler for eval and predict (no training), which means that the model params won't + have to be synced (i.e. will not hang for synchronization even if varied number of forward passes), we still add + extra samples to the sampler to make it evenly divisible (like in `DistributedSampler`) to make it easy to `gather` + or `reduce` resulting tensors at the end of the loop. + """ + + def __init__(self, dataset, num_replicas=None, rank=None, batch_size=None): + warnings.warn( + "SequentialDistributedSampler is deprecated and will be removed in v5 of Transformers.", + FutureWarning, + ) + if num_replicas is None: + if not dist.is_available(): + raise RuntimeError("Requires distributed package to be available") + num_replicas = dist.get_world_size() + if rank is None: + if not dist.is_available(): + raise RuntimeError("Requires distributed package to be available") + rank = dist.get_rank() + self.dataset = dataset + self.num_replicas = num_replicas + self.rank = rank + num_samples = len(self.dataset) + # Add extra samples to make num_samples a multiple of batch_size if passed + if batch_size is not None: + self.num_samples = int(math.ceil(num_samples / (batch_size * num_replicas))) * batch_size + else: + self.num_samples = int(math.ceil(num_samples / num_replicas)) + self.total_size = self.num_samples * self.num_replicas + self.batch_size = batch_size + + def __iter__(self): + indices = list(range(len(self.dataset))) + + # add extra samples to make it evenly divisible + indices += indices[: (self.total_size - len(indices))] + assert len(indices) == self.total_size, ( + f"Indices length {len(indices)} and total size {self.total_size} mismatched" + ) + + # subsample + indices = indices[self.rank * self.num_samples : (self.rank + 1) * self.num_samples] + assert len(indices) == self.num_samples, ( + f"Indices length {len(indices)} and sample number {self.num_samples} mismatched" + ) + + return iter(indices) + + def __len__(self): + return self.num_samples + + +def get_tpu_sampler(dataset: torch.utils.data.Dataset, batch_size: int): + if xm.xrt_world_size() <= 1: + return RandomSampler(dataset) + return DistributedSampler(dataset, num_replicas=xm.xrt_world_size(), rank=xm.get_ordinal()) + + +def nested_new_like(arrays, num_samples, padding_index=-100): + """Create the same nested structure as `arrays` with a first dimension always at `num_samples`.""" + if isinstance(arrays, (list, tuple)): + return type(arrays)(nested_new_like(x, num_samples) for x in arrays) + return np.full_like(arrays, padding_index, shape=(num_samples, *arrays.shape[1:])) + + +def expand_like(arrays, new_seq_length, padding_index=-100): + """Expand the `arrays` so that the second dimension grows to `new_seq_length`. Uses `padding_index` for padding.""" + result = np.full_like(arrays, padding_index, shape=(arrays.shape[0], new_seq_length) + arrays.shape[2:]) + result[:, : arrays.shape[1]] = arrays + return result + + +def nested_truncate(tensors, limit): + "Truncate `tensors` at `limit` (even if it's a nested list/tuple/dict of tensors)." + if isinstance(tensors, (list, tuple)): + return type(tensors)(nested_truncate(t, limit) for t in tensors) + if isinstance(tensors, Mapping): + return type(tensors)({k: nested_truncate(t, limit) for k, t in tensors.items()}) + + return tensors[:limit] + + +class DistributedTensorGatherer: + """ + A class responsible for properly gathering tensors (or nested list/tuple of tensors) on the CPU by chunks. + + If our dataset has 16 samples with a batch size of 2 on 3 processes and we gather then transfer on CPU at every + step, our sampler will generate the following indices: + + `[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1]` + + to get something of size a multiple of 3 (so that each process gets the same dataset length). Then process 0, 1 and + 2 will be responsible of making predictions for the following samples: + + - P0: `[0, 1, 2, 3, 4, 5]` + - P1: `[6, 7, 8, 9, 10, 11]` + - P2: `[12, 13, 14, 15, 0, 1]` + + The first batch treated on each process will be: + + - P0: `[0, 1]` + - P1: `[6, 7]` + - P2: `[12, 13]` + + So if we gather at the end of the first batch, we will get a tensor (nested list/tuple of tensor) corresponding to + the following indices: + + `[0, 1, 6, 7, 12, 13]` + + If we directly concatenate our results without taking any precautions, the user will then get the predictions for + the indices in this order at the end of the prediction loop: + + `[0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1]` + + For some reason, that's not going to roll their boat. This class is there to solve that problem. + + Args: + world_size (`int`): + The number of processes used in the distributed training. + num_samples (`int`): + The number of samples in our dataset. + make_multiple_of (`int`, *optional*): + If passed, the class assumes the datasets passed to each process are made to be a multiple of this argument + (by adding samples). + padding_index (`int`, *optional*, defaults to -100): + The padding index to use if the arrays don't all have the same sequence length. + """ + + def __init__(self, world_size, num_samples, make_multiple_of=None, padding_index=-100): + warnings.warn( + "DistributedTensorGatherer is deprecated and will be removed in v5 of Transformers.", + FutureWarning, + ) + self.world_size = world_size + self.num_samples = num_samples + total_size = world_size if make_multiple_of is None else world_size * make_multiple_of + self.total_samples = int(np.ceil(num_samples / total_size)) * total_size + self.process_length = self.total_samples // world_size + self._storage = None + self._offsets = None + self.padding_index = padding_index + + def add_arrays(self, arrays): + """ + Add `arrays` to the internal storage, Will initialize the storage to the full size at the first arrays passed + so that if we're bound to get an OOM, it happens at the beginning. + """ + if arrays is None: + return + if self._storage is None: + self._storage = nested_new_like(arrays, self.total_samples, padding_index=self.padding_index) + self._offsets = list(range(0, self.total_samples, self.process_length)) + + slice_len, self._storage = self._nested_set_tensors(self._storage, arrays) + for i in range(self.world_size): + self._offsets[i] += slice_len + + def _nested_set_tensors(self, storage, arrays): + if isinstance(arrays, (list, tuple)): + result = [self._nested_set_tensors(x, y) for x, y in zip(storage, arrays)] + return result[0][0], type(arrays)(r[1] for r in result) + assert arrays.shape[0] % self.world_size == 0, ( + f"Arrays passed should all have a first dimension multiple of {self.world_size}, found {arrays.shape[0]}." + ) + + slice_len = arrays.shape[0] // self.world_size + for i in range(self.world_size): + if len(arrays.shape) == 1: + storage[self._offsets[i] : self._offsets[i] + slice_len] = arrays[i * slice_len : (i + 1) * slice_len] + else: + # Expand the array on the fly if needed. + if len(storage.shape) > 1 and storage.shape[1] < arrays.shape[1]: + storage = expand_like(storage, arrays.shape[1], padding_index=self.padding_index) + storage[self._offsets[i] : self._offsets[i] + slice_len, : arrays.shape[1]] = arrays[ + i * slice_len : (i + 1) * slice_len + ] + return slice_len, storage + + def finalize(self): + """ + Return the properly gathered arrays and truncate to the number of samples (since the sampler added some extras + to get each process a dataset of the same length). + """ + if self._storage is None: + return + if self._offsets[0] != self.process_length: + logger.warning("Not all data has been set. Are you sure you passed all values?") + return nested_truncate(self._storage, self.num_samples) + + +@dataclass +class LabelSmoother: + """ + Adds label-smoothing on a pre-computed output from a Transformers model. + + Args: + epsilon (`float`, *optional*, defaults to 0.1): + The label smoothing factor. + ignore_index (`int`, *optional*, defaults to -100): + The index in the labels to ignore when computing the loss. + """ + + epsilon: float = 0.1 + ignore_index: int = -100 + + def __call__(self, model_output, labels, shift_labels=False): + logits = model_output["logits"] if isinstance(model_output, dict) else model_output[0] + if shift_labels: + logits = logits[..., :-1, :].contiguous() + labels = labels[..., 1:].contiguous() + + log_probs = -nn.functional.log_softmax(logits, dim=-1) + if labels.dim() == log_probs.dim() - 1: + labels = labels.unsqueeze(-1) + + padding_mask = labels.eq(self.ignore_index) + # In case the ignore_index is -100, the gather will fail, so we replace labels by 0. The padding_mask + # will ignore them in any case. + labels = torch.clamp(labels, min=0) + nll_loss = log_probs.gather(dim=-1, index=labels) + # works for fp16 input tensor too, by internally upcasting it to fp32 + smoothed_loss = log_probs.sum(dim=-1, keepdim=True, dtype=torch.float32) + + nll_loss.masked_fill_(padding_mask, 0.0) + smoothed_loss.masked_fill_(padding_mask, 0.0) + + # Take the mean over the label dimensions, then divide by the number of active elements (i.e. not-padded): + num_active_elements = padding_mask.numel() - padding_mask.long().sum() + nll_loss = nll_loss.sum() / num_active_elements + smoothed_loss = smoothed_loss.sum() / (num_active_elements * log_probs.shape[-1]) + return (1 - self.epsilon) * nll_loss + self.epsilon * smoothed_loss + + +def get_length_grouped_indices(lengths, batch_size, mega_batch_mult=None, generator=None): + """ + Return a list of indices so that each slice of `batch_size` consecutive indices correspond to elements of similar + lengths. To do this, the indices are: + + - randomly permuted + - grouped in mega-batches of size `mega_batch_mult * batch_size` + - sorted by length in each mega-batch + + The result is the concatenation of all mega-batches, with the batch of `batch_size` containing the element of + maximum length placed first, so that an OOM happens sooner rather than later. + """ + # Default for mega_batch_mult: 50 or the number to get 4 megabatches, whichever is smaller. + if mega_batch_mult is None: + mega_batch_mult = min(len(lengths) // (batch_size * 4), 50) + # Just in case, for tiny datasets + if mega_batch_mult == 0: + mega_batch_mult = 1 + + # We need to use torch for the random part as a distributed sampler will set the random seed for torch. + indices = torch.randperm(len(lengths), generator=generator) + megabatch_size = mega_batch_mult * batch_size + megabatches = [indices[i : i + megabatch_size].tolist() for i in range(0, len(lengths), megabatch_size)] + megabatches = [sorted(megabatch, key=lambda i: lengths[i], reverse=True) for megabatch in megabatches] + + # The rest is to get the biggest batch first. + # Since each megabatch is sorted by descending length, the longest element is the first + megabatch_maximums = [lengths[megabatch[0]] for megabatch in megabatches] + max_idx = torch.argmax(torch.tensor(megabatch_maximums)).item() + # Switch to put the longest element in first position + megabatches[0][0], megabatches[max_idx][0] = megabatches[max_idx][0], megabatches[0][0] + + return [i for megabatch in megabatches for i in megabatch] + + +class LengthGroupedSampler(Sampler): + r""" + Sampler that samples indices in a way that groups together features of the dataset of roughly the same length while + keeping a bit of randomness. + """ + + def __init__( + self, + batch_size: int, + dataset: Optional[Dataset] = None, + lengths: Optional[list[int]] = None, + model_input_name: Optional[str] = None, + generator=None, + ): + if dataset is None and lengths is None: + raise ValueError("One of dataset and lengths must be provided.") + + self.batch_size = batch_size + if lengths is None: + model_input_name = model_input_name if model_input_name is not None else "input_ids" + if ( + not (isinstance(dataset[0], dict) or isinstance(dataset[0], BatchEncoding)) + or model_input_name not in dataset[0] + ): + raise ValueError( + "Can only automatically infer lengths for datasets whose items are dictionaries with an " + f"'{model_input_name}' key." + ) + lengths = [len(feature[model_input_name]) for feature in dataset] + elif isinstance(lengths, torch.Tensor): + logger.info( + "If lengths is a torch.Tensor, LengthGroupedSampler will be slow. Converting lengths to List[int]..." + ) + lengths = lengths.tolist() + + self.lengths = lengths + self.generator = generator + + def __len__(self): + return len(self.lengths) + + def __iter__(self): + indices = get_length_grouped_indices(self.lengths, self.batch_size, generator=self.generator) + return iter(indices) + + +class DistributedLengthGroupedSampler(DistributedSampler): + r""" + Distributed Sampler that samples indices in a way that groups together features of the dataset of roughly the same + length while keeping a bit of randomness. + """ + + # Copied and adapted from PyTorch DistributedSampler. + def __init__( + self, + batch_size: int, + dataset: Optional[Dataset] = None, + num_replicas: Optional[int] = None, + rank: Optional[int] = None, + seed: int = 0, + drop_last: bool = False, + lengths: Optional[list[int]] = None, + model_input_name: Optional[str] = None, + ): + if dataset is None and lengths is None: + raise ValueError("One of dataset and lengths must be provided.") + if num_replicas is None: + if not dist.is_available(): + raise RuntimeError("Requires distributed package to be available") + num_replicas = dist.get_world_size() + if rank is None: + if not dist.is_available(): + raise RuntimeError("Requires distributed package to be available") + rank = dist.get_rank() + + self.batch_size = batch_size + self.num_replicas = num_replicas + self.rank = rank + self.epoch = 0 + self.drop_last = drop_last + + if lengths is None: + model_input_name = model_input_name if model_input_name is not None else "input_ids" + if ( + not (isinstance(dataset[0], dict) or isinstance(dataset[0], BatchEncoding)) + or model_input_name not in dataset[0] + ): + raise ValueError( + "Can only automatically infer lengths for datasets whose items are dictionaries with an " + f"'{model_input_name}' key." + ) + lengths = [len(feature[model_input_name]) for feature in dataset] + elif isinstance(lengths, torch.Tensor): + logger.info( + "If lengths is a torch.Tensor, DistributedLengthGroupedSampler will be slow. Converting lengths to" + " List[int]..." + ) + lengths = lengths.tolist() + + self.lengths = lengths + + # If the dataset length is evenly divisible by # of replicas, then there + # is no need to drop any data, since the dataset will be split equally. + if self.drop_last and len(self.lengths) % self.num_replicas != 0: + # Split to nearest available length that is evenly divisible. + # This is to ensure each rank receives the same amount of data when + # using this Sampler. + self.num_samples = math.ceil((len(self.lengths) - self.num_replicas) / self.num_replicas) + else: + self.num_samples = math.ceil(len(self.lengths) / self.num_replicas) + self.total_size = self.num_samples * self.num_replicas + self.seed = seed + + def __iter__(self) -> Iterator: + # Deterministically shuffle based on epoch and seed + g = torch.Generator() + g.manual_seed(self.seed + self.epoch) + indices = get_length_grouped_indices(self.lengths, self.batch_size, generator=g) + + if not self.drop_last: + # add extra samples to make it evenly divisible + indices += indices[: (self.total_size - len(indices))] + else: + # remove tail of data to make it evenly divisible + indices = indices[: self.total_size] + assert len(indices) == self.total_size + + # subsample + indices = indices[self.rank : self.total_size : self.num_replicas] + assert len(indices) == self.num_samples + + return iter(indices) + + +class ShardSampler(Sampler): + """ + Sampler that shards batches between several processes. Dispatches indices batch by batch: on 2 processes with batch + size 4, the first two batches are `[0, 1, 2, 3, 4, 5, 6, 7]` and `[8, 9, 10, 11, 12, 13, 14, 15]`, which shard into + `[0, 1, 2, 3]` and `[8, 9, 10, 11]` for GPU-0 and `[4, 5, 6, 7]` and `[12, 13, 14, 15]` for GPU-1. + + The sampler thus yields `[0, 1, 2, 3, 8, 9, 10, 11]` on GPU-0 and `[4, 5, 6, 7, 12, 13, 14, 15]` on GPU-1. + """ + + def __init__( + self, + dataset: Dataset, + batch_size: int = 1, + drop_last: bool = False, + num_processes: int = 1, + process_index: int = 0, + ): + self.dataset = dataset + self.batch_size = batch_size + self.drop_last = drop_last + self.num_processes = num_processes + self.process_index = process_index + + self.total_batch_size = total_batch_size = batch_size * num_processes + + num_batches = len(dataset) // total_batch_size if drop_last else math.ceil(len(dataset) / total_batch_size) + self.total_num_samples = num_batches * total_batch_size + + def __iter__(self): + indices = list(range(len(self.dataset))) + + # Add extra samples to make it evenly divisible. While loop is there in the edge case we have a tiny dataset + # and it needs to be done several times. + while len(indices) < self.total_num_samples: + indices += indices[: (self.total_num_samples - len(indices))] + + result = [] + for batch_start in range(self.batch_size * self.process_index, self.total_num_samples, self.total_batch_size): + result += indices[batch_start : batch_start + self.batch_size] + + return iter(result) + + def __len__(self): + # Each shard only sees a fraction of total_num_samples. + return self.total_num_samples // self.num_processes + + +class IterableDatasetShard(IterableDataset): + """ + Wraps a PyTorch `IterableDataset` to generate samples for one of the processes only. Instances of this class will + always yield a number of samples that is a round multiple of the actual batch size (which is `batch_size x + num_processes`). Depending on the value of the `drop_last` attribute, it will either stop the iteration at the + first batch that would be too small or loop with indices from the beginning. + + On two processes with an iterable dataset yielding of `[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]` with a batch size of + 2: + + - the shard on process 0 will yield `[0, 1, 4, 5, 8, 9]` so will see batches `[0, 1]`, `[4, 5]`, `[8, 9]` + - the shard on process 1 will yield `[2, 3, 6, 7, 10, 11]` so will see batches `[2, 3]`, `[6, 7]`, `[10, 11]` + + + + If your IterableDataset implements some randomization that needs to be applied the same way on all processes + (for instance, a shuffling), you should use a `torch.Generator` in a `generator` attribute of the `dataset` to + generate your random numbers and call the [`~trainer_pt_utils.IterableDatasetShard.set_epoch`] method of this + object. It will set the seed of this `generator` to `seed + epoch` on all processes before starting the + iteration. Alternatively, you can also implement a `set_epoch()` method in your iterable dataset to deal with + this. + + + + Args: + dataset (`torch.utils.data.IterableDataset`): + The batch sampler to split in several shards. + batch_size (`int`, *optional*, defaults to 1): + The size of the batches per shard. + drop_last (`bool`, *optional*, defaults to `False`): + Whether or not to drop the last incomplete batch or complete the last batches by using the samples from the + beginning. + num_processes (`int`, *optional*, defaults to 1): + The number of processes running concurrently. + process_index (`int`, *optional*, defaults to 0): + The index of the current process. + seed (`int`, *optional*, defaults to 0): + A random seed that will be used for the random number generation in + [`~trainer_pt_utils.IterableDatasetShard.set_epoch`]. + """ + + def __init__( + self, + dataset: IterableDataset, + batch_size: int = 1, + drop_last: bool = False, + num_processes: int = 1, + process_index: int = 0, + seed: int = 0, + ): + self.dataset = dataset + self.batch_size = batch_size + self.drop_last = drop_last + self.num_processes = num_processes + self.process_index = process_index + self.seed = seed + self.epoch = 0 + self.num_examples = 0 + + def set_epoch(self, epoch): + self.epoch = epoch + if hasattr(self.dataset, "set_epoch"): + self.dataset.set_epoch(epoch) + + def __iter__(self): + self.num_examples = 0 + if ( + not hasattr(self.dataset, "set_epoch") + and hasattr(self.dataset, "generator") + and isinstance(self.dataset.generator, torch.Generator) + ): + self.dataset.generator.manual_seed(self.seed + self.epoch) + real_batch_size = self.batch_size * self.num_processes + process_slice = range(self.process_index * self.batch_size, (self.process_index + 1) * self.batch_size) + + first_batch = None + current_batch = [] + for element in self.dataset: + self.num_examples += 1 + current_batch.append(element) + # Wait to have a full batch before yielding elements. + if len(current_batch) == real_batch_size: + for i in process_slice: + yield current_batch[i] + if first_batch is None: + first_batch = current_batch.copy() + current_batch = [] + + # Finished if drop_last is True, otherwise complete the last batch with elements from the beginning. + if not self.drop_last and len(current_batch) > 0: + if first_batch is None: + first_batch = current_batch.copy() + while len(current_batch) < real_batch_size: + current_batch += first_batch + for i in process_slice: + yield current_batch[i] + + def __len__(self): + # Will raise an error if the underlying dataset is not sized. + if self.drop_last: + return (len(self.dataset) // (self.batch_size * self.num_processes)) * self.batch_size + else: + return math.ceil(len(self.dataset) / (self.batch_size * self.num_processes)) * self.batch_size + + +# In order to keep `trainer.py` compact and easy to understand, place any secondary PT Trainer +# helper methods here + + +def _get_learning_rate(self): + if self.is_deepspeed_enabled: + # with deepspeed's fp16 and dynamic loss scale enabled the optimizer/scheduler steps may + # not run for the first few dozen steps while loss scale is too large, and thus during + # that time `get_last_lr` will fail if called during that warm up stage, so work around it: + try: + last_lr = self.lr_scheduler.get_last_lr()[0] + except AssertionError as e: + if "need to call step" in str(e): + logger.warning("tried to get lr value before scheduler/optimizer started stepping, returning lr=0") + last_lr = 0 + else: + raise + else: + if isinstance(self.lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau): + last_lr = self.optimizer.param_groups[0]["lr"] + else: + last_lr = self.lr_scheduler.get_last_lr()[0] + if torch.is_tensor(last_lr): + last_lr = last_lr.item() + return last_lr + + +def _secs2timedelta(secs): + """ + Convert seconds to hh:mm:ss.msec, msecs rounded to 2 decimal places. + """ + + msec = int(abs(secs - int(secs)) * 100) + return f"{datetime.timedelta(seconds=int(secs))}.{msec:02d}" + + +def metrics_format(self, metrics: dict[str, float]) -> dict[str, float]: + """ + Reformat Trainer metrics values to a human-readable format. + + Args: + metrics (`Dict[str, float]`): + The metrics returned from train/evaluate/predict + + Returns: + metrics (`Dict[str, float]`): The reformatted metrics + """ + + metrics_copy = metrics.copy() + for k, v in metrics_copy.items(): + if "_mem_" in k: + metrics_copy[k] = f"{v >> 20}MB" + elif "_runtime" in k: + metrics_copy[k] = _secs2timedelta(v) + elif k == "total_flos": + metrics_copy[k] = f"{int(v) >> 30}GF" + elif isinstance(metrics_copy[k], float): + metrics_copy[k] = round(v, 4) + + return metrics_copy + + +def log_metrics(self, split, metrics): + """ + Log metrics in a specially formatted way. + + Under distributed environment this is done only for a process with rank 0. + + Args: + split (`str`): + Mode/split name: one of `train`, `eval`, `test` + metrics (`Dict[str, float]`): + The metrics returned from train/evaluate/predictmetrics: metrics dict + + Notes on memory reports: + + In order to get memory usage report you need to install `psutil`. You can do that with `pip install psutil`. + + Now when this method is run, you will see a report that will include: + + ``` + init_mem_cpu_alloc_delta = 1301MB + init_mem_cpu_peaked_delta = 154MB + init_mem_gpu_alloc_delta = 230MB + init_mem_gpu_peaked_delta = 0MB + train_mem_cpu_alloc_delta = 1345MB + train_mem_cpu_peaked_delta = 0MB + train_mem_gpu_alloc_delta = 693MB + train_mem_gpu_peaked_delta = 7MB + ``` + + **Understanding the reports:** + + - the first segment, e.g., `train__`, tells you which stage the metrics are for. Reports starting with `init_` + will be added to the first stage that gets run. So that if only evaluation is run, the memory usage for the + `__init__` will be reported along with the `eval_` metrics. + - the third segment, is either `cpu` or `gpu`, tells you whether it's the general RAM or the gpu0 memory + metric. + - `*_alloc_delta` - is the difference in the used/allocated memory counter between the end and the start of the + stage - it can be negative if a function released more memory than it allocated. + - `*_peaked_delta` - is any extra memory that was consumed and then freed - relative to the current allocated + memory counter - it is never negative. When you look at the metrics of any stage you add up `alloc_delta` + + `peaked_delta` and you know how much memory was needed to complete that stage. + + The reporting happens only for process of rank 0 and gpu 0 (if there is a gpu). Typically this is enough since the + main process does the bulk of work, but it could be not quite so if model parallel is used and then other GPUs may + use a different amount of gpu memory. This is also not the same under DataParallel where gpu0 may require much more + memory than the rest since it stores the gradient and optimizer states for all participating GPUs. Perhaps in the + future these reports will evolve to measure those too. + + The CPU RAM metric measures RSS (Resident Set Size) includes both the memory which is unique to the process and the + memory shared with other processes. It is important to note that it does not include swapped out memory, so the + reports could be imprecise. + + The CPU peak memory is measured using a sampling thread. Due to python's GIL it may miss some of the peak memory if + that thread didn't get a chance to run when the highest memory was used. Therefore this report can be less than + reality. Using `tracemalloc` would have reported the exact peak memory, but it doesn't report memory allocations + outside of python. So if some C++ CUDA extension allocated its own memory it won't be reported. And therefore it + was dropped in favor of the memory sampling approach, which reads the current process memory usage. + + The GPU allocated and peak memory reporting is done with `torch.cuda.memory_allocated()` and + `torch.cuda.max_memory_allocated()`. This metric reports only "deltas" for pytorch-specific allocations, as + `torch.cuda` memory management system doesn't track any memory allocated outside of pytorch. For example, the very + first cuda call typically loads CUDA kernels, which may take from 0.5 to 2GB of GPU memory. + + Note that this tracker doesn't account for memory allocations outside of [`Trainer`]'s `__init__`, `train`, + `evaluate` and `predict` calls. + + Because `evaluation` calls may happen during `train`, we can't handle nested invocations because + `torch.cuda.max_memory_allocated` is a single counter, so if it gets reset by a nested eval call, `train`'s tracker + will report incorrect info. If this [pytorch issue](https://github.com/pytorch/pytorch/issues/16266) gets resolved + it will be possible to change this class to be re-entrant. Until then we will only track the outer level of + `train`, `evaluate` and `predict` methods. Which means that if `eval` is called during `train`, it's the latter + that will account for its memory usage and that of the former. + + This also means that if any other tool that is used along the [`Trainer`] calls + `torch.cuda.reset_peak_memory_stats`, the gpu peak memory stats could be invalid. And the [`Trainer`] will disrupt + the normal behavior of any such tools that rely on calling `torch.cuda.reset_peak_memory_stats` themselves. + + For best performance you may want to consider turning the memory profiling off for production runs. + """ + if not self.is_world_process_zero(): + return + + print(f"***** {split} metrics *****") + metrics_formatted = self.metrics_format(metrics) + k_width = max(len(str(x)) for x in metrics_formatted.keys()) + v_width = max(len(str(x)) for x in metrics_formatted.values()) + for key in sorted(metrics_formatted.keys()): + print(f" {key: <{k_width}} = {metrics_formatted[key]:>{v_width}}") + + +def save_metrics(self, split, metrics, combined=True): + """ + Save metrics into a json file for that split, e.g. `train_results.json`. + + Under distributed environment this is done only for a process with rank 0. + + Args: + split (`str`): + Mode/split name: one of `train`, `eval`, `test`, `all` + metrics (`Dict[str, float]`): + The metrics returned from train/evaluate/predict + combined (`bool`, *optional*, defaults to `True`): + Creates combined metrics by updating `all_results.json` with metrics of this call + + To understand the metrics please read the docstring of [`~Trainer.log_metrics`]. The only difference is that raw + unformatted numbers are saved in the current method. + + """ + if not self.is_world_process_zero(): + return + + path = os.path.join(self.args.output_dir, f"{split}_results.json") + with open(path, "w") as f: + json.dump(metrics, f, indent=4, sort_keys=True) + + if combined: + path = os.path.join(self.args.output_dir, "all_results.json") + if os.path.exists(path): + with open(path) as f: + all_metrics = json.load(f) + else: + all_metrics = {} + + all_metrics.update(metrics) + with open(path, "w") as f: + json.dump(all_metrics, f, indent=4, sort_keys=True) + + +def save_state(self): + """ + Saves the Trainer state, since Trainer.save_model saves only the tokenizer with the model. + + Under distributed environment this is done only for a process with rank 0. + """ + if not self.is_world_process_zero(): + return + + path = os.path.join(self.args.output_dir, "trainer_state.json") + self.state.save_to_json(path) + + +def get_model_param_count(model, trainable_only=False): + """ + Calculate model's total param count. If trainable_only is True then count only those requiring grads. + """ + if is_deepspeed_zero3_enabled(): + + def numel(p): + return p.ds_numel if hasattr(p, "ds_numel") else p.numel() + + else: + + def numel(p): + return p.numel() + + return sum(numel(p) for p in model.parameters() if not trainable_only or p.requires_grad) + + +def get_parameter_names(model, forbidden_layer_types, forbidden_layer_names=None): + """ + Returns the names of the model parameters that are not inside a forbidden layer. + """ + if forbidden_layer_names is None: + forbidden_layer_names = [] + result = [] + for name, child in model.named_children(): + child_params = get_parameter_names(child, forbidden_layer_types, forbidden_layer_names) + result += [ + f"{name}.{n}" + for n in child_params + if not isinstance(child, tuple(forbidden_layer_types)) + and not any(forbidden in f"{name}.{n}".lower() for forbidden in forbidden_layer_names) + ] + # Add model specific parameters that are not in any child + result += [ + k for k in model._parameters.keys() if not any(forbidden in k.lower() for forbidden in forbidden_layer_names) + ] + return result + + +def get_module_class_from_name(module, name): + """ + Gets a class from a module by its name. + + Args: + module (`torch.nn.Module`): The module to get the class from. + name (`str`): The name of the class. + """ + modules_children = list(module.children()) + if module.__class__.__name__ == name: + return module.__class__ + elif len(modules_children) == 0: + return + else: + for child_module in modules_children: + module_class = get_module_class_from_name(child_module, name) + if module_class is not None: + return module_class + + +def remove_dummy_checkpoint(is_main_process, output_dir, filenames): + if is_main_process: + for filename in filenames: + file = os.path.join(output_dir, filename) + if os.path.isfile(file): + os.remove(file) + + +if is_sagemaker_mp_enabled(): + import smdistributed.modelparallel.torch as smp + + @smp.step() + def smp_forward_backward(model, inputs, gradient_accumulation_steps=1): + outputs = model(**inputs) + loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0] + loss /= gradient_accumulation_steps + model.backward(loss) + return loss + + @smp.step() + def smp_forward_only(model, inputs): + return model(**inputs) + + def smp_gather(tensor): + if isinstance(tensor, (list, tuple)): + return type(tensor)(smp_gather(t) for t in tensor) + elif isinstance(tensor, dict): + return type(tensor)({k: smp_gather(v) for k, v in tensor.items()}) + elif not isinstance(tensor, torch.Tensor): + raise TypeError( + f"Can't gather the values of type {type(tensor)}, only of nested list/tuple/dicts of tensors." + ) + all_tensors = smp.allgather(tensor, smp.CommGroup.DP_GROUP) + all_tensors = [atleast_1d(t) for t in all_tensors] + return torch.cat([t.cpu() for t in all_tensors], dim=0) + + def smp_nested_concat(tensor): + if isinstance(tensor, (list, tuple)): + return type(tensor)(smp_nested_concat(t) for t in tensor) + elif isinstance(tensor, dict): + return type(tensor)({k: smp_nested_concat(v) for k, v in tensor.items()}) + # It doesn't seem possible to check here if `tensor` is a StepOutput because StepOutput lives in `smp.step` + # which is also the name of the decorator so Python is confused. + return tensor.detach().concat().cpu() + + +@dataclass +class AcceleratorConfig: + """ + A subset of arguments relating to the underlying [`accelerate.Accelerator`] + implementation utilized in the `Trainer` that can be customized. + Mostly relating to data. + + Parameters: + split_batches (`bool`, *optional*, defaults to `False`): + Whether or not the accelerator should split the batches yielded by the dataloaders across the devices. If + `True` the actual batch size used will be the same on any kind of distributed processes, but it must be a + round multiple of the `num_processes` you are using. If `False`, actual batch size used will be the one set + in your script multiplied by the number of processes. + dispatch_batches (`bool`, *optional*): + If set to `True`, the dataloader prepared by the Accelerator is only iterated through on the main process + and then the batches are split and broadcast to each process. Will default to `True` for `DataLoader` whose + underlying dataset is an `IterableDataset`, `False` otherwise. + even_batches (`bool`, *optional*, defaults to `True`): + If set to `True`, in cases where the total batch size across all processes does not exactly divide the + dataset, samples at the start of the dataset will be duplicated so the batch can be divided equally among + all workers. + use_seedable_sampler (`bool`, *optional*, defaults to `True`): + Whether or not use a fully seedable random sampler ([`accelerate.data_loader.SeedableRandomSampler`]). Ensures + training results are fully reproducible using a different sampling technique. While seed-to-seed results + may differ, on average the differences are negligible when using multiple different seeds to compare. Should + also be ran with [`~utils.set_seed`] for the best results. + gradient_accumulation_kwargs (`dict`, *optional*): + Additional kwargs to configure gradient accumulation, see [`accelerate.utils.GradientAccumulationPlugin`]. + Any of the following (optional) keys are acceptable: + num_steps (`int`): Will take precedence over [`~.TrainingArguments.gradient_accumulation_steps`] if + the latter is set to 1, otherwise an exception will be raised. + adjust_scheduler (`bool`): Whether to adjust the scheduler steps to account for [`~.TrainingArguments.gradient_accumulation_steps`]. + The [`accelerate.utils.GradientAccumulationPlugin`] default is `True`. + sync_each_batch (`bool`): Whether to synchronize the gradients at each data batch. + The [`accelerate.utils.GradientAccumulationPlugin`] default is `False`. + non_blocking (`bool`, *optional*, defaults to `False`): + Whether to use non-blocking CUDA calls to help minimize synchronization during + distributed training with prepared `DataLoader` inputs being moved to device. + Best if used with `pin_memory=True` in the `TrainingArguments`. + use_configured_state (`bool*, *optional*, defaults to `False`): + Whether or not to use a pre-configured `AcceleratorState` or `PartialState` defined + before calling `TrainingArguments`. If `True`, an `Accelerator` or `PartialState` + must be initialized. May lead to issues using sweeps or hyperparameter tuning. + + """ + + # Data related arguments + split_batches: bool = field( + default=False, + metadata={ + "help": "Whether or not the accelerator should split the batches yielded by the dataloaders across the devices. If" + " `True` the actual batch size used will be the same on any kind of distributed processes, but it must be a" + " round multiple of the `num_processes` you are using. If `False`, actual batch size used will be the one set" + " in your script multiplied by the number of processes." + }, + ) + dispatch_batches: Optional[bool] = field( + default=None, + metadata={ + "help": "If set to `True`, the dataloader prepared by the Accelerator is only iterated through on the main process" + " and then the batches are split and broadcast to each process. Will default to `True` for `DataLoader` whose" + " underlying dataset is an `IterableDataslet`, `False` otherwise." + }, + ) + even_batches: bool = field( + default=True, + metadata={ + "help": "If set to `True`, in cases where the total batch size across all processes does not exactly divide the" + " dataset, samples at the start of the dataset will be duplicated so the batch can be divided equally among" + " all workers." + }, + ) + use_seedable_sampler: bool = field( + default=True, + metadata={ + "help": "Whether or not use a fully seedable random sampler ([`accelerate.data_loader.SeedableRandomSampler`])." + "Ensures training results are fully reproducible using a different sampling technique. " + "While seed-to-seed results may differ, on average the differences are negligible when using" + "multiple different seeds to compare. Should also be ran with [`~utils.set_seed`] for the best results." + }, + ) + + non_blocking: Optional[bool] = field( + default=False, + metadata={ + "help": "Whether to use non-blocking CUDA calls to help minimize synchronization during " + "distributed training with prepared `DataLoader` inputs being moved to device. " + "Best if used with `pin_memory=True` in the `TrainingArguments`. Requires accelerate " + "v0.30.0." + }, + ) + + gradient_accumulation_kwargs: Optional[dict] = field( + default=None, + metadata={ + "help": "Additional kwargs to configure gradient accumulation, see [`accelerate.utils.GradientAccumulationPlugin`]. " + "Any of the following (optional) keys are acceptable: " + " num_steps (`int`): Will take precedence over [`~.TrainingArguments.gradient_accumulation_steps`] if " + " the latter is set to 1, otherwise an exception will be raised. " + " adjust_scheduler (`bool`): Whether to adjust the scheduler steps to account for [`~.TrainingArguments.gradient_accumulation_steps`]. " + " The [`accelerate.utils.GradientAccumulationPlugin`] default is `True`. " + " sync_each_batch (`bool`): Whether to synchronize the gradients at each data batch. " + " The [`accelerate.utils.GradientAccumulationPlugin`] default is `False`." + }, + ) + use_configured_state: bool = field( + default=False, + metadata={ + "help": "Whether or not to use a pre-configured `AcceleratorState` or `PartialState` defined before calling `TrainingArguments`." + "If `True`, an `Accelerator` or `PartialState` must be initialized. May lead to issues using sweeps or hyperparameter tuning." + }, + ) + + @classmethod + def from_json_file(cls, json_file): + # Check if exists + open_file = io.open if os.path.exists(json_file) else open + with open_file(json_file, "r", encoding="utf-8") as f: + config_dict = json.load(f) + # Check for keys and load sensible defaults + extra_keys = sorted(key for key in config_dict.keys() if key not in cls.__dataclass_fields__.keys()) + if len(extra_keys) > 0: + raise ValueError( + f"The config file at {json_file} had unknown keys ({extra_keys}), please try upgrading your `transformers`" + " version or fix (and potentially remove these keys) from your config file." + ) + return cls(**config_dict) + + def to_dict(self): + return copy.deepcopy(self.__dict__) + + def pop(self, key, default=None): + return self.__dict__.pop(key, default) + + +class LayerWiseDummyOptimizer(torch.optim.Optimizer): + """ + For Layer-wise optimizers such as GaLoRE optimizer, the optimization + step is already done through the post gradient hooks. Therefore + the trick is to create a dummy optimizer that can take arbitrary + args and kwargs and return a no-op during training. + + Initial idea from @hiyouga in LLaMA-Factory: + https://github.com/hiyouga/LLaMA-Factory/commit/8664262cde3919e10eaecbd66e8c5d356856362e#diff-ebe08ab14496dfb9e06075f0fdd36799ef6d1535cc4dd4715b74c4e3e06fe3ba + """ + + def __init__(self, optimizer_dict=None, *args, **kwargs): + dummy_tensor = torch.randn(1, 1) + self.optimizer_dict = optimizer_dict + super().__init__([dummy_tensor], {"lr": kwargs.get("lr", 1e-03)}) + + def zero_grad(self, set_to_none: bool = True) -> None: + pass + + def step(self, closure=None) -> Optional[float]: + pass + + +class LayerWiseDummyScheduler(LRScheduler): + """ + For Layer-wise optimizers such as GaLoRE optimizer, the optimization and scheduling step + are already done through the post gradient hooks. Therefore + the trick is to create a dummy scheduler that can take arbitrary + args and kwargs and return a no-op during training. + """ + + def __init__(self, *args, **kwargs): + self.default_lr = kwargs["lr"] + optimizer = LayerWiseDummyOptimizer(**kwargs) + last_epoch = -1 + verbose = False + super().__init__(optimizer, last_epoch, verbose) + + def get_lr(self): + # default value + lrs = [self.default_lr] + + # we take each lr in the parameters if they exist, assumes the optimizer to be the `LayerWiseDummyOptimizer` + if self.optimizer is not None: + param_wise_lrs = [ + [group["lr"] for group in optim.param_groups] for optim in self.optimizer.optimizer_dict.values() + ] + lrs = list(chain(*param_wise_lrs)) + + return lrs + + def _get_closed_form_lr(self): + return self.base_lrs + + +def set_rng_state_for_device(device_name, device_module, checkpoint_rng_state, is_distributed): + """Helper to set RNG state for a specific device type (CUDA, NPU, MLU, MUSA)""" + device_state_key = device_name.lower() + err_template = "Didn't manage to set back the RNG states of the {backend} because of the following error:\n {exception}\nThis won't yield the same results as if the training had not been interrupted." + try: + if is_distributed: + device_module.random.set_rng_state_all(checkpoint_rng_state[device_state_key]) + else: + device_module.random.set_rng_state(checkpoint_rng_state[device_state_key]) + except Exception as e: + # Log error if setting RNG state fails + logger.error(err_template.format(backend=device_name, exception=e)) diff --git a/docs/transformers/build/lib/transformers/trainer_seq2seq.py b/docs/transformers/build/lib/transformers/trainer_seq2seq.py new file mode 100644 index 0000000000000000000000000000000000000000..e9fa797f0628e37fc2478e793129c8355cb3ebf4 --- /dev/null +++ b/docs/transformers/build/lib/transformers/trainer_seq2seq.py @@ -0,0 +1,392 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import contextlib +import warnings +from copy import deepcopy +from pathlib import Path +from typing import TYPE_CHECKING, Any, Callable, Optional, Union + +import torch +from torch import nn +from torch.distributed.fsdp import FullyShardedDataParallel +from torch.utils.data import Dataset + +from .generation.configuration_utils import GenerationConfig +from .integrations.deepspeed import is_deepspeed_zero3_enabled +from .integrations.fsdp import is_fsdp_managed_module +from .trainer import Trainer +from .utils import is_datasets_available, logging +from .utils.deprecation import deprecate_kwarg + + +if is_datasets_available(): + import datasets + +if TYPE_CHECKING: + from torch.utils.data import IterableDataset + + from .data.data_collator import DataCollator + from .feature_extraction_utils import FeatureExtractionMixin + from .image_processing_utils import BaseImageProcessor + from .modeling_utils import PreTrainedModel + from .processing_utils import ProcessorMixin + from .tokenization_utils_base import PreTrainedTokenizerBase + from .trainer_callback import TrainerCallback + from .trainer_utils import EvalPrediction, PredictionOutput + from .training_args import TrainingArguments + + +logger = logging.get_logger(__name__) + + +class Seq2SeqTrainer(Trainer): + @deprecate_kwarg("tokenizer", new_name="processing_class", version="5.0.0", raise_if_both_names=True) + def __init__( + self, + model: Union["PreTrainedModel", nn.Module] = None, + args: "TrainingArguments" = None, + data_collator: Optional["DataCollator"] = None, + train_dataset: Optional[Union[Dataset, "IterableDataset", "datasets.Dataset"]] = None, + eval_dataset: Optional[Union[Dataset, dict[str, Dataset]]] = None, + processing_class: Optional[ + Union["PreTrainedTokenizerBase", "BaseImageProcessor", "FeatureExtractionMixin", "ProcessorMixin"] + ] = None, + model_init: Optional[Callable[[], "PreTrainedModel"]] = None, + compute_loss_func: Optional[Callable] = None, + compute_metrics: Optional[Callable[["EvalPrediction"], dict]] = None, + callbacks: Optional[list["TrainerCallback"]] = None, + optimizers: tuple[torch.optim.Optimizer, torch.optim.lr_scheduler.LambdaLR] = (None, None), + preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None, + ): + super().__init__( + model=model, + args=args, + data_collator=data_collator, + train_dataset=train_dataset, + eval_dataset=eval_dataset, + processing_class=processing_class, + model_init=model_init, + compute_loss_func=compute_loss_func, + compute_metrics=compute_metrics, + callbacks=callbacks, + optimizers=optimizers, + preprocess_logits_for_metrics=preprocess_logits_for_metrics, + ) + + # Override self.model.generation_config if a GenerationConfig is specified in args. + # Priority: args.generation_config > model.generation_config > default GenerationConfig. + if self.args.generation_config is not None: + gen_config = self.load_generation_config(self.args.generation_config) + self.model.generation_config = gen_config + + @staticmethod + def load_generation_config(gen_config_arg: Union[str, GenerationConfig]) -> GenerationConfig: + """ + Loads a `~generation.GenerationConfig` from the `Seq2SeqTrainingArguments.generation_config` arguments. + + Args: + gen_config_arg (`str` or [`~generation.GenerationConfig]`): + `Seq2SeqTrainingArguments.generation_config` argument. + + Returns: + A `~generation.GenerationConfig`. + """ + + # GenerationConfig provided, nothing to do + if isinstance(gen_config_arg, GenerationConfig): + gen_config = deepcopy(gen_config_arg) + else: + # str or Path + pretrained_model_name = Path(gen_config_arg) if isinstance(gen_config_arg, str) else gen_config_arg + config_file_name = None + + # Figuring if it is path pointing to a file, pointing to a directory or else a model id or URL + # This step is required in order to determine config_file_name + if pretrained_model_name.is_file(): + config_file_name = pretrained_model_name.name + pretrained_model_name = pretrained_model_name.parent + # dir path + elif pretrained_model_name.is_dir(): + pass + # model id or URL + else: + pretrained_model_name = gen_config_arg + + gen_config = GenerationConfig.from_pretrained(pretrained_model_name, config_file_name) + + # Strict validation to fail early. `GenerationConfig.save_pretrained()`, run at the end of training, throws + # an exception if there are warnings at validation time. + try: + with warnings.catch_warnings(record=True) as caught_warnings: + gen_config.validate() + if len(caught_warnings) > 0: + raise ValueError(str([w.message for w in caught_warnings])) + except ValueError as exc: + raise ValueError( + "The loaded generation config instance is invalid -- `GenerationConfig.validate()` throws warnings " + "and/or exceptions. Fix these issues to train your model.\n\nThrown during validation:\n" + str(exc) + ) + return gen_config + + def evaluate( + self, + eval_dataset: Optional[Dataset] = None, + ignore_keys: Optional[list[str]] = None, + metric_key_prefix: str = "eval", + **gen_kwargs, + ) -> dict[str, float]: + """ + Run evaluation and returns metrics. + + The calling script will be responsible for providing a method to compute metrics, as they are task-dependent + (pass it to the init `compute_metrics` argument). + + You can also subclass and override this method to inject custom behavior. + + Args: + eval_dataset (`Dataset`, *optional*): + Pass a dataset if you wish to override `self.eval_dataset`. If it is an [`~datasets.Dataset`], columns + not accepted by the `model.forward()` method are automatically removed. It must implement the `__len__` + method. + ignore_keys (`List[str]`, *optional*): + A list of keys in the output of your model (if it is a dictionary) that should be ignored when + gathering predictions. + metric_key_prefix (`str`, *optional*, defaults to `"eval"`): + An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named + "eval_bleu" if the prefix is `"eval"` (default) + max_length (`int`, *optional*): + The maximum target length to use when predicting with the generate method. + num_beams (`int`, *optional*): + Number of beams for beam search that will be used when predicting with the generate method. 1 means no + beam search. + gen_kwargs: + Additional `generate` specific kwargs. + + Returns: + A dictionary containing the evaluation loss and the potential metrics computed from the predictions. The + dictionary also contains the epoch number which comes from the training state. + """ + + gen_kwargs = gen_kwargs.copy() + + # Use legacy argument setting if a) the option is not explicitly passed; and b) the argument is set in the + # training args + if ( + gen_kwargs.get("max_length") is None + and gen_kwargs.get("max_new_tokens") is None + and self.args.generation_max_length is not None + ): + gen_kwargs["max_length"] = self.args.generation_max_length + if gen_kwargs.get("num_beams") is None and self.args.generation_num_beams is not None: + gen_kwargs["num_beams"] = self.args.generation_num_beams + # We don't want to drop samples in general + self.gather_function = self.accelerator.gather + self._gen_kwargs = gen_kwargs + return super().evaluate(eval_dataset, ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix) + + def predict( + self, + test_dataset: Dataset, + ignore_keys: Optional[list[str]] = None, + metric_key_prefix: str = "test", + **gen_kwargs, + ) -> "PredictionOutput": + """ + Run prediction and returns predictions and potential metrics. + + Depending on the dataset and your use case, your test dataset may contain labels. In that case, this method + will also return metrics, like in `evaluate()`. + + Args: + test_dataset (`Dataset`): + Dataset to run the predictions on. If it is a [`~datasets.Dataset`], columns not accepted by the + `model.forward()` method are automatically removed. Has to implement the method `__len__` + ignore_keys (`List[str]`, *optional*): + A list of keys in the output of your model (if it is a dictionary) that should be ignored when + gathering predictions. + metric_key_prefix (`str`, *optional*, defaults to `"eval"`): + An optional prefix to be used as the metrics key prefix. For example the metrics "bleu" will be named + "eval_bleu" if the prefix is `"eval"` (default) + max_length (`int`, *optional*): + The maximum target length to use when predicting with the generate method. + num_beams (`int`, *optional*): + Number of beams for beam search that will be used when predicting with the generate method. 1 means no + beam search. + gen_kwargs: + Additional `generate` specific kwargs. + + + + If your predictions or labels have different sequence lengths (for instance because you're doing dynamic + padding in a token classification task) the predictions will be padded (on the right) to allow for + concatenation into one array. The padding index is -100. + + + + Returns: *NamedTuple* A namedtuple with the following keys: + + - predictions (`np.ndarray`): The predictions on `test_dataset`. + - label_ids (`np.ndarray`, *optional*): The labels (if the dataset contained some). + - metrics (`Dict[str, float]`, *optional*): The potential dictionary of metrics (if the dataset contained + labels). + """ + + gen_kwargs = gen_kwargs.copy() + + # Use legacy argument setting if a) the option is not explicitly passed; and b) the argument is set in the + # training args + if ( + gen_kwargs.get("max_length") is None + and gen_kwargs.get("max_new_tokens") is None + and self.args.generation_max_length is not None + ): + gen_kwargs["max_length"] = self.args.generation_max_length + if gen_kwargs.get("num_beams") is None and self.args.generation_num_beams is not None: + gen_kwargs["num_beams"] = self.args.generation_num_beams + self.gather_function = self.accelerator.gather + self._gen_kwargs = gen_kwargs + + return super().predict(test_dataset, ignore_keys=ignore_keys, metric_key_prefix=metric_key_prefix) + + def prediction_step( + self, + model: nn.Module, + inputs: dict[str, Union[torch.Tensor, Any]], + prediction_loss_only: bool, + ignore_keys: Optional[list[str]] = None, + **gen_kwargs, + ) -> tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: + """ + Perform an evaluation step on `model` using `inputs`. + + Subclass and override to inject custom behavior. + + Args: + model (`nn.Module`): + The model to evaluate. + inputs (`Dict[str, Union[torch.Tensor, Any]]`): + The inputs and targets of the model. + + The dictionary will be unpacked before being fed to the model. Most models expect the targets under the + argument `labels`. Check your model's documentation for all accepted arguments. + prediction_loss_only (`bool`): + Whether or not to return the loss only. + gen_kwargs: + Additional `generate` specific kwargs. + + Return: + Tuple[Optional[float], Optional[torch.Tensor], Optional[torch.Tensor]]: A tuple with the loss, logits and + labels (each being optional). + """ + + if not self.args.predict_with_generate or prediction_loss_only: + return super().prediction_step( + model, inputs, prediction_loss_only=prediction_loss_only, ignore_keys=ignore_keys + ) + + has_labels = "labels" in inputs + inputs = self._prepare_inputs(inputs) + + # Priority (handled in generate): + # non-`None` gen_kwargs > model.generation_config > default GenerationConfig() + if len(gen_kwargs) == 0 and hasattr(self, "_gen_kwargs"): + gen_kwargs = self._gen_kwargs.copy() + if "num_beams" in gen_kwargs and gen_kwargs["num_beams"] is None: + gen_kwargs.pop("num_beams") + if "max_length" in gen_kwargs and gen_kwargs["max_length"] is None: + gen_kwargs.pop("max_length") + + default_synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self.model) + gen_kwargs["synced_gpus"] = gen_kwargs.get("synced_gpus", default_synced_gpus) + + generation_inputs = inputs.copy() + # If the `decoder_input_ids` was created from `labels`, evict the former, so that the model can freely generate + # (otherwise, it would continue generating from the padded `decoder_input_ids`) + if ( + "labels" in generation_inputs + and "decoder_input_ids" in generation_inputs + and generation_inputs["labels"].shape == generation_inputs["decoder_input_ids"].shape + ): + generation_inputs = { + k: v for k, v in inputs.items() if k not in ("decoder_input_ids", "decoder_attention_mask") + } + + summon_full_params_context = ( + FullyShardedDataParallel.summon_full_params(self.model) + if isinstance(self.model, FullyShardedDataParallel) + else contextlib.nullcontext() + ) + + with summon_full_params_context: + generated_tokens = self.model.generate(**generation_inputs, **gen_kwargs) + + # Temporary hack to ensure the generation config is not initialized for each iteration of the evaluation loop + # TODO: remove this hack when the legacy code that initializes generation_config from a model config is + # removed in https://github.com/huggingface/transformers/blob/98d88b23f54e5a23e741833f1e973fdf600cc2c5/src/transformers/generation/utils.py#L1183 + if self.model.generation_config._from_model_config: + self.model.generation_config._from_model_config = False + + # Retrieves GenerationConfig from model.generation_config + gen_config = self.model.generation_config + # in case the batch is shorter than max length, the output should be padded + if generated_tokens.shape[-1] < gen_config.max_length: + generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_config.max_length) + elif gen_config.max_new_tokens is not None and generated_tokens.shape[-1] < gen_config.max_new_tokens + 1: + generated_tokens = self._pad_tensors_to_max_len(generated_tokens, gen_config.max_new_tokens + 1) + + with torch.no_grad(): + if has_labels: + with self.compute_loss_context_manager(): + outputs = model(**inputs) + if self.label_smoother is not None: + loss = self.label_smoother(outputs, inputs["labels"]).detach().mean() + else: + loss = (outputs["loss"] if isinstance(outputs, dict) else outputs[0]).detach().mean() + else: + loss = None + + if self.args.prediction_loss_only: + return loss, None, None + + if has_labels: + labels = inputs["labels"] + if labels.shape[-1] < gen_config.max_length: + labels = self._pad_tensors_to_max_len(labels, gen_config.max_length) + elif gen_config.max_new_tokens is not None and labels.shape[-1] < gen_config.max_new_tokens + 1: + labels = self._pad_tensors_to_max_len(labels, gen_config.max_new_tokens + 1) + else: + labels = None + + return loss, generated_tokens, labels + + def _pad_tensors_to_max_len(self, tensor, max_length): + if self.processing_class is not None and hasattr(self.processing_class, "pad_token_id"): + # If PAD token is not defined at least EOS token has to be defined + pad_token_id = ( + self.processing_class.pad_token_id + if self.processing_class.pad_token_id is not None + else self.processing_class.eos_token_id + ) + else: + if self.model.config.pad_token_id is not None: + pad_token_id = self.model.config.pad_token_id + else: + raise ValueError("Pad_token_id must be set in the configuration of the model, in order to pad tensors") + + padded_tensor = pad_token_id * torch.ones( + (tensor.shape[0], max_length), dtype=tensor.dtype, device=tensor.device + ) + padded_tensor[:, : tensor.shape[-1]] = tensor + return padded_tensor diff --git a/docs/transformers/build/lib/transformers/trainer_utils.py b/docs/transformers/build/lib/transformers/trainer_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..4d3dd6d6bb17fe0ca850c70bd58ffb534e99263b --- /dev/null +++ b/docs/transformers/build/lib/transformers/trainer_utils.py @@ -0,0 +1,910 @@ +# Copyright 2020-present the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +PyTorch-independent utilities for the Trainer class. +""" + +import copy +import functools +import gc +import inspect +import os +import random +import re +import threading +import time +from typing import Any, NamedTuple, Optional, Union + +import numpy as np + +from .utils import ( + ExplicitEnum, + is_psutil_available, + is_tf_available, + is_torch_available, + is_torch_cuda_available, + is_torch_hpu_available, + is_torch_mlu_available, + is_torch_mps_available, + is_torch_musa_available, + is_torch_npu_available, + is_torch_xla_available, + is_torch_xpu_available, + requires_backends, +) + + +if is_torch_available(): + import torch + + +def seed_worker(_): + """ + Helper function to set worker seed during Dataloader initialization. + """ + worker_seed = torch.initial_seed() % 2**32 + set_seed(worker_seed) + + +def enable_full_determinism(seed: int, warn_only: bool = False): + """ + Helper function for reproducible behavior during distributed training. See + - https://pytorch.org/docs/stable/notes/randomness.html for pytorch + - https://www.tensorflow.org/api_docs/python/tf/config/experimental/enable_op_determinism for tensorflow + """ + # set seed first + set_seed(seed) + + if is_torch_available(): + # Enable PyTorch deterministic mode. This potentially requires either the environment + # variable 'CUDA_LAUNCH_BLOCKING' or 'CUBLAS_WORKSPACE_CONFIG' to be set, + # depending on the CUDA version, so we set them both here + os.environ["CUDA_LAUNCH_BLOCKING"] = "1" + os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8" + # The environment variable required to enable deterministic mode on Ascend NPUs. + os.environ["ASCEND_LAUNCH_BLOCKING"] = "1" + os.environ["HCCL_DETERMINISTIC"] = "1" + + os.environ["FLASH_ATTENTION_DETERMINISTIC"] = "1" + torch.use_deterministic_algorithms(True, warn_only=warn_only) + + # Enable CUDNN deterministic mode + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + + if is_tf_available(): + import tensorflow as tf + + tf.config.experimental.enable_op_determinism() + + +def set_seed(seed: int, deterministic: bool = False): + """ + Helper function for reproducible behavior to set the seed in `random`, `numpy`, `torch` and/or `tf` (if installed). + + Args: + seed (`int`): + The seed to set. + deterministic (`bool`, *optional*, defaults to `False`): + Whether to use deterministic algorithms where available. Can slow down training. + """ + random.seed(seed) + np.random.seed(seed) + if is_torch_available(): + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + # ^^ safe to call this function even if cuda is not available + if deterministic: + torch.use_deterministic_algorithms(True) + if is_torch_mlu_available(): + torch.mlu.manual_seed_all(seed) + if is_torch_musa_available(): + torch.musa.manual_seed_all(seed) + if is_torch_npu_available(): + torch.npu.manual_seed_all(seed) + if is_torch_hpu_available(): + torch.hpu.manual_seed_all(seed) + if is_torch_xpu_available(): + torch.xpu.manual_seed_all(seed) + if is_tf_available(): + import tensorflow as tf + + tf.random.set_seed(seed) + if deterministic: + tf.config.experimental.enable_op_determinism() + + +def neftune_post_forward_hook(module, input, output): + """ + Implements the NEFTune forward pass for the model using forward hooks. Note this works only for torch.nn.Embedding + layers. This method is slightly adapted from the original source code that can be found here: + https://github.com/neelsjain/NEFTune Simply add it to your model as follows: + ```python + model = ... + model.embed_tokens.neftune_noise_alpha = 0.1 + model.embed_tokens.register_forward_hook(neftune_post_forward_hook) + ``` + Args: + module (`torch.nn.Module`): + The embedding module where the hook is attached. Note that you need to set `module.neftune_noise_alpha` to + the desired noise alpha value. + input (`torch.Tensor`): + The input tensor to the model. + output (`torch.Tensor`): + The output tensor of the model (i.e. the embeddings). + """ + if module.training: + dims = torch.tensor(output.size(1) * output.size(2)) + mag_norm = module.neftune_noise_alpha / torch.sqrt(dims) + output = output + torch.zeros_like(output).uniform_(-mag_norm, mag_norm) + return output + + +class EvalPrediction: + """ + Evaluation output (always contains labels), to be used to compute metrics. + + Parameters: + predictions (`np.ndarray`): Predictions of the model. + label_ids (`np.ndarray`): Targets to be matched. + inputs (`np.ndarray`, *optional*): Input data passed to the model. + losses (`np.ndarray`, *optional*): Loss values computed during evaluation. + """ + + def __init__( + self, + predictions: Union[np.ndarray, tuple[np.ndarray]], + label_ids: Union[np.ndarray, tuple[np.ndarray]], + inputs: Optional[Union[np.ndarray, tuple[np.ndarray]]] = None, + losses: Optional[Union[np.ndarray, tuple[np.ndarray]]] = None, + ): + self.predictions = predictions + self.label_ids = label_ids + self.inputs = inputs + self.losses = losses + self.elements = (self.predictions, self.label_ids) + if self.inputs is not None: + self.elements += (self.inputs,) + if self.losses is not None: + self.elements += (self.losses,) + + def __iter__(self): + return iter(self.elements) + + def __getitem__(self, idx): + if idx < 0 or idx >= len(self.elements): + raise IndexError("tuple index out of range") + return self.elements[idx] + + +class EvalLoopOutput(NamedTuple): + predictions: Union[np.ndarray, tuple[np.ndarray]] + label_ids: Optional[Union[np.ndarray, tuple[np.ndarray]]] + metrics: Optional[dict[str, float]] + num_samples: Optional[int] + + +class PredictionOutput(NamedTuple): + predictions: Union[np.ndarray, tuple[np.ndarray]] + label_ids: Optional[Union[np.ndarray, tuple[np.ndarray]]] + metrics: Optional[dict[str, float]] + + +class TrainOutput(NamedTuple): + global_step: int + training_loss: float + metrics: dict[str, float] + + +PREFIX_CHECKPOINT_DIR = "checkpoint" +_re_checkpoint = re.compile(r"^" + PREFIX_CHECKPOINT_DIR + r"\-(\d+)$") + + +def get_last_checkpoint(folder): + content = os.listdir(folder) + checkpoints = [ + path + for path in content + if _re_checkpoint.search(path) is not None and os.path.isdir(os.path.join(folder, path)) + ] + if len(checkpoints) == 0: + return + return os.path.join(folder, max(checkpoints, key=lambda x: int(_re_checkpoint.search(x).groups()[0]))) + + +class IntervalStrategy(ExplicitEnum): + NO = "no" + STEPS = "steps" + EPOCH = "epoch" + + +class SaveStrategy(ExplicitEnum): + NO = "no" + STEPS = "steps" + EPOCH = "epoch" + BEST = "best" + + +class EvaluationStrategy(ExplicitEnum): + NO = "no" + STEPS = "steps" + EPOCH = "epoch" + + +class HubStrategy(ExplicitEnum): + END = "end" + EVERY_SAVE = "every_save" + CHECKPOINT = "checkpoint" + ALL_CHECKPOINTS = "all_checkpoints" + + +class BestRun(NamedTuple): + """ + The best run found by a hyperparameter search (see [`~Trainer.hyperparameter_search`]). + + Parameters: + run_id (`str`): + The id of the best run (if models were saved, the corresponding checkpoint will be in the folder ending + with run-{run_id}). + objective (`float`): + The objective that was obtained for this run. + hyperparameters (`Dict[str, Any]`): + The hyperparameters picked to get this run. + run_summary (`Optional[Any]`): + A summary of tuning experiments. `ray.tune.ExperimentAnalysis` object for Ray backend. + """ + + run_id: str + objective: Union[float, list[float]] + hyperparameters: dict[str, Any] + run_summary: Optional[Any] = None + + +def default_compute_objective(metrics: dict[str, float]) -> float: + """ + The default objective to maximize/minimize when doing an hyperparameter search. It is the evaluation loss if no + metrics are provided to the [`Trainer`], the sum of all metrics otherwise. + + Args: + metrics (`Dict[str, float]`): The metrics returned by the evaluate method. + + Return: + `float`: The objective to minimize or maximize + """ + metrics = copy.deepcopy(metrics) + loss = metrics.pop("eval_loss", None) + _ = metrics.pop("epoch", None) + # Remove speed metrics + speed_metrics = [ + m + for m in metrics.keys() + if m.endswith("_runtime") or m.endswith("_per_second") or m.endswith("_compilation_time") + ] + for sm in speed_metrics: + _ = metrics.pop(sm, None) + return loss if len(metrics) == 0 else sum(metrics.values()) + + +def default_hp_space_optuna(trial) -> dict[str, float]: + from .integrations import is_optuna_available + + assert is_optuna_available(), "This function needs Optuna installed: `pip install optuna`" + return { + "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True), + "num_train_epochs": trial.suggest_int("num_train_epochs", 1, 5), + "seed": trial.suggest_int("seed", 1, 40), + "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [4, 8, 16, 32, 64]), + } + + +def default_hp_space_ray(trial) -> dict[str, float]: + from .integrations import is_ray_tune_available + + assert is_ray_tune_available(), "This function needs ray installed: `pip install ray[tune]`" + from ray import tune + + return { + "learning_rate": tune.loguniform(1e-6, 1e-4), + "num_train_epochs": tune.choice(list(range(1, 6))), + "seed": tune.uniform(1, 40), + "per_device_train_batch_size": tune.choice([4, 8, 16, 32, 64]), + } + + +def default_hp_space_sigopt(trial): + return [ + {"bounds": {"min": 1e-6, "max": 1e-4}, "name": "learning_rate", "type": "double", "transformation": "log"}, + {"bounds": {"min": 1, "max": 6}, "name": "num_train_epochs", "type": "int"}, + {"bounds": {"min": 1, "max": 40}, "name": "seed", "type": "int"}, + { + "categorical_values": ["4", "8", "16", "32", "64"], + "name": "per_device_train_batch_size", + "type": "categorical", + }, + ] + + +def default_hp_space_wandb(trial) -> dict[str, float]: + from .integrations import is_wandb_available + + if not is_wandb_available(): + raise ImportError("This function needs wandb installed: `pip install wandb`") + + return { + "method": "random", + "metric": {"name": "objective", "goal": "minimize"}, + "parameters": { + "learning_rate": {"distribution": "uniform", "min": 1e-6, "max": 1e-4}, + "num_train_epochs": {"distribution": "int_uniform", "min": 1, "max": 6}, + "seed": {"distribution": "int_uniform", "min": 1, "max": 40}, + "per_device_train_batch_size": {"values": [4, 8, 16, 32, 64]}, + }, + } + + +class HPSearchBackend(ExplicitEnum): + OPTUNA = "optuna" + RAY = "ray" + SIGOPT = "sigopt" + WANDB = "wandb" + + +def is_main_process(local_rank): + """ + Whether or not the current process is the local process, based on `xm.get_ordinal()` (for TPUs) first, then on + `local_rank`. + """ + if is_torch_xla_available(): + import torch_xla.core.xla_model as xm + + return xm.get_ordinal() == 0 + return local_rank in [-1, 0] + + +def total_processes_number(local_rank): + """ + Return the number of processes launched in parallel. Works with `torch.distributed` and TPUs. + """ + if is_torch_xla_available(): + import torch_xla.core.xla_model as xm + + return xm.xrt_world_size() + elif local_rank != -1 and is_torch_available(): + import torch + + return torch.distributed.get_world_size() + return 1 + + +def speed_metrics(split, start_time, num_samples=None, num_steps=None, num_tokens=None): + """ + Measure and return speed performance metrics. + + This function requires a time snapshot `start_time` before the operation to be measured starts and this function + should be run immediately after the operation to be measured has completed. + + Args: + - split: name to prefix metric (like train, eval, test...) + - start_time: operation start time + - num_samples: number of samples processed + - num_steps: number of steps processed + - num_tokens: number of tokens processed + """ + runtime = time.time() - start_time + result = {f"{split}_runtime": round(runtime, 4)} + if runtime == 0: + return result + if num_samples is not None: + samples_per_second = num_samples / runtime + result[f"{split}_samples_per_second"] = round(samples_per_second, 3) + if num_steps is not None: + steps_per_second = num_steps / runtime + result[f"{split}_steps_per_second"] = round(steps_per_second, 3) + if num_tokens is not None: + tokens_per_second = num_tokens / runtime + result[f"{split}_tokens_per_second"] = round(tokens_per_second, 3) + return result + + +class SchedulerType(ExplicitEnum): + """ + Scheduler names for the parameter `lr_scheduler_type` in [`TrainingArguments`]. + By default, it uses "linear". Internally, this retrieves `get_linear_schedule_with_warmup` scheduler from [`Trainer`]. + Scheduler types: + - "linear" = get_linear_schedule_with_warmup + - "cosine" = get_cosine_schedule_with_warmup + - "cosine_with_restarts" = get_cosine_with_hard_restarts_schedule_with_warmup + - "polynomial" = get_polynomial_decay_schedule_with_warmup + - "constant" = get_constant_schedule + - "constant_with_warmup" = get_constant_schedule_with_warmup + - "inverse_sqrt" = get_inverse_sqrt_schedule + - "reduce_lr_on_plateau" = get_reduce_on_plateau_schedule + - "cosine_with_min_lr" = get_cosine_with_min_lr_schedule_with_warmup + - "warmup_stable_decay" = get_wsd_schedule + """ + + LINEAR = "linear" + COSINE = "cosine" + COSINE_WITH_RESTARTS = "cosine_with_restarts" + POLYNOMIAL = "polynomial" + CONSTANT = "constant" + CONSTANT_WITH_WARMUP = "constant_with_warmup" + INVERSE_SQRT = "inverse_sqrt" + REDUCE_ON_PLATEAU = "reduce_lr_on_plateau" + COSINE_WITH_MIN_LR = "cosine_with_min_lr" + WARMUP_STABLE_DECAY = "warmup_stable_decay" + + +class TrainerMemoryTracker: + """ + A helper class that tracks cpu and gpu memory. + + This class will silently skip unless `psutil` is available. Install with `pip install psutil`. + + When a stage completes, it can pass metrics dict to update with the memory metrics gathered during this stage. + + Example : + + ```python + self._memory_tracker = TrainerMemoryTracker(self.args.skip_memory_metrics) + self._memory_tracker.start() + # code ... + metrics = {"train_runtime": 10.5} + self._memory_tracker.stop_and_update_metrics(metrics) + ``` + + At the moment GPU tracking is only for `pytorch`, but can be extended to support `tensorflow`. + + To understand this class' intricacies please read the documentation of [`~Trainer.log_metrics`]. + """ + + # map trainer methods to metrics prefix + stages = { + "__init__": "init", + "train": "train", + "_inner_training_loop": "train", + "evaluate": "eval", + "predict": "test", + } + + def __init__(self, skip_memory_metrics=False): + self.skip_memory_metrics = skip_memory_metrics + + if not is_psutil_available(): + # soft dependency on psutil + self.skip_memory_metrics = True + + if self.skip_memory_metrics: + return + + import psutil # noqa + + if is_torch_cuda_available() or is_torch_mlu_available() or is_torch_musa_available(): + import torch + + self.torch = torch + self.gpu = {} + elif is_torch_mps_available(): + import torch + + self.torch = torch + self.gpu = {} + elif is_torch_xpu_available(): + import torch + + self.torch = torch + self.gpu = {} + elif is_torch_npu_available(): + import torch + + self.torch = torch + self.gpu = {} + elif is_torch_hpu_available(): + import torch + + self.torch = torch + self.gpu = {} + else: + self.torch = None + + self.process = psutil.Process() + + self.cur_stage = None + self.cpu = {} + self.init_reported = False + + def derive_stage(self): + """derives the stage/caller name automatically""" + caller = inspect.currentframe().f_back.f_back.f_code.co_name + if caller in self.stages: + return self.stages[caller] + else: + raise ValueError( + f"was called from {caller}, but only expect to be called from one of {self.stages.keys()}" + ) + + def cpu_mem_used(self): + """get resident set size memory for the current process""" + return self.process.memory_info().rss + + def peak_monitor_func(self): + self.cpu_mem_used_peak = -1 + + while True: + self.cpu_mem_used_peak = max(self.cpu_mem_used(), self.cpu_mem_used_peak) + + # can't sleep or will not catch the peak right (this comment is here on purpose) + # time.sleep(0.001) # 1msec + + if not self.peak_monitoring: + break + + def start(self): + """start tracking for the caller's stage""" + if self.skip_memory_metrics: + return + + stage = self.derive_stage() + # deal with nested calls of eval during train - simply ignore those + if self.cur_stage is not None and self.cur_stage != stage: + return + + self.cur_stage = stage + + gc.collect() + + if self.torch is not None: + if torch.cuda.is_available(): + self.torch.cuda.reset_peak_memory_stats() + self.torch.cuda.empty_cache() + elif is_torch_mlu_available(): + self.torch.mlu.reset_peak_memory_stats() + self.torch.mlu.empty_cache() + elif is_torch_musa_available(): + self.torch.musa.reset_peak_memory_stats() + self.torch.musa.empty_cache() + elif is_torch_xpu_available(): + self.torch.xpu.reset_peak_memory_stats() + self.torch.xpu.empty_cache() + elif is_torch_npu_available(): + self.torch.npu.reset_peak_memory_stats() + self.torch.npu.empty_cache() + elif is_torch_hpu_available(): + self.torch.hpu.reset_peak_memory_stats() + # not available on hpu as it reserves all device memory for the current process + # self.torch.hpu.empty_cache() + elif is_torch_mps_available(): + self.torch.mps.empty_cache() + + # gpu + if self.torch is not None: + if torch.cuda.is_available(): + self.gpu_mem_used_at_start = self.torch.cuda.memory_allocated() + elif is_torch_mlu_available(): + self.gpu_mem_used_at_start = self.torch.mlu.memory_allocated() + elif is_torch_musa_available(): + self.gpu_mem_used_at_start = self.torch.musa.memory_allocated() + elif is_torch_xpu_available(): + self.gpu_mem_used_at_start = self.torch.xpu.memory_allocated() + elif is_torch_npu_available(): + self.gpu_mem_used_at_start = self.torch.npu.memory_allocated() + elif is_torch_hpu_available(): + self.gpu_mem_used_at_start = self.torch.hpu.memory_allocated() + elif is_torch_mps_available(): + self.gpu_mem_used_at_start = self.torch.mps.current_allocated_memory() + + # cpu + self.cpu_mem_used_at_start = self.cpu_mem_used() + + self.peak_monitoring = True + peak_monitor_thread = threading.Thread(target=self.peak_monitor_func) + peak_monitor_thread.daemon = True + peak_monitor_thread.start() + + def stop(self, stage): + """stop tracking for the passed stage""" + + # deal with nested calls of eval during train - simply ignore those + if self.cur_stage is not None and self.cur_stage != stage: + return + + # this sends a signal to peak_monitor_func to complete its loop + self.peak_monitoring = False + + # first ensure all objects get collected and their memory is freed + gc.collect() + + if self.torch is not None: + if torch.cuda.is_available(): + self.torch.cuda.empty_cache() + elif is_torch_mlu_available(): + self.torch.mlu.empty_cache() + elif is_torch_musa_available(): + self.torch.musa.empty_cache() + elif is_torch_xpu_available(): + self.torch.xpu.empty_cache() + elif is_torch_npu_available(): + self.torch.npu.empty_cache() + elif is_torch_hpu_available(): + # not available on hpu as it reserves all device memory for the current process + # self.torch.npu.empty_cache() + pass + elif is_torch_mps_available(): + self.torch.mps.empty_cache() + + # concepts: + # - alloc_delta: the difference of allocated memory between the end and the start + # - peaked_delta: the difference between the peak memory and the current memory + # in order to know how much memory the measured code consumed one needs to sum these two + + # gpu + if self.torch is not None: + if torch.cuda.is_available(): + self.gpu_mem_used_now = self.torch.cuda.memory_allocated() + self.gpu_mem_used_peak = self.torch.cuda.max_memory_allocated() + elif is_torch_mlu_available(): + self.gpu_mem_used_now = self.torch.mlu.memory_allocated() + self.gpu_mem_used_peak = self.torch.mlu.max_memory_allocated() + elif is_torch_musa_available(): + self.gpu_mem_used_now = self.torch.musa.memory_allocated() + self.gpu_mem_used_peak = self.torch.musa.max_memory_allocated() + elif is_torch_xpu_available(): + self.gpu_mem_used_now = self.torch.xpu.memory_allocated() + self.gpu_mem_used_peak = self.torch.xpu.max_memory_allocated() + elif is_torch_npu_available(): + self.gpu_mem_used_now = self.torch.npu.memory_allocated() + self.gpu_mem_used_peak = self.torch.npu.max_memory_allocated() + elif is_torch_hpu_available(): + self.gpu_mem_used_now = self.torch.hpu.memory_allocated() + self.gpu_mem_used_peak = self.torch.hpu.max_memory_allocated() + elif is_torch_mps_available(): + self.gpu_mem_used_now = self.torch.mps.current_allocated_memory() + # self.torch.mps.max_memory_allocated() does not exist yet + self.gpu_mem_used_peak = None + + else: + raise ValueError("No available GPU device found!") + + self.gpu[self.cur_stage] = { + "begin": self.gpu_mem_used_at_start, + "end": self.gpu_mem_used_now, + "alloc": (self.gpu_mem_used_now - self.gpu_mem_used_at_start), + } + if self.gpu_mem_used_peak is not None: + self.gpu[self.cur_stage]["peaked"] = max(0, self.gpu_mem_used_peak - self.gpu_mem_used_now) + else: + self.gpu[self.cur_stage]["peaked"] = "Not available" + + # cpu + self.cpu_mem_used_now = self.cpu_mem_used() + self.cpu[self.cur_stage] = { + "begin": self.cpu_mem_used_at_start, + "end": self.cpu_mem_used_now, + "alloc": (self.cpu_mem_used_now - self.cpu_mem_used_at_start), + "peaked": max(0, self.cpu_mem_used_peak - self.cpu_mem_used_now), + } + + # reset - cycle finished + self.cur_stage = None + + def update_metrics(self, stage, metrics): + """updates the metrics""" + if self.skip_memory_metrics: + return + + # deal with nested calls of eval during train - simply ignore those + if self.cur_stage is not None and self.cur_stage != stage: + return + + # since we don't have a way to return init metrics, we push them into the first of train/val/predict + stages = [stage] + if not self.init_reported: + stages.insert(0, "init") + self.init_reported = True + + for stage in stages: + for t in ["alloc", "peaked"]: + if stage in self.cpu and t in self.cpu[stage]: + metrics[f"{stage}_mem_cpu_{t}_delta"] = self.cpu[stage][t] + if self.torch is not None and stage in self.gpu and t in self.gpu[stage]: + metrics[f"{stage}_mem_gpu_{t}_delta"] = self.gpu[stage][t] + # if we need additional debug info, enable the following + # for t in ["begin", "end"]: + # if stage in self.cpu and t in self.cpu[stage]: + # metrics[f"{stage}_mem_cpu_{t}"] = self.cpu[stage][t] + # if self.torch is not None and stage in self.gpu and t in self.gpu[stage]: + # metrics[f"{stage}_mem_gpu_{t}"] = self.gpu[stage][t] + + # since memory can be allocated before init, and it might be difficult to track overall + # memory usage, in particular for GPU, let's report memory usage at the point init was called + if stages[0] == "init": + metrics["before_init_mem_cpu"] = self.cpu["init"]["begin"] + if self.torch is not None: + metrics["before_init_mem_gpu"] = self.gpu["init"]["begin"] + # if we also wanted to report any additional memory allocations in between init and + # whatever the next stage was we could also report this: + # if self.cpu["init"]["end"] != self.cpu[stage]["begin"]: + # metrics[f"after_init_mem_cpu_delta"] = self.cpu[stage]["begin"] - self.cpu["init"]["end"] + # if self.torch is not None and self.gpu["init"]["end"] != self.gpu[stage]["begin"]: + # metrics[f"after_init_mem_gpu_delta"] = self.gpu[stage]["begin"] - self.gpu["init"]["end"] + + def stop_and_update_metrics(self, metrics=None): + """combine stop and metrics update in one call for simpler code""" + if self.skip_memory_metrics: + return + + stage = self.derive_stage() + self.stop(stage) + + # init doesn't have metrics to update so we just save that data for later stages to retrieve + if metrics is not None: + self.update_metrics(stage, metrics) + + +def has_length(dataset): + """ + Checks if the dataset implements __len__() and it doesn't raise an error + """ + try: + return len(dataset) is not None + except TypeError: + # TypeError: len() of unsized object + return False + except AttributeError: + # Ray DataSets raises an AttributeError: https://github.com/ray-project/ray/blob/master/python/ray/data/dataset.py#L5616 + return False + + +def denumpify_detensorize(metrics): + """ + Recursively calls `.item()` on the element of the dictionary passed + """ + if isinstance(metrics, (list, tuple)): + return type(metrics)(denumpify_detensorize(m) for m in metrics) + elif isinstance(metrics, dict): + return type(metrics)({k: denumpify_detensorize(v) for k, v in metrics.items()}) + elif isinstance(metrics, np.generic): + return metrics.item() + elif is_torch_available() and isinstance(metrics, torch.Tensor) and metrics.numel() == 1: + return metrics.item() + return metrics + + +def number_of_arguments(func): + """ + Return the number of arguments of the passed function, even if it's a partial function. + """ + if isinstance(func, functools.partial): + total_args = len(inspect.signature(func.func).parameters) + return total_args - len(func.args) - len(func.keywords) + return len(inspect.signature(func).parameters) + + +def find_executable_batch_size( + function: callable = None, starting_batch_size: int = 128, auto_find_batch_size: bool = False +): + """ + Args: + A basic decorator that will try to execute `function`. If it fails from exceptions related to out-of-memory or + CUDNN, the batch size is cut in half and passed to `function`. `function` must take in a `batch_size` parameter as + its first argument. + function (`callable`, *optional*) + A function to wrap + starting_batch_size (`int`, *optional*) + The batch size to try and fit into memory + auto_find_batch_size (`bool`, *optional*) + If False, will just execute `function` + """ + if function is None: + return functools.partial( + find_executable_batch_size, + starting_batch_size=starting_batch_size, + auto_find_batch_size=auto_find_batch_size, + ) + + if auto_find_batch_size: + requires_backends(find_executable_batch_size, "accelerate") + from accelerate.utils import find_executable_batch_size as accelerate_find_executable_batch_size + + return accelerate_find_executable_batch_size(function=function, starting_batch_size=starting_batch_size) + + return functools.partial(function, batch_size=starting_batch_size) + + +class FSDPOption(ExplicitEnum): + FULL_SHARD = "full_shard" + SHARD_GRAD_OP = "shard_grad_op" + NO_SHARD = "no_shard" + HYBRID_SHARD = "hybrid_shard" + HYBRID_SHARD_ZERO2 = "hybrid_shard_zero2" + OFFLOAD = "offload" + AUTO_WRAP = "auto_wrap" + + +class RemoveColumnsCollator: + """Wrap the data collator to remove unused columns before they are passed to the collator.""" + + def __init__( + self, + data_collator, + signature_columns, + logger=None, + model_name: Optional[str] = None, + description: Optional[str] = None, + ): + self.data_collator = data_collator + self.signature_columns = signature_columns + self.logger = logger + self.description = description + self.model_name = model_name + self.message_logged = False + + def _remove_columns(self, feature: dict) -> dict: + if not isinstance(feature, dict): + return feature + if not self.message_logged and self.logger and self.model_name: + ignored_columns = list(set(feature.keys()) - set(self.signature_columns)) + if len(ignored_columns) > 0: + dset_description = "" if self.description is None else f"in the {self.description} set" + self.logger.info( + f"The following columns {dset_description} don't have a corresponding argument in " + f"`{self.model_name}.forward` and have been ignored: {', '.join(ignored_columns)}." + f" If {', '.join(ignored_columns)} are not expected by `{self.model_name}.forward`, " + " you can safely ignore this message." + ) + self.message_logged = True + return {k: v for k, v in feature.items() if k in self.signature_columns} + + def __call__(self, features: list[dict]): + features = [self._remove_columns(feature) for feature in features] + return self.data_collator(features) + + +def check_target_module_exists(optim_target_modules, key: str, return_is_regex: bool = False): + """A helper method to check if the passed module's key name matches any of the target modules in the optim_target_modules. + + Args: + optim_target_modules (`Union[str, List[str]]`): + A list of strings to try to match. Can be also a full string. + key (`str`): + A key to search any matches in optim_target_modules + return_is_regex (`bool`): + If set to `True`, the method will return whether the passed `optim_target_modules` + is a regex or not. + + Returns: + `bool` : True of match object if key matches any target modules from config, False or + None if no match found + `bool` : If the matched target module is a regex to silence out the warnings in Trainer + for extra modules being found (only if `target_module_found=True` for an array of regex). + """ + target_module_found = False + is_regex = False + + if isinstance(optim_target_modules, str): + target_module_found = bool(re.fullmatch(optim_target_modules, key)) + is_regex = True if not optim_target_modules == key else False + elif key in optim_target_modules: # from here, target_module_found must be a list of str + # this module is specified directly in target_modules + target_module_found = True + elif any(target_key in key for target_key in optim_target_modules): + target_module_found = True + elif any(bool(re.fullmatch(optim_target_module, key)) for optim_target_module in optim_target_modules): + target_module_found = True + is_regex = True + + if return_is_regex: + return target_module_found, is_regex + + return target_module_found diff --git a/docs/transformers/build/lib/transformers/training_args.py b/docs/transformers/build/lib/transformers/training_args.py new file mode 100644 index 0000000000000000000000000000000000000000..65cb2705cba3ba09541001c94661a17f3791025e --- /dev/null +++ b/docs/transformers/build/lib/transformers/training_args.py @@ -0,0 +1,3071 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import contextlib +import json +import math +import os +import warnings +from dataclasses import asdict, dataclass, field, fields +from datetime import timedelta +from enum import Enum +from pathlib import Path +from typing import Any, Optional, Union + +from huggingface_hub import get_full_repo_name + +from .debug_utils import DebugOption +from .trainer_utils import ( + EvaluationStrategy, + FSDPOption, + HubStrategy, + IntervalStrategy, + SaveStrategy, + SchedulerType, +) +from .utils import ( + ACCELERATE_MIN_VERSION, + ExplicitEnum, + cached_property, + is_accelerate_available, + is_ipex_available, + is_safetensors_available, + is_sagemaker_dp_enabled, + is_sagemaker_mp_enabled, + is_torch_available, + is_torch_bf16_gpu_available, + is_torch_cuda_available, + is_torch_hpu_available, + is_torch_mlu_available, + is_torch_mps_available, + is_torch_musa_available, + is_torch_neuroncore_available, + is_torch_npu_available, + is_torch_tf32_available, + is_torch_xla_available, + is_torch_xpu_available, + logging, + requires_backends, +) +from .utils.generic import strtobool +from .utils.import_utils import is_optimum_neuron_available + + +logger = logging.get_logger(__name__) +log_levels = logging.get_log_levels_dict().copy() +trainer_log_levels = dict(**log_levels, passive=-1) + +if is_torch_available(): + import torch + import torch.distributed as dist + +if is_accelerate_available(): + from accelerate.state import AcceleratorState, PartialState + from accelerate.utils import DistributedType + + from .trainer_pt_utils import AcceleratorConfig + +if is_torch_xla_available(): + import torch_xla.core.xla_model as xm + +if is_torch_neuroncore_available(check_device=False): + # torchrun support + # https://github.com/pytorch/xla/pull/3609 + if os.environ.get("TORCHELASTIC_RUN_ID"): + if is_optimum_neuron_available(): + logger.info( + "Make sure that you are performing the training with the NeuronTrainer from optimum[neuron], this " + "will fail otherwise." + ) + else: + logger.warning( + "Please use the NeuronTrainer from optimum[neuron] instead of the Transformers library to perform " + "training on AWS Trainium instances. More information here: " + "https://github.com/huggingface/optimum-neuron" + ) + import torch_xla.distributed.xla_backend as xbn + + if not isinstance(dist.group.WORLD, xbn.ProcessGroupXla): + dist.init_process_group(backend="xla") + if not isinstance(dist.group.WORLD, xbn.ProcessGroupXla): + raise AssertionError("Failed to initialize torch.distributed process group using XLA backend.") + + +if is_sagemaker_mp_enabled(): + import smdistributed.modelparallel.torch as smp + + smp.init() + + +def default_logdir() -> str: + """ + Same default as PyTorch + """ + import socket + from datetime import datetime + + current_time = datetime.now().strftime("%b%d_%H-%M-%S") + return os.path.join("runs", current_time + "_" + socket.gethostname()) + + +def get_int_from_env(env_keys, default): + """Returns the first positive env value found in the `env_keys` list or the default.""" + for e in env_keys: + val = int(os.environ.get(e, -1)) + if val >= 0: + return val + return default + + +def get_xla_device_type(device: "torch.device") -> Optional[str]: + """ + Returns the xla device type (CPU|GPU|TPU) or None if the device is a non-xla device. + """ + if is_torch_xla_available(): + if device.type == "cpu": + return "CPU" + return xm.xla_real_devices([device])[0].split(":")[0] + return None + + +class OptimizerNames(ExplicitEnum): + """ + Stores the acceptable string identifiers for optimizers. + """ + + ADAMW_TORCH = "adamw_torch" + ADAMW_TORCH_FUSED = "adamw_torch_fused" + ADAMW_TORCH_XLA = "adamw_torch_xla" + ADAMW_TORCH_NPU_FUSED = "adamw_torch_npu_fused" + ADAMW_APEX_FUSED = "adamw_apex_fused" + ADAFACTOR = "adafactor" + ADAMW_ANYPRECISION = "adamw_anyprecision" + ADAMW_TORCH_4BIT = "adamw_torch_4bit" + ADAMW_TORCH_8BIT = "adamw_torch_8bit" + ADEMAMIX = "ademamix" + SGD = "sgd" + ADAGRAD = "adagrad" + ADAMW_BNB = "adamw_bnb_8bit" + ADAMW_8BIT = "adamw_8bit" # just an alias for adamw_bnb_8bit + ADEMAMIX_8BIT = "ademamix_8bit" + LION_8BIT = "lion_8bit" + LION = "lion_32bit" + PAGED_ADAMW = "paged_adamw_32bit" + PAGED_ADAMW_8BIT = "paged_adamw_8bit" + PAGED_ADEMAMIX = "paged_ademamix_32bit" + PAGED_ADEMAMIX_8BIT = "paged_ademamix_8bit" + PAGED_LION = "paged_lion_32bit" + PAGED_LION_8BIT = "paged_lion_8bit" + RMSPROP = "rmsprop" + RMSPROP_BNB = "rmsprop_bnb" + RMSPROP_8BIT = "rmsprop_bnb_8bit" + RMSPROP_32BIT = "rmsprop_bnb_32bit" + GALORE_ADAMW = "galore_adamw" + GALORE_ADAMW_8BIT = "galore_adamw_8bit" + GALORE_ADAFACTOR = "galore_adafactor" + GALORE_ADAMW_LAYERWISE = "galore_adamw_layerwise" + GALORE_ADAMW_8BIT_LAYERWISE = "galore_adamw_8bit_layerwise" + GALORE_ADAFACTOR_LAYERWISE = "galore_adafactor_layerwise" + LOMO = "lomo" + ADALOMO = "adalomo" + GROKADAMW = "grokadamw" + SCHEDULE_FREE_RADAM = "schedule_free_radam" + SCHEDULE_FREE_ADAMW = "schedule_free_adamw" + SCHEDULE_FREE_SGD = "schedule_free_sgd" + APOLLO_ADAMW = "apollo_adamw" + APOLLO_ADAMW_LAYERWISE = "apollo_adamw_layerwise" + + +def _convert_str_dict(passed_value: dict): + "Safely checks that a passed value is a dictionary and converts any string values to their appropriate types." + for key, value in passed_value.items(): + if isinstance(value, dict): + passed_value[key] = _convert_str_dict(value) + elif isinstance(value, str): + # First check for bool and convert + if value.lower() in ("true", "false"): + passed_value[key] = value.lower() == "true" + # Check for digit + elif value.isdigit(): + passed_value[key] = int(value) + elif value.replace(".", "", 1).isdigit(): + passed_value[key] = float(value) + + return passed_value + + +# TODO: `TrainingArguments` users rely on it being fully mutable. In the future see if we can narrow this to a few keys: https://github.com/huggingface/transformers/pull/25903 +@dataclass +class TrainingArguments: + """ + TrainingArguments is the subset of the arguments we use in our example scripts **which relate to the training loop + itself**. + + Using [`HfArgumentParser`] we can turn this class into + [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the + command line. + + Parameters: + output_dir (`str`, *optional*, defaults to `"trainer_output"`): + The output directory where the model predictions and checkpoints will be written. + overwrite_output_dir (`bool`, *optional*, defaults to `False`): + If `True`, overwrite the content of the output directory. Use this to continue training if `output_dir` + points to a checkpoint directory. + do_train (`bool`, *optional*, defaults to `False`): + Whether to run training or not. This argument is not directly used by [`Trainer`], it's intended to be used + by your training/evaluation scripts instead. See the [example + scripts](https://github.com/huggingface/transformers/tree/main/examples) for more details. + do_eval (`bool`, *optional*): + Whether to run evaluation on the validation set or not. Will be set to `True` if `eval_strategy` is + different from `"no"`. This argument is not directly used by [`Trainer`], it's intended to be used by your + training/evaluation scripts instead. See the [example + scripts](https://github.com/huggingface/transformers/tree/main/examples) for more details. + do_predict (`bool`, *optional*, defaults to `False`): + Whether to run predictions on the test set or not. This argument is not directly used by [`Trainer`], it's + intended to be used by your training/evaluation scripts instead. See the [example + scripts](https://github.com/huggingface/transformers/tree/main/examples) for more details. + eval_strategy (`str` or [`~trainer_utils.IntervalStrategy`], *optional*, defaults to `"no"`): + The evaluation strategy to adopt during training. Possible values are: + + - `"no"`: No evaluation is done during training. + - `"steps"`: Evaluation is done (and logged) every `eval_steps`. + - `"epoch"`: Evaluation is done at the end of each epoch. + + prediction_loss_only (`bool`, *optional*, defaults to `False`): + When performing evaluation and generating predictions, only returns the loss. + per_device_train_batch_size (`int`, *optional*, defaults to 8): + The batch size per device accelerator core/CPU for training. + per_device_eval_batch_size (`int`, *optional*, defaults to 8): + The batch size per device accelerator core/CPU for evaluation. + gradient_accumulation_steps (`int`, *optional*, defaults to 1): + Number of updates steps to accumulate the gradients for, before performing a backward/update pass. + + + + When using gradient accumulation, one step is counted as one step with backward pass. Therefore, logging, + evaluation, save will be conducted every `gradient_accumulation_steps * xxx_step` training examples. + + + + eval_accumulation_steps (`int`, *optional*): + Number of predictions steps to accumulate the output tensors for, before moving the results to the CPU. If + left unset, the whole predictions are accumulated on the device accelerator before being moved to the CPU (faster but + requires more memory). + eval_delay (`float`, *optional*): + Number of epochs or steps to wait for before the first evaluation can be performed, depending on the + eval_strategy. + torch_empty_cache_steps (`int`, *optional*): + Number of steps to wait before calling `torch..empty_cache()`. If left unset or set to None, cache will not be emptied. + + + + This can help avoid CUDA out-of-memory errors by lowering peak VRAM usage at a cost of about [10% slower performance](https://github.com/huggingface/transformers/issues/31372). + + + + learning_rate (`float`, *optional*, defaults to 5e-5): + The initial learning rate for [`AdamW`] optimizer. + weight_decay (`float`, *optional*, defaults to 0): + The weight decay to apply (if not zero) to all layers except all bias and LayerNorm weights in [`AdamW`] + optimizer. + adam_beta1 (`float`, *optional*, defaults to 0.9): + The beta1 hyperparameter for the [`AdamW`] optimizer. + adam_beta2 (`float`, *optional*, defaults to 0.999): + The beta2 hyperparameter for the [`AdamW`] optimizer. + adam_epsilon (`float`, *optional*, defaults to 1e-8): + The epsilon hyperparameter for the [`AdamW`] optimizer. + max_grad_norm (`float`, *optional*, defaults to 1.0): + Maximum gradient norm (for gradient clipping). + num_train_epochs(`float`, *optional*, defaults to 3.0): + Total number of training epochs to perform (if not an integer, will perform the decimal part percents of + the last epoch before stopping training). + max_steps (`int`, *optional*, defaults to -1): + If set to a positive number, the total number of training steps to perform. Overrides `num_train_epochs`. + For a finite dataset, training is reiterated through the dataset (if all data is exhausted) until + `max_steps` is reached. + lr_scheduler_type (`str` or [`SchedulerType`], *optional*, defaults to `"linear"`): + The scheduler type to use. See the documentation of [`SchedulerType`] for all possible values. + lr_scheduler_kwargs ('dict', *optional*, defaults to {}): + The extra arguments for the lr_scheduler. See the documentation of each scheduler for possible values. + warmup_ratio (`float`, *optional*, defaults to 0.0): + Ratio of total training steps used for a linear warmup from 0 to `learning_rate`. + warmup_steps (`int`, *optional*, defaults to 0): + Number of steps used for a linear warmup from 0 to `learning_rate`. Overrides any effect of `warmup_ratio`. + log_level (`str`, *optional*, defaults to `passive`): + Logger log level to use on the main process. Possible choices are the log levels as strings: 'debug', + 'info', 'warning', 'error' and 'critical', plus a 'passive' level which doesn't set anything and keeps the + current log level for the Transformers library (which will be `"warning"` by default). + log_level_replica (`str`, *optional*, defaults to `"warning"`): + Logger log level to use on replicas. Same choices as `log_level`" + log_on_each_node (`bool`, *optional*, defaults to `True`): + In multinode distributed training, whether to log using `log_level` once per node, or only on the main + node. + logging_dir (`str`, *optional*): + [TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to + *output_dir/runs/**CURRENT_DATETIME_HOSTNAME***. + logging_strategy (`str` or [`~trainer_utils.IntervalStrategy`], *optional*, defaults to `"steps"`): + The logging strategy to adopt during training. Possible values are: + + - `"no"`: No logging is done during training. + - `"epoch"`: Logging is done at the end of each epoch. + - `"steps"`: Logging is done every `logging_steps`. + + logging_first_step (`bool`, *optional*, defaults to `False`): + Whether to log the first `global_step` or not. + logging_steps (`int` or `float`, *optional*, defaults to 500): + Number of update steps between two logs if `logging_strategy="steps"`. Should be an integer or a float in + range `[0,1)`. If smaller than 1, will be interpreted as ratio of total training steps. + logging_nan_inf_filter (`bool`, *optional*, defaults to `True`): + Whether to filter `nan` and `inf` losses for logging. If set to `True` the loss of every step that is `nan` + or `inf` is filtered and the average loss of the current logging window is taken instead. + + + + `logging_nan_inf_filter` only influences the logging of loss values, it does not change the behavior the + gradient is computed or applied to the model. + + + + save_strategy (`str` or [`~trainer_utils.SaveStrategy`], *optional*, defaults to `"steps"`): + The checkpoint save strategy to adopt during training. Possible values are: + + - `"no"`: No save is done during training. + - `"epoch"`: Save is done at the end of each epoch. + - `"steps"`: Save is done every `save_steps`. + - `"best"`: Save is done whenever a new `best_metric` is achieved. + + If `"epoch"` or `"steps"` is chosen, saving will also be performed at the + very end of training, always. + save_steps (`int` or `float`, *optional*, defaults to 500): + Number of updates steps before two checkpoint saves if `save_strategy="steps"`. Should be an integer or a + float in range `[0,1)`. If smaller than 1, will be interpreted as ratio of total training steps. + save_total_limit (`int`, *optional*): + If a value is passed, will limit the total amount of checkpoints. Deletes the older checkpoints in + `output_dir`. When `load_best_model_at_end` is enabled, the "best" checkpoint according to + `metric_for_best_model` will always be retained in addition to the most recent ones. For example, for + `save_total_limit=5` and `load_best_model_at_end`, the four last checkpoints will always be retained + alongside the best model. When `save_total_limit=1` and `load_best_model_at_end`, it is possible that two + checkpoints are saved: the last one and the best one (if they are different). + save_safetensors (`bool`, *optional*, defaults to `True`): + Use [safetensors](https://huggingface.co/docs/safetensors) saving and loading for state dicts instead of + default `torch.load` and `torch.save`. + save_on_each_node (`bool`, *optional*, defaults to `False`): + When doing multi-node distributed training, whether to save models and checkpoints on each node, or only on + the main one. + + This should not be activated when the different nodes use the same storage as the files will be saved with + the same names for each node. + save_only_model (`bool`, *optional*, defaults to `False`): + When checkpointing, whether to only save the model, or also the optimizer, scheduler & rng state. + Note that when this is true, you won't be able to resume training from checkpoint. + This enables you to save storage by not storing the optimizer, scheduler & rng state. + You can only load the model using `from_pretrained` with this option set to `True`. + restore_callback_states_from_checkpoint (`bool`, *optional*, defaults to `False`): + Whether to restore the callback states from the checkpoint. If `True`, will override + callbacks passed to the `Trainer` if they exist in the checkpoint." + use_cpu (`bool`, *optional*, defaults to `False`): + Whether or not to use cpu. If set to False, we will use cuda or mps device if available. + seed (`int`, *optional*, defaults to 42): + Random seed that will be set at the beginning of training. To ensure reproducibility across runs, use the + [`~Trainer.model_init`] function to instantiate the model if it has some randomly initialized parameters. + data_seed (`int`, *optional*): + Random seed to be used with data samplers. If not set, random generators for data sampling will use the + same seed as `seed`. This can be used to ensure reproducibility of data sampling, independent of the model + seed. + jit_mode_eval (`bool`, *optional*, defaults to `False`): + Whether or not to use PyTorch jit trace for inference. + use_ipex (`bool`, *optional*, defaults to `False`): + Use Intel extension for PyTorch when it is available. [IPEX + installation](https://github.com/intel/intel-extension-for-pytorch). + bf16 (`bool`, *optional*, defaults to `False`): + Whether to use bf16 16-bit (mixed) precision training instead of 32-bit training. Requires Ampere or higher + NVIDIA architecture or using CPU (use_cpu) or Ascend NPU. This is an experimental API and it may change. + fp16 (`bool`, *optional*, defaults to `False`): + Whether to use fp16 16-bit (mixed) precision training instead of 32-bit training. + fp16_opt_level (`str`, *optional*, defaults to 'O1'): + For `fp16` training, Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. See details on + the [Apex documentation](https://nvidia.github.io/apex/amp). + fp16_backend (`str`, *optional*, defaults to `"auto"`): + This argument is deprecated. Use `half_precision_backend` instead. + half_precision_backend (`str`, *optional*, defaults to `"auto"`): + The backend to use for mixed precision training. Must be one of `"auto", "apex", "cpu_amp"`. `"auto"` will + use CPU/CUDA AMP or APEX depending on the PyTorch version detected, while the other choices will force the + requested backend. + bf16_full_eval (`bool`, *optional*, defaults to `False`): + Whether to use full bfloat16 evaluation instead of 32-bit. This will be faster and save memory but can harm + metric values. This is an experimental API and it may change. + fp16_full_eval (`bool`, *optional*, defaults to `False`): + Whether to use full float16 evaluation instead of 32-bit. This will be faster and save memory but can harm + metric values. + tf32 (`bool`, *optional*): + Whether to enable the TF32 mode, available in Ampere and newer GPU architectures. The default value depends + on PyTorch's version default of `torch.backends.cuda.matmul.allow_tf32`. For more details please refer to + the [TF32](https://huggingface.co/docs/transformers/perf_train_gpu_one#tf32) documentation. This is an + experimental API and it may change. + local_rank (`int`, *optional*, defaults to -1): + Rank of the process during distributed training. + ddp_backend (`str`, *optional*): + The backend to use for distributed training. Must be one of `"nccl"`, `"mpi"`, `"ccl"`, `"gloo"`, `"hccl"`. + tpu_num_cores (`int`, *optional*): + When training on TPU, the number of TPU cores (automatically passed by launcher script). + dataloader_drop_last (`bool`, *optional*, defaults to `False`): + Whether to drop the last incomplete batch (if the length of the dataset is not divisible by the batch size) + or not. + eval_steps (`int` or `float`, *optional*): + Number of update steps between two evaluations if `eval_strategy="steps"`. Will default to the same + value as `logging_steps` if not set. Should be an integer or a float in range `[0,1)`. If smaller than 1, + will be interpreted as ratio of total training steps. + dataloader_num_workers (`int`, *optional*, defaults to 0): + Number of subprocesses to use for data loading (PyTorch only). 0 means that the data will be loaded in the + main process. + past_index (`int`, *optional*, defaults to -1): + Some models like [TransformerXL](../model_doc/transformerxl) or [XLNet](../model_doc/xlnet) can make use of + the past hidden states for their predictions. If this argument is set to a positive int, the `Trainer` will + use the corresponding output (usually index 2) as the past state and feed it to the model at the next + training step under the keyword argument `mems`. + run_name (`str`, *optional*, defaults to `output_dir`): + A descriptor for the run. Typically used for [wandb](https://www.wandb.com/), + [mlflow](https://www.mlflow.org/), [comet](https://www.comet.com/site) and [swanlab](https://swanlab.cn) + logging. If not specified, will be the same as `output_dir`. + disable_tqdm (`bool`, *optional*): + Whether or not to disable the tqdm progress bars and table of metrics produced by + [`~notebook.NotebookTrainingTracker`] in Jupyter Notebooks. Will default to `True` if the logging level is + set to warn or lower (default), `False` otherwise. + remove_unused_columns (`bool`, *optional*, defaults to `True`): + Whether or not to automatically remove the columns unused by the model forward method. + label_names (`List[str]`, *optional*): + The list of keys in your dictionary of inputs that correspond to the labels. + + Will eventually default to the list of argument names accepted by the model that contain the word "label", + except if the model used is one of the `XxxForQuestionAnswering` in which case it will also include the + `["start_positions", "end_positions"]` keys. + load_best_model_at_end (`bool`, *optional*, defaults to `False`): + Whether or not to load the best model found during training at the end of training. When this option is + enabled, the best checkpoint will always be saved. See + [`save_total_limit`](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments.save_total_limit) + for more. + + + + When set to `True`, the parameters `save_strategy` needs to be the same as `eval_strategy`, and in + the case it is "steps", `save_steps` must be a round multiple of `eval_steps`. + + + + metric_for_best_model (`str`, *optional*): + Use in conjunction with `load_best_model_at_end` to specify the metric to use to compare two different + models. Must be the name of a metric returned by the evaluation with or without the prefix `"eval_"`. + + If not specified, this will default to `"loss"` when either `load_best_model_at_end == True` + or `lr_scheduler_type == SchedulerType.REDUCE_ON_PLATEAU` (to use the evaluation loss). + + If you set this value, `greater_is_better` will default to `True` unless the name ends with "loss". + Don't forget to set it to `False` if your metric is better when lower. + greater_is_better (`bool`, *optional*): + Use in conjunction with `load_best_model_at_end` and `metric_for_best_model` to specify if better models + should have a greater metric or not. Will default to: + + - `True` if `metric_for_best_model` is set to a value that doesn't end in `"loss"`. + - `False` if `metric_for_best_model` is not set, or set to a value that ends in `"loss"`. + ignore_data_skip (`bool`, *optional*, defaults to `False`): + When resuming training, whether or not to skip the epochs and batches to get the data loading at the same + stage as in the previous training. If set to `True`, the training will begin faster (as that skipping step + can take a long time) but will not yield the same results as the interrupted training would have. + fsdp (`bool`, `str` or list of [`~trainer_utils.FSDPOption`], *optional*, defaults to `''`): + Use PyTorch Distributed Parallel Training (in distributed training only). + + A list of options along the following: + + - `"full_shard"`: Shard parameters, gradients and optimizer states. + - `"shard_grad_op"`: Shard optimizer states and gradients. + - `"hybrid_shard"`: Apply `FULL_SHARD` within a node, and replicate parameters across nodes. + - `"hybrid_shard_zero2"`: Apply `SHARD_GRAD_OP` within a node, and replicate parameters across nodes. + - `"offload"`: Offload parameters and gradients to CPUs (only compatible with `"full_shard"` and + `"shard_grad_op"`). + - `"auto_wrap"`: Automatically recursively wrap layers with FSDP using `default_auto_wrap_policy`. + fsdp_config (`str` or `dict`, *optional*): + Config to be used with fsdp (Pytorch Distributed Parallel Training). The value is either a location of + fsdp json config file (e.g., `fsdp_config.json`) or an already loaded json file as `dict`. + + A List of config and its options: + - min_num_params (`int`, *optional*, defaults to `0`): + FSDP's minimum number of parameters for Default Auto Wrapping. (useful only when `fsdp` field is + passed). + - transformer_layer_cls_to_wrap (`List[str]`, *optional*): + List of transformer layer class names (case-sensitive) to wrap, e.g, `BertLayer`, `GPTJBlock`, + `T5Block` .... (useful only when `fsdp` flag is passed). + - backward_prefetch (`str`, *optional*) + FSDP's backward prefetch mode. Controls when to prefetch next set of parameters (useful only when + `fsdp` field is passed). + + A list of options along the following: + + - `"backward_pre"` : Prefetches the next set of parameters before the current set of parameter's + gradient + computation. + - `"backward_post"` : This prefetches the next set of parameters after the current set of + parameter’s + gradient computation. + - forward_prefetch (`bool`, *optional*, defaults to `False`) + FSDP's forward prefetch mode (useful only when `fsdp` field is passed). + If `"True"`, then FSDP explicitly prefetches the next upcoming all-gather while executing in the + forward pass. + - limit_all_gathers (`bool`, *optional*, defaults to `False`) + FSDP's limit_all_gathers (useful only when `fsdp` field is passed). + If `"True"`, FSDP explicitly synchronizes the CPU thread to prevent too many in-flight + all-gathers. + - use_orig_params (`bool`, *optional*, defaults to `True`) + If `"True"`, allows non-uniform `requires_grad` during init, which means support for interspersed + frozen and trainable parameters. Useful in cases such as parameter-efficient fine-tuning. Please + refer this + [blog](https://dev-discuss.pytorch.org/t/rethinking-pytorch-fully-sharded-data-parallel-fsdp-from-first-principles/1019 + - sync_module_states (`bool`, *optional*, defaults to `True`) + If `"True"`, each individually wrapped FSDP unit will broadcast module parameters from rank 0 to + ensure they are the same across all ranks after initialization + - cpu_ram_efficient_loading (`bool`, *optional*, defaults to `False`) + If `"True"`, only the first process loads the pretrained model checkpoint while all other processes + have empty weights. When this setting as `"True"`, `sync_module_states` also must to be `"True"`, + otherwise all the processes except the main process would have random weights leading to unexpected + behaviour during training. + - activation_checkpointing (`bool`, *optional*, defaults to `False`): + If `"True"`, activation checkpointing is a technique to reduce memory usage by clearing activations of + certain layers and recomputing them during a backward pass. Effectively, this trades extra + computation time for reduced memory usage. + - xla (`bool`, *optional*, defaults to `False`): + Whether to use PyTorch/XLA Fully Sharded Data Parallel Training. This is an experimental feature + and its API may evolve in the future. + - xla_fsdp_settings (`dict`, *optional*) + The value is a dictionary which stores the XLA FSDP wrapping parameters. + + For a complete list of options, please see [here]( + https://github.com/pytorch/xla/blob/master/torch_xla/distributed/fsdp/xla_fully_sharded_data_parallel.py). + - xla_fsdp_grad_ckpt (`bool`, *optional*, defaults to `False`): + Will use gradient checkpointing over each nested XLA FSDP wrapped layer. This setting can only be + used when the xla flag is set to true, and an auto wrapping policy is specified through + fsdp_min_num_params or fsdp_transformer_layer_cls_to_wrap. + deepspeed (`str` or `dict`, *optional*): + Use [Deepspeed](https://github.com/deepspeedai/DeepSpeed). This is an experimental feature and its API may + evolve in the future. The value is either the location of DeepSpeed json config file (e.g., + `ds_config.json`) or an already loaded json file as a `dict`" + + + If enabling any Zero-init, make sure that your model is not initialized until + *after* initializing the `TrainingArguments`, else it will not be applied. + + + accelerator_config (`str`, `dict`, or `AcceleratorConfig`, *optional*): + Config to be used with the internal `Accelerator` implementation. The value is either a location of + accelerator json config file (e.g., `accelerator_config.json`), an already loaded json file as `dict`, + or an instance of [`~trainer_pt_utils.AcceleratorConfig`]. + + A list of config and its options: + - split_batches (`bool`, *optional*, defaults to `False`): + Whether or not the accelerator should split the batches yielded by the dataloaders across the devices. If + `True` the actual batch size used will be the same on any kind of distributed processes, but it must be a + round multiple of the `num_processes` you are using. If `False`, actual batch size used will be the one set + in your script multiplied by the number of processes. + - dispatch_batches (`bool`, *optional*): + If set to `True`, the dataloader prepared by the Accelerator is only iterated through on the main process + and then the batches are split and broadcast to each process. Will default to `True` for `DataLoader` whose + underlying dataset is an `IterableDataset`, `False` otherwise. + - even_batches (`bool`, *optional*, defaults to `True`): + If set to `True`, in cases where the total batch size across all processes does not exactly divide the + dataset, samples at the start of the dataset will be duplicated so the batch can be divided equally among + all workers. + - use_seedable_sampler (`bool`, *optional*, defaults to `True`): + Whether or not use a fully seedable random sampler ([`accelerate.data_loader.SeedableRandomSampler`]). Ensures + training results are fully reproducible using a different sampling technique. While seed-to-seed results + may differ, on average the differences are negligible when using multiple different seeds to compare. Should + also be ran with [`~utils.set_seed`] for the best results. + - use_configured_state (`bool`, *optional*, defaults to `False`): + Whether or not to use a pre-configured `AcceleratorState` or `PartialState` defined before calling `TrainingArguments`. + If `True`, an `Accelerator` or `PartialState` must be initialized. Note that by doing so, this could lead to issues + with hyperparameter tuning. + + label_smoothing_factor (`float`, *optional*, defaults to 0.0): + The label smoothing factor to use. Zero means no label smoothing, otherwise the underlying onehot-encoded + labels are changed from 0s and 1s to `label_smoothing_factor/num_labels` and `1 - label_smoothing_factor + + label_smoothing_factor/num_labels` respectively. + debug (`str` or list of [`~debug_utils.DebugOption`], *optional*, defaults to `""`): + Enable one or more debug features. This is an experimental feature. + + Possible options are: + + - `"underflow_overflow"`: detects overflow in model's input/outputs and reports the last frames that led to + the event + - `"tpu_metrics_debug"`: print debug metrics on TPU + + The options should be separated by whitespaces. + optim (`str` or [`training_args.OptimizerNames`], *optional*, defaults to `"adamw_torch"`): + The optimizer to use, such as "adamw_torch", "adamw_torch_fused", "adamw_apex_fused", "adamw_anyprecision", + "adafactor". See `OptimizerNames` in [training_args.py](https://github.com/huggingface/transformers/blob/main/src/transformers/training_args.py) + for a full list of optimizers. + optim_args (`str`, *optional*): + Optional arguments that are supplied to optimizers such as AnyPrecisionAdamW, AdEMAMix, and GaLore. + group_by_length (`bool`, *optional*, defaults to `False`): + Whether or not to group together samples of roughly the same length in the training dataset (to minimize + padding applied and be more efficient). Only useful if applying dynamic padding. + length_column_name (`str`, *optional*, defaults to `"length"`): + Column name for precomputed lengths. If the column exists, grouping by length will use these values rather + than computing them on train startup. Ignored unless `group_by_length` is `True` and the dataset is an + instance of `Dataset`. + report_to (`str` or `List[str]`, *optional*, defaults to `"all"`): + The list of integrations to report the results and logs to. Supported platforms are `"azure_ml"`, + `"clearml"`, `"codecarbon"`, `"comet_ml"`, `"dagshub"`, `"dvclive"`, `"flyte"`, `"mlflow"`, `"neptune"`, + `"swanlab"`, `"tensorboard"`, and `"wandb"`. Use `"all"` to report to all integrations installed, `"none"` + for no integrations. + ddp_find_unused_parameters (`bool`, *optional*): + When using distributed training, the value of the flag `find_unused_parameters` passed to + `DistributedDataParallel`. Will default to `False` if gradient checkpointing is used, `True` otherwise. + ddp_bucket_cap_mb (`int`, *optional*): + When using distributed training, the value of the flag `bucket_cap_mb` passed to `DistributedDataParallel`. + ddp_broadcast_buffers (`bool`, *optional*): + When using distributed training, the value of the flag `broadcast_buffers` passed to + `DistributedDataParallel`. Will default to `False` if gradient checkpointing is used, `True` otherwise. + dataloader_pin_memory (`bool`, *optional*, defaults to `True`): + Whether you want to pin memory in data loaders or not. Will default to `True`. + dataloader_persistent_workers (`bool`, *optional*, defaults to `False`): + If True, the data loader will not shut down the worker processes after a dataset has been consumed once. + This allows to maintain the workers Dataset instances alive. Can potentially speed up training, but will + increase RAM usage. Will default to `False`. + dataloader_prefetch_factor (`int`, *optional*): + Number of batches loaded in advance by each worker. + 2 means there will be a total of 2 * num_workers batches prefetched across all workers. + skip_memory_metrics (`bool`, *optional*, defaults to `True`): + Whether to skip adding of memory profiler reports to metrics. This is skipped by default because it slows + down the training and evaluation speed. + push_to_hub (`bool`, *optional*, defaults to `False`): + Whether or not to push the model to the Hub every time the model is saved. If this is activated, + `output_dir` will begin a git directory synced with the repo (determined by `hub_model_id`) and the content + will be pushed each time a save is triggered (depending on your `save_strategy`). Calling + [`~Trainer.save_model`] will also trigger a push. + + + + If `output_dir` exists, it needs to be a local clone of the repository to which the [`Trainer`] will be + pushed. + + + + resume_from_checkpoint (`str`, *optional*): + The path to a folder with a valid checkpoint for your model. This argument is not directly used by + [`Trainer`], it's intended to be used by your training/evaluation scripts instead. See the [example + scripts](https://github.com/huggingface/transformers/tree/main/examples) for more details. + hub_model_id (`str`, *optional*): + The name of the repository to keep in sync with the local *output_dir*. It can be a simple model ID in + which case the model will be pushed in your namespace. Otherwise it should be the whole repository name, + for instance `"user_name/model"`, which allows you to push to an organization you are a member of with + `"organization_name/model"`. Will default to `user_name/output_dir_name` with *output_dir_name* being the + name of `output_dir`. + + Will default to the name of `output_dir`. + hub_strategy (`str` or [`~trainer_utils.HubStrategy`], *optional*, defaults to `"every_save"`): + Defines the scope of what is pushed to the Hub and when. Possible values are: + + - `"end"`: push the model, its configuration, the processing class e.g. tokenizer (if passed along to the [`Trainer`]) and a + draft of a model card when the [`~Trainer.save_model`] method is called. + - `"every_save"`: push the model, its configuration, the processing class e.g. tokenizer (if passed along to the [`Trainer`]) and + a draft of a model card each time there is a model save. The pushes are asynchronous to not block + training, and in case the save are very frequent, a new push is only attempted if the previous one is + finished. A last push is made with the final model at the end of training. + - `"checkpoint"`: like `"every_save"` but the latest checkpoint is also pushed in a subfolder named + last-checkpoint, allowing you to resume training easily with + `trainer.train(resume_from_checkpoint="last-checkpoint")`. + - `"all_checkpoints"`: like `"checkpoint"` but all checkpoints are pushed like they appear in the output + folder (so you will get one checkpoint folder per folder in your final repository) + + hub_token (`str`, *optional*): + The token to use to push the model to the Hub. Will default to the token in the cache folder obtained with + `huggingface-cli login`. + hub_private_repo (`bool`, *optional*): + Whether to make the repo private. If `None` (default), the repo will be public unless the organization's default is private. This value is ignored if the repo already exists. + hub_always_push (`bool`, *optional*, defaults to `False`): + Unless this is `True`, the `Trainer` will skip pushing a checkpoint when the previous push is not finished. + gradient_checkpointing (`bool`, *optional*, defaults to `False`): + If True, use gradient checkpointing to save memory at the expense of slower backward pass. + gradient_checkpointing_kwargs (`dict`, *optional*, defaults to `None`): + Key word arguments to be passed to the `gradient_checkpointing_enable` method. + include_inputs_for_metrics (`bool`, *optional*, defaults to `False`): + This argument is deprecated. Use `include_for_metrics` instead, e.g, `include_for_metrics = ["inputs"]`. + include_for_metrics (`List[str]`, *optional*, defaults to `[]`): + Include additional data in the `compute_metrics` function if needed for metrics computation. + Possible options to add to `include_for_metrics` list: + - `"inputs"`: Input data passed to the model, intended for calculating input dependent metrics. + - `"loss"`: Loss values computed during evaluation, intended for calculating loss dependent metrics. + eval_do_concat_batches (`bool`, *optional*, defaults to `True`): + Whether to recursively concat inputs/losses/labels/predictions across batches. If `False`, + will instead store them as lists, with each batch kept separate. + auto_find_batch_size (`bool`, *optional*, defaults to `False`) + Whether to find a batch size that will fit into memory automatically through exponential decay, avoiding + CUDA Out-of-Memory errors. Requires accelerate to be installed (`pip install accelerate`) + full_determinism (`bool`, *optional*, defaults to `False`) + If `True`, [`enable_full_determinism`] is called instead of [`set_seed`] to ensure reproducible results in + distributed training. Important: this will negatively impact the performance, so only use it for debugging. + torchdynamo (`str`, *optional*): + If set, the backend compiler for TorchDynamo. Possible choices are `"eager"`, `"aot_eager"`, `"inductor"`, + `"nvfuser"`, `"aot_nvfuser"`, `"aot_cudagraphs"`, `"ofi"`, `"fx2trt"`, `"onnxrt"` and `"ipex"`. + ray_scope (`str`, *optional*, defaults to `"last"`): + The scope to use when doing hyperparameter search with Ray. By default, `"last"` will be used. Ray will + then use the last checkpoint of all trials, compare those, and select the best one. However, other options + are also available. See the [Ray documentation]( + https://docs.ray.io/en/latest/tune/api_docs/analysis.html#ray.tune.ExperimentAnalysis.get_best_trial) for + more options. + ddp_timeout (`int`, *optional*, defaults to 1800): + The timeout for `torch.distributed.init_process_group` calls, used to avoid GPU socket timeouts when + performing slow operations in distributed runnings. Please refer the [PyTorch documentation] + (https://pytorch.org/docs/stable/distributed.html#torch.distributed.init_process_group) for more + information. + use_mps_device (`bool`, *optional*, defaults to `False`): + This argument is deprecated.`mps` device will be used if it is available similar to `cuda` device. + torch_compile (`bool`, *optional*, defaults to `False`): + Whether or not to compile the model using PyTorch 2.0 + [`torch.compile`](https://pytorch.org/get-started/pytorch-2.0/). + + This will use the best defaults for the [`torch.compile` + API](https://pytorch.org/docs/stable/generated/torch.compile.html?highlight=torch+compile#torch.compile). + You can customize the defaults with the argument `torch_compile_backend` and `torch_compile_mode` but we + don't guarantee any of them will work as the support is progressively rolled in in PyTorch. + + This flag and the whole compile API is experimental and subject to change in future releases. + torch_compile_backend (`str`, *optional*): + The backend to use in `torch.compile`. If set to any value, `torch_compile` will be set to `True`. + + Refer to the PyTorch doc for possible values and note that they may change across PyTorch versions. + + This flag is experimental and subject to change in future releases. + torch_compile_mode (`str`, *optional*): + The mode to use in `torch.compile`. If set to any value, `torch_compile` will be set to `True`. + + Refer to the PyTorch doc for possible values and note that they may change across PyTorch versions. + + This flag is experimental and subject to change in future releases. + include_tokens_per_second (`bool`, *optional*): + Whether or not to compute the number of tokens per second per device for training speed metrics. + + This will iterate over the entire training dataloader once beforehand, + + and will slow down the entire process. + + include_num_input_tokens_seen (`bool`, *optional*): + Whether or not to track the number of input tokens seen throughout training. + + May be slower in distributed training as gather operations must be called. + + neftune_noise_alpha (`Optional[float]`): + If not `None`, this will activate NEFTune noise embeddings. This can drastically improve model performance + for instruction fine-tuning. Check out the [original paper](https://arxiv.org/abs/2310.05914) and the + [original code](https://github.com/neelsjain/NEFTune). Support transformers `PreTrainedModel` and also + `PeftModel` from peft. The original paper used values in the range [5.0, 15.0]. + optim_target_modules (`Union[str, List[str]]`, *optional*): + The target modules to optimize, i.e. the module names that you would like to train. + Currently used for the GaLore algorithm (https://arxiv.org/abs/2403.03507) and APOLLO algorithm (https://arxiv.org/abs/2412.05270). + See GaLore implementation (https://github.com/jiaweizzhao/GaLore) and APOLLO implementation (https://github.com/zhuhanqing/APOLLO) for more details. + You need to make sure to pass a valid GaLore or APOLLO optimizer, e.g., one of: "apollo_adamw", "galore_adamw", "galore_adamw_8bit", "galore_adafactor" and make sure that the target modules are `nn.Linear` modules only. + + batch_eval_metrics (`Optional[bool]`, defaults to `False`): + If set to `True`, evaluation will call compute_metrics at the end of each batch to accumulate statistics + rather than saving all eval logits in memory. When set to `True`, you must pass a compute_metrics function + that takes a boolean argument `compute_result`, which when passed `True`, will trigger the final global + summary statistics from the batch-level summary statistics you've accumulated over the evaluation set. + + eval_on_start (`bool`, *optional*, defaults to `False`): + Whether to perform a evaluation step (sanity check) before the training to ensure the validation steps works correctly. + + eval_use_gather_object (`bool`, *optional*, defaults to `False`): + Whether to run recursively gather object in a nested list/tuple/dictionary of objects from all devices. This should only be enabled if users are not just returning tensors, and this is actively discouraged by PyTorch. + + use_liger_kernel (`bool`, *optional*, defaults to `False`): + Whether enable [Liger](https://github.com/linkedin/Liger-Kernel) Kernel for LLM model training. + It can effectively increase multi-GPU training throughput by ~20% and reduces memory usage by ~60%, works out of the box with + flash attention, PyTorch FSDP, and Microsoft DeepSpeed. Currently, it supports llama, mistral, mixtral and gemma models. + + average_tokens_across_devices (`bool`, *optional*, defaults to `False`): + Whether or not to average tokens across devices. If enabled, will use all_reduce to synchronize + num_tokens_in_batch for precise loss calculation. Reference: + https://github.com/huggingface/transformers/issues/34242 + """ + + # Sometimes users will pass in a `str` repr of a dict in the CLI + # We need to track what fields those can be. Each time a new arg + # has a dict type, it must be added to this list. + # Important: These should be typed with Optional[Union[dict,str,...]] + _VALID_DICT_FIELDS = [ + "accelerator_config", + "fsdp_config", + "deepspeed", + "gradient_checkpointing_kwargs", + "lr_scheduler_kwargs", + ] + + framework = "pt" + output_dir: Optional[str] = field( + default=None, + metadata={ + "help": "The output directory where the model predictions and checkpoints will be written. Defaults to 'trainer_output' if not provided." + }, + ) + overwrite_output_dir: bool = field( + default=False, + metadata={ + "help": ( + "Overwrite the content of the output directory. " + "Use this to continue training if output_dir points to a checkpoint directory." + ) + }, + ) + + do_train: bool = field(default=False, metadata={"help": "Whether to run training."}) + do_eval: bool = field(default=False, metadata={"help": "Whether to run eval on the dev set."}) + do_predict: bool = field(default=False, metadata={"help": "Whether to run predictions on the test set."}) + eval_strategy: Union[IntervalStrategy, str] = field( + default="no", + metadata={"help": "The evaluation strategy to use."}, + ) + prediction_loss_only: bool = field( + default=False, + metadata={"help": "When performing evaluation and predictions, only returns the loss."}, + ) + + per_device_train_batch_size: int = field( + default=8, metadata={"help": "Batch size per device accelerator core/CPU for training."} + ) + per_device_eval_batch_size: int = field( + default=8, metadata={"help": "Batch size per device accelerator core/CPU for evaluation."} + ) + + per_gpu_train_batch_size: Optional[int] = field( + default=None, + metadata={ + "help": ( + "Deprecated, the use of `--per_device_train_batch_size` is preferred. " + "Batch size per GPU/TPU core/CPU for training." + ) + }, + ) + per_gpu_eval_batch_size: Optional[int] = field( + default=None, + metadata={ + "help": ( + "Deprecated, the use of `--per_device_eval_batch_size` is preferred. " + "Batch size per GPU/TPU core/CPU for evaluation." + ) + }, + ) + + gradient_accumulation_steps: int = field( + default=1, + metadata={"help": "Number of updates steps to accumulate before performing a backward/update pass."}, + ) + eval_accumulation_steps: Optional[int] = field( + default=None, + metadata={"help": "Number of predictions steps to accumulate before moving the tensors to the CPU."}, + ) + + eval_delay: Optional[float] = field( + default=0, + metadata={ + "help": ( + "Number of epochs or steps to wait for before the first evaluation can be performed, depending on the" + " eval_strategy." + ) + }, + ) + + torch_empty_cache_steps: Optional[int] = field( + default=None, + metadata={ + "help": "Number of steps to wait before calling `torch..empty_cache()`." + "This can help avoid CUDA out-of-memory errors by lowering peak VRAM usage at a cost of about [10% slower performance](https://github.com/huggingface/transformers/issues/31372)." + "If left unset or set to None, cache will not be emptied." + }, + ) + + learning_rate: float = field(default=5e-5, metadata={"help": "The initial learning rate for AdamW."}) + weight_decay: float = field(default=0.0, metadata={"help": "Weight decay for AdamW if we apply some."}) + adam_beta1: float = field(default=0.9, metadata={"help": "Beta1 for AdamW optimizer"}) + adam_beta2: float = field(default=0.999, metadata={"help": "Beta2 for AdamW optimizer"}) + adam_epsilon: float = field(default=1e-8, metadata={"help": "Epsilon for AdamW optimizer."}) + max_grad_norm: float = field(default=1.0, metadata={"help": "Max gradient norm."}) + + num_train_epochs: float = field(default=3.0, metadata={"help": "Total number of training epochs to perform."}) + max_steps: int = field( + default=-1, + metadata={"help": "If > 0: set total number of training steps to perform. Override num_train_epochs."}, + ) + lr_scheduler_type: Union[SchedulerType, str] = field( + default="linear", + metadata={"help": "The scheduler type to use."}, + ) + lr_scheduler_kwargs: Optional[Union[dict, str]] = field( + default_factory=dict, + metadata={ + "help": ( + "Extra parameters for the lr_scheduler such as {'num_cycles': 1} for the cosine with hard restarts." + ) + }, + ) + warmup_ratio: float = field( + default=0.0, metadata={"help": "Linear warmup over warmup_ratio fraction of total steps."} + ) + warmup_steps: int = field(default=0, metadata={"help": "Linear warmup over warmup_steps."}) + + log_level: Optional[str] = field( + default="passive", + metadata={ + "help": ( + "Logger log level to use on the main node. Possible choices are the log levels as strings: 'debug'," + " 'info', 'warning', 'error' and 'critical', plus a 'passive' level which doesn't set anything and" + " lets the application set the level. Defaults to 'passive'." + ), + "choices": trainer_log_levels.keys(), + }, + ) + log_level_replica: Optional[str] = field( + default="warning", + metadata={ + "help": "Logger log level to use on replica nodes. Same choices and defaults as ``log_level``", + "choices": trainer_log_levels.keys(), + }, + ) + log_on_each_node: bool = field( + default=True, + metadata={ + "help": ( + "When doing a multinode distributed training, whether to log once per node or just once on the main" + " node." + ) + }, + ) + logging_dir: Optional[str] = field(default=None, metadata={"help": "Tensorboard log dir."}) + logging_strategy: Union[IntervalStrategy, str] = field( + default="steps", + metadata={"help": "The logging strategy to use."}, + ) + logging_first_step: bool = field(default=False, metadata={"help": "Log the first global_step"}) + logging_steps: float = field( + default=500, + metadata={ + "help": ( + "Log every X updates steps. Should be an integer or a float in range `[0,1)`. " + "If smaller than 1, will be interpreted as ratio of total training steps." + ) + }, + ) + logging_nan_inf_filter: bool = field(default=True, metadata={"help": "Filter nan and inf losses for logging."}) + save_strategy: Union[SaveStrategy, str] = field( + default="steps", + metadata={"help": "The checkpoint save strategy to use."}, + ) + save_steps: float = field( + default=500, + metadata={ + "help": ( + "Save checkpoint every X updates steps. Should be an integer or a float in range `[0,1)`. " + "If smaller than 1, will be interpreted as ratio of total training steps." + ) + }, + ) + save_total_limit: Optional[int] = field( + default=None, + metadata={ + "help": ( + "If a value is passed, will limit the total amount of checkpoints. Deletes the older checkpoints in" + " `output_dir`. When `load_best_model_at_end` is enabled, the 'best' checkpoint according to" + " `metric_for_best_model` will always be retained in addition to the most recent ones. For example," + " for `save_total_limit=5` and `load_best_model_at_end=True`, the four last checkpoints will always be" + " retained alongside the best model. When `save_total_limit=1` and `load_best_model_at_end=True`," + " it is possible that two checkpoints are saved: the last one and the best one (if they are different)." + " Default is unlimited checkpoints" + ) + }, + ) + save_safetensors: Optional[bool] = field( + default=True, + metadata={ + "help": "Use safetensors saving and loading for state dicts instead of default torch.load and torch.save." + }, + ) + save_on_each_node: bool = field( + default=False, + metadata={ + "help": ( + "When doing multi-node distributed training, whether to save models and checkpoints on each node, or" + " only on the main one" + ) + }, + ) + save_only_model: bool = field( + default=False, + metadata={ + "help": ( + "When checkpointing, whether to only save the model, or also the optimizer, scheduler & rng state." + "Note that when this is true, you won't be able to resume training from checkpoint." + "This enables you to save storage by not storing the optimizer, scheduler & rng state." + "You can only load the model using from_pretrained with this option set to True." + ) + }, + ) + restore_callback_states_from_checkpoint: bool = field( + default=False, + metadata={ + "help": "Whether to restore the callback states from the checkpoint. If `True`, will override callbacks passed to the `Trainer` if they exist in the checkpoint." + }, + ) + no_cuda: bool = field( + default=False, + metadata={"help": "This argument is deprecated. It will be removed in version 5.0 of 🤗 Transformers."}, + ) + use_cpu: bool = field( + default=False, + metadata={ + "help": "Whether or not to use cpu. If left to False, we will use the available torch device/backend (cuda/mps/xpu/hpu etc.)" + }, + ) + use_mps_device: bool = field( + default=False, + metadata={ + "help": "This argument is deprecated. `mps` device will be used if available similar to `cuda` device." + " It will be removed in version 5.0 of 🤗 Transformers" + }, + ) + seed: int = field(default=42, metadata={"help": "Random seed that will be set at the beginning of training."}) + data_seed: Optional[int] = field(default=None, metadata={"help": "Random seed to be used with data samplers."}) + jit_mode_eval: bool = field( + default=False, metadata={"help": "Whether or not to use PyTorch jit trace for inference"} + ) + use_ipex: bool = field( + default=False, + metadata={ + "help": ( + "Use Intel extension for PyTorch when it is available, installation:" + " 'https://github.com/intel/intel-extension-for-pytorch'" + ) + }, + ) + bf16: bool = field( + default=False, + metadata={ + "help": ( + "Whether to use bf16 (mixed) precision instead of 32-bit. Requires Ampere or higher NVIDIA" + " architecture or using CPU (use_cpu) or Ascend NPU. This is an experimental API and it may change." + ) + }, + ) + fp16: bool = field( + default=False, + metadata={"help": "Whether to use fp16 (mixed) precision instead of 32-bit"}, + ) + fp16_opt_level: str = field( + default="O1", + metadata={ + "help": ( + "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. " + "See details at https://nvidia.github.io/apex/amp.html" + ) + }, + ) + half_precision_backend: str = field( + default="auto", + metadata={ + "help": "The backend to be used for half precision.", + "choices": ["auto", "apex", "cpu_amp"], + }, + ) + bf16_full_eval: bool = field( + default=False, + metadata={ + "help": ( + "Whether to use full bfloat16 evaluation instead of 32-bit. This is an experimental API and it may" + " change." + ) + }, + ) + fp16_full_eval: bool = field( + default=False, + metadata={"help": "Whether to use full float16 evaluation instead of 32-bit"}, + ) + tf32: Optional[bool] = field( + default=None, + metadata={ + "help": ( + "Whether to enable tf32 mode, available in Ampere and newer GPU architectures. This is an experimental" + " API and it may change." + ) + }, + ) + local_rank: int = field(default=-1, metadata={"help": "For distributed training: local_rank"}) + ddp_backend: Optional[str] = field( + default=None, + metadata={ + "help": "The backend to be used for distributed training", + "choices": ["nccl", "gloo", "mpi", "ccl", "hccl", "cncl", "mccl"], + }, + ) + tpu_num_cores: Optional[int] = field( + default=None, metadata={"help": "TPU: Number of TPU cores (automatically passed by launcher script)"} + ) + tpu_metrics_debug: bool = field( + default=False, + metadata={ + "help": ( + "Deprecated, the use of `--debug tpu_metrics_debug` is preferred. TPU: Whether to print debug metrics" + ) + }, + ) + debug: Union[str, list[DebugOption]] = field( + default="", + metadata={ + "help": ( + "Whether or not to enable debug mode. Current options: " + "`underflow_overflow` (Detect underflow and overflow in activations and weights), " + "`tpu_metrics_debug` (print debug metrics on TPU)." + ) + }, + ) + + dataloader_drop_last: bool = field( + default=False, metadata={"help": "Drop the last incomplete batch if it is not divisible by the batch size."} + ) + eval_steps: Optional[float] = field( + default=None, + metadata={ + "help": ( + "Run an evaluation every X steps. Should be an integer or a float in range `[0,1)`. " + "If smaller than 1, will be interpreted as ratio of total training steps." + ) + }, + ) + dataloader_num_workers: int = field( + default=0, + metadata={ + "help": ( + "Number of subprocesses to use for data loading (PyTorch only). 0 means that the data will be loaded" + " in the main process." + ) + }, + ) + dataloader_prefetch_factor: Optional[int] = field( + default=None, + metadata={ + "help": ( + "Number of batches loaded in advance by each worker. " + "2 means there will be a total of 2 * num_workers batches prefetched across all workers. " + ) + }, + ) + past_index: int = field( + default=-1, + metadata={"help": "If >=0, uses the corresponding part of the output as the past state for next step."}, + ) + + run_name: Optional[str] = field( + default=None, + metadata={ + "help": "An optional descriptor for the run. Notably used for wandb, mlflow comet and swanlab logging." + }, + ) + disable_tqdm: Optional[bool] = field( + default=None, metadata={"help": "Whether or not to disable the tqdm progress bars."} + ) + + remove_unused_columns: Optional[bool] = field( + default=True, metadata={"help": "Remove columns not required by the model when using an nlp.Dataset."} + ) + label_names: Optional[list[str]] = field( + default=None, metadata={"help": "The list of keys in your dictionary of inputs that correspond to the labels."} + ) + load_best_model_at_end: Optional[bool] = field( + default=False, + metadata={ + "help": ( + "Whether or not to load the best model found during training at the end of training. When this option" + " is enabled, the best checkpoint will always be saved. See `save_total_limit` for more." + ) + }, + ) + metric_for_best_model: Optional[str] = field( + default=None, metadata={"help": "The metric to use to compare two different models."} + ) + greater_is_better: Optional[bool] = field( + default=None, metadata={"help": "Whether the `metric_for_best_model` should be maximized or not."} + ) + ignore_data_skip: bool = field( + default=False, + metadata={ + "help": ( + "When resuming training, whether or not to skip the first epochs and batches to get to the same" + " training data." + ) + }, + ) + fsdp: Optional[Union[list[FSDPOption], str]] = field( + default="", + metadata={ + "help": ( + "Whether or not to use PyTorch Fully Sharded Data Parallel (FSDP) training (in distributed training" + " only). The base option should be `full_shard`, `shard_grad_op` or `no_shard` and you can add" + " CPU-offload to `full_shard` or `shard_grad_op` like this: full_shard offload` or `shard_grad_op" + " offload`. You can add auto-wrap to `full_shard` or `shard_grad_op` with the same syntax: full_shard" + " auto_wrap` or `shard_grad_op auto_wrap`." + ), + }, + ) + fsdp_min_num_params: int = field( + default=0, + metadata={ + "help": ( + "This parameter is deprecated. FSDP's minimum number of parameters for Default Auto Wrapping. (useful" + " only when `fsdp` field is passed)." + ) + }, + ) + fsdp_config: Optional[Union[dict, str]] = field( + default=None, + metadata={ + "help": ( + "Config to be used with FSDP (Pytorch Fully Sharded Data Parallel). The value is either a " + "fsdp json config file (e.g., `fsdp_config.json`) or an already loaded json file as `dict`." + ) + }, + ) + fsdp_transformer_layer_cls_to_wrap: Optional[str] = field( + default=None, + metadata={ + "help": ( + "This parameter is deprecated. Transformer layer class name (case-sensitive) to wrap, e.g," + " `BertLayer`, `GPTJBlock`, `T5Block` .... (useful only when `fsdp` flag is passed)." + ) + }, + ) + accelerator_config: Optional[Union[dict, str]] = field( + default=None, + metadata={ + "help": ( + "Config to be used with the internal Accelerator object initialization. The value is either a " + "accelerator json config file (e.g., `accelerator_config.json`) or an already loaded json file as `dict`." + ) + }, + ) + deepspeed: Optional[Union[dict, str]] = field( + default=None, + metadata={ + "help": ( + "Enable deepspeed and pass the path to deepspeed json config file (e.g. `ds_config.json`) or an already" + " loaded json file as a dict" + ) + }, + ) + label_smoothing_factor: float = field( + default=0.0, metadata={"help": "The label smoothing epsilon to apply (zero means no label smoothing)."} + ) + + default_optim = "adamw_torch" + # XXX: enable when pytorch==2.0.1 comes out - we want to give it time to get all the bugs sorted out + # if is_torch_available(): + # default_optim = "adamw_torch_fused" + # and update the doc above to: + # optim (`str` or [`training_args.OptimizerNames`], *optional*, defaults to `"adamw_torch_fused"` (for torch<2.1.0 `"adamw_torch"`): + optim: Union[OptimizerNames, str] = field( + default=default_optim, + metadata={"help": "The optimizer to use."}, + ) + optim_args: Optional[str] = field(default=None, metadata={"help": "Optional arguments to supply to optimizer."}) + adafactor: bool = field(default=False, metadata={"help": "Whether or not to replace AdamW by Adafactor."}) + group_by_length: bool = field( + default=False, + metadata={"help": "Whether or not to group samples of roughly the same length together when batching."}, + ) + length_column_name: Optional[str] = field( + default="length", + metadata={"help": "Column name with precomputed lengths to use when grouping by length."}, + ) + report_to: Union[None, str, list[str]] = field( + default=None, metadata={"help": "The list of integrations to report the results and logs to."} + ) + ddp_find_unused_parameters: Optional[bool] = field( + default=None, + metadata={ + "help": ( + "When using distributed training, the value of the flag `find_unused_parameters` passed to " + "`DistributedDataParallel`." + ) + }, + ) + ddp_bucket_cap_mb: Optional[int] = field( + default=None, + metadata={ + "help": ( + "When using distributed training, the value of the flag `bucket_cap_mb` passed to " + "`DistributedDataParallel`." + ) + }, + ) + ddp_broadcast_buffers: Optional[bool] = field( + default=None, + metadata={ + "help": ( + "When using distributed training, the value of the flag `broadcast_buffers` passed to " + "`DistributedDataParallel`." + ) + }, + ) + dataloader_pin_memory: bool = field( + default=True, metadata={"help": "Whether or not to pin memory for DataLoader."} + ) + dataloader_persistent_workers: bool = field( + default=False, + metadata={ + "help": "If True, the data loader will not shut down the worker processes after a dataset has been consumed once. This allows to maintain the workers Dataset instances alive. Can potentially speed up training, but will increase RAM usage." + }, + ) + skip_memory_metrics: bool = field( + default=True, metadata={"help": "Whether or not to skip adding of memory profiler reports to metrics."} + ) + use_legacy_prediction_loop: bool = field( + default=False, metadata={"help": "Whether or not to use the legacy prediction_loop in the Trainer."} + ) + push_to_hub: bool = field( + default=False, metadata={"help": "Whether or not to upload the trained model to the model hub after training."} + ) + resume_from_checkpoint: Optional[str] = field( + default=None, + metadata={"help": "The path to a folder with a valid checkpoint for your model."}, + ) + hub_model_id: Optional[str] = field( + default=None, metadata={"help": "The name of the repository to keep in sync with the local `output_dir`."} + ) + hub_strategy: Union[HubStrategy, str] = field( + default="every_save", + metadata={"help": "The hub strategy to use when `--push_to_hub` is activated."}, + ) + hub_token: Optional[str] = field(default=None, metadata={"help": "The token to use to push to the Model Hub."}) + hub_private_repo: Optional[bool] = field( + default=None, + metadata={ + "help": "Whether to make the repo private. If `None` (default), the repo will be public unless the organization's default is private. This value is ignored if the repo already exists." + }, + ) + hub_always_push: bool = field( + default=False, + metadata={"help": "Unless `True`, the Trainer will skip pushes if the previous one wasn't finished yet."}, + ) + gradient_checkpointing: bool = field( + default=False, + metadata={ + "help": "If True, use gradient checkpointing to save memory at the expense of slower backward pass." + }, + ) + gradient_checkpointing_kwargs: Optional[Union[dict, str]] = field( + default=None, + metadata={ + "help": "Gradient checkpointing key word arguments such as `use_reentrant`. Will be passed to `torch.utils.checkpoint.checkpoint` through `model.gradient_checkpointing_enable`." + }, + ) + include_inputs_for_metrics: bool = field( + default=False, + metadata={ + "help": "This argument is deprecated and will be removed in version 5 of 🤗 Transformers. Use `include_for_metrics` instead." + }, + ) + include_for_metrics: list[str] = field( + default_factory=list, + metadata={ + "help": "List of strings to specify additional data to include in the `compute_metrics` function." + "Options: 'inputs', 'loss'." + }, + ) + eval_do_concat_batches: bool = field( + default=True, + metadata={ + "help": "Whether to recursively concat inputs/losses/labels/predictions across batches. If `False`, will instead store them as lists, with each batch kept separate." + }, + ) + # Deprecated arguments + fp16_backend: str = field( + default="auto", + metadata={ + "help": "Deprecated. Use half_precision_backend instead", + "choices": ["auto", "apex", "cpu_amp"], + }, + ) + push_to_hub_model_id: Optional[str] = field( + default=None, metadata={"help": "The name of the repository to which push the `Trainer`."} + ) + push_to_hub_organization: Optional[str] = field( + default=None, metadata={"help": "The name of the organization in with to which push the `Trainer`."} + ) + push_to_hub_token: Optional[str] = field( + default=None, metadata={"help": "The token to use to push to the Model Hub."} + ) + _n_gpu: int = field(init=False, repr=False, default=-1) + mp_parameters: str = field( + default="", + metadata={"help": "Used by the SageMaker launcher to send mp-specific args. Ignored in Trainer"}, + ) + + auto_find_batch_size: bool = field( + default=False, + metadata={ + "help": ( + "Whether to automatically decrease the batch size in half and rerun the training loop again each time" + " a CUDA Out-of-Memory was reached" + ) + }, + ) + full_determinism: bool = field( + default=False, + metadata={ + "help": ( + "Whether to call enable_full_determinism instead of set_seed for reproducibility in distributed" + " training. Important: this will negatively impact the performance, so only use it for debugging." + ) + }, + ) + torchdynamo: Optional[str] = field( + default=None, + metadata={ + "help": "This argument is deprecated, use `--torch_compile_backend` instead.", + }, + ) + ray_scope: Optional[str] = field( + default="last", + metadata={ + "help": ( + 'The scope to use when doing hyperparameter search with Ray. By default, `"last"` will be used. Ray' + " will then use the last checkpoint of all trials, compare those, and select the best one. However," + " other options are also available. See the Ray documentation" + " (https://docs.ray.io/en/latest/tune/api_docs/analysis.html" + "#ray.tune.ExperimentAnalysis.get_best_trial)" + " for more options." + ) + }, + ) + ddp_timeout: Optional[int] = field( + default=1800, + metadata={ + "help": "Overrides the default timeout for distributed training (value should be given in seconds)." + }, + ) + torch_compile: bool = field( + default=False, metadata={"help": "If set to `True`, the model will be wrapped in `torch.compile`."} + ) + torch_compile_backend: Optional[str] = field( + default=None, + metadata={ + "help": "Which backend to use with `torch.compile`, passing one will trigger a model compilation.", + }, + ) + torch_compile_mode: Optional[str] = field( + default=None, + metadata={ + "help": "Which mode to use with `torch.compile`, passing one will trigger a model compilation.", + }, + ) + + include_tokens_per_second: Optional[bool] = field( + default=False, + metadata={"help": "If set to `True`, the speed metrics will include `tgs` (tokens per second per device)."}, + ) + + include_num_input_tokens_seen: Optional[bool] = field( + default=False, + metadata={ + "help": "If set to `True`, will track the number of input tokens seen throughout training. (May be slower in distributed training)" + }, + ) + + neftune_noise_alpha: Optional[float] = field( + default=None, + metadata={ + "help": "Activates neftune noise embeddings into the model. NEFTune has been proven to drastically improve model performances for instruction fine-tuning. Check out the original paper here: https://arxiv.org/abs/2310.05914 and the original code here: https://github.com/neelsjain/NEFTune. Only supported for `PreTrainedModel` and `PeftModel` classes." + }, + ) + + optim_target_modules: Union[None, str, list[str]] = field( + default=None, + metadata={ + "help": "Target modules for the optimizer defined in the `optim` argument. Only used for the GaLore optimizer at the moment." + }, + ) + + batch_eval_metrics: bool = field( + default=False, + metadata={"help": "Break eval metrics calculation into batches to save memory."}, + ) + + eval_on_start: bool = field( + default=False, + metadata={ + "help": "Whether to run through the entire `evaluation` step at the very beginning of training as a sanity check." + }, + ) + + use_liger_kernel: Optional[bool] = field( + default=False, + metadata={"help": "Whether or not to enable the Liger Kernel for model training."}, + ) + + eval_use_gather_object: Optional[bool] = field( + default=False, + metadata={ + "help": "Whether to run recursively gather object in a nested list/tuple/dictionary of objects from all devices." + }, + ) + + average_tokens_across_devices: Optional[bool] = field( + default=False, + metadata={ + "help": "Whether or not to average tokens across devices. If enabled, will use all_reduce to " + "synchronize num_tokens_in_batch for precise loss calculation. Reference: " + "https://github.com/huggingface/transformers/issues/34242" + }, + ) + + def __post_init__(self): + # Set default output_dir if not provided + if self.output_dir is None: + self.output_dir = "trainer_output" + logger.info( + "No output directory specified, defaulting to 'trainer_output'. " + "To change this behavior, specify --output_dir when creating TrainingArguments." + ) + + # Parse in args that could be `dict` sent in from the CLI as a string + for field in self._VALID_DICT_FIELDS: + passed_value = getattr(self, field) + # We only want to do this if the str starts with a bracket to indicate a `dict` + # else its likely a filename if supported + if isinstance(passed_value, str) and passed_value.startswith("{"): + loaded_dict = json.loads(passed_value) + # Convert str values to types if applicable + loaded_dict = _convert_str_dict(loaded_dict) + setattr(self, field, loaded_dict) + + # expand paths, if not os.makedirs("~/bar") will make directory + # in the current directory instead of the actual home + # see https://github.com/huggingface/transformers/issues/10628 + if self.output_dir is not None: + self.output_dir = os.path.expanduser(self.output_dir) + if self.logging_dir is None and self.output_dir is not None: + self.logging_dir = os.path.join(self.output_dir, default_logdir()) + if self.logging_dir is not None: + self.logging_dir = os.path.expanduser(self.logging_dir) + + if self.disable_tqdm is None: + self.disable_tqdm = logger.getEffectiveLevel() > logging.WARN + + if isinstance(self.eval_strategy, EvaluationStrategy): + warnings.warn( + "using `EvaluationStrategy` for `eval_strategy` is deprecated and will be removed in version 5" + " of 🤗 Transformers. Use `IntervalStrategy` instead", + FutureWarning, + ) + # Go back to the underlying string or we won't be able to instantiate `IntervalStrategy` on it. + self.eval_strategy = self.eval_strategy.value + if self.no_cuda: + warnings.warn( + "using `no_cuda` is deprecated and will be removed in version 5.0 of 🤗 Transformers. " + "Use `use_cpu` instead", + FutureWarning, + ) + self.use_cpu = self.no_cuda + + self.eval_strategy = IntervalStrategy(self.eval_strategy) + self.logging_strategy = IntervalStrategy(self.logging_strategy) + self.save_strategy = SaveStrategy(self.save_strategy) + self.hub_strategy = HubStrategy(self.hub_strategy) + + self.lr_scheduler_type = SchedulerType(self.lr_scheduler_type) + if self.do_eval is False and self.eval_strategy != IntervalStrategy.NO: + self.do_eval = True + + if self.torch_empty_cache_steps is not None: + if not (isinstance(self.torch_empty_cache_steps, int) and self.torch_empty_cache_steps > 0): + raise ValueError( + f"`torch_empty_cache_steps` must be an integer bigger than 0, got {self.torch_empty_cache_steps}." + ) + + # eval_steps has to be defined and non-zero, fallbacks to logging_steps if the latter is non-zero + if self.eval_strategy == IntervalStrategy.STEPS and (self.eval_steps is None or self.eval_steps == 0): + if self.logging_steps > 0: + logger.info(f"using `logging_steps` to initialize `eval_steps` to {self.logging_steps}") + self.eval_steps = self.logging_steps + else: + raise ValueError( + f"evaluation strategy {self.eval_strategy} requires either non-zero --eval_steps or" + " --logging_steps" + ) + + # logging_steps must be non-zero for logging_strategy that is other than 'no' + if self.logging_strategy == IntervalStrategy.STEPS and self.logging_steps == 0: + raise ValueError(f"logging strategy {self.logging_strategy} requires non-zero --logging_steps") + + if self.logging_strategy == IntervalStrategy.STEPS and self.logging_steps > 1: + if self.logging_steps != int(self.logging_steps): + raise ValueError(f"--logging_steps must be an integer if bigger than 1: {self.logging_steps}") + self.logging_steps = int(self.logging_steps) + if self.eval_strategy == IntervalStrategy.STEPS and self.eval_steps > 1: + if self.eval_steps != int(self.eval_steps): + raise ValueError(f"--eval_steps must be an integer if bigger than 1: {self.eval_steps}") + self.eval_steps = int(self.eval_steps) + if self.save_strategy == SaveStrategy.STEPS and self.save_steps > 1: + if self.save_steps != int(self.save_steps): + raise ValueError(f"--save_steps must be an integer if bigger than 1: {self.save_steps}") + self.save_steps = int(self.save_steps) + + # Sanity checks for load_best_model_at_end: we require save and eval strategies to be compatible. + if self.load_best_model_at_end and self.save_strategy != SaveStrategy.BEST: + if self.eval_strategy != self.save_strategy: + raise ValueError( + "--load_best_model_at_end requires the save and eval strategy to match, but found\n- Evaluation " + f"strategy: {self.eval_strategy}\n- Save strategy: {self.save_strategy}" + ) + if self.eval_strategy == IntervalStrategy.STEPS and self.save_steps % self.eval_steps != 0: + if self.eval_steps < 1 or self.save_steps < 1: + if not (self.eval_steps < 1 and self.save_steps < 1): + raise ValueError( + "--load_best_model_at_end requires the saving steps to be a multiple of the evaluation " + "steps, which cannot get guaranteed when mixing ratio and absolute steps for save_steps " + f"{self.save_steps} and eval_steps {self.eval_steps}." + ) + # Work around floating point precision issues + LARGE_MULTIPLIER = 1_000_000 + if (self.save_steps * LARGE_MULTIPLIER) % (self.eval_steps * LARGE_MULTIPLIER) != 0: + raise ValueError( + "--load_best_model_at_end requires the saving steps to be a multiple of the evaluation " + f"steps, but found {self.save_steps}, which is not a multiple of {self.eval_steps}." + ) + raise ValueError( + "--load_best_model_at_end requires the saving steps to be a round multiple of the evaluation " + f"steps, but found {self.save_steps}, which is not a round multiple of {self.eval_steps}." + ) + + safetensors_available = is_safetensors_available() + if self.save_safetensors and not safetensors_available: + raise ValueError(f"--save_safetensors={self.save_safetensors} requires safetensors to be installed!") + if not self.save_safetensors and safetensors_available: + logger.info( + f"Found safetensors installation, but --save_safetensors={self.save_safetensors}. " + f"Safetensors should be a preferred weights saving format due to security and performance reasons. " + f"If your model cannot be saved by safetensors please feel free to open an issue at " + f"https://github.com/huggingface/safetensors!" + ) + + if ( + self.load_best_model_at_end or self.lr_scheduler_type == SchedulerType.REDUCE_ON_PLATEAU + ) and self.metric_for_best_model is None: + self.metric_for_best_model = "loss" + if self.greater_is_better is None and self.metric_for_best_model is not None: + self.greater_is_better = not (self.metric_for_best_model.endswith("loss")) + if self.run_name is None: + self.run_name = self.output_dir + if self.framework == "pt" and is_torch_available(): + if self.fp16_backend and self.fp16_backend != "auto": + warnings.warn( + "`fp16_backend` is deprecated and will be removed in version 5 of 🤗 Transformers. Use" + " `half_precision_backend` instead", + FutureWarning, + ) + self.half_precision_backend = self.fp16_backend + + if self.bf16 or self.bf16_full_eval: + if self.use_cpu and not is_torch_available() and not is_torch_xla_available(): + # cpu + raise ValueError("Your setup doesn't support bf16/(cpu, tpu, neuroncore). You need torch>=1.10") + elif not self.use_cpu: + if not is_torch_bf16_gpu_available(): + error_message = "Your setup doesn't support bf16/gpu." + if is_torch_cuda_available(): + error_message += " You need Ampere+ GPU with cuda>=11.0" + # gpu + raise ValueError(error_message) + + if self.fp16 and self.bf16: + raise ValueError("At most one of fp16 and bf16 can be True, but not both") + + if self.fp16_full_eval and self.bf16_full_eval: + raise ValueError("At most one of fp16 and bf16 can be True for full eval, but not both") + + if self.bf16: + if self.half_precision_backend == "apex": + raise ValueError(" `--half_precision_backend apex`: GPU bf16 is not supported by apex.") + + if self.lr_scheduler_type == SchedulerType.REDUCE_ON_PLATEAU: + if self.eval_strategy == IntervalStrategy.NO: + raise ValueError("lr_scheduler_type reduce_lr_on_plateau requires an eval strategy") + if not is_torch_available(): + raise ValueError("lr_scheduler_type reduce_lr_on_plateau requires torch>=0.2.0") + + self.optim = OptimizerNames(self.optim) + if self.adafactor: + warnings.warn( + "`--adafactor` is deprecated and will be removed in version 5 of 🤗 Transformers. Use `--optim" + " adafactor` instead", + FutureWarning, + ) + self.optim = OptimizerNames.ADAFACTOR + + # We need to setup the accelerator config here *before* the first call to `self.device` + if is_accelerate_available(): + if not isinstance(self.accelerator_config, AcceleratorConfig): + if self.accelerator_config is None: + self.accelerator_config = AcceleratorConfig() + elif isinstance(self.accelerator_config, dict): + self.accelerator_config = AcceleratorConfig(**self.accelerator_config) + # Check that a user didn't pass in the class instantiator + # such as `accelerator_config = AcceleratorConfig` + elif isinstance(self.accelerator_config, type): + raise NotImplementedError( + "Tried passing in a callable to `accelerator_config`, but this is not supported. " + "Please pass in a fully constructed `AcceleratorConfig` object instead." + ) + else: + self.accelerator_config = AcceleratorConfig.from_json_file(self.accelerator_config) + + # Initialize device before we proceed + if self.framework == "pt" and is_torch_available(): + self.device + + # Disable average tokens when using single device + if self.average_tokens_across_devices: + try: + if self.world_size == 1: + logger.warning( + "average_tokens_across_devices is set to True but it is invalid when world size is" + "1. Turn it to False automatically." + ) + self.average_tokens_across_devices = False + except ImportError as e: + logger.warning(f"Can not specify world size due to {e}. Turn average_tokens_across_devices to False.") + self.average_tokens_across_devices = False + + if self.torchdynamo is not None: + warnings.warn( + "`torchdynamo` is deprecated and will be removed in version 5 of 🤗 Transformers. Use" + " `torch_compile_backend` instead", + FutureWarning, + ) + self.torch_compile_backend = self.torchdynamo + if (self.torch_compile_mode is not None or self.torch_compile_backend is not None) and not self.torch_compile: + self.torch_compile = True + if self.torch_compile and self.torch_compile_backend is None: + if not self.use_cpu and is_torch_hpu_available(): + self.torch_compile_backend = "hpu_backend" + else: + self.torch_compile_backend = "inductor" + + # accelerate integration for torch compile + if self.torch_compile: + # set env vars for accelerate + prefix = "ACCELERATE_DYNAMO_" + os.environ[prefix + "BACKEND"] = self.torch_compile_backend + if self.torch_compile_mode is not None: + os.environ[prefix + "MODE"] = self.torch_compile_mode + + if self.framework == "pt" and is_torch_available() and self.torch_compile: + if is_torch_tf32_available(): + if self.tf32 is None and not self.fp16 or self.bf16: + logger.info( + "Setting TF32 in CUDA backends to speedup torch compile, you won't see any improvement" + " otherwise." + ) + torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.allow_tf32 = True + else: + logger.warning( + "The speedups for torchdynamo mostly come with GPU Ampere or higher and which is not detected here." + ) + if self.framework == "pt" and is_torch_available() and self.tf32 is not None: + if self.tf32: + if is_torch_tf32_available(): + torch.backends.cuda.matmul.allow_tf32 = True + torch.backends.cudnn.allow_tf32 = True + else: + raise ValueError("--tf32 requires Ampere or a newer GPU arch, cuda>=11 and torch>=1.7") + else: + if is_torch_tf32_available(): + torch.backends.cuda.matmul.allow_tf32 = False + torch.backends.cudnn.allow_tf32 = False + # no need to assert on else + + # if training args is specified, it will override the one specified in the accelerate config + if self.half_precision_backend != "apex": + mixed_precision_dtype = os.environ.get("ACCELERATE_MIXED_PRECISION", "no") + if self.fp16: + mixed_precision_dtype = "fp16" + elif self.bf16: + mixed_precision_dtype = "bf16" + os.environ["ACCELERATE_MIXED_PRECISION"] = mixed_precision_dtype + + if self.report_to is None: + logger.info( + "The default value for the training argument `--report_to` will change in v5 (from all installed " + "integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as " + "now. You should start updating your code and make this info disappear :-)." + ) + self.report_to = "all" + if self.report_to == "all" or self.report_to == ["all"]: + # Import at runtime to avoid a circular import. + from .integrations import get_available_reporting_integrations + + self.report_to = get_available_reporting_integrations() + + if "codecarbon" in self.report_to and torch.version.hip: + logger.warning( + "When using the Trainer, CodeCarbonCallback requires the `codecarbon` package, which is not compatible with AMD ROCm (https://github.com/mlco2/codecarbon/pull/490). Automatically disabling the codecarbon callback. Reference: https://huggingface.co/docs/transformers/v4.39.3/en/main_classes/trainer#transformers.TrainingArguments.report_to." + ) + self.report_to.remove("codecarbon") + + elif self.report_to == "none" or self.report_to == ["none"]: + self.report_to = [] + elif not isinstance(self.report_to, list): + self.report_to = [self.report_to] + + if self.warmup_ratio < 0 or self.warmup_ratio > 1: + raise ValueError("warmup_ratio must lie in range [0,1]") + elif self.warmup_ratio > 0 and self.warmup_steps > 0: + logger.info( + "Both warmup_ratio and warmup_steps given, warmup_steps will override any effect of warmup_ratio" + " during training" + ) + + if not isinstance(self.warmup_steps, int) or self.warmup_steps < 0: + raise ValueError("warmup_steps must be of type int and must be 0 or a positive integer.") + + if isinstance(self.fsdp, bool): + self.fsdp = [FSDPOption.FULL_SHARD] if self.fsdp else "" + if isinstance(self.fsdp, str): + self.fsdp = [FSDPOption(s) for s in self.fsdp.split()] + if self.fsdp == [FSDPOption.OFFLOAD]: + raise ValueError( + "`--fsdp offload` can't work on its own. It needs to be added to `--fsdp full_shard` or " + '`--fsdp shard_grad_op`. For example, `--fsdp "full_shard offload"`.' + ) + elif FSDPOption.FULL_SHARD in self.fsdp and FSDPOption.SHARD_GRAD_OP in self.fsdp: + raise ValueError("`--fsdp full_shard` is not compatible with `--fsdp shard_grad_op`.") + + if self.gradient_checkpointing and ( + FSDPOption.FULL_SHARD in self.fsdp or FSDPOption.HYBRID_SHARD in self.fsdp + ): + logger.warning( + "When using FSDP full shard, instead of using `gradient_checkpointing` in TrainingArguments, please" + " use `activation_checkpointing` in `fsdp_config`. The former introduces a redundant AllGather" + " operation in backward pass. Reference: https://github.com/huggingface/transformers/issues/30404" + ) + + if self.fsdp_config is None: + self.fsdp_config = {} + + if isinstance(self.fsdp_config, str): + if len(self.fsdp) == 0: + warnings.warn("`--fsdp_config` is useful only when `--fsdp` is specified.") + with open(self.fsdp_config, encoding="utf-8") as f: + self.fsdp_config = json.load(f) + for k in list(self.fsdp_config.keys()): + if k.startswith("fsdp_"): + v = self.fsdp_config.pop(k) + self.fsdp_config[k[5:]] = v + + if self.fsdp_min_num_params > 0: + warnings.warn("using `--fsdp_min_num_params` is deprecated. Use fsdp_config instead ", FutureWarning) + + self.fsdp_config["min_num_params"] = max(self.fsdp_config.get("min_num_params", 0), self.fsdp_min_num_params) + + # if fsdp_config["transformer_layer_cls_to_wrap"] is specified as a string, convert it to a list with a single object + if isinstance(self.fsdp_config.get("transformer_layer_cls_to_wrap", None), str): + self.fsdp_config["transformer_layer_cls_to_wrap"] = [self.fsdp_config["transformer_layer_cls_to_wrap"]] + + if self.fsdp_transformer_layer_cls_to_wrap is not None: + warnings.warn( + "using `--fsdp_transformer_layer_cls_to_wrap` is deprecated. Use fsdp_config instead ", FutureWarning + ) + self.fsdp_config["transformer_layer_cls_to_wrap"] = self.fsdp_config.get( + "transformer_layer_cls_to_wrap", [] + ) + [self.fsdp_transformer_layer_cls_to_wrap] + + if len(self.fsdp) == 0 and self.fsdp_config["min_num_params"] > 0: + warnings.warn("`min_num_params` is useful only when `--fsdp` is specified.") + + if len(self.fsdp) == 0 and self.fsdp_config.get("transformer_layer_cls_to_wrap", None) is not None: + warnings.warn("`transformer_layer_cls_to_wrap` is useful only when `--fsdp` is specified.") + + if ( + len(self.fsdp) > 0 + and self.fsdp_config["min_num_params"] > 0 + and self.fsdp_config.get("transformer_layer_cls_to_wrap", None) is not None + ): + raise ValueError("`min_num_params` and `transformer_layer_cls_to_wrap` are mutually exclusive.") + self.fsdp_config["xla"] = self.fsdp_config.get("xla", False) + self.fsdp_config["xla_fsdp_v2"] = self.fsdp_config.get("xla_fsdp_v2", False) + self.fsdp_config["xla_fsdp_grad_ckpt"] = self.fsdp_config.get("xla_fsdp_grad_ckpt", False) + if self.fsdp_config["xla"]: + if len(self.fsdp) > 0: + # store XLA fsdp configuration parameters into a dictionary + # Copy the config to avoid modifying the original config (which may be used for JSON serialization) + self.xla_fsdp_config = self.fsdp_config.get("xla_fsdp_settings", {}).copy() + # apply appropriate string to torch.dtype conversions for parameters + if "compute_dtype" in self.xla_fsdp_config: + self.xla_fsdp_config["compute_dtype"] = getattr(torch, self.xla_fsdp_config["compute_dtype"]) + if "buffer_dtype" in self.xla_fsdp_config: + self.xla_fsdp_config["buffer_dtype"] = getattr(torch, self.xla_fsdp_config["buffer_dtype"]) + else: + warnings.warn("XLA FSDP can be used only when `--fsdp` is specified.") + else: + if self.fsdp_config["xla_fsdp_grad_ckpt"]: + warnings.warn("`--xla_fsdp_grad_ckpt` is useful only when `--xla` is set to true.") + + # accelerate integration for FSDP + if len(self.fsdp) > 0 and not self.fsdp_config["xla"]: + os.environ["ACCELERATE_USE_FSDP"] = "true" + from accelerate.utils.constants import ( + FSDP_AUTO_WRAP_POLICY, + FSDP_SHARDING_STRATEGY, + ) + + prefix = "FSDP_" + for fsdp_option in self.fsdp: + if fsdp_option.upper() in FSDP_SHARDING_STRATEGY: + # set environment variable for FSDP sharding strategy + os.environ[f"{prefix}SHARDING_STRATEGY"] = str( + FSDP_SHARDING_STRATEGY.index(fsdp_option.upper()) + 1 + ) + elif fsdp_option == FSDPOption.OFFLOAD: + os.environ[f"{prefix}OFFLOAD_PARAMS"] = "true" + elif fsdp_option == FSDPOption.AUTO_WRAP: + os.environ[f"{prefix}AUTO_WRAP_POLICY"] = FSDP_AUTO_WRAP_POLICY[0] + if self.fsdp_config["min_num_params"] > 0: + os.environ[f"{prefix}MIN_NUM_PARAMS"] = str(self.fsdp_config["min_num_params"]) + os.environ[f"{prefix}AUTO_WRAP_POLICY"] = FSDP_AUTO_WRAP_POLICY[1] + elif self.fsdp_config.get("transformer_layer_cls_to_wrap", None) is not None: + os.environ[f"{prefix}TRANSFORMER_CLS_TO_WRAP"] = ",".join( + self.fsdp_config["transformer_layer_cls_to_wrap"] + ) + prefetch_policy = self.fsdp_config.get("backward_prefetch", "NO_PREFETCH") + os.environ[f"{prefix}BACKWARD_PREFETCH"] = prefetch_policy.upper() + os.environ[f"{prefix}FORWARD_PREFETCH"] = str(self.fsdp_config.get("forward_prefetch", "false")).lower() + + sync_module_states = str(self.fsdp_config.get("sync_module_states", "true")).lower() + cpu_ram_efficient_loading = str(self.fsdp_config.get("cpu_ram_efficient_loading", "false")).lower() + + if sync_module_states == "false" and cpu_ram_efficient_loading == "true": + # In this case, all the processes except the main process would have random weights leading + # to unexpected behaviour during training, thus throwing error here to prevent it. + raise ValueError('`sync_module_states` must be `"True"` if `cpu_ram_efficient_loading` is `"True"`') + + os.environ[f"{prefix}SYNC_MODULE_STATES"] = sync_module_states + os.environ[f"{prefix}CPU_RAM_EFFICIENT_LOADING"] = cpu_ram_efficient_loading + + os.environ[f"{prefix}USE_ORIG_PARAMS"] = str(self.fsdp_config.get("use_orig_params", "true")).lower() + + if self.tpu_metrics_debug: + warnings.warn( + "using `--tpu_metrics_debug` is deprecated and will be removed in version 5 of 🤗 Transformers. Use" + " `--debug tpu_metrics_debug` instead", + FutureWarning, + ) + if self.debug is None: + self.debug = " tpu_metrics_debug" + else: + self.debug += " tpu_metrics_debug" + self.tpu_metrics_debug = False + + if isinstance(self.debug, str): + self.debug = [DebugOption(s) for s in self.debug.split()] + elif self.debug is None: + self.debug = [] + + self.deepspeed_plugin = None + if self.deepspeed: + # - must be run very last in arg parsing, since it will use a lot of these settings. + # - must be run before the model is created. + if not is_accelerate_available(): + raise ValueError( + f"--deepspeed requires Accelerate to be installed: `pip install 'accelerate>={ACCELERATE_MIN_VERSION}'`." + ) + from transformers.integrations.deepspeed import HfTrainerDeepSpeedConfig + + # will be used later by the Trainer + # note: leave self.deepspeed unmodified in case a user relies on it not to be modified) + self.hf_deepspeed_config = HfTrainerDeepSpeedConfig(self.deepspeed) + self.hf_deepspeed_config.trainer_config_process(self) + + # Accelerate DeepSpeed Plugin + from accelerate.utils import DeepSpeedPlugin + + os.environ["ACCELERATE_USE_DEEPSPEED"] = "true" + self.deepspeed_plugin = DeepSpeedPlugin(hf_ds_config=self.hf_deepspeed_config) + elif strtobool(os.environ.get("ACCELERATE_USE_DEEPSPEED", "false")): + # Accelerate DeepSpeed Plugin + from accelerate.utils import DeepSpeedPlugin + + self.deepspeed_plugin = DeepSpeedPlugin() + mixed_precision = os.environ.get("ACCELERATE_MIXED_PRECISION", "no") + self.deepspeed_plugin.set_mixed_precision(mixed_precision) + self.deepspeed_plugin.set_deepspeed_weakref() + + if self.use_cpu: + self.dataloader_pin_memory = False + + if self.dataloader_num_workers == 0 and self.dataloader_prefetch_factor is not None: + raise ValueError( + "--dataloader_prefetch_factor can only be set when data is loaded in a different process, i.e." + " when --dataloader_num_workers > 1." + ) + + if self.push_to_hub_token is not None: + warnings.warn( + "`--push_to_hub_token` is deprecated and will be removed in version 5 of 🤗 Transformers. Use " + "`--hub_token` instead.", + FutureWarning, + ) + self.hub_token = self.push_to_hub_token + + if self.push_to_hub_model_id is not None: + self.hub_model_id = get_full_repo_name( + self.push_to_hub_model_id, organization=self.push_to_hub_organization, token=self.hub_token + ) + if self.push_to_hub_organization is not None: + warnings.warn( + "`--push_to_hub_model_id` and `--push_to_hub_organization` are deprecated and will be removed in " + "version 5 of 🤗 Transformers. Use `--hub_model_id` instead and pass the full repo name to this " + f"argument (in this case {self.hub_model_id}).", + FutureWarning, + ) + else: + warnings.warn( + "`--push_to_hub_model_id` is deprecated and will be removed in version 5 of 🤗 Transformers. Use " + "`--hub_model_id` instead and pass the full repo name to this argument (in this case " + f"{self.hub_model_id}).", + FutureWarning, + ) + elif self.push_to_hub_organization is not None: + self.hub_model_id = f"{self.push_to_hub_organization}/{Path(self.output_dir).name}" + warnings.warn( + "`--push_to_hub_organization` is deprecated and will be removed in version 5 of 🤗 Transformers. Use " + "`--hub_model_id` instead and pass the full repo name to this argument (in this case " + f"{self.hub_model_id}).", + FutureWarning, + ) + + if self.eval_use_gather_object and not is_accelerate_available("0.30.0"): + raise ValueError( + "--eval_use_gather_object requires Accelerate to be version of `accelerate` > 0.30.0." + "This is not supported and we recommend you to update your version." + ) + + if self.data_seed is not None: + if not is_accelerate_available("1.1.0"): + raise NotImplementedError( + "data_seed requires Accelerate version `accelerate` >= 1.1.0. " + "This is not supported and we recommend you to update your version." + ) + + if self.include_inputs_for_metrics: + logger.warning( + "Using `include_inputs_for_metrics` is deprecated and will be removed in version 5 of 🤗 Transformers. Please use `include_for_metrics` list argument instead." + ) + self.include_for_metrics.append("inputs") + + def __str__(self): + self_as_dict = asdict(self) + + # Remove deprecated arguments. That code should be removed once + # those deprecated arguments are removed from TrainingArguments. (TODO: v5) + del self_as_dict["per_gpu_train_batch_size"] + del self_as_dict["per_gpu_eval_batch_size"] + + self_as_dict = {k: f"<{k.upper()}>" if k.endswith("_token") else v for k, v in self_as_dict.items()} + + attrs_as_str = [f"{k}={v},\n" for k, v in sorted(self_as_dict.items())] + return f"{self.__class__.__name__}(\n{''.join(attrs_as_str)})" + + __repr__ = __str__ + + @property + def train_batch_size(self) -> int: + """ + The actual batch size for training (may differ from `per_gpu_train_batch_size` in distributed training). + """ + if self.per_gpu_train_batch_size: + logger.warning( + "Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future " + "version. Using `--per_device_train_batch_size` is preferred." + ) + per_device_batch_size = self.per_gpu_train_batch_size or self.per_device_train_batch_size + train_batch_size = per_device_batch_size * max(1, self.n_gpu) + return train_batch_size + + @property + def eval_batch_size(self) -> int: + """ + The actual batch size for evaluation (may differ from `per_gpu_eval_batch_size` in distributed training). + """ + if self.per_gpu_eval_batch_size: + logger.warning( + "Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future " + "version. Using `--per_device_eval_batch_size` is preferred." + ) + per_device_batch_size = self.per_gpu_eval_batch_size or self.per_device_eval_batch_size + eval_batch_size = per_device_batch_size * max(1, self.n_gpu) + return eval_batch_size + + @property + def ddp_timeout_delta(self) -> timedelta: + """ + The actual timeout for torch.distributed.init_process_group since it expects a timedelta variable. + """ + return timedelta(seconds=self.ddp_timeout) + + @cached_property + def _setup_devices(self) -> "torch.device": + requires_backends(self, ["torch"]) + logger.info("PyTorch: setting up devices") + if not is_sagemaker_mp_enabled(): + if not is_accelerate_available(): + raise ImportError( + f"Using the `Trainer` with `PyTorch` requires `accelerate>={ACCELERATE_MIN_VERSION}`: " + f"Please run `pip install transformers[torch]` or `pip install 'accelerate>={ACCELERATE_MIN_VERSION}'`" + ) + # We delay the init of `PartialState` to the end for clarity + accelerator_state_kwargs = {"enabled": True, "use_configured_state": False} + if isinstance(self.accelerator_config, AcceleratorConfig): + accelerator_state_kwargs["use_configured_state"] = self.accelerator_config.pop( + "use_configured_state", False + ) + if accelerator_state_kwargs["use_configured_state"]: + if PartialState._shared_state == {}: + raise ValueError( + "Passing `'use_configured_state':True` to the AcceleratorConfig requires a pre-configured " + "`AcceleratorState` or `PartialState` to be defined before calling `TrainingArguments`. " + ) + # We rely on `PartialState` to yell if there's issues here (which it will) + self.distributed_state = PartialState(cpu=self.use_cpu) + if self.deepspeed and self.distributed_state.distributed_type != DistributedType.DEEPSPEED: + raise RuntimeError( + "Tried to use an already configured `Accelerator` or `PartialState` that was not initialized for DeepSpeed, " + "but also passed in a `deepspeed` configuration to the `TrainingArguments`. Please set " + "`use_configured_state:False` instead or setup your `Accelerator` or `PartialState` properly." + ) + else: + AcceleratorState._reset_state(reset_partial_state=True) + self.distributed_state = None + if not self.use_ipex and "ACCELERATE_USE_IPEX" not in os.environ: + os.environ["ACCELERATE_USE_IPEX"] = "false" + + self._n_gpu = 1 + if self.use_cpu or strtobool(os.environ.get("ACCELERATE_USE_CPU", "False")): + accelerator_state_kwargs["cpu"] = True + accelerator_state_kwargs["backend"] = self.ddp_backend + self._n_gpu = 0 + elif is_sagemaker_mp_enabled(): + accelerator_state_kwargs["enabled"] = False + local_rank = smp.local_rank() + device = torch.device("cuda", local_rank) + torch.cuda.set_device(device) + elif is_sagemaker_dp_enabled(): + accelerator_state_kwargs["_use_sagemaker_dp"] = True + elif self.deepspeed: + accelerator_state_kwargs["use_deepspeed"] = True + accelerator_state_kwargs["timeout"] = timedelta(seconds=self.ddp_timeout) + else: + accelerator_state_kwargs["backend"] = self.ddp_backend + accelerator_state_kwargs["timeout"] = timedelta(seconds=self.ddp_timeout) + + # Now we pop everything + if accelerator_state_kwargs.pop("enabled", False) and not accelerator_state_kwargs.pop( + "use_configured_state", False + ): + # We need to patch this env var when enabling to detect deepspeed + use_deepspeed = accelerator_state_kwargs.pop("use_deepspeed", False) + if use_deepspeed: + os.environ["ACCELERATE_USE_DEEPSPEED"] = "true" + self.distributed_state = PartialState(**accelerator_state_kwargs) + if use_deepspeed: + del os.environ["ACCELERATE_USE_DEEPSPEED"] + if not is_sagemaker_mp_enabled(): + device = self.distributed_state.device + self.local_rank = self.distributed_state.local_process_index + if dist.is_available() and dist.is_initialized() and self.parallel_mode != ParallelMode.DISTRIBUTED: + logger.warning( + "torch.distributed process group is initialized, but parallel_mode != ParallelMode.DISTRIBUTED. " + "In order to use Torch DDP, launch your script with `python -m torch.distributed.launch" + ) + if is_torch_xla_available(): + device = self.distributed_state.device + self._n_gpu = 0 + elif is_sagemaker_dp_enabled() or is_sagemaker_mp_enabled(): + # Already set _n_gpu + pass + elif self.distributed_state.distributed_type == DistributedType.NO: + if self.use_mps_device: + warnings.warn( + "`use_mps_device` is deprecated and will be removed in version 5.0 of 🤗 Transformers. " + "`mps` device will be used by default if available similar to the way `cuda` device is used." + "Therefore, no action from user is required. " + ) + if device.type != "mps": + raise ValueError( + "Either you do not have an MPS-enabled device on this machine or MacOS version is not 12.3+ " + "or current PyTorch install was not built with MPS enabled." + ) + if self.use_cpu: + device = torch.device("cpu") + elif is_torch_mps_available(): + device = torch.device("mps") + elif is_torch_xpu_available(): + if not is_ipex_available() and not is_accelerate_available("0.32.0.dev"): + raise ImportError("Using the XPU PyTorch backend requires `accelerate>=0.32.0.dev`") + device = torch.device("xpu:0") + torch.xpu.set_device(device) + elif is_torch_mlu_available(): + device = torch.device("mlu:0") + torch.mlu.set_device(device) + elif is_torch_musa_available(): + device = torch.device("musa:0") + torch.musa.set_device(device) + elif is_torch_npu_available(): + device = torch.device("npu:0") + torch.npu.set_device(device) + elif is_torch_hpu_available(): + device = torch.device("hpu:0") + torch.hpu.set_device(device) + else: + # if n_gpu is > 1 we'll use nn.DataParallel. + # If you only want to use a specific subset of GPUs use `CUDA_VISIBLE_DEVICES=0` + # Explicitly set CUDA to the first (index 0) CUDA device, otherwise `set_device` will + # trigger an error that a device index is missing. Index 0 takes into account the + # GPUs available in the environment, so `CUDA_VISIBLE_DEVICES=1,2` with `cuda:0` + # will use the first GPU in that env, i.e. GPU#1 + device = torch.device( + "cuda:0" if torch.cuda.is_available() else os.environ.get("ACCELERATE_TORCH_DEVICE", "cpu") + ) + # Sometimes the line in the postinit has not been run before we end up here, so just checking we're not at + # the default value. + self._n_gpu = torch.cuda.device_count() + if device.type == "cuda": + torch.cuda.set_device(device) + return device + + @property + def device(self) -> "torch.device": + """ + The device used by this process. + """ + requires_backends(self, ["torch"]) + return self._setup_devices + + @property + def n_gpu(self): + """ + The number of GPUs used by this process. + + Note: + This will only be greater than one when you have multiple GPUs available but are not using distributed + training. For distributed training, it will always be 1. + """ + requires_backends(self, ["torch"]) + # Make sure `self._n_gpu` is properly setup. + if not hasattr(self, "_n_gpu"): + _ = self._setup_devices + return self._n_gpu + + @property + def parallel_mode(self): + """ + The current mode used for parallelism if multiple GPUs/TPU cores are available. One of: + + - `ParallelMode.NOT_PARALLEL`: no parallelism (CPU or one GPU). + - `ParallelMode.NOT_DISTRIBUTED`: several GPUs in one single process (uses `torch.nn.DataParallel`). + - `ParallelMode.DISTRIBUTED`: several GPUs, each having its own process (uses + `torch.nn.DistributedDataParallel`). + - `ParallelMode.TPU`: several TPU cores. + """ + requires_backends(self, ["torch"]) + if is_torch_xla_available(): + return ParallelMode.TPU + elif is_sagemaker_mp_enabled(): + return ParallelMode.SAGEMAKER_MODEL_PARALLEL + elif is_sagemaker_dp_enabled(): + return ParallelMode.SAGEMAKER_DATA_PARALLEL + elif ( + self.distributed_state is not None and self.distributed_state.distributed_type != DistributedType.NO + ) or (self.distributed_state is None and self.local_rank != -1): + return ParallelMode.DISTRIBUTED + elif self.n_gpu > 1: + return ParallelMode.NOT_DISTRIBUTED + else: + return ParallelMode.NOT_PARALLEL + + @property + def world_size(self): + """ + The number of processes used in parallel. + """ + requires_backends(self, ["torch"]) + if self.distributed_state is not None: + return self.distributed_state.num_processes + elif is_sagemaker_mp_enabled(): + return smp.dp_size() if not smp.state.cfg.prescaled_batch else smp.rdp_size() + return 1 + + @property + def process_index(self): + """ + The index of the current process used. + """ + requires_backends(self, ["torch"]) + if self.distributed_state is not None: + return self.distributed_state.process_index + elif is_sagemaker_mp_enabled(): + return smp.dp_rank() if not smp.state.cfg.prescaled_batch else smp.rdp_rank() + return 0 + + @property + def local_process_index(self): + """ + The index of the local process used. + """ + requires_backends(self, ["torch"]) + + if self.distributed_state is not None: + return self.distributed_state.local_process_index + elif is_sagemaker_mp_enabled(): + return smp.local_rank() + return 0 + + @property + def should_log(self): + """ + Whether or not the current process should produce log. + """ + if self.log_on_each_node: + return self.local_process_index == 0 + else: + if is_sagemaker_mp_enabled(): + return smp.rank() == 0 + else: + return self.process_index == 0 + + @property + def should_save(self): + """ + Whether or not the current process should write to disk, e.g., to save models and checkpoints. + """ + if self.save_on_each_node: + return self.local_process_index == 0 + else: + if is_sagemaker_mp_enabled(): + return smp.rank() == 0 + else: + return self.process_index == 0 + + def get_process_log_level(self): + """ + Returns the log level to be used depending on whether this process is the main process of node 0, main process + of node non-0, or a non-main process. + + For the main process the log level defaults to the logging level set (`logging.WARNING` if you didn't do + anything) unless overridden by `log_level` argument. + + For the replica processes the log level defaults to `logging.WARNING` unless overridden by `log_level_replica` + argument. + + The choice between the main and replica process settings is made according to the return value of `should_log`. + """ + + # convert to int + log_level = trainer_log_levels[self.log_level] + log_level_replica = trainer_log_levels[self.log_level_replica] + + log_level_main_node = logging.get_verbosity() if log_level == -1 else log_level + log_level_replica_node = logging.get_verbosity() if log_level_replica == -1 else log_level_replica + return log_level_main_node if self.should_log else log_level_replica_node + + @property + def place_model_on_device(self): + """ + Can be subclassed and overridden for some specific integrations. + """ + return not is_sagemaker_mp_enabled() + + @property + def _no_sync_in_gradient_accumulation(self): + """ + Whether or not to use no_sync for the gradients when doing gradient accumulation. + """ + return not ( + self.deepspeed or is_sagemaker_dp_enabled() or is_sagemaker_mp_enabled() or is_torch_neuroncore_available() + ) + + @contextlib.contextmanager + def main_process_first(self, local=True, desc="work"): + """ + A context manager for torch distributed environment where on needs to do something on the main process, while + blocking replicas, and when it's finished releasing the replicas. + + One such use is for `datasets`'s `map` feature which to be efficient should be run once on the main process, + which upon completion saves a cached version of results and which then automatically gets loaded by the + replicas. + + Args: + local (`bool`, *optional*, defaults to `True`): + if `True` first means process of rank 0 of each node if `False` first means process of rank 0 of node + rank 0 In multi-node environment with a shared filesystem you most likely will want to use + `local=False` so that only the main process of the first node will do the processing. If however, the + filesystem is not shared, then the main process of each node will need to do the processing, which is + the default behavior. + desc (`str`, *optional*, defaults to `"work"`): + a work description to be used in debug logs + + """ + if is_torch_available() and self.world_size > 1: + main_process_desc = "main local process" if local else "main process" + if self.distributed_state is not None: + is_main_process = ( + self.distributed_state.is_local_main_process if local else self.distributed_state.is_main_process + ) + elif is_sagemaker_mp_enabled(): + is_main_process = smp.rank() == 0 + + try: + if not is_main_process: + # tell all replicas to wait + logger.debug(f"{self.process_index}: waiting for the {main_process_desc} to perform {desc}") + + if is_torch_xla_available(): + xm.rendezvous(desc) + else: + dist.barrier() + yield + finally: + if is_main_process: + # the wait is over + logger.debug(f"{self.process_index}: {main_process_desc} completed {desc}, releasing all replicas") + if is_torch_xla_available(): + xm.rendezvous(desc) + else: + dist.barrier() + else: + yield + + def get_warmup_steps(self, num_training_steps: int): + """ + Get number of steps used for a linear warmup. + """ + warmup_steps = ( + self.warmup_steps if self.warmup_steps > 0 else math.ceil(num_training_steps * self.warmup_ratio) + ) + return warmup_steps + + def _dict_torch_dtype_to_str(self, d: dict[str, Any]) -> None: + """ + Checks whether the passed dictionary and its nested dicts have a *torch_dtype* key and if it's not None, + converts torch.dtype to a string of just the type. For example, `torch.float32` get converted into *"float32"* + string, which can then be stored in the json format. + """ + if d.get("torch_dtype", None) is not None and not isinstance(d["torch_dtype"], str): + d["torch_dtype"] = str(d["torch_dtype"]).split(".")[1] + for value in d.values(): + if isinstance(value, dict): + self._dict_torch_dtype_to_str(value) + + def to_dict(self): + """ + Serializes this instance while replace `Enum` by their values (for JSON serialization support). It obfuscates + the token values by removing their value. + """ + # filter out fields that are defined as field(init=False) + d = {field.name: getattr(self, field.name) for field in fields(self) if field.init} + + for k, v in d.items(): + if isinstance(v, Enum): + d[k] = v.value + if isinstance(v, list) and len(v) > 0 and isinstance(v[0], Enum): + d[k] = [x.value for x in v] + if k.endswith("_token"): + d[k] = f"<{k.upper()}>" + # Handle the accelerator_config if passed + if is_accelerate_available() and isinstance(v, AcceleratorConfig): + d[k] = v.to_dict() + # Handle the quantization_config if passed + if k == "model_init_kwargs" and isinstance(v, dict) and "quantization_config" in v: + quantization_config = v.get("quantization_config") + if quantization_config and not isinstance(quantization_config, dict): + d[k]["quantization_config"] = quantization_config.to_dict() + self._dict_torch_dtype_to_str(d) + + return d + + def to_json_string(self): + """ + Serializes this instance to a JSON string. + """ + return json.dumps(self.to_dict(), indent=2) + + def to_sanitized_dict(self) -> dict[str, Any]: + """ + Sanitized serialization to use with TensorBoard’s hparams + """ + d = self.to_dict() + d = {**d, **{"train_batch_size": self.train_batch_size, "eval_batch_size": self.eval_batch_size}} + + valid_types = [bool, int, float, str] + if is_torch_available(): + valid_types.append(torch.Tensor) + + return {k: v if type(v) in valid_types else str(v) for k, v in d.items()} + + # The following methods are there to simplify the instantiation of `TrainingArguments` + def set_training( + self, + learning_rate: float = 5e-5, + batch_size: int = 8, + weight_decay: float = 0, + num_epochs: float = 3, + max_steps: int = -1, + gradient_accumulation_steps: int = 1, + seed: int = 42, + gradient_checkpointing: bool = False, + ): + """ + A method that regroups all basic arguments linked to the training. + + + + Calling this method will automatically set `self.do_train` to `True`. + + + + Args: + learning_rate (`float`, *optional*, defaults to 5e-5): + The initial learning rate for the optimizer. + batch_size (`int` *optional*, defaults to 8): + The batch size per device (GPU/TPU core/CPU...) used for training. + weight_decay (`float`, *optional*, defaults to 0): + The weight decay to apply (if not zero) to all layers except all bias and LayerNorm weights in the + optimizer. + num_train_epochs(`float`, *optional*, defaults to 3.0): + Total number of training epochs to perform (if not an integer, will perform the decimal part percents + of the last epoch before stopping training). + max_steps (`int`, *optional*, defaults to -1): + If set to a positive number, the total number of training steps to perform. Overrides `num_train_epochs`. + For a finite dataset, training is reiterated through the dataset (if all data is exhausted) until + `max_steps` is reached. + gradient_accumulation_steps (`int`, *optional*, defaults to 1): + Number of updates steps to accumulate the gradients for, before performing a backward/update pass. + + + + When using gradient accumulation, one step is counted as one step with backward pass. Therefore, + logging, evaluation, save will be conducted every `gradient_accumulation_steps * xxx_step` training + examples. + + + + seed (`int`, *optional*, defaults to 42): + Random seed that will be set at the beginning of training. To ensure reproducibility across runs, use + the [`~Trainer.model_init`] function to instantiate the model if it has some randomly initialized + parameters. + gradient_checkpointing (`bool`, *optional*, defaults to `False`): + If True, use gradient checkpointing to save memory at the expense of slower backward pass. + + Example: + + ```py + >>> from transformers import TrainingArguments + + >>> args = TrainingArguments("working_dir") + >>> args = args.set_training(learning_rate=1e-4, batch_size=32) + >>> args.learning_rate + 1e-4 + ``` + """ + self.do_train = True + self.learning_rate = learning_rate + self.per_device_train_batch_size = batch_size + self.weight_decay = weight_decay + self.num_train_epochs = num_epochs + self.max_steps = max_steps + self.gradient_accumulation_steps = gradient_accumulation_steps + self.seed = seed + self.gradient_checkpointing = gradient_checkpointing + return self + + def set_evaluate( + self, + strategy: Union[str, IntervalStrategy] = "no", + steps: int = 500, + batch_size: int = 8, + accumulation_steps: Optional[int] = None, + delay: Optional[float] = None, + loss_only: bool = False, + jit_mode: bool = False, + ): + """ + A method that regroups all arguments linked to evaluation. + + Args: + strategy (`str` or [`~trainer_utils.IntervalStrategy`], *optional*, defaults to `"no"`): + The evaluation strategy to adopt during training. Possible values are: + + - `"no"`: No evaluation is done during training. + - `"steps"`: Evaluation is done (and logged) every `steps`. + - `"epoch"`: Evaluation is done at the end of each epoch. + + Setting a `strategy` different from `"no"` will set `self.do_eval` to `True`. + steps (`int`, *optional*, defaults to 500): + Number of update steps between two evaluations if `strategy="steps"`. + batch_size (`int` *optional*, defaults to 8): + The batch size per device (GPU/TPU core/CPU...) used for evaluation. + accumulation_steps (`int`, *optional*): + Number of predictions steps to accumulate the output tensors for, before moving the results to the CPU. + If left unset, the whole predictions are accumulated on GPU/TPU before being moved to the CPU (faster + but requires more memory). + delay (`float`, *optional*): + Number of epochs or steps to wait for before the first evaluation can be performed, depending on the + eval_strategy. + loss_only (`bool`, *optional*, defaults to `False`): + Ignores all outputs except the loss. + jit_mode (`bool`, *optional*): + Whether or not to use PyTorch jit trace for inference. + + Example: + + ```py + >>> from transformers import TrainingArguments + + >>> args = TrainingArguments("working_dir") + >>> args = args.set_evaluate(strategy="steps", steps=100) + >>> args.eval_steps + 100 + ``` + """ + self.eval_strategy = IntervalStrategy(strategy) + if self.eval_strategy == IntervalStrategy.STEPS and steps == 0: + raise ValueError("Setting `strategy` as 'steps' requires a positive value for `steps`.") + self.do_eval = self.eval_strategy != IntervalStrategy.NO + self.eval_steps = steps + self.per_device_eval_batch_size = batch_size + self.eval_accumulation_steps = accumulation_steps + self.eval_delay = delay + self.prediction_loss_only = loss_only + self.jit_mode_eval = jit_mode + return self + + def set_testing( + self, + batch_size: int = 8, + loss_only: bool = False, + jit_mode: bool = False, + ): + """ + A method that regroups all basic arguments linked to testing on a held-out dataset. + + + + Calling this method will automatically set `self.do_predict` to `True`. + + + + Args: + batch_size (`int` *optional*, defaults to 8): + The batch size per device (GPU/TPU core/CPU...) used for testing. + loss_only (`bool`, *optional*, defaults to `False`): + Ignores all outputs except the loss. + jit_mode (`bool`, *optional*): + Whether or not to use PyTorch jit trace for inference. + + Example: + + ```py + >>> from transformers import TrainingArguments + + >>> args = TrainingArguments("working_dir") + >>> args = args.set_testing(batch_size=32) + >>> args.per_device_eval_batch_size + 32 + ``` + """ + self.do_predict = True + self.per_device_eval_batch_size = batch_size + self.prediction_loss_only = loss_only + self.jit_mode_eval = jit_mode + return self + + def set_save( + self, + strategy: Union[str, IntervalStrategy] = "steps", + steps: int = 500, + total_limit: Optional[int] = None, + on_each_node: bool = False, + ): + """ + A method that regroups all arguments linked to checkpoint saving. + + Args: + strategy (`str` or [`~trainer_utils.IntervalStrategy`], *optional*, defaults to `"steps"`): + The checkpoint save strategy to adopt during training. Possible values are: + + - `"no"`: No save is done during training. + - `"epoch"`: Save is done at the end of each epoch. + - `"steps"`: Save is done every `save_steps`. + + steps (`int`, *optional*, defaults to 500): + Number of updates steps before two checkpoint saves if `strategy="steps"`. + total_limit (`int`, *optional*): + If a value is passed, will limit the total amount of checkpoints. Deletes the older checkpoints in + `output_dir`. + on_each_node (`bool`, *optional*, defaults to `False`): + When doing multi-node distributed training, whether to save models and checkpoints on each node, or + only on the main one. + + This should not be activated when the different nodes use the same storage as the files will be saved + with the same names for each node. + + Example: + + ```py + >>> from transformers import TrainingArguments + + >>> args = TrainingArguments("working_dir") + >>> args = args.set_save(strategy="steps", steps=100) + >>> args.save_steps + 100 + ``` + """ + self.save_strategy = SaveStrategy(strategy) + if self.save_strategy == SaveStrategy.STEPS and steps == 0: + raise ValueError("Setting `strategy` as 'steps' requires a positive value for `steps`.") + self.save_steps = steps + self.save_total_limit = total_limit + self.save_on_each_node = on_each_node + return self + + def set_logging( + self, + strategy: Union[str, IntervalStrategy] = "steps", + steps: int = 500, + report_to: Union[str, list[str]] = "none", + level: str = "passive", + first_step: bool = False, + nan_inf_filter: bool = False, + on_each_node: bool = False, + replica_level: str = "passive", + ): + """ + A method that regroups all arguments linked to logging. + + Args: + strategy (`str` or [`~trainer_utils.IntervalStrategy`], *optional*, defaults to `"steps"`): + The logging strategy to adopt during training. Possible values are: + + - `"no"`: No logging is done during training. + - `"epoch"`: Logging is done at the end of each epoch. + - `"steps"`: Logging is done every `logging_steps`. + + steps (`int`, *optional*, defaults to 500): + Number of update steps between two logs if `strategy="steps"`. + level (`str`, *optional*, defaults to `"passive"`): + Logger log level to use on the main process. Possible choices are the log levels as strings: `"debug"`, + `"info"`, `"warning"`, `"error"` and `"critical"`, plus a `"passive"` level which doesn't set anything + and lets the application set the level. + report_to (`str` or `List[str]`, *optional*, defaults to `"all"`): + The list of integrations to report the results and logs to. Supported platforms are `"azure_ml"`, + `"clearml"`, `"codecarbon"`, `"comet_ml"`, `"dagshub"`, `"dvclive"`, `"flyte"`, `"mlflow"`, + `"neptune"`, `"swanlab"`, `"tensorboard"`, and `"wandb"`. Use `"all"` to report to all integrations + installed, `"none"` for no integrations. + first_step (`bool`, *optional*, defaults to `False`): + Whether to log and evaluate the first `global_step` or not. + nan_inf_filter (`bool`, *optional*, defaults to `True`): + Whether to filter `nan` and `inf` losses for logging. If set to `True` the loss of every step that is + `nan` or `inf` is filtered and the average loss of the current logging window is taken instead. + + + + `nan_inf_filter` only influences the logging of loss values, it does not change the behavior the + gradient is computed or applied to the model. + + + + on_each_node (`bool`, *optional*, defaults to `True`): + In multinode distributed training, whether to log using `log_level` once per node, or only on the main + node. + replica_level (`str`, *optional*, defaults to `"passive"`): + Logger log level to use on replicas. Same choices as `log_level` + + Example: + + ```py + >>> from transformers import TrainingArguments + + >>> args = TrainingArguments("working_dir") + >>> args = args.set_logging(strategy="steps", steps=100) + >>> args.logging_steps + 100 + ``` + """ + self.logging_strategy = IntervalStrategy(strategy) + if self.logging_strategy == IntervalStrategy.STEPS and steps == 0: + raise ValueError("Setting `strategy` as 'steps' requires a positive value for `steps`.") + self.logging_steps = steps + self.report_to = report_to + self.log_level = level + self.logging_first_step = first_step + self.logging_nan_inf_filter = nan_inf_filter + self.log_on_each_node = on_each_node + self.log_level_replica = replica_level + return self + + def set_push_to_hub( + self, + model_id: str, + strategy: Union[str, HubStrategy] = "every_save", + token: Optional[str] = None, + private_repo: Optional[bool] = None, + always_push: bool = False, + ): + """ + A method that regroups all arguments linked to synchronizing checkpoints with the Hub. + + + + Calling this method will set `self.push_to_hub` to `True`, which means the `output_dir` will begin a git + directory synced with the repo (determined by `model_id`) and the content will be pushed each time a save is + triggered (depending on your `self.save_strategy`). Calling [`~Trainer.save_model`] will also trigger a push. + + + + Args: + model_id (`str`): + The name of the repository to keep in sync with the local *output_dir*. It can be a simple model ID in + which case the model will be pushed in your namespace. Otherwise it should be the whole repository + name, for instance `"user_name/model"`, which allows you to push to an organization you are a member of + with `"organization_name/model"`. + strategy (`str` or [`~trainer_utils.HubStrategy`], *optional*, defaults to `"every_save"`): + Defines the scope of what is pushed to the Hub and when. Possible values are: + + - `"end"`: push the model, its configuration, the processing_class e.g. tokenizer (if passed along to the [`Trainer`]) and a + draft of a model card when the [`~Trainer.save_model`] method is called. + - `"every_save"`: push the model, its configuration, the processing_class e.g. tokenizer (if passed along to the [`Trainer`]) + and + a draft of a model card each time there is a model save. The pushes are asynchronous to not block + training, and in case the save are very frequent, a new push is only attempted if the previous one is + finished. A last push is made with the final model at the end of training. + - `"checkpoint"`: like `"every_save"` but the latest checkpoint is also pushed in a subfolder named + last-checkpoint, allowing you to resume training easily with + `trainer.train(resume_from_checkpoint="last-checkpoint")`. + - `"all_checkpoints"`: like `"checkpoint"` but all checkpoints are pushed like they appear in the + output + folder (so you will get one checkpoint folder per folder in your final repository) + + token (`str`, *optional*): + The token to use to push the model to the Hub. Will default to the token in the cache folder obtained + with `huggingface-cli login`. + private_repo (`bool`, *optional*, defaults to `False`): + Whether to make the repo private. If `None` (default), the repo will be public unless the organization's default is private. This value is ignored if the repo already exists. + always_push (`bool`, *optional*, defaults to `False`): + Unless this is `True`, the `Trainer` will skip pushing a checkpoint when the previous push is not + finished. + + Example: + + ```py + >>> from transformers import TrainingArguments + + >>> args = TrainingArguments("working_dir") + >>> args = args.set_push_to_hub("me/awesome-model") + >>> args.hub_model_id + 'me/awesome-model' + ``` + """ + self.push_to_hub = True + self.hub_model_id = model_id + self.hub_strategy = HubStrategy(strategy) + self.hub_token = token + self.hub_private_repo = private_repo + self.hub_always_push = always_push + return self + + def set_optimizer( + self, + name: Union[str, OptimizerNames] = "adamw_torch", + learning_rate: float = 5e-5, + weight_decay: float = 0, + beta1: float = 0.9, + beta2: float = 0.999, + epsilon: float = 1e-8, + args: Optional[str] = None, + ): + """ + A method that regroups all arguments linked to the optimizer and its hyperparameters. + + Args: + name (`str` or [`training_args.OptimizerNames`], *optional*, defaults to `"adamw_torch"`): + The optimizer to use: `"adamw_torch"`, `"adamw_torch_fused"`, `"adamw_apex_fused"`, + `"adamw_anyprecision"` or `"adafactor"`. + learning_rate (`float`, *optional*, defaults to 5e-5): + The initial learning rate. + weight_decay (`float`, *optional*, defaults to 0): + The weight decay to apply (if not zero) to all layers except all bias and LayerNorm weights. + beta1 (`float`, *optional*, defaults to 0.9): + The beta1 hyperparameter for the adam optimizer or its variants. + beta2 (`float`, *optional*, defaults to 0.999): + The beta2 hyperparameter for the adam optimizer or its variants. + epsilon (`float`, *optional*, defaults to 1e-8): + The epsilon hyperparameter for the adam optimizer or its variants. + args (`str`, *optional*): + Optional arguments that are supplied to AnyPrecisionAdamW (only useful when + `optim="adamw_anyprecision"`). + + Example: + + ```py + >>> from transformers import TrainingArguments + + >>> args = TrainingArguments("working_dir") + >>> args = args.set_optimizer(name="adamw_torch", beta1=0.8) + >>> args.optim + 'adamw_torch' + ``` + """ + self.optim = OptimizerNames(name) + self.learning_rate = learning_rate + self.weight_decay = weight_decay + self.adam_beta1 = beta1 + self.adam_beta2 = beta2 + self.adam_epsilon = epsilon + self.optim_args = args + return self + + def set_lr_scheduler( + self, + name: Union[str, SchedulerType] = "linear", + num_epochs: float = 3.0, + max_steps: int = -1, + warmup_ratio: float = 0, + warmup_steps: int = 0, + ): + """ + A method that regroups all arguments linked to the learning rate scheduler and its hyperparameters. + + Args: + name (`str` or [`SchedulerType`], *optional*, defaults to `"linear"`): + The scheduler type to use. See the documentation of [`SchedulerType`] for all possible values. + num_epochs(`float`, *optional*, defaults to 3.0): + Total number of training epochs to perform (if not an integer, will perform the decimal part percents + of the last epoch before stopping training). + max_steps (`int`, *optional*, defaults to -1): + If set to a positive number, the total number of training steps to perform. Overrides `num_train_epochs`. + For a finite dataset, training is reiterated through the dataset (if all data is exhausted) until + `max_steps` is reached. + warmup_ratio (`float`, *optional*, defaults to 0.0): + Ratio of total training steps used for a linear warmup from 0 to `learning_rate`. + warmup_steps (`int`, *optional*, defaults to 0): + Number of steps used for a linear warmup from 0 to `learning_rate`. Overrides any effect of + `warmup_ratio`. + + Example: + + ```py + >>> from transformers import TrainingArguments + + >>> args = TrainingArguments("working_dir") + >>> args = args.set_lr_scheduler(name="cosine", warmup_ratio=0.05) + >>> args.warmup_ratio + 0.05 + ``` + """ + self.lr_scheduler_type = SchedulerType(name) + self.num_train_epochs = num_epochs + self.max_steps = max_steps + self.warmup_ratio = warmup_ratio + self.warmup_steps = warmup_steps + return self + + def set_dataloader( + self, + train_batch_size: int = 8, + eval_batch_size: int = 8, + drop_last: bool = False, + num_workers: int = 0, + pin_memory: bool = True, + persistent_workers: bool = False, + prefetch_factor: Optional[int] = None, + auto_find_batch_size: bool = False, + ignore_data_skip: bool = False, + sampler_seed: Optional[int] = None, + ): + """ + A method that regroups all arguments linked to the dataloaders creation. + + Args: + drop_last (`bool`, *optional*, defaults to `False`): + Whether to drop the last incomplete batch (if the length of the dataset is not divisible by the batch + size) or not. + num_workers (`int`, *optional*, defaults to 0): + Number of subprocesses to use for data loading (PyTorch only). 0 means that the data will be loaded in + the main process. + pin_memory (`bool`, *optional*, defaults to `True`): + Whether you want to pin memory in data loaders or not. Will default to `True`. + persistent_workers (`bool`, *optional*, defaults to `False`): + If True, the data loader will not shut down the worker processes after a dataset has been consumed + once. This allows to maintain the workers Dataset instances alive. Can potentially speed up training, + but will increase RAM usage. Will default to `False`. + prefetch_factor (`int`, *optional*): + Number of batches loaded in advance by each worker. + 2 means there will be a total of 2 * num_workers batches prefetched across all workers. + auto_find_batch_size (`bool`, *optional*, defaults to `False`) + Whether to find a batch size that will fit into memory automatically through exponential decay, + avoiding CUDA Out-of-Memory errors. Requires accelerate to be installed (`pip install accelerate`) + ignore_data_skip (`bool`, *optional*, defaults to `False`): + When resuming training, whether or not to skip the epochs and batches to get the data loading at the + same stage as in the previous training. If set to `True`, the training will begin faster (as that + skipping step can take a long time) but will not yield the same results as the interrupted training + would have. + sampler_seed (`int`, *optional*): + Random seed to be used with data samplers. If not set, random generators for data sampling will use the + same seed as `self.seed`. This can be used to ensure reproducibility of data sampling, independent of + the model seed. + + Example: + + ```py + >>> from transformers import TrainingArguments + + >>> args = TrainingArguments("working_dir") + >>> args = args.set_dataloader(train_batch_size=16, eval_batch_size=64) + >>> args.per_device_train_batch_size + 16 + ``` + """ + self.per_device_train_batch_size = train_batch_size + self.per_device_eval_batch_size = eval_batch_size + self.dataloader_drop_last = drop_last + self.dataloader_num_workers = num_workers + self.dataloader_pin_memory = pin_memory + self.dataloader_persistent_workers = persistent_workers + self.dataloader_prefetch_factor = prefetch_factor + self.auto_find_batch_size = auto_find_batch_size + self.ignore_data_skip = ignore_data_skip + self.data_seed = sampler_seed + return self + + +class ParallelMode(Enum): + NOT_PARALLEL = "not_parallel" + NOT_DISTRIBUTED = "not_distributed" + DISTRIBUTED = "distributed" + SAGEMAKER_MODEL_PARALLEL = "sagemaker_model_parallel" + SAGEMAKER_DATA_PARALLEL = "sagemaker_data_parallel" + TPU = "tpu" diff --git a/docs/transformers/build/lib/transformers/training_args_seq2seq.py b/docs/transformers/build/lib/transformers/training_args_seq2seq.py new file mode 100644 index 0000000000000000000000000000000000000000..5342b7add3932c542e35247e52920d8fc91ed325 --- /dev/null +++ b/docs/transformers/build/lib/transformers/training_args_seq2seq.py @@ -0,0 +1,90 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging +from dataclasses import dataclass, field +from pathlib import Path +from typing import Optional, Union + +from .generation.configuration_utils import GenerationConfig +from .training_args import TrainingArguments +from .utils import add_start_docstrings + + +logger = logging.getLogger(__name__) + + +@dataclass +@add_start_docstrings(TrainingArguments.__doc__) +class Seq2SeqTrainingArguments(TrainingArguments): + """ + Args: + predict_with_generate (`bool`, *optional*, defaults to `False`): + Whether to use generate to calculate generative metrics (ROUGE, BLEU). + generation_max_length (`int`, *optional*): + The `max_length` to use on each evaluation loop when `predict_with_generate=True`. Will default to the + `max_length` value of the model configuration. + generation_num_beams (`int`, *optional*): + The `num_beams` to use on each evaluation loop when `predict_with_generate=True`. Will default to the + `num_beams` value of the model configuration. + generation_config (`str` or `Path` or [`~generation.GenerationConfig`], *optional*): + Allows to load a [`~generation.GenerationConfig`] from the `from_pretrained` method. This can be either: + + - a string, the *model id* of a pretrained model configuration hosted inside a model repo on + huggingface.co. + - a path to a *directory* containing a configuration file saved using the + [`~GenerationConfig.save_pretrained`] method, e.g., `./my_model_directory/`. + - a [`~generation.GenerationConfig`] object. + """ + + sortish_sampler: bool = field(default=False, metadata={"help": "Whether to use SortishSampler or not."}) + predict_with_generate: bool = field( + default=False, metadata={"help": "Whether to use generate to calculate generative metrics (ROUGE, BLEU)."} + ) + generation_max_length: Optional[int] = field( + default=None, + metadata={ + "help": ( + "The `max_length` to use on each evaluation loop when `predict_with_generate=True`. Will default " + "to the `max_length` value of the model configuration." + ) + }, + ) + generation_num_beams: Optional[int] = field( + default=None, + metadata={ + "help": ( + "The `num_beams` to use on each evaluation loop when `predict_with_generate=True`. Will default " + "to the `num_beams` value of the model configuration." + ) + }, + ) + generation_config: Optional[Union[str, Path, GenerationConfig]] = field( + default=None, + metadata={ + "help": "Model id, file path or url pointing to a GenerationConfig json file, to use during prediction." + }, + ) + + def to_dict(self): + """ + Serializes this instance while replace `Enum` by their values and `GenerationConfig` by dictionaries (for JSON + serialization support). It obfuscates the token values by removing their value. + """ + # filter out fields that are defined as field(init=False) + d = super().to_dict() + for k, v in d.items(): + if isinstance(v, GenerationConfig): + d[k] = v.to_dict() + return d diff --git a/docs/transformers/build/lib/transformers/training_args_tf.py b/docs/transformers/build/lib/transformers/training_args_tf.py new file mode 100644 index 0000000000000000000000000000000000000000..6bbd4b89a7249d84db9a579fe75928a3bd5feafb --- /dev/null +++ b/docs/transformers/build/lib/transformers/training_args_tf.py @@ -0,0 +1,299 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import warnings +from dataclasses import dataclass, field +from typing import Optional + +from .training_args import TrainingArguments +from .utils import cached_property, is_tf_available, logging, requires_backends + + +logger = logging.get_logger(__name__) + +if is_tf_available(): + import tensorflow as tf + + from .modeling_tf_utils import keras + + +@dataclass +class TFTrainingArguments(TrainingArguments): + """ + TrainingArguments is the subset of the arguments we use in our example scripts **which relate to the training loop + itself**. + + Using [`HfArgumentParser`] we can turn this class into + [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the + command line. + + Parameters: + output_dir (`str`): + The output directory where the model predictions and checkpoints will be written. + overwrite_output_dir (`bool`, *optional*, defaults to `False`): + If `True`, overwrite the content of the output directory. Use this to continue training if `output_dir` + points to a checkpoint directory. + do_train (`bool`, *optional*, defaults to `False`): + Whether to run training or not. This argument is not directly used by [`Trainer`], it's intended to be used + by your training/evaluation scripts instead. See the [example + scripts](https://github.com/huggingface/transformers/tree/main/examples) for more details. + do_eval (`bool`, *optional*): + Whether to run evaluation on the validation set or not. Will be set to `True` if `eval_strategy` is + different from `"no"`. This argument is not directly used by [`Trainer`], it's intended to be used by your + training/evaluation scripts instead. See the [example + scripts](https://github.com/huggingface/transformers/tree/main/examples) for more details. + do_predict (`bool`, *optional*, defaults to `False`): + Whether to run predictions on the test set or not. This argument is not directly used by [`Trainer`], it's + intended to be used by your training/evaluation scripts instead. See the [example + scripts](https://github.com/huggingface/transformers/tree/main/examples) for more details. + eval_strategy (`str` or [`~trainer_utils.IntervalStrategy`], *optional*, defaults to `"no"`): + The evaluation strategy to adopt during training. Possible values are: + + - `"no"`: No evaluation is done during training. + - `"steps"`: Evaluation is done (and logged) every `eval_steps`. + - `"epoch"`: Evaluation is done at the end of each epoch. + + per_device_train_batch_size (`int`, *optional*, defaults to 8): + The batch size per GPU/TPU core/CPU for training. + per_device_eval_batch_size (`int`, *optional*, defaults to 8): + The batch size per GPU/TPU core/CPU for evaluation. + gradient_accumulation_steps (`int`, *optional*, defaults to 1): + Number of updates steps to accumulate the gradients for, before performing a backward/update pass. + + + + When using gradient accumulation, one step is counted as one step with backward pass. Therefore, logging, + evaluation, save will be conducted every `gradient_accumulation_steps * xxx_step` training examples. + + + + learning_rate (`float`, *optional*, defaults to 5e-5): + The initial learning rate for Adam. + weight_decay (`float`, *optional*, defaults to 0): + The weight decay to apply (if not zero). + adam_beta1 (`float`, *optional*, defaults to 0.9): + The beta1 hyperparameter for the Adam optimizer. + adam_beta2 (`float`, *optional*, defaults to 0.999): + The beta2 hyperparameter for the Adam optimizer. + adam_epsilon (`float`, *optional*, defaults to 1e-8): + The epsilon hyperparameter for the Adam optimizer. + max_grad_norm (`float`, *optional*, defaults to 1.0): + Maximum gradient norm (for gradient clipping). + num_train_epochs(`float`, *optional*, defaults to 3.0): + Total number of training epochs to perform. + max_steps (`int`, *optional*, defaults to -1): + If set to a positive number, the total number of training steps to perform. Overrides `num_train_epochs`. + For a finite dataset, training is reiterated through the dataset (if all data is exhausted) until + `max_steps` is reached. + warmup_ratio (`float`, *optional*, defaults to 0.0): + Ratio of total training steps used for a linear warmup from 0 to `learning_rate`. + warmup_steps (`int`, *optional*, defaults to 0): + Number of steps used for a linear warmup from 0 to `learning_rate`. Overrides any effect of `warmup_ratio`. + logging_dir (`str`, *optional*): + [TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to + *runs/**CURRENT_DATETIME_HOSTNAME***. + logging_strategy (`str` or [`~trainer_utils.IntervalStrategy`], *optional*, defaults to `"steps"`): + The logging strategy to adopt during training. Possible values are: + + - `"no"`: No logging is done during training. + - `"epoch"`: Logging is done at the end of each epoch. + - `"steps"`: Logging is done every `logging_steps`. + + logging_first_step (`bool`, *optional*, defaults to `False`): + Whether to log and evaluate the first `global_step` or not. + logging_steps (`int`, *optional*, defaults to 500): + Number of update steps between two logs if `logging_strategy="steps"`. + save_strategy (`str` or [`~trainer_utils.SaveStrategy`], *optional*, defaults to `"steps"`): + The checkpoint save strategy to adopt during training. Possible values are: + + - `"no"`: No save is done during training. + - `"epoch"`: Save is done at the end of each epoch. + - `"steps"`: Save is done every `save_steps`. + + save_steps (`int`, *optional*, defaults to 500): + Number of updates steps before two checkpoint saves if `save_strategy="steps"`. + save_total_limit (`int`, *optional*): + If a value is passed, will limit the total amount of checkpoints. Deletes the older checkpoints in + `output_dir`. + no_cuda (`bool`, *optional*, defaults to `False`): + Whether to not use CUDA even when it is available or not. + seed (`int`, *optional*, defaults to 42): + Random seed that will be set at the beginning of training. + fp16 (`bool`, *optional*, defaults to `False`): + Whether to use 16-bit (mixed) precision training (through NVIDIA Apex) instead of 32-bit training. + fp16_opt_level (`str`, *optional*, defaults to 'O1'): + For `fp16` training, Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']. See details on + the [Apex documentation](https://nvidia.github.io/apex/amp). + local_rank (`int`, *optional*, defaults to -1): + During distributed training, the rank of the process. + tpu_num_cores (`int`, *optional*): + When training on TPU, the number of TPU cores (automatically passed by launcher script). + debug (`bool`, *optional*, defaults to `False`): + Whether to activate the trace to record computation graphs and profiling information or not. + dataloader_drop_last (`bool`, *optional*, defaults to `False`): + Whether to drop the last incomplete batch (if the length of the dataset is not divisible by the batch size) + or not. + eval_steps (`int`, *optional*, defaults to 1000): + Number of update steps before two evaluations. + past_index (`int`, *optional*, defaults to -1): + Some models like [TransformerXL](../model_doc/transformerxl) or :doc*XLNet <../model_doc/xlnet>* can make + use of the past hidden states for their predictions. If this argument is set to a positive int, the + `Trainer` will use the corresponding output (usually index 2) as the past state and feed it to the model at + the next training step under the keyword argument `mems`. + tpu_name (`str`, *optional*): + The name of the TPU the process is running on. + tpu_zone (`str`, *optional*): + The zone of the TPU the process is running on. If not specified, we will attempt to automatically detect + from metadata. + gcp_project (`str`, *optional*): + Google Cloud Project name for the Cloud TPU-enabled project. If not specified, we will attempt to + automatically detect from metadata. + run_name (`str`, *optional*): + A descriptor for the run. Notably used for wandb, mlflow, comet and swanlab logging. + xla (`bool`, *optional*): + Whether to activate the XLA compilation or not. + """ + + framework = "tf" + tpu_name: Optional[str] = field( + default=None, + metadata={"help": "Name of TPU"}, + ) + + tpu_zone: Optional[str] = field( + default=None, + metadata={"help": "Zone of TPU"}, + ) + + gcp_project: Optional[str] = field( + default=None, + metadata={"help": "Name of Cloud TPU-enabled project"}, + ) + + poly_power: float = field( + default=1.0, + metadata={"help": "Power for the Polynomial decay LR scheduler."}, + ) + + xla: bool = field(default=False, metadata={"help": "Whether to activate the XLA compilation or not"}) + + @cached_property + def _setup_strategy(self) -> tuple["tf.distribute.Strategy", int]: + requires_backends(self, ["tf"]) + logger.info("Tensorflow: setting up strategy") + + gpus = tf.config.list_physical_devices("GPU") + + # Set to float16 at first + if self.fp16: + keras.mixed_precision.set_global_policy("mixed_float16") + + if self.no_cuda: + strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0") + else: + try: + if self.tpu_name: + tpu = tf.distribute.cluster_resolver.TPUClusterResolver( + self.tpu_name, zone=self.tpu_zone, project=self.gcp_project + ) + else: + tpu = tf.distribute.cluster_resolver.TPUClusterResolver() + except ValueError: + if self.tpu_name: + raise RuntimeError(f"Couldn't connect to TPU {self.tpu_name}!") + else: + tpu = None + + if tpu: + # Set to bfloat16 in case of TPU + if self.fp16: + keras.mixed_precision.set_global_policy("mixed_bfloat16") + + tf.config.experimental_connect_to_cluster(tpu) + tf.tpu.experimental.initialize_tpu_system(tpu) + + strategy = tf.distribute.TPUStrategy(tpu) + + elif len(gpus) == 0: + strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0") + elif len(gpus) == 1: + strategy = tf.distribute.OneDeviceStrategy(device="/gpu:0") + elif len(gpus) > 1: + # If you only want to use a specific subset of GPUs use `CUDA_VISIBLE_DEVICES=0` + strategy = tf.distribute.MirroredStrategy() + else: + raise ValueError("Cannot find the proper strategy, please check your environment properties.") + + return strategy + + @property + def strategy(self) -> "tf.distribute.Strategy": + """ + The strategy used for distributed training. + """ + requires_backends(self, ["tf"]) + return self._setup_strategy + + @property + def n_replicas(self) -> int: + """ + The number of replicas (CPUs, GPUs or TPU cores) used in this training. + """ + requires_backends(self, ["tf"]) + return self._setup_strategy.num_replicas_in_sync + + @property + def should_log(self): + """ + Whether or not the current process should produce log. + """ + return False # TF Logging is handled by Keras not the Trainer + + @property + def train_batch_size(self) -> int: + """ + The actual batch size for training (may differ from `per_gpu_train_batch_size` in distributed training). + """ + if self.per_gpu_train_batch_size: + logger.warning( + "Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future " + "version. Using `--per_device_train_batch_size` is preferred." + ) + per_device_batch_size = self.per_gpu_train_batch_size or self.per_device_train_batch_size + return per_device_batch_size * self.n_replicas + + @property + def eval_batch_size(self) -> int: + """ + The actual batch size for evaluation (may differ from `per_gpu_eval_batch_size` in distributed training). + """ + if self.per_gpu_eval_batch_size: + logger.warning( + "Using deprecated `--per_gpu_eval_batch_size` argument which will be removed in a future " + "version. Using `--per_device_eval_batch_size` is preferred." + ) + per_device_batch_size = self.per_gpu_eval_batch_size or self.per_device_eval_batch_size + return per_device_batch_size * self.n_replicas + + @property + def n_gpu(self) -> int: + """ + The number of replicas (CPUs, GPUs or TPU cores) used in this training. + """ + requires_backends(self, ["tf"]) + warnings.warn( + "The n_gpu argument is deprecated and will be removed in a future version, use n_replicas instead.", + FutureWarning, + ) + return self._setup_strategy.num_replicas_in_sync diff --git a/silence_overlaps.zip b/silence_overlaps.zip new file mode 100644 index 0000000000000000000000000000000000000000..1024a1b414161532ded308b8a3ede2626a92fe34 --- /dev/null +++ b/silence_overlaps.zip @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28ae3d5b7926569fa96005246be5397fb0dc10bf23281006fc1791dac1771d5e +size 645024