lsmpp committed
Commit 4cef5ec · verified · 1 Parent(s): ca32b0e

Add files using upload-large-folder tool

This view is limited to 50 files because it contains too many changes. See the raw diff for the full change set.
Files changed (50)
  1. transformers/docker/transformers-all-latest-gpu/Dockerfile +80 -0
  2. transformers/docker/transformers-doc-builder/Dockerfile +18 -0
  3. transformers/docker/transformers-gpu/Dockerfile +31 -0
  4. transformers/docker/transformers-past-gpu/Dockerfile +59 -0
  5. transformers/docker/transformers-pytorch-amd-gpu/Dockerfile +39 -0
  6. transformers/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile +53 -0
  7. transformers/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile +57 -0
  8. transformers/docker/transformers-pytorch-deepspeed-nightly-gpu/Dockerfile +68 -0
  9. transformers/docker/transformers-pytorch-gpu/Dockerfile +38 -0
  10. transformers/docker/transformers-pytorch-tpu/Dockerfile +65 -0
  11. transformers/docker/transformers-pytorch-tpu/bert-base-cased.jsonnet +38 -0
  12. transformers/docker/transformers-pytorch-tpu/dataset.yaml +32 -0
  13. transformers/docker/transformers-pytorch-tpu/docker-entrypoint.sh +8 -0
  14. transformers/docker/transformers-pytorch-xpu/Dockerfile +93 -0
  15. transformers/docker/transformers-quantization-latest-gpu/Dockerfile +101 -0
  16. transformers/docker/transformers-tensorflow-gpu/Dockerfile +25 -0
  17. transformers/docs/source/ar/tasks/language_modeling.md +422 -0
  18. transformers/docs/source/ar/tasks/masked_language_modeling.md +442 -0
  19. transformers/docs/source/ar/tasks/multiple_choice.md +452 -0
  20. transformers/docs/source/ar/tasks/question_answering.md +432 -0
  21. transformers/docs/source/ar/tasks/sequence_classification.md +387 -0
  22. transformers/docs/source/ar/tasks/summarization.md +397 -0
  23. transformers/docs/source/ar/tasks/token_classification.md +550 -0
  24. transformers/docs/source/ar/tasks/translation.md +407 -0
  25. transformers/docs/source/en/_config.py +14 -0
  26. transformers/docs/source/en/_redirects.yml +5 -0
  27. transformers/docs/source/en/_toctree.yml +1152 -0
  28. transformers/docs/source/en/accelerate.md +165 -0
  29. transformers/docs/source/en/accelerator_selection.md +126 -0
  30. transformers/docs/source/en/add_new_model.md +665 -0
  31. transformers/docs/source/en/add_new_pipeline.md +229 -0
  32. transformers/docs/source/en/agents.md +22 -0
  33. transformers/docs/source/en/attention_interface.md +168 -0
  34. transformers/docs/source/en/auto_docstring.md +280 -0
  35. transformers/docs/source/en/backbones.md +155 -0
  36. transformers/docs/source/en/cache_explanation.md +160 -0
  37. transformers/docs/source/en/chat_extras.md +299 -0
  38. transformers/docs/source/en/chat_templating.md +229 -0
  39. transformers/docs/source/en/chat_templating_multimodal.md +243 -0
  40. transformers/docs/source/en/chat_templating_writing.md +251 -0
  41. transformers/docs/source/en/community.md +70 -0
  42. transformers/docs/source/en/contributing.md +395 -0
  43. transformers/docs/source/en/conversations.md +161 -0
  44. transformers/docs/source/en/custom_models.md +297 -0
  45. transformers/docs/source/en/debugging.md +367 -0
  46. transformers/docs/source/en/deepspeed.md +1029 -0
  47. transformers/docs/source/en/executorch.md +59 -0
  48. transformers/docs/source/en/fast_tokenizers.md +362 -0
  49. transformers/docs/source/en/feature_extractors.md +200 -0
  50. transformers/docs/source/en/fsdp.md +145 -0
transformers/docker/transformers-all-latest-gpu/Dockerfile ADDED
@@ -0,0 +1,80 @@
1
+ FROM nvidia/cuda:12.6.0-cudnn-devel-ubuntu22.04
2
+ LABEL maintainer="Hugging Face"
3
+
4
+ ARG DEBIAN_FRONTEND=noninteractive
5
+
6
+ # Use login shell to read variables from `~/.profile` (to pass dynamically created variables between RUN commands)
7
+ SHELL ["sh", "-lc"]
8
+
9
+ # The following `ARG` are mainly used to specify the versions explicitly & directly in this docker file, and not meant
10
+ # to be used as arguments for docker build (so far).
11
+
12
+ ARG PYTORCH='2.7.1'
13
+ # Example: `cu102`, `cu113`, etc.
14
+ ARG CUDA='cu126'
15
+ # Disable kernel mapping for now until all tests pass
16
+ ENV DISABLE_KERNEL_MAPPING=1
17
+
18
+ RUN apt update
19
+ RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg git-lfs
20
+ RUN git lfs install
21
+ RUN python3 -m pip install --no-cache-dir --upgrade pip
22
+
23
+ ARG REF=main
24
+ RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
25
+
26
+ # 1. Put several commands in a single `RUN` to avoid image/layer exporting issues. Could be revised in the future.
27
+ # 2. Regarding the `torch` part, we might need to specify proper versions for `torchvision` and `torchaudio`.
28
+ # Currently, let's not bother to specify their versions explicitly (so they are installed with their latest release versions).
29
+ RUN python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime] && [ ${#PYTORCH} -gt 0 -a "$PYTORCH" != "pre" ] && VERSION='torch=='$PYTORCH'.*' || VERSION='torch'; echo "export VERSION='$VERSION'" >> ~/.profile && echo torch=$VERSION && [ "$PYTORCH" != "pre" ] && python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio torchcodec --extra-index-url https://download.pytorch.org/whl/$CUDA || python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio torchcodec --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA && python3 -m pip uninstall -y tensorflow tensorflow_text tensorflow_probability
30
+
31
+ RUN python3 -m pip uninstall -y flax jax
32
+
33
+ RUN python3 -m pip install --no-cache-dir -U timm
34
+
35
+ RUN python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git pytesseract
36
+ RUN python3 -m pip install -U "itsdangerous<2.1.0"
37
+
38
+ RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
39
+
40
+ RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/peft@main#egg=peft
41
+
42
+ # For bettertransformer
43
+ RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/optimum@main#egg=optimum
44
+
45
+ # For video model testing
46
+ RUN python3 -m pip install --no-cache-dir av
47
+
48
+ # Some slow tests require bnb
49
+ RUN python3 -m pip install --no-cache-dir bitsandbytes
50
+
51
+ # Some tests require quanto
52
+ RUN python3 -m pip install --no-cache-dir quanto
53
+
54
+ # `quanto` will install `ninja` which leads to many `CUDA error: an illegal memory access ...` in some model tests
55
+ # (`deformable_detr`, `rwkv`, `mra`)
56
+ RUN python3 -m pip uninstall -y ninja
57
+
58
+ # For `dinat` model
59
+ # The `XXX` part in `torchXXX` needs to match `PYTORCH` (to some extent)
60
+ # pin `0.17.4` otherwise `cannot import name 'natten2dav' from 'natten.functional'`
61
+ RUN python3 -m pip install --no-cache-dir natten==0.17.4+torch250cu121 -f https://shi-labs.com/natten/wheels
62
+
63
+ # For `nougat` tokenizer
64
+ RUN python3 -m pip install --no-cache-dir python-Levenshtein
65
+
66
+ # For `FastSpeech2ConformerTokenizer` tokenizer
67
+ RUN python3 -m pip install --no-cache-dir g2p-en
68
+
69
+ # For some bitsandbytes tests
70
+ RUN python3 -m pip install --no-cache-dir einops
71
+
72
+ # For some tests with `@require_liger_kernel`
73
+ RUN python3 -m pip install --no-cache-dir liger-kernel
74
+
75
+ # `kernels` may give different outputs (within 1e-5 range) even with the same model (weights) and the same inputs
76
+ RUN python3 -m pip uninstall -y kernels
77
+
78
+ # When installing in editable mode, `transformers` is not recognized as a package.
79
+ # this line must be added in order for python to be aware of transformers.
80
+ RUN cd transformers && python3 setup.py develop
transformers/docker/transformers-doc-builder/Dockerfile ADDED
@@ -0,0 +1,18 @@
1
+ FROM python:3.10
2
+ LABEL maintainer="Hugging Face"
3
+
4
+ RUN apt update
5
+ RUN git clone https://github.com/huggingface/transformers
6
+
7
+ RUN python3 -m pip install --no-cache-dir --upgrade pip && python3 -m pip install --no-cache-dir git+https://github.com/huggingface/doc-builder ./transformers[dev]
8
+ RUN apt-get -y update && apt-get install -y libsndfile1-dev && apt install -y tesseract-ocr
9
+
10
+ # Torch needs to be installed before deepspeed
11
+ RUN python3 -m pip install --no-cache-dir ./transformers[deepspeed]
12
+
13
+ RUN python3 -m pip install --no-cache-dir torchvision git+https://github.com/facebookresearch/detectron2.git pytesseract
14
+ RUN python3 -m pip install -U "itsdangerous<2.1.0"
15
+
16
+ # Test that the image can successfully build the docs before publishing the image
17
+ RUN doc-builder build transformers transformers/docs/source/en --build_dir doc-build-dev --notebook_dir notebooks/transformers_doc --clean
18
+ RUN rm -rf doc-build-dev
transformers/docker/transformers-gpu/Dockerfile ADDED
@@ -0,0 +1,31 @@
1
+ FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04
2
+ LABEL maintainer="Hugging Face"
3
+ LABEL repository="transformers"
4
+
5
+ RUN apt update && \
6
+ apt install -y bash \
7
+ build-essential \
8
+ git \
9
+ curl \
10
+ ca-certificates \
11
+ python3 \
12
+ python3-pip && \
13
+ rm -rf /var/lib/apt/lists
14
+
15
+ RUN python3 -m pip install --no-cache-dir --upgrade pip && \
16
+ python3 -m pip install --no-cache-dir \
17
+ jupyter \
18
+ tensorflow \
19
+ torch
20
+
21
+ RUN git clone https://github.com/NVIDIA/apex
22
+ RUN cd apex && \
23
+ python3 setup.py install && \
24
+ pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./
25
+
26
+ WORKDIR /workspace
27
+ COPY . transformers/
28
+ RUN cd transformers/ && \
29
+ python3 -m pip install --no-cache-dir .
30
+
31
+ CMD ["/bin/bash"]
transformers/docker/transformers-past-gpu/Dockerfile ADDED
@@ -0,0 +1,59 @@
1
+ ARG BASE_DOCKER_IMAGE
2
+ FROM $BASE_DOCKER_IMAGE
3
+ LABEL maintainer="Hugging Face"
4
+
5
+ ARG DEBIAN_FRONTEND=noninteractive
6
+
7
+ # Use login shell to read variables from `~/.profile` (to pass dynamically created variables between RUN commands)
8
+ SHELL ["sh", "-lc"]
9
+
10
+ RUN apt update
11
+ RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg git-lfs libaio-dev
12
+ RUN git lfs install
13
+ RUN python3 -m pip install --no-cache-dir --upgrade pip
14
+
15
+ ARG REF=main
16
+ RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
17
+ RUN python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime]
18
+
19
+ # When installing in editable mode, `transformers` is not recognized as a package.
20
+ # this line must be added in order for python to be aware of transformers.
21
+ RUN cd transformers && python3 setup.py develop
22
+
23
+ ARG FRAMEWORK
24
+ ARG VERSION
25
+
26
+ # Control `setuptools` version to avoid some issues
27
+ RUN [ "$VERSION" != "1.10" ] && python3 -m pip install -U setuptools || python3 -m pip install -U "setuptools<=59.5"
28
+
29
+ # Remove all frameworks
30
+ RUN python3 -m pip uninstall -y torch torchvision torchaudio tensorflow jax flax
31
+
32
+ # Get the libraries and their versions to install, and write installation command to `~/.profile`.
33
+ RUN python3 ./transformers/utils/past_ci_versions.py --framework $FRAMEWORK --version $VERSION
34
+
35
+ # Install the target framework
36
+ RUN echo "INSTALL_CMD = $INSTALL_CMD"
37
+ RUN $INSTALL_CMD
38
+
39
+ RUN [ "$FRAMEWORK" != "pytorch" ] && echo "`deepspeed-testing` installation is skipped" || python3 -m pip install --no-cache-dir ./transformers[deepspeed-testing]
40
+
41
+ # Remove `accelerate`: it requires `torch`, and this causes import issues for TF-only testing
42
+ # We will install `accelerate@main` in Past CI workflow file
43
+ RUN python3 -m pip uninstall -y accelerate
44
+
45
+ # Uninstall `torch-tensorrt` and `apex` shipped with the base image
46
+ RUN python3 -m pip uninstall -y torch-tensorrt apex
47
+
48
+ # Pre-build **nightly** release of DeepSpeed, so it would be ready for testing (otherwise, the 1st deepspeed test will timeout)
49
+ RUN python3 -m pip uninstall -y deepspeed
50
+ # This has to be run inside the GPU VMs running the tests. (So far, it fails here due to GPU checks during compilation.)
51
+ # Issue: https://github.com/deepspeedai/DeepSpeed/issues/2010
52
+ # RUN git clone https://github.com/deepspeedai/DeepSpeed && cd DeepSpeed && rm -rf build && \
53
+ # DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1
54
+
55
+ RUN python3 -m pip install -U "itsdangerous<2.1.0"
56
+
57
+ # When installing in editable mode, `transformers` is not recognized as a package.
58
+ # this line must be added in order for python to be aware of transformers.
59
+ RUN cd transformers && python3 setup.py develop
transformers/docker/transformers-pytorch-amd-gpu/Dockerfile ADDED
@@ -0,0 +1,39 @@
1
+ FROM rocm/pytorch:rocm6.4_ubuntu22.04_py3.10_pytorch_release_2.6.0
2
+ LABEL maintainer="Hugging Face"
3
+
4
+ ARG DEBIAN_FRONTEND=noninteractive
5
+
6
+ ARG TORCH_VISION='0.21.0'
7
+ ARG TORCH_AUDIO='2.6.0'
8
+
9
+ RUN apt update && \
10
+ apt install -y --no-install-recommends git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-dev python3-pip ffmpeg git-lfs && \
11
+ apt clean && \
12
+ rm -rf /var/lib/apt/lists/*
13
+
14
+ RUN git lfs install
15
+
16
+ RUN python3 -m pip install --no-cache-dir --upgrade pip numpy
17
+ RUN python3 -m pip install --no-cache-dir --upgrade importlib-metadata setuptools ninja git+https://github.com/facebookresearch/detectron2.git pytesseract "itsdangerous<2.1.0"
18
+
19
+ ARG REF=main
20
+ WORKDIR /
21
+
22
+ # Invalidate docker cache from here if new commit is available.
23
+ ADD https://api.github.com/repos/huggingface/transformers/git/refs/heads/main version.json
24
+ RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
25
+
26
+ RUN python3 -m pip install --no-cache-dir torchvision==$TORCH_VISION torchaudio==$TORCH_AUDIO
27
+ RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch,testing,video]
28
+
29
+ RUN python3 -m pip uninstall -y tensorflow flax
30
+
31
+ # When installing in editable mode, `transformers` is not recognized as a package.
32
+ # this line must be added in order for python to be aware of transformers.
33
+ RUN cd transformers && python3 setup.py develop
34
+
35
+ # Remove nvml and nvidia-ml-py as it is not compatible with ROCm. apex is not tested on NVIDIA either.
36
+ RUN python3 -m pip uninstall py3nvml pynvml nvidia-ml-py apex -y
37
+
38
+ # `kernels` may cause many failing tests
39
+ RUN python3 -m pip uninstall -y kernels
transformers/docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile ADDED
@@ -0,0 +1,53 @@
1
+ FROM rocm/dev-ubuntu-22.04:6.2.4
2
+ LABEL maintainer="Hugging Face"
3
+
4
+ ARG DEBIAN_FRONTEND=noninteractive
5
+ ARG PYTORCH='2.6.0'
6
+ ARG TORCH_VISION='0.21.0'
7
+ ARG TORCH_AUDIO='2.6.0'
8
+ ARG ROCM='6.2.4'
9
+
10
+ RUN apt update && \
11
+ apt install -y --no-install-recommends \
12
+ libaio-dev \
13
+ git \
14
+ # These are required to build deepspeed.
15
+ python3-dev \
16
+ python-is-python3 \
17
+ rocrand-dev \
18
+ rocthrust-dev \
19
+ rocblas-dev \
20
+ hipsolver-dev \
21
+ hipsparse-dev \
22
+ hipblas-dev \
23
+ hipblaslt-dev && \
24
+ apt clean && \
25
+ rm -rf /var/lib/apt/lists/*
26
+
27
+ RUN python3 -m pip install --no-cache-dir --upgrade pip ninja "pydantic>=2.0.0"
28
+ RUN python3 -m pip uninstall -y apex torch torchvision torchaudio
29
+ RUN python3 -m pip install torch==$PYTORCH torchvision==$TORCH_VISION torchaudio==$TORCH_AUDIO --index-url https://download.pytorch.org/whl/rocm$ROCM --no-cache-dir
30
+
31
+ # Pre-build DeepSpeed, so it's ready for testing (to avoid timeout)
32
+ RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache-dir -v --disable-pip-version-check 2>&1
33
+
34
+ ARG REF=main
35
+ WORKDIR /
36
+
37
+ # Invalidate docker cache from here if new commit is available.
38
+ ADD https://api.github.com/repos/huggingface/transformers/git/refs/heads/main version.json
39
+ RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
40
+
41
+ RUN python3 -m pip install --no-cache-dir ./transformers[accelerate,testing,sentencepiece,sklearn]
42
+
43
+ # When installing in editable mode, `transformers` is not recognized as a package.
44
+ # this line must be added in order for python to be aware of transformers.
45
+ RUN cd transformers && python3 setup.py develop
46
+
47
+ RUN python3 -c "from deepspeed.launcher.runner import main"
48
+
49
+ # Remove nvml as it is not compatible with ROCm
50
+ RUN python3 -m pip uninstall py3nvml pynvml nvidia-ml-py apex -y
51
+
52
+ # `kernels` may cause many failing tests
53
+ RUN python3 -m pip uninstall -y kernels
transformers/docker/transformers-pytorch-deepspeed-latest-gpu/Dockerfile ADDED
@@ -0,0 +1,57 @@
1
+ # https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-08.html
2
+ FROM nvcr.io/nvidia/pytorch:24.08-py3
3
+ LABEL maintainer="Hugging Face"
4
+
5
+ ARG DEBIAN_FRONTEND=noninteractive
6
+
7
+ ARG PYTORCH='2.7.1'
8
+ # Example: `cu102`, `cu113`, etc.
9
+ ARG CUDA='cu126'
10
+
11
+ RUN apt -y update
12
+ RUN apt install -y libaio-dev
13
+ RUN python3 -m pip install --no-cache-dir --upgrade pip
14
+
15
+ ARG REF=main
16
+ RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
17
+
18
+ # `datasets` requires pandas, and pandas has some modules compiled with numpy=1.x, causing errors
19
+ RUN python3 -m pip install --no-cache-dir './transformers[deepspeed-testing]' 'pandas<2' 'numpy<2'
20
+
21
+ # Install latest release PyTorch
22
+ # (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.)
23
+ # (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops)
24
+ RUN python3 -m pip uninstall -y torch torchvision torchaudio && python3 -m pip install --no-cache-dir -U torch==$PYTORCH torchvision torchaudio torchcodec --extra-index-url https://download.pytorch.org/whl/$CUDA
25
+
26
+ RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
27
+
28
+ # Uninstall `transformer-engine` shipped with the base image
29
+ RUN python3 -m pip uninstall -y transformer-engine
30
+
31
+ # Uninstall `torch-tensorrt` shipped with the base image
32
+ RUN python3 -m pip uninstall -y torch-tensorrt
33
+
34
+ # recompile apex
35
+ RUN python3 -m pip uninstall -y apex
36
+ # RUN git clone https://github.com/NVIDIA/apex
37
+ # `MAX_JOBS=1` disables parallel building to avoid cpu memory OOM when building image on GitHub Action (standard) runners
38
+ # TODO: check if there is alternative way to install latest apex
39
+ # RUN cd apex && MAX_JOBS=1 python3 -m pip install --global-option="--cpp_ext" --global-option="--cuda_ext" --no-cache -v --disable-pip-version-check .
40
+
41
+ # Pre-build **latest** DeepSpeed, so it would be ready for testing (otherwise, the 1st deepspeed test will timeout)
42
+ RUN python3 -m pip uninstall -y deepspeed
43
+ # This has to be run (again) inside the GPU VMs running the tests.
44
+ # The installation works here, but some tests fail if we don't pre-build deepspeed again in the VMs running the tests.
45
+ # TODO: Find out why tests fail.
46
+ RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1
47
+
48
+ # `kernels` may give different outputs (within 1e-5 range) even with the same model (weights) and the same inputs
49
+ RUN python3 -m pip uninstall -y kernels
50
+
51
+ # When installing in editable mode, `transformers` is not recognized as a package.
52
+ # this line must be added in order for python to be aware of transformers.
53
+ RUN cd transformers && python3 setup.py develop
54
+
55
+ # The base image ships with `pydantic==1.8.2` which is not working - i.e. the next command fails
56
+ RUN python3 -m pip install -U --no-cache-dir "pydantic>=2.0.0"
57
+ RUN python3 -c "from deepspeed.launcher.runner import main"
transformers/docker/transformers-pytorch-deepspeed-nightly-gpu/Dockerfile ADDED
@@ -0,0 +1,68 @@
1
+ # https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-23-11.html#rel-23-11
2
+ FROM nvcr.io/nvidia/pytorch:24.08-py3
3
+ LABEL maintainer="Hugging Face"
4
+
5
+ ARG DEBIAN_FRONTEND=noninteractive
6
+
7
+ # Example: `cu102`, `cu113`, etc.
8
+ ARG CUDA='cu126'
9
+
10
+ RUN apt -y update
11
+ RUN apt install -y libaio-dev
12
+ RUN python3 -m pip install --no-cache-dir --upgrade pip
13
+
14
+ ARG REF=main
15
+ RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
16
+
17
+ RUN python3 -m pip uninstall -y torch torchvision torchaudio
18
+
19
+ # Install **nightly** release PyTorch (flag `--pre`)
20
+ # (PyTorch must be installed before pre-compiling any DeepSpeed c++/cuda ops.)
21
+ # (https://www.deepspeed.ai/tutorials/advanced-install/#pre-install-deepspeed-ops)
22
+ RUN python3 -m pip install --no-cache-dir -U --pre torch torchvision torchaudio torchcodec --extra-index-url https://download.pytorch.org/whl/nightly/$CUDA
23
+
24
+ # `datasets` requires pandas, and pandas has some modules compiled with numpy=1.x, causing errors
25
+ RUN python3 -m pip install --no-cache-dir './transformers[deepspeed-testing]' 'pandas<2' 'numpy<2'
26
+
27
+ RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
28
+
29
+ # Uninstall `transformer-engine` shipped with the base image
30
+ RUN python3 -m pip uninstall -y transformer-engine
31
+
32
+ # Uninstall `torch-tensorrt` and `apex` shipped with the base image
33
+ RUN python3 -m pip uninstall -y torch-tensorrt apex
34
+
35
+ # Pre-build **nightly** release of DeepSpeed, so it would be ready for testing (otherwise, the 1st deepspeed test will timeout)
36
+ RUN python3 -m pip uninstall -y deepspeed
37
+ # This has to be run inside the GPU VMs running the tests. (So far, it fails here due to GPU checks during compilation.)
38
+ # Issue: https://github.com/deepspeedai/DeepSpeed/issues/2010
39
+ # RUN git clone https://github.com/deepspeedai/DeepSpeed && cd DeepSpeed && rm -rf build && \
40
+ # DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 DS_BUILD_UTILS=1 python3 -m pip install . --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1
41
+
42
+ ## For `torchdynamo` tests
43
+ ## (see https://github.com/huggingface/transformers/pull/17765)
44
+ #RUN git clone https://github.com/pytorch/functorch
45
+ #RUN python3 -m pip install --no-cache-dir ./functorch[aot]
46
+ #RUN cd functorch && python3 setup.py develop
47
+ #
48
+ #RUN git clone https://github.com/pytorch/torchdynamo
49
+ #RUN python3 -m pip install -r ./torchdynamo/requirements.txt
50
+ #RUN cd torchdynamo && python3 setup.py develop
51
+ #
52
+ ## install TensorRT
53
+ #RUN python3 -m pip install --no-cache-dir -U nvidia-pyindex
54
+ #RUN python3 -m pip install --no-cache-dir -U nvidia-tensorrt==8.2.4.2
55
+ #
56
+ ## install torch_tensorrt (fx path)
57
+ #RUN git clone https://github.com/pytorch/TensorRT.git
58
+ #RUN cd TensorRT/py && python3 setup.py install --fx-only
59
+
60
+ # `kernels` may give different outputs (within 1e-5 range) even with the same model (weights) and the same inputs
61
+ RUN python3 -m pip uninstall -y kernels
62
+
63
+ # When installing in editable mode, `transformers` is not recognized as a package.
64
+ # this line must be added in order for python to be aware of transformers.
65
+ RUN cd transformers && python3 setup.py develop
66
+
67
+ # Disable for now as deepspeed is not installed above. To be enabled once the issue is fixed.
68
+ # RUN python3 -c "from deepspeed.launcher.runner import main"
transformers/docker/transformers-pytorch-gpu/Dockerfile ADDED
@@ -0,0 +1,38 @@
1
+ FROM nvidia/cuda:12.6.0-cudnn-devel-ubuntu22.04
2
+ LABEL maintainer="Hugging Face"
3
+
4
+ ARG DEBIAN_FRONTEND=noninteractive
5
+
6
+ RUN apt update
7
+ RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg
8
+ RUN python3 -m pip install --no-cache-dir --upgrade pip
9
+
10
+ ARG REF=main
11
+ RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
12
+
13
+ # If set to nothing, will install the latest version
14
+ ARG PYTORCH='2.7.1'
15
+ ARG TORCH_VISION=''
16
+ ARG TORCH_AUDIO=''
17
+ # Example: `cu102`, `cu113`, etc.
18
+ ARG CUDA='cu126'
19
+
20
+ RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch,testing,video]
21
+
22
+ # Install torch stuff after ./transformers[dev-torch,testing,video], otherwise torch may be resolved to a previous
23
+ # version.
24
+ RUN [ ${#PYTORCH} -gt 0 ] && VERSION='torch=='$PYTORCH'.*' || VERSION='torch'; python3 -m pip install --no-cache-dir -U $VERSION --extra-index-url https://download.pytorch.org/whl/$CUDA
25
+ RUN [ ${#TORCH_VISION} -gt 0 ] && VERSION='torchvision=='$TORCH_VISION'.*' || VERSION='torchvision'; python3 -m pip install --no-cache-dir -U $VERSION --extra-index-url https://download.pytorch.org/whl/$CUDA
26
+ RUN [ ${#TORCH_AUDIO} -gt 0 ] && VERSION='torchaudio=='$TORCH_AUDIO'.*' || VERSION='torchaudio'; python3 -m pip install --no-cache-dir -U $VERSION --extra-index-url https://download.pytorch.org/whl/$CUDA
27
+
28
+ RUN python3 -m pip uninstall -y tensorflow flax
29
+
30
+ RUN python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git pytesseract
31
+ RUN python3 -m pip install -U "itsdangerous<2.1.0"
32
+
33
+ # `kernels` may give different outputs (within 1e-5 range) even with the same model (weights) and the same inputs
34
+ RUN python3 -m pip uninstall -y kernels
35
+
36
+ # When installing in editable mode, `transformers` is not recognized as a package.
37
+ # this line must be added in order for python to be aware of transformers.
38
+ RUN cd transformers && python3 setup.py develop
transformers/docker/transformers-pytorch-tpu/Dockerfile ADDED
@@ -0,0 +1,65 @@
1
+ FROM google/cloud-sdk:slim
2
+
3
+ # Build args.
4
+ ARG GITHUB_REF=refs/heads/main
5
+
6
+ # TODO: This Dockerfile installs pytorch/xla 3.6 wheels. There are also 3.7
7
+ # wheels available; see below.
8
+ ENV PYTHON_VERSION=3.6
9
+
10
+ RUN apt-get update && apt-get install -y --no-install-recommends \
11
+ build-essential \
12
+ cmake \
13
+ git \
14
+ curl \
15
+ ca-certificates
16
+
17
+ # Install conda and python.
18
+ # NOTE new Conda does not forward the exit status... https://github.com/conda/conda/issues/8385
19
+ RUN curl -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-4.7.12-Linux-x86_64.sh && \
20
+ chmod +x ~/miniconda.sh && \
21
+ ~/miniconda.sh -b && \
22
+ rm ~/miniconda.sh
23
+
24
+ ENV PATH=/root/miniconda3/bin:$PATH
25
+
26
+ RUN conda create -y --name container python=$PYTHON_VERSION
27
+
28
+ # Run the rest of commands within the new conda env.
29
+ # Use absolute path to appease Codefactor.
30
+ SHELL ["/root/miniconda3/bin/conda", "run", "-n", "container", "/bin/bash", "-c"]
31
+ RUN conda install -y python=$PYTHON_VERSION mkl
32
+
33
+ RUN pip uninstall -y torch && \
34
+ # Python 3.7 wheels are available. Replace cp36-cp36m with cp37-cp37m
35
+ gsutil cp 'gs://tpu-pytorch/wheels/torch-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' . && \
36
+ gsutil cp 'gs://tpu-pytorch/wheels/torch_xla-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' . && \
37
+ gsutil cp 'gs://tpu-pytorch/wheels/torchvision-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' . && \
38
+ pip install 'torch-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' && \
39
+ pip install 'torch_xla-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' && \
40
+ pip install 'torchvision-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' && \
41
+ rm 'torch-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' && \
42
+ rm 'torch_xla-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' && \
43
+ rm 'torchvision-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' && \
44
+ apt-get install -y libomp5
45
+
46
+ ENV LD_LIBRARY_PATH=/root/miniconda3/envs/container/lib
47
+
48
+
49
+ # Install huggingface/transformers at the current PR, plus dependencies.
50
+ RUN git clone https://github.com/huggingface/transformers.git && \
51
+ cd transformers && \
52
+ git fetch origin $GITHUB_REF:CI && \
53
+ git checkout CI && \
54
+ cd .. && \
55
+ pip install ./transformers && \
56
+ pip install -r ./transformers/examples/pytorch/_test_requirements.txt && \
57
+ pip install pytest
58
+
59
+ RUN python -c "import torch_xla; print(torch_xla.__version__)"
60
+ RUN python -c "import transformers as trf; print(trf.__version__)"
61
+ RUN conda init bash
62
+ COPY docker-entrypoint.sh /usr/local/bin/
63
+ RUN chmod +x /usr/local/bin/docker-entrypoint.sh
64
+ ENTRYPOINT ["/usr/local/bin/docker-entrypoint.sh"]
65
+ CMD ["bash"]
transformers/docker/transformers-pytorch-tpu/bert-base-cased.jsonnet ADDED
@@ -0,0 +1,38 @@
1
+ local base = import 'templates/base.libsonnet';
2
+ local tpus = import 'templates/tpus.libsonnet';
3
+ local utils = import "templates/utils.libsonnet";
4
+ local volumes = import "templates/volumes.libsonnet";
5
+
6
+ local bertBaseCased = base.BaseTest {
7
+ frameworkPrefix: "hf",
8
+ modelName: "bert-base-cased",
9
+ mode: "example",
10
+ configMaps: [],
11
+
12
+ timeout: 3600, # 1 hour, in seconds
13
+
14
+ image: std.extVar('image'),
15
+ imageTag: std.extVar('image-tag'),
16
+
17
+ tpuSettings+: {
18
+ softwareVersion: "pytorch-nightly",
19
+ },
20
+ accelerator: tpus.v3_8,
21
+
22
+ volumeMap+: {
23
+ datasets: volumes.PersistentVolumeSpec {
24
+ name: "huggingface-cluster-disk",
25
+ mountPath: "/datasets",
26
+ },
27
+ },
28
+ command: utils.scriptCommand(
29
+ |||
30
+ python -m pytest -s transformers/examples/pytorch/test_xla_examples.py -v
31
+ test_exit_code=$?
32
+ echo "\nFinished running commands.\n"
33
+ test $test_exit_code -eq 0
34
+ |||
35
+ ),
36
+ };
37
+
38
+ bertBaseCased.oneshotJob
transformers/docker/transformers-pytorch-tpu/dataset.yaml ADDED
@@ -0,0 +1,32 @@
1
+ apiVersion: v1
2
+ kind: PersistentVolume
3
+ metadata:
4
+ name: huggingface-cluster-disk
5
+ spec:
6
+ storageClassName: ""
7
+ capacity:
8
+ storage: 500Gi
9
+ accessModes:
10
+ - ReadOnlyMany
11
+ claimRef:
12
+ namespace: default
13
+ name: huggingface-cluster-disk-claim
14
+ gcePersistentDisk:
15
+ pdName: huggingface-cluster-disk
16
+ fsType: ext4
17
+ readOnly: true
18
+ ---
19
+ apiVersion: v1
20
+ kind: PersistentVolumeClaim
21
+ metadata:
22
+ name: huggingface-cluster-disk-claim
23
+ spec:
24
+ # Specify "" as the storageClassName so it matches the PersistentVolume's StorageClass.
25
+ # A nil storageClassName value uses the default StorageClass. For details, see
26
+ # https://kubernetes.io/docs/concepts/storage/persistent-volumes/#class-1
27
+ storageClassName: ""
28
+ accessModes:
29
+ - ReadOnlyMany
30
+ resources:
31
+ requests:
32
+ storage: 1Ki
transformers/docker/transformers-pytorch-tpu/docker-entrypoint.sh ADDED
@@ -0,0 +1,8 @@
1
+ #!/bin/bash
2
+ source ~/.bashrc
3
+ echo "running docker-entrypoint.sh"
4
+ conda activate container
5
+ echo $KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS
6
+ echo "printed TPU info"
7
+ export XRT_TPU_CONFIG="tpu_worker;0;${KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS:7}"
8
+ exec "$@"
transformers/docker/transformers-pytorch-xpu/Dockerfile ADDED
@@ -0,0 +1,93 @@
1
+ FROM intel/deep-learning-essentials:2025.1.3-0-devel-ubuntu22.04 AS base
2
+ LABEL maintainer="Hugging Face"
3
+
4
+ SHELL ["/bin/bash", "-c"]
5
+
6
+ ARG PYTHON_VER=3.11
7
+ ENV TORCH_DEVICE_BACKEND_AUTOLOAD=0
8
+ ENV DEBIAN_FRONTEND=noninteractive
9
+
10
+ RUN apt-get remove -y python3.10 && apt-get autoremove -y
11
+ RUN apt-get update && \
12
+ apt-get install -y software-properties-common && \
13
+ add-apt-repository -y ppa:deadsnakes/ppa && \
14
+ apt-get update && \
15
+ apt-get install -y python$PYTHON_VER python$PYTHON_VER-dev python3-pip && \
16
+ ln -sf /usr/bin/python$PYTHON_VER /usr/bin/python3 && \
17
+ ln -sf /usr/bin/python3 /usr/bin/python && \
18
+ apt-get clean && \
19
+ rm -rf /var/lib/apt/lists/*
20
+
21
+ RUN apt-get update && \
22
+ apt-get -y install \
23
+ apt-utils \
24
+ build-essential \
25
+ ca-certificates \
26
+ clinfo \
27
+ curl \
28
+ git \
29
+ git-lfs \
30
+ vim \
31
+ numactl \
32
+ gnupg2 \
33
+ gpg-agent \
34
+ zlib1g-dev \
35
+ rsync \
36
+ sudo \
37
+ libnl-genl-3-200 \
38
+ xpu-smi \
39
+ unzip \
40
+ ffmpeg \
41
+ tesseract-ocr \
42
+ espeak-ng \
43
+ wget \
44
+ ncurses-term && \
45
+ apt-get clean && \
46
+ rm -rf /var/lib/apt/lists/*
47
+
48
+
49
+ RUN apt-get update && \
50
+ apt-get install -y \
51
+ linux-headers-$(uname -r) \
52
+ linux-modules-extra-$(uname -r) \
53
+ flex bison \
54
+ intel-fw-gpu intel-i915-dkms xpu-smi \
55
+ intel-opencl-icd libze-intel-gpu1 libze1 \
56
+ intel-media-va-driver-non-free libmfx-gen1 libvpl2 \
57
+ libegl-mesa0 libegl1-mesa libegl1-mesa-dev libgbm1 libgl1-mesa-dev libgl1-mesa-dri \
58
+ libglapi-mesa libglx-mesa0 libigdgmm12 libxatracker2 mesa-va-drivers \
59
+ mesa-vdpau-drivers mesa-vulkan-drivers va-driver-all vainfo hwinfo clinfo intel-ocloc \
60
+ libigc-dev intel-igc-cm libigdfcl-dev libigfxcmrt-dev libze-dev && \
61
+ apt-get clean && \
62
+ rm -rf /var/lib/apt/lists/*
63
+
64
+ RUN pip install --upgrade pip
65
+ RUN pip install triton==3.3.0
66
+
67
+ RUN pip install torch==2.7.0 torchvision==0.22.0 torchaudio==2.7.0 --index-url https://download.pytorch.org/whl/xpu --no-cache-dir
68
+
69
+ RUN pip install evaluate torchdata pyctcdecode pytesseract decord galore-torch fire scipy scikit-learn sentencepiece sacremoses nltk rouge_score librosa soundfile g2p_en mpi4py requests_mock
70
+ RUN pip install pretty_midi essentia resampy Levenshtein av sacrebleu phonemizer invisible_watermark schedulefree
71
+ RUN pip install gguf hqq compressed_tensors gptqmodel mergekit autoawq deepspeed torchao onnx
72
+ RUN pip install hf_transfer huggingface-hub hf-doc-builder datasets optimum-quanto timm transformers accelerate optimum peft
73
+
74
+ RUN pip install git+https://github.com/linkedin/Liger-Kernel.git --extra-index-url https://download.pytorch.org/whl/test/xpu
75
+
76
+ # install bitsandbytes
77
+ RUN pip install git+https://github.com/bitsandbytes-foundation/bitsandbytes.git
78
+
79
+ ENV OCL_ICD_VENDORS=/etc/OpenCL/vendors
80
+ ENV FI_PROVIDER_PATH=${I_MPI_ROOT}/lib/libfabric/prov:/usr/lib/x86_64-linux-gnu/libfabric
81
+ ENV CCL_ROOT=/usr/local
82
+ ENV CCL_ATL_TRANSPORT=ofi
83
+ ENV I_MPI_ROOT=/usr/local
84
+ ENV CLASSPATH=${I_MPI_ROOT}/lib/mpi.jar
85
+ ENV PATH=${I_MPI_ROOT}/bin/libfabric:${PATH}
86
+ ENV LD_LIBRARY_PATH=${I_MPI_ROOT}/lib/libfabric:${LD_LIBRARY_PATH}
87
+
88
+ RUN touch /entrypoint.sh
89
+ RUN chmod +x /entrypoint.sh
90
+ RUN echo "#!/bin/bash" >> /entrypoint.sh
91
+ RUN echo "source /opt/intel/oneapi/setvars.sh --force && /bin/bash" >> /entrypoint.sh
92
+
93
+ ENTRYPOINT ["/entrypoint.sh"]
transformers/docker/transformers-quantization-latest-gpu/Dockerfile ADDED
@@ -0,0 +1,101 @@
1
+ FROM nvidia/cuda:12.1.1-cudnn8-devel-ubuntu22.04
2
+ LABEL maintainer="Hugging Face"
3
+
4
+ ARG DEBIAN_FRONTEND=noninteractive
5
+
6
+ # Use login shell to read variables from `~/.profile` (to pass dynamically created variables between RUN commands)
7
+ SHELL ["sh", "-lc"]
8
+
9
+ # The following `ARG` are mainly used to specify the versions explicitly & directly in this docker file, and not meant
10
+ # to be used as arguments for docker build (so far).
11
+
12
+ ARG PYTORCH='2.6.0'
13
+ # Example: `cu102`, `cu113`, etc.
14
+ ARG CUDA='cu121'
15
+ # Disable kernel mapping for quantization tests
16
+ ENV DISABLE_KERNEL_MAPPING=1
17
+
18
+ RUN apt update
19
+ RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg
20
+ RUN python3 -m pip install --no-cache-dir --upgrade pip
21
+
22
+ ARG REF=main
23
+ RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
24
+
25
+ RUN [ ${#PYTORCH} -gt 0 ] && VERSION='torch=='$PYTORCH'.*' || VERSION='torch'; echo "export VERSION='$VERSION'" >> ~/.profile
26
+ RUN echo torch=$VERSION
27
+ # `torchvision` and `torchaudio` should be installed along with `torch`, especially for nightly build.
28
+ # Currently, let's just use their latest releases (when `torch` is installed with a release version)
29
+ RUN python3 -m pip install --no-cache-dir -U $VERSION torchvision torchaudio torchcodec --extra-index-url https://download.pytorch.org/whl/$CUDA
30
+
31
+ RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/accelerate@main#egg=accelerate
32
+
33
+ # needed in bnb and awq
34
+ RUN python3 -m pip install --no-cache-dir einops
35
+
36
+ # Add bitsandbytes for mixed int8 testing
37
+ RUN python3 -m pip install --no-cache-dir bitsandbytes
38
+
39
+ # Add gptqmodel for gptq quantization testing, installed from source for pytorch==2.6.0 compatibility
40
+ RUN python3 -m pip install lm_eval
41
+ RUN git clone https://github.com/ModelCloud/GPTQModel.git && cd GPTQModel && pip install -v . --no-build-isolation
42
+
43
+ # Add optimum for gptq quantization testing
44
+ RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/optimum@main#egg=optimum
45
+
46
+ # Add PEFT
47
+ RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/peft@main#egg=peft
48
+
49
+ # Add aqlm for quantization testing
50
+ RUN python3 -m pip install --no-cache-dir aqlm[gpu]==1.0.2
51
+
52
+ # Add vptq for quantization testing
53
+ RUN pip install vptq
54
+
55
+ # Add spqr for quantization testing
56
+ # Commented out for now as no matching distribution was found; we need to reach out to the authors
57
+ # RUN python3 -m pip install --no-cache-dir spqr_quant[gpu]
58
+
59
+ # Add hqq for quantization testing
60
+ RUN python3 -m pip install --no-cache-dir hqq
61
+
62
+ # For GGUF tests
63
+ RUN python3 -m pip install --no-cache-dir gguf
64
+
65
+ # Add autoawq for quantization testing
66
+ # New release v0.2.8
67
+ RUN python3 -m pip install --no-cache-dir autoawq[kernels]
68
+
69
+ # Add quanto for quantization testing
70
+ RUN python3 -m pip install --no-cache-dir optimum-quanto
71
+
72
+ # Add eetq for quantization testing
73
+ RUN git clone https://github.com/NetEase-FuXi/EETQ.git && cd EETQ/ && git submodule update --init --recursive && pip install .
74
+
75
+ # # Add flute-kernel and fast_hadamard_transform for quantization testing
76
+ # # Commented for now as they cause issues with the build
77
+ # # TODO: create a new workflow to test them
78
+ # RUN python3 -m pip install --no-cache-dir flute-kernel==0.4.1
79
+ # RUN python3 -m pip install --no-cache-dir git+https://github.com/Dao-AILab/fast-hadamard-transform.git
80
+
81
+ # Add compressed-tensors for quantization testing
82
+ RUN python3 -m pip install --no-cache-dir compressed-tensors
83
+
84
+ # Add AMD Quark for quantization testing
85
+ RUN python3 -m pip install --no-cache-dir amd-quark
86
+
87
+ # Add AutoRound for quantization testing
88
+ RUN python3 -m pip install --no-cache-dir "auto-round>=0.5.0"
89
+
90
+ # Add transformers in editable mode
91
+ RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch]
92
+
93
+ # `kernels` may give different outputs (within 1e-5 range) even with the same model (weights) and the same inputs
94
+ RUN python3 -m pip uninstall -y kernels
95
+
96
+ # Uninstall flash-attn installed by autoawq; it causes issues here: https://github.com/huggingface/transformers/actions/runs/15915442841/job/44892146131
97
+ RUN python3 -m pip uninstall -y flash-attn
98
+
99
+ # When installing in editable mode, `transformers` is not recognized as a package.
100
+ # this line must be added in order for python to be aware of transformers.
101
+ RUN cd transformers && python3 setup.py develop
transformers/docker/transformers-tensorflow-gpu/Dockerfile ADDED
@@ -0,0 +1,25 @@
1
+ FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu22.04
2
+ LABEL maintainer="Hugging Face"
3
+
4
+ ARG DEBIAN_FRONTEND=noninteractive
5
+
6
+ RUN apt update
7
+ RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg
8
+ RUN python3 -m pip install --no-cache-dir --upgrade pip
9
+
10
+ ARG REF=main
11
+ RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
12
+ RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-tensorflow,testing]
13
+
14
+ # If set to nothing, will install the latest version
15
+ ARG TENSORFLOW='2.13'
16
+
17
+ RUN [ ${#TENSORFLOW} -gt 0 ] && VERSION='tensorflow=='$TENSORFLOW'.*' || VERSION='tensorflow'; python3 -m pip install --no-cache-dir -U $VERSION
18
+ RUN python3 -m pip uninstall -y torch flax
19
+ RUN python3 -m pip install -U "itsdangerous<2.1.0"
20
+
21
+ RUN python3 -m pip install --no-cache-dir -U "tensorflow_probability<0.22"
22
+
23
+ # When installing in editable mode, `transformers` is not recognized as a package.
24
+ # this line must be added in order for python to be aware of transformers.
25
+ RUN cd transformers && python3 setup.py develop
transformers/docs/source/ar/tasks/language_modeling.md ADDED
@@ -0,0 +1,422 @@
1
+ <!--Copyright 2022 The HuggingFace Team. All rights reserved.
2
+ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
3
+ the License. You may obtain a copy of the License at
4
+ http://www.apache.org/licenses/LICENSE-2.0
5
+ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
6
+ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
7
+ specific language governing permissions and limitations under the License.
8
+ ⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
9
+ rendered properly in your Markdown viewer.
10
+ -->
11
+
12
+ # نمذجة اللغة السببية (Causal language modeling)
13
+
14
+ [[open-in-colab]]
15
+
16
+ هناك نوعان من نمذجة اللغة، السببية والمقنعة. يوضح هذا الدليل نمذجة اللغة السببية.
17
+ تُستخدم نماذج اللغة السببية غالبًا لتوليد النص. يمكنك استخدام هذه النماذج للتطبيقات الإبداعية مثل
18
+ اختيار مغامرة النص الخاصة بك أو مساعد ترميز ذكي مثل Copilot أو CodeParrot.
19
+
20
+ <Youtube id="Vpjb1lu0MDk"/>
21
+
22
+ تتنبأ نمذجة اللغة السببية بالرمز التالي في تسلسل من الرموز، ولا يمكن للنموذج سوى الاهتمام بالرموز على
23
+ اليسار. هذا يعني أن النموذج لا يمكنه رؤية الرموز المستقبلية. GPT-2 هو مثال على نموذج اللغة السببية.
24
+
25
+ سيوضح لك هذا الدليل كيفية:
26
+
27
+ 1. ضبط دقيق [DistilRoBERTa](https://huggingface.co/distilbert/distilroberta-base) على مجموعة فرعية [r/askscience](https://www.reddit.com/r/askscience/) من مجموعة بيانات [ELI5](https://huggingface.co/datasets/eli5).
28
+ 2. استخدام النموذج المدرب الخاص بك للاستنتاج.
29
+
30
+ <Tip>
31
+
32
+ لرؤية جميع العمارات ونقاط التحقق المتوافقة مع هذه المهمة، نوصي بالتحقق من [task-page](https://huggingface.co/tasks/text-generation)
33
+
34
+ </Tip>
35
+
36
+ قبل أن تبدأ، تأكد من تثبيت جميع المكتبات الضرورية:
37
+
38
+ ```bash
39
+ pip install transformers datasets evaluate
40
+ ```
41
+
42
+ نحن نشجعك على تسجيل الدخول إلى حساب Hugging Face الخاص بك حتى تتمكن من تحميل ومشاركة نموذجك مع المجتمع. عند المطالبة، أدخل رمزك لتسجيل الدخول:
43
+
44
+ ```py
45
+ >>> from huggingface_hub import notebook_login
46
+
47
+ >>> notebook_login()
48
+ ```
49
+
50
+ ## تحميل مجموعة بيانات ELI5
51
+
52
+ ابدأ بتحميل أول 5000 مثال من [ELI5-Category](https://huggingface.co/datasets/eli5_category) مجموعة البيانات مع مكتبة 🤗 Datasets. سيعطيك هذا فرصة للتجربة والتأكد من أن كل شيء يعمل قبل قضاء المزيد من الوقت في التدريب على مجموعة البيانات الكاملة.
53
+
54
+ ```py
55
+ >>> from datasets import load_dataset
56
+
57
+ >>> eli5 = load_dataset("eli5_category", split="train[:5000]")
58
+ ```
59
+
60
+ قم بتقسيم مجموعة بيانات `train` إلى مجموعتي تدريب واختبار باستخدام الخاصية [`~datasets.Dataset.train_test_split`]:
61
+
62
+ ```py
63
+ >>> eli5 = eli5.train_test_split(test_size=0.2)
64
+ ```
65
+
66
+ ثم ألق نظرة على مثال:
67
+
68
+ ```py
69
+ >>> eli5["train"][0]
70
+ {'q_id': '7h191n',
71
+ 'title': 'What does the tax bill that was passed today mean? How will it affect Americans in each tax bracket?',
72
+ 'selftext': '',
73
+ 'category': 'Economics',
74
+ 'subreddit': 'explainlikeimfive',
75
+ 'answers': {'a_id': ['dqnds8l', 'dqnd1jl', 'dqng3i1', 'dqnku5x'],
76
+ 'text': ["The tax bill is 500 pages long and there were a lot of changes still going on right to the end. It's not just an adjustment to the income tax brackets, it's a whole bunch of changes. As such there is no good answer to your question. The big take aways are: - Big reduction in corporate income tax rate will make large companies very happy. - Pass through rate change will make certain styles of business (law firms, hedge funds) extremely happy - Income tax changes are moderate, and are set to expire (though it's the kind of thing that might just always get re-applied without being made permanent) - People in high tax states (California, New York) lose out, and many of them will end up with their taxes raised.",
77
+ 'None yet. It has to be reconciled with a vastly different house bill and then passed again.',
78
+ 'Also: does this apply to 2017 taxes? Or does it start with 2018 taxes?',
79
+ 'This article explains both the House and senate bills, including the proposed changes to your income taxes based on your income level. URL_0'],
80
+ 'score': [21, 19, 5, 3],
81
+ 'text_urls': [[],
82
+ [],
83
+ [],
84
+ ['https://www.investopedia.com/news/trumps-tax-reform-what-can-be-done/']]},
85
+ 'title_urls': ['url'],
86
+ 'selftext_urls': ['url']}
87
+ ```
88
+
89
+ على الرغم من أن هذا قد يبدو معقدًا، إلا أنك مهتم حقًا بحقل `text`. ما هو رائع حول مهام نمذجة اللغة
90
+ أنت لا تحتاج إلى تسميات (تُعرف أيضًا باسم المهمة غير الخاضعة للإشراف) لأن الكلمة التالية تعمل كتسمية.
91
+
92
+ ## معالجة مسبقة (Preprocess)
93
+
94
+ <Youtube id="ma1TrR7gE7I"/>
95
+
96
+ الخطوة التالية هي تحميل مجزء النص DistilGPT2 لمعالجة حقل `text` الفرعي:
97
+
98
+ ```py
99
+ >>> from transformers import AutoTokenizer
100
+
101
+ >>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilgpt2")
102
+ ```
103
+
104
+ ستلاحظ من المثال أعلاه، الحقل `text` هو في الواقع متداخل داخل `answers`. هذا يعني أنك ستحتاج إلى
105
+ استخراج حقل `text` الفرعي من بنيته المتداخلة باستخدام الدالة [`flatten`](https://huggingface.co/docs/datasets/process#flatten):
106
+
107
+ ```py
108
+ >>> eli5 = eli5.flatten()
109
+ >>> eli5["train"][0]
110
+ {'q_id': '7h191n',
111
+ 'title': 'What does the tax bill that was passed today mean? How will it affect Americans in each tax bracket?',
112
+ 'selftext': '',
113
+ 'category': 'Economics',
114
+ 'subreddit': 'explainlikeimfive',
115
+ 'answers.a_id': ['dqnds8l', 'dqnd1jl', 'dqng3i1', 'dqnku5x'],
116
+ 'answers.text': ["The tax bill is 500 pages long and there were a lot of changes still going on right to the end. It's not just an adjustment to the income tax brackets, it's a whole bunch of changes. As such there is no good answer to your question. The big take aways are: - Big reduction in corporate income tax rate will make large companies very happy. - Pass through rate change will make certain styles of business (law firms, hedge funds) extremely happy - Income tax changes are moderate, and are set to expire (though it's the kind of thing that might just always get re-applied without being made permanent) - People in high tax states (California, New York) lose out, and many of them will end up with their taxes raised.",
117
+ 'None yet. It has to be reconciled with a vastly different house bill and then passed again.',
118
+ 'Also: does this apply to 2017 taxes? Or does it start with 2018 taxes?',
119
+ 'This article explains both the House and senate bills, including the proposed changes to your income taxes based on your income level. URL_0'],
120
+ 'answers.score': [21, 19, 5, 3],
121
+ 'answers.text_urls': [[],
122
+ [],
123
+ [],
124
+ ['https://www.investopedia.com/news/trumps-tax-reform-what-can-be-done/']],
125
+ 'title_urls': ['url'],
126
+ 'selftext_urls': ['url']}
127
+ ```
128
+
129
+ كل حقل فرعي هو الآن عموداً منفصلاً مسبوقاً بـ `answers`، وحقل `text` هو قائمة الآن. بدلاً من ذلك
130
+ من تجزائة نص كل جملة بشكل منفصل، قم بتحويل القائمة إلى سلسلة حتى تتمكن من تجزئة نصها بشكل مجمّع.
131
+
132
+ هنا أول دالة معالجة مسبقة لدمج قائمة السلاسل لكل مثال ومجزىء النتيجة:
133
+
134
+ ```py
135
+ >>> def preprocess_function(examples):
136
+ ... return tokenizer([" ".join(x) for x in examples["answers.text"]])
137
+ ```
138
+
139
+ لتطبيق دالة المعالجة المسبقة هذه على مجموعة البيانات بأكملها، استخدم الدالة 🤗 Datasets [`~datasets.Dataset.map`]. يمكنك تسريع هذه العملية `map` عن طريق تعيين `batched=True` لمعالجة عناصر متعددة من مجموعة البيانات في وقت واحد، وزيادة عدد العمليات مع `num_proc`. احذف أي أعمدة لا تحتاجها:
140
+
141
+ ```py
142
+ >>> tokenized_eli5 = eli5.map(
143
+ ... preprocess_function,
144
+ ... batched=True,
145
+ ... num_proc=4,
146
+ ... remove_columns=eli5["train"].column_names,
147
+ ... )
148
+ ```
149
+
150
+ تحتوي هذه المجموعة من البيانات على تسلسلات الرموز، ولكن بعضها أطول من الطول الأقصى للمدخلات للنموذج.
151
+
152
+ يمكنك الآن استخدام دالة ما قبل المعالجة ثانية لـ:
153
+
154
+ - تجميع كل التسلسلات.
155
+ - تقسيم التسلسلات المجمّعة إلى أجزاء أقصر محددة، بحجم `block_size`، والتي يجب أن تكون أقصر من الطول الأقصى للمدخلات ومناسبة لذاكرة GPU.
156
+
157
+ ```py
158
+ >>> block_size = 128
159
+
160
+ >>> def group_texts(examples):
161
+ ... # ربط جميع النصوص.
162
+ ... concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
163
+ ... total_length = len(concatenated_examples[list(examples.keys())[0]])
164
+ ... # نتجاهل الباقي الصغير، يمكننا إضافة الحشو إذا كان النموذج يدعمه بدلاً من هذا الإسقاط، يمكنك
165
+ ... # تخصيص هذا الجزء حسب احتياجاتك.
166
+ ... if total_length >= block_size:
167
+ ... total_length = (total_length // block_size) * block_size
168
+ ... # التقسيم إلى أجزاء بحجم block_size.
169
+ ... result = {
170
+ ... k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
171
+ ... for k, t in concatenated_examples.items()
172
+ ... }
173
+ ... result["labels"] = result["input_ids"].copy()
174
+ ... return result
175
+ ```
176
+
177
+ طبق دالة `group_texts` على كامل المجموعة من البيانات:
178
+
179
+ ```py
180
+ >>> lm_dataset = tokenized_eli5.map(group_texts, batched=True, num_proc=4)
181
+ ```
182
+
183
+ الآن قم بإنشاء دفعة من الأمثلة باستخدام [`DataCollatorForLanguageModeling`]. من الأفضل أن تقوم بـ *الحشو الديناميكي* للجمل إلى الطول الأطول في الدفعة أثناء التجميع، بدلاً من حشو كامل المجموعة من البيانات إلى الطول الأقصى.
184
+
185
+ <frameworkcontent>
186
+ <pt>
187
+ استخدم رمز نهاية التسلسل كرمز للحشو، وحدد `mlm_probability` لحجب الرموز بشكل عشوائي عند كل تكرار للبيانات:
188
+
189
+ ```py
190
+ >>> from transformers import DataCollatorForLanguageModeling
191
+
192
+ >>> tokenizer.pad_token = tokenizer.eos_token
193
+ >>> data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
194
+ ```
195
+
196
+ </pt>
197
+ <tf>
198
+ استخدم رمز نهاية التسلسل كرمز للحشو، وحدد `mlm_probability` لحجب الرموز بشكل عشوائي عند كل تكرار للبيانات:
199
+
200
+ ```py
201
+ >>> from transformers import DataCollatorForLanguageModeling
202
+
203
+ >>> data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False, return_tensors="tf")
204
+ ```
205
+
206
+ </tf>
207
+ </frameworkcontent>
208
+
209
+ ## التدريب (Train)
210
+
211
+ <frameworkcontent>
212
+ <pt>
213
+
214
+ <Tip>
215
+
216
+ إذا لم تكن على دراية بتدريب نموذج باستخدام [`Trainer`], اطلع على [البرنامج التعليمي الأساسي](../training#train-with-pytorch-trainer)!
217
+
218
+ </Tip>
219
+
220
+ أنت جاهز الآن لبدء تدريب نموذجك! قم بتحميل DistilGPT2 باستخدام [`AutoModelForCausalLM`]:
221
+
222
+ ```py
223
+ >>> from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
224
+
225
+ >>> model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2")
226
+ ```
227
+
228
+ في هذه المرحلة، تبقى ثلاث خطوات فقط:
229
+
230
+ 1. حدد معلمات التدريب الخاصة بك في [`TrainingArguments`]. المعامل الوحيد المطلوب هو `output_dir` الذي يحدد أين سيتم حفظ نموذجك. ستقوم بدفع هذا النموذج إلى Hub بتحديد `push_to_hub=True` (يجب أن تكون مسجلاً الدخول إلى Hugging Face لتحميل نموذجك).
231
+ 2. قم بتمرير معاملات التدريب إلى [`Trainer`] إلى جانب النموذج، والمجموعات من البيانات، ومجمّع البيانات.
232
+ 3. قم باستدعاء [`~Trainer.train`] لتدريب نموذجك.
233
+
234
+ ```py
235
+ >>> training_args = TrainingArguments(
236
+ ... output_dir="my_awesome_eli5_clm-model",
237
+ ... eval_strategy="epoch",
238
+ ... learning_rate=2e-5,
239
+ ... weight_decay=0.01,
240
+ ... push_to_hub=True,
241
+ ... )
242
+
243
+ >>> trainer = Trainer(
244
+ ... model=model,
245
+ ... args=training_args,
246
+ ... train_dataset=lm_dataset["train"],
247
+ ... eval_dataset=lm_dataset["test"],
248
+ ... data_collator=data_collator,
249
+ ... tokenizer=tokenizer,
250
+ ... )
251
+
252
+ >>> trainer.train()
253
+ ```
254
+
255
+ بمجرد اكتمال التدريب، استخدم طريقة [`~transformers.Trainer.evaluate`] لتقييم نموذجك والحصول على احتمالية الارتباك:
256
+
257
+ ```py
258
+ >>> import math
259
+
260
+ >>> eval_results = trainer.evaluate()
261
+ >>> print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")
262
+ Perplexity: 49.61
263
+ ```
264
+
265
+ ثم شارك نموذجك على Hub باستخدام طريقة [`~transformers.Trainer.push_to_hub`] حتى يتمكن الجميع من استخدام نموذجك:
266
+
267
+ ```py
268
+ >>> trainer.push_to_hub()
269
+ ```
270
+ </pt>
271
+ <tf>
272
+ <Tip>
273
+
274
+ إذا لم تكن على دراية بتدريب نموذج باستخدام Keras، اطلع على [البرنامج التعليمي الأساسي](../training#train-a-tensorflow-model-with-keras)!
275
+
276
+ </Tip>
277
+ لتدريب نموذج في TensorFlow، ابدأ بإعداد دالة المحسن، وجدول معدل التعلم، وبعض معاملات التدريب:
278
+
279
+ ```py
280
+ >>> from transformers import create_optimizer, AdamWeightDecay
281
+
282
+ >>> optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01)
283
+ ```
284
+
285
+ ثم يمكنك تحميل DistilGPT2 باستخدام [`TFAutoModelForCausalLM`]:
286
+
287
+ ```py
288
+ >>> from transformers import TFAutoModelForCausalLM
289
+
290
+ >>> model = TFAutoModelForCausalLM.from_pretrained("distilbert/distilgpt2")
291
+ ```
292
+
293
+ حول مجموعات بياناتك إلى تنسيق `tf.data.Dataset` باستخدام [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]:
294
+
295
+ ```py
296
+ >>> tf_train_set = model.prepare_tf_dataset(
297
+ ... lm_dataset["train"],
298
+ ... shuffle=True,
299
+ ... batch_size=16,
300
+ ... collate_fn=data_collator,
301
+ ... )
302
+
303
+ >>> tf_test_set = model.prepare_tf_dataset(
304
+ ... lm_dataset["test"],
305
+ ... shuffle=False,
306
+ ... batch_size=16,
307
+ ... collate_fn=data_collator,
308
+ ... )
309
+ ```
310
+
311
+ قم بتهيئة النموذج للتدريب باستخدام [`compile`](https://keras.io/api/models/model_training_apis/#compile-method). لاحظ أن جميع نماذج Transformers لديها دالة خسارة ذات صلة بالمهمة الافتراضية، لذلك لا تحتاج إلى تحديد واحدة ما لم ترغب في ذلك:
312
+
313
+ ```py
314
+ >>> import tensorflow as tf
315
+
316
+ >>> model.compile(optimizer=optimizer) # لا توجد وسيطة للخسارة!
317
+ ```
318
+
319
+ آخر ما يجب إعداده قبل بدء التدريب هو توفير طريقة لدفع نموذجك إلى Hub. يمكن القيام بذلك عن طريق تحديد مكان دفع نموذجك والمُجزئ اللغوي في [`~transformers.PushToHubCallback`]:
320
+
321
+ ```py
322
+ >>> from transformers.keras_callbacks import PushToHubCallback
323
+
324
+ >>> callback = PushToHubCallback(
325
+ ... output_dir="my_awesome_eli5_clm-model",
326
+ ... tokenizer=tokenizer,
327
+ ... )
328
+ ```
329
+
330
+ أخيراً، أنت جاهز لبدء تدريب نموذجك! قم باستدعاء [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) مع مجموعات بيانات التدريب والتحقق من الصحة، وعدد العصور، والاستدعاءات (callbacks) الخاصة بك لتدريب النموذج:
331
+
332
+ ```py
333
+ >>> model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=3, callbacks=[callback])
334
+ ```
335
+
336
+ بمجرد اكتمال التدريب، يتم تحميل نموذجك تلقائيًا إلى Hub حتى يتمكن الجميع من استخدامه!
337
+ </tf>
338
+ </frameworkcontent>
339
+
340
+ <Tip>
341
+
342
+ للحصول على مثال أكثر تعمقًا حول كيفية تدريب نموذج للنمذجة اللغوية السببية، اطلع على الدفتر المقابل
343
+ [دفتر PyTorch](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb)
344
+ أو [دفتر TensorFlow](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling-tf.ipynb).
345
+
346
+ </Tip>
347
+
348
+ ## الاستدلال (Inference)
349
+
350
+ رائع، الآن بعد أن قمت بتدريب نموذج، يمكنك استخدامه للاستدلال!
351
+
352
+ فكّر في نص افتتاحي (prompt) تود توليد نص انطلاقاً منه:
353
+
354
+ ```py
355
+ >>> prompt = "Somatic hypermutation allows the immune system to"
356
+ ```
357
+
358
+ أبسط طريقة لتجربة نموذجك المدرب للاستدلال هي استخدامه في [`pipeline`]. قم بتنفيذ `pipeline` لتوليد النص مع نموذجك، ومرر نصك إليه:
359
+
360
+ ```py
361
+ >>> from transformers import pipeline
362
+
363
+ >>> generator = pipeline("text-generation", model="username/my_awesome_eli5_clm-model")
364
+ >>> generator(prompt)
365
+ [{'generated_text': "Somatic hypermutation allows the immune system to be able to effectively reverse the damage caused by an infection.\n\n\nThe damage caused by an infection is caused by the immune system's ability to perform its own self-correcting tasks."}]
366
+ ```
367
+
368
+ <frameworkcontent>
369
+ <pt>
370
+ قم بتجزئة النص وإرجاع `input_ids` كتنسورات PyTorch:
371
+
372
+ ```py
373
+ >>> from transformers import AutoTokenizer
374
+
375
+ >>> tokenizer = AutoTokenizer.from_pretrained("username/my_awesome_eli5_clm-model")
376
+ >>> inputs = tokenizer(prompt, return_tensors="pt").input_ids
377
+ ```
378
+
379
+ استخدم طريقة [`~generation.GenerationMixin.generate`] لتوليد النص.
380
+ للمزيد من التفاصيل حول استراتيجيات توليد النص المختلفة والبارامترات للتحكم في التوليد، راجع صفحة [استراتيجيات توليد النص](../generation_strategies).
381
+
382
+ ```py
383
+ >>> from transformers import AutoModelForCausalLM
384
+
385
+ >>> model = AutoModelForCausalLM.from_pretrained("username/my_awesome_eli5_clm-model")
386
+ >>> outputs = model.generate(inputs, max_new_tokens=100, do_sample=True, top_k=50, top_p=0.95)
387
+ ```
388
+
389
+ فك ترميز الرموز المولدة مرة أخرى إلى نص:
390
+
391
+ ```py
392
+ >>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
393
+ ["Somatic hypermutation allows the immune system to react to drugs with the ability to adapt to a different environmental situation. In other words, a system of 'hypermutation' can help the immune system to adapt to a different environmental situation or in some cases even a single life. In contrast, researchers at the University of Massachusetts-Boston have found that 'hypermutation' is much stronger in mice than in humans but can be found in humans, and that it's not completely unknown to the immune system. A study on how the immune system"]
394
+ ```
395
+ </pt>
396
+ <tf>
397
+ قم بتقسيم النص وإرجاع `input_ids` كـ TensorFlow tensors:
398
+
399
+ ```py
400
+ >>> from transformers import AutoTokenizer
401
+
402
+ >>> tokenizer = AutoTokenizer.from_pretrained("username/my_awesome_eli5_clm-model")
403
+ >>> inputs = tokenizer(prompt, return_tensors="tf").input_ids
404
+ ```
405
+
406
+ استخدم طريقة [`~transformers.generation_tf_utils.TFGenerationMixin.generate`] لتوليد النص. للمزيد من التفاصيل حول استراتيجيات توليد النص المختلفة والبارامترات للتحكم في التوليد، راجع صفحة [استراتيجيات توليد النص](../generation_strategies).
407
+
408
+ ```py
409
+ >>> from transformers import TFAutoModelForCausalLM
410
+
411
+ >>> model = TFAutoModelForCausalLM.from_pretrained("username/my_awesome_eli5_clm-model")
412
+ >>> outputs = model.generate(input_ids=inputs, max_new_tokens=100, do_sample=True, top_k=50, top_p=0.95)
413
+ ```
414
+
415
+ فك ترميز الرموز المولدة مرة أخرى إلى نص:
416
+
417
+ ```py
418
+ >>> tokenizer.batch_decode(outputs, skip_special_tokens=True)
419
+ ['Somatic hypermutation allows the immune system to detect the presence of other viruses as they become more prevalent. Therefore, researchers have identified a high proportion of human viruses. The proportion of virus-associated viruses in our study increases with age. Therefore, we propose a simple algorithm to detect the presence of these new viruses in our samples as a sign of improved immunity. A first study based on this algorithm, which will be published in Science on Friday, aims to show that this finding could translate into the development of a better vaccine that is more effective for']
420
+ ```
421
+ </tf>
422
+ </frameworkcontent>
transformers/docs/source/ar/tasks/masked_language_modeling.md ADDED
@@ -0,0 +1,442 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!--Copyright 2022 The HuggingFace Team. All rights reserved.
2
+ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
3
+ the License. You may obtain a copy of the License at
4
+ http://www.apache.org/licenses/LICENSE-2.0
5
+ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
6
+ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
7
+ specific language governing permissions and limitations under the License.
8
+ ⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
9
+ rendered properly in your Markdown viewer.
10
+ -->
11
+
12
+ # نمذجة اللغة المقنعة (Masked language modeling)
13
+
14
+ [[open-in-colab]]
15
+
16
+ <Youtube id="mqElG5QJWUg"/>
17
+
18
+ تتنبأ نمذجة اللغة المقنعة برمز مقنع في تسلسل، ويمكن للنموذج الانتباه إلى الرموز بشكل ثنائي الاتجاه. هذا
19
+ يعني أن النموذج لديه إمكانية الوصول الكاملة إلى الرموز الموجودة على اليسار واليمين. تعد نمذجة اللغة المقنعة ممتازة للمهام التي
20
+ تتطلب فهمًا سياقيًا جيدًا لتسلسل كامل. BERT هو مثال على نموذج لغة مقنع.
21
+
22
+ سيوضح لك هذا الدليل كيفية:
23
+
24
+ 1. تكييف [DistilRoBERTa](https://huggingface.co/distilbert/distilroberta-base) على مجموعة فرعية [r/askscience](https://www.reddit.com/r/askscience/) من مجموعة بيانات [ELI5](https://huggingface.co/datasets/eli5).
25
+ 2. استخدام نموذج المدرب الخاص بك للاستدلال.
26
+
27
+ <Tip>
28
+
29
+ لمعرفة جميع البنى والنسخ المتوافقة مع هذه المهمة، نوصي بالتحقق من [صفحة المهمة](https://huggingface.co/tasks/fill-mask)
30
+
31
+ </Tip>
32
+
33
+ قبل أن تبدأ، تأكد من تثبيت جميع المكتبات الضرورية:
34
+
35
+ ```bash
36
+ pip install transformers datasets evaluate
37
+ ```
38
+
39
+ نحن نشجعك على تسجيل الدخول إلى حساب Hugging Face الخاص بك حتى تتمكن من تحميل ومشاركة نموذجك مع المجتمع. عندما تتم مطالبتك، أدخل رمزك لتسجيل الدخول:
40
+
41
+ ```py
42
+ >>> from huggingface_hub import notebook_login
43
+
44
+ >>> notebook_login()
45
+ ```
46
+
47
+ ## تحميل مجموعة بيانات ELI5
48
+
49
+ ابدأ بتحميل أول 5000 مثال من مجموعة بيانات [ELI5-Category](https://huggingface.co/datasets/eli5_category) باستخدام مكتبة 🤗 Datasets. سيعطيك هذا فرصة للتجربة والتأكد من أن كل شيء يعمل قبل قضاء المزيد من الوقت في التدريب على مجموعة البيانات الكاملة.
50
+
51
+ ```py
52
+ >>> from datasets import load_dataset
53
+
54
+ >>> eli5 = load_dataset("eli5_category", split="train[:5000]")
55
+ ```
56
+
57
+ قم بتقسيم مجموعة البيانات `train` إلى مجموعتي تدريب واختبار باستخدام الدالة [`~datasets.Dataset.train_test_split`]:
58
+
59
+ ```py
60
+ >>> eli5 = eli5.train_test_split(test_size=0.2)
61
+ ```
62
+
63
+ ثم ألق نظرة على مثال:
64
+
65
+ ```py
66
+ >>> eli5["train"][0]
67
+ {'q_id': '7h191n',
68
+ 'title': 'What does the tax bill that was passed today mean? How will it affect Americans in each tax bracket?',
69
+ 'selftext': '',
70
+ 'category': 'Economics',
71
+ 'subreddit': 'explainlikeimfive',
72
+ 'answers': {'a_id': ['dqnds8l', 'dqnd1jl', 'dqng3i1', 'dqnku5x'],
73
+ 'text': ["The tax bill is 500 pages long and there were a lot of changes still going on right to the end. It's not just an adjustment to the income tax brackets, it's a whole bunch of changes. As such there is no good answer to your question. The big take aways are: - Big reduction in corporate income tax rate will make large companies very happy. - Pass through rate change will make certain styles of business (law firms, hedge funds) extremely happy - Income tax changes are moderate, and are set to expire (though it's the kind of thing that might just always get re-applied without being made permanent) - People in high tax states (California, New York) lose out, and many of them will end up with their taxes raised.",
74
+ 'None yet. It has to be reconciled with a vastly different house bill and then passed again.',
75
+ 'Also: does this apply to 2017 taxes? Or does it start with 2018 taxes?',
76
+ 'This article explains both the House and senate bills, including the proposed changes to your income taxes based on your income level. URL_0'],
77
+ 'score': [21, 19, 5, 3],
78
+ 'text_urls': [[],
79
+ [],
80
+ [],
81
+ ['https://www.investopedia.com/news/trumps-tax-reform-what-can-be-done/']]},
82
+ 'title_urls': ['url'],
83
+ 'selftext_urls': ['url']}
84
+ ```
85
+
86
+ على الرغم من أن هذا قد يبدو كثيرًا، إلا أنك مهتم حقًا بحقل `text`. ما هو رائع حول مهام نمذجة ال��غة هو أنك لا تحتاج إلى تسميات (تُعرف أيضًا باسم المهمة غير الخاضعة للإشراف) لأن الكلمة التالية *هي* التسمية.
87
+
88
+ ## معالجة مسبقة (Preprocess)
89
+
90
+ <Youtube id="8PmhEIXhBvI"/>
91
+
92
+ بالنسبة لنمذجة اللغة المقنعة، فإن الخطوة التالية هي تحميل معالج DistilRoBERTa لمعالجة حقل `text` الفرعي:
93
+
94
+ ```py
95
+ >>> from transformers import AutoTokenizer
96
+
97
+ >>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilroberta-base")
98
+ ```
99
+
100
+ ستلاحظ من المثال أعلاه، أن حقل `text` موجود بالفعل داخل `answers`. هذا يعني أنك ستحتاج إلى استخراج حقل `text` الفرعي من بنيته المضمنة باستخدام الدالة [`flatten`](https://huggingface.co/docs/datasets/process#flatten):
101
+
102
+ ```py
103
+ >>> eli5 = eli5.flatten()
104
+ >>> eli5["train"][0]
105
+ {'q_id': '7h191n',
106
+ 'title': 'What does the tax bill that was passed today mean? How will it affect Americans in each tax bracket?',
107
+ 'selftext': '',
108
+ 'category': 'Economics',
109
+ 'subreddit': 'explainlikeimfive',
110
+ 'answers.a_id': ['dqnds8l', 'dqnd1jl', 'dqng3i1', 'dqnku5x'],
111
+ 'answers.text': ["The tax bill is 500 pages long and there were a lot of changes still going on right to the end. It's not just an adjustment to the income tax brackets, it's a whole bunch of changes. As such there is no good answer to your question. The big take aways are: - Big reduction in corporate income tax rate will make large companies very happy. - Pass through rate change will make certain styles of business (law firms, hedge funds) extremely happy - Income tax changes are moderate, and are set to expire (though it's the kind of thing that might just always get re-applied without being made permanent) - People in high tax states (California, New York) lose out, and many of them will end up with their taxes raised.",
112
+ 'None yet. It has to be reconciled with a vastly different house bill and then passed again.',
113
+ 'Also: does this apply to 2017 taxes? Or does it start with 2018 taxes?',
114
+ 'This article explains both the House and senate bills, including the proposed changes to your income taxes based on your income level. URL_0'],
115
+ 'answers.score': [21, 19, 5, 3],
116
+ 'answers.text_urls': [[],
117
+ [],
118
+ [],
119
+ ['https://www.investopedia.com/news/trumps-tax-reform-what-can-be-done/']],
120
+ 'title_urls': ['url'],
121
+ 'selftext_urls': ['url']}
122
+ ```
123
+
124
+ كل حقل فرعي هو الآن عمود منفصل كما هو موضح بواسطة بادئة `answers`، وحقل `text` هو قائمة الآن. بدلاً من
125
+ معالجة كل جملة بشكل منفصل، قم بتحويل القائمة إلى سلسلة حتى تتمكن من معالجتها بشكل مشترك.
126
+
127
+ هنا أول دالة معالجة مسبقة لربط قائمة السلاسل لكل مثال ومعالجة النتيجة:
128
+
129
+ ```py
130
+ >>> def preprocess_function(examples):
131
+ ... return tokenizer([" ".join(x) for x in examples["answers.text"]])
132
+ ```
133
+
134
+ لتطبيق دالة المعالجة المسبقة على مجموعة البيانات بأكملها، استخدم الدالة 🤗 Datasets [`~datasets.Dataset.map`]. يمكنك تسريع دالة `map` عن طريق تعيين `batched=True` لمعالجة عدة عناصر في وقت واحد، وزيادة عدد العمليات باستخدام `num_proc`. احذف أي أعمدة غير ضرورية:
135
+
136
+ ```py
137
+ >>> tokenized_eli5 = eli5.map(
138
+ ... preprocess_function,
139
+ ... batched=True,
140
+ ... num_proc=4,
141
+ ... remove_columns=eli5["train"].column_names,
142
+ ... )
143
+ ```
144
+
145
+
146
+ تحتوي مجموعة البيانات هذه على تسلسلات رمزية، ولكن بعضها أطول من الطول الأقصى للمدخلات للنموذج.
147
+
148
+ يمكنك الآن استخدام دالة معالجة مسبقة ثانية لـ:
149
+ - تجميع جميع التسلسلات
150
+ - تقسيم التسلسلات المجمّعة إلى أجزاء أقصر محددة بـ `block_size`، والتي يجب أن تكون أقصر من الحد الأقصى لطول المدخلات ومناسبة لذاكرة GPU.
151
+
152
+ ```py
153
+ >>> block_size = 128
154
+
155
+ >>> def group_texts(examples):
156
+ ... # تجميع جميع النصوص.
157
+ ... concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
158
+ ... total_length = len(concatenated_examples[list(examples.keys())[0]])
159
+ ... # نتجاهل الجزء المتبقي الصغير، يمكننا إضافة الحشو إذا كان النموذج يدعمه بدلاً من هذا الإسقاط، يمكنك
160
+ ... # تخصيص هذا الجزء حسب احتياجاتك.
161
+ ... if total_length >= block_size:
162
+ ... total_length = (total_length // block_size) * block_size
163
+ ... # تقسيمها إلى أجزاء بحجم block_size.
164
+ ... result = {
165
+ ... k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
166
+ ... for k, t in concatenated_examples.items()
167
+ ... }
168
+ ... return result
169
+ ```
170
+
171
+ طبق دالة `group_texts` على مجموعة البيانات بأكملها:
172
+
173
+ ```py
174
+ >>> lm_dataset = tokenized_eli5.map(group_texts, batched=True, num_proc=4)
175
+ ```
176
+
177
+ الآن، قم بإنشاء دفعة من الأمثلة باستخدام [`DataCollatorForLanguageModeling`]. من الأكثر كفاءة أن تقوم بـ *الحشو الديناميكي* ليصل طولها إلى أطول جملة في الدفعة أثناء التجميع، بدلاً من حشو مجموعة البيانات بأكملها إلى الطول الأقصى.
178
+
179
+ <frameworkcontent>
180
+ <pt>
181
+
182
+ استخدم رمز نهاية التسلسل كرمز الحشو وحدد `mlm_probability` لحجب الرموز عشوائياً كل مرة تكرر فيها البيانات:
183
+
184
+ ```py
185
+ >>> from transformers import DataCollatorForLanguageModeling
186
+
187
+ >>> tokenizer.pad_token = tokenizer.eos_token
188
+ >>> data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)
189
+ ```
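+
+ مقتطف توضيحي اختياري (أسماء المتغيرات فيه للتوضيح فقط) لمعاينة تأثير الحجب العشوائي على دفعة صغيرة: يختار المُجمِّع نحو 15% من الرموز (يُستبدل معظمها برمز القناع) ويضع القيمة `-100` في `labels` لجميع المواضع التي لن تُحسب عليها الخسارة:
+
+ ```py
+ >>> # تختلف النتائج بين تشغيل وآخر لأن اختيار الرموز المحجوبة عشوائي
+ >>> features = [lm_dataset["train"][i] for i in range(2)]
+ >>> batch = data_collator(features)
+ >>> int((batch["input_ids"] == tokenizer.mask_token_id).sum())  # عدد رموز القناع في الدفعة
+ >>> int((batch["labels"] != -100).sum())  # عدد المواضع المختارة لحساب الخسارة
+ ```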
190
+ </pt>
191
+ <tf>
192
+
193
+ استخدم رمز نهاية التسلسل كرمز الحشو وحدد `mlm_probability` لحجب الرموز عشوائياً كل مرة تكرر فيها البيانات:
194
+
195
+ ```py
196
+ >>> from transformers import DataCollatorForLanguageModeling
197
+
198
+ >>> data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15, return_tensors="tf")
199
+ ```
200
+ </tf>
201
+ </frameworkcontent>
202
+
203
+ ## التدريب (Train)
204
+
205
+ <frameworkcontent>
206
+ <pt>
207
+
208
+ <Tip>
209
+
210
+ إذا لم تكن على دراية بتعديل نموذج باستخدام [`Trainer`], ألق نظرة على الدليل الأساسي [هنا](../training#train-with-pytorch-trainer)!
211
+
212
+ </Tip>
213
+
214
+ أنت مستعد الآن لبدء تدريب نموذجك! قم بتحميل DistilRoBERTa باستخدام [`AutoModelForMaskedLM`]:
215
+
216
+ ```py
217
+ >>> from transformers import AutoModelForMaskedLM, TrainingArguments, Trainer
218
+
219
+ >>> model = AutoModelForMaskedLM.from_pretrained("distilbert/distilroberta-base")
220
+ ```
221
+
222
+ في هذه المرحلة، تبقى ثلاث خطوات فقط:
223
+
224
+ 1. حدد معلمات التدريب الخاصة بك في [`TrainingArguments`]. المعلمة الوحيدة المطلوبة هي `output_dir` والتي تحدد مكان حفظ نموذجك. ستقوم بدفع هذا النموذج إلى Hub عن طريق تعيين `push_to_hub=True` (يجب أن تكون مسجلاً الدخول إلى Hugging Face لتحميل نموذجك).
225
+ 2. قم بتمرير معلمات التدريب إلى [`Trainer`] مع النموذج، ومجموعات البيانات، ومجمّع البيانات.
226
+ 3. قم باستدعاء [`~Trainer.train`] لتعديل نموذجك.
227
+
228
+ ```py
229
+ >>> training_args = TrainingArguments(
230
+ ... output_dir="my_awesome_eli5_mlm_model",
231
+ ... eval_strategy="epoch",
232
+ ... learning_rate=2e-5,
233
+ ... num_train_epochs=3,
234
+ ... weight_decay=0.01,
235
+ ... push_to_hub=True,
236
+ ... )
237
+
238
+ >>> trainer = Trainer(
239
+ ... model=model,
240
+ ... args=training_args,
241
+ ... train_dataset=lm_dataset["train"],
242
+ ... eval_dataset=lm_dataset["test"],
243
+ ... data_collator=data_collator,
244
+ ... tokenizer=tokenizer,
245
+ ... )
246
+
247
+ >>> trainer.train()
248
+ ```
249
+
250
+ بمجرد اكتمال التدريب، استخدم طريقة [`~transformers.Trainer.evaluate`] لتقييم النموذج والحصول على مقياس
251
+ الحيرة:
252
+
253
+ ```py
254
+ >>> import math
255
+
256
+ >>> eval_results = trainer.evaluate()
257
+ >>> print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")
258
+ Perplexity: 8.76
259
+ ```
260
+
261
+ ثم شارك نموذجك على Hub باستخدام طريقة [`~transformers.Trainer.push_to_hub`] حتى يتمكن الجميع من استخدام نموذجك:
262
+
263
+ ```py
264
+ >>> trainer.push_to_hub()
265
+ ```
266
+ </pt>
267
+ <tf>
268
+ <Tip>
269
+
270
+ إذا لم تكن على دراية بتعديل نموذج باستخدام Keras، ألق نظرة على الدليل الأساسي [هنا](../training#train-a-tensorflow-model-with-keras)!
271
+
272
+ </Tip>
273
+ لتعديل نموذج في TensorFlow، ابدأ بإعداد دالة محسن، وجدول معدل التعلم، وبعض معلمات التدريب:
274
+
275
+ ```py
276
+ >>> from transformers import create_optimizer, AdamWeightDecay
277
+
278
+ >>> optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01)
279
+ ```
280
+
281
+ ثم يمكنك تحميل DistilRoBERTa باستخدام [`TFAutoModelForMaskedLM`]:
282
+
283
+ ```py
284
+ >>> from transformers import TFAutoModelForMaskedLM
285
+
286
+ >>> model = TFAutoModelForMaskedLM.from_pretrained("distilbert/distilroberta-base")
287
+ ```
288
+
289
+ قم بتحويل مجموعات بياناتك إلى تنسيق `tf.data.Dataset` باستخدام [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]:
290
+
291
+ ```py
292
+ >>> tf_train_set = model.prepare_tf_dataset(
293
+ ... lm_dataset["train"],
294
+ ... shuffle=True,
295
+ ... batch_size=16,
296
+ ... collate_fn=data_collator,
297
+ ... )
298
+
299
+ >>> tf_test_set = model.prepare_tf_dataset(
300
+ ... lm_dataset["test"],
301
+ ... shuffle=False,
302
+ ... batch_size=16,
303
+ ... collate_fn=data_collator,
304
+ ... )
305
+ ```
306
+
307
+ قم بتهيئة النموذج لل��دريب باستخدام [`compile`](https://keras.io/api/models/model_training_apis/#compile-method). لاحظ أن نماذج Transformers لديها جميعها دالة خسارة افتراضية ذات صلة بالمهمة، لذلك لا تحتاج إلى تحديد واحدة ما لم تكن تريد ذلك:
308
+
309
+ ```py
310
+ >>> import tensorflow as tf
311
+
312
+ >>> model.compile(optimizer=optimizer) # لا توجد حجة للخسارة!
313
+ ```
314
+
315
+ آخر ما يجب إعداده قبل بدء التدريب هو توفير طريقة لدفع نموذجك إلى Hub. يمكن القيام بذلك عن طريق تحديد مكان دفع نموذجك ومعالج الرموز في [`~transformers.PushToHubCallback`]:
316
+
317
+ ```py
318
+ >>> from transformers.keras_callbacks import PushToHubCallback
319
+
320
+ >>> callback = PushToHubCallback(
321
+ ... output_dir="my_awesome_eli5_mlm_model",
322
+ ... tokenizer=tokenizer,
323
+ ... )
324
+ ```
325
+
326
+ أخيراً، أنت مستعد لبدء تدريب نموذجك! قم باستدعاء [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) مع مجموعات بيانات التدريب والتحقق، وعدد العصور، والاستدعاءات (callbacks) الخاصة بك لتعديل النموذج:
327
+
328
+ ```py
329
+ >>> model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=3, callbacks=[callback])
330
+ ```
331
+
332
+ بمجرد اكتمال التدريب، يتم تحميل نموذجك تلقائياً إلى Hub حتى يتمكن الجميع من استخدامه!
333
+ </tf>
334
+ </frameworkcontent>
335
+
336
+ <Tip>
337
+
338
+ لمثال أكثر تفصيلاً حول كيفية تعديل نموذج للنمذجة اللغوية المقنعة، ألق نظرة على الدفتر المقابل
339
+ [دفتر PyTorch](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb)
340
+ أو [دفتر TensorFlow](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling-tf.ipynb).
341
+
342
+ </Tip>
343
+
344
+ ## الاستدلال (Inference)
345
+
346
+ رائع، الآن بعد أن قمت بتعديل نموذج، يمكنك استخدامه للاستدلال!
347
+
348
+ جهّز بعض النصوص التي تريد أن يملأ النموذج الفراغات فيها، واستخدم الرمز الخاص `<mask>` للإشارة إلى الفراغ:
349
+
350
+ ```py
351
+ >>> text = "The Milky Way is a <mask> galaxy."
352
+ ```
353
+
354
+ أبسط طريقة لتجربة نموذجك المعدل للاستدلال هي استخدامه في [`pipeline`]. قم بإنشاء كائن `pipeline` لملء الفراغ مع نموذجك، ومرر نصك إليه. إذا أردت، يمكنك استخدام معلمة `top_k` لتحديد عدد التنبؤات التي تريد إرجاعها:
355
+
356
+ ```py
357
+ >>> from transformers import pipeline
358
+
359
+ >>> mask_filler = pipeline("fill-mask", "username/my_awesome_eli5_mlm_model")
360
+ >>> mask_filler(text, top_k=3)
361
+ [{'score': 0.5150994658470154,
362
+ 'token': 21300,
363
+ 'token_str': ' spiral',
364
+ 'sequence': 'The Milky Way is a spiral galaxy.'},
365
+ {'score': 0.07087188959121704,
366
+ 'token': 2232,
367
+ 'token_str': ' massive',
368
+ 'sequence': 'The Milky Way is a massive galaxy.'},
369
+ {'score': 0.06434620916843414,
370
+ 'token': 650,
371
+ 'token_str': ' small',
372
+ 'sequence': 'The Milky Way is a small galaxy.'}]
373
+ ```
374
+
375
+ <frameworkcontent>
376
+ <pt>
377
+ قم بتجزئة النص وإرجاع `input_ids` كمتجهات PyTorch. ستحتاج أيضًا إلى تحديد موضع رمز `<mask>`:
378
+
379
+ ```py
380
+ >>> import torch
+ >>> from transformers import AutoTokenizer
381
+
382
+ >>> tokenizer = AutoTokenizer.from_pretrained("username/my_awesome_eli5_mlm_model")
383
+ >>> inputs = tokenizer(text, return_tensors="pt")
384
+ >>> mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
385
+ ```
386
+
387
+ قم بتمرير المدخلات إلى النموذج وإرجاع `logits` للرمز المقنع:
388
+
389
+ ```py
390
+ >>> from transformers import AutoModelForMaskedLM
391
+
392
+ >>> model = AutoModelForMaskedLM.from_pretrained("username/my_awesome_eli5_mlm_model")
393
+ >>> logits = model(**inputs).logits
394
+ >>> mask_token_logits = logits[0, mask_token_index, :]
395
+ ```
396
+
397
+ ثم قم بإرجاع الرموز الثلاثة المقنعة ذات الاحتمالية الأعلى وطباعتها:
398
+
399
+ ```py
400
+ >>> top_3_tokens = torch.topk(mask_token_logits, 3, dim=1).indices[0].tolist()
401
+
402
+ >>> for token in top_3_tokens:
403
+ ... print(text.replace(tokenizer.mask_token, tokenizer.decode([token])))
404
+ The Milky Way is a spiral galaxy.
405
+ The Milky Way is a massive galaxy.
406
+ The Milky Way is a small galaxy.
407
+ ```
408
+ </pt>
409
+ <tf>
410
+ قم بتقسيم النص إلى رموز وإرجاع `input_ids` كـ TensorFlow tensors. ستحتاج أيضًا إلى تحديد موضع رمز `<mask>`:
411
+
412
+ ```py
413
+ >>> import tensorflow as tf
+ >>> from transformers import AutoTokenizer
414
+
415
+ >>> tokenizer = AutoTokenizer.from_pretrained("username/my_awesome_eli5_mlm_model")
416
+ >>> inputs = tokenizer(text, return_tensors="tf")
417
+ >>> mask_token_index = tf.where(inputs["input_ids"] == tokenizer.mask_token_id)[0, 1]
418
+ ```
419
+
420
+ قم بتمرير المدخلات إلى النموذج وإرجاع `logits` للرمز المقنع:
421
+
422
+ ```py
423
+ >>> from transformers import TFAutoModelForMaskedLM
424
+
425
+ >>> model = TFAutoModelForMaskedLM.from_pretrained("username/my_awesome_eli5_mlm_model")
426
+ >>> logits = model(**inputs).logits
427
+ >>> mask_token_logits = logits[0, mask_token_index, :]
428
+ ```
429
+
430
+ ثم قم بإرجاع الرموز الثلاثة المقنعة ذات الاحتمالية الأعلى وطباعتها:
431
+
432
+ ```py
433
+ >>> top_3_tokens = tf.math.top_k(mask_token_logits, 3).indices.numpy()
434
+
435
+ >>> for token in top_3_tokens:
436
+ ... print(text.replace(tokenizer.mask_token, tokenizer.decode([token])))
437
+ The Milky Way is a spiral galaxy.
438
+ The Milky Way is a massive galaxy.
439
+ The Milky Way is a small galaxy.
440
+ ```
441
+ </tf>
442
+ </frameworkcontent>
transformers/docs/source/ar/tasks/multiple_choice.md ADDED
@@ -0,0 +1,452 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!--Copyright 2022 The HuggingFace Team. All rights reserved.
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
4
+ the License. You may obtain a copy of the License at
5
+
6
+ http://www.apache.org/licenses/LICENSE-2.0
7
+
8
+ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
9
+ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
10
+ specific language governing permissions and limitations under the License.
11
+
12
+ ⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
13
+ rendered properly in your Markdown viewer.
14
+
15
+ -->
16
+
17
+ # الاختيار من متعدد (Multiple choice)
18
+
19
+ [[open-in-colab]]
20
+
21
+ مهمة الاختيار من متعدد مشابهة لمهمة الإجابة على الأسئلة، ولكن مع توفير عدة إجابات محتملة مع سياق، ويُدرّب النموذج على تحديد الإجابة الصحيحة.
22
+
23
+ سيوضح لك هذا الدليل كيفية:
24
+
25
+ 1. ضبط نموذج [BERT](https://huggingface.co/google-bert/bert-base-uncased) باستخدام الإعداد `regular` لمجموعة بيانات [SWAG](https://huggingface.co/datasets/swag) لاختيار الإجابة الأفضل من بين الخيارات المتعددة المتاحة مع السياق.
26
+ 2. استخدام النموذج المضبوط للاستدلال.
27
+
28
+ قبل البدء، تأكد من تثبيت جميع المكتبات الضرورية:
29
+
30
+ ```bash
31
+ pip install transformers datasets evaluate
32
+ ```
33
+
34
+ نشجعك على تسجيل الدخول إلى حساب Hugging Face الخاص بك حتى تتمكن من تحميل نموذجك ومشاركته مع المجتمع. عند المطالبة، أدخل الرمز المميز الخاص بك لتسجيل الدخول:
35
+
36
+ ```py
37
+ >>> from huggingface_hub import notebook_login
38
+
39
+ >>> notebook_login()
40
+ ```
41
+
42
+ ## تحميل مجموعة بيانات SWAG
43
+
44
+ ابدأ بتحميل تهيئة `regular` لمجموعة بيانات SWAG من مكتبة 🤗 Datasets:
45
+
46
+ ```py
47
+ >>> from datasets import load_dataset
48
+
49
+ >>> swag = load_dataset("swag", "regular")
50
+ ```
51
+
52
+ ثم ألق نظرة على مثال:
53
+
54
+ ```py
55
+ >>> swag["train"][0]
56
+ {'ending0': 'passes by walking down the street playing their instruments.',
57
+ 'ending1': 'has heard approaching them.',
58
+ 'ending2': "arrives and they're outside dancing and asleep.",
59
+ 'ending3': 'turns the lead singer watches the performance.',
60
+ 'fold-ind': '3416',
61
+ 'gold-source': 'gold',
62
+ 'label': 0,
63
+ 'sent1': 'Members of the procession walk down the street holding small horn brass instruments.',
64
+ 'sent2': 'A drum line',
65
+ 'startphrase': 'Members of the procession walk down the street holding small horn brass instruments. A drum line',
66
+ 'video-id': 'anetv_jkn6uvmqwh4'}
67
+ ```
68
+
69
+ على الرغم من أن الحقول تبدو كثيرة، إلا أنها في الواقع بسيطة جداً:
70
+
71
+ - `sent1` و `sent2`: يعرض هذان الحقلان بداية الجملة، وبدمجهما معًا، نحصل على حقل `startphrase`.
72
+ - `ending`: يقترح نهاية محتملة للجملة، واحدة منها فقط هي الصحيحة.
73
+ - `label`: يحدد نهاية الجملة الصحيحة.
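+
+ على سبيل المثال، يمكن تركيب التكملات الأربع المحتملة للمثال السابق يدويًا بدمج `sent2` مع كل نهاية (مقتطف توضيحي فقط؛ تقوم دالة المعالجة المسبقة أدناه بذلك تلقائيًا):
+
+ ```py
+ >>> example = swag["train"][0]
+ >>> endings = [example["ending0"], example["ending1"], example["ending2"], example["ending3"]]
+ >>> candidates = [example["sent2"] + " " + ending for ending in endings]
+ >>> candidates[example["label"]]  # التكملة الصحيحة بحسب حقل `label`
+ 'A drum line passes by walking down the street playing their instruments.'
+ ```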
74
+
75
+ ## المعالجة المسبقة (Preprocess)
76
+
77
+ الخطوة التالية هي استدعاء مُجزئ BERT لمعالجة بدايات الجمل والنهايات الأربع المحتملة:
78
+
79
+ ```py
80
+ >>> from transformers import AutoTokenizer
81
+
82
+ >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
83
+ ```
84
+
85
+ تحتاج دالة المعالجة المسبقة التي تريد إنشاءها إلى:
86
+
87
+ 1. إنشاء أربع نسخ من حقل `sent1` ودمج كل منها مع `sent2` لإعادة إنشاء كيفية بدء الجملة.
88
+ 2. دمج `sent2` مع كل من نهايات الجمل الأربع المحتملة.
89
+ 3. دمج هاتين القائمتين لتتمكن من تجزئتهما معًا، ثم إعادة فك تسطيحهما بعد ذلك بحيث يكون لكل مثال حقول `input_ids` و `attention_mask` و `labels` مقابلة.
90
+
91
+
92
+ ```py
93
+ >>> ending_names = ["ending0", "ending1", "ending2", "ending3"]
94
+
95
+ >>> def preprocess_function(examples):
96
+ ... first_sentences = [[context] * 4 for context in examples["sent1"]]
97
+ ... question_headers = examples["sent2"]
98
+ ... second_sentences = [
99
+ ... [f"{header} {examples[end][i]}" for end in ending_names] for i, header in enumerate(question_headers)
100
+ ... ]
101
+
102
+ ... first_sentences = sum(first_sentences, [])
103
+ ... second_sentences = sum(second_sentences, [])
104
+
105
+ ... tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True)
106
+ ... return {k: [v[i : i + 4] for i in range(0, len(v), 4)] for k, v in tokenized_examples.items()}
107
+ ```
108
+
109
+ لتطبيق دالة المعالجة المسبقة على مجموعة البيانات بأكملها، استخدم طريقة [`~datasets.Dataset.map`] الخاصة بـ 🤗 Datasets. يمكنك تسريع دالة `map` عن طريق تعيين `batched=True` لمعالجة عناصر متعددة من مجموعة البيانات في وقت واحد:
110
+
111
+ ```py
112
+ tokenized_swag = swag.map(preprocess_function, batched=True)
113
+ ```
114
+
115
+ لا يحتوي 🤗 Transformers على مجمع بيانات للاختيار من متعدد، لذلك ستحتاج إلى تكييف [`DataCollatorWithPadding`] لإنشاء دفعة من الأمثلة. من الأكفأ إضافة حشو (padding) ديناميكي للجمل إلى أطول طول في دفعة أثناء التجميع، بدلاً من حشو مجموعة البيانات بأكملها إلى الحد الأقصى للطول.
116
+
117
+ يقوم `DataCollatorForMultipleChoice` بتسطيح جميع مدخلات النموذج، ثم يطبّق الحشو، ثم يعيد النتائج إلى شكلها الأصلي:
118
+
119
+ <frameworkcontent>
120
+ <pt>
121
+
122
+ ```py
123
+ >>> from dataclasses import dataclass
124
+ >>> from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
125
+ >>> from typing import Optional, Union
126
+ >>> import torch
127
+
128
+ >>> @dataclass
129
+ ... class DataCollatorForMultipleChoice:
130
+ ... """
131
+ ... Data collator that will dynamically pad the inputs for multiple choice received.
132
+ ... """
133
+
134
+ ... tokenizer: PreTrainedTokenizerBase
135
+ ... padding: Union[bool, str, PaddingStrategy] = True
136
+ ... max_length: Optional[int] = None
137
+ ... pad_to_multiple_of: Optional[int] = None
138
+
139
+ ... def __call__(self, features):
140
+ ... label_name = "label" if "label" in features[0].keys() else "labels"
141
+ ... labels = [feature.pop(label_name) for feature in features]
142
+ ... batch_size = len(features)
143
+ ... num_choices = len(features[0]["input_ids"])
144
+ ... flattened_features = [
145
+ ... [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
146
+ ... ]
147
+ ... flattened_features = sum(flattened_features, [])
148
+
149
+ ... batch = self.tokenizer.pad(
150
+ ... flattened_features,
151
+ ... padding=self.padding,
152
+ ... max_length=self.max_length,
153
+ ... pad_to_multiple_of=self.pad_to_multiple_of,
154
+ ... return_tensors="pt",
155
+ ... )
156
+
157
+ ... batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
158
+ ... batch["labels"] = torch.tensor(labels, dtype=torch.int64)
159
+ ... return batch
160
+ ```
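+
+ مقتطف توضيحي اختياري لمعاينة ما يُنتجه هذا المُجمِّع على مثالين. عند الاستخدام الفعلي يتكفل [`Trainer`] بإسقاط الأعمدة التي لا يحتاجها النموذج تلقائيًا، أما هنا فنحتفظ يدويًا بأعمدة المُجزئ وحقل `label` فقط (أسماء المتغيرات للتوضيح فقط):
+
+ ```py
+ >>> keep = {"input_ids", "token_type_ids", "attention_mask", "label"}
+ >>> features = [{k: v for k, v in tokenized_swag["train"][i].items() if k in keep} for i in range(2)]
+ >>> batch = DataCollatorForMultipleChoice(tokenizer=tokenizer)(features)
+ >>> batch["input_ids"].shape  # (batch_size, num_choices, sequence_length)
+ ```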
161
+ </pt>
162
+ <tf>
163
+
164
+ ```py
165
+ >>> from dataclasses import dataclass
166
+ >>> from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
167
+ >>> from typing import Optional, Union
168
+ >>> import tensorflow as tf
169
+
170
+ >>> @dataclass
171
+ ... class DataCollatorForMultipleChoice:
172
+ ... """
173
+ ... Data collator that will dynamically pad the inputs for multiple choice received.
174
+ ... """
175
+
176
+ ... tokenizer: PreTrainedTokenizerBase
177
+ ... padding: Union[bool, str, PaddingStrategy] = True
178
+ ... max_length: Optional[int] = None
179
+ ... pad_to_multiple_of: Optional[int] = None
180
+
181
+ ... def __call__(self, features):
182
+ ... label_name = "label" if "label" in features[0].keys() else "labels"
183
+ ... labels = [feature.pop(label_name) for feature in features]
184
+ ... batch_size = len(features)
185
+ ... num_choices = len(features[0]["input_ids"])
186
+ ... flattened_features = [
187
+ ... [{k: v[i] for k, v in feature.items()} for i in range(num_choices)] for feature in features
188
+ ... ]
189
+ ... flattened_features = sum(flattened_features, [])
190
+
191
+ ... batch = self.tokenizer.pad(
192
+ ... flattened_features,
193
+ ... padding=self.padding,
194
+ ... max_length=self.max_length,
195
+ ... pad_to_multiple_of=self.pad_to_multiple_of,
196
+ ... return_tensors="tf",
197
+ ... )
198
+
199
+ ... batch = {k: tf.reshape(v, (batch_size, num_choices, -1)) for k, v in batch.items()}
200
+ ... batch["labels"] = tf.convert_to_tensor(labels, dtype=tf.int64)
201
+ ... return batch
202
+ ```
203
+ </tf>
204
+ </frameworkcontent>
205
+
206
+ ## التقييم (Evaluate)
207
+
208
+ يُفضل غالبًا تضمين مقياس أثناء التدريب لتقييم أداء نموذجك. يمكنك تحميل طريقة تقييم بسرعة باستخدام مكتبة 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index). لهذه المهمة، قم بتحميل مقياس [الدقة](https://huggingface.co/spaces/evaluate-metric/accuracy) (انظر إلى [الجولة السريعة](https://huggingface.co/docs/evaluate/a_quick_tour) لـ 🤗 Evaluate لمعرفة المزيد حول كيفية تحميل المقياس وحسابه):
209
+
210
+ ```py
211
+ >>> import evaluate
212
+
213
+ >>> accuracy = evaluate.load("accuracy")
214
+ ```
215
+
216
+ ثم أنشئ دالة لتمرير التنبؤات والتسميات إلى [`~evaluate.EvaluationModule.compute`] لحساب الدقة:
217
+
218
+ ```py
219
+ >>> import numpy as np
220
+
221
+ >>> def compute_metrics(eval_pred):
222
+ ... predictions, labels = eval_pred
223
+ ... predictions = np.argmax(predictions, axis=1)
224
+ ... return accuracy.compute(predictions=predictions, references=labels)
225
+ ```
226
+
227
+ دالتك `compute_metrics` جاهزة الآن، وستعود إليها عند إعداد تدريبك.
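+
+ مقتطف توضيحي سريع (اختياري) يبيّن سلوك الدالة على logits وهمية بالشكل نفسه الذي يمررها به [`Trainer`] تقريبًا:
+
+ ```py
+ >>> import numpy as np
+
+ >>> dummy_logits = np.array([[0.1, 0.9, 0.0, 0.0], [0.7, 0.1, 0.1, 0.1]])  # (عدد الأمثلة، عدد الخيارات)
+ >>> dummy_labels = np.array([1, 0])
+ >>> compute_metrics((dummy_logits, dummy_labels))
+ {'accuracy': 1.0}
+ ```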
228
+
229
+ ## التدريب (Train)
230
+
231
+ <frameworkcontent>
232
+ <pt>
233
+
234
+ <Tip>
235
+
236
+ إذا لم تكن معتادًا على ضبط نموذج باستخدام [`Trainer`], فراجع الدرس الأساسي [هنا](../training#train-with-pytorch-trainer)!
237
+
238
+ </Tip>
239
+
240
+ أنت جاهز لبدء تدريب نموذجك الآن! قم بتحميل BERT باستخدام [`AutoModelForMultipleChoice`]:
241
+
242
+ ```py
243
+ >>> from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer
244
+
245
+ >>> model = AutoModelForMultipleChoice.from_pretrained("google-bert/bert-base-uncased")
246
+ ```
247
+
248
+ في هذه المرحلة، تبقى ثلاث خطوات فقط:
249
+
250
+ 1. حدد معلمات التدريب الخاصة بك في [`TrainingArguments`]. المعلمة الوحيدة المطلوبة هي `output_dir` التي تحدد مكان حفظ نموذجك. ستدفع هذا النموذج إلى Hub عن طريق تعيين `push_to_hub=True` (يجب عليك تسجيل الدخول إلى Hugging Face لتحميل نموذجك). في نهاية كل حقبة، سيقوم [`Trainer`] بتقييم الدقة وحفظ نقطة فحص التدريب.
251
+ 2. مرر معلمات التدريب إلى [`Trainer`] جنبًا إلى جنب مع النموذج، ومجموعة البيانات، والمُجزئ اللغوي، ومُجمِّع البيانات، ودالة `compute_metrics`.
252
+ 3. استدعي [`~Trainer.train`] لضبط نموذجك.
253
+
254
+ ```py
255
+ >>> training_args = TrainingArguments(
256
+ ... output_dir="my_awesome_swag_model",
257
+ ... eval_strategy="epoch",
258
+ ... save_strategy="epoch",
259
+ ... load_best_model_at_end=True,
260
+ ... learning_rate=5e-5,
261
+ ... per_device_train_batch_size=16,
262
+ ... per_device_eval_batch_size=16,
263
+ ... num_train_epochs=3,
264
+ ... weight_decay=0.01,
265
+ ... push_to_hub=True,
266
+ ... )
267
+
268
+ >>> trainer = Trainer(
269
+ ... model=model,
270
+ ... args=training_args,
271
+ ... train_dataset=tokenized_swag["train"],
272
+ ... eval_dataset=tokenized_swag["validation"],
273
+ ... processing_class=tokenizer,
274
+ ... data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
275
+ ... compute_metrics=compute_metrics,
276
+ ... )
277
+
278
+ >>> trainer.train()
279
+ ```
280
+
281
+ بمجرد اكتمال التدريب، شارك نموذجك مع Hub باستخدام طريقة [`~transformers.Trainer.push_to_hub`] حتى يتمكن الجميع من استخدام نموذجك:
282
+
283
+ ```py
284
+ >>> trainer.push_to_hub()
285
+ ```
286
+ </pt>
287
+ <tf>
288
+ <Tip>
289
+
290
+ إذا لم تكن معتادًا على ضبط نموذج باستخدام Keras، فراجع الدرس الأساسي [هنا](../training#train-a-tensorflow-model-with-keras)!
291
+
292
+ </Tip>
293
+ لضبط نموذج في TensorFlow، ابدأ بإعداد دالة مُحسِّن وجدول معدل التعلم وبعض معلمات التدريب:
294
+
295
+ ```py
296
+ >>> from transformers import create_optimizer
297
+
298
+ >>> batch_size = 16
299
+ >>> num_train_epochs = 2
300
+ >>> total_train_steps = (len(tokenized_swag["train"]) // batch_size) * num_train_epochs
301
+ >>> optimizer, schedule = create_optimizer(init_lr=5e-5, num_warmup_steps=0, num_train_steps=total_train_steps)
302
+ ```
303
+
304
+ ثم يمكنك تحميل BERT باستخدام [`TFAutoModelForMultipleChoice`]:
305
+
306
+ ```py
307
+ >>> from transformers import TFAutoModelForMultipleChoice
308
+
309
+ >>> model = TFAutoModelForMultipleChoice.from_pretrained("google-bert/bert-base-uncased")
310
+ ```
311
+
312
+ حوّل مجموعات البيانات الخاصة بك إلى تنسيق `tf.data.Dataset` باستخدام [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]:
313
+
314
+ ```py
315
+ >>> data_collator = DataCollatorForMultipleChoice(tokenizer=tokenizer)
316
+ >>> tf_train_set = model.prepare_tf_dataset(
317
+ ... tokenized_swag["train"],
318
+ ... shuffle=True,
319
+ ... batch_size=batch_size,
320
+ ... collate_fn=data_collator,
321
+ ... )
322
+
323
+ >>> tf_validation_set = model.prepare_tf_dataset(
324
+ ... tokenized_swag["validation"],
325
+ ... shuffle=False,
326
+ ... batch_size=batch_size,
327
+ ... collate_fn=data_collator,
328
+ ... )
329
+ ```
330
+
331
+ قم بتهيئة النموذج للتدريب باستخدام [`compile`](https://keras.io/api/models/model_training_apis/#compile-method). لاحظ أن جميع نماذج Transformers تحتوي على دالة خسارة مناسبة للمهمة بشكل افتراضي، لذلك لا تحتاج إلى تحديد واحدة ما لم ترغب في ذلك:
332
+
333
+ ```py
334
+ >>> model.compile(optimizer=optimizer) # لا توجد وسيطة خسارة!
335
+ ```
336
+
337
+ الخطوتان الأخيرتان قبل بدء التدريب هما: حساب دقة التنبؤات، وتوفير طريقة لرفع النموذج إلى Hub. ويمكن تحقيق ذلك باستخدام [استدعاءات Keras](../main_classes/keras_callbacks)
338
+
339
+ مرر دالتك `compute_metrics` إلى [`~transformers.KerasMetricCallback`]:
340
+
341
+ ```py
342
+ >>> from transformers.keras_callbacks import KerasMetricCallback
343
+
344
+ >>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set)
345
+ ```
346
+
347
+ حدد مكان دفع نموذجك ومعالجك في [`~transformers.PushToHubCallback`]:
348
+
349
+ ```py
350
+ >>> from transformers.keras_callbacks import PushToHubCallback
351
+
352
+ >>> push_to_hub_callback = PushToHubCallback(
353
+ ... output_dir="my_awesome_model",
354
+ ... tokenizer=tokenizer,
355
+ ... )
356
+ ```
357
+
358
+ ثم قم بتضمين الاستدعاءات معًا:
359
+
360
+ ```py
361
+ >>> callbacks = [metric_callback, push_to_hub_callback]
362
+ ```
363
+
364
+ أخيرًا، أنت جاهز لبدء تدريب نموذجك! استدعِ [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) مع مجموعات بيانات التدريب والتحقق من الصحة وعدد الحقب والاستدعاءات لضبط النموذج:
365
+
366
+ ```py
367
+ >>> model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=2, callbacks=callbacks)
368
+ ```
369
+
370
+ بمجرد اكتمال التدريب، يتم تحميل نموذجك تلقائيًا إلى Hub حتى يتمكن الجميع من استخدامه!
371
+ </tf>
372
+ </frameworkcontent>
373
+
374
+ <Tip>
375
+
376
+ للحصول على مثال أكثر تعمقًا حول كيفية ضبط نموذج للاختيار من متعدد، ألق نظرة على [دفتر ملاحظات PyTorch](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice.ipynb)
377
+ أو [دفتر ملاحظات TensorFlow](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice-tf.ipynb) المقابل.
378
+
379
+ </Tip>
380
+
381
+ ## الاستدلال (Inference)
382
+
383
+ رائع، الآن بعد أن قمت بضبط نموذج، يمكنك استخدامه للاستدلال!
384
+
385
+ قم بإنشاء نص واقتراح إجابتين محتملتين:
386
+
387
+ ```py
388
+ >>> prompt = "France has a bread law, Le Décret Pain, with strict rules on what is allowed in a traditional baguette."
389
+ >>> candidate1 = "The law does not apply to croissants and brioche."
390
+ >>> candidate2 = "The law applies to baguettes."
391
+ ```
392
+
393
+ <frameworkcontent>
394
+ <pt>
395
+ قم بتجزئة كل زوج من المطالبة والإجابة المرشحة وأعد تنسورات PyTorch. يجب عليك أيضًا إنشاء بعض التسميات (`labels`):
396
+
397
+ ```py
398
+ >>> import torch
+ >>> from transformers import AutoTokenizer
399
+
400
+ >>> tokenizer = AutoTokenizer.from_pretrained("username/my_awesome_swag_model")
401
+ >>> inputs = tokenizer([[prompt, candidate1], [prompt, candidate2]], return_tensors="pt", padding=True)
402
+ >>> labels = torch.tensor(0).unsqueeze(0)
403
+ ```
404
+
405
+ مرر مدخلاتك والتسميات إلى النموذج وأرجع `logits`:
406
+
407
+ ```py
408
+ >>> from transformers import AutoModelForMultipleChoice
409
+
410
+ >>> model = AutoModelForMultipleChoice.from_pretrained("username/my_awesome_swag_model")
411
+ >>> outputs = model(**{k: v.unsqueeze(0) for k, v in inputs.items()}, labels=labels)
412
+ >>> logits = outputs.logits
413
+ ```
414
+
415
+ استخرج الفئة ذات الاحتمالية الأكبر:
416
+
417
+ ```py
418
+ >>> predicted_class = logits.argmax().item()
419
+ >>> predicted_class
420
+ 0
421
+ ```
422
+ </pt>
423
+ <tf>
424
+ قم بتجزئة كل زوج من المطالبة والإجابة المرشحة وأعد موترات TensorFlow:
425
+
426
+ ```py
427
+ >>> from transformers import AutoTokenizer
428
+
429
+ >>> tokenizer = AutoTokenizer.from_pretrained("username/my_awesome_swag_model")
430
+ >>> inputs = tokenizer([[prompt, candidate1], [prompt, candidate2]], return_tensors="tf", padding=True)
431
+ ```
432
+
433
+ مرر مدخلاتك إلى النموذج وأعد القيم logits:
434
+
435
+ ```py
436
+ >>> import tensorflow as tf
+ >>> from transformers import TFAutoModelForMultipleChoice
437
+
438
+ >>> model = TFAutoModelForMultipleChoice.from_pretrained("username/my_awesome_swag_model")
439
+ >>> inputs = {k: tf.expand_dims(v, 0) for k, v in inputs.items()}
440
+ >>> outputs = model(inputs)
441
+ >>> logits = outputs.logits
442
+ ```
443
+
444
+ استخرج الفئة ذات الاحتمالية الأكبر:
445
+
446
+ ```py
447
+ >>> predicted_class = int(tf.math.argmax(logits, axis=-1)[0])
448
+ >>> predicted_class
449
+ 0
450
+ ```
451
+ </tf>
452
+ </frameworkcontent>
transformers/docs/source/ar/tasks/question_answering.md ADDED
@@ -0,0 +1,432 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!--Copyright 2022 The HuggingFace Team. All rights reserved.
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
4
+ the License. You may obtain a copy of the License at
5
+
6
+ http://www.apache.org/licenses/LICENSE-2.0
7
+
8
+ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
9
+ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
10
+ specific language governing permissions and limitations under the License.
11
+
12
+ ⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
13
+ rendered properly in your Markdown viewer.
14
+
15
+ -->
16
+
17
+ # الإجابة على الأسئلة (Question answering)
18
+
19
+ [[open-in-colab]]
20
+
21
+ <Youtube id="ajPx5LwJD-I"/>
22
+
23
+ تُقدّم مهام الإجابة على الأسئلة إجابةً بناءً على سؤال. إذا سبق لك أن سألت مساعدًا افتراضيًا مثل Alexa أو Siri أو Google عن حالة الطقس، فأنت قد استخدمت نموذج للإجابة على الأسئلة من قبل. هناك نوعان شائعان لمهام الإجابة على الأسئلة:
24
+
25
+ - الاستخراجية: استخراج الإجابة من السياق المحدد.
26
+ - التلخيصية: إنشاء إجابة من السياق تجيب على السؤال بشكل صحيح.
27
+
28
+ سيوضح لك هذا الدليل كيفية:
29
+
30
+ 1. ضبط [DistilBERT](https://huggingface.co/distilbert/distilbert-base-uncased) على مجموعة بيانات [SQuAD](https://huggingface.co/datasets/squad) للإجابة على الأسئلة الاستخراجية.
31
+ 2. استخدام النموذج المضبوط للاستدلال.
32
+
33
+ <Tip>
34
+
35
+ لمشاهدة جميع الهياكل والنسخ المتوافقة مع هذه المهمة، نوصي بالرجوع إلى [صفحة المهمة](https://huggingface.co/tasks/question-answering)
36
+
37
+ </Tip>
38
+
39
+ قبل البدء، تأكد من تثبيت جميع المكتبات الضرورية:
40
+
41
+ ```bash
42
+ pip install transformers datasets evaluate
43
+ ```
44
+
45
+ نشجعك على تسجيل الدخول إلى حساب Hugging Face الخاص بك حتى تتمكن من تحميل نموذجك ومشاركته مع المجتمع. عند المطالبة، أدخل الرمز المميز الخاص بك لتسجيل الدخول:
46
+
47
+ ```py
48
+ >>> from huggingface_hub import notebook_login
49
+
50
+ >>> notebook_login()
51
+ ```
52
+
53
+ ## تحميل مجموعة بيانات SQuAD
54
+
55
+ ابدأ بتحميل جزء أصغر من مجموعة بيانات SQuAD من مكتبة 🤗 Datasets. سيتيح لك ذلك فرصة للتجربة والتحقق من عمل كل شيء بشكل صحيح قبل قضاء المزيد من الوقت في التدريب على مجموعة البيانات الكاملة.
56
+
57
+ ```py
58
+ >>> from datasets import load_dataset
59
+
60
+ >>> squad = load_dataset("squad", split="train[:5000]")
61
+ ```
62
+
63
+ قسّم جزء `train` من مجموعة البيانات إلى مجموعتي تدريب واختبار باستخدام طريقة [`~datasets.Dataset.train_test_split`]:
64
+
65
+ ```py
66
+ >>> squad = squad.train_test_split(test_size=0.2)
67
+ ```
68
+
69
+ ثم ألق نظرة على مثال:
70
+
71
+ ```py
72
+ >>> squad["train"][0]
73
+ {'answers': {'answer_start': [515], 'text': ['Saint Bernadette Soubirous']},
74
+ 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.',
75
+ 'id': '5733be284776f41900661182',
76
+ 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?',
77
+ 'title': 'University_of_Notre_Dame'
78
+ }
79
+ ```
80
+
81
+ هناك العديد من الحقول المهمة هنا:
82
+
83
+ - `answers`: موقع بداية الرمز المميز للإجابة ونص الإجابة.
84
+ - `context`: معلومات أساسية يحتاج النموذج إلى استخراج الإجابة منها.
85
+ - `question`: السؤال الذي يجب على النموذج الإجابة عليه.
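+
+ مقتطف توضيحي اختياري يبيّن أن `answer_start` هو فهرس الحرف الذي تبدأ عنده الإجابة داخل `context`، وهو ما تعتمد عليه دالة المعالجة المسبقة لاحقًا:
+
+ ```py
+ >>> example = squad["train"][0]
+ >>> start = example["answers"]["answer_start"][0]
+ >>> answer_text = example["answers"]["text"][0]
+ >>> example["context"][start : start + len(answer_text)] == answer_text
+ True
+ ```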
86
+
87
+ ## المعالجة المسبقة (Preprocess)
88
+
89
+ <Youtube id="qgaM0weJHpA"/>
90
+
91
+ الخطوة التالية هي تحميل المحلل اللغوي DistilBERT لمعالجة حقلي `question` و `context`:
92
+
93
+ ```py
94
+ >>> from transformers import AutoTokenizer
95
+
96
+ >>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
97
+ ```
98
+
99
+ هناك بعض خطوات المعالجة المسبقة الخاصة بمهام الإجابة على الأسئلة التي يجب أن تكون على دراية بها:
100
+
101
+ 1. قد تحتوي بعض الأمثلة في مجموعة البيانات على `context` طويلًا يتجاوز الحد الأقصى لطول مدخل النموذج. للتعامل مع النصوص الأطول، يتم اقتطاع `context` فقط عن طريق تعيين `truncation="only_second"`.
102
+ 2. بعد ذلك، يتم تحديد مواضع بداية ونهاية الإجابة في `context` الأصلي عن طريق تعيين
103
+ `return_offset_mapping=True`.
104
+ 3. باستخدام التعيين، يمكن الآن تحديد رموز بداية ونهاية الإجابة. استخدم طريقة [`~tokenizers.Encoding.sequence_ids`]
105
+ لتحديد أجزاء الإزاحة التي تتوافق مع `question` و `context`.
106
+
107
+ فيما يلي كيفية إنشاء دالة لقص وتعيين رموز البداية والنهاية لـ `answer` إلى `context`:
108
+
109
+ ```py
110
+ >>> def preprocess_function(examples):
111
+ ... questions = [q.strip() for q in examples["question"]]
112
+ ... inputs = tokenizer(
113
+ ... questions,
114
+ ... examples["context"],
115
+ ... max_length=384,
116
+ ... truncation="only_second",
117
+ ... return_offsets_mapping=True,
118
+ ... padding="max_length",
119
+ ... )
120
+
121
+ ... offset_mapping = inputs.pop("offset_mapping")
122
+ ... answers = examples["answers"]
123
+ ... start_positions = []
124
+ ... end_positions = []
125
+
126
+ ... for i, offset in enumerate(offset_mapping):
127
+ ... answer = answers[i]
128
+ ... start_char = answer["answer_start"][0]
129
+ ... end_char = answer["answer_start"][0] + len(answer["text"][0])
130
+ ... sequence_ids = inputs.sequence_ids(i)
131
+
132
+ ... # Find the start and end of the context
133
+ ... idx = 0
134
+ ... while sequence_ids[idx] != 1:
135
+ ... idx += 1
136
+ ... context_start = idx
137
+ ... while sequence_ids[idx] == 1:
138
+ ... idx += 1
139
+ ... context_end = idx - 1
140
+
141
+ ... # If the answer is not fully inside the context, label it (0, 0)
142
+ ... if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
143
+ ... start_positions.append(0)
144
+ ... end_positions.append(0)
145
+ ... else:
146
+ ... # Otherwise it's the start and end token positions
147
+ ... idx = context_start
148
+ ... while idx <= context_end and offset[idx][0] <= start_char:
149
+ ... idx += 1
150
+ ... start_positions.append(idx - 1)
151
+
152
+ ... idx = context_end
153
+ ... while idx >= context_start and offset[idx][1] >= end_char:
154
+ ... idx -= 1
155
+ ... end_positions.append(idx + 1)
156
+
157
+ ... inputs["start_positions"] = start_positions
158
+ ... inputs["end_positions"] = end_positions
159
+ ... return inputs
160
+ ```
161
+
162
+ لتطبيق المعالجة المسبقة على كامل مجموعة البيانات، استخدم [`~datasets.Dataset.map`] من مكتبة 🤗 Datasets. يمكنك تسريع دالة `map` عن طريق تعيين `batched=True` لمعالجة عناصر متعددة من مجموعة البيانات دفعة واحدة. قم بإزالة أي أعمدة لا تحتاجها:
163
+
164
+ ```py
165
+ >>> tokenized_squad = squad.map(preprocess_function, batched=True, remove_columns=squad["train"].column_names)
166
+ ```
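+
+ مقتطف توضيحي اختياري للتحقق من النتيجة: فك ترميز النطاق المحدد بـ `start_positions` و `end_positions` لأحد الأمثلة ينبغي أن يعيد نصًا قريبًا من نص الإجابة الأصلي (وقد يشير النطاق إلى الموضع (0، 0) إذا وقعت الإجابة خارج السياق بعد الاقتطاع):
+
+ ```py
+ >>> sample = tokenized_squad["train"][0]
+ >>> span = sample["input_ids"][sample["start_positions"] : sample["end_positions"] + 1]
+ >>> tokenizer.decode(span)
+ ```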
167
+
168
+ الآن قم بإنشاء دفعة من الأمثلة باستخدام [`DefaultDataCollator`]. بخلاف مجمّعات البيانات الأخرى في 🤗 Transformers، لا يطبق [`DefaultDataCollator`] أي معالجة مسبقة إضافية مثل الحشو.
169
+
170
+ <frameworkcontent>
171
+ <pt>
172
+
173
+ ```py
174
+ >>> from transformers import DefaultDataCollator
175
+
176
+ >>> data_collator = DefaultDataCollator()
177
+ ```
178
+ </pt>
179
+ <tf>
180
+
181
+ ```py
182
+ >>> from transformers import DefaultDataCollator
183
+
184
+ >>> data_collator = DefaultDataCollator(return_tensors="tf")
185
+ ```
186
+ </tf>
187
+ </frameworkcontent>
188
+
189
+ ## التدريب (Train)
190
+
191
+ <frameworkcontent>
192
+ <pt>
193
+
194
+ <Tip>
195
+
196
+ إذا لم تكن معتادًا على ضبط نموذج باستخدام [`Trainer`], ألق نظرة على البرنامج التعليمي الأساسي [هنا](../training#train-with-pytorch-trainer)!
197
+
198
+ </Tip>
199
+
200
+ أنت جاهز لبدء تدريب نموذجك الآن! قم بتحميل DistilBERT باستخدام [`AutoModelForQuestionAnswering`]:
201
+
202
+ ```py
203
+ >>> from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
204
+
205
+ >>> model = AutoModelForQuestionAnswering.from_pretrained("distilbert/distilbert-base-uncased")
206
+ ```
207
+
208
+ في هذه المرحلة، تبقى ثلاث خطوات فقط:
209
+
210
+ 1. حدد المعاملات الفائقة للتدريب في [`TrainingArguments`]. المعامل الوحيد المطلوب هو `output_dir` الذي يحدد مكان حفظ نموذجك. ستدفع هذا النموذج إلى Hub عن طريق تعيين `push_to_hub=True` (يجب عليك تسجيل الدخول إلى Hugging Face لتحميل نموذجك).
211
+ 2. مرر معاملات التدريب إلى [`Trainer`] جنبًا إلى جنب مع النموذج، ومجموعة البيانات، والمُحلّل النصي، ومُجمّع البيانات.
212
+ 3. استدعِ ـ [`~Trainer.train`] لضبط النموذج.
213
+
214
+ ```py
215
+ >>> training_args = TrainingArguments(
216
+ ... output_dir="my_awesome_qa_model",
217
+ ... eval_strategy="epoch",
218
+ ... learning_rate=2e-5,
219
+ ... per_device_train_batch_size=16,
220
+ ... per_device_eval_batch_size=16,
221
+ ... num_train_epochs=3,
222
+ ... weight_decay=0.01,
223
+ ... push_to_hub=True,
224
+ ... )
225
+
226
+ >>> trainer = Trainer(
227
+ ... model=model,
228
+ ... args=training_args,
229
+ ... train_dataset=tokenized_squad["train"],
230
+ ... eval_dataset=tokenized_squad["test"],
231
+ ... processing_class=tokenizer,
232
+ ... data_collator=data_collator,
233
+ ... )
234
+
235
+ >>> trainer.train()
236
+ ```
237
+
238
+ بمجرد اكتمال التدريب، شارك نموذجك في Hub باستخدام الدالة [`~transformers.Trainer.push_to_hub`] حتى يتمكن الجميع من استخدام نموذجك:
239
+
240
+ ```py
241
+ >>> trainer.push_to_hub()
242
+ ```
243
+ </pt>
244
+ <tf>
245
+
246
+ <Tip>
247
+
248
+ إذا لم تكن معتادًا على ضبط نموذج باستخدام Keras، فألق نظرة على البرنامج التعليمي الأساسي [هنا](../training#train-a-tensorflow-model-with-keras)!
249
+
250
+ </Tip>
251
+ لضبط نموذج في TensorFlow، ابدأ بإعداد دالة مُحسِّن، وجدول معدل التعلم، وبعض المعاملات الفائقة للتدريب:
252
+
253
+ ```py
254
+ >>> from transformers import create_optimizer
255
+
256
+ >>> batch_size = 16
257
+ >>> num_epochs = 2
258
+ >>> total_train_steps = (len(tokenized_squad["train"]) // batch_size) * num_epochs
259
+ >>> optimizer, schedule = create_optimizer(
260
+ ... init_lr=2e-5,
261
+ ... num_warmup_steps=0,
262
+ ... num_train_steps=total_train_steps,
263
+ ... )
264
+ ```
265
+
266
+ ثم يمكنك تحميل DistilBERT باستخدام [`TFAutoModelForQuestionAnswering`]:
267
+
268
+ ```py
269
+ >>> from transformers import TFAutoModelForQuestionAnswering
270
+
271
+ >>> model = TFAutoModelForQuestionAnswering.from_pretrained("distilbert/distilbert-base-uncased")
272
+ ```
273
+
274
+ حوّل مجموعات البيانات الخاصة بك إلى تنسيق `tf.data.Dataset` باستخدام [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]:
275
+
276
+ ```py
277
+ >>> tf_train_set = model.prepare_tf_dataset(
278
+ ... tokenized_squad["train"],
279
+ ... shuffle=True,
280
+ ... batch_size=16,
281
+ ... collate_fn=data_collator,
282
+ ... )
283
+
284
+ >>> tf_validation_set = model.prepare_tf_dataset(
285
+ ... tokenized_squad["test"],
286
+ ... shuffle=False,
287
+ ... batch_size=16,
288
+ ... collate_fn=data_collator,
289
+ ... )
290
+ ```
291
+
292
+ قم بتكوين النموذج للتدريب باستخدام [`compile`](https://keras.io/api/models/model_training_apis/#compile-method):
293
+
294
+ ```py
295
+ >>> import tensorflow as tf
296
+
297
+ >>> model.compile(optimizer=optimizer)
298
+ ```
299
+
300
+ آخر شيء يجب إعداده قبل بدء التدريب هو توفير طريقة لدفع نموذجك إلى Hub. يمكن القيام بذلك عن طريق تحديد مكان دفع نموذجك ومعالجك المعجمي في [`~transformers.PushToHubCallback`]:
301
+
302
+ ```py
303
+ >>> from transformers.keras_callbacks import PushToHubCallback
304
+
305
+ >>> callback = PushToHubCallback(
306
+ ... output_dir="my_awesome_qa_model",
307
+ ... tokenizer=tokenizer,
308
+ ... )
309
+ ```
310
+
311
+ أخيرًا، أنت جاهز لبدء تدريب نموذجك! استدعِ [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) مع مجموعات بيانات التدريب والتحقق من الصحة، وعدد الحقب، واستدعاءاتك لضبط النموذج:
312
+
313
+ ```py
314
+ >>> model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=3, callbacks=[callback])
315
+ ```
316
+ بمجرد اكتمال التدريب، يتم تحميل نموذجك تلقائيًا إلى Hub حتى يتمكن الجميع من استخدامه!
317
+ </tf>
318
+ </frameworkcontent>
319
+
320
+
321
+ <Tip>
322
+
323
+ للحصول على مثال أكثر تعمقًا حول كيفية ضبط نموذج للإجابة على الأسئلة، ألق نظرة على [دفتر ملاحظات PyTorch](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering.ipynb) المقابل
324
+ أو [دفتر ملاحظات TensorFlow](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering-tf.ipynb).
325
+
326
+ </Tip>
327
+
328
+ ## التقييم (Evaluate)
329
+
330
+ يتطلب التقييم للإجابة على الأسئلة قدرًا كبيرًا من المعالجة اللاحقة. لتوفير وقتك، يتخطى هذا الدليل خطوة التقييم. لا يزال [`Trainer`] يحسب خسارة التقييم أثناء التدريب، مما يعني أنك لست تجهل تمامًا أداء نموذجك.
331
+
332
+ إذا كان لديك المزيد من الوقت وتهتم بكيفية تقييم نموذجك للإجابة على الأسئلة، فألق نظرة على فصل [الإجابة على الأسئلة](https://huggingface.co/course/chapter7/7?fw=pt#post-processing) من دورة 🤗 Hugging Face!
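+
+ وعلى سبيل التوضيح فقط، هذا مخطط مبسط لما يتوقعه مقياس SQuAD في مكتبة 🤗 Evaluate بعد الانتهاء من المعالجة اللاحقة؛ القيم في `predicted_answers` و`theoretical_answers` هنا افتراضية للتوضيح وليست ناتجة عن نموذجك:
+
+ ```py
+ >>> import evaluate
+
+ >>> squad_metric = evaluate.load("squad")
+
+ >>> # كل تنبؤ يحتاج إلى معرّف المثال والنص المتوقع، وكل مرجع يحتاج إلى الإجابات الصحيحة ومواضع بدايتها
+ >>> predicted_answers = [{"id": "0", "prediction_text": "176 billion parameters"}]
+ >>> theoretical_answers = [{"id": "0", "answers": {"text": ["176 billion parameters"], "answer_start": [10]}}]
+ >>> squad_metric.compute(predictions=predicted_answers, references=theoretical_answers)
+ {'exact_match': 100.0, 'f1': 100.0}
+ ```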
333
+
334
+ ## الاستدلال (Inference)
335
+
336
+ رائع، الآن بعد أن قمت بضبط نموذج، يمكنك استخدامه للاستدلال!
337
+
338
+ حدد سؤالًا وسياقًا ليقوم النموذج بالتنبؤ بالإجابة عليه:
339
+
340
+ ```py
341
+ >>> question = "How many programming languages does BLOOM support?"
342
+ >>> context = "BLOOM has 176 billion parameters and can generate text in 46 languages natural languages and 13 programming languages."
343
+ ```
344
+
345
+ أبسط طريقة لتجربة نموذجك المُدرَّب للاستدلال هي استخدامه في [`pipeline`]. قم بإنشاء كائن لـ `pipeline` للإجابة على الأسئلة باستخدام نموذجك، ومرِّر النص إليه:
346
+
347
+ ```py
348
+ >>> from transformers import pipeline
349
+
350
+ >>> question_answerer = pipeline("question-answering", model="my_awesome_qa_model")
351
+ >>> question_answerer(question=question, context=context)
352
+ {'score': 0.2058267742395401,
353
+ 'start': 10,
354
+ 'end': 95,
355
+ 'answer': '176 billion parameters and can generate text in 46 languages natural languages and 13'}
356
+ ```
357
+
358
+ يمكنك أيضًا تكرار نتائج `pipeline` يدويًا إذا أردت:
359
+
360
+ <frameworkcontent>
361
+ <pt>
362
+
363
+ قسّم النص وأرجع تنسورات PyTorch:
364
+
365
+ ```py
366
+ >>> from transformers import AutoTokenizer
367
+
368
+ >>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_qa_model")
369
+ >>> inputs = tokenizer(question, context, return_tensors="pt")
370
+ ```
371
+
372
+ مرر مدخلاتك إلى النموذج وأرجع `logits`:
373
+
374
+ ```py
375
+ >>> import torch
376
+ >>> from transformers import AutoModelForQuestionAnswering
377
+
378
+ >>> model = AutoModelForQuestionAnswering.from_pretrained("my_awesome_qa_model")
379
+ >>> with torch.no_grad():
380
+ ... outputs = model(**inputs)
381
+ ```
382
+
383
+ احصل على أعلى احتمال من مخرجات النموذج لموضعي البداية والنهاية:
384
+
385
+ ```py
386
+ >>> answer_start_index = outputs.start_logits.argmax()
387
+ >>> answer_end_index = outputs.end_logits.argmax()
388
+ ```
389
+
390
+ استخلاص الإجابة من الرموز المتوقعة:
391
+
392
+ ```py
393
+ >>> predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
394
+ >>> tokenizer.decode(predict_answer_tokens)
395
+ '176 billion parameters and can generate text in 46 languages natural languages and 13'
396
+ ```
397
+ </pt>
398
+ <tf>
399
+ قسّم النص وأرجع تنسورات TensorFlow:
400
+
401
+ ```py
402
+ >>> from transformers import AutoTokenizer
403
+
404
+ >>> tokenizer = AutoTokenizer.from_pretrained("my_awesome_qa_model")
405
+ >>> inputs = tokenizer(question, context, return_tensors="tf")
406
+ ```
407
+
408
+ مرر مدخلاتك إلى النموذج وأعد `logits`:
409
+
410
+ ```py
411
+ >>> from transformers import TFAutoModelForQuestionAnswering
412
+
413
+ >>> model = TFAutoModelForQuestionAnswering.from_pretrained("my_awesome_qa_model")
414
+ >>> outputs = model(**inputs)
415
+ ```
416
+
417
+ احصل على أعلى احتمال من مخرجات النموذج لموضعي البداية والنهاية:
418
+
419
+ ```py
420
+ >>> import tensorflow as tf
+
+ >>> answer_start_index = int(tf.math.argmax(outputs.start_logits, axis=-1)[0])
421
+ >>> answer_end_index = int(tf.math.argmax(outputs.end_logits, axis=-1)[0])
422
+ ```
423
+
424
+ استخلاص الإجابة من الرموز المتوقعة:
425
+
426
+ ```py
427
+ >>> predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
428
+ >>> tokenizer.decode(predict_answer_tokens)
429
+ '176 billion parameters and can generate text in 46 languages natural languages and 13'
430
+ ```
431
+ </tf>
432
+ </frameworkcontent>
transformers/docs/source/ar/tasks/sequence_classification.md ADDED
@@ -0,0 +1,387 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!--Copyright 2022 The HuggingFace Team. All rights reserved.
2
+ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
3
+ the License. You may obtain a copy of the License at
4
+ http://www.apache.org/licenses/LICENSE-2.0
5
+ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
6
+ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
7
+ specific language governing permissions and limitations under the License.
8
+ ⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
9
+ rendered properly in your Markdown viewer.
10
+ -->
11
+
12
+ # تصنيف النص(Text classification)
13
+
14
+ [[open-in-colab]]
15
+
16
+ <Youtube id="leNG9fN9FQU"/>
17
+
18
+ تصنيف النص هو مهمة NLP شائعة حيث يُعيّن تصنيفًا أو فئة للنص. تستخدم بعض أكبر الشركات تصنيف النصوص في الإنتاج لمجموعة واسعة من التطبيقات العملية. أحد أكثر أشكال تصنيف النص شيوعًا هو تحليل المشاعر، والذي يقوم بتعيين تسمية مثل 🙂 إيجابية، 🙁 سلبية، أو 😐 محايدة لتسلسل نصي.
19
+
20
+ سيوضح لك هذا الدليل كيفية:
21
+
22
+ 1. ضبط [DistilBERT](https://huggingface.co/distilbert/distilbert-base-uncased) على مجموعة بيانات [IMDb](https://huggingface.co/datasets/imdb) لتحديد ما إذا كانت مراجعة الفيلم إيجابية أو سلبية.
23
+ 2. استخدام نموذج الضبط الدقيق للتنبؤ.
24
+
25
+ <Tip>
26
+
27
+ لرؤية جميع البنى ونقاط التحقق المتوافقة مع هذه المهمة، نوصي بالتحقق من [صفحة المهمة](https://huggingface.co/tasks/text-classification).
28
+
29
+ </Tip>
30
+
31
+ قبل أن تبدأ، تأكد من تثبيت جميع المكتبات الضرورية:
32
+
33
+ ```bash
34
+ pip install transformers datasets evaluate accelerate
35
+ ```
36
+
37
+ نحن نشجعك على تسجيل الدخول إلى حساب Hugging Face الخاص بك حتى تتمكن من تحميل ومشاركة نموذجك مع المجتمع. عند المطالبة، أدخل رمزك لتسجيل الدخول:
38
+
39
+ ```py
40
+ >>> from huggingface_hub import notebook_login
41
+
42
+ >>> notebook_login()
43
+ ```
44
+
45
+ ## تحميل مجموعة بيانات IMDb
46
+
47
+ ابدأ بتحميل مجموعة بيانات IMDb من مكتبة 🤗 Datasets:
48
+
49
+ ```py
50
+ >>> from datasets import load_dataset
51
+
52
+ >>> imdb = load_dataset("imdb")
53
+ ```
54
+
55
+ ثم ألق نظرة على مثال:
56
+
57
+ ```py
58
+ >>> imdb["test"][0]
59
+ {
60
+ "label": 0,
61
+ "text": "I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn't match the background, and painfully one-dimensional characters cannot be overcome with a 'sci-fi' setting. (I'm sure there are those of you out there who think Babylon 5 is good sci-fi TV. It's not. It's clichéd and uninspiring.) While US viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may treat important issues, yet not as a serious philosophy. It's really difficult to care about the characters here as they are not simply foolish, just missing a spark of life. Their actions and reactions are wooden and predictable, often painful to watch. The makers of Earth KNOW it's rubbish as they have to always say \"Gene Roddenberry's Earth...\" otherwise people would not continue watching. Roddenberry's ashes must be turning in their orbit as this dull, cheap, poorly edited (watching it without advert breaks really brings this home) trudging Trabant of a show lumbers into space. Spoiler. So, kill off a main character. And then bring him back as another actor. Jeeez! Dallas all over again.",
62
+ }
63
+ ```
64
+
65
+ هناك حقولان في هذه المجموعة من البيانات:
66
+
67
+ - `text`: نص مراجعة الفيلم.
68
+ - `label`: قيمة إما `0` لمراجعة سلبية أو `1` لمراجعة إيجابية.
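+
+ ولمجرد التحقق (مقتطف توضيحي اختياري)، يمكنك استعراض أسماء التصنيفات المقابلة لهذه القيم من خصائص مجموعة البيانات نفسها:
+
+ ```py
+ >>> # عمود `label` من النوع ClassLabel ويخزن أسماء الفئات بنفس ترتيب القيمتين 0 و 1
+ >>> imdb["train"].features["label"].names
+ ['neg', 'pos']
+ ```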
69
+
70
+ ## المعالجة المسبقة(Preprocess)
71
+
72
+ الخطوة التالية هي تحميل مُجزِّئ النصوص DistilBERT للمعالجة المسبقة لحقل `text`:
73
+
74
+ ```py
75
+ >>> from transformers import AutoTokenizer
76
+
77
+ >>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
78
+ ```
79
+
80
+ أنشئ دالة معالجة مسبقة لتجزئة حقل `text` وتقصير السلاسل النصية بحيث لا يتجاوز طولها الحد الأقصى لمُدخلات DistilBERT:
81
+
82
+ ```py
83
+ >>> def preprocess_function(examples):
84
+ ... return tokenizer(examples["text"], truncation=True)
85
+ ```
86
+
87
+ لتطبيق دالة المعالجة المسبقة على مجموعة البيانات بأكملها، استخدم دالة 🤗 Datasets [`~datasets.Dataset.map`]. يمكنك تسريع `map` باستخدام `batched=True` لمعالجة دفعات متعددة من البيانات دفعة واحدة:
88
+
89
+ ```py
90
+ tokenized_imdb = imdb.map(preprocess_function, batched=True)
91
+ ```
92
+
93
+ الآن قم بإنشاء دفعة من الأمثلة باستخدام [`DataCollatorWithPadding`]. الأكثر كفاءة هو استخدام الحشو الديناميكي لجعل الجمل متساوية في الطول داخل كل دفعة، بدلًا من حشو كامل البيانات إلى الحد الأقصى للطول.
94
+
95
+ <frameworkcontent>
96
+ <pt>
97
+
98
+ ```py
99
+ >>> from transformers import DataCollatorWithPadding
100
+
101
+ >>> data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
102
+ ```
103
+ </pt>
104
+ <tf>
105
+
106
+ ```py
107
+ >>> from transformers import DataCollatorWithPadding
108
+
109
+ >>> data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")
110
+ ```
111
+ </tf>
112
+ </frameworkcontent>
113
+
114
+ ## التقييم(Evaluate)
115
+
116
+ يُعدّ تضمين مقياس أثناء التدريب مفيدًا لتقييم أداء النموذج. يمكنك تحميل طريقة تقييم بسرعة باستخدام مكتبة 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) . بالنسبة لهذه المهمة، قم بتحميل مقياس [الدقة](https://huggingface.co/spaces/evaluate-metric/accuracy) (راجع جولة 🤗 Evaluate [السريعة](https://huggingface.co/docs/evaluate/a_quick_tour) لمعرفة المزيد حول كيفية تحميل وحساب مقياس):
117
+
118
+ ```py
119
+ >>> import evaluate
120
+
121
+ >>> accuracy = evaluate.load("accuracy")
122
+ ```
123
+
124
+ ثم أنشئ دالة تقوم بتمرير تنبؤاتك وتصنيفاتك إلى [`~evaluate.EvaluationModule.compute`] لحساب الدقة:
125
+
126
+ ```py
127
+ >>> import numpy as np
128
+
129
+ >>> def compute_metrics(eval_pred):
130
+ ... predictions, labels = eval_pred
131
+ ... predictions = np.argmax(predictions, axis=1)
132
+ ... return accuracy.compute(predictions=predictions, references=labels)
133
+ ```
134
+
135
+ دالة `compute_metrics` جاهزة الآن، وستعود إليها عند إعداد التدريب.
136
+
137
+ ## التدريب(Train)
138
+
139
+ قبل أن تبدأ في تدريب نموذجك، قم بإنشاء خريطة من المعرفات المتوقعة إلى تسمياتها باستخدام `id2label` و `label2id`:
140
+
141
+ ```py
142
+ >>> id2label = {0: "NEGATIVE", 1: "POSITIVE"}
143
+ >>> label2id = {"NEGATIVE": 0, "POSITIVE": 1}
144
+ ```
145
+
146
+ <frameworkcontent>
147
+ <pt>
148
+ <Tip>
149
+
150
+ إذا لم تكن على دراية بضبط نموذج باستخدام [`Trainer`]، فألقِ نظرة على البرنامج التعليمي الأساسي [هنا](../training#train-with-pytorch-trainer)!
151
+
152
+ </Tip>
153
+
154
+ أنت مستعد الآن لبدء تدريب نموذجك! قم بتحميل DistilBERT مع [`AutoModelForSequenceClassification`] جنبًا إلى جنب مع عدد التصنيفات المتوقعة، وتعيينات التسميات:
155
+
156
+ ```py
157
+ >>> from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
158
+
159
+ >>> model = AutoModelForSequenceClassification.from_pretrained(
160
+ ... "distilbert/distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
161
+ ... )
162
+ ```
163
+
164
+ في هذه المرحلة، هناك ثلاث خطوات فقط متبقية:
165
+
166
+ 1. حدد مُعامِلات التدريب في [`TrainingArguments`]. المُعامل المطلوب الوحيد هو `output_dir`، لتحديد مكان حفظ النموذج. يمكنك رفع النموذج إلى Hub بتعيين `push_to_hub=True` (يجب تسجيل الدخول إلى Hugging Face لرفع النموذج). سيقوم `Trainer` بتقييم الدقة وحفظ نقاط التحقق في نهاية كل حقبة.
167
+ 2. مرر مُعامِلات التدريب إلى `Trainer` مع النموذج، ومجموعة البيانات، والمحلل اللغوي، ومُجمِّع البيانات، ووظيفة `compute_metrics`.
168
+ 3. استدعِ [`~Trainer.train`] لضبط النموذج.
169
+
170
+ ```py
171
+ >>> training_args = TrainingArguments(
172
+ ... output_dir="my_awesome_model",
173
+ ... learning_rate=2e-5,
174
+ ... per_device_train_batch_size=16,
175
+ ... per_device_eval_batch_size=16,
176
+ ... num_train_epochs=2,
177
+ ... weight_decay=0.01,
178
+ ... eval_strategy="epoch",
179
+ ... save_strategy="epoch",
180
+ ... load_best_model_at_end=True,
181
+ ... push_to_hub=True,
182
+ ... )
183
+
184
+ >>> trainer = Trainer(
185
+ ... model=model,
186
+ ... args=training_args,
187
+ ... train_dataset=tokenized_imdb["train"],
188
+ ... eval_dataset=tokenized_imdb["test"],
189
+ ... processing_class=tokenizer,
190
+ ... data_collator=data_collator,
191
+ ... compute_metrics=compute_metrics,
192
+ ... )
193
+
194
+ >>> trainer.train()
195
+ ```
196
+
197
+ <Tip>
198
+
199
+ يستخدم [`Trainer`] الحشو الديناميكي افتراضيًا عند تمرير `tokenizer` إليه. في هذه الحالة، لا تحتاج لتحديد مُجمِّع البيانات صراحةً.
200
+
201
+ </Tip>
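+
+ وعلى سبيل التوضيح فقط (وليس خطوة إضافية من خطوات الدليل)، هذا مخطط مبسط لما قد يبدو عليه إنشاء `Trainer` دون تمرير `data_collator` صراحةً في هذه الحالة:
+
+ ```py
+ >>> # عند تمرير المُجزِّئ اللغوي، يستخدم Trainer مُجمِّع بيانات مع حشو ديناميكي افتراضيًا
+ >>> trainer = Trainer(
+ ...     model=model,
+ ...     args=training_args,
+ ...     train_dataset=tokenized_imdb["train"],
+ ...     eval_dataset=tokenized_imdb["test"],
+ ...     processing_class=tokenizer,
+ ...     compute_metrics=compute_metrics,
+ ... )
+ ```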
202
+
203
+ بعد اكتمال التدريب، شارك نموذجك على Hub باستخدام الطريقة [`~transformers.Trainer.push_to_hub`] ليستخدمه الجميع:
204
+
205
+ ```py
206
+ >>> trainer.push_to_hub()
207
+ ```
208
+ </pt>
209
+ <tf>
210
+ <Tip>
211
+
212
+ إذا لم تكن على دراية بضبط نموذج باستخدام Keras، قم بالاطلاع على البرنامج التعليمي الأساسي [هنا](../training#train-a-tensorflow-model-with-keras)!
213
+
214
+ </Tip>
215
+ لضبط نموذج في TensorFlow، ابدأ بإعداد دالة المحسن، وجدول معدل التعلم، وبعض معلمات التدريب:
216
+
217
+ ```py
218
+ >>> from transformers import create_optimizer
219
+ >>> import tensorflow as tf
220
+
221
+ >>> batch_size = 16
222
+ >>> num_epochs = 5
223
+ >>> batches_per_epoch = len(tokenized_imdb["train"]) // batch_size
224
+ >>> total_train_steps = int(batches_per_epoch * num_epochs)
225
+ >>> optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)
226
+ ```
227
+
228
+ ثم يمكنك تحميل DistilBERT مع [`TFAutoModelForSequenceClassification`] بالإضافة إلى عدد التصنيفات المتوقعة، وتعيينات التسميات:
229
+
230
+ ```py
231
+ >>> from transformers import TFAutoModelForSequenceClassification
232
+
233
+ >>> model = TFAutoModelForSequenceClassification.from_pretrained(
234
+ ... "distilbert/distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
235
+ ... )
236
+ ```
237
+
238
+ قم بتحويل مجموعات بياناتك إلى تنسيق `tf.data.Dataset` باستخدام [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]:
239
+
240
+ ```py
241
+ >>> tf_train_set = model.prepare_tf_dataset(
242
+ ... tokenized_imdb["train"],
243
+ ... shuffle=True,
244
+ ... batch_size=16,
245
+ ... collate_fn=data_collator,
246
+ ... )
247
+
248
+ >>> tf_validation_set = model.prepare_tf_dataset(
249
+ ... tokenized_imdb["test"],
250
+ ... shuffle=False,
251
+ ... batch_size=16,
252
+ ... collate_fn=data_collator,
253
+ ... )
254
+ ```
255
+
256
+ قم بتهيئة النموذج للتدريب باستخدام [`compile`](https://keras.io/api/models/model_training_apis/#compile-method). لاحظ أن جميع نماذج Transformers لديها دالة خسارة ذات صلة بالمهمة بشكل افتراضي، لذلك لا تحتاج إلى تحديد واحدة ما لم ترغب في ذلك:
257
+
258
+ ```py
259
+ >>> import tensorflow as tf
260
+
261
+ >>> model.compile(optimizer=optimizer) # No loss argument!
262
+ ```
263
+
264
+ آخر أمرين يجب إعدادهما قبل بدء التدريب هو حساب الدقة من التوقعات، وتوفير طريقة لدفع نموذجك إلى Hub. يتم ذلك باستخدام [Keras callbacks](../main_classes/keras_callbacks).
265
+
266
+ قم بتمرير دالة `compute_metrics` الخاصة بك إلى [`~transformers.KerasMetricCallback`]:
267
+
268
+ ```py
269
+ >>> from transformers.keras_callbacks import KerasMetricCallback
270
+
271
+ >>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set)
272
+ ```
273
+
274
+ حدد مكان دفع نموذجك والمجزئ اللغوي في [`~transformers.PushToHubCallback`]:
275
+
276
+ ```py
277
+ >>> from transformers.keras_callbacks import PushToHubCallback
278
+
279
+ >>> push_to_hub_callback = PushToHubCallback(
280
+ ... output_dir="my_awesome_model",
281
+ ... tokenizer=tokenizer,
282
+ ... )
283
+ ```
284
+
285
+ ثم اجمع الاستدعاءات معًا:
286
+
287
+ ```py
288
+ >>> callbacks = [metric_callback, push_to_hub_callback]
289
+ ```
290
+
291
+ أخيرًا، أنت مستعد لبدء تدريب نموذجك! قم باستدعاء [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) مع مجموعات بيانات التدريب والتحقق، وعدد الحقبات، واستدعاءاتك لضبط النموذج:
292
+
293
+ ```py
294
+ >>> model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=3, callbacks=callbacks)
295
+ ```
296
+
297
+ بمجرد اكتمال التدريب، يتم تحميل نموذجك تلقائيًا إلى Hub حتى يتمكن الجميع من استخدامه!
298
+ </tf>
299
+ </frameworkcontent>
300
+
301
+ <Tip>
302
+
303
+ للحصول على مثال أكثر عمقًا حول كيفية ضبط نموذج لتصنيف النصوص، قم بالاطلاع على الدفتر المقابل
304
+ [دفتر PyTorch](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification.ipynb)
305
+ أو [دفتر TensorFlow](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification-tf.ipynb).
306
+
307
+ </Tip>
308
+
309
+ ## الاستدلال(Inference)
310
+
311
+ رائع، الآن بعد أن قمت بضبط نموذج، يمكنك استخدامه للاستدلال!
312
+
313
+ احصل على بعض النصوص التي ترغب في إجراء الاستدلال عليها:
314
+
315
+ ```py
316
+ >>> text = "This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three."
317
+ ```
318
+
319
+ أسهل طريقة لتجربة النموذج المضبوط للاستدلال هي استخدامه ضمن [`pipeline`]. قم بإنشاء `pipeline` لتحليل المشاعر مع نموذجك، ومرر نصك إليه:
320
+
321
+ ```py
322
+ >>> from transformers import pipeline
323
+
324
+ >>> classifier = pipeline("sentiment-analysis", model="stevhliu/my_awesome_model")
325
+ >>> classifier(text)
326
+ [{'label': 'POSITIVE', 'score': 0.9994940757751465}]
327
+ ```
328
+
329
+ يمكنك أيضًا تكرار نتائج `pipeline` يدويًا إذا أردت:
330
+
331
+ <frameworkcontent>
332
+ <pt>
333
+ قم بتجزئة النص وأرجع تنسورات PyTorch:
334
+
335
+ ```py
336
+ >>> from transformers import AutoTokenizer
337
+
338
+ >>> tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_model")
339
+ >>> inputs = tokenizer(text, return_tensors="pt")
340
+ ```
341
+
342
+ مرر المدخلات إلى النموذج واسترجع `logits`:
343
+
344
+ ```py
345
+ >>> import torch
+ >>> from transformers import AutoModelForSequenceClassification
346
+
347
+ >>> model = AutoModelForSequenceClassification.from_pretrained("stevhliu/my_awesome_model")
348
+ >>> with torch.no_grad():
349
+ ... logits = model(**inputs).logits
350
+ ```
351
+
352
+ استخرج الفئة ذات الاحتمالية الأعلى، واستخدم `id2label` لتحويلها إلى تصنيف نصي:
353
+
354
+ ```py
355
+ >>> predicted_class_id = logits.argmax().item()
356
+ >>> model.config.id2label[predicted_class_id]
357
+ 'POSITIVE'
358
+ ```
359
+ </pt>
360
+ <tf>
361
+ قم بتجزئة النص وأرجع تنسورات TensorFlow:
362
+
363
+ ```py
364
+ >>> from transformers import AutoTokenizer
365
+
366
+ >>> tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_model")
367
+ >>> inputs = tokenizer(text, return_tensors="tf")
368
+ ```
369
+
370
+ قم بتمرير مدخلاتك إلى النموذج وإرجاع `logits`:
371
+
372
+ ```py
373
+ >>> from transformers import TFAutoModelForSequenceClassification
374
+
375
+ >>> model = TFAutoModelForSequenceClassification.from_pretrained("stevhliu/my_awesome_model")
376
+ >>> logits = model(**inputs).logits
377
+ ```
378
+
379
+ استخرج الفئة ذات الاحتمالية الأعلى، واستخدم `id2label` لتحويلها إلى تصنيف نصي:
380
+
381
+ ```py
382
+ >>> import tensorflow as tf
+
+ >>> predicted_class_id = int(tf.math.argmax(logits, axis=-1)[0])
383
+ >>> model.config.id2label[predicted_class_id]
384
+ 'POSITIVE'
385
+ ```
386
+ </tf>
387
+ </frameworkcontent>
transformers/docs/source/ar/tasks/summarization.md ADDED
@@ -0,0 +1,397 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!--Copyright 2022 The HuggingFace Team. All rights reserved.
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
4
+ the License. You may obtain a copy of the License at
5
+
6
+ http://www.apache.org/licenses/LICENSE-2.0
7
+
8
+ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
9
+ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
10
+ specific language governing permissions and limitations under the License.
11
+
12
+ ⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
13
+ rendered properly in your Markdown viewer.
14
+
15
+ -->
16
+
17
+ # التلخيص (Summarization)
18
+
19
+ [[open-in-colab]]
20
+
21
+ <Youtube id="yHnr5Dk2zCI"/>
22
+
23
+ يقوم التلخيص بإنشاء نسخة مختصرة من مستند أو مقال، حيث يلتقط جميع المعلومات المهمة. بالإضافة إلى الترجمة، يعتبر التلخيص مثالاً آخر على مهمة يمكن صياغتها كتسلسل إلى تسلسل. يمكن أن يكون التلخيص:
24
+
25
+ - استخراجي: استخراج أهم المعلومات من مستند.
26
+ - تجريدي: إنشاء نص جديد يلخص أهم المعلومات.
27
+
28
+ سيوضح لك هذا الدليل كيفية:
29
+
30
+ 1. ضبط [T5](https://huggingface.co/google-t5/t5-small) بدقة على مجموعة فرعية من مشاريع قوانين ولاية كاليفورنيا من مجموعة بيانات [BillSum](https://huggingface.co/datasets/billsum) للتلخيص التجريدي.
31
+ 2. استخدام النموذج المضبوط بدقة للتنبؤ.
32
+
33
+ <Tip>
34
+
35
+ لمشاهدة جميع البنى ونقاط التفتيش المتوافقة مع هذه المهمة، نوصي بالتحقق من [صفحة المهمة](https://huggingface.co/tasks/summarization)
36
+
37
+ </Tip>
38
+
39
+ قبل البدء، تأكد من تثبيت جميع المكتبات الضرورية:
40
+
41
+ ```bash
42
+ pip install transformers datasets evaluate rouge_score
43
+ ```
44
+
45
+ نشجعك على تسجيل الدخول إلى حساب Hugging Face الخاص بك حتى تتمكن من تحميل نموذجك ومشاركته مع المجتمع. عند المطالبة، أدخل الرمز المميز لتسجيل الدخول:
46
+
47
+ ```py
48
+ >>> from huggingface_hub import notebook_login
49
+
50
+ >>> notebook_login()
51
+ ```
52
+
53
+ ## تحميل مجموعة بيانات BillSum
54
+
55
+ ابدأ بتحميل جزء صغير من بيانات مشاريع القوانين الخاصة بولاية كاليفورنيا من مجموعة بيانات BillSum في مكتبة 🤗 Datasets:
56
+
57
+ ```py
58
+ >>> from datasets import load_dataset
59
+
60
+ >>> billsum = load_dataset("billsum", split="ca_test")
61
+ ```
62
+
63
+ قسّم مجموعة البيانات إلى مجموعتي تدريب واختبار باستخدام الدالة [`~datasets.Dataset.train_test_split`]:
64
+
65
+ ```py
66
+ >>> billsum = billsum.train_test_split(test_size=0.2)
67
+ ```
68
+
69
+ ثم ألقِ نظرة على مثال:
70
+
71
+ ```py
72
+ >>> billsum["train"][0]
73
+ {'summary': 'Existing law authorizes state agencies to enter into contracts for the acquisition of goods or services upon approval by the Department of General Services. Existing law sets forth various requirements and prohibitions for those contracts, including, but not limited to, a prohibition on entering into contracts for the acquisition of goods or services of $100,000 or more with a contractor that discriminates between spouses and domestic partners or same-sex and different-sex couples in the provision of benefits. Existing law provides that a contract entered into in violation of those requirements and prohibitions is void and authorizes the state or any person acting on behalf of the state to bring a civil action seeking a determination that a contract is in violation and therefore void. Under existing law, a willful violation of those requirements and prohibitions is a misdemeanor.\nThis bill would also prohibit a state agency from entering into contracts for the acquisition of goods or services of $100,000 or more with a contractor that discriminates between employees on the basis of gender identity in the provision of benefits, as specified. By expanding the scope of a crime, this bill would impose a state-mandated local program.\nThe California Constitution requires the state to reimburse local agencies and school districts for certain costs mandated by the state. Statutory provisions establish procedures for making that reimbursement.\nThis bill would provide that no reimbursement is required by this act for a specified reason.',
74
+ 'text': 'The people of the State of California do enact as follows:\n\n\nSECTION 1.\nSection 10295.35 is added to the Public Contract Code, to read:\n10295.35.\n(a) (1) Notwithstanding any other law, a state agency shall not enter into any contract for the acquisition of goods or services in the amount of one hundred thousand dollars ($100,000) or more with a contractor that, in the provision of benefits, discriminates between employees on the basis of an employee’s or dependent’s actual or perceived gender identity, including, but not limited to, the employee’s or dependent’s identification as transgender.\n(2) For purposes of this section, “contract” includes contracts with a cumulative amount of one hundred thousand dollars ($100,000) or more per contractor in each fiscal year.\n(3) For purposes of this section, an employee health plan is discriminatory if the plan is not consistent with Section 1365.5 of the Health and Safety Code and Section 10140 of the Insurance Code.\n(4) The requirements of this section shall apply only to those portions of a contractor’s operations that occur under any of the following conditions:\n(A) Within the state.\n(B) On real property outside the state if the property is owned by the state or if the state has a right to occupy the property, and if the contractor’s presence at that location is connected to a contract with the state.\n(C) Elsewhere in the United States where work related to a state contract is being performed.\n(b) Contractors shall treat as confidential, to the maximum extent allowed by law or by the requirement of the contractor’s insurance provider, any request by an employee or applicant for employment benefits or any documentation of eligibility for benefits submitted by an employee or applicant for employment.\n(c) After taking all reasonable measures to find a contractor that complies with this section, as determined by the state agency, the requirements of this section may be waived under any of the following circumstances:\n(1) There is only one prospective contractor willing to enter into a specific contract with the state agency.\n(2) The contract is necessary to respond to an emergency, as determined by the state agency, that endangers the public health, welfare, or safety, or the contract is necessary for the provision of essential services, and no entity that complies with the requirements of this section capable of responding to the emergency is immediately available.\n(3) The requirements of this section violate, or are inconsistent with, the terms or conditions of a grant, subvention, or agreement, if the agency has made a good faith attempt to change the terms or conditions of any grant, subvention, or agreement to authorize application of this section.\n(4) The contractor is providing wholesale or bulk water, power, or natural gas, the conveyance or transmission of the same, or ancillary services, as required for ensuring reliable services in accordance with good utility practice, if the purchase of the same cannot practically be accomplished through the standard competitive bidding procedures and the contractor is not providing direct retail services to end users.\n(d) (1) A contractor shall not be deemed to discriminate in the provision of benefits if the contractor, in providing the benefits, pays the actual costs incurred in obtaining the benefit.\n(2) If a contractor is unable to provide a certain benefit, despite taking reasonable measures to do so, the contractor shall not be deemed to discriminate in the 
provision of benefits.\n(e) (1) Every contract subject to this chapter shall contain a statement by which the contractor certifies that the contractor is in compliance with this section.\n(2) The department or other contracting agency shall enforce this section pursuant to its existing enforcement powers.\n(3) (A) If a contractor falsely certifies that it is in compliance with this section, the contract with that contractor shall be subject to Article 9 (commencing with Section 10420), unless, within a time period specified by the department or other contracting agency, the contractor provides to the department or agency proof that it has complied, or is in the process of complying, with this section.\n(B) The application of the remedies or penalties contained in Article 9 (commencing with Section 10420) to a contract subject to this chapter shall not preclude the application of any existing remedies otherwise available to the department or other contracting agency under its existing enforcement powers.\n(f) Nothing in this section is intended to regulate the contracting practices of any local jurisdiction.\n(g) This section shall be construed so as not to conflict with applicable federal laws, rules, or regulations. In the event that a court or agency of competent jurisdiction holds that federal law, rule, or regulation invalidates any clause, sentence, paragraph, or section of this code or the application thereof to any person or circumstances, it is the intent of the state that the court or agency sever that clause, sentence, paragraph, or section so that the remainder of this section shall remain in effect.\nSEC. 2.\nSection 10295.35 of the Public Contract Code shall not be construed to create any new enforcement authority or responsibility in the Department of General Services or any other contracting agency.\nSEC. 3.\nNo reimbursement is required by this act pursuant to Section 6 of Article XIII\u2009B of the California Constitution because the only costs that may be incurred by a local agency or school district will be incurred because this act creates a new crime or infraction, eliminates a crime or infraction, or changes the penalty for a crime or infraction, within the meaning of Section 17556 of the Government Code, or changes the definition of a crime within the meaning of Section 6 of Article XIII\u2009B of the California Constitution.',
75
+ 'title': 'An act to add Section 10295.35 to the Public Contract Code, relating to public contracts.'}
76
+ ```
77
+
78
+ هناك مُدخلان سترغب في استخدامهما:
79
+
80
+ - `text`: نص القانون الذي سيكون مُدخلًا للنموذج.
81
+ - `summary`: نسخة مُختصرة من `text` والتي ستكون هدف النموذج.
82
+
83
+ ## المعالجة المسبقة (Preprocess)
84
+
85
+ الخطوة التالية هي تحميل مجزء النصوص T5 لمعالجة `text` و `summary`:
86
+
87
+ ```py
88
+ >>> from transformers import AutoTokenizer
89
+
90
+ >>> checkpoint = "google-t5/t5-small"
91
+ >>> tokenizer = AutoTokenizer.from_pretrained(checkpoint)
92
+ ```
93
+
94
+ وظيفة المعالجة المسبقة التي تريد إنشاءها تحتاج إلى:
95
+
96
+ 1. إضافة بادئة للمُدخل باستخدام توجيه حتى يعرف T5 أن هذه مهمة تلخيص. تتطلب بعض النماذج القادرة على أداء مهام متعددة في معالجة اللغة الطبيعية توجيهات خاصة بكل مهمة.
97
+ 2. استخدام مُعامل الكلمة الرئيسية `text_target` عند ترميز التصنيفات.
98
+ 3. قصّ التسلسلات بحيث لا يزيد طولها عن الحد الأقصى الذي تم تعيينه بواسطة مُعامل `max_length`.
99
+
100
+ ```py
101
+ >>> prefix = "summarize: "
102
+
103
+ >>> def preprocess_function(examples):
104
+ ... inputs = [prefix + doc for doc in examples["text"]]
105
+ ... model_inputs = tokenizer(inputs, max_length=1024, truncation=True)
106
+
107
+ ... labels = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)
108
+
109
+ ... model_inputs["labels"] = labels["input_ids"]
110
+ ... return model_inputs
111
+ ```
112
+
113
+ لتطبيق دالة المعالجة المسبقة على مجموعة البيانات بأكملها، استخدم طريقة [`~datasets.Dataset.map`] الخاصة بـ 🤗 Datasets. يمكنك تسريع دالة `map` عن طريق تعيين `batched=True` لمعالجة عناصر متعددة من مجموعة البيانات في وقت واحد:
114
+
115
+ ```py
116
+ >>> tokenized_billsum = billsum.map(preprocess_function, batched=True)
117
+ ```
118
+
119
+ الآن قم بإنشاء دفعة من الأمثلة باستخدام [`DataCollatorForSeq2Seq`]. من الأكثر كفاءة استخدام *الحشو الديناميكي* للجمل إلى أطول طول في الدفعة أثناء عملية التجميع، بدلاً من حشو مجموعة البيانات بأكملها إلى الحد الأقصى للطول.
120
+
121
+ <frameworkcontent>
122
+ <pt>
123
+
124
+ ```py
125
+ >>> from transformers import DataCollatorForSeq2Seq
126
+
127
+ >>> data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)
128
+ ```
129
+ </pt>
130
+ <tf>
131
+
132
+ ```py
133
+ >>> from transformers import DataCollatorForSeq2Seq
134
+
135
+ >>> data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint, return_tensors="tf")
136
+ ```
137
+ </tf>
138
+ </frameworkcontent>
139
+
140
+ ## التقييم (Evaluate)
141
+
142
+ يُعد تضمين مقياس أثناء التدريب مفيدًا غالبًا لتقييم أداء نموذجك. يمكنك تحميل طريقة تقييم بسرعة باستخدام مكتبة 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index). لهذه المهمة، قم بتحميل مقياس [ROUGE](https://huggingface.co/spaces/evaluate-metric/rouge) (راجع [الجولة السريعة](https://huggingface.co/docs/evaluate/a_quick_tour) الخاصة بـ 🤗 Evaluate لمعرفة المزيد حول كيفية تحميل وحساب مقياس):
143
+
144
+ ```py
145
+ >>> import evaluate
146
+
147
+ >>> rouge = evaluate.load("rouge")
148
+ ```
149
+
150
+ ثم قم بإنشاء دالة تُمرر تنبؤاتك وتصنيفاتك إلى [`~evaluate.EvaluationModule.compute`] لحساب مقياس ROUGE:
151
+
152
+ ```py
153
+ >>> import numpy as np
154
+
155
+ >>> def compute_metrics(eval_pred):
156
+ ... predictions, labels = eval_pred
157
+ ... decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
158
+ ... labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
159
+ ... decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
160
+
161
+ ... result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)
162
+
163
+ ... prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
164
+ ... result["gen_len"] = np.mean(prediction_lens)
165
+
166
+ ... return {k: round(v, 4) for k, v in result.items()}
167
+ ```
168
+
169
+ دالة `compute_metrics` الخاصة بك جاهزة الآن، وستعود إليها عند إعداد التدريب الخاص بك.
170
+
171
+ ## التدريب (Train)
172
+
173
+ <frameworkcontent>
174
+ <pt>
175
+
176
+ <Tip>
177
+
178
+ إذا لم تكن معتادًا على ضبط نموذج باستخدام [`Trainer`]، فألق نظرة على البرنامج التعليمي الأساسي [هنا](../training#train-with-pytorch-trainer)!
179
+
180
+ </Tip>
181
+
182
+ أنت جاهز لبدء تدريب نموذجك الآن! قم بتحميل T5 باستخدام [`AutoModelForSeq2SeqLM`]:
183
+
184
+ ```py
185
+ >>> from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
186
+
187
+ >>> model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
188
+ ```
189
+
190
+ في هذه المرحلة، لم يتبق سوى ثلاث خطوات:
191
+
192
+ 1. حدد مُعامِلات التدريب الخاصة بك في [`Seq2SeqTrainingArguments`]. المعامل الوحيد المطلوب هو `output_dir` الذي يُحدد مكان حفظ نموذجك. ستدفع هذا النموذج إلى Hub عن طريق تعيين `push_to_hub=True` (تحتاج إلى تسجيل الدخول إلى Hugging Face لتحميل نموذجك). في نهاية كل حقبة، سيقوم [`Trainer`] بتقييم مقياس ROUGE وحفظ نقطة تفتيش التدريب.
193
+ 2. مرر مُعامِلات التدريب إلى [`Seq2SeqTrainer`] جنبًا إلى جنب مع النموذج ومجموعة البيانات والمُحلِّل اللغوي وجامع البيانات ودالة `compute_metrics`.
194
+ 3. استدعِ [`~Trainer.train`] لضبط نموذجك.
195
+
196
+ ```py
197
+ >>> training_args = Seq2SeqTrainingArguments(
198
+ ... output_dir="my_awesome_billsum_model",
199
+ ... eval_strategy="epoch",
200
+ ... learning_rate=2e-5,
201
+ ... per_device_train_batch_size=16,
202
+ ... per_device_eval_batch_size=16,
203
+ ... weight_decay=0.01,
204
+ ... save_total_limit=3,
205
+ ... num_train_epochs=4,
206
+ ... predict_with_generate=True,
207
+ ... fp16=True, #change to bf16=True for XPU
208
+ ... push_to_hub=True,
209
+ ... )
210
+
211
+ >>> trainer = Seq2SeqTrainer(
212
+ ... model=model,
213
+ ... args=training_args,
214
+ ... train_dataset=tokenized_billsum["train"],
215
+ ... eval_dataset=tokenized_billsum["test"],
216
+ ... processing_class=tokenizer,
217
+ ... data_collator=data_collator,
218
+ ... compute_metrics=compute_metrics,
219
+ ... )
220
+
221
+ >>> trainer.train()
222
+ ```
223
+
224
+ بمجرد اكتمال التدريب، شارك نموذجك مع Hub باستخدام طريقة [`~transformers.Trainer.push_to_hub`] حتى يتمكن الجميع من استخدام نموذجك:
225
+
226
+ ```py
227
+ >>> trainer.push_to_hub()
228
+ ```
229
+ </pt>
230
+ <tf>
231
+ <Tip>
232
+
233
+ إذا لم تكن معتادًا على ضبط نموذج باستخدام Keras، فألق نظرة على البرنامج التعليمي الأساسي [هنا](../training#train-a-tensorflow-model-with-keras)!
234
+
235
+ </Tip>
236
+ لضبط نموذج في TensorFlow، ابدأ بإعداد دالة مُحسِّن وجدول معدل التعلم وبعض معلمات التدريب:
237
+
238
+ ```py
239
+ >>> from transformers import create_optimizer, AdamWeightDecay
240
+
241
+ >>> optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01)
242
+ ```
243
+
244
+ ثم يمكنك تحميل T5 باستخدام [`TFAutoModelForSeq2SeqLM`]:
245
+
246
+ ```py
247
+ >>> from transformers import TFAutoModelForSeq2SeqLM
248
+
249
+ >>> model = TFAutoModelForSeq2SeqLM.from_pretrained(checkpoint)
250
+ ```
251
+
252
+ حوّل مجموعات البيانات الخاصة بك إلى تنسيق `tf.data.Dataset` باستخدام [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]:
253
+
254
+ ```py
255
+ >>> tf_train_set = model.prepare_tf_dataset(
256
+ ... tokenized_billsum["train"],
257
+ ... shuffle=True,
258
+ ... batch_size=16,
259
+ ... collate_fn=data_collator,
260
+ ... )
261
+
262
+ >>> tf_test_set = model.prepare_tf_dataset(
263
+ ... tokenized_billsum["test"],
264
+ ... shuffle=False,
265
+ ... batch_size=16,
266
+ ... collate_fn=data_collator,
267
+ ... )
268
+ ```
269
+
270
+ قم بتكوين النموذج للتدريب باستخدام [`compile`](https://keras.io/api/models/model_training_apis/#compile-method). لاحظ أن جميع نماذج Transformers لديها دالة خسارة ذات صلة بالمهمة افتراضيًا، لذلك لست بحاجة إلى تحديد واحدة ما لم تكن ترغب في ذلك:
271
+
272
+ ```py
273
+ >>> import tensorflow as tf
274
+
275
+ >>> model.compile(optimizer=optimizer) # No loss argument!
276
+ ```
277
+
278
+ آخر شيئين يجب إعدادهما قبل بدء التدريب هما حساب درجة ROUGE من التنبؤات، وتوفير طريقة لدفع نموذجك إلى Hub. يتم كلاهما باستخدام [استدعاءات Keras](../main_classes/keras_callbacks).
279
+
280
+ مرر دالة `compute_metrics` الخاصة بك إلى [`~transformers.KerasMetricCallback`]:
281
+
282
+ ```py
283
+ >>> from transformers.keras_callbacks import KerasMetricCallback
284
+
285
+ >>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_test_set)
286
+ ```
287
+
288
+ حدد مكان دفع نموذجك ومُحلِّلك اللغوي في [`~transformers.PushToHubCallback`]:
289
+
290
+ ```py
291
+ >>> from transformers.keras_callbacks import PushToHubCallback
292
+
293
+ >>> push_to_hub_callback = PushToHubCallback(
294
+ ... output_dir="my_awesome_billsum_model",
295
+ ... tokenizer=tokenizer,
296
+ ... )
297
+ ```
298
+
299
+ ثم اجمع استدعاءاتك معًا:
300
+
301
+ ```py
302
+ >>> callbacks = [metric_callback, push_to_hub_callback]
303
+ ```
304
+
305
+ أخيرًا، أنت جاهز لبدء تدريب نموذجك! اتصل بـ [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) مع مجموعات بيانات التدريب والتحقق من الصحة وعدد الحقب واستدعاءاتك لضبط النموذج:
306
+
307
+ ```py
308
+ >>> model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=3, callbacks=callbacks)
309
+ ```
310
+
311
+ بمجرد اكتمال التدريب، يتم تحميل نموذجك تلقائيًا إلى Hub حتى يتمكن الجميع من استخدامه!
312
+ </tf>
313
+ </frameworkcontent>
314
+
315
+ <Tip>
316
+
317
+ للحصول على مثال أكثر تعمقًا حول كيفية ضبط نموذج للتلخيص، ألقِ نظرة على [دفتر ملاحظات PyTorch](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization.ipynb)
318
+ أو [دفتر ملاحظات TensorFlow](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization-tf.ipynb) المقابل.
319
+
320
+ </Tip>
321
+
322
+ ## الاستدلال (Inference)
323
+
324
+ رائع، الآن بعد أن قمت بضبط نموذج، يمكنك استخدامه للاستدلال!
325
+
326
+ حدد النص الذي ترغب في تلخيصه. بالنسبة لـ T5، تحتاج إلى إضافة بادئة إلى مُدخلاتك اعتمادًا على المهمة التي تعمل عليها. بالنسبة للتلخيص، يجب عليك إضافة بادئة إلى مُدخلاتك كما هو موضح أدناه:
327
+
328
+ ```py
329
+ >>> text = "summarize: The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying, union jobs across the country. It'll lower the deficit and ask the ultra-wealthy and corporations to pay their fair share. And no one making under $400,000 per year will pay a penny more in taxes."
330
+ ```
331
+
332
+ أبسط طريقة لتجربة نموذجك المضبوط للاستدلال هي استخدامه في [`pipeline`]. استخدم `pipeline` للتلخيص باستخدام نموذجك، ومرر نصك إليه:
333
+
334
+ ```py
335
+ >>> from transformers import pipeline
336
+
337
+ >>> summarizer = pipeline("summarization", model="username/my_awesome_billsum_model")
338
+ >>> summarizer(text)
339
+ [{"summary_text": "The Inflation Reduction Act lowers prescription drug costs, health care costs, and energy costs. It's the most aggressive action on tackling the climate crisis in American history, which will lift up American workers and create good-paying, union jobs across the country."}]
340
+ ```
341
+
342
+ يمكنك أيضًا تكرار نتائج `pipeline` يدويًا إذا أردت:
343
+
344
+ <frameworkcontent>
345
+ <pt>
346
+ قسّم النص إلى رموز وأرجع `input_ids` كتنسورات PyTorch:
347
+
348
+ ```py
349
+ >>> from transformers import AutoTokenizer
350
+
351
+ >>> tokenizer = AutoTokenizer.from_pretrained("username/my_awesome_billsum_model")
352
+ >>> inputs = tokenizer(text, return_tensors="pt").input_ids
353
+ ```
354
+
355
+ استخدم طريقة [`~generation.GenerationMixin.generate`] لإنشاء التلخيص. لمزيد من التفاصيل حول استراتيجيات توليد النص المختلفة والمعلمات للتحكم في التوليد، راجع واجهة برمجة تطبيقات [توليد النص](../main_classes/text_generation).
356
+
357
+ ```py
358
+ >>> from transformers import AutoModelForSeq2SeqLM
359
+
360
+ >>> model = AutoModelForSeq2SeqLM.from_pretrained("username/my_awesome_billsum_model")
361
+ >>> outputs = model.generate(inputs, max_new_tokens=100, do_sample=False)
362
+ ```
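+
+ وعلى سبيل التوضيح فقط، يمكنك أيضًا تجربة إعدادات توليد أخرى مثل البحث الشعاعي؛ اسم المتغير `beam_outputs` وقيم المعاملات هنا افتراضية وللتوضيح فحسب:
+
+ ```py
+ >>> # بحث شعاعي مع منع تكرار المقاطع الثلاثية وإيقاف مبكر
+ >>> beam_outputs = model.generate(inputs, max_new_tokens=100, num_beams=4, no_repeat_ngram_size=3, early_stopping=True)
+ ```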
363
+
364
+ فك تشفير معرفات الرموز المولدة مرة أخرى إلى نص:
365
+
366
+ ```py
367
+ >>> tokenizer.decode(outputs[0], skip_special_tokens=True)
368
+ 'the inflation reduction act lowers prescription drug costs, health care costs, and energy costs. it's the most aggressive action on tackling the climate crisis in american history. it will ask the ultra-wealthy and corporations to pay their fair share.'
369
+ ```
370
+ </pt>
371
+ <tf>
372
+ قسّم النص إلى رموز وأرجع `input_ids` كتنسورات TensorFlow:
373
+
374
+ ```py
375
+ >>> from transformers import AutoTokenizer
376
+
377
+ >>> tokenizer = AutoTokenizer.from_pretrained("username/my_awesome_billsum_model")
378
+ >>> inputs = tokenizer(text, return_tensors="tf").input_ids
379
+ ```
380
+
381
+ استخدم طريقة [`~transformers.generation_tf_utils.TFGenerationMixin.generate`] لإنشاء التلخيص. لمزيد من التفاصيل حول استراتيجيات توليد النص المختلفة والمعلمات للتحكم في التوليد، راجع واجهة برمجة تطبيقات [توليد النص](../main_classes/text_generation).
382
+
383
+ ```py
384
+ >>> from transformers import TFAutoModelForSeq2SeqLM
385
+
386
+ >>> model = TFAutoModelForSeq2SeqLM.from_pretrained("username/my_awesome_billsum_model")
387
+ >>> outputs = model.generate(inputs, max_new_tokens=100, do_sample=False)
388
+ ```
389
+
390
+ فك تشفير معرفات الرموز المولدة مرة أخرى إلى نص:
391
+
392
+ ```py
393
+ >>> tokenizer.decode(outputs[0], skip_special_tokens=True)
394
+ 'the inflation reduction act lowers prescription drug costs, health care costs, and energy costs. it's the most aggressive action on tackling the climate crisis in american history. it will ask the ultra-wealthy and corporations to pay their fair share.'
395
+ ```
396
+ </tf>
397
+ </frameworkcontent>
transformers/docs/source/ar/tasks/token_classification.md ADDED
@@ -0,0 +1,550 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!--Copyright 2022 The HuggingFace Team. All rights reserved.
2
+ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
3
+ the License. You may obtain a copy of the License at
4
+ http://www.apache.org/licenses/LICENSE-2.0
5
+ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
6
+ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
7
+ specific language governing permissions and limitations under the License.
8
+ ⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
9
+ rendered properly in your Markdown viewer.
10
+ -->
11
+
12
+ # تصنيف الرموز(Token classification)
13
+
14
+ [[open-in-colab]]
15
+
16
+ <Youtube id="wVHdVlPScxA"/>
17
+
18
+ يهدف تصنيف الرموز إلى إعطاء تسمية لكل رمز على حدة في الجملة. من أكثر مهام تصنيف الرموز شيوعًا هو التعرف على الكيانات المسماة (NER). يحاول NER تحديد تسمية لكل كيان في الجملة، مثل شخص، أو مكان، أو منظمة.
19
+
20
+ سيوضح لك هذا الدليل كيفية:
21
+
22
+ 1. ضبط [DistilBERT](https://huggingface.co/distilbert/distilbert-base-uncased) على مجموعة بيانات [WNUT 17](https://huggingface.co/datasets/wnut_17) للكشف عن كيانات جديدة.
23
+ 2. استخدام نموذجك المضبوط بدقة للاستدلال.
24
+
25
+ <Tip>
26
+
27
+ للاطلاع على جميع البنى ونقاط التحقق المتوافقة مع هذه المهمة، نوصي بالرجوع إلى [صفحة المهمة](https://huggingface.co/tasks/token-classification).
28
+
29
+ </Tip>
30
+
31
+ قبل أن تبدأ، تأكد من تثبيت جميع المكتبات الضرورية:
32
+
33
+ ```bash
34
+ pip install transformers datasets evaluate seqeval
35
+ ```
36
+
37
+ نحن نشجعك على تسجيل الدخول إلى حساب HuggingFace الخاص بك حتى تتمكن من تحميل ومشاركة نموذجك مع المجتمع. عندما يُطلب منك، أدخل رمزك لتسجيل الدخول:
38
+
39
+ ```py
40
+ >>> from huggingface_hub import notebook_login
41
+
42
+ >>> notebook_login()
43
+ ```
44
+
45
+ ## تحميل مجموعة بيانات WNUT 17
46
+
47
+ ابدأ بتحميل مجموعة بيانات WNUT 17 من مكتبة 🤗 Datasets:
48
+
49
+ ```py
50
+ >>> from datasets import load_dataset
51
+
52
+ >>> wnut = load_dataset("wnut_17")
53
+ ```
54
+
55
+ ثم ألق نظرة على مثال:
56
+
57
+ ```py
58
+ >>> wnut["train"][0]
59
+ {'id': '0',
60
+ 'ner_tags': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 7, 8, 8, 0, 7, 0, 0, 0, 0, 0, 0, 0, 0],
61
+ 'tokens': ['@paulwalk', 'It', "'s", 'the', 'view', 'from', 'where', 'I', "'m", 'living', 'for', 'two', 'weeks', '.', 'Empire', 'State', 'Building', '=', 'ESB', '.', 'Pretty', 'bad', 'storm', 'here', 'last', 'evening', '.']
62
+ }
63
+ ```
64
+
65
+ يمثل كل رقم في `ner_tags` كياناً. حوّل الأرقام إلى أسماء التصنيفات لمعرفة ماهية الكيانات:
66
+
67
+ ```py
68
+ >>> label_list = wnut["train"].features[f"ner_tags"].feature.names
69
+ >>> label_list
70
+ [
71
+ "O",
72
+ "B-corporation",
73
+ "I-corporation",
74
+ "B-creative-work",
75
+ "I-creative-work",
76
+ "B-group",
77
+ "I-group",
78
+ "B-location",
79
+ "I-location",
80
+ "B-person",
81
+ "I-person",
82
+ "B-product",
83
+ "I-product",
84
+ ]
85
+ ```
86
+
87
+ يشير الحرف الذي يسبق كل `ner_tag` إلى موضع الرمز للكيان:
88
+
89
+ - `B-` يشير إلى بداية الكيان.
90
+ - `I-` يشير إلى أن الرمز يقع ضمن نفس الكيان (على سبيل المثال، الرمز `State` هو جزء من كيان مثل `Empire State Building`).
91
+ - `O` يشير إلى أن الرمز لا يمثل أي كيان.
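+
+ ولتوضيح ذلك على المثال الأول (مقتطف توضيحي فقط يعتمد على `label_list` المحمّلة أعلاه)، يمكنك مطابقة الرموز مع تسمياتها:
+
+ ```py
+ >>> example = wnut["train"][0]
+ >>> # اعرض الرموز التي تمثل كيانات فقط مع تسمياتها
+ >>> [(token, label_list[tag]) for token, tag in zip(example["tokens"], example["ner_tags"]) if tag != 0]
+ [('Empire', 'B-location'), ('State', 'I-location'), ('Building', 'I-location'), ('ESB', 'B-location')]
+ ```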
92
+
93
+ ## المعالجة المسبقة(Preprocess)
94
+
95
+ <Youtube id="iY2AZYdZAr0"/>
96
+
97
+ الخطوة التالية هي تحميل مُجزِّئ النصوص DistilBERT للمعالجة المسبقة لحقل `tokens`:
98
+
99
+ ```py
100
+ >>> from transformers import AutoTokenizer
101
+
102
+ >>> tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")
103
+ ```
104
+
105
+ كما رأيت في حقل `tokens` في المثال أعلاه، يبدو أن المُدخل مقسم بالفعل إلى كلمات. لكنه لم يُجزأ إلى رموز بعد، ويتعيّن عليك ضبط `is_split_into_words=True` لتقسيم الكلمات إلى كلمات فرعية. على سبيل المثال:
106
+
107
+ ```py
108
+ >>> example = wnut["train"][0]
109
+ >>> tokenized_input = tokenizer(example["tokens"], is_split_into_words=True)
110
+ >>> tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
111
+ >>> tokens
112
+ ['[CLS]', '@', 'paul', '##walk', 'it', "'", 's', 'the', 'view', 'from', 'where', 'i', "'", 'm', 'living', 'for', 'two', 'weeks', '.', 'empire', 'state', 'building', '=', 'es', '##b', '.', 'pretty', 'bad', 'storm', 'here', 'last', 'evening', '.', '[SEP]']
113
+ ```
114
+
115
+ ومع ذلك، يضيف هذا بعض الرموز الخاصة `[CLS]` و`[SEP]` وتقسيم الكلمات إلى أجزاء يُنشئ عدم تطابق بين المُدخلات والتسميات. قد يتم تقسيم كلمة واحدة تقابل تسمية واحدة الآن إلى كلمتين فرعيتين. ستحتاج إلى إعادة محاذاة الرموز والتسميات عن طريق:
116
+
117
+ 1. ربط كل رمز بالكلمة الأصلية باستخدام الخاصية [`word_ids`](https://huggingface.co/docs/transformers/main_classes/tokenizer#transformers.BatchEncoding.word_ids).
118
+ 2. تعيين التسمية `-100` للرموز الخاصة `[CLS]` و`[SEP]` بحيث يتم تجاهلها بواسطة دالة الخسارة PyTorch (انظر [CrossEntropyLoss](https://pytorch.org/docs/stable/generated/torch.nn.CrossEntropyLoss.html)).
119
+ 3. تسمية الرمز الأول فقط لكلمة معينة. قم بتعيين `-100` لأجزاء الكلمة الأخرى.
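+
+ كما ورد في الخطوة 1 أعلاه، تتيح لك `word_ids` معرفة الكلمة الأصلية لكل رمز؛ إليك مقتطفًا توضيحيًا قصيرًا على المثال السابق (الناتج مُقتطع للتوضيح):
+
+ ```py
+ >>> # None للرموز الخاصة، وتتكرر قيمة الكلمة نفسها عندما تُقسَّم إلى أكثر من رمز
+ >>> tokenized_input.word_ids(batch_index=0)[:8]
+ [None, 0, 0, 0, 1, 2, 2, 3]
+ ```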
120
+
121
+ إليك كيفية إنشاء دالة لإعادة محاذاة الرموز والتسميات، وقصّ الجمل بحيث لا تتجاوز الحد الأقصى لطول مُدخلات DistilBERT:
122
+
123
+ ```py
124
+ >>> def tokenize_and_align_labels(examples):
125
+ ... tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
126
+
127
+ ... labels = []
128
+ ... for i, label in enumerate(examples[f"ner_tags"]):
129
+ ... word_ids = tokenized_inputs.word_ids(batch_index=i) # تعيين الرموز إلى كلماتهم المقابلة.
130
+ ... previous_word_idx = None
131
+ ... label_ids = []
132
+ ... for word_idx in word_ids: # تعيين الرموز الخاصة إلى -100.
133
+ ... if word_idx is None:
134
+ ... label_ids.append(-100)
135
+ ... elif word_idx != previous_word_idx: # تسمية الرمز الأول فقط لكلمة معينة.
136
+ ... label_ids.append(label[word_idx])
137
+ ... else:
138
+ ... label_ids.append(-100)
139
+ ... previous_word_idx = word_idx
140
+ ... labels.append(label_ids)
141
+
142
+ ... tokenized_inputs["labels"] = labels
143
+ ... return tokenized_inputs
144
+ ```
145
+
146
+ لتطبيق هذه العملية على كامل مجموعة البيانات، استخدم الدالة [`~datasets.Dataset.map`] لمجموعة بيانات 🤗. يمكنك تسريع الدالة `map` عن طريق تعيين `batched=True` لمعالجة عناصر متعددة من مجموعة البيانات في وقت واحد:
147
+
148
+ ```py
149
+ >>> tokenized_wnut = wnut.map(tokenize_and_align_labels, batched=True)
150
+ ```
151
+
152
+ الآن قم بإنشاء دفعة من الأمثلة باستخدام [`DataCollatorWithPadding`].من الأفضل استخدام *الحشو الديناميكي* للجمل إلى أطول طول في دفعة أثناء التجميع، بدلاً من حشو مجموعة البيانات بالكامل إلى الطول الأقصى.
153
+
154
+ <frameworkcontent>
155
+ <pt>
156
+ ```py
157
+ >>> from transformers import DataCollatorForTokenClassification
158
+
159
+ >>> data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
160
+ ```
161
+ </pt>
162
+ <tf>
163
+ ```py
164
+ >>> from transformers import DataCollatorForTokenClassification
165
+
166
+ >>> data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer, return_tensors="tf")
167
+ ```
168
+ </tf>
169
+ </frameworkcontent>
170
+
171
+ ## التقييم(Evaluate)
172
+
173
+ يُعدّ تضمين مقياس أثناء التدريب مفيدًا في تقييم أداء نموذجك. يمكنك تحميل طريقة تقييم بسرعة مع مكتبة 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index). لهذه المهمة، قم بتحميل إطار [seqeval](https://huggingface.co/spaces/evaluate-metric/seqeval) (انظر جولة 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) لمعرفة المزيد حول كيفية تحميل وحساب مقياس). يُخرج seqeval عدة نتائج: الدقة (precision)، والاستدعاء (recall)، ومقياس F1، والصحة (accuracy).
174
+
175
+ ```py
176
+ >>> import evaluate
177
+
178
+ >>> seqeval = evaluate.load("seqeval")
179
+ ```
180
+
181
+ احصل على تسميات الكيانات المسماة (NER) أولاً، ثم أنشئ دالة تُمرر تنبؤاتك وتسمياتك الصحيحة إلى [`~evaluate.EvaluationModule.compute`] لحساب النتائج:
182
+
183
+ ```py
184
+ >>> import numpy as np
185
+
186
+ >>> labels = [label_list[i] for i in example[f"ner_tags"]]
187
+
188
+ >>> def compute_metrics(p):
189
+ ... predictions, labels = p
190
+ ... predictions = np.argmax(predictions, axis=2)
191
+
192
+ ... true_predictions = [
193
+ ... [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
194
+ ... for prediction, label in zip(predictions, labels)
195
+ ... ]
196
+ ... true_labels = [
197
+ ... [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
198
+ ... for prediction, label in zip(predictions, labels)
199
+ ... ]
200
+
201
+ ... results = seqeval.compute(predictions=true_predictions, references=true_labels)
202
+ ... return {
203
+ ... "precision": results["overall_precision"],
204
+ ... "recall": results["overall_recall"],
205
+ ... "f1": results["overall_f1"],
206
+ ... "accuracy": results["overall_accuracy"],
207
+ ... }
208
+ ```
209
+
210
+ دالة `compute_metrics` جاهزة للاستخدام، وستحتاج إليها عند إعداد التدريب.
211
+
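+ إذا أردت الاطلاع على بنية النتائج التي تُعيدها `seqeval.compute` قبل إعداد التدريب، فإليك مثالًا مُبسطًا على تسميات افتراضية (لأغراض التوضيح فقط):
+
+ ```py
+ >>> toy_preds = [["O", "B-person", "I-person", "O"]]
+ >>> toy_refs = [["O", "B-person", "I-person", "O"]]
+ >>> toy_results = seqeval.compute(predictions=toy_preds, references=toy_refs)
+ >>> sorted(toy_results.keys())  # تتضمن مفاتيح مثل overall_precision وoverall_recall وoverall_f1 وoverall_accuracy إضافةً إلى نتائج لكل نوع كيان
+ ```
+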
212
+ ## التدريب(Train)
213
+
214
+ قبل تدريب النموذج، جهّز خريطة تربط بين المعرّفات المتوقعة وتسمياتها باستخدام `id2label` و `label2id`:
215
+
216
+ ```py
217
+ >>> id2label = {
218
+ ... 0: "O",
219
+ ... 1: "B-corporation",
220
+ ... 2: "I-corporation",
221
+ ... 3: "B-creative-work",
222
+ ... 4: "I-creative-work",
223
+ ... 5: "B-group",
224
+ ... 6: "I-group",
225
+ ... 7: "B-location",
226
+ ... 8: "I-location",
227
+ ... 9: "B-person",
228
+ ... 10: "I-person",
229
+ ... 11: "B-product",
230
+ ... 12: "I-product",
231
+ ... }
232
+ >>> label2id = {
233
+ ... "O": 0,
234
+ ... "B-corporation": 1,
235
+ ... "I-corporation": 2,
236
+ ... "B-creative-work": 3,
237
+ ... "I-creative-work": 4,
238
+ ... "B-group": 5,
239
+ ... "I-group": 6,
240
+ ... "B-location": 7,
241
+ ... "I-location": 8,
242
+ ... "B-person": 9,
243
+ ... "I-person": 10,
244
+ ... "B-product": 11,
245
+ ... "I-product": 12,
246
+ ... }
247
+ ```
248
+
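+ بدلاً من كتابة الخريطتين يدويًا، يمكنك بناؤهما برمجيًا، بافتراض أن `label_list` لا تزال مُعرّفة من خطوة تحميل البيانات في بداية هذا الدليل (مقتطف اختياري يعطي النتيجة نفسها):
+
+ ```py
+ >>> id2label = dict(enumerate(label_list))
+ >>> label2id = {label: i for i, label in id2label.items()}
+ ```
+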
249
+ <frameworkcontent>
250
+ <pt>
251
+ <Tip>
252
+
253
+ إذا لم تكن على دراية بتعديل نموذج باستخدام [`Trainer`], ألق نظرة على الدليل التعليمي الأساسي [هنا](../training#train-with-pytorch-trainer)!
254
+
255
+ </Tip>
256
+
257
+ أنت مستعد الآن لبدء تدريب نموذجك! قم بتحميل DistilBERT مع [`AutoModelForTokenClassification`] إلى جانب عدد التصنيفات المتوقعة، وخريطة التسميات:
258
+
259
+ ```py
260
+ >>> from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
261
+
262
+ >>> model = AutoModelForTokenClassification.from_pretrained(
263
+ ... "distilbert/distilbert-base-uncased", num_labels=13, id2label=id2label, label2id=label2id
264
+ ... )
265
+ ```
266
+
267
+ في هذه المرحلة، هناك ثلاث خطوات فقط متبقية:
268
+
269
+ 1. حدد معلمات التدريب الخاصة بك في [`TrainingArguments`]. المعامل الوحيد المطلوب هو `output_dir` الذي يحدد مكان حفظ نموذجك. ستقوم بدفع هذا النموذج إلى Hub عن طريق تعيين `push_to_hub=True` (يجب أن تكون مسجلاً الدخول إلى Hugging Face لتحميل نموذجك). في نهاية كل حقبة، سيقوم [`Trainer`] بتقييم درجات seqeval وحفظ نقطة التحقق (checkpoint) من التدريب.
270
+ 2. قم بتمرير معاملات التدريب إلى [`Trainer`] إلى جانب النموذج، ومجموعة البيانات، والمُجزِّئ اللغوي، و`data collator`، ودالة `compute_metrics`.
271
+ 3. استدعِ [`~Trainer.train`] لتدريب نموذجك.
272
+
273
+ ```py
274
+ >>> training_args = TrainingArguments(
275
+ ... output_dir="my_awesome_wnut_model",
276
+ ... learning_rate=2e-5,
277
+ ... per_device_train_batch_size=16,
278
+ ... per_device_eval_batch_size=16,
279
+ ... num_train_epochs=2,
280
+ ... weight_decay=0.01,
281
+ ... eval_strategy="epoch",
282
+ ... save_strategy="epoch",
283
+ ... load_best_model_at_end=True,
284
+ ... push_to_hub=True,
285
+ ... )
286
+
287
+ >>> trainer = Trainer(
288
+ ... model=model,
289
+ ... args=training_args,
290
+ ... train_dataset=tokenized_wnut["train"],
291
+ ... eval_dataset=tokenized_wnut["test"],
292
+ ... processing_class=tokenizer,
293
+ ... data_collator=data_collator,
294
+ ... compute_metrics=compute_metrics,
295
+ ... )
296
+
297
+ >>> trainer.train()
298
+ ```
299
+
300
+ بمجرد اكتمال التدريب، شارك نموذجك على Hub باستخدام طريقة [`~transformers.Trainer.push_to_hub`] حتى يتمكن الجميع من استخدام نموذجك:
301
+
302
+ ```py
303
+ >>> trainer.push_to_hub()
304
+ ```
305
+ </pt>
306
+ <tf>
307
+ <Tip>
308
+
309
+ إذا لم تكن على دراية بتعديل نموذج باستخدام Keras، ألق نظرة على الدليل التعليمي الأساسي [هنا](../training#train-a-tensorflow-model-with-keras)!
310
+
311
+ </Tip>
312
+ للتعديل على نموذج في TensorFlow، ابدأ بإعداد دالة محسن، وجدول معدل التعلم، وبعض معلمات التدريب:
313
+
314
+ ```py
315
+ >>> from transformers import create_optimizer
316
+
317
+ >>> batch_size = 16
318
+ >>> num_train_epochs = 3
319
+ >>> num_train_steps = (len(tokenized_wnut["train"]) // batch_size) * num_train_epochs
320
+ >>> optimizer, lr_schedule = create_optimizer(
321
+ ... init_lr=2e-5,
322
+ ... num_train_steps=num_train_steps,
323
+ ... weight_decay_rate=0.01,
324
+ ... num_warmup_steps=0,
325
+ ... )
326
+ ```
327
+
328
+ ثم يمكنك تحميل DistilBERT مع [`TFAutoModelForTokenClassification`] إلى جانب عدد التسميات المتوقعة، وخرائط التسميات:
329
+
330
+ ```py
331
+ >>> from transformers import TFAutoModelForTokenClassification
332
+
333
+ >>> model = TFAutoModelForTokenClassification.from_pretrained(
334
+ ... "distilbert/distilbert-base-uncased", num_labels=13, id2label=id2label, label2id=label2id
335
+ ... )
336
+ ```
337
+
338
+ قم بتحويل مجموعات بياناتك إلى تنسيق `tf.data.Dataset` مع [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]:
339
+
340
+ ```py
341
+ >>> tf_train_set = model.prepare_tf_dataset(
342
+ ... tokenized_wnut["train"],
343
+ ... shuffle=True,
344
+ ... batch_size=16,
345
+ ... collate_fn=data_collator,
346
+ ... )
347
+
348
+ >>> tf_validation_set = model.prepare_tf_dataset(
349
+ ... tokenized_wnut["validation"],
350
+ ... shuffle=False,
351
+ ... batch_size=16,
352
+ ... collate_fn=data_collator,
353
+ ... )
354
+ ```
355
+
356
+ هيّئ النموذج للتدريب باستخدام [`compile`](https://keras.io/api/models/model_training_apis/#compile-method). لاحظ أن نماذج Transformers تتضمن دالة خسارة افتراضية مرتبطة بالمهمة، لذلك لا تحتاج إلى تحديد واحدة إلا إذا كنت ترغب في ذلك:
357
+
358
+ ```py
359
+ >>> import tensorflow as tf
360
+
361
+ >>> model.compile(optimizer=optimizer) # No loss argument!
362
+ ```
363
+
364
+ آخر أمرين يجب إعدادهما قبل بدء التدريب هما حساب درجات seqeval من التنبؤات، وتوفير طريقة لدفع نموذجك إلى Hub. يتم كلاهما باستخدام [Keras callbacks](../main_classes/keras_callbacks).
365
+
366
+ مرر دالة `compute_metrics` الخاصة بك إلى [`~transformers.KerasMetricCallback`]:
367
+
368
+ ```py
369
+ >>> from transformers.keras_callbacks import KerasMetricCallback
370
+
371
+ >>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set)
372
+ ```
373
+
374
+ حدد مكان دفع نموذجك والمحلل اللغوي في [`~transformers.PushToHubCallback`]:
375
+
376
+ ```py
377
+ >>> from transformers.keras_callbacks import PushToHubCallback
378
+
379
+ >>> push_to_hub_callback = PushToHubCallback(
380
+ ... output_dir="my_awesome_wnut_model",
381
+ ... tokenizer=tokenizer,
382
+ ... )
383
+ ```
384
+
385
+ ثم جمّع callbacks الخاصة بك معًا:
386
+
387
+ ```py
388
+ >>> callbacks = [metric_callback, push_to_hub_callback]
389
+ ```
390
+
391
+ أخيرًا، أنت جاهز الآن لبدء تدريب نموذجك! قم باستدعاء [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) مع بيانات التدريب والتحقق، وعدد الحقبات، وcallbacks لتعديل النموذج:
392
+
393
+ ```py
394
+ >>> model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=3, callbacks=callbacks)
395
+ ```
396
+
397
+ بمجرد اكتمال التدريب، يتم تحميل نموذجك تلقائيًا إلى Hub حتى يتمكن الجميع من استخدامه!
398
+ </tf>
399
+ </frameworkcontent>
400
+
401
+ <Tip>
402
+
403
+ للحصول على مثال أكثر تفصيلاً حول كيفية تعديل نموذج لتصنيف الرموز، ألق نظرة على الدفتر المقابل
404
+ [دفتر PyTorch](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification.ipynb)
405
+ أو [دفتر TensorFlow](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification-tf.ipynb).
406
+
407
+ </Tip>
408
+
409
+ ## الاستدلال(Inference)
410
+
411
+ رائع، الآن بعد أن قمت بتعديل نموذج، يمكنك استخدامه للاستدلال!
412
+
413
+ احصل على بعض النصوص التي تريد تشغيل الاستدلال عليها:
414
+
415
+ ```py
416
+ >>> text = "The Golden State Warriors are an American professional basketball team based in San Francisco."
417
+ ```
418
+
419
+ أبسط طريقة لتجربة نموذجك المُدرب مسبقًا للاستدلال هي استخدامه في [`pipeline`]. قم بتنفيذ `pipeline` لتصنيف الكيانات المسماة مع نموذجك، ومرر نصك إليه:
420
+
421
+ ```py
422
+ >>> from transformers import pipeline
423
+
424
+ >>> classifier = pipeline("ner", model="stevhliu/my_awesome_wnut_model")
425
+ >>> classifier(text)
426
+ [{'entity': 'B-location',
427
+ 'score': 0.42658573,
428
+ 'index': 2,
429
+ 'word': 'golden',
430
+ 'start': 4,
431
+ 'end': 10},
432
+ {'entity': 'I-location',
433
+ 'score': 0.35856336,
434
+ 'index': 3,
435
+ 'word': 'state',
436
+ 'start': 11,
437
+ 'end': 16},
438
+ {'entity': 'B-group',
439
+ 'score': 0.3064001,
440
+ 'index': 4,
441
+ 'word': 'warriors',
442
+ 'start': 17,
443
+ 'end': 25},
444
+ {'entity': 'B-location',
445
+ 'score': 0.65523505,
446
+ 'index': 13,
447
+ 'word': 'san',
448
+ 'start': 80,
449
+ 'end': 83},
450
+ {'entity': 'B-location',
451
+ 'score': 0.4668663,
452
+ 'index': 14,
453
+ 'word': 'francisco',
454
+ 'start': 84,
455
+ 'end': 93}]
456
+ ```
457
+
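+ لاحظ أن المخرجات أعلاه على مستوى الرموز الفرعية. إذا أردت تجميعها في كيانات كاملة، يمكنك تمرير المعامل `aggregation_strategy` إلى `pipeline` كما في هذا المثال التوضيحي:
+
+ ```py
+ >>> classifier = pipeline("ner", model="stevhliu/my_awesome_wnut_model", aggregation_strategy="simple")
+ >>> classifier(text)  # تُعاد الكيانات مُجمّعة مع مفاتيح مثل entity_group وword وscore
+ ```
+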
458
+ يمكنك أيضًا تكرار نتائج `pipeline` يدويًا إذا أردت:
459
+
460
+ <frameworkcontent>
461
+ <pt>
462
+ قسّم النص إلى رموز وأرجع المُوتّرات بصيغة PyTorch:
463
+
464
+ ```py
465
+ >>> from transformers import AutoTokenizer
466
+
467
+ >>> tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_wnut_model")
468
+ >>> inputs = tokenizer(text, return_tensors="pt")
469
+ ```
470
+
471
+ مرر مدخلاتك إلى النموذج واحصل على `logits`:
472
+
473
+ ```py
474
+ >>> import torch
+ >>> from transformers import AutoModelForTokenClassification
475
+
476
+ >>> model = AutoModelForTokenClassification.from_pretrained("stevhliu/my_awesome_wnut_model")
477
+ >>> with torch.no_grad():
478
+ ... logits = model(**inputs).logits
479
+ ```
480
+
481
+ استخرج الفئة ذات الاحتمالية الأعلى، واستخدم جدول `id2label` الخاص بالنموذج لتحويلها إلى تسمية نصية:
482
+
483
+ ```py
484
+ >>> predictions = torch.argmax(logits, dim=2)
485
+ >>> predicted_token_class = [model.config.id2label[t.item()] for t in predictions[0]]
486
+ >>> predicted_token_class
487
+ ['O',
488
+ 'O',
489
+ 'B-location',
490
+ 'I-location',
491
+ 'B-group',
492
+ 'O',
493
+ 'O',
494
+ 'O',
495
+ 'O',
496
+ 'O',
497
+ 'O',
498
+ 'O',
499
+ 'O',
500
+ 'B-location',
501
+ 'B-location',
502
+ 'O',
503
+ 'O']
504
+ ```
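+
+ وإذا أردت ربط كل تسمية بالرمز المقابل لها، يمكنك إقران الرموز بالتنبؤات (مقتطف توضيحي):
+
+ ```py
+ >>> tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
+ >>> list(zip(tokens, predicted_token_class))
+ ```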
505
+ </pt>
506
+ <tf>
507
+ قسّم النص إلى رموز وأرجع المُوتّرات بصيغة TensorFlow:
508
+
509
+ ```py
510
+ >>> from transformers import AutoTokenizer
511
+
512
+ >>> tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_wnut_model")
513
+ >>> inputs = tokenizer(text, return_tensors="tf")
514
+ ```
515
+
516
+ مرر مدخلاتك إلى النموذج واحصل على `logits`:
517
+
518
+ ```py
519
+ >>> from transformers import TFAutoModelForTokenClassification
520
+
521
+ >>> model = TFAutoModelForTokenClassification.from_pretrained("stevhliu/my_awesome_wnut_model")
522
+ >>> logits = model(**inputs).logits
523
+ ```
524
+
525
+ استخرج الفئة ذات الاحتمالية الأعلى، واستخدم جدول `id2label` الخاص بالنموذج لتحويلها إلى تسمية نصية:
526
+
527
+ ```py
528
+ >>> predicted_token_class_ids = tf.math.argmax(logits, axis=-1)
529
+ >>> predicted_token_class = [model.config.id2label[t] for t in predicted_token_class_ids[0].numpy().tolist()]
530
+ >>> predicted_token_class
531
+ ['O',
532
+ 'O',
533
+ 'B-location',
534
+ 'I-location',
535
+ 'B-group',
536
+ 'O',
537
+ 'O',
538
+ 'O',
539
+ 'O',
540
+ 'O',
541
+ 'O',
542
+ 'O',
543
+ 'O',
544
+ 'B-location',
545
+ 'B-location',
546
+ 'O',
547
+ 'O']
548
+ ```
549
+ </tf>
550
+ </frameworkcontent>
transformers/docs/source/ar/tasks/translation.md ADDED
@@ -0,0 +1,407 @@
1
+ <!--Copyright 2022 The HuggingFace Team. All rights reserved.
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
4
+ the License. You may obtain a copy of the License at
5
+
6
+ http://www.apache.org/licenses/LICENSE-2.0
7
+
8
+ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
9
+ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
10
+ specific language governing permissions and limitations under the License.
11
+
12
+ ⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
13
+ rendered properly in your Markdown viewer.
14
+
15
+ -->
16
+
17
+ # الترجمة (Translation)
18
+
19
+ [[open-in-colab]]
20
+
21
+ <Youtube id="1JvfrvZgi6c"/>
22
+
23
+ الترجمة هي عملية تحويل سلسلة نصية من لغة إلى أخرى. وهي إحدى المهام التي يمكن صياغتها كمسألة تسلسل إلى تسلسل، وهو إطار عمل قوي لإنتاج مخرجات من مدخلات، مثل الترجمة أو التلخيص. تُستخدم أنظمة الترجمة عادةً للترجمة بين نصوص لغات مختلفة، ويمكن استخدامها أيضًا لترجمة الكلام أو لمهام تجمع بين النصوص والكلام، مثل تحويل النص إلى كلام أو تحويل الكلام إلى نص.
24
+
25
+ سيوضح لك هذا الدليل كيفية:
26
+
27
+ 1. ضبط دقيق لنموذج [T5](https://huggingface.co/google-t5/t5-small) على المجموعة الفرعية الإنجليزية-الفرنسية من مجموعة بيانات [OPUS Books](https://huggingface.co/datasets/opus_books) لترجمة النص الإنجليزي إلى الفرنسية.
28
+ 2. استخدام النموذج المضبوط بدقة للاستدلال.
29
+
30
+ <Tip>
31
+
32
+ لمشاهدة جميع البنى والنسخ المتوافقة مع هذه المهمة، نوصي بالتحقق من [صفحة المهمة](https://huggingface.co/tasks/translation).
33
+
34
+ </Tip>
35
+
36
+ قبل البدء، تأكد من تثبيت جميع المكتبات الضرورية:
37
+
38
+ ```bash
39
+ pip install transformers datasets evaluate sacrebleu
40
+ ```
41
+
42
+ نشجعك على تسجيل الدخول إلى حساب Hugging Face الخاص بك حتى تتمكن من تحميل نموذجك ومشاركته مع المجتمع. عند الطلب، أدخل الرمز المميز الخاص بك لتسجيل الدخول:
43
+
44
+ ```py
45
+ >>> from huggingface_hub import notebook_login
46
+
47
+ >>> notebook_login()
48
+ ```
49
+
50
+ ## تحميل مجموعة بيانات OPUS Books
51
+
52
+ ابدأ بتحميل المجموعة الفرعية الإنجليزية-الفرنسية من مجموعة بيانات [OPUS Books](https://huggingface.co/datasets/opus_books) من مكتبة 🤗 Datasets:
53
+
54
+ ```py
55
+ >>> from datasets import load_dataset
56
+
57
+ >>> books = load_dataset("opus_books", "en-fr")
58
+ ```
59
+
60
+ قسّم مجموعة البيانات إلى مجموعة تدريب ومجموعة اختبار باستخدام طريقة [`~datasets.Dataset.train_test_split`]:
61
+
62
+ ```py
63
+ >>> books = books["train"].train_test_split(test_size=0.2)
64
+ ```
65
+
66
+ ثم ألقِ نظرة على مثال:
67
+
68
+ ```py
69
+ >>> books["train"][0]
70
+ {'id': '90560',
71
+ 'translation': {'en': 'But this lofty plateau measured only a few fathoms, and soon we reentered Our Element.',
72
+ 'fr': 'Mais ce plateau élevé ne mesurait que quelques toises, et bientôt nous fûmes rentrés dans notre élément.'}}
73
+ ```
74
+
75
+ `translation`: ترجمة إنجليزية وفرنسية للنص.
76
+
77
+ ## المعالجة المسبقة (Preprocess)
78
+
79
+ <Youtube id="XAR8jnZZuUs"/>
80
+
81
+ الخطوة التالية هي تحميل مُجزئ T5 لمعالجة أزواج اللغة الإنجليزية-الفرنسية:
82
+
83
+ ```py
84
+ >>> from transformers import AutoTokenizer
85
+
86
+ >>> checkpoint = "google-t5/t5-small"
87
+ >>> tokenizer = AutoTokenizer.from_pretrained(checkpoint)
88
+ ```
89
+
90
+ يجب أن تقوم دالة المعالجة المسبقة التي تُريد إنشاءها بما يلي:
91
+
92
+ 1. إضافة بادئة إلى المُدخل بمُوجه حتى يعرف T5 أن هذه مهمة ترجمة. تتطلب بعض النماذج القادرة على أداء مهام متعددة توجيهًا لمهام مُحددة.
93
+ 2. تعيين اللغة الهدف (الفرنسية) في معامل `text_target` لضمان معالجة المُجزئ للنص بشكل صحيح. إذا لم تُعيّن `text_target`، فسيُعالج المُجزئ النص على أنه إنجليزي.
94
+ 3. اقتطاع التسلسلات بحيث لا يزيد طولها عن الحد الأقصى الذي يحدده معامل `max_length`.
95
+
96
+ ```py
97
+ >>> source_lang = "en"
98
+ >>> target_lang = "fr"
99
+ >>> prefix = "translate English to French: "
100
+
101
+ >>> def preprocess_function(examples):
102
+ ... inputs = [prefix + example[source_lang] for example in examples["translation"]]
103
+ ... targets = [example[target_lang] for example in examples["translation"]]
104
+ ... model_inputs = tokenizer(inputs, text_target=targets, max_length=128, truncation=True)
105
+ ... return model_inputs
106
+ ```
107
+
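+ للتأكد من شكل المخرجات قبل معالجة مجموعة البيانات بأكملها، يمكنك تجربة الدالة على بضعة أمثلة (مقتطف توضيحي فقط):
+
+ ```py
+ >>> sample = preprocess_function(books["train"][:2])
+ >>> list(sample.keys())  # من المتوقع أن تتضمن input_ids وattention_mask وlabels
+ ```
+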
108
+ لتطبيق دالة المعالجة المسبقة على مجموعة البيانات بأكملها، استخدم طريقة [`~datasets.Dataset.map`] من 🤗 Datasets. يمكنك تسريع دالة `map` عن طريق تعيين `batched=True` لمعالجة عناصر متعددة من مجموعة البيانات في وقت واحد:
109
+
110
+ ```py
111
+ >>> tokenized_books = books.map(preprocess_function, batched=True)
112
+ ```
113
+
114
+ الآن أنشئ دفعة من الأمثلة باستخدام [`DataCollatorForSeq2Seq`]. من الأكثر كفاءة استخدام *الحشو الديناميكي* للجمل إلى أطول طول في الدفعة أثناء التجميع، بدلاً من حشو مجموعة البيانات بأكملها إلى الحد الأقصى للطول.
115
+
116
+ <frameworkcontent>
117
+ <pt>
118
+
119
+ ```py
120
+ >>> from transformers import DataCollatorForSeq2Seq
121
+
122
+ >>> data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint)
123
+ ```
124
+ </pt>
125
+ <tf>
126
+
127
+ ```py
128
+ >>> from transformers import DataCollatorForSeq2Seq
129
+
130
+ >>> data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=checkpoint, return_tensors="tf")
131
+ ```
132
+ </tf>
133
+ </frameworkcontent>
134
+
135
+ ## التقييم (Evaluate)
136
+
137
+ غالباً ما يكون تضمين مقياس أثناء التدريب مفيداً لتقييم أداء نموذجك. يمكنك تحميل طريقة تقييم بسرعة باستخدام مكتبة 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index). لهذه المهمة، حمّل مقياس [SacreBLEU](https://huggingface.co/spaces/evaluate-metric/sacrebleu) (راجع [الجولة السريعة](https://huggingface.co/docs/evaluate/a_quick_tour) لـ 🤗 Evaluate لمعرفة المزيد حول كيفية تحميل وحساب مقياس):
138
+
139
+ ```py
140
+ >>> import evaluate
141
+
142
+ >>> metric = evaluate.load("sacrebleu")
143
+ ```
144
+
145
+ ثم أنشئ دالة تُمرر تنبؤاتك وتسمياتك إلى [`~evaluate.EvaluationModule.compute`] لحساب درجة SacreBLEU:
146
+
147
+ ```py
148
+ >>> import numpy as np
149
+
150
+ >>> def postprocess_text(preds, labels):
151
+ ... preds = [pred.strip() for pred in preds]
152
+ ... labels = [[label.strip()] for label in labels]
153
+
154
+ ... return preds, labels
155
+
156
+ >>> def compute_metrics(eval_preds):
157
+ ... preds, labels = eval_preds
158
+ ... if isinstance(preds, tuple):
159
+ ... preds = preds[0]
160
+ ... decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
161
+
162
+ ... labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
163
+ ... decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
164
+
165
+ ... decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)
166
+
167
+ ... result = metric.compute(predictions=decoded_preds, references=decoded_labels)
168
+ ... result = {"bleu": result["score"]}
169
+
170
+ ... prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
171
+ ... result["gen_len"] = np.mean(prediction_lens)
172
+ ... result = {k: round(v, 4) for k, v in result.items()}
173
+ ... return result
174
+ ```
175
+
176
+ دالة `compute_metrics` الخاصة بك جاهزة الآن، وسوف تعود إليها عند إعداد التدريب.
177
+
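+ لمعاينة شكل مخرجات مقياس SacreBLEU قبل التدريب، إليك مثالًا مُبسطًا على جمل افتراضية (لأغراض التوضيح فقط):
+
+ ```py
+ >>> toy_result = metric.compute(
+ ...     predictions=["Le chat est sur le tapis."],
+ ...     references=[["Le chat est sur le tapis."]],
+ ... )
+ >>> round(toy_result["score"], 2)  # تقترب الدرجة من 100.0 عند التطابق التام
+ ```
+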
178
+ ## التدريب (Train)
179
+
180
+ <frameworkcontent>
181
+ <pt>
182
+
183
+ <Tip>
184
+
185
+ إذا لم تكن معتادًا على الضبط الدقيق لنموذج باستخدام [`Trainer`]، فألقِ نظرة على البرنامج التعليمي الأساسي [هنا](../training#train-with-pytorch-trainer)!
186
+
187
+ </Tip>
188
+
189
+ أنت جاهز لبدء تدريب نموذجك الآن! حمّل T5 باستخدام [`AutoModelForSeq2SeqLM`]:
190
+
191
+ ```py
192
+ >>> from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
193
+
194
+ >>> model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
195
+ ```
196
+
197
+ في هذه المرحلة، تبقى ثلاث خطوات فقط:
198
+
199
+ 1. حدد مُعاملات التدريب في [`Seq2SeqTrainingArguments`]. المُعامل الوحيد المطلوب هو `output_dir` الذي يحدد مكان حفظ نموذجك. ستقوم بدفع هذا النموذج إلى Hub عن طريق تعيين `push_to_hub=True` (يجب عليك تسجيل الدخول إلى Hugging Face لتحميل نموذجك). في نهاية كل حقبة، سيقوم [`Trainer`] بتقييم مقياس SacreBLEU وحفظ نقطة التحقق (checkpoint) من التدريب.
200
+ 2. مرر مُعاملات التدريب إلى [`Seq2SeqTrainer`] جنبًا إلى جنب مع النموذج ومجموعة البيانات والمعالج اللغوي وجامع البيانات ووظيفة `compute_metrics`.
201
+ 3. نفّذ [`~Trainer.train`] لضبط نموذجك.
202
+
203
+ ```py
204
+ >>> training_args = Seq2SeqTrainingArguments(
205
+ ... output_dir="my_awesome_opus_books_model",
206
+ ... eval_strategy="epoch",
207
+ ... learning_rate=2e-5,
208
+ ... per_device_train_batch_size=16,
209
+ ... per_device_eval_batch_size=16,
210
+ ... weight_decay=0.01,
211
+ ... save_total_limit=3,
212
+ ... num_train_epochs=2,
213
+ ... predict_with_generate=True,
214
+ ... fp16=True, #change to bf16=True for XPU
215
+ ... push_to_hub=True,
216
+ ... )
217
+
218
+ >>> trainer = Seq2SeqTrainer(
219
+ ... model=model,
220
+ ... args=training_args,
221
+ ... train_dataset=tokenized_books["train"],
222
+ ... eval_dataset=tokenized_books["test"],
223
+ ... processing_class=tokenizer,
224
+ ... data_collator=data_collator,
225
+ ... compute_metrics=compute_metrics,
226
+ ... )
227
+
228
+ >>> trainer.train()
229
+ ```
230
+
231
+ بمجرد اكتمال التدريب، شارك نموذجك مع Hub باستخدام طريقة [`~transformers.Trainer.push_to_hub`] حتى يتمكن الجميع من استخدام نموذجك:
232
+
233
+ ```py
234
+ >>> trainer.push_to_hub()
235
+ ```
236
+ </pt>
237
+ <tf>
238
+ <Tip>
239
+
240
+ إذا لم تكن معتادًا على ضبط نموذج باستخدام Keras، فألق نظرة على البرنامج التعليمي الأساسي [هنا](../training#train-a-tensorflow-model-with-keras)!
241
+
242
+ </Tip>
243
+ لضبط نموذج في TensorFlow، ابدأ بإعداد دالة مُحسِّن وجدول معدل تعلم وبعض المعلمات الفائقة للتدريب:
244
+
245
+ ```py
246
+ >>> from transformers import AdamWeightDecay
247
+
248
+ >>> optimizer = AdamWeightDecay(learning_rate=2e-5, weight_decay_rate=0.01)
249
+ ```
250
+
251
+ ثم يمكنك تحميل T5 باستخدام [`TFAutoModelForSeq2SeqLM`]:
252
+
253
+ ```py
254
+ >>> from transformers import TFAutoModelForSeq2SeqLM
255
+
256
+ >>> model = TFAutoModelForSeq2SeqLM.from_pretrained(checkpoint)
257
+ ```
258
+
259
+ حوّل مجموعات البيانات الخاصة بك إلى تنسيق `tf.data.Dataset` باستخدام [`~transformers.TFPreTrainedModel.prepare_tf_dataset`]:
260
+
261
+ ```py
262
+ >>> tf_train_set = model.prepare_tf_dataset(
263
+ ... tokenized_books["train"],
264
+ ... shuffle=True,
265
+ ... batch_size=16,
266
+ ... collate_fn=data_collator,
267
+ ... )
268
+
269
+ >>> tf_test_set = model.prepare_tf_dataset(
270
+ ... tokenized_books["test"],
271
+ ... shuffle=False,
272
+ ... batch_size=16,
273
+ ... collate_fn=data_collator,
274
+ ... )
275
+ ```
276
+
277
+ قم بتكوين النموذج للتدريب باستخدام [`compile`](https://keras.io/api/models/model_training_apis/#compile-method). لاحظ أن جميع نماذج Transformers تحتوي على دالة خسارة ذات صلة بالمهمة بشكل افتراضي، لذلك لا تحتاج إلى تحديد واحدة إلا إذا كنت ترغب في ذلك:
278
+
279
+ ```py
280
+ >>> import tensorflow as tf
281
+
282
+ >>> model.compile(optimizer=optimizer) # No loss argument!
283
+ ```
284
+
285
+ آخر شيئين يجب إعدادهما قبل بدء التدريب هما حساب مقياس SacreBLEU من التوقعات، وتوفير طريقة لدفع نموذجك إلى Hub. يتم كلاهما باستخدام [استدعاءات Keras](../main_classes/keras_callbacks).
286
+
287
+ مرر دالة `compute_metrics` الخاصة بك إلى [`~transformers.KerasMetricCallback`]:
288
+
289
+ ```py
290
+ >>> from transformers.keras_callbacks import KerasMetricCallback
291
+
292
+ >>> metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_test_set)
293
+ ```
294
+
295
+ حدد مكان دفع نموذجك ومعالجك اللغوي في [`~transformers.PushToHubCallback`]:
296
+
297
+ ```py
298
+ >>> from transformers.keras_callbacks import PushToHubCallback
299
+
300
+ >>> push_to_hub_callback = PushToHubCallback(
301
+ ... output_dir="my_awesome_opus_books_model",
302
+ ... tokenizer=tokenizer,
303
+ ... )
304
+ ```
305
+
306
+ ثم اجمع استدعاءاتك معًا:
307
+
308
+ ```py
309
+ >>> callbacks = [metric_callback, push_to_hub_callback]
310
+ ```
311
+
312
+ أخيرًا، أنت جاهز لبدء تدريب نموذجك! استدعِ [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) مع مجموعات بيانات التدريب والتحقق وعدد الحقب واستدعاءاتك لضبط النموذج:
313
+
314
+ ```py
315
+ >>> model.fit(x=tf_train_set, validation_data=tf_test_set, epochs=3, callbacks=callbacks)
316
+ ```
317
+
318
+ بمجرد اكتمال التدريب، يتم تحميل نموذجك تلقائيًا إلى Hub حتى يتمكن الجميع من استخدامه!
319
+ </tf>
320
+ </frameworkcontent>
321
+
322
+ <Tip>
323
+
324
+ للحصول على مثال أكثر تعمقًا لكيفية ضبط نموذج للترجمة، ألق نظرة على [دفتر ملاحظات PyTorch](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/translation.ipynb) المقابل
325
+ أو [دفتر ملاحظات TensorFlow](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/translation-tf.ipynb).
326
+
327
+ </Tip>
328
+
329
+ ## الاستدلال (Inference)
330
+
331
+ رائع، الآن بعد أن قمت بضبط نموذج، يمكنك استخدامه للاستدلال!
332
+
333
+ أحضر بعض النصوص التي ترغب في ترجمتها إلى لغة أخرى. بالنسبة لـ T5، تحتاج إلى إضافة بادئة إلى مدخلاتك اعتمادًا على المهمة التي تعمل عليها. للترجمة من الإنجليزية إلى الفرنسية، يجب عليك إضافة بادئة إلى مدخلاتك كما هو موضح أدناه:
334
+
335
+ ```py
336
+ >>> text = "translate English to French: Legumes share resources with nitrogen-fixing bacteria."
337
+ ```
338
+
339
+ أبسط طريقة لتجربة نموذجك المضبوط للاستدلال هي استخدامه في [`pipeline`]. قم بإنشاء مثيل لـ `pipeline` للترجمة باستخدام نموذجك، ومرر النص الخاص بك إليه:
340
+
341
+ ```py
342
+ >>> from transformers import pipeline
343
+
344
+ # تغيير `xx` إلى لغة الإدخال و `yy` إلى لغة المخرجات المطلوبة.
345
+ # أمثلة: "en" للغة الإنجليزية، "fr" للغة الفرنسية، "de" للغة الألمانية، "es" للغة الإسبانية، "zh" للغة الصينية، إلخ؛ translation_en_to_fr تترجم من الإنجليزية إلى الفرنسية
346
+ # يمكنك عرض جميع قوائم اللغات هنا - https://huggingface.co/languages
347
+ >>> translator = pipeline("translation_xx_to_yy", model="username/my_awesome_opus_books_model")
348
+ >>> translator(text)
349
+ [{'translation_text': 'Legumes partagent des ressources avec des bactéries azotantes.'}]
350
+ ```
351
+
352
+ يمكنك أيضًا تكرار نتائج `pipeline` يدويًا إذا أردت:
353
+
354
+ <frameworkcontent>
355
+ <pt>
356
+ قم بتحويل النص إلى رموز وإرجاع `input_ids` كموترات PyTorch:
357
+
358
+ ```py
359
+ >>> from transformers import AutoTokenizer
360
+
361
+ >>> tokenizer = AutoTokenizer.from_pretrained("username/my_awesome_opus_books_model")
362
+ >>> inputs = tokenizer(text, return_tensors="pt").input_ids
363
+ ```
364
+
365
+ استخدم الدالة [`~generation.GenerationMixin.generate`] لإنشاء الترجمة. لمزيد من التفاصيل حول استراتيجيات توليد النصوص المختلفة والمعلمات للتحكم في التوليد، تحقق من واجهة برمجة تطبيقات [توليد النصوص](../main_classes/text_generation).
366
+
367
+ ```py
368
+ >>> from transformers import AutoModelForSeq2SeqLM
369
+
370
+ >>> model = AutoModelForSeq2SeqLM.from_pretrained("username/my_awesome_opus_books_model")
371
+ >>> outputs = model.generate(inputs, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95)
372
+ ```
373
+
374
+ فك تشفير معرفات الرموز المولدة مرة أخرى إلى نص:
375
+
376
+ ```py
377
+ >>> tokenizer.decode(outputs[0], skip_special_tokens=True)
378
+ 'Les lignées partagent des ressources avec des bactéries enfixant l'azote.'
379
+ ```
380
+ </pt>
381
+ <tf>
382
+ قم بتحويل النص إلى رموز وإرجاع `input_ids` كموترات TensorFlow:
383
+
384
+ ```py
385
+ >>> from transformers import AutoTokenizer
386
+
387
+ >>> tokenizer = AutoTokenizer.from_pretrained("username/my_awesome_opus_books_model")
388
+ >>> inputs = tokenizer(text, return_tensors="tf").input_ids
389
+ ```
390
+
391
+ استخدم طريقة [`~transformers.generation_tf_utils.TFGenerationMixin.generate`] لإنشاء الترجمة. لمزيد من التفاصيل حول استراتيجيات توليد النصوص المختلفة والمعلمات للتحكم في التوليد، تحقق من واجهة برمجة تطبيقات [توليد النصوص](../main_classes/text_generation).
392
+
393
+ ```py
394
+ >>> from transformers import TFAutoModelForSeq2SeqLM
395
+
396
+ >>> model = TFAutoModelForSeq2SeqLM.from_pretrained("username/my_awesome_opus_books_model")
397
+ >>> outputs = model.generate(inputs, max_new_tokens=40, do_sample=True, top_k=30, top_p=0.95)
398
+ ```
399
+
400
+ فك تشفير معرفات الرموز المولدة مرة أخرى إلى نص:
401
+
402
+ ```py
403
+ >>> tokenizer.decode(outputs[0], skip_special_tokens=True)
404
+ 'Les lugumes partagent les ressources avec des bactéries fixatrices d'azote.'
405
+ ```
406
+ </tf>
407
+ </frameworkcontent>
transformers/docs/source/en/_config.py ADDED
@@ -0,0 +1,14 @@
1
+ # docstyle-ignore
2
+ INSTALL_CONTENT = """
3
+ # Transformers installation
4
+ ! pip install transformers datasets evaluate accelerate
5
+ # To install from source instead of the last release, comment the command above and uncomment the following one.
6
+ # ! pip install git+https://github.com/huggingface/transformers.git
7
+ """
8
+
9
+ notebook_first_cells = [{"type": "code", "content": INSTALL_CONTENT}]
10
+ black_avoid_patterns = {
11
+ "{processor_class}": "FakeProcessorClass",
12
+ "{model_class}": "FakeModelClass",
13
+ "{object_class}": "FakeObjectClass",
14
+ }
transformers/docs/source/en/_redirects.yml ADDED
@@ -0,0 +1,5 @@
1
+ # Optimizing inference
2
+
3
+ perf_infer_gpu_many: perf_infer_gpu_one
4
+ transformers_agents: agents
5
+ quantization: quantization/overview
transformers/docs/source/en/_toctree.yml ADDED
@@ -0,0 +1,1152 @@
1
+ - sections:
2
+ - local: index
3
+ title: Transformers
4
+ - local: installation
5
+ title: Installation
6
+ - local: quicktour
7
+ title: Quickstart
8
+ title: Get started
9
+ - isExpanded: false
10
+ sections:
11
+ - sections:
12
+ - local: models
13
+ title: Loading models
14
+ - local: custom_models
15
+ title: Customizing models
16
+ - local: how_to_hack_models
17
+ title: Customizing model components
18
+ - local: model_sharing
19
+ title: Sharing
20
+ - local: modular_transformers
21
+ title: Contributing a new model to Transformers
22
+ - local: add_new_model
23
+ title: Legacy model contribution
24
+ - local: auto_docstring
25
+ title: Documenting a model
26
+ - local: attention_interface
27
+ title: Customizing attention function
28
+ title: Models
29
+ - sections:
30
+ - local: fast_tokenizers
31
+ title: Tokenizers
32
+ - local: image_processors
33
+ title: Image processors
34
+ - local: video_processors
35
+ title: Video processors
36
+ - local: backbones
37
+ title: Backbones
38
+ - local: feature_extractors
39
+ title: Feature extractors
40
+ - local: processors
41
+ title: Processors
42
+ - local: tokenizer_summary
43
+ title: Summary of the tokenizers
44
+ - local: pad_truncation
45
+ title: Padding and truncation
46
+ title: Preprocessors
47
+ title: Base classes
48
+ - isExpanded: false
49
+ sections:
50
+ - sections:
51
+ - local: pipeline_tutorial
52
+ title: Pipeline
53
+ - local: pipeline_gradio
54
+ title: Machine learning apps
55
+ - local: pipeline_webserver
56
+ title: Web server inference
57
+ - local: add_new_pipeline
58
+ title: Adding a new pipeline
59
+ title: Pipeline API
60
+ - sections:
61
+ - local: llm_tutorial
62
+ title: Text generation
63
+ - local: generation_strategies
64
+ title: Generation strategies
65
+ - local: generation_features
66
+ title: Generation features
67
+ - local: tasks/prompting
68
+ title: Prompt engineering
69
+ - local: llm_optims
70
+ title: Optimizing inference
71
+ - local: cache_explanation
72
+ title: Caching
73
+ - local: kv_cache
74
+ title: KV cache strategies
75
+ - local: serving
76
+ title: Serving
77
+ - local: llm_tutorial_optimization
78
+ title: Getting the most out of LLMs
79
+ - local: perplexity
80
+ title: Perplexity of fixed-length models
81
+ title: LLMs
82
+ - sections:
83
+ - local: conversations
84
+ title: Chat basics
85
+ - local: chat_templating
86
+ title: Templates
87
+ - local: chat_templating_multimodal
88
+ title: Multimodal templates
89
+ - local: chat_templating_writing
90
+ title: Template writing
91
+ - local: chat_extras
92
+ title: Tools and RAG
93
+ title: Chat with models
94
+ - sections:
95
+ - local: perf_torch_compile
96
+ title: torch.compile
97
+ - local: perf_infer_gpu_one
98
+ title: GPU
99
+ - local: perf_infer_gpu_multi
100
+ title: Distributed inference
101
+ - local: perf_infer_cpu
102
+ title: CPU
103
+ title: Optimization
104
+ - local: agents
105
+ title: Agents
106
+ - local: tools
107
+ title: Tools
108
+ title: Inference
109
+ - isExpanded: false
110
+ sections:
111
+ - sections:
112
+ - local: trainer
113
+ title: Trainer
114
+ - local: training
115
+ title: Fine-tuning
116
+ - local: optimizers
117
+ title: Optimizers
118
+ - local: hpo_train
119
+ title: Hyperparameter search
120
+ title: Trainer API
121
+ - sections:
122
+ - local: accelerator_selection
123
+ title: Accelerator selection
124
+ - local: accelerate
125
+ title: Accelerate
126
+ - local: fsdp
127
+ title: FullyShardedDataParallel
128
+ - local: deepspeed
129
+ title: DeepSpeed
130
+ - local: debugging
131
+ title: Multi-GPU debugging
132
+ - local: perf_train_cpu_many
133
+ title: Distributed CPUs
134
+ - local: perf_train_gpu_many
135
+ title: Parallelism methods
136
+ title: Distributed training
137
+ - sections:
138
+ - local: perf_train_gpu_one
139
+ title: GPU
140
+ - local: perf_train_cpu
141
+ title: CPU
142
+ - local: perf_train_special
143
+ title: Apple Silicon
144
+ - local: perf_train_gaudi
145
+ title: Intel Gaudi
146
+ - local: perf_hardware
147
+ title: Build your own machine
148
+ title: Hardware
149
+ - local: peft
150
+ title: PEFT
151
+ - local: model_memory_anatomy
152
+ title: Model training anatomy
153
+ title: Training
154
+ - isExpanded: false
155
+ sections:
156
+ - local: quantization/overview
157
+ title: Overview
158
+ - local: quantization/selecting
159
+ title: Selecting a quantization method
160
+ - local: quantization/concept_guide
161
+ title: Quantization concepts
162
+ - local: quantization/aqlm
163
+ title: AQLM
164
+ - local: quantization/auto_round
165
+ title: AutoRound
166
+ - local: quantization/awq
167
+ title: AWQ
168
+ - local: quantization/bitnet
169
+ title: BitNet
170
+ - local: quantization/bitsandbytes
171
+ title: bitsandbytes
172
+ - local: quantization/compressed_tensors
173
+ title: compressed-tensors
174
+ - local: quantization/eetq
175
+ title: EETQ
176
+ - local: quantization/fbgemm_fp8
177
+ title: FBGEMM
178
+ - local: quantization/finegrained_fp8
179
+ title: Fine-grained FP8
180
+ - local: gguf
181
+ title: GGUF
182
+ - local: quantization/gptq
183
+ title: GPTQ
184
+ - local: quantization/higgs
185
+ title: HIGGS
186
+ - local: quantization/hqq
187
+ title: HQQ
188
+ - local: quantization/optimum
189
+ title: Optimum
190
+ - local: quantization/quanto
191
+ title: Quanto
192
+ - local: quantization/quark
193
+ title: Quark
194
+ - local: quantization/torchao
195
+ title: torchao
196
+ - local: quantization/spqr
197
+ title: SpQR
198
+ - local: quantization/vptq
199
+ title: VPTQ
200
+ - local: quantization/contribute
201
+ title: Contribute
202
+ title: Quantization
203
+ - isExpanded: false
204
+ sections:
205
+ - local: serialization
206
+ title: ONNX
207
+ - local: tflite
208
+ title: LiteRT
209
+ - local: executorch
210
+ title: ExecuTorch
211
+ - local: torchscript
212
+ title: TorchScript
213
+ title: Export to production
214
+ - isExpanded: false
215
+ sections:
216
+ - sections:
217
+ - sections:
218
+ - local: tasks/sequence_classification
219
+ title: Text classification
220
+ - local: tasks/token_classification
221
+ title: Token classification
222
+ - local: tasks/question_answering
223
+ title: Question answering
224
+ - local: tasks/language_modeling
225
+ title: Causal language modeling
226
+ - local: tasks/masked_language_modeling
227
+ title: Masked language modeling
228
+ - local: tasks/translation
229
+ title: Translation
230
+ - local: tasks/summarization
231
+ title: Summarization
232
+ - local: tasks/multiple_choice
233
+ title: Multiple choice
234
+ title: Natural language processing
235
+ - sections:
236
+ - local: tasks/audio_classification
237
+ title: Audio classification
238
+ - local: tasks/asr
239
+ title: Automatic speech recognition
240
+ title: Audio
241
+ - sections:
242
+ - local: tasks/image_classification
243
+ title: Image classification
244
+ - local: tasks/semantic_segmentation
245
+ title: Image segmentation
246
+ - local: tasks/video_classification
247
+ title: Video classification
248
+ - local: tasks/object_detection
249
+ title: Object detection
250
+ - local: tasks/zero_shot_object_detection
251
+ title: Zero-shot object detection
252
+ - local: tasks/zero_shot_image_classification
253
+ title: Zero-shot image classification
254
+ - local: tasks/monocular_depth_estimation
255
+ title: Depth estimation
256
+ - local: tasks/image_to_image
257
+ title: Image-to-Image
258
+ - local: tasks/image_feature_extraction
259
+ title: Image Feature Extraction
260
+ - local: tasks/mask_generation
261
+ title: Mask Generation
262
+ - local: tasks/keypoint_detection
263
+ title: Keypoint detection
264
+ - local: tasks/knowledge_distillation_for_image_classification
265
+ title: Knowledge Distillation for Computer Vision
266
+ title: Computer vision
267
+ - sections:
268
+ - local: tasks/image_captioning
269
+ title: Image captioning
270
+ - local: tasks/document_question_answering
271
+ title: Document Question Answering
272
+ - local: tasks/visual_question_answering
273
+ title: Visual Question Answering
274
+ - local: tasks/text-to-speech
275
+ title: Text to speech
276
+ - local: tasks/idefics
277
+ title: Image tasks with IDEFICS
278
+ - local: tasks/image_text_to_text
279
+ title: Image-text-to-text
280
+ - local: tasks/video_text_to_text
281
+ title: Video-text-to-text
282
+ - local: tasks/visual_document_retrieval
283
+ title: Visual Document Retrieval
284
+ title: Multimodal
285
+ title: Task recipes
286
+ - local: run_scripts
287
+ title: Training scripts
288
+ - local: glossary
289
+ title: Glossary
290
+ - local: philosophy
291
+ title: Philosophy
292
+ - local: notebooks
293
+ title: Notebooks with examples
294
+ - local: community
295
+ title: Community resources
296
+ - local: troubleshooting
297
+ title: Troubleshoot
298
+ title: Resources
299
+ - isExpanded: false
300
+ sections:
301
+ - local: contributing
302
+ title: Contribute to Transformers
303
+ - local: testing
304
+ title: Transformers model tests
305
+ - local: pr_checks
306
+ title: Pull request checks
307
+ title: Contribute
308
+ - isExpanded: false
309
+ sections:
310
+ - sections:
311
+ - local: model_doc/auto
312
+ title: Auto Classes
313
+ - local: main_classes/backbones
314
+ title: Backbones
315
+ - local: main_classes/callback
316
+ title: Callbacks
317
+ - local: main_classes/configuration
318
+ title: Configuration
319
+ - local: main_classes/data_collator
320
+ title: Data Collator
321
+ - local: main_classes/keras_callbacks
322
+ title: Keras callbacks
323
+ - local: main_classes/logging
324
+ title: Logging
325
+ - local: main_classes/model
326
+ title: Models
327
+ - local: main_classes/text_generation
328
+ title: Text Generation
329
+ - local: main_classes/onnx
330
+ title: ONNX
331
+ - local: main_classes/optimizer_schedules
332
+ title: Optimization
333
+ - local: main_classes/output
334
+ title: Model outputs
335
+ - local: main_classes/peft
336
+ title: PEFT
337
+ - local: main_classes/pipelines
338
+ title: Pipelines
339
+ - local: main_classes/processors
340
+ title: Processors
341
+ - local: main_classes/quantization
342
+ title: Quantization
343
+ - local: main_classes/tokenizer
344
+ title: Tokenizer
345
+ - local: main_classes/trainer
346
+ title: Trainer
347
+ - local: main_classes/deepspeed
348
+ title: DeepSpeed
349
+ - local: main_classes/executorch
350
+ title: ExecuTorch
351
+ - local: main_classes/feature_extractor
352
+ title: Feature Extractor
353
+ - local: main_classes/image_processor
354
+ title: Image Processor
355
+ - local: main_classes/video_processor
356
+ title: Video Processor
357
+ title: Main Classes
358
+ - sections:
359
+ - sections:
360
+ - local: model_doc/albert
361
+ title: ALBERT
362
+ - local: model_doc/arcee
363
+ title: Arcee
364
+ - local: model_doc/bamba
365
+ title: Bamba
366
+ - local: model_doc/bart
367
+ title: BART
368
+ - local: model_doc/barthez
369
+ title: BARThez
370
+ - local: model_doc/bartpho
371
+ title: BARTpho
372
+ - local: model_doc/bert
373
+ title: BERT
374
+ - local: model_doc/bert-generation
375
+ title: BertGeneration
376
+ - local: model_doc/bert-japanese
377
+ title: BertJapanese
378
+ - local: model_doc/bertweet
379
+ title: BERTweet
380
+ - local: model_doc/big_bird
381
+ title: BigBird
382
+ - local: model_doc/bigbird_pegasus
383
+ title: BigBirdPegasus
384
+ - local: model_doc/biogpt
385
+ title: BioGpt
386
+ - local: model_doc/bitnet
387
+ title: BitNet
388
+ - local: model_doc/blenderbot
389
+ title: Blenderbot
390
+ - local: model_doc/blenderbot-small
391
+ title: Blenderbot Small
392
+ - local: model_doc/bloom
393
+ title: BLOOM
394
+ - local: model_doc/bort
395
+ title: BORT
396
+ - local: model_doc/byt5
397
+ title: ByT5
398
+ - local: model_doc/camembert
399
+ title: CamemBERT
400
+ - local: model_doc/canine
401
+ title: CANINE
402
+ - local: model_doc/codegen
403
+ title: CodeGen
404
+ - local: model_doc/code_llama
405
+ title: CodeLlama
406
+ - local: model_doc/cohere
407
+ title: Cohere
408
+ - local: model_doc/cohere2
409
+ title: Cohere2
410
+ - local: model_doc/convbert
411
+ title: ConvBERT
412
+ - local: model_doc/cpm
413
+ title: CPM
414
+ - local: model_doc/cpmant
415
+ title: CPMANT
416
+ - local: model_doc/ctrl
417
+ title: CTRL
418
+ - local: model_doc/dbrx
419
+ title: DBRX
420
+ - local: model_doc/deberta
421
+ title: DeBERTa
422
+ - local: model_doc/deberta-v2
423
+ title: DeBERTa-v2
424
+ - local: model_doc/deepseek_v3
425
+ title: DeepSeek-V3
426
+ - local: model_doc/dialogpt
427
+ title: DialoGPT
428
+ - local: model_doc/diffllama
429
+ title: DiffLlama
430
+ - local: model_doc/distilbert
431
+ title: DistilBERT
432
+ - local: model_doc/doge
433
+ title: Doge
434
+ - local: model_doc/dots1
435
+ title: dots1
436
+ - local: model_doc/dpr
437
+ title: DPR
438
+ - local: model_doc/electra
439
+ title: ELECTRA
440
+ - local: model_doc/encoder-decoder
441
+ title: Encoder Decoder Models
442
+ - local: model_doc/ernie
443
+ title: ERNIE
444
+ - local: model_doc/ernie_m
445
+ title: ErnieM
446
+ - local: model_doc/esm
447
+ title: ESM
448
+ - local: model_doc/falcon
449
+ title: Falcon
450
+ - local: model_doc/falcon3
451
+ title: Falcon3
452
+ - local: model_doc/falcon_h1
453
+ title: FalconH1
454
+ - local: model_doc/falcon_mamba
455
+ title: FalconMamba
456
+ - local: model_doc/flan-t5
457
+ title: FLAN-T5
458
+ - local: model_doc/flan-ul2
459
+ title: FLAN-UL2
460
+ - local: model_doc/flaubert
461
+ title: FlauBERT
462
+ - local: model_doc/fnet
463
+ title: FNet
464
+ - local: model_doc/fsmt
465
+ title: FSMT
466
+ - local: model_doc/funnel
467
+ title: Funnel Transformer
468
+ - local: model_doc/fuyu
469
+ title: Fuyu
470
+ - local: model_doc/gemma
471
+ title: Gemma
472
+ - local: model_doc/gemma2
473
+ title: Gemma2
474
+ - local: model_doc/glm
475
+ title: GLM
476
+ - local: model_doc/glm4
477
+ title: glm4
478
+ - local: model_doc/openai-gpt
479
+ title: GPT
480
+ - local: model_doc/gpt_neo
481
+ title: GPT Neo
482
+ - local: model_doc/gpt_neox
483
+ title: GPT NeoX
484
+ - local: model_doc/gpt_neox_japanese
485
+ title: GPT NeoX Japanese
486
+ - local: model_doc/gptj
487
+ title: GPT-J
488
+ - local: model_doc/gpt2
489
+ title: GPT2
490
+ - local: model_doc/gpt_bigcode
491
+ title: GPTBigCode
492
+ - local: model_doc/gptsan-japanese
493
+ title: GPTSAN Japanese
494
+ - local: model_doc/gpt-sw3
495
+ title: GPTSw3
496
+ - local: model_doc/granite
497
+ title: Granite
498
+ - local: model_doc/granitemoe
499
+ title: GraniteMoe
500
+ - local: model_doc/granitemoehybrid
501
+ title: GraniteMoeHybrid
502
+ - local: model_doc/granitemoeshared
503
+ title: GraniteMoeShared
504
+ - local: model_doc/helium
505
+ title: Helium
506
+ - local: model_doc/herbert
507
+ title: HerBERT
508
+ - local: model_doc/hgnet_v2
509
+ title: HGNet-V2
510
+ - local: model_doc/ibert
511
+ title: I-BERT
512
+ - local: model_doc/jamba
513
+ title: Jamba
514
+ - local: model_doc/jetmoe
515
+ title: JetMoe
516
+ - local: model_doc/jukebox
517
+ title: Jukebox
518
+ - local: model_doc/led
519
+ title: LED
520
+ - local: model_doc/lfm2
521
+ title: LFM2
522
+ - local: model_doc/llama
523
+ title: LLaMA
524
+ - local: model_doc/llama2
525
+ title: Llama2
526
+ - local: model_doc/llama3
527
+ title: Llama3
528
+ - local: model_doc/longformer
529
+ title: Longformer
530
+ - local: model_doc/longt5
531
+ title: LongT5
532
+ - local: model_doc/luke
533
+ title: LUKE
534
+ - local: model_doc/m2m_100
535
+ title: M2M100
536
+ - local: model_doc/madlad-400
537
+ title: MADLAD-400
538
+ - local: model_doc/mamba
539
+ title: Mamba
540
+ - local: model_doc/mamba2
541
+ title: Mamba2
542
+ - local: model_doc/marian
543
+ title: MarianMT
544
+ - local: model_doc/markuplm
545
+ title: MarkupLM
546
+ - local: model_doc/mbart
547
+ title: MBart and MBart-50
548
+ - local: model_doc/mega
549
+ title: MEGA
550
+ - local: model_doc/megatron-bert
551
+ title: MegatronBERT
552
+ - local: model_doc/megatron_gpt2
553
+ title: MegatronGPT2
554
+ - local: model_doc/minimax
555
+ title: MiniMax
556
+ - local: model_doc/mistral
557
+ title: Mistral
558
+ - local: model_doc/mixtral
559
+ title: Mixtral
560
+ - local: model_doc/mluke
561
+ title: mLUKE
562
+ - local: model_doc/mobilebert
563
+ title: MobileBERT
564
+ - local: model_doc/modernbert
565
+ title: ModernBert
566
+ - local: model_doc/mpnet
567
+ title: MPNet
568
+ - local: model_doc/mpt
569
+ title: MPT
570
+ - local: model_doc/mra
571
+ title: MRA
572
+ - local: model_doc/mt5
573
+ title: MT5
574
+ - local: model_doc/mvp
575
+ title: MVP
576
+ - local: model_doc/myt5
577
+ title: myt5
578
+ - local: model_doc/nemotron
579
+ title: Nemotron
580
+ - local: model_doc/nezha
581
+ title: NEZHA
582
+ - local: model_doc/nllb
583
+ title: NLLB
584
+ - local: model_doc/nllb-moe
585
+ title: NLLB-MoE
586
+ - local: model_doc/nystromformer
587
+ title: Nyströmformer
588
+ - local: model_doc/olmo
589
+ title: OLMo
590
+ - local: model_doc/olmo2
591
+ title: OLMo2
592
+ - local: model_doc/olmoe
593
+ title: OLMoE
594
+ - local: model_doc/open-llama
595
+ title: Open-Llama
596
+ - local: model_doc/opt
597
+ title: OPT
598
+ - local: model_doc/pegasus
599
+ title: Pegasus
600
+ - local: model_doc/pegasus_x
601
+ title: PEGASUS-X
602
+ - local: model_doc/persimmon
603
+ title: Persimmon
604
+ - local: model_doc/phi
605
+ title: Phi
606
+ - local: model_doc/phi3
607
+ title: Phi-3
608
+ - local: model_doc/phimoe
609
+ title: PhiMoE
610
+ - local: model_doc/phobert
611
+ title: PhoBERT
612
+ - local: model_doc/plbart
613
+ title: PLBart
614
+ - local: model_doc/prophetnet
615
+ title: ProphetNet
616
+ - local: model_doc/qdqbert
617
+ title: QDQBert
618
+ - local: model_doc/qwen2
619
+ title: Qwen2
620
+ - local: model_doc/qwen2_moe
621
+ title: Qwen2MoE
622
+ - local: model_doc/qwen3
623
+ title: Qwen3
624
+ - local: model_doc/qwen3_moe
625
+ title: Qwen3MoE
626
+ - local: model_doc/rag
627
+ title: RAG
628
+ - local: model_doc/realm
629
+ title: REALM
630
+ - local: model_doc/recurrent_gemma
631
+ title: RecurrentGemma
632
+ - local: model_doc/reformer
633
+ title: Reformer
634
+ - local: model_doc/rembert
635
+ title: RemBERT
636
+ - local: model_doc/retribert
637
+ title: RetriBERT
638
+ - local: model_doc/roberta
639
+ title: RoBERTa
640
+ - local: model_doc/roberta-prelayernorm
641
+ title: RoBERTa-PreLayerNorm
642
+ - local: model_doc/roc_bert
643
+ title: RoCBert
644
+ - local: model_doc/roformer
645
+ title: RoFormer
646
+ - local: model_doc/rwkv
647
+ title: RWKV
648
+ - local: model_doc/splinter
649
+ title: Splinter
650
+ - local: model_doc/squeezebert
651
+ title: SqueezeBERT
652
+ - local: model_doc/stablelm
653
+ title: StableLm
654
+ - local: model_doc/starcoder2
655
+ title: Starcoder2
656
+ - local: model_doc/switch_transformers
657
+ title: SwitchTransformers
658
+ - local: model_doc/t5
659
+ title: T5
660
+ - local: model_doc/t5gemma
661
+ title: T5Gemma
662
+ - local: model_doc/t5v1.1
663
+ title: T5v1.1
664
+ - local: model_doc/tapex
665
+ title: TAPEX
666
+ - local: model_doc/transfo-xl
667
+ title: Transformer XL
668
+ - local: model_doc/ul2
669
+ title: UL2
670
+ - local: model_doc/umt5
671
+ title: UMT5
672
+ - local: model_doc/xmod
673
+ title: X-MOD
674
+ - local: model_doc/xglm
675
+ title: XGLM
676
+ - local: model_doc/xlm
677
+ title: XLM
678
+ - local: model_doc/xlm-prophetnet
679
+ title: XLM-ProphetNet
680
+ - local: model_doc/xlm-roberta
681
+ title: XLM-RoBERTa
682
+ - local: model_doc/xlm-roberta-xl
683
+ title: XLM-RoBERTa-XL
684
+ - local: model_doc/xlm-v
685
+ title: XLM-V
686
+ - local: model_doc/xlnet
687
+ title: XLNet
688
+ - local: model_doc/yoso
689
+ title: YOSO
690
+ - local: model_doc/zamba
691
+ title: Zamba
692
+ - local: model_doc/zamba2
693
+ title: Zamba2
694
+ title: Text models
695
+ - sections:
696
+ - local: model_doc/aimv2
697
+ title: Aimv2
698
+ - local: model_doc/beit
699
+ title: BEiT
700
+ - local: model_doc/bit
701
+ title: BiT
702
+ - local: model_doc/conditional_detr
703
+ title: Conditional DETR
704
+ - local: model_doc/convnext
705
+ title: ConvNeXT
706
+ - local: model_doc/convnextv2
707
+ title: ConvNeXTV2
708
+ - local: model_doc/cvt
709
+ title: CvT
710
+ - local: model_doc/d_fine
711
+ title: D-FINE
712
+ - local: model_doc/dab-detr
713
+ title: DAB-DETR
714
+ - local: model_doc/deepseek_v2
715
+ title: DeepSeek-V2
716
+ - local: model_doc/deformable_detr
717
+ title: Deformable DETR
718
+ - local: model_doc/deit
719
+ title: DeiT
720
+ - local: model_doc/depth_anything
721
+ title: Depth Anything
722
+ - local: model_doc/depth_anything_v2
723
+ title: Depth Anything V2
724
+ - local: model_doc/depth_pro
725
+ title: DepthPro
726
+ - local: model_doc/deta
727
+ title: DETA
728
+ - local: model_doc/detr
729
+ title: DETR
730
+ - local: model_doc/dinat
731
+ title: DiNAT
732
+ - local: model_doc/dinov2
733
+ title: DINOV2
734
+ - local: model_doc/dinov2_with_registers
735
+ title: DINOv2 with Registers
736
+ - local: model_doc/dit
737
+ title: DiT
738
+ - local: model_doc/dpt
739
+ title: DPT
740
+ - local: model_doc/efficientformer
741
+ title: EfficientFormer
742
+ - local: model_doc/efficientnet
743
+ title: EfficientNet
744
+ - local: model_doc/eomt
745
+ title: EoMT
746
+ - local: model_doc/focalnet
747
+ title: FocalNet
748
+ - local: model_doc/glpn
749
+ title: GLPN
750
+ - local: model_doc/hiera
751
+ title: Hiera
752
+ - local: model_doc/ijepa
753
+ title: I-JEPA
754
+ - local: model_doc/imagegpt
755
+ title: ImageGPT
756
+ - local: model_doc/levit
757
+ title: LeViT
758
+ - local: model_doc/lightglue
759
+ title: LightGlue
760
+ - local: model_doc/mask2former
761
+ title: Mask2Former
762
+ - local: model_doc/maskformer
763
+ title: MaskFormer
764
+ - local: model_doc/mlcd
765
+ title: MLCD
766
+ - local: model_doc/mobilenet_v1
767
+ title: MobileNetV1
768
+ - local: model_doc/mobilenet_v2
769
+ title: MobileNetV2
770
+ - local: model_doc/mobilevit
771
+ title: MobileViT
772
+ - local: model_doc/mobilevitv2
773
+ title: MobileViTV2
774
+ - local: model_doc/nat
775
+ title: NAT
776
+ - local: model_doc/poolformer
777
+ title: PoolFormer
778
+ - local: model_doc/prompt_depth_anything
779
+ title: Prompt Depth Anything
780
+ - local: model_doc/pvt
781
+ title: Pyramid Vision Transformer (PVT)
782
+ - local: model_doc/pvt_v2
783
+ title: Pyramid Vision Transformer v2 (PVTv2)
784
+ - local: model_doc/regnet
785
+ title: RegNet
786
+ - local: model_doc/resnet
787
+ title: ResNet
788
+ - local: model_doc/rt_detr
789
+ title: RT-DETR
790
+ - local: model_doc/rt_detr_v2
791
+ title: RT-DETRv2
792
+ - local: model_doc/segformer
793
+ title: SegFormer
794
+ - local: model_doc/seggpt
795
+ title: SegGpt
796
+ - local: model_doc/superglue
797
+ title: SuperGlue
798
+ - local: model_doc/superpoint
799
+ title: SuperPoint
800
+ - local: model_doc/swiftformer
801
+ title: SwiftFormer
802
+ - local: model_doc/swin
803
+ title: Swin Transformer
804
+ - local: model_doc/swinv2
805
+ title: Swin Transformer V2
806
+ - local: model_doc/swin2sr
807
+ title: Swin2SR
808
+ - local: model_doc/table-transformer
809
+ title: Table Transformer
810
+ - local: model_doc/textnet
811
+ title: TextNet
812
+ - local: model_doc/timm_wrapper
813
+ title: Timm Wrapper
814
+ - local: model_doc/upernet
815
+ title: UperNet
816
+ - local: model_doc/van
817
+ title: VAN
818
+ - local: model_doc/vit
819
+ title: Vision Transformer (ViT)
820
+ - local: model_doc/vit_hybrid
821
+ title: ViT Hybrid
822
+ - local: model_doc/vitdet
823
+ title: ViTDet
824
+ - local: model_doc/vit_mae
825
+ title: ViTMAE
826
+ - local: model_doc/vitmatte
827
+ title: ViTMatte
828
+ - local: model_doc/vit_msn
829
+ title: ViTMSN
830
+ - local: model_doc/vitpose
831
+ title: ViTPose
832
+ - local: model_doc/yolos
833
+ title: YOLOS
834
+ - local: model_doc/zoedepth
835
+ title: ZoeDepth
836
+ title: Vision models
837
+ - sections:
838
+ - local: model_doc/audio-spectrogram-transformer
839
+ title: Audio Spectrogram Transformer
840
+ - local: model_doc/bark
841
+ title: Bark
842
+ - local: model_doc/clap
843
+ title: CLAP
844
+ - local: model_doc/csm
845
+ title: CSM
846
+ - local: model_doc/dac
847
+ title: dac
848
+ - local: model_doc/dia
849
+ title: Dia
850
+ - local: model_doc/encodec
851
+ title: EnCodec
852
+ - local: model_doc/fastspeech2_conformer
853
+ title: FastSpeech2Conformer
854
+ - local: model_doc/granite_speech
855
+ title: GraniteSpeech
856
+ - local: model_doc/hubert
857
+ title: Hubert
858
+ - local: model_doc/kyutai_speech_to_text
859
+ title: Kyutai Speech-To-Text
860
+ - local: model_doc/mctct
861
+ title: MCTCT
862
+ - local: model_doc/mimi
863
+ title: Mimi
864
+ - local: model_doc/mms
865
+ title: MMS
866
+ - local: model_doc/moonshine
867
+ title: Moonshine
868
+ - local: model_doc/moshi
869
+ title: Moshi
870
+ - local: model_doc/musicgen
871
+ title: MusicGen
872
+ - local: model_doc/musicgen_melody
873
+ title: MusicGen Melody
874
+ - local: model_doc/pop2piano
875
+ title: Pop2Piano
876
+ - local: model_doc/seamless_m4t
877
+ title: Seamless-M4T
878
+ - local: model_doc/seamless_m4t_v2
879
+ title: SeamlessM4T-v2
880
+ - local: model_doc/sew
881
+ title: SEW
882
+ - local: model_doc/sew-d
883
+ title: SEW-D
884
+ - local: model_doc/speech_to_text
885
+ title: Speech2Text
886
+ - local: model_doc/speech_to_text_2
887
+ title: Speech2Text2
888
+ - local: model_doc/speecht5
889
+ title: SpeechT5
890
+ - local: model_doc/unispeech
891
+ title: UniSpeech
892
+ - local: model_doc/unispeech-sat
893
+ title: UniSpeech-SAT
894
+ - local: model_doc/univnet
895
+ title: UnivNet
896
+ - local: model_doc/vits
897
+ title: VITS
898
+ - local: model_doc/wav2vec2
899
+ title: Wav2Vec2
900
+ - local: model_doc/wav2vec2-bert
901
+ title: Wav2Vec2-BERT
902
+ - local: model_doc/wav2vec2-conformer
903
+ title: Wav2Vec2-Conformer
904
+ - local: model_doc/wav2vec2_phoneme
905
+ title: Wav2Vec2Phoneme
906
+ - local: model_doc/wavlm
907
+ title: WavLM
908
+ - local: model_doc/whisper
909
+ title: Whisper
910
+ - local: model_doc/xls_r
911
+ title: XLS-R
912
+ - local: model_doc/xlsr_wav2vec2
913
+ title: XLSR-Wav2Vec2
914
+ title: Audio models
915
+ - sections:
916
+ - local: model_doc/timesformer
917
+ title: TimeSformer
918
+ - local: model_doc/vjepa2
919
+ title: V-JEPA 2
920
+ - local: model_doc/videomae
921
+ title: VideoMAE
922
+ - local: model_doc/vivit
923
+ title: ViViT
924
+ title: Video models
925
+ - sections:
926
+ - local: model_doc/align
927
+ title: ALIGN
928
+ - local: model_doc/altclip
929
+ title: AltCLIP
930
+ - local: model_doc/aria
931
+ title: Aria
932
+ - local: model_doc/aya_vision
933
+ title: AyaVision
934
+ - local: model_doc/blip
935
+ title: BLIP
936
+ - local: model_doc/blip-2
937
+ title: BLIP-2
938
+ - local: model_doc/bridgetower
939
+ title: BridgeTower
940
+ - local: model_doc/bros
941
+ title: BROS
942
+ - local: model_doc/chameleon
943
+ title: Chameleon
944
+ - local: model_doc/chinese_clip
945
+ title: Chinese-CLIP
946
+ - local: model_doc/clip
947
+ title: CLIP
948
+ - local: model_doc/clipseg
949
+ title: CLIPSeg
950
+ - local: model_doc/clvp
951
+ title: CLVP
952
+ - local: model_doc/colpali
953
+ title: ColPali
954
+ - local: model_doc/colqwen2
955
+ title: ColQwen2
956
+ - local: model_doc/data2vec
957
+ title: Data2Vec
958
+ - local: model_doc/deplot
959
+ title: DePlot
960
+ - local: model_doc/donut
961
+ title: Donut
962
+ - local: model_doc/emu3
963
+ title: Emu3
964
+ - local: model_doc/flava
965
+ title: FLAVA
966
+ - local: model_doc/gemma3
967
+ title: Gemma3
968
+ - local: model_doc/gemma3n
969
+ title: Gemma3n
970
+ - local: model_doc/git
971
+ title: GIT
972
+ - local: model_doc/glm4v
973
+ title: glm4v
974
+ - local: model_doc/got_ocr2
975
+ title: GOT-OCR2
976
+ - local: model_doc/granitevision
977
+ title: GraniteVision
978
+ - local: model_doc/grounding-dino
979
+ title: Grounding DINO
980
+ - local: model_doc/groupvit
981
+ title: GroupViT
982
+ - local: model_doc/idefics
983
+ title: IDEFICS
984
+ - local: model_doc/idefics2
985
+ title: Idefics2
986
+ - local: model_doc/idefics3
987
+ title: Idefics3
988
+ - local: model_doc/instructblip
989
+ title: InstructBLIP
990
+ - local: model_doc/instructblipvideo
991
+ title: InstructBlipVideo
992
+ - local: model_doc/internvl
993
+ title: InternVL
994
+ - local: model_doc/janus
995
+ title: Janus
996
+ - local: model_doc/kosmos-2
997
+ title: KOSMOS-2
998
+ - local: model_doc/layoutlm
999
+ title: LayoutLM
1000
+ - local: model_doc/layoutlmv2
1001
+ title: LayoutLMV2
1002
+ - local: model_doc/layoutlmv3
1003
+ title: LayoutLMV3
1004
+ - local: model_doc/layoutxlm
1005
+ title: LayoutXLM
1006
+ - local: model_doc/lilt
1007
+ title: LiLT
1008
+ - local: model_doc/llama4
1009
+ title: Llama4
1010
+ - local: model_doc/llava
1011
+ title: Llava
1012
+ - local: model_doc/llava_next
1013
+ title: LLaVA-NeXT
1014
+ - local: model_doc/llava_next_video
1015
+ title: LLaVa-NeXT-Video
1016
+ - local: model_doc/llava_onevision
1017
+ title: LLaVA-Onevision
1018
+ - local: model_doc/lxmert
1019
+ title: LXMERT
1020
+ - local: model_doc/matcha
1021
+ title: MatCha
1022
+ - local: model_doc/mgp-str
1023
+ title: MGP-STR
1024
+ - local: model_doc/mistral3
1025
+ title: Mistral3
1026
+ - local: model_doc/mllama
1027
+ title: mllama
1028
+ - local: model_doc/nougat
1029
+ title: Nougat
1030
+ - local: model_doc/omdet-turbo
1031
+ title: OmDet-Turbo
1032
+ - local: model_doc/oneformer
1033
+ title: OneFormer
1034
+ - local: model_doc/owlvit
1035
+ title: OWL-ViT
1036
+ - local: model_doc/owlv2
1037
+ title: OWLv2
1038
+ - local: model_doc/paligemma
1039
+ title: PaliGemma
1040
+ - local: model_doc/perceiver
1041
+ title: Perceiver
1042
+ - local: model_doc/perception_lm
1043
+ title: PerceptionLM
1044
+ - local: model_doc/phi4_multimodal
1045
+ title: Phi4 Multimodal
1046
+ - local: model_doc/pix2struct
1047
+ title: Pix2Struct
1048
+ - local: model_doc/pixtral
1049
+ title: Pixtral
1050
+ - local: model_doc/qwen2_5_omni
1051
+ title: Qwen2.5-Omni
1052
+ - local: model_doc/qwen2_5_vl
1053
+ title: Qwen2.5-VL
1054
+ - local: model_doc/qwen2_audio
1055
+ title: Qwen2Audio
1056
+ - local: model_doc/qwen2_vl
1057
+ title: Qwen2VL
1058
+ - local: model_doc/sam
1059
+ title: Segment Anything
1060
+ - local: model_doc/sam_hq
1061
+ title: Segment Anything High Quality
1062
+ - local: model_doc/shieldgemma2
1063
+ title: ShieldGemma2
1064
+ - local: model_doc/siglip
1065
+ title: SigLIP
1066
+ - local: model_doc/siglip2
1067
+ title: SigLIP2
1068
+ - local: model_doc/smollm3
1069
+ title: SmolLM3
1070
+ - local: model_doc/smolvlm
1071
+ title: SmolVLM
1072
+ - local: model_doc/speech-encoder-decoder
1073
+ title: Speech Encoder Decoder Models
1074
+ - local: model_doc/tapas
1075
+ title: TAPAS
1076
+ - local: model_doc/trocr
1077
+ title: TrOCR
1078
+ - local: model_doc/tvlt
1079
+ title: TVLT
1080
+ - local: model_doc/tvp
1081
+ title: TVP
1082
+ - local: model_doc/udop
1083
+ title: UDOP
1084
+ - local: model_doc/video_llava
1085
+ title: VideoLlava
1086
+ - local: model_doc/vilt
1087
+ title: ViLT
1088
+ - local: model_doc/vipllava
1089
+ title: VipLlava
1090
+ - local: model_doc/vision-encoder-decoder
1091
+ title: Vision Encoder Decoder Models
1092
+ - local: model_doc/vision-text-dual-encoder
1093
+ title: Vision Text Dual Encoder
1094
+ - local: model_doc/visual_bert
1095
+ title: VisualBERT
1096
+ - local: model_doc/xclip
1097
+ title: X-CLIP
1098
+ title: Multimodal models
1099
+ - sections:
1100
+ - local: model_doc/decision_transformer
1101
+ title: Decision Transformer
1102
+ - local: model_doc/trajectory_transformer
1103
+ title: Trajectory Transformer
1104
+ title: Reinforcement learning models
1105
+ - sections:
1106
+ - local: model_doc/autoformer
1107
+ title: Autoformer
1108
+ - local: model_doc/informer
1109
+ title: Informer
1110
+ - local: model_doc/patchtsmixer
1111
+ title: PatchTSMixer
1112
+ - local: model_doc/patchtst
1113
+ title: PatchTST
1114
+ - local: model_doc/time_series_transformer
1115
+ title: Time Series Transformer
1116
+ - local: model_doc/timesfm
1117
+ title: TimesFM
1118
+ title: Time series models
1119
+ - sections:
1120
+ - local: model_doc/graphormer
1121
+ title: Graphormer
1122
+ title: Graph models
1123
+ title: Models
1124
+ - sections:
1125
+ - local: internal/modeling_utils
1126
+ title: Custom Layers and Utilities
1127
+ - local: internal/model_debugging_utils
1128
+ title: Utilities for Model Debugging
1129
+ - local: internal/pipelines_utils
1130
+ title: Utilities for pipelines
1131
+ - local: internal/tokenization_utils
1132
+ title: Utilities for Tokenizers
1133
+ - local: internal/trainer_utils
1134
+ title: Utilities for Trainer
1135
+ - local: internal/generation_utils
1136
+ title: Utilities for Generation
1137
+ - local: internal/image_processing_utils
1138
+ title: Utilities for Image Processors
1139
+ - local: internal/audio_utils
1140
+ title: Utilities for Audio processing
1141
+ - local: internal/file_utils
1142
+ title: General Utilities
1143
+ - local: internal/import_utils
1144
+ title: Importing Utilities
1145
+ - local: internal/time_series_utils
1146
+ title: Utilities for Time Series
1147
+ title: Internal helpers
1148
+ - sections:
1149
+ - local: reference/environment_variables
1150
+ title: Environment Variables
1151
+ title: Reference
1152
+ title: API
transformers/docs/source/en/accelerate.md ADDED
@@ -0,0 +1,165 @@
1
+ <!--Copyright 2024 The HuggingFace Team. All rights reserved.
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
4
+ the License. You may obtain a copy of the License at
5
+
6
+ http://www.apache.org/licenses/LICENSE-2.0
7
+
8
+ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
9
+ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
10
+ specific language governing permissions and limitations under the License.
11
+
12
+ ⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
13
+ rendered properly in your Markdown viewer.
14
+
15
+ -->
16
+
17
+ # Accelerate
18
+
19
+ [Accelerate](https://hf.co/docs/accelerate/index) is a library designed to simplify distributed training on any type of setup with PyTorch by uniting the most common frameworks for it ([Fully Sharded Data Parallel (FSDP)](https://pytorch.org/blog/introducing-pytorch-fully-sharded-data-parallel-api/) and [DeepSpeed](https://www.deepspeed.ai/)) under a single interface. [`Trainer`] is powered by Accelerate under the hood, enabling big model loading and distributed training.
20
+
21
+ This guide will show you two ways to use Accelerate with Transformers, using FSDP as the backend. The first method demonstrates distributed training with [`Trainer`], and the second method demonstrates adapting a PyTorch training loop. For more detailed information about Accelerate, please refer to the [documentation](https://hf.co/docs/accelerate/index).
22
+
23
+ ```bash
24
+ pip install accelerate
25
+ ```
26
+
27
+ Start by running [accelerate config](https://hf.co/docs/accelerate/main/en/package_reference/cli#accelerate-config) in the command line to answer a series of prompts about your training system. This creates and saves a configuration file to help Accelerate correctly set up training based on your setup.
28
+
29
+ ```bash
30
+ accelerate config
31
+ ```
32
+
33
+ Depending on your setup and the answers you provide, an example configuration file for distributed training with FSDP on one machine with two GPUs may look like the following.
34
+
35
+ ```yaml
36
+ compute_environment: LOCAL_MACHINE
37
+ debug: false
38
+ distributed_type: FSDP
39
+ downcast_bf16: 'no'
40
+ fsdp_config:
41
+ fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
42
+ fsdp_backward_prefetch_policy: BACKWARD_PRE
43
+ fsdp_forward_prefetch: false
44
+ fsdp_cpu_ram_efficient_loading: true
45
+ fsdp_offload_params: false
46
+ fsdp_sharding_strategy: FULL_SHARD
47
+ fsdp_state_dict_type: SHARDED_STATE_DICT
48
+ fsdp_sync_module_states: true
49
+ fsdp_transformer_layer_cls_to_wrap: BertLayer
50
+ fsdp_use_orig_params: true
51
+ machine_rank: 0
52
+ main_training_function: main
53
+ mixed_precision: bf16
54
+ num_machines: 1
55
+ num_processes: 2
56
+ rdzv_backend: static
57
+ same_network: true
58
+ tpu_env: []
59
+ tpu_use_cluster: false
60
+ tpu_use_sudo: false
61
+ use_cpu: false
62
+ ```
63
+
64
+ ## Trainer
65
+
66
+ Pass the path to the saved configuration file to [`TrainingArguments`], and from there, pass your [`TrainingArguments`] to [`Trainer`].
67
+
68
+ ```py
69
+ from transformers import TrainingArguments, Trainer
70
+
71
+ training_args = TrainingArguments(
72
+ output_dir="your-model",
73
+ learning_rate=2e-5,
74
+ per_device_train_batch_size=16,
75
+ per_device_eval_batch_size=16,
76
+ num_train_epochs=2,
77
+ fsdp_config="path/to/fsdp_config",
78
+ fsdp="full_shard",
79
+ weight_decay=0.01,
80
+ eval_strategy="epoch",
81
+ save_strategy="epoch",
82
+ load_best_model_at_end=True,
83
+ push_to_hub=True,
84
+ )
85
+
86
+ trainer = Trainer(
87
+ model=model,
88
+ args=training_args,
89
+ train_dataset=dataset["train"],
90
+ eval_dataset=dataset["test"],
91
+ processing_class=tokenizer,
92
+ data_collator=data_collator,
93
+ compute_metrics=compute_metrics,
94
+ )
95
+
96
+ trainer.train()
97
+ ```
98
+
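+ The example above assumes `model`, `dataset`, `tokenizer`, `data_collator`, and `compute_metrics` are already defined. A minimal sketch of that setup, assuming a BERT text classification fine-tune (the checkpoint and dataset names are only examples), could look like the following.
+
+ ```py
+ import numpy as np
+ import evaluate
+ from datasets import load_dataset
+ from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding
+
+ model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
+ tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+ data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
+
+ # tokenize the raw text once so the Trainer receives ready-to-batch features
+ dataset = load_dataset("imdb").map(
+     lambda batch: tokenizer(batch["text"], truncation=True), batched=True
+ )
+
+ accuracy = evaluate.load("accuracy")
+
+ def compute_metrics(eval_pred):
+     logits, labels = eval_pred
+     predictions = np.argmax(logits, axis=-1)
+     return accuracy.compute(predictions=predictions, references=labels)
+ ```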
99
+ ## Native PyTorch
100
+
101
+ Accelerate can also be added to any PyTorch training loop to enable distributed training. The [`~accelerate.Accelerator`] is the main entry point for adapting your PyTorch code to work with Accelerate. It automatically detects your distributed training setup and initializes all the necessary components for training. You don't need to explicitly place your model on a device because [`~accelerate.Accelerator`] knows which device to move your model to.
102
+
103
+ ```py
104
+ from accelerate import Accelerator
105
+
106
+ accelerator = Accelerator()
107
+ device = accelerator.device
108
+ ```
109
+
110
+ All PyTorch objects (model, optimizer, scheduler, dataloaders) should be passed to the [`~accelerate.Accelerator.prepare`] method now. This method moves your model to the appropriate device or devices, adapts the optimizer and scheduler to use [`~accelerate.optimizer.AcceleratedOptimizer`] and [`~accelerate.scheduler.AcceleratedScheduler`], and creates a new shardable dataloader.
111
+
112
+ ```py
113
+ train_dataloader, eval_dataloader, model, optimizer, lr_scheduler = accelerator.prepare(
114
+     train_dataloader, eval_dataloader, model, optimizer, lr_scheduler
115
+ )
116
+ ```
117
+
118
+ Replace `loss.backward` in your training loop with Accelerate's [`~accelerate.Accelerator.backward`] method to scale the gradients and determine the appropriate `backward` method to use depending on your framework (for example, DeepSpeed or Megatron).
119
+
120
+ ```py
121
+ for epoch in range(num_epochs):
122
+ for batch in train_dataloader:
123
+ outputs = model(**batch)
124
+ loss = outputs.loss
125
+ accelerator.backward(loss)
126
+ optimizer.step()
127
+ lr_scheduler.step()
128
+ optimizer.zero_grad()
129
+ progress_bar.update(1)
130
+ ```
131
+
132
+ Combine everything into a function and make it callable as a script.
133
+
134
+ ```py
135
+ from accelerate import Accelerator
136
+
137
+ def main():
138
+ accelerator = Accelerator()
139
+
140
+ model, optimizer, training_dataloader, scheduler = accelerator.prepare(
141
+ model, optimizer, training_dataloader, scheduler
142
+ )
143
+
144
+ for batch in training_dataloader:
145
+ optimizer.zero_grad()
146
+ inputs, targets = batch
147
+ outputs = model(inputs)
148
+ loss = loss_function(outputs, targets)
149
+ accelerator.backward(loss)
150
+ optimizer.step()
151
+ scheduler.step()
152
+
153
+ if __name__ == "__main__":
154
+ main()
155
+ ```
156
+
157
+ From the command line, call [accelerate launch](https://hf.co/docs/accelerate/main/en/package_reference/cli#accelerate-launch) to run your training script. Any additional arguments or parameters can be passed here as well.
158
+
159
+ To launch your training script on two GPUs, add the `--num_processes` argument.
160
+
161
+ ```bash
162
+ accelerate launch --num_processes=2 your_script.py
163
+ ```
164
+
165
+ Refer to the [Launching Accelerate scripts](https://hf.co/docs/accelerate/main/en/basic_tutorials/launch) guide for more details.
transformers/docs/source/en/accelerator_selection.md ADDED
@@ -0,0 +1,126 @@
1
+ <!--Copyright 2025 The HuggingFace Team. All rights reserved.
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
4
+ the License. You may obtain a copy of the License at
5
+
6
+ http://www.apache.org/licenses/LICENSE-2.0
7
+
8
+ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
9
+ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
10
+ specific language governing permissions and limitations under the License.
11
+
12
+ ⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
13
+ rendered properly in your Markdown viewer.
14
+
15
+ -->
16
+
17
+ # Accelerator selection
18
+
19
+ During distributed training, you can specify the number and order of accelerators (CUDA, XPU, MPS, HPU, etc.) to use. This can be useful when you have accelerators with different computing power and you want to use the faster accelerator first. Or you could only use a subset of the available accelerators. The selection process works for both [DistributedDataParallel](https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html) and [DataParallel](https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html). You don't need Accelerate or [DeepSpeed integration](./main_classes/deepspeed).
20
+
21
+ This guide will show you how to select the number of accelerators to use and the order to use them in.
22
+
23
+ ## Number of accelerators
24
+
25
+ For example, if there are 4 accelerators and you only want to use the first 2, run the command below.
26
+
27
+ <hfoptions id="select-accelerator">
28
+ <hfoption id="torchrun">
29
+
30
+ Use the `--nproc_per_node` to select how many accelerators to use.
31
+
32
+ ```bash
33
+ torchrun --nproc_per_node=2 trainer-program.py ...
34
+ ```
35
+
36
+ </hfoption>
37
+ <hfoption id="Accelerate">
38
+
39
+ Use `--num_processes` to select how many accelerators to use.
40
+
41
+ ```bash
42
+ accelerate launch --num_processes 2 trainer-program.py ...
43
+ ```
44
+
45
+ </hfoption>
46
+ <hfoption id="DeepSpeed">
47
+
48
+ Use `--num_gpus` to select how many GPUs to use.
49
+
50
+ ```bash
51
+ deepspeed --num_gpus 2 trainer-program.py ...
52
+ ```
53
+
54
+ </hfoption>
55
+ </hfoptions>
56
+
57
+ ## Order of accelerators
58
+ To select specific accelerators to use and their order, use the environment variable appropriate for your hardware. This is often set on the command line for each run, but can also be added to your `~/.bashrc` or other startup config file.
59
+
60
+ For example, if there are 4 accelerators (0, 1, 2, 3) and you only want to run accelerators 0 and 2:
61
+
62
+ <hfoptions id="accelerator-type">
63
+ <hfoption id="CUDA">
64
+
65
+ ```bash
66
+ CUDA_VISIBLE_DEVICES=0,2 torchrun trainer-program.py ...
67
+ ```
68
+
69
+ Only GPUs 0 and 2 are "visible" to PyTorch and are mapped to `cuda:0` and `cuda:1` respectively.
70
+ To reverse the order (use GPU 2 as `cuda:0` and GPU 0 as `cuda:1`):
71
+
72
+
73
+ ```bash
74
+ CUDA_VISIBLE_DEVICES=2,0 torchrun trainer-program.py ...
75
+ ```
76
+
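+ To double-check what PyTorch actually sees after setting the variable, a quick sanity check like the sketch below can help (the reported count and names depend on your hardware).
+
+ ```py
+ import torch
+
+ # with CUDA_VISIBLE_DEVICES=0,2 only two devices are visible to PyTorch
+ print(torch.cuda.device_count())      # 2
+ print(torch.cuda.get_device_name(0))  # the physical GPU mapped to cuda:0
+ ```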
77
+ To run without any GPUs:
78
+
79
+ ```bash
80
+ CUDA_VISIBLE_DEVICES= python trainer-program.py ...
81
+ ```
82
+
83
+ You can also control the order of CUDA devices using `CUDA_DEVICE_ORDER`:
84
+
85
+ - Order by PCIe bus ID (matches `nvidia-smi`):
86
+
87
+ ```bash
88
+ export CUDA_DEVICE_ORDER=PCI_BUS_ID
89
+ ```
90
+
91
+ - Order by compute capability (fastest first):
92
+
93
+ ```bash
94
+ export CUDA_DEVICE_ORDER=FASTEST_FIRST
95
+ ```
96
+
97
+ </hfoption>
98
+ <hfoption id="Intel XPU">
99
+
100
+ ```bash
101
+ ZE_AFFINITY_MASK=0,2 torchrun trainer-program.py ...
102
+ ```
103
+
104
+ Only XPUs 0 and 2 are "visible" to PyTorch and are mapped to `xpu:0` and `xpu:1` respectively.
105
+ To reverse the order (use XPU 2 as `xpu:0` and XPU 0 as `xpu:1`):
106
+
107
+ ```bash
108
+ ZE_AFFINITY_MASK=2,0 torchrun trainer-program.py ...
109
+ ```
110
+
111
+
112
+ You can also control the order of Intel XPUs with:
113
+
114
+ ```bash
115
+ export ZE_ENABLE_PCI_ID_DEVICE_ORDER=1
116
+ ```
117
+
118
+ For more information about device enumeration and sorting on Intel XPU, please refer to the [Level Zero](https://github.com/oneapi-src/level-zero/blob/master/README.md?plain=1#L87) documentation.
119
+
120
+ </hfoption>
121
+ </hfoptions>
122
+
123
+
124
+
125
+ > [!WARNING]
126
+ > Environment variables can be exported instead of being added to the command line. This is not recommended because it can be confusing if you forget how the environment variable was set up and you end up using the wrong accelerators. Instead, it is common practice to set the environment variable for a specific training run on the same command line.
transformers/docs/source/en/add_new_model.md ADDED
@@ -0,0 +1,665 @@
1
+ <!--Copyright 2024 The HuggingFace Team. All rights reserved.
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
4
+ the License. You may obtain a copy of the License at
5
+
6
+ http://www.apache.org/licenses/LICENSE-2.0
7
+
8
+ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
9
+ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
10
+ specific language governing permissions and limitations under the License.
+
11
+ ⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
12
+ rendered properly in your Markdown viewer.
13
+
14
+ -->
15
+
16
+ # Legacy model contribution
17
+
18
+ > [!TIP]
19
+ > Try adding new models with a more [modular](./modular_transformers) approach first. This makes it significantly easier to contribute a model to Transformers!
20
+
21
+ Many of the models in Transformers are contributed by developers and researchers. As an open-source first project, we're invested in empowering the community to actively and independently add more models.
22
+
23
+ When you add a model to Transformers, you'll learn:
24
+
25
+ - more about open-source best practices
26
+ - about a model's architecture
27
+ - about Transformers' design principles
28
+ - how to efficiently test large models
29
+ - how to use Python utilities like [Black](https://black.readthedocs.io/en/stable/) and [Ruff](https://docs.astral.sh/ruff/) to create clean and readable code
30
+
31
+ It is a challenging but rewarding process.
32
+
33
+ This guide will walk you through adding an example BrandNewLlama PyTorch model to Transformers. Before you begin, it is a good idea to familiarize yourself with the library.
34
+
35
+ ## Transformers overview
36
+
37
+ Transformers is an opinionated library with its own unique philosophy and design choices. These choices help us sustainably scale and maintain Transformers.
38
+
39
+ > [!TIP]
40
+ > Learn more about our design principles on the [Philosophy](./philosophy) doc.
41
+
42
+ Some of these design choices are:
43
+
44
+ - composition > over-abstraction
45
+ - duplicate code isn't always bad if it greatly improves readability and accessibility
46
+ - model files are self-contained and all the necessary model code is found in the `modeling_mymodel.py` file
47
+
48
+ These design choices are important *for everyone* interacting with the model. It is easier to read, understand, and modify.
49
+
50
+ This section describes how the model and configuration classes interact and the Transformers code style.
51
+
52
+ ### Model and configuration
53
+
54
+ All Transformers models inherit from a base [`PreTrainedModel`] and [`PretrainedConfig`] class. The configuration is the model's blueprint.
55
+
56
+ There are never more than two levels of abstraction for any model, to keep the code readable. The example model here, BrandNewLlama, inherits from `BrandNewLlamaPreTrainedModel` and [`PreTrainedModel`]. It is important that a new model only depends on [`PreTrainedModel`] so that it can use the [`~PreTrainedModel.from_pretrained`] and [`~PreTrainedModel.save_pretrained`] methods.
57
+
58
+ Other important functions like the forward method are defined in the `modeling.py` file.
59
+
60
+ Specific model heads (for example, sequence classification or language modeling) should call the base model in the forward pass rather than inheriting from it to keep abstraction low.
61
+
62
+ New models require a configuration, for example `BrandNewLlamaConfig`, that is stored as an attribute of [`PreTrainedModel`].
63
+
64
+ ```py
65
+ model = BrandNewLlamaModel.from_pretrained("username/brand_new_llama")
66
+ model.config
67
+ ```
68
+
69
+ [`PretrainedConfig`] provides the [`~PretrainedConfig.from_pretrained`] and [`~PretrainedConfig.save_pretrained`] methods.
70
+
71
+ When you use [`PreTrainedModel.save_pretrained`], it automatically calls [`PretrainedConfig.save_pretrained`] so that both the model and configuration are saved together.
72
+
73
+ A model is saved to a `model.safetensors` file and a configuration is saved to a `config.json` file.
74
+
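+ For example, saving the fictional BrandNewLlama model used throughout this guide writes both files to the output folder.
+
+ ```py
+ from transformers import BrandNewLlamaModel
+
+ model = BrandNewLlamaModel.from_pretrained("username/brand_new_llama")
+ model.save_pretrained("brand_new_llama_local")
+ # brand_new_llama_local/ now contains model.safetensors and config.json
+ ```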
75
+ ### Code style
76
+
77
+ Transformers prefers clean and readable code over a more abstracted code style. Some of the code style choices include:
78
+
79
+ - The code should be accessible to non-English users. Pick descriptive variable names and avoid abbreviations. For example, "activation" is preferred over "act". One-letter variable names are highly discouraged unless they're indices in a for loop.
80
+
81
+ - Explicit code is preferred - even if it's longer - over shorter code.
82
+
83
+ - Avoid subclassing [nn.Sequential](https://pytorch.org/docs/stable/generated/torch.nn.Sequential.html). Subclass [nn.Module](https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module) instead so the code can be quickly debugged with print statements or breakpoints.
84
+
85
+ - Function signatures should be type-annotated. Otherwise, use good variable names so they're more understandable.
86
+
87
+ ## New model addition issue
88
+
89
+ Open a [New model addition](https://github.com/huggingface/transformers/issues/new?assignees=&labels=New+model&template=new-model-addition.yml) issue to add a specific model.
90
+
91
+ > [!TIP]
92
+ > Filter by the [New model](https://github.com/huggingface/transformers/labels/New%20model) label on GitHub to view and add any existing model requests.
93
+
94
+ Now is a good time to get familiar with BrandNewLlama. It is helpful to read a model's research paper to understand its technical design and implementation. You don't necessarily have to worry too much about the theoretical details. Instead, focus on the practical ones. Use the questions below to guide your reading.
95
+
96
+ - What type of model is BrandNewLlama? Is it an encoder, decoder, or encoder-decoder model?
97
+ - What tasks can BrandNewLlama be used for?
98
+ - What makes BrandNewLlama different from other models?
99
+ - What models in Transformers are most similar to BrandNewLlama?
100
+ - What tokenizer does BrandNewLlama use?
101
+
102
+ In addition to learning more about your model, use the tips below to help you add a model faster.
103
+
104
+ > [!TIP]
105
+ > Each contributor has a unique style and workflow for adding models to Transformers. For an example, take a look at how [Gemma](https://github.com/huggingface/transformers/pull/29167) was added.
106
+
107
+ - Don't reinvent the wheel! Take your time to explore existing models and tokenizers to see what you can copy and reuse. [Grep](https://www.gnu.org/software/grep/) and [ripgrep](https://github.com/BurntSushi/ripgrep) are great tools for this.
108
+ - This is more of an engineering challenge than a scientific one. Focus on the practical aspects (setting up an efficient debugging environment, for example) instead of the theoretical aspects of the model.
109
+ - Don't be shy to ask for help! We are here to support you. 🤗
110
+
111
+ ## Dev environment
112
+
113
+ Click on the **Fork** button on the [Transformers](https://github.com/huggingface/transformers) repository to create your own copy to work on. Clone the repository to your local disk and add the base repository as the remote.
114
+
115
+ ```bash
116
+ git clone https://github.com/[your Github handle]/transformers.git
117
+ cd transformers
118
+ git remote add upstream https://github.com/huggingface/transformers.git
119
+ ```
120
+
121
+ Create a virtual environment and perform an [editable install](./installation#editable-install) of the library with the "dev" or development dependencies.
122
+
123
+ ```bash
124
+ python -m venv .env
125
+ source .env/bin/activate
126
+ pip install -e ".[dev]"
127
+ ```
128
+
129
+ As the number of optional dependencies in Transformers grows, this command may fail. In this case, install the "quality" dependencies instead. Also make sure you have a deep learning framework installed.
130
+
131
+ ```bash
132
+ pip install -e ".[quality]"
133
+ ```
134
+
135
+ Return to the parent directory and clone and install the original BrandNewLlama repository.
136
+
137
+ ```bash
138
+ git clone https://github.com/org_that_created_brand_new_llama_org/brand_new_llama.git
139
+ cd brand_new_llama
140
+ pip install -e .
141
+ ```
142
+
143
+ Return to your clone of Transformers to begin porting BrandNewLlama.
144
+
145
+ ```bash
146
+ cd transformers
147
+ ```
148
+
149
+ There are two possible debugging environments for running the original model, a notebook ([Google Colab](https://colab.research.google.com/notebooks/intro.ipynb) or [Jupyter](https://jupyter.org/)) or a local Python script.
150
+
151
+ > [!WARNING]
152
+ > We don't recommend setting up a GPU environment to run the original model because it can be expensive. Instead, work in a CPU environment first to verify the model works in Transformers. Once it does, then you can verify it on a GPU.
153
+
154
+ Notebooks are great for executing code cell-by-cell which can help split logical components from one another. It can also accelerate debugging cycles because intermediate results can be stored. You can also share notebooks when working with other contributors.
155
+
156
+ The downside is that if you aren't used to working in notebooks, it may take some time to adjust.
157
+
158
+ > [!TIP]
159
+ > If the model architecture is identical to an existing model, skip ahead to add a [conversion script](#conversion-script), because you can reuse the architecture of the existing model.
160
+
161
+ Run the command below to start and complete the questionnaire with some basic information about the new model. This command jumpstarts the process by automatically generating some model code that you'll need to adapt.
162
+
163
+ ```bash
164
+ transformers add-new-model-like
165
+ ```
166
+
167
+ ## Create a pull request
168
+
169
+ Before you start adapting the code, create a pull request to track your progress and get feedback from the Transformers team. Title your pull request **[WIP] Add BrandNewLlama** so it's clear that this is a work in progress.
170
+
171
+ Create a branch with a descriptive name from your main branch.
172
+
173
+ ```bash
174
+ git checkout -b add_brand_new_llama
175
+ ```
176
+
177
+ Commit the code, and then fetch and rebase on the main branch.
178
+
179
+ ```bash
180
+ git add .
181
+ git commit
182
+ git fetch upstream
183
+ git rebase upstream/main
184
+ ```
185
+
186
+ Push any changes to your branch and click on **Compare & pull request** to open a pull request on GitHub. Open the pull request as a *draft* to indicate it's a work in progress.
187
+
188
+ ```bash
189
+ git push -u origin add_brand_new_llama
190
+ ```
191
+
192
+ Include relevant Hugging Face team members by adding their GitHub handles in the pull request for questions, feedback, comments, and reviews. Direct team members to specific parts of the code you want reviewed by clicking on the **Files changed** tab, and then clicking on **+** to the left of the line number to add a comment. When a question or problem is solved, click on **Resolve** to indicate the issue is resolved. This keeps the conversation organized and clean.
193
+
194
+ Remember to periodically commit and push your work, and update your work with the current main branch.
195
+
196
+ ```bash
197
+ git fetch upstream
198
+ git merge upstream/main
199
+ ```
200
+
201
+ ## Original checkpoint
202
+
203
+ Take some time to work on the original model implementation first to understand how it works.
204
+
205
+ This can be difficult if the original model repository is lacking documentation or if the codebase is complex. But you should use this as your motivation to implement the model in Transformers. Your contribution makes it more accessible and user-friendly to everyone!
206
+
207
+ Orient yourself with the original repository by doing the following.
208
+
209
+ - Locate the pretrained weights.
210
+ - Figure out how to load the pretrained weights into the model.
211
+ - Figure out how to run the tokenizer independently of the model.
212
+ - Trace one forward pass to understand which classes and functions are required. These are probably the only classes and functions you'll have to implement.
213
+ - Locate all the important components (model class, model subclasses, self-attention layer, etc.) of the model.
214
+ - Figure out how to debug the model in the original repository. Add print statements, use interactive debuggers like [ipdb](https://github.com/gotcha/ipdb), or an efficient integrated development environment (IDE) like [PyCharm](https://www.jetbrains.com/pycharm/).
215
+
216
+ The last point is especially important because you'll need a thorough understanding of what's happening inside the original model before you can reimplement it in Transformers. Feel free to open issues and pull requests in the original repository if you encounter any issues.
217
+
218
+ A good first step is to load a *small* pretrained checkpoint and try to reproduce a single forward pass with an example integer vector of inputs. For example, in pseudocode, this could look like the following.
219
+
220
+ ```py
221
+ model = BrandNewLlamaModel.load_pretrained_checkpoint("/path/to/checkpoint/")
222
+ input_ids = [0, 4, 5, 2, 3, 7, 9] # vector of input ids
223
+ original_output = model.generate(input_ids)
224
+ ```
225
+
226
+ ### Debugging
227
+
228
+ If you run into issues, you'll need to choose one of the following debugging strategies depending on the original model's codebase.
229
+
230
+ <hfoptions id="debug-strategy">
231
+ <hfoption id="sub-components">
232
+
233
+ This strategy relies on breaking the original model into smaller sub-components, such as when the code can be easily run in eager mode. While more difficult, there are some advantages to this approach.
234
+
235
+ 1. It is easier later to compare the original model to your implementation. You can automatically verify that each individual component matches its corresponding component in the Transformers' implementation. This is better than relying on a visual comparison based on print statements.
236
+ 2. It is easier to port individual components instead of the entire model.
237
+ 3. It is easier to understand how a model works by breaking it up into smaller parts.
238
+ 4. It is easier to prevent regressions at a later stage when you change your code thanks to component-by-component tests.
239
+
240
+ > [!TIP]
241
+ > Refer to the ELECTRA [integration checks](https://gist.github.com/LysandreJik/db4c948f6b4483960de5cbac598ad4ed) for a good example of how to decompose a model into smaller components.
242
+
243
+ </hfoption>
244
+ <hfoption id="model and tokenizer">
245
+
246
+ This strategy is viable when the original codebase is too complex, only allows intermediate components to be run in compiled mode, or if it's too time-consuming (maybe even impossible) to separate the model into smaller sub-components.
247
+
248
+ For example, the MeshTensorFlow implementation of [T5](https://github.com/tensorflow/mesh/tree/master/mesh_tensorflow) is too complex and doesn't offer a simple way to decompose the model into its sub-components. In this situation, you'll have to rely on verifying print statements.
249
+
250
+ </hfoption>
251
+ </hfoptions>
252
+
253
+ Whichever strategy you choose, it is recommended to debug the initial layers first and the final layers last. Retrieve the output, either with print statements or sub-component functions, of the following layers in this order.
254
+
255
+ 1. input ids passed to the model
256
+ 2. word embeddings
257
+ 3. input of the first Transformer layer
258
+ 4. output of the first Transformer layer
259
+ 5. output of the following n-1 Transformer layers
260
+ 6. output of the whole model
261
+
262
+ The input ids should just be an array of integers like `input_ids = [0, 4, 4, 3, 2, 4, 1, 7, 19]`.
263
+
264
+ Layer outputs often consist of multi-dimensional float arrays.
265
+
266
+ ```py
267
+ [[
268
+ [-0.1465, -0.6501, 0.1993, ..., 0.1451, 0.3430, 0.6024],
269
+ [-0.4417, -0.5920, 0.3450, ..., -0.3062, 0.6182, 0.7132],
270
+ [-0.5009, -0.7122, 0.4548, ..., -0.3662, 0.6091, 0.7648],
271
+ ...,
272
+ [-0.5613, -0.6332, 0.4324, ..., -0.3792, 0.7372, 0.9288],
273
+ [-0.5416, -0.6345, 0.4180, ..., -0.3564, 0.6992, 0.9191],
274
+ [-0.5334, -0.6403, 0.4271, ..., -0.3339, 0.6533, 0.8694]]],
275
+ ```
276
+
277
+ Every Transformers model output should have a precision or error tolerance of *1e-3*. This accounts for any output differences that arise from using a different library framework. Compare the intermediate outputs of the original model with the Transformers implementation to ensure they're nearly identical. Having an *efficient* debugging environment is crucial for this step.
278
+
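+ A minimal sketch of such a comparison, assuming you've captured the same intermediate activation from both implementations into the hypothetical tensors `original_hidden` and `ported_hidden`:
+
+ ```py
+ import torch
+
+ # both tensors must have the same shape for an element-wise comparison
+ max_diff = (original_hidden - ported_hidden).abs().max()
+ print(f"max absolute difference: {max_diff:.2e}")
+ assert torch.allclose(original_hidden, ported_hidden, atol=1e-3), "intermediate outputs diverge"
+ ```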
279
+ Here are some tips for an efficient debugging environment.
280
+
281
+ - How you debug intermediate results depends on the machine learning framework the original model repository uses. For PyTorch, you should write a script to decompose the original model into smaller sub-components to retrieve the intermediate values. For TensorFlow, you may need to use [tf.print](https://www.tensorflow.org/api_docs/python/tf/print). For Flax, make sure the model is *not jitted* during the forward pass (refer to this GitHub [Issue](https://github.com/google/jax/issues/196) for more details).
282
+
283
+ - It is faster to debug with a smaller pretrained checkpoint versus a larger checkpoint where the forward pass takes more than 10 seconds. If only large checkpoints are available, create a dummy model with randomly initialized weights and save those weights to compare against the Transformers implementation.
284
+
285
+ - Find the easiest way to call the model's forward pass. Ideally, this function (may be called `predict`, `evaluate`, `forward`, or `__call__`) should only call the forward pass *once*. It is more difficult to debug a function that calls the forward pass multiple times.
286
+
287
+ - Separate tokenization from the forward pass. Locate where a string input is changed to input ids in the forward pass and start here. You may need to create a small script or modify the original code to directly input the input ids instead of an input string.
288
+
289
+ - Ensure the model is *not* in training mode. This can produce random outputs due to multiple dropout layers in a model. The forward pass in your debugging environment should be *deterministic* so that the dropout layers aren't used (see the sketch after this list).
290
+
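+ Following the last tip, a minimal sketch of a deterministic debugging setup (assuming `model` and `input_ids` are already defined):
+
+ ```py
+ import torch
+
+ model.eval()  # disables dropout so repeated runs return the same values
+
+ with torch.no_grad():
+     output = model(input_ids)
+ ```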
291
+ Once you're able to run the original checkpoint, you're ready to start adapting the model code for Transformers.
292
+
293
+ ## Adapt the model code
294
+
295
+ The `transformers add-new-model-like` command should have generated a model and configuration file.
296
+
297
+ - `src/transformers/models/brand_new_llama/modeling_brand_new_llama.py`
298
+ - `src/transformers/models/brand_new_llama/configuration_brand_new_llama.py`
299
+
300
+ The automatically generated code in the `modeling.py` file has the same architecture as Llama if you answered that it's a decoder-only model, or the same architecture as BART if you answered that it's an encoder-decoder model. The generated code is just a starting point. Based on your research on the new model, you'll need to implement those specific changes by adapting the generated code. This may involve changes to the self-attention layer, the order of the normalization layer, and so on.
301
+
302
+ ### Model initialization
303
+
304
+ At this point, your code doesn't have to be clean or even fully correct. It is more efficient to quickly create a first draft and then iteratively improve on it. The most important thing is that your model can be instantiated from Transformers. The command below creates a model from the configuration with random weights, verifying that the `__init__` method works.
305
+
306
+ ```py
307
+ from transformers import BrandNewLlama, BrandNewLlamaConfig
308
+ model = BrandNewLlama(BrandNewLlamaConfig())
309
+ ```
310
+
311
+ Random initialization occurs in the `_init_weights` method of `BrandNewLlamaPreTrainedModel`. All leaf modules are initialized depending on the configuration's variables.
312
+
313
+ ```py
314
+ def _init_weights(self, module):
315
+ """Initialize the weights"""
316
+ if isinstance(module, nn.Linear):
317
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
318
+ if module.bias is not None:
319
+ module.bias.data.zero_()
320
+ elif isinstance(module, nn.Embedding):
321
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
322
+ if module.padding_idx is not None:
323
+ module.weight.data[module.padding_idx].zero_()
324
+ elif isinstance(module, nn.LayerNorm):
325
+ module.bias.data.zero_()
326
+ module.weight.data.fill_(1.0)
327
+ ```
328
+
329
+ The initialization scheme can look different if you need to adapt it to your model. For example, [`Wav2Vec2ForPreTraining`] initializes [nn.Linear](https://pytorch.org/docs/stable/generated/torch.nn.Linear.html) in its last two linear layers.
330
+
331
+ The `_is_hf_initialized` flag makes sure a submodule is only initialized once. Setting `module.project_q._is_hf_initialized` and `module.project_hid._is_hf_initialized` to `True` ensures the custom initialization is not overridden later, so the `_init_weights` function won't be applied to these modules.
332
+
333
+ ```py
334
+ def _init_weights(self, module):
335
+ """Initialize the weights"""
336
+ if isinstance(module, Wav2Vec2ForPreTraining):
337
+ module.project_hid.reset_parameters()
338
+ module.project_q.reset_parameters()
339
+ module.project_hid._is_hf_initialized = True
340
+ module.project_q._is_hf_initialized = True
341
+ elif isinstance(module, nn.Linear):
342
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
343
+ if module.bias is not None:
344
+ module.bias.data.zero_()
345
+ ```
346
+
347
+ ### Convert checkpoints to Transformers
348
+
349
+ The original checkpoint must be converted to a Transformers compatible checkpoint.
350
+
351
+ > [!TIP]
352
+ > Try looking for an existing conversion script to copy, adapt, and reuse for your model!
353
+ >
354
+ > - If you're porting a model from TensorFlow to PyTorch, a good starting point may be the BERT [conversion script](https://github.com/huggingface/transformers/blob/7acfa95afb8194f8f9c1f4d2c6028224dbed35a2/src/transformers/models/bert/modeling_bert.py#L91).
355
+ > - If you're porting a model from PyTorch to PyTorch, a good starting point may be the BART [conversion script](https://github.com/huggingface/transformers/blob/main/src/transformers/models/bart/convert_bart_original_pytorch_checkpoint_to_pytorch.py).
356
+
357
+ Make sure **all** required weights are initialized and print out all the checkpoint weights that weren't used for initialization to make sure the model has been converted correctly.
358
+
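+ One rough way to surface unused or missing weights, assuming the converted weights are collected in a hypothetical `state_dict`, is to load them non-strictly and inspect the returned keys.
+
+ ```py
+ # strict=False reports mismatched keys instead of raising an error
+ missing, unexpected = model.load_state_dict(state_dict, strict=False)
+ print("weights the model expected but didn't receive:", missing)
+ print("checkpoint weights that were never used:", unexpected)
+ ```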
359
+ You may encounter wrong shape statements or name assignments during the conversion. This is most likely because of incorrect parameters in `BrandNewLlamaConfig`, the wrong architecture, a bug in the `__init__` method of your implementation, or because you need to transpose one of the checkpoint weights.
360
+
361
+ Keep iterating on the [Adapt the model code](#adapt-the-model-code) section until all the checkpoint weights are correctly loaded. Once you can load a checkpoint in your model, save it to a folder. This should contain a `model.safetensors` file and a `config.json` file.
362
+
363
+ ```py
364
+ model.save_pretrained("/path/to/converted/checkpoint/folder")
365
+ ```
366
+
367
+ To help with conversion, the next section briefly describes how a PyTorch model stores and defines layer weights and names.
368
+
369
+ #### PyTorch layer weights and names
370
+
371
+ It is helpful to create a basic PyTorch model to understand how layer names are defined and weights are initialized.
372
+
373
+ ```py
374
+ from torch import nn
375
+
376
+ class SimpleModel(nn.Module):
377
+ def __init__(self):
378
+ super().__init__()
379
+ self.dense = nn.Linear(10, 10)
380
+ self.intermediate = nn.Linear(10, 10)
381
+ self.layer_norm = nn.LayerNorm(10)
382
+ ```
383
+
384
+ PyTorch layer names are defined by the class attribute name of the layer (`dense`, `intermediate`, `layer_norm`). Create an instance of `SimpleModel` to fill all the layers with random weights.
385
+
386
+ ```py
387
+ model = SimpleModel()
388
+ print(model)
389
+ SimpleModel(
390
+ (dense): Linear(in_features=10, out_features=10, bias=True)
391
+ (intermediate): Linear(in_features=10, out_features=10, bias=True)
392
+ (layer_norm): LayerNorm((10,), eps=1e-05, elementwise_affine=True)
393
+ )
394
+ ```
395
+
396
+ The weight values of a specific layer are randomly initialized.
397
+
398
+ ```py
399
+ print(model.dense.weight.data)
400
+ tensor([[-0.0818, 0.2207, -0.0749, -0.0030, 0.0045, -0.1569, -0.1598, 0.0212,
401
+ -0.2077, 0.2157],
402
+ [ 0.1044, 0.0201, 0.0990, 0.2482, 0.3116, 0.2509, 0.2866, -0.2190,
403
+ 0.2166, -0.0212],
404
+ [-0.2000, 0.1107, -0.1999, -0.3119, 0.1559, 0.0993, 0.1776, -0.1950,
405
+ -0.1023, -0.0447],
406
+ [-0.0888, -0.1092, 0.2281, 0.0336, 0.1817, -0.0115, 0.2096, 0.1415,
407
+ -0.1876, -0.2467],
408
+ [ 0.2208, -0.2352, -0.1426, -0.2636, -0.2889, -0.2061, -0.2849, -0.0465,
409
+ 0.2577, 0.0402],
410
+ [ 0.1502, 0.2465, 0.2566, 0.0693, 0.2352, -0.0530, 0.1859, -0.0604,
411
+ 0.2132, 0.1680],
412
+ [ 0.1733, -0.2407, -0.1721, 0.1484, 0.0358, -0.0633, -0.0721, -0.0090,
413
+ 0.2707, -0.2509],
414
+ [-0.1173, 0.1561, 0.2945, 0.0595, -0.1996, 0.2988, -0.0802, 0.0407,
415
+ 0.1829, -0.1568],
416
+ [-0.1164, -0.2228, -0.0403, 0.0428, 0.1339, 0.0047, 0.1967, 0.2923,
417
+ 0.0333, -0.0536],
418
+ [-0.1492, -0.1616, 0.1057, 0.1950, -0.2807, -0.2710, -0.1586, 0.0739,
419
+ 0.2220, 0.2358]]).
420
+ ```
421
+
422
+ In the conversion script, the random weights should be replaced with the exact weights from the corresponding layer in the original checkpoint.
423
+
424
+ ```py
425
+ # retrieve matching layer weights with recursive algorithm
426
+ layer_name = "dense"
427
+ pretrained_weight = array_of_dense_layer
428
+
429
+ model_pointer = getattr(model, "dense")
430
+ model_pointer.weight.data = torch.from_numpy(pretrained_weight)
431
+ ```
432
+
433
+ Verify the randomly initialized weights and their corresponding pretrained checkpoint weights have the identical **shape** and **name**. Add assert statements for the shape and print out the checkpoint weight names.
434
+
435
+ ```py
436
+ assert (
437
+ model_pointer.weight.shape == pretrained_weight.shape
438
+ ), f"Pointer shape of random weight {model_pointer.shape} and array shape of checkpoint weight {pretrained_weight.shape} mismatched"
439
+
440
+ logger.info(f"Initialize PyTorch weight {layer_name} from {pretrained_weight.name}")
441
+ ```
442
+
443
+ When the shape or name doesn't match, you may have assigned the incorrect checkpoint weight to a randomly initialized layer. An incorrect shape may be because the `BrandNewLlama` parameters don't exactly match the original model's parameters. But it could also be that the PyTorch layer implementation requires the weights to be transposed first.
444
+
445
+ ### Implement the forward pass
446
+
447
+ The forward pass should be implemented next if the model loads correctly. It takes some inputs and returns the model output.
448
+
449
+ ```py
450
+ model = BrandNewLlamaModel.from_pretrained("/path/to/converted/checkpoint/folder")
451
+ input_ids = [0, 4, 4, 3, 2, 4, 1, 7, 19]
452
+ output = model.generate(input_ids).last_hidden_states
453
+ ```
454
+
455
+ Don't be discouraged if your forward pass isn't identical to the output from the original model or if it returns an error. First check that the forward pass doesn't throw any errors; failures are often caused by wrong dimensions or by using the wrong data type ([torch.long](https://pytorch.org/docs/stable/generated/torch.Tensor.long.html) instead of [torch.float32](https://pytorch.org/docs/stable/tensors.html)).
456
+
457
+ Your output should have a precision of *1e-3*. Ensure the output shapes and output values are identical. Common reasons for why the outputs aren't identical include:
458
+
459
+ - Some layers were not added (activation layer or a residual connection).
460
+ - The word embedding matrix is not tied.
461
+ - The wrong positional embeddings are used because the original implementation includes an offset.
462
+ - Dropout is applied during the forward pass. Fix this error by making sure `model.training` is `False` and passing `self.training` to [torch.nn.functional.dropout](https://pytorch.org/docs/stable/nn.functional.html?highlight=dropout#torch.nn.functional.dropout).
463
+
464
+ Compare the forward pass of the original model and your implementation to check if there are any differences. Ideally, debug and print out the intermediate outputs of both implementations of the forward pass to pinpoint where the original implementation differs from yours.
465
+
466
+ 1. Make sure the hardcoded `input_ids` in both implementations are identical.
467
+ 2. Verify the outputs of the first transformation of `input_ids` (usually the word embeddings) are identical, and work your way through to the last layer.
468
+
469
+ Any difference between the two implementations should point to the bug in your implementation.
470
+
471
+ One of the best strategies is to add many print statements to the same positions in both implementations, and then successively remove them when they output identical values for the intermediate outputs.
472
+
473
+ When both implementations produce the same output, verify the outputs are within a precision of *1e-3*.
474
+
475
+ ```py
476
+ torch.allclose(original_output, output, atol=1e-3)
477
+ ```
478
+
479
+ This is typically the most difficult part of the process. Congratulations if you've made it this far!
480
+
481
+ And if you're stuck or struggling with this step, don't hesitate to ask for help on your pull request.
482
+
483
+ ### Add model tests
484
+
485
+ While the model works, you still need to add tests to ensure it is compatible with Transformers. Tests are important because they help users understand your work by looking at specific tests, and because they prevent your model from breaking in the future if any changes are made.
486
+
487
+ [Cookiecutter](https://cookiecutter.readthedocs.io/en/stable/) should have added a test file for your model. Run the test file below to make sure all common tests pass.
488
+
489
+ ```bash
490
+ pytest tests/models/brand_new_llama/test_modeling_brand_new_llama.py
491
+ ```
492
+
493
+ The integration tests should be added first because they serve the same purpose as the debugging scripts you used earlier to implement the new model in Transformers. A template of those model tests, `BrandNewLlamaModelIntegrationTests`, was added by Cookiecutter and should be filled out. To ensure it passes, run the following command.
494
+
495
+ <hfoptions id="integration-test">
496
+ <hfoption id="macOS">
497
+
498
+ ```bash
499
+ RUN_SLOW=1 pytest -sv tests/models/brand_new_llama/test_modeling_brand_new_llama.py::BrandNewLlamaModelIntegrationTests
500
+ ```
501
+
502
+ </hfoption>
503
+ <hfoption id="Windows">
504
+
505
+ ```bash
506
+ SET RUN_SLOW=1 pytest -sv tests/models/brand_new_llama/test_modeling_brand_new_llama.py::BrandNewLlamaModelIntegrationTests
507
+ ```
508
+
509
+ </hfoption>
510
+ </hfoptions>
511
+
512
+ All features unique to BrandNewLlama should be tested in a separate test under `BrandNewLlamaModelTester/BrandNewLlamaModelTest` (see the sketch after this list). This test is often overlooked, but it is extremely important because:
513
+
514
+ - it helps transfer knowledge you acquired during the process to the community by showing how the model's novel features work
515
+ - future contributors can quickly test changes to the model by running these special tests
516
+
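+ A minimal sketch of what such a model-specific test could look like (the configuration values and the tested behavior are only illustrative):
+
+ ```py
+ import unittest
+
+ import torch
+
+ from transformers import BrandNewLlamaConfig, BrandNewLlamaModel
+
+ class BrandNewLlamaModelTest(unittest.TestCase):
+     def test_forward_output_shape(self):
+         config = BrandNewLlamaConfig(hidden_size=32, num_hidden_layers=2, num_attention_heads=4)
+         model = BrandNewLlamaModel(config)
+         model.eval()
+
+         input_ids = torch.tensor([[0, 4, 4, 3, 2, 4, 1, 7, 19]])
+         with torch.no_grad():
+             outputs = model(input_ids)
+
+         self.assertEqual(outputs.last_hidden_state.shape, (1, 9, config.hidden_size))
+ ```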
517
+ ## Implement tokenizer
518
+
519
+ > [!TIP]
520
+ > We recommend adding a fast tokenizer ([`PreTrainedTokenizerFast`]) to give users the best performance. Feel free to tag [@ArthurZucker](https://github.com/ArthurZucker) or [@itazap](https://github.com/itazap) in your PR for help on how to add [`PreTrainedTokenizerFast`].
521
+
522
+ With the model out of the way, it's time to focus on the tokenizer. The tokenizer should be identical or very similar to an existing tokenizer in Transformers.
523
+
524
+ Find and load the original tokenizer file into your implementation. Create a script in the original repository that inputs a string and returns the `input_ids`. The pseudocode should look similar to the code below.
525
+
526
+ ```py
527
+ input_str = "This is a long example input string containing special characters .$?-, numbers 2872 234 12 and words."
528
+ model = BrandNewLlamaModel.load_pretrained_checkpoint("/path/to/checkpoint/")
529
+ input_ids = model.tokenize(input_str)
530
+ ```
531
+
532
+ You may need to search the original repository to find the correct tokenizer function or modify the existing tokenizer in your clone of the original repository to only return the `input_ids`. The script for your tokenizer should look similar to the following.
533
+
534
+ ```py
535
+ from transformers import BrandNewLlamaTokenizer
536
+
537
+ input_str = "This is a long example input string containing special characters .$?-, numbers 2872 234 12 and words."
538
+ tokenizer = BrandNewLlamaTokenizer.from_pretrained("/path/to/tokenizer/folder/")
539
+ input_ids = tokenizer(input_str).input_ids
540
+ ```
541
+
542
+ When both implementations have the same `input_ids`, add a tokenizer test file. This file is analogous to the modeling test files. The tokenizer test files should contain a couple of hardcoded integration tests.
543
+
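+ A minimal sketch of such a hardcoded integration test (the expected ids below are placeholders and must be replaced with the ids produced by the original tokenizer):
+
+ ```py
+ from transformers import BrandNewLlamaTokenizer
+
+ def test_tokenizer_integration():
+     tokenizer = BrandNewLlamaTokenizer.from_pretrained("/path/to/tokenizer/folder/")
+     input_str = "This is a long example input string containing special characters .$?-, numbers 2872 234 12 and words."
+     expected_input_ids = [0, 4, 5, 2, 3, 7, 9]  # placeholder: paste the ids from the original tokenizer here
+     assert tokenizer(input_str).input_ids == expected_input_ids
+ ```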
544
+ ## Implement image processor
545
+
546
+ > [!TIP]
547
+ > Fast image processors use the [torchvision](https://pytorch.org/vision/stable/index.html) library and can perform image processing on the GPU, significantly improving processing speed.
548
+ > We recommend adding a fast image processor ([`BaseImageProcessorFast`]) in addition to the "slow" image processor ([`BaseImageProcessor`]) to provide users with the best performance. Feel free to tag [@yonigozlan](https://github.com/yonigozlan) for help adding a [`BaseImageProcessorFast`].
549
+
550
+ While this example doesn't include an image processor, you may need to implement one if your model requires image inputs. The image processor is responsible for converting images into a format suitable for your model. Before implementing a new one, check whether an existing image processor in the Transformers library can be reused, as many models share similar image processing techniques. Note that you can also use [modular](./modular_transformers) for image processors to reuse existing components.
551
+
552
+ If you do need to implement a new image processor, refer to an existing image processor to understand the expected structure. Slow image processors ([`BaseImageProcessor`]) and fast image processors ([`BaseImageProcessorFast`]) are designed differently, so make sure you follow the correct structure based on the processor type you're implementing.
553
+
554
+ Run the following command (only if you haven't already created the fast image processor with the `transformers add-new-model-like` command) to generate the necessary imports and to create a prefilled template for the fast image processor. Modify the template to fit your model.
555
+
556
+ ```bash
557
+ transformers add-fast-image-processor --model-name your_model_name
558
+ ```
559
+
560
+ This command will generate the necessary imports and provide a pre-filled template for the fast image processor. You can then modify it to fit your model's needs.
561
+
562
+ Add tests for the image processor in `tests/models/your_model_name/test_image_processing_your_model_name.py`. These tests should be similar to those for other image processors and should verify that the image processor correctly handles image inputs. If your image processor includes unique features or processing methods, ensure you add specific tests for those as well.
563
+
564
+ ## Implement processor
565
+
566
+ If your model accepts multiple modalities, like text and images, you need to add a processor. The processor centralizes the preprocessing of different modalities before passing them to the model.
567
+
568
+ The processor should call the appropriate modality-specific processors within its `__call__` function to handle each type of input correctly. Be sure to check existing processors in the library to understand their expected structure. Transformers uses the following convention in the `__call__` function signature.
569
+
570
+ ```python
571
+ def __call__(
572
+ self,
573
+ images: ImageInput = None,
574
+ text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
575
+ audio=None,
576
+ videos=None,
577
+ **kwargs: Unpack[YourModelProcessorKwargs],
578
+ ) -> BatchFeature:
579
+ ...
580
+ ```
581
+
582
+ `YourModelProcessorKwargs` is a `TypedDict` that includes all the typical processing arguments and any extra arguments a specific processor may require.
583
+
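+ As a rough sketch (the class name and the extra `do_pad` key are illustrative assumptions), such a kwargs class typically subclasses `ProcessingKwargs` and declares per-modality defaults.
+
+ ```python
+ # Minimal sketch of a processor kwargs TypedDict; `do_pad` is a hypothetical extra image argument.
+ from transformers.processing_utils import ProcessingKwargs
+
+
+ class YourModelProcessorKwargs(ProcessingKwargs, total=False):
+     _defaults = {
+         "text_kwargs": {"padding": False},
+         "images_kwargs": {"do_pad": True},
+     }
+ ```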
584
+ Add tests for the processor in `tests/models/your_model_name/test_processor_your_model_name.py`. These tests should be similar to those for other processors and should verify that the processor correctly handles the different modalities.
585
+
586
+ ## Integration tests
587
+
588
+ Now that you have a model and tokenizer, add end-to-end integration tests for the model and tokenizer to `tests/models/brand_new_llama/test_modeling_brand_new_llama.py`.
589
+
590
+ The test should provide a meaningful text-to-text example to show the model works as expected. For example, you can include a source-to-target translation pair, an article-to-summary pair, or a question-to-answer pair.
591
+
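+ A sketch of one such test inside `BrandNewLlamaModelIntegrationTests` is shown below. The checkpoint, prompt, and expected output are placeholders, `BrandNewLlamaForCausalLM` is an assumed generation head, and `slow`/`torch_device` come from `transformers.testing_utils`.
+
+ ```py
+ # Minimal sketch; checkpoint, prompt, and expected text are placeholders.
+ from transformers.testing_utils import slow, torch_device
+
+ @slow
+ def test_generation(self):
+     tokenizer = BrandNewLlamaTokenizer.from_pretrained("your-org/brand-new-llama")
+     model = BrandNewLlamaForCausalLM.from_pretrained("your-org/brand-new-llama").to(torch_device)
+     inputs = tokenizer("Translate to French: How are you?", return_tensors="pt").to(torch_device)
+     output_ids = model.generate(**inputs, max_new_tokens=20)
+     generated = tokenizer.decode(output_ids[0], skip_special_tokens=True)
+     self.assertEqual(generated, "<reference output from the original implementation>")
+ ```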
592
+ If the checkpoint hasn't been fine-tuned on a downstream task, then the model tests are sufficient.
593
+
594
+ Finally, try to make sure your tests can run on a GPU by adding `.to(self.device)` statements to the model's internal tensors. If you don't have access to a GPU, we can take care of that for you.
595
+
596
+ ## Add documentation
597
+
598
+ Your model is only useful if users know how to use it. This is why it's important to add documentation and docstrings. Cookiecutter added a template file, `docs/source/model_doc/brand_new_llama.md`, that you can fill out with information about your model.
599
+
600
+ This is generally a user's first interaction with a model, so the documentation should be clear and concise. It is often very useful to add examples of how the model should be used.
601
+
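+ For example, a short, self-contained snippet like the sketch below (the checkpoint name is a placeholder) is usually enough to get users started.
+
+ ```py
+ # Example usage snippet for docs/source/model_doc/brand_new_llama.md; the checkpoint is a placeholder.
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ tokenizer = AutoTokenizer.from_pretrained("your-org/brand-new-llama")
+ model = AutoModelForCausalLM.from_pretrained("your-org/brand-new-llama")
+
+ inputs = tokenizer("Hello, my name is", return_tensors="pt")
+ outputs = model.generate(**inputs, max_new_tokens=20)
+ print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+ ```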
602
+ Make sure docstrings are added to `src/transformers/models/brand_new_llama/modeling_brand_new_llama.py` and include all necessary inputs and outputs. Review our [guide](https://github.com/huggingface/transformers/tree/main/docs#writing-documentation---specification) for writing documentation and docstrings.
603
+
604
+ ## Refactor
605
+
606
+ Time to tidy things up and make sure the code style is consistent with the rest of the library. Run the following command to automatically fix incorrect styles.
607
+
608
+ ```bash
609
+ make style
610
+ ```
611
+
612
+ To verify the code style passes quality checks, run the command below.
613
+
614
+ ```bash
615
+ make quality
616
+ ```
617
+
618
+ There may be other failing tests or checks (missing docstring or incorrect naming) on your pull request due to Transformers' strict design tests. We can help you with these issues if you're stuck.
619
+
620
+ After ensuring the code runs correctly, you may want to refactor it to make it more readable or cleaner.
621
+
622
+ ## Upload to the Hub
623
+
624
+ Convert and upload all checkpoints to the [Hub](https://hf.co/models). Add a model card to provide more transparency and context about the model. The model card should highlight specific characteristics of a checkpoint, how the model was trained, and code examples of how to use it.
625
+
626
+ > [!TIP]
627
+ > In many cases, adding an interactive notebook users can run is a great way to showcase how to use the model for inference or fine-tune it on a downstream task. While not required, including a notebook can drive greater adoption of your model.
628
+
629
+ You should also consult with the Transformers team to decide on an appropriate name for the model, and to get the required access rights to upload it.
630
+
631
+ Use the [`~PreTrainedModel.push_to_hub`] method to upload the model.
632
+
633
+ ```py
634
+ brand_new_llama.push_to_hub("brand_new_llama")
635
+ ```
636
+
637
+ Refer to the [Sharing](./model_sharing) guide for more information about uploading models to the Hub.
638
+
639
+ ## Merge your model
640
+
641
+ You're finally ready to merge your pull request and officially add the model to Transformers! Make sure all the tests are passing and all comments and feedback have been addressed.
642
+
643
+ Congratulations on adding a new model to Transformers! 🥳
644
+
645
+ This is a very significant contribution. Your work makes Transformers more accessible to developers and researchers around the world. You should be proud of your contribution and share your accomplishment with the community!
646
+
647
+ ## Model addition timeline
648
+
649
+ There are four timelines for model additions depending on the model contributor and community demand for an architecture.
650
+
651
+ - **day-0 integration**: If you plan on having a Transformers-first release, this is a great option because we can ensure the documentation is clear and optimize your model as much as possible (quantization, FlashAttention, KV-cache, etc.). We can also help you add the model, provide early reviews and make sure it works as expected.
652
+
653
+ Reach out to transformers@huggingface.co a few days (preferably weeks) in advance, especially if an architecture is particularly novel, to ensure model integration. We'll work together on a private fork of Transformers until your checkpoint and release is ready.
654
+
655
+ - **same week integration**: Models with significant requests/demand are usually added the same week if the model author doesn't reach out.
656
+
657
+ Use the [issue tracker](https://github.com/huggingface/transformers/issues/new?assignees=&labels=New+model&projects=&template=new-model-addition.yml) to request a specific model to add. The more activity on the issue, the faster and more likely we'll integrate it.
658
+
659
+ - **post-release integration**: Models without significant requests/demand, or models we don't have the bandwidth to integrate, are added post-release.
660
+
661
+ This is a good opportunity if you're interested in contributing a model to Transformers. Take a look at open issues tagged with ["New model"](https://github.com/huggingface/transformers/issues?q=is%3Aopen+is%3Aissue+label%3A%22New+model%22). Feel free to give the most requested models a try first to multiply the impact of your contribution. We'll be there to help you each step of the way!
662
+
663
+ - **Hub-first release**: The Transformers [remote-code](./models#custom-models) feature allows Transformers-based projects to be shared directly on the Hub. This is a good option if you don't have the bandwidth to add a model directly to Transformers.
664
+
665
+ If a model ends up being very popular, then it's very likely that we'll integrate it in Transformers ourselves to enable better support (documentation, maintenance, optimization, etc.) for it. A Hub-first release is the most frictionless way to add a model.
transformers/docs/source/en/add_new_pipeline.md ADDED
@@ -0,0 +1,229 @@
1
+ <!--Copyright 2024 The HuggingFace Team. All rights reserved.
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
4
+ the License. You may obtain a copy of the License at
5
+
6
+ http://www.apache.org/licenses/LICENSE-2.0
7
+
8
+ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
9
+ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
10
+ specific language governing permissions and limitations under the License.
11
+ ⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
12
+ rendered properly in your Markdown viewer.
13
+
14
+ -->
15
+
16
+ # Adding a new pipeline
17
+
18
+ Make [`Pipeline`] your own by subclassing it and implementing a few methods. Share the code with the community on the [Hub](https://hf.co) and register the pipeline with Transformers so that everyone can quickly and easily use it.
19
+
20
+ This guide will walk you through the process of adding a new pipeline to Transformers.
21
+
22
+ ## Design choices
23
+
24
+ At a minimum, you only need to provide [`Pipeline`] with an appropriate input for a task. This is also where you should begin when designing your pipeline.
25
+
26
+ Decide what input types [`Pipeline`] can accept. It can be strings, raw bytes, dictionaries, and so on. Try to keep the inputs in pure Python where possible because it's more compatible. Next, decide on the output [`Pipeline`] should return. Again, keeping the output in Python is the simplest and best option because it's easier to work with.
27
+
28
+ Keeping the inputs and outputs simple, and ideally JSON-serializable, makes it easier for users to run your [`Pipeline`] without needing to learn new object types. It's also common to support many different input types for even greater ease of use. For example, making an audio file acceptable from a filename, URL, or raw bytes gives the user more flexibility in how they provide the audio data.
29
+
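+ As a sketch (not part of the base class API), a `preprocess` method could normalize those different input types up front:
+
+ ```py
+ # Minimal sketch: accept a filename, URL, or raw bytes and normalize everything to bytes.
+ import requests
+
+ def preprocess(self, inputs):
+     if isinstance(inputs, str):
+         if inputs.startswith(("http://", "https://")):
+             inputs = requests.get(inputs).content  # download from a URL
+         else:
+             with open(inputs, "rb") as f:  # read from a local file
+                 inputs = f.read()
+     # `inputs` is now raw bytes in every case
+     return {"raw_bytes": inputs}
+ ```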
30
+ ## Create a pipeline
31
+
32
+ With an input and output decided, you can start implementing [`Pipeline`]. Your pipeline should inherit from the base [`Pipeline`] class and include 4 methods.
33
+
34
+ ```py
35
+ from transformers import Pipeline
36
+
37
+ class MyPipeline(Pipeline):
38
+ def _sanitize_parameters(self, **kwargs):
39
+
40
+ def preprocess(self, inputs, args=2):
41
+
42
+ def _forward(self, model_inputs):
43
+
44
+ def postprocess(self, model_outputs):
45
+ ```
46
+
47
+ 1. `preprocess` takes the inputs and transforms them into the appropriate input format for the model.
48
+
49
+ ```py
50
+ def preprocess(self, inputs, maybe_arg=2):
51
+ model_input = torch.tensor(inputs["input_ids"])
52
+ return {"model_input": model_input}
53
+ ```
54
+
55
+ 2. `_forward` shouldn't be called directly. `forward` is the preferred method because it includes safeguards to make sure everything works correctly on the expected device. Anything linked to the model belongs in `_forward` and everything else belongs in either `preprocess` or `postprocess`.
56
+
57
+ ```py
58
+ def _forward(self, model_inputs):
59
+ outputs = self.model(**model_inputs)
60
+ return outputs
61
+ ```
62
+
63
+ 3. `postprocess` generates the final output from the model's output in `_forward`.
64
+
65
+ ```py
66
+ def postprocess(self, model_outputs, top_k=5):
67
+ best_class = model_outputs["logits"].softmax(-1).topk(top_k)  # keep only the `top_k` most likely classes
68
+ return best_class
69
+ ```
70
+
71
+ 4. `_sanitize_parameters` lets users pass additional parameters to [`Pipeline`]. This could be during initialization or when [`Pipeline`] is called. `_sanitize_parameters` returns 3 dicts of additional keyword arguments that are passed directly to `preprocess`, `_forward`, and `postprocess`. Don't add anything if a user didn't call the pipeline with extra parameters. This keeps the default arguments in the function definition which is always more natural.
72
+
73
+ For example, add a `top_k` parameter in `postprocess` to return the top 5 most likely classes. Then in `_sanitize_parameters`, check if the user passed in `top_k` and add it to `postprocess_kwargs`.
74
+
75
+ ```py
76
+ def _sanitize_parameters(self, **kwargs):
77
+ preprocess_kwargs = {}
78
+ if "maybe_arg" in kwargs:
79
+ preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"]
80
+
81
+ postprocess_kwargs = {}
82
+ if "top_k" in kwargs:
83
+ postprocess_kwargs["top_k"] = kwargs["top_k"]
84
+ return preprocess_kwargs, {}, postprocess_kwargs
85
+ ```
86
+
87
+ Now the pipeline can return the top most likely labels if a user chooses to.
88
+
89
+ ```py
90
+ from transformers import pipeline
91
+
92
+ pipeline = pipeline("my-task")
93
+ # returns 3 most likely labels
94
+ pipeline("This is the best meal I've ever had", top_k=3)
95
+ # returns 5 most likely labels by default
96
+ pipeline("This is the best meal I've ever had")
97
+ ```
98
+
99
+ ## Register a pipeline
100
+
101
+ Register the new task your pipeline supports in the `PIPELINE_REGISTRY`. The registry defines:
102
+
103
+ - the machine learning framework the pipeline supports with either `pt_model` or `tf_model` (add both to ensure it works with either framework)
104
+ - a default model, set with `default`, which should come from a specific revision (branch name or commit hash) where the model works as expected
105
+ - the expected input with `type`
106
+
107
+ ```py
108
+ from transformers.pipelines import PIPELINE_REGISTRY
109
+ from transformers import AutoModelForSequenceClassification, TFAutoModelForSequenceClassification
110
+
111
+ PIPELINE_REGISTRY.register_pipeline(
112
+ "new-task",
113
+ pipeline_class=MyPipeline,
114
+ pt_model=AutoModelForSequenceClassification,
115
+ tf_model=TFAutoModelForSequenceClassification,
116
+ default={"pt": ("user/awesome-model", "branch-name")},
117
+ type="text",
118
+ )
119
+ ```
120
+
121
+ ## Share your pipeline
122
+
123
+ Share your pipeline with the community on the [Hub](https://hf.co) or you can add it directly to Transformers.
124
+
125
+ It's faster to upload your pipeline code to the Hub because it doesn't require a review from the Transformers team. Adding the pipeline to Transformers may be slower because it requires a review and you need to add tests to ensure your [`Pipeline`] works.
126
+
127
+ ### Upload to the Hub
128
+
129
+ Add your pipeline code to the Hub in a Python file.
130
+
131
+ For example, a custom pipeline for sentence pair classification might look like the following code. The implementation works for PyTorch and TensorFlow models.
132
+
133
+ ```py
134
+ import numpy as np
135
+ from transformers import Pipeline
136
+
137
+ def softmax(outputs):
138
+ maxes = np.max(outputs, axis=-1, keepdims=True)
139
+ shifted_exp = np.exp(outputs - maxes)
140
+ return shifted_exp / shifted_exp.sum(axis=-1, keepdims=True)
141
+
142
+ class PairClassificationPipeline(Pipeline):
143
+ def _sanitize_parameters(self, **kwargs):
144
+ preprocess_kwargs = {}
145
+ if "second_text" in kwargs:
146
+ preprocess_kwargs["second_text"] = kwargs["second_text"]
147
+ return preprocess_kwargs, {}, {}
148
+
149
+ def preprocess(self, text, second_text=None):
150
+ return self.tokenizer(text, text_pair=second_text, return_tensors=self.framework)
151
+
152
+ def _forward(self, model_inputs):
153
+ return self.model(**model_inputs)
154
+
155
+ def postprocess(self, model_outputs):
156
+ logits = model_outputs.logits[0].numpy()
157
+ probabilities = softmax(logits)
158
+
159
+ best_class = np.argmax(probabilities)
160
+ label = self.model.config.id2label[best_class]
161
+ score = probabilities[best_class].item()
162
+ logits = logits.tolist()
163
+ return {"label": label, "score": score, "logits": logits}
164
+ ```
165
+
166
+ Save the code in a file named `pair_classification.py`, and import and register it as shown below.
167
+
168
+ ```py
169
+ from pair_classification import PairClassificationPipeline
170
+ from transformers.pipelines import PIPELINE_REGISTRY
171
+ from transformers import AutoModelForSequenceClassification, TFAutoModelForSequenceClassification
172
+
173
+ PIPELINE_REGISTRY.register_pipeline(
174
+ "pair-classification",
175
+ pipeline_class=PairClassificationPipeline,
176
+ pt_model=AutoModelForSequenceClassification,
177
+ tf_model=TFAutoModelForSequenceClassification,
178
+ )
179
+ ```
180
+
181
+ The [register_pipeline](https://github.com/huggingface/transformers/blob/9feae5fb0164e89d4998e5776897c16f7330d3df/src/transformers/pipelines/base.py#L1387) function registers the pipeline details (task type, pipeline class, supported backends) to a model's `config.json` file.
182
+
183
+ ```json
184
+ "custom_pipelines": {
185
+ "pair-classification": {
186
+ "impl": "pair_classification.PairClassificationPipeline",
187
+ "pt": [
188
+ "AutoModelForSequenceClassification"
189
+ ],
190
+ "tf": [
191
+ "TFAutoModelForSequenceClassification"
192
+ ],
193
+ }
194
+ },
195
+ ```
196
+
197
+ Call [`~Pipeline.push_to_hub`] to push the pipeline to the Hub. The Python file containing the code is copied to the Hub, and the pipeline's model and tokenizer are also saved and pushed to the Hub. Your pipeline should now be available on the Hub under your namespace.
198
+
199
+ ```py
200
+ from transformers import pipeline
201
+
202
+ pipeline = pipeline(task="pair-classification", model="sgugger/finetuned-bert-mrpc")
203
+ pipeline.push_to_hub("pair-classification-pipeline")
204
+ ```
205
+
206
+ To use the pipeline, add `trust_remote_code=True` when loading the pipeline.
207
+
208
+ ```py
209
+ from transformers import pipeline
210
+
211
+ # load the custom pipeline code pushed above (replace the namespace with your own)
+ pipeline = pipeline(task="pair-classification", model="your-username/pair-classification-pipeline", trust_remote_code=True)
212
+ ```
213
+
214
+ ### Add to Transformers
215
+
216
+ Adding a custom pipeline to Transformers requires adding tests to make sure everything works as expected, and requesting a review from the Transformers team.
217
+
218
+ Add your pipeline code as a new module to the [pipelines](https://github.com/huggingface/transformers/tree/main/src/transformers/pipelines) submodule, and add it to the list of tasks defined in [pipelines/__init__.py](https://github.com/huggingface/transformers/blob/main/src/transformers/pipelines/__init__.py).
219
+
220
+ Next, add a new test for the pipeline in [transformers/tests/pipelines](https://github.com/huggingface/transformers/tree/main/tests/pipelines). You can look at the other tests for examples of how to test your pipeline.
221
+
222
+ The [run_pipeline_test](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_text_classification.py#L186) function should be very generic and run on the models defined in [model_mapping](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_text_classification.py#L48) and [tf_model_mapping](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_text_classification.py#L49). This is important for testing future compatibility with new models.
223
+
224
+ You'll also notice `ANY` is used throughout the [run_pipeline_test](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_text_classification.py#L186) function. The models are random, so you can't check the actual values. Using `ANY` allows the test to match the output of the pipeline type instead.
225
+
226
+ Finally, you should also implement the following 4 tests.
227
+
228
+ 1. [test_small_model_pt](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_text_classification.py#L59) and [test_small_model_tf](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_text_classification.py#L150), use a small model for these pipelines to make sure they return the correct outputs. The results don't have to make sense. Each pipeline should return the same result.
229
+ 1. [test_large_model_pt](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_zero_shot_image_classification.py#L187) and [test_large_model_tf](https://github.com/huggingface/transformers/blob/db70426854fe7850f2c5834d633aff637f14772e/tests/pipelines/test_pipelines_zero_shot_image_classification.py#L220), use a realistic model for these pipelines to make sure they return meaningful results. These tests are slow and should be marked as slow.
transformers/docs/source/en/agents.md ADDED
@@ -0,0 +1,22 @@
1
+ <!--Copyright 2024 The HuggingFace Team. All rights reserved.
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
4
+ the License. You may obtain a copy of the License at
5
+
6
+ http://www.apache.org/licenses/LICENSE-2.0
7
+
8
+ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
9
+ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
10
+ specific language governing permissions and limitations under the License.
11
+
12
+ ⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
13
+ rendered properly in your Markdown viewer.
14
+
15
+ -->
16
+
17
+ # Agents
18
+
19
+ (deprecated)
20
+
21
+ > [!WARNING]
22
+ > Agents and tools were spun out into the standalone [smolagents](https://huggingface.co/docs/smolagents/index) library. They were removed from `transformers` in v4.52.
transformers/docs/source/en/attention_interface.md ADDED
@@ -0,0 +1,168 @@
1
+ <!--Copyright 2025 The HuggingFace Team. All rights reserved.
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
4
+ the License. You may obtain a copy of the License at
5
+
6
+ http://www.apache.org/licenses/LICENSE-2.0
7
+
8
+ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
9
+ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
10
+ specific language governing permissions and limitations under the License.
11
+ ⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
12
+ rendered properly in your Markdown viewer.
13
+
14
+ -->
15
+
16
+ # Attention Interface
17
+
18
+ This page describes how to use the `AttentionInterface` in order to register custom attention functions to use with
19
+ supported models.
20
+
21
+ ## Customizing attention function
22
+
23
+ Most recent models can now switch the attention function used in their attention layers from one implementation to another, thanks to a simple mapping.
24
+ By default, we provide the implementation for [`sdpa`](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html),
25
+ [`flash_attention_2`](https://github.com/Dao-AILab/flash-attention) and [`flex_attention`](https://pytorch.org/docs/stable/nn.attention.flex_attention.html#module-torch.nn.attention.flex_attention)
26
+ as well as `eager`, which is a simple matrix multiplication without any optimization on top.
27
+ This is the setting you can usually choose when instantiating a model:
28
+
29
+ ```python
30
+ from transformers import AutoModelForCausalLM
31
+
32
+ model_id = "meta-llama/Llama-3.2-1B"
33
+
34
+ # Here, using flash attention as an example
35
+ model = AutoModelForCausalLM.from_pretrained(model_id, attn_implementation="flash_attention_2")
36
+ ```
37
+
38
+ But what if you wanted to create your own attention function? Or simply play around with existing ones, adding
39
+ a few statements here and there? You can now do so with the `AttentionInterface`! Here is an example:
40
+
41
+ ```python
42
+ from transformers import AutoModelForCausalLM, AttentionInterface
43
+ from transformers.integrations.sdpa_attention import sdpa_attention_forward
44
+ import torch
45
+
46
+ model_id = "meta-llama/Llama-3.2-1B"
47
+
48
+ def my_new_sdpa(*args, **kwargs):
49
+ print("I just entered the attention computation")
50
+ return sdpa_attention_forward(*args, **kwargs)
51
+
52
+ AttentionInterface.register("my_new_sdpa", my_new_sdpa)
53
+
54
+ model = AutoModelForCausalLM.from_pretrained(model_id, attn_implementation="my_new_sdpa")
55
+ # Try running the forward with the new attention function
56
+ model(torch.ones(1, 5, dtype=int))
57
+ ```
58
+
59
+ You will see it prints "I just entered the attention computation" as many times as there are layers in the model (with this example, 16 times).
60
+
61
+ ## Dynamically switching attention function
62
+
63
+ You could dynamically change the model's attention function as well, by overriding the `config._attn_implementation` field:
64
+
65
+ ```python
66
+ # Back to use original sdpa implementation
67
+ model.config._attn_implementation = "sdpa"
68
+
69
+ model(torch.ones(1, 5, dtype=int))
70
+ ```
71
+
72
+ and it will stop printing the statements, as it now uses the `sdpa` attention.
73
+ This allows you to quickly change an attention function without needing to reload the model!
74
+
75
+ ## What about new args needed in my custom attention function?
76
+
77
+ But indeed, what if the new function requires a new arg to be properly used? It's no issue! Models supporting the
78
+ `AttentionInterface` propagate kwargs all the way to the Attention layers, and to the used attention function. That way,
79
+ you can simply pass the arg (as a kwarg, i.e. you need to pass it by name) in the model's forward, and it will be correctly used in the attention. However, custom attention functions have some limitations. In particular, they must follow the signature and return format of other attention functions, i.e.
80
+
81
+ ```python
82
+ from transformers import AutoModelForCausalLM, AttentionInterface
83
+ from transformers.integrations.sdpa_attention import sdpa_attention_forward
84
+ import torch
+ from typing import Optional
+
+ model_id = "meta-llama/Llama-3.2-1B"
85
+
86
+ def custom_attention(
87
+ module: torch.nn.Module, # required arg
88
+ query: torch.Tensor, # required arg
89
+ key: torch.Tensor, # required arg
90
+ value: torch.Tensor, # required arg
91
+ attention_mask: Optional[torch.Tensor], # required arg
92
+ a_new_kwargs = None, # You can now add as many kwargs as you need
93
+ another_new_kwargs = None, # You can now add as many kwargs as you need
94
+ **kwargs, # You need to accept **kwargs as models will pass other args
95
+ ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
96
+ ... # do your magic!
97
+ return attn_output, attn_weights # attn_weights are optional here
98
+
99
+ AttentionInterface.register("custom", custom_attention)
100
+
101
+ model = AutoModelForCausalLM.from_pretrained(model_id, attn_implementation="custom")
102
+ # Forward pass with the new kwargs
103
+ model(torch.ones(1, 5, dtype=int), a_new_kwargs=..., another_new_kwargs=...)
104
+ ```
105
+
106
+ If in doubt about what args/kwargs a given model sends to the attention function, simply check that model's modeling code on [GitHub](https://github.com/huggingface/transformers/tree/main/src/transformers/models)!
107
+
108
+ ## Accessing current available implementations
109
+
110
+ Most of the time, you will simply need to `register` a new function. If, however, you need to access an existing one,
111
+ and/or perform a few checks, the preferred way is to use the global `ALL_ATTENTION_FUNCTIONS`. It behaves the same way you
112
+ would expect from a usual Python dictionary:
113
+
114
+ ```python
115
+ >>> from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
116
+
117
+ >>> list(ALL_ATTENTION_FUNCTIONS.keys())
118
+ ['flash_attention_2', 'flex_attention', 'sdpa']
119
+
120
+ >>> ALL_ATTENTION_FUNCTIONS["sdpa"]
121
+ <function transformers.integrations.sdpa_attention.sdpa_attention_forward>
122
+
123
+ >>> ALL_ATTENTION_FUNCTIONS.get("sdpa", None)
124
+ <function transformers.integrations.sdpa_attention.sdpa_attention_forward>
125
+
126
+ # You can also globally `register` a new function directly on it
127
+ >>> ALL_ATTENTION_FUNCTIONS.register("new_func", new_func)
128
+ ```
129
+
130
+ ## Attention Mask Interface
131
+
132
+ Having a new attention function may mean that you need a new format of attention mask to decide what key and value tokens
133
+ the query tokens should attend to. This is now possible with the `AttentionMaskInterface`! It works in the same way as
134
+ the `AttentionInterface`:
135
+
136
+ ```python
137
+ from transformers import AttentionMaskInterface
138
+ from transformers.masking_utils import sdpa_mask
139
+ import torch
140
+
141
+ def my_new_sdpa_mask(*args, **kwargs):
142
+ print("I just entered the attention mask computation")
143
+ return sdpa_mask(*args, **kwargs)
144
+
145
+ AttentionMaskInterface.register("my_new_sdpa_mask", my_new_sdpa_mask)
146
+ ```
147
+
148
+ The reason you have to register it is because we need to automatically correct your mask format based on the attention implementation (for example, flex attention uses a BlockMask format, while sdpa uses a 4D tensor).
149
+ By default, if you do not register an attention mask function along with your attention function, mask creation will be skipped
150
+ and `attention_mask=None` will be passed along to the Attention layers.
151
+
152
+ The default signature of the attention mask functions is the following:
153
+
154
+ ```python
155
+ def custom_attention_mask(
156
+ batch_size: int, # required arg
157
+ cache_position: torch.Tensor, # required arg
158
+ kv_length: int, # required arg
159
+ kv_offset: int = 0, # required arg
160
+ mask_function: Callable = causal_mask_function, # required arg
161
+ attention_mask: Optional[torch.Tensor] = None, # required arg
162
+ **kwargs, # a few additional args may be passed as kwargs, especially the model's config is always passed
163
+ ) -> Optional[torch.Tensor]:
164
+ ```
165
+
166
+ It mostly works thanks to the `mask_function`, which is a `Callable` in the form of [torch's mask_mod functions](https://pytorch.org/blog/flexattention/), taking 4 indices as input and returning a boolean to indicate if this position should take part in the attention computation.
167
+
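+ For example, a sliding-window causal mask could be expressed with a mask_mod-style callable like the sketch below (the window size is arbitrary):
+
+ ```python
+ # Minimal sketch of a mask_mod-style callable: given batch, head, query, and key/value indices,
+ # return True if the query token is allowed to attend to that key/value token.
+ def sliding_window_causal(batch_idx, head_idx, q_idx, kv_idx, window=128):
+     return (kv_idx <= q_idx) & (q_idx - kv_idx < window)
+ ```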
168
+ If you cannot use the `mask_function` to create your mask for some reason, you can try to work around it by doing something similar to our [torch export workaround](https://github.com/huggingface/transformers/blob/main/src/transformers/integrations/executorch.py).
transformers/docs/source/en/auto_docstring.md ADDED
@@ -0,0 +1,280 @@
1
+ <!--Copyright 2025 The HuggingFace Team. All rights reserved.
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
4
+ the License. You may obtain a copy of the License at
5
+
6
+ http://www.apache.org/licenses/LICENSE-2.0
7
+
8
+ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
9
+ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
10
+ specific language governing permissions and limitations under the License.
11
+
12
+ ⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
13
+ rendered properly in your Markdown viewer.
14
+
15
+ -->
16
+
17
+ # Documenting a model
18
+
19
+ The `@auto_docstring` decorator in Transformers generates consistent docstrings for model classes and their methods. It reduces boilerplate by automatically including standard argument descriptions while also allowing overrides to add new or custom arguments. [Contributing a new model](./modular_transformers) is easier because you don't need to manually add the standard docstrings and can focus only on documenting new arguments.
20
+
21
+ This guide describes how to use the `@auto_docstring` decorator and how it works.
22
+
23
+ ## @auto_docstring
24
+
25
+ Start by importing the decorator in the modeling file (`modular_model.py` or `modeling_model.py`).
26
+
27
+ ```python
28
+ from ...utils import auto_docstring
29
+ ```
30
+
31
+ Select whether you'd like to apply `@auto_docstring` to a class or function below to see how to use it.
32
+
33
+ <hfoptions id="type">
34
+ <hfoption id="classes">
35
+
36
+ Place `@auto_docstring` directly above the class definition. The decorator derives parameter descriptions from the `__init__` method's signature and docstring.
37
+
38
+ ```python
39
+ from transformers.modeling_utils import PreTrainedModel
40
+ from ...utils import auto_docstring
41
+
42
+ @auto_docstring
43
+ class MyAwesomeModel(PreTrainedModel):
44
+ def __init__(self, config, custom_parameter: int = 10, another_custom_arg: str = "default"):
45
+ r"""
46
+ custom_parameter (`int`, *optional*, defaults to 10):
47
+ Description of the custom_parameter for MyAwesomeModel.
48
+ another_custom_arg (`str`, *optional*, defaults to "default"):
49
+ Documentation for another unique argument.
50
+ """
51
+ super().__init__(config)
52
+ self.custom_parameter = custom_parameter
53
+ self.another_custom_arg = another_custom_arg
54
+ # ... rest of your init
55
+
56
+ # ... other methods
57
+ ```
58
+
59
+ Arguments can also be passed directly to `@auto_docstring` for more control. Use the `custom_intro` parameter to add a custom introduction and the `custom_args` parameter to document custom arguments.
60
+
61
+ ```python
62
+ @auto_docstring(
63
+ custom_intro="""This model performs specific synergistic operations.
64
+ It builds upon the standard Transformer architecture with unique modifications.""",
65
+ custom_args="""
66
+ custom_parameter (`type`, *optional*, defaults to `default_value`):
67
+ A concise description for custom_parameter if not defined or overriding the description in `args_doc.py`.
68
+ internal_helper_arg (`type`, *optional*, defaults to `default_value`):
69
+ A concise description for internal_helper_arg if not defined or overriding the description in `args_doc.py`.
70
+ """
71
+ )
72
+ class MySpecialModel(PreTrainedModel):
73
+ def __init__(self, config: ConfigType, custom_parameter: "type" = "default_value", internal_helper_arg=None):
74
+ # ...
75
+ ```
76
+
77
+ You can also choose to only use `custom_intro` and define the custom arguments directly in the class.
78
+
79
+ ```python
80
+ @auto_docstring(
81
+ custom_intro="""This model performs specific synergistic operations.
82
+ It builds upon the standard Transformer architecture with unique modifications.""",
83
+ )
84
+ class MySpecialModel(PreTrainedModel):
85
+ def __init__(self, config: ConfigType, custom_parameter: "type" = "default_value", internal_helper_arg=None):
86
+ r"""
87
+ custom_parameter (`type`, *optional*, defaults to `default_value`):
88
+ A concise description for custom_parameter if not defined or overriding the description in `args_doc.py`.
89
+ internal_helper_arg (`type`, *optional*, defaults to `default_value`):
90
+ A concise description for internal_helper_arg if not defined or overriding the description in `args_doc.py`.
91
+ """
92
+ # ...
93
+ ```
94
+
95
+ </hfoption>
96
+ <hfoption id="functions">
97
+
98
+ Place `@auto_docstring` directly above the method definition. The decorator derives parameter descriptions from the function signature.
99
+
100
+ ```python
101
+ @auto_docstring
102
+ def forward(
103
+ self,
104
+ input_ids: Optional[torch.Tensor] = None,
105
+ attention_mask: Optional[torch.Tensor] = None,
106
+ new_custom_argument: Optional[torch.Tensor] = None,
107
+ arg_documented_in_args_doc: Optional[torch.Tensor] = None,
108
+ # ... other arguments
109
+ ) -> Union[Tuple, ModelOutput]: # The description of the return value will automatically be generated from the ModelOutput class docstring.
110
+ r"""
111
+ new_custom_argument (`torch.Tensor`, *optional*):
112
+ Description of this new custom argument and its expected shape or type.
113
+ """
114
+ # ...
115
+ ```
116
+
117
+ Arguments can also be passed directly to `@auto_docstring` for more control. Use the `custom_intro` parameter to add a custom introduction and the `custom_args` parameter to document custom arguments.
118
+
119
+ The `Returns` and `Examples` parts of the docstring can also be manually specified.
120
+
121
+
122
+ ```python
123
+ MODEL_COMMON_CUSTOM_ARGS = r"""
124
+ common_arg_1 (`torch.Tensor`, *optional*, defaults to `default_value`):
125
+ Description of common_arg_1
126
+ common_arg_2 (`torch.Tensor`, *optional*, defaults to `default_value`):
127
+ Description of common_arg_2
128
+ ...
129
+ """
130
+
131
+ class MyModel(PreTrainedModel):
132
+ # ...
133
+ @auto_docstring(
134
+ custom_intro="""
135
+ This is a custom introduction for the function.
136
+ """,
137
+ custom_args=MODEL_COMMON_CUSTOM_ARGS
138
+ )
139
+ def forward(
140
+ self,
141
+ input_ids: Optional[torch.Tensor] = None,
142
+ attention_mask: Optional[torch.Tensor] = None,
143
+ common_arg_1: Optional[torch.Tensor] = None,
144
+ common_arg_2: Optional[torch.Tensor] = None,
145
+ #...
146
+ function_specific_argument: Optional[torch.Tensor] = None,
147
+ # ... other arguments
148
+ ) -> torch.Tensor:
149
+ r"""
150
+ function_specific_argument (`torch.Tensor`, *optional*):
151
+ Description of an argument specific to this function
152
+
153
+ Returns:
154
+ `torch.Tensor`: For a function returning a generic type, a custom "Returns" section can be specified.
155
+
156
+ Example:
157
+
158
+ (To override the default example with a custom one or to add an example for a model class that does not have a pipeline)
159
+
160
+ ```python
161
+ ...
162
+ ```
163
+ """
164
+ # ...
165
+ ```
166
+
167
+ </hfoption>
168
+ </hfoptions>
169
+
170
+ ## Documenting arguments
171
+
172
+ There are some rules for documenting different types of arguments and they're listed below.
173
+
174
+ - Standard arguments (`input_ids`, `attention_mask`, `pixel_values`, etc.) are defined and retrieved from `args_doc.py`. It is the single source of truth for standard arguments and should not be redefined locally if an argument's description and shape is the same as an argument in `args_doc.py`.
175
+
176
+ If a standard argument behaves differently in your model, then you can override it locally in a `r""" """` block. This local definition has a higher priority. For example, the `labels` argument is often customized per model and typically requires overriding.
177
+
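+ A local override for `labels` could look like the following sketch (the shape and wording are illustrative):
+
+ ```py
+ @auto_docstring
+ def forward(self, input_ids, labels: Optional[torch.LongTensor] = None, **kwargs):
+     r"""
+     labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+         Labels for computing the sequence classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
+     """
+     ...
+ ```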
178
+
179
+ - New or custom arguments should be documented within an `r""" """` block after the signature if it is a function or in the `__init__` method's docstring if it is a class.
180
+
181
+ ```py
182
+ argument_name (`type`, *optional*, defaults to `X`):
183
+ Description of the argument.
184
+ Explain its purpose, expected shape/type if complex, and default behavior.
185
+ This can span multiple lines.
186
+ ```
187
+
188
+ * Include `type` in backticks.
189
+ * Add *optional* if the argument is not required or has a default value.
190
+ * Add "defaults to X" if it has a default value. You don't need to add "defaults to `None`" if the default value is `None`.
191
+
192
+ These arguments can also be passed to `@auto_docstring` as a `custom_args` argument. It is used to define the docstring block for new arguments once if they are repeated in multiple places in the modeling file.
193
+
194
+ ```py
195
+ class MyModel(PreTrainedModel):
196
+ # ...
197
+ @auto_docstring(
198
+ custom_intro="""
199
+ This is a custom introduction for the function.
200
+ """,
201
+ custom_args=r"""
202
+ common_arg_1 (`torch.Tensor`, *optional*, defaults to `default_value`):
203
+ Description of common_arg_1
204
+ """
205
+ )
206
+ ```
207
+
208
+ ## Checking the docstrings
209
+
210
+ Transformers includes a utility script to validate the docstrings when you open a Pull Request which triggers CI (continuous integration) checks. The script checks for the following criteria.
211
+
212
+ * Ensures `@auto_docstring` is applied to relevant model classes and public methods.
213
+ * Ensures arguments are complete and consistent. It checks that documented arguments exist in the signature and verifies whether the types and default values in the docstring match the signature. Arguments that aren't known standard arguments and lack a local description are flagged.
214
+ * Reminds you to complete placeholders like `<fill_type>` and `<fill_docstring>`.
215
+ * Ensures docstrings are formatted according to the expected docstring style.
216
+
217
+ You can run this check locally - before committing - by running the following command.
218
+
219
+ ```bash
220
+ make fix-copies
221
+ ```
222
+
223
+ `make fix-copies` runs several other checks as well. If you don't need those checks, run the command below to only perform docstring and auto-docstring checks.
224
+
225
+ ```bash
226
+ python utils/check_docstrings.py # to only check files included in the diff without fixing them
227
+ # python utils/check_docstrings.py --fix_and_overwrite # to fix and overwrite the files in the diff
228
+ # python utils/check_docstrings.py --fix_and_overwrite --check_all # to fix and overwrite all files
229
+ ```
230
+
231
+ ## modular_model.py files
232
+
233
+ When working with modular files (`modular_model.py`), follow the guidelines below for applying `@auto_docstring`.
234
+
235
+ - For standalone models in modular files, apply `@auto_docstring` like you would in a `modeling_model.py` file.
236
+ - For models that inherit from other library models, `@auto_docstring` is automatically carried over to the generated modeling file. You don't need to add `@auto_docstring` in your modular file.
237
+
238
+ If you need to modify the `@auto_docstring` behavior, apply the customized decorator in your modular file. Make sure to **include all other decorators** that are present in the original function or class.
239
+
240
+ > [!WARNING]
241
+ > When overriding any decorator in a modular file, you must include **all** decorators that were applied to that function or class in the parent model. If you only override some decorators, the others won't be included in the generated modeling file.
242
+
243
+ ## How it works
244
+
245
+ The `@auto_docstring` decorator automatically generates docstrings by:
246
+
247
+ 1. Inspecting the signature (arguments, types, defaults) of the decorated class' `__init__` method or the decorated function.
248
+ 2. Retrieving the predefined docstrings for common arguments (`input_ids`, `attention_mask`, etc.) from internal library sources like [`ModelArgs`], [`ImageProcessorArgs`], and the `args_doc.py` file.
249
+ 3. Adding argument descriptions in one of two ways as shown below.
250
+
251
+ | method | description | usage |
252
+ |---|---|---|
253
+ | `r""" """` | add custom docstring content directly to a method signature or within the `__init__` docstring | document new arguments or override standard descriptions |
254
+ | `custom_args` | add custom docstrings for specific arguments directly in `@auto_docstring` | define docstring for new arguments once if they're repeated in multiple places in the modeling file |
255
+
256
+ 4. Adding class and function descriptions. For model classes with standard naming patterns, like `ModelForCausalLM`, or if it belongs to a pipeline, `@auto_docstring` automatically generates the appropriate descriptions with `ClassDocstring` from `args_doc.py`.
257
+
258
+ `@auto_docstring` also accepts the `custom_intro` argument to describe a class or function.
259
+
260
+ 5. Using a templating system to allow predefined docstrings to include dynamic information from Transformers' [auto_modules](https://github.com/huggingface/transformers/tree/main/src/transformers/models/auto) such as `{{processor_class}}` and `{{config_class}}`.
261
+
262
+ 6. Finding appropriate usage examples based on the model's task or pipeline compatibility. It extracts checkpoint information from the model's configuration class to provide concrete examples with real model identifiers.
263
+
264
+ 7. Adding return values to the docstring. For methods like `forward`, the decorator automatically generates the `Returns` field in the docstring based on the method's return type annotation.
265
+
266
+ For example, if a method returns a [`~transformers.utils.ModelOutput`] subclass, `@auto_docstring` extracts the field descriptions from the class' docstring to create a comprehensive return value description. You can also manually specify a custom `Returns` field in a function's docstring.
267
+
268
+ 8. Unrolling kwargs typed with the unpack operator. For specific methods (defined in `UNROLL_KWARGS_METHODS`) or classes (defined in `UNROLL_KWARGS_CLASSES`), the decorator processes `**kwargs` parameters that are typed with `Unpack[KwargsTypedDict]`. It extracts the documentations from the `TypedDict` and adds each parameter to the function's docstring.
269
+
270
+ Currently only supported for [`FastImageProcessorKwargs`].
271
+
272
+ ## Best practices
273
+
274
+ Follow the best practices below to help maintain consistent and informative documentation for Transformers!
275
+
276
+ * Use `@auto_docstring` for new PyTorch model classes ([`PreTrainedModel`] subclasses) and their primary methods like `forward` or `get_text_features`.
277
+ * For classes, `@auto_docstring` retrieves parameter descriptions from the `__init__` method's docstring.
278
+ * Rely on standard docstrings and do not redefine common arguments unless their behavior is different in your model.
279
+ * Document new or custom arguments clearly.
280
+ * Run `check_docstrings` locally and iteratively.
transformers/docs/source/en/backbones.md ADDED
@@ -0,0 +1,155 @@
1
+ <!--Copyright 2024 The HuggingFace Team. All rights reserved.
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
4
+ the License. You may obtain a copy of the License at
5
+
6
+ http://www.apache.org/licenses/LICENSE-2.0
7
+
8
+ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
9
+ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
10
+ specific language governing permissions and limitations under the License.
11
+
12
+ ⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
13
+ rendered properly in your Markdown viewer.
14
+
15
+ -->
16
+
17
+ # Backbones
18
+
19
+ Higher-level computer vision tasks, such as object detection or image segmentation, use several models together to generate a prediction. A separate model is used for the *backbone*, neck, and head. The backbone extracts useful features from an input image into a feature map, the neck combines and processes the feature maps, and the head uses them to make a prediction.
20
+
21
+ <div class="flex justify-center">
22
+ <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/Backbone.png"/>
23
+ </div>
24
+
25
+ Load a backbone with [`~PretrainedConfig.from_pretrained`] and use the `out_indices` parameter to determine which layer, given by the index, to extract a feature map from.
26
+
27
+ ```py
28
+ from transformers import AutoBackbone
29
+
30
+ model = AutoBackbone.from_pretrained("microsoft/swin-tiny-patch4-window7-224", out_indices=(1,))
31
+ ```
32
+
33
+ This guide describes the backbone class, backbones from the [timm](https://hf.co/docs/timm/index) library, and how to extract features with them.
34
+
35
+ ## Backbone classes
36
+
37
+ There are two backbone classes.
38
+
39
+ - [`~transformers.utils.BackboneMixin`] allows you to load a backbone and includes functions for extracting the feature maps and indices.
40
+ - [`~transformers.utils.BackboneConfigMixin`] allows you to set the feature map and indices of a backbone configuration.
41
+
42
+ Refer to the [Backbone](./main_classes/backbones) API documentation to check which models support a backbone.
43
+
44
+ There are two ways to load a Transformers backbone, [`AutoBackbone`] and a model-specific backbone class.
45
+
46
+ <hfoptions id="backbone-classes">
47
+ <hfoption id="AutoBackbone">
48
+
49
+ The [AutoClass](./model_doc/auto) API automatically loads a pretrained vision model with [`~PretrainedConfig.from_pretrained`] as a backbone if it's supported.
50
+
51
+ Set the `out_indices` parameter to the layer you'd like to get the feature map from. If you know the name of the layer, you could also use `out_features`. These parameters can be used interchangeably, but if you use both, make sure they refer to the same layer.
52
+
53
+ When `out_indices` or `out_features` isn't used, the backbone returns the feature map from the last layer. The example code below uses `out_indices=(1,)` to get the feature map from the first layer.
54
+
55
+ <div class="flex justify-center">
56
+ <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/Swin%20Stage%201.png"/>
57
+ </div>
58
+
59
+ ```py
60
+ from transformers import AutoImageProcessor, AutoBackbone
61
+
62
+ model = AutoBackbone.from_pretrained("microsoft/swin-tiny-patch4-window7-224", out_indices=(1,))
63
+ ```
64
+
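+ An equivalent call using the layer name instead of the index might look like the following (the `"stage1"` name is an assumption for this checkpoint; check `model.config.stage_names` to confirm).
+
+ ```py
+ model = AutoBackbone.from_pretrained("microsoft/swin-tiny-patch4-window7-224", out_features=["stage1"])
+ ```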
65
+ </hfoption>
66
+ <hfoption id="model-specific backbone">
67
+
68
+ When you know a model supports a backbone, you can load the backbone and neck directly into the model's configuration. Pass the configuration to the model to initialize it for a task.
69
+
70
+ The example below loads a [ResNet](./model_doc/resnet) backbone and neck for use in a [MaskFormer](./model_doc/maskformer) instance segmentation head.
71
+
72
+ Set `backbone` to a pretrained model and `use_pretrained_backbone=True` to use pretrained weights instead of randomly initialized weights.
73
+
74
+ ```py
75
+ from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation
76
+
77
+ config = MaskFormerConfig(backbone="microsoft/resnet-50", use_pretrained_backbone=True)
78
+ model = MaskFormerForInstanceSegmentation(config)
79
+ ```
80
+
81
+ Another option is to separately load the backbone configuration and then pass it to `backbone_config` in the model configuration.
82
+
83
+ ```py
84
+ from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation, ResNetConfig
85
+
86
+ # instantiate backbone configuration
87
+ backbone_config = ResNetConfig()
88
+ # load backbone in model
89
+ config = MaskFormerConfig(backbone_config=backbone_config)
90
+ # attach backbone to model head
91
+ model = MaskFormerForInstanceSegmentation(config)
92
+ ```
93
+
94
+ </hfoption>
95
+ </hfoptions>
96
+
97
+ ## timm backbones
98
+
99
+ [timm](https://hf.co/docs/timm/index) is a collection of vision models for training and inference. Transformers supports timm models as backbones with the [`TimmBackbone`] and [`TimmBackboneConfig`] classes.
100
+
101
+ Set `use_timm_backbone=True` to load pretrained timm weights, and `use_pretrained_backbone` to use pretrained or randomly initialized weights.
102
+
103
+ ```py
104
+ from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation
105
+
106
+ config = MaskFormerConfig(backbone="resnet50", use_timm_backbone=True, use_pretrained_backbone=True)
107
+ model = MaskFormerForInstanceSegmentation(config)
108
+ ```
109
+
110
+ You could also explicitly call the [`TimmBackboneConfig`] class to load and create a pretrained timm backbone.
111
+
112
+ ```py
113
+ from transformers import TimmBackboneConfig
114
+
115
+ backbone_config = TimmBackboneConfig("resnet50", use_pretrained_backbone=True)
116
+ ```
117
+
118
+ Pass the backbone configuration to the model configuration and instantiate the model head, [`MaskFormerForInstanceSegmentation`], with the backbone.
119
+
120
+ ```py
121
+ from transformers import MaskFormerConfig, MaskFormerForInstanceSegmentation
122
+
123
+ config = MaskFormerConfig(backbone_config=backbone_config)
124
+ model = MaskFormerForInstanceSegmentation(config)
125
+ ```
126
+
127
+ ## Feature extraction
128
+
129
+ The backbone is used to extract image features. Pass an image through the backbone to get the feature maps.
130
+
131
+ Load and preprocess an image and pass it to the backbone. The example below extracts the feature maps from the first layer.
132
+
133
+ ```py
134
+ from transformers import AutoImageProcessor, AutoBackbone
135
+ import torch
136
+ from PIL import Image
137
+ import requests
138
+
139
+ model = AutoBackbone.from_pretrained("microsoft/swin-tiny-patch4-window7-224", out_indices=(1,))
140
+ processor = AutoImageProcessor.from_pretrained("microsoft/swin-tiny-patch4-window7-224")
141
+
142
+ url = "http://images.cocodataset.org/val2017/000000039769.jpg"
143
+ image = Image.open(requests.get(url, stream=True).raw)
144
+
145
+ inputs = processor(image, return_tensors="pt")
146
+ outputs = model(**inputs)
147
+ ```
148
+
149
+ The features are stored in and accessed from the output's `feature_maps` attribute.
150
+
151
+ ```py
152
+ feature_maps = outputs.feature_maps
153
+ list(feature_maps[0].shape)
154
+ [1, 96, 56, 56]
155
+ ```
transformers/docs/source/en/cache_explanation.md ADDED
@@ -0,0 +1,160 @@
1
+ <!--Copyright 2024 The HuggingFace Team. All rights reserved.
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
4
+ the License. You may obtain a copy of the License at
5
+
6
+ http://www.apache.org/licenses/LICENSE-2.0
7
+
8
+ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
9
+ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
10
+ specific language governing permissions and limitations under the License.
11
+
12
+ ⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
13
+ rendered properly in your Markdown viewer.
14
+
15
+ -->
16
+
17
+ # Caching
18
+ Imagine you're having a conversation with someone, and instead of remembering what they previously said, they have to start from scratch every time you respond. This would be slow and inefficient, right?
19
+
20
+ You can extend this analogy to transformer models. Autoregressive model generation can be slow because it makes a prediction one token at a time. Each new prediction is dependent on all the previous context.
21
+
22
+ To predict the 1000th token, the model requires information from the previous 999 tokens. The information is represented as matrix multiplications across the token representations.
23
+
24
+ To predict the 1001st token, you need the same information from the previous 999 tokens in addition to any information from the 1000th token. That is a lot of matrix multiplications a model has to compute over and over for each token!
25
+
26
+ A key-value (KV) cache eliminates this inefficiency by storing kv pairs derived from the attention layers of previously processed tokens. The stored kv pairs are retrieved from the cache and reused for subsequent tokens, avoiding the need to recompute.
27
+
28
+ > [!WARNING]
29
+ > Caching should only be used for **inference**. It may cause unexpected errors if it's enabled during training.
30
+
31
+ To better understand how and why caching works, let's take a closer look at the structure of the attention matrices.
32
+
33
+ ## Attention matrices
34
+
35
+ The **scaled dot-product attention** is calculated as shown below for a batch of size `b`, number of attention heads `h`, sequence length so far `T`, and dimension per attention head `d_head`.
36
+
37
+ $$
38
+ \text{Attention}(Q, K, V) = \text{softmax}\left( \frac{Q K^\top}{\sqrt{d_{\text{head}}}} \times \text{mask} \right) V
39
+ $$
40
+
41
+ The query (`Q`), key (`K`), and value (`V`) matrices are projections from the input embeddings of shape `(b, h, T, d_head)`.
42
+
43
+ For causal attention, the mask prevents the model from attending to future tokens. Once a token is processed, its representation never changes with respect to future tokens, which means \\( K_{\text{past}} \\) and \\( V_{\text{past}} \\) can be cached and reused to compute the last token's representation.
44
+
45
+ $$
46
+ \text{Attention}(q_t, [\underbrace{k_1, k_2, \dots, k_{t-1}}_{\text{cached}}, k_{t}], [\underbrace{v_1, v_2, \dots, v_{t-1}}_{\text{cached}}, v_{t}])
47
+ $$
48
+
49
+ At inference time, you only need the last token's query to compute the representation \\( x_t \\) that predicts the next token \\( t+1 \\). At each step, the new key and value vectors are **stored** in the cache and **appended** to the past keys and values.
50
+
51
+ $$
52
+ K_{\text{cache}} \leftarrow \text{concat}(K_{\text{past}}, k_t), \quad V_{\text{cache}} \leftarrow \text{concat}(V_{\text{past}}, v_t)
53
+ $$
54
+
55
+ Attention is calculated independently in each layer of the model, and caching is done on a per-layer basis.
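+
+ To make the update step concrete, here is a minimal PyTorch sketch of a single layer's cache (not the actual Transformers implementation). It appends the new key and value to the stored ones and computes attention for the latest token only.
+
+ ```py
+ import torch
+ import torch.nn.functional as F
+
+ b, h, d_head = 1, 2, 8
+ torch.manual_seed(0)
+
+ # keys and values for the first 5 tokens, already "cached"
+ K_past = torch.randn(b, h, 5, d_head)
+ V_past = torch.randn(b, h, 5, d_head)
+
+ # projections for the new token t
+ q_t = torch.randn(b, h, 1, d_head)
+ k_t = torch.randn(b, h, 1, d_head)
+ v_t = torch.randn(b, h, 1, d_head)
+
+ # update the cache by appending the new key-value pair
+ K_cache = torch.cat([K_past, k_t], dim=-2)
+ V_cache = torch.cat([V_past, v_t], dim=-2)
+
+ # attention for the last token only, reusing every cached key and value
+ scores = q_t @ K_cache.transpose(-1, -2) / d_head**0.5
+ attn_output = F.softmax(scores, dim=-1) @ V_cache
+ print(attn_output.shape)  # torch.Size([1, 2, 1, 8]), one output row for the new token
+ ```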
56
+
57
+ Refer to the table below to compare how caching improves efficiency.
58
+
59
+ | without caching | with caching |
60
+ |---|---|
61
+ | for each step, recompute all previous `K` and `V` | for each step, only compute current `K` and `V` |
62
+ | attention cost per step is **quadratic** with sequence length | attention cost per step is **linear** with sequence length (memory grows linearly, but compute/token remains low) |
63
+
64
+
65
+
66
+ ## Cache class
67
+
68
+ A basic KV cache interface takes a key and value tensor for the current token and returns the updated `K` and `V` tensors. This is internally managed by a model's `forward` method.
69
+
70
+ ```py
71
+ new_K, new_V = cache.update(k_t, v_t, layer_idx)
72
+ attn_output = attn_layer_idx_fn(q_t, new_K, new_V)
73
+ ```
74
+
75
+ When you use Transformers' [`Cache`] class, the self-attention module performs several critical steps to integrate past and present information.
76
+
77
+ 1. The attention module concatenates current kv pairs with past kv pairs stored in the cache. This creates attention weights with the shape `(new_tokens_length, past_kv_length + new_tokens_length)`. The current and past kv pairs are essentially combined to compute the attention scores, ensuring a model is aware of previous context and the current input.
78
+
79
+ 2. When the `forward` method is called iteratively, it's crucial that the attention mask shape matches the combined length of the past and current kv pairs. The attention mask should have the shape `(batch_size, past_kv_length + new_tokens_length)`. This is typically handled internally in [`~GenerationMixin.generate`], but if you want to implement your own generation loop with [`Cache`], keep this in mind! The attention mask should hold the past and current token values.
80
+
81
+ 3. It is also important to be aware of `cache_position`. This matters if you want to reuse a prefilled [`Cache`] with the `forward` method because you have to pass a valid `cache_position` value. It indicates the input positions in a sequence. `cache_position` is unaffected by padding, and it always adds one more position for each token. For example, if a kv cache contains 10 tokens - regardless of pad tokens - the cache position for the next token should be `torch.tensor([10])`.
82
+
83
+ ## Cache storage implementation
84
+
85
+ The actual storage of key-value pairs varies between cache implementations. As an example, consider the [`DynamicCache`].
86
+
87
+
88
+ In [`DynamicCache`], the key-value pairs are stored as two lists of tensors. Each tensor in the lists has the shape `[batch_size, num_heads, seq_len, head_dim]`.
89
+ - `key_cache`: A list of tensors, one for each layer.
90
+ - `value_cache`: A list of tensors, one for each layer.
91
+
92
+ When new tokens are processed:
93
+
94
+ 1. For each layer, the new key and value states are concatenated with the existing cache.
95
+ ```py
96
+ self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=-2)
97
+ self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_states], dim=-2)
98
+ ```
99
+
100
+ 2. The cache grows dynamically as more tokens are processed. The sequence length dimension (`seq_len`) increases with each new token.
101
+
102
+ The example below demonstrates how to create a generation loop with [`DynamicCache`]. As discussed, the attention mask is a concatenation of past and current token values and `1` is added to the cache position for the next token.
103
+
104
+ ```py
105
+ import torch
106
+ from transformers import AutoTokenizer, AutoModelForCausalLM, DynamicCache
107
+
108
+ model_id = "meta-llama/Llama-2-7b-chat-hf"
109
+ model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="cuda:0")
110
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
111
+
112
+ past_key_values = DynamicCache()
113
+ messages = [{"role": "user", "content": "Hello, what's your name."}]
114
+ inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt", return_dict=True).to("cuda:0")
115
+
116
+ generated_ids = inputs.input_ids
117
+ cache_position = torch.arange(inputs.input_ids.shape[1], dtype=torch.int64, device="cuda:0")
118
+ max_new_tokens = 10
119
+
120
+ for _ in range(max_new_tokens):
121
+ outputs = model(**inputs, cache_position=cache_position, past_key_values=past_key_values, use_cache=True)
122
+ # Greedily sample one next token
123
+ next_token_ids = outputs.logits[:, -1:].argmax(-1)
124
+ generated_ids = torch.cat([generated_ids, next_token_ids], dim=-1)
125
+ # Prepare inputs for the next generation step by leaving unprocessed tokens, in our case we have only one new token
126
+ # and expanding attn mask for the new token, as explained above
127
+ attention_mask = inputs["attention_mask"]
128
+ attention_mask = torch.cat([attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1)
129
+ inputs = {"input_ids": next_token_ids, "attention_mask": attention_mask}
130
+ cache_position = cache_position[-1:] + 1 # add one more position for the next token
131
+
132
+ print(tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0])
133
+ "[INST] Hello, what's your name. [/INST] Hello! My name is LLaMA,"
134
+ ```
135
+ ## Legacy cache format
136
+
137
+ Before the [`Cache`] class, the cache used to be stored as a tuple of tuples of tensors. This format is dynamic because it grows as text is generated, similar to [`DynamicCache`].
138
+
139
+ The legacy format is essentially the same data structure but organized differently.
140
+ - It's a tuple of tuples, where each inner tuple contains the key and value tensors for a layer.
141
+ - The tensors have the same shape `[batch_size, num_heads, seq_len, head_dim]`.
142
+ - The format is less flexible and doesn't support features like quantization or offloading.
143
+
144
+ If your project depends on this legacy format, you can convert between [`DynamicCache`] and a tuple of tuples as shown below with the [`~DynamicCache.from_legacy_cache`] and [`~DynamicCache.to_legacy_cache`] functions. This is helpful if you have custom logic for manipulating a cache in a specific format.
145
+
146
+ ```py
147
+ import torch
148
+ from transformers import AutoTokenizer, AutoModelForCausalLM, DynamicCache
149
+
150
+ tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
151
+ model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map="auto")
152
+ inputs = tokenizer("Hello, my name is", return_tensors="pt").to(model.device)
153
+
154
+ # `return_dict_in_generate=True` is required to return the cache and `return_legacy_cache` forces the returned cache
155
+ # in the legacy format
156
+ generation_outputs = model.generate(**inputs, return_dict_in_generate=True, return_legacy_cache=True, max_new_tokens=5)
157
+
158
+ cache = DynamicCache.from_legacy_cache(generation_outputs.past_key_values)
159
+ legacy_format_cache = cache.to_legacy_cache()
160
+ ```
transformers/docs/source/en/chat_extras.md ADDED
@@ -0,0 +1,299 @@
1
+ <!--Copyright 2024 The HuggingFace Team. All rights reserved.
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
4
+ the License. You may obtain a copy of the License at
5
+
6
+ http://www.apache.org/licenses/LICENSE-2.0
7
+
8
+ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
9
+ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
10
+ specific language governing permissions and limitations under the License.
11
+
12
+ ⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
13
+ rendered properly in your Markdown viewer.
14
+
15
+ -->
16
+
17
+ # Tools and RAG
18
+
19
+ The [`~PreTrainedTokenizerBase.apply_chat_template`] method supports virtually any additional argument types - strings, lists, dicts - besides the chat message. This makes it possible to use chat templates for many use cases.
20
+
21
+ This guide will demonstrate how to use chat templates with tools and retrieval-augmented generation (RAG).
22
+
23
+ ## Tools
24
+
25
+ Tools are functions a large language model (LLM) can call to perform specific tasks. They are a powerful way to extend the capabilities of conversational agents with real-time information, computational tools, or access to large databases.
26
+
27
+ Follow the rules below when creating a tool.
28
+
29
+ 1. The function should have a descriptive name.
30
+ 2. The function arguments must have a type hint in the function header (don't repeat the type hints in the `Args` block).
31
+ 3. The function must have a [Google-style](https://google.github.io/styleguide/pyguide.html#38-comments-and-docstrings) docstring.
32
+ 4. The function can have a return type and `Returns` block, but these are optional because most tool use models ignore them.
33
+
34
+ An example tool to get temperature and wind speed is shown below.
35
+
36
+ ```py
37
+ def get_current_temperature(location: str, unit: str) -> float:
38
+ """
39
+ Get the current temperature at a location.
40
+
41
+ Args:
42
+ location: The location to get the temperature for, in the format "City, Country"
43
+ unit: The unit to return the temperature in. (choices: ["celsius", "fahrenheit"])
44
+ Returns:
45
+ The current temperature at the specified location in the specified units, as a float.
46
+ """
47
+ return 22. # A real function should probably actually get the temperature!
48
+
49
+ def get_current_wind_speed(location: str) -> float:
50
+ """
51
+ Get the current wind speed in km/h at a given location.
52
+
53
+ Args:
54
+ location: The location to get the wind speed for, in the format "City, Country"
55
+ Returns:
56
+ The current wind speed at the given location in km/h, as a float.
57
+ """
58
+ return 6. # A real function should probably actually get the wind speed!
59
+
60
+ tools = [get_current_temperature, get_current_wind_speed]
61
+ ```
62
+
63
+ Load a model and tokenizer that support tool use, like [NousResearch/Hermes-2-Pro-Llama-3-8B](https://hf.co/NousResearch/Hermes-2-Pro-Llama-3-8B). You can also consider a larger model like [Command-R](./model_doc/cohere) or [Mixtral-8x22B](./model_doc/mixtral) if your hardware can support it.
64
+
65
+ ```py
66
+ import torch
67
+ from transformers import AutoModelForCausalLM, AutoTokenizer
68
+
69
+ tokenizer = AutoTokenizer.from_pretrained("NousResearch/Hermes-2-Pro-Llama-3-8B")
+ model = AutoModelForCausalLM.from_pretrained("NousResearch/Hermes-2-Pro-Llama-3-8B", torch_dtype=torch.bfloat16, device_map="auto")
72
+ ```
73
+
74
+ Create a chat message.
75
+
76
+ ```py
77
+ messages = [
78
+ {"role": "system", "content": "You are a bot that responds to weather queries. You should reply with the unit used in the queried location."},
79
+ {"role": "user", "content": "Hey, what's the temperature in Paris right now?"}
80
+ ]
81
+ ```
82
+
83
+ Pass `messages` and a list of tools to [`~PreTrainedTokenizerBase.apply_chat_template`]. Then you can pass the inputs to the model for generation.
84
+
85
+ ```py
86
+ inputs = tokenizer.apply_chat_template(messages, tools=tools, add_generation_prompt=True, return_dict=True, return_tensors="pt")
87
+ inputs = {k: v.to(model.device) for k, v in inputs.items()}
88
+ outputs = model.generate(**inputs, max_new_tokens=128)
89
+ print(tokenizer.decode(outputs[0][len(inputs["input_ids"][0]):]))
90
+ ```
91
+
92
+ ```txt
93
+ <tool_call>
94
+ {"arguments": {"location": "Paris, France", "unit": "celsius"}, "name": "get_current_temperature"}
95
+ </tool_call><|im_end|>
96
+ ```
97
+
98
+ The chat model called the `get_current_temperature` tool with the correct parameters from the docstring. It inferred France as the location based on Paris, and that it should use Celsius for the units of temperature.
99
+
100
+ Now append the `get_current_temperature` function and these arguments to the chat message as `tool_call`. The `tool_call` dictionary should be provided to the `assistant` role instead of the `system` or `user` roles.
101
+
102
+ > [!WARNING]
103
+ > The OpenAI API uses a JSON string as its `tool_call` format. This may cause errors or strange model behavior if used in Transformers, which expects a dict.
104
+
105
+ <hfoptions id="tool-call">
106
+ <hfoption id="Llama">
107
+
108
+ ```py
109
+ tool_call = {"name": "get_current_temperature", "arguments": {"location": "Paris, France", "unit": "celsius"}}
110
+ messages.append({"role": "assistant", "tool_calls": [{"type": "function", "function": tool_call}]})
111
+ ```
112
+
113
+ Allow the assistant to read the function outputs and chat with the user.
114
+
115
+ ```py
116
+ inputs = tokenizer.apply_chat_template(messages, tools=tools, add_generation_prompt=True, return_dict=True, return_tensors="pt")
117
+ inputs = {k: v.to(model.device) for k, v in inputs.items()}
118
+ out = model.generate(**inputs, max_new_tokens=128)
119
+ print(tokenizer.decode(out[0][len(inputs["input_ids"][0]):]))
120
+ ```
121
+
122
+ ```txt
123
+ The temperature in Paris, France right now is approximately 12°C (53.6°F).<|im_end|>
124
+ ```
125
+
126
+ </hfoption>
127
+ <hfoption id="Mistral/Mixtral">
128
+
129
+ For [Mistral](./model_doc/mistral) and [Mixtral](./model_doc/mixtral) models, you need an additional `tool_call_id`. The `tool_call_id` is 9 randomly generated alphanumeric characters assigned to the `id` key in the `tool_call` dictionary.
130
+
131
+ ```py
132
+ tool_call_id = "9Ae3bDc2F"
133
+ tool_call = {"name": "get_current_temperature", "arguments": {"location": "Paris, France", "unit": "celsius"}}
134
+ messages.append({"role": "assistant", "tool_calls": [{"type": "function", "id": tool_call_id, "function": tool_call}]})
135
+ ```
136
+
137
+ ```py
138
+ inputs = tokenizer.apply_chat_template(messages, tools=tools, add_generation_prompt=True, return_dict=True, return_tensors="pt")
139
+ inputs = {k: v.to(model.device) for k, v in inputs.items()}
140
+ out = model.generate(**inputs, max_new_tokens=128)
141
+ print(tokenizer.decode(out[0][len(inputs["input_ids"][0]):]))
142
+ ```
143
+
144
+ </hfoption>
145
+ </hfoptions>
146
+
147
+ ## Schema
148
+
149
+ [`~PreTrainedTokenizerBase.apply_chat_template`] converts functions into a [JSON schema](https://json-schema.org/learn/getting-started-step-by-step) which is passed to the chat template. An LLM never sees the code inside the function. In other words, an LLM doesn't care how the function works technically; it only cares about the function **definition** and **arguments**.
150
+
151
+ The JSON schema is automatically generated behind the scenes as long as your function follows the [rules](#tools) listed above. But you can use [get_json_schema](https://github.com/huggingface/transformers/blob/14561209291255e51c55260306c7d00c159381a5/src/transformers/utils/chat_template_utils.py#L205) to manually generate the schema for more visibility or debugging.
152
+
153
+ ```py
154
+ from transformers.utils import get_json_schema
155
+
156
+ def multiply(a: float, b: float):
157
+ """
158
+ A function that multiplies two numbers
159
+
160
+ Args:
161
+ a: The first number to multiply
162
+ b: The second number to multiply
163
+ """
164
+ return a * b
165
+
166
+ schema = get_json_schema(multiply)
167
+ print(schema)
168
+ ```
169
+
170
+ ```json
171
+ {
172
+ "type": "function",
173
+ "function": {
174
+ "name": "multiply",
175
+ "description": "A function that multiplies two numbers",
176
+ "parameters": {
177
+ "type": "object",
178
+ "properties": {
179
+ "a": {
180
+ "type": "number",
181
+ "description": "The first number to multiply"
182
+ },
183
+ "b": {
184
+ "type": "number",
185
+ "description": "The second number to multiply"
186
+ }
187
+ },
188
+ "required": ["a", "b"]
189
+ }
190
+ }
191
+ }
192
+ ```
193
+
194
+ You can edit the schema or write one entirely from scratch. This gives you a lot of flexibility to define precise schemas for more complex functions.
195
+
196
+ > [!WARNING]
197
+ > Try keeping your function signatures simple and the arguments to a minimum. These are easier for a model to understand and use than complex functions with, for example, nested arguments.
198
+
199
+ The example below demonstrates writing a schema manually and then passing it to [`~PreTrainedTokenizerBase.apply_chat_template`].
200
+
201
+ ```py
202
+ # A simple function that takes no arguments
203
+ current_time = {
204
+ "type": "function",
205
+ "function": {
206
+ "name": "current_time",
207
+ "description": "Get the current local time as a string.",
208
+ "parameters": {
209
+ 'type': 'object',
210
+ 'properties': {}
211
+ }
212
+ }
213
+ }
214
+
215
+ # A more complete function that takes two numerical arguments
216
+ multiply = {
217
+ 'type': 'function',
218
+ 'function': {
219
+ 'name': 'multiply',
220
+ 'description': 'A function that multiplies two numbers',
221
+ 'parameters': {
222
+ 'type': 'object',
223
+ 'properties': {
224
+ 'a': {
225
+ 'type': 'number',
226
+ 'description': 'The first number to multiply'
227
+ },
228
+ 'b': {
229
+ 'type': 'number', 'description': 'The second number to multiply'
230
+ }
231
+ },
232
+ 'required': ['a', 'b']
233
+ }
234
+ }
235
+ }
236
+
237
+ model_input = tokenizer.apply_chat_template(
238
+ messages,
239
+ tools = [current_time, multiply]
240
+ )
241
+ ```
242
+
243
+ ## RAG
244
+
245
+ Retrieval-augmented generation (RAG) models enhance a model's existing knowledge by allowing it to search documents for additional information before responding to a query. For RAG models, add a `documents` parameter to [`~PreTrainedTokenizerBase.apply_chat_template`]. This `documents` parameter should be a list of documents, and each document should be a single dict with `title` and `content` keys.
246
+
247
+ > [!TIP]
248
+ > The `documents` parameter for RAG isn't widely supported and many models have chat templates that ignore `documents`. Verify if a model supports `documents` by reading its model card or executing `print(tokenizer.chat_template)` to see if the `documents` key is present. [Command-R](https://hf.co/CohereForAI/c4ai-command-r-08-2024) and [Command-R+](https://hf.co/CohereForAI/c4ai-command-r-plus-08-2024) both support `documents` in their RAG chat templates.
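+
+ As a quick check, a minimal sketch (assuming the template is exposed as a string or a dict of strings) is to search the raw template for the `documents` variable.
+
+ ```py
+ from transformers import AutoTokenizer
+
+ tokenizer = AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-v01")
+ # a template with RAG support typically references a `documents` variable
+ print("documents" in str(tokenizer.chat_template))
+ ```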
249
+
250
+ Create a list of documents to pass to the model.
251
+
252
+ ```py
253
+ documents = [
254
+ {
255
+ "title": "The Moon: Our Age-Old Foe",
256
+ "text": "Man has always dreamed of destroying the moon. In this essay, I shall..."
257
+ },
258
+ {
259
+ "title": "The Sun: Our Age-Old Friend",
260
+ "text": "Although often underappreciated, the sun provides several notable benefits..."
261
+ }
262
+ ]
263
+ ```
264
+
265
+ Set `chat_template="rag"` in [`~PreTrainedTokenizerBase.apply_chat_template`] and generate a response.
266
+
267
+ ```py
268
+ from transformers import AutoTokenizer, AutoModelForCausalLM
269
+
270
+ # Load the model and tokenizer
271
+ tokenizer = AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-v01-4bit")
272
+ model = AutoModelForCausalLM.from_pretrained("CohereForAI/c4ai-command-r-v01-4bit", device_map="auto")
273
+ device = model.device # Get the device the model is loaded on
274
+
275
+ # Define conversation input
276
+ conversation = [
277
+ {"role": "user", "content": "What has Man always dreamed of?"}
278
+ ]
279
+
280
+ input_ids = tokenizer.apply_chat_template(
281
+ conversation=conversation,
282
+ documents=documents,
283
+ chat_template="rag",
284
+ tokenize=True,
285
+ add_generation_prompt=True,
286
+ return_tensors="pt").to(device)
287
+
288
+ # Generate a response
289
+ generated_tokens = model.generate(
290
+ input_ids,
291
+ max_new_tokens=100,
292
+ do_sample=True,
293
+ temperature=0.3,
294
+ )
295
+
296
+ # Decode and print the generated text along with generation prompt
297
+ generated_text = tokenizer.decode(generated_tokens[0])
298
+ print(generated_text)
299
+ ```
transformers/docs/source/en/chat_templating.md ADDED
@@ -0,0 +1,229 @@
1
+ <!--Copyright 2024 The HuggingFace Team. All rights reserved.
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
4
+ the License. You may obtain a copy of the License at
5
+
6
+ http://www.apache.org/licenses/LICENSE-2.0
7
+
8
+ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
9
+ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
10
+ specific language governing permissions and limitations under the License.
11
+
12
+ ⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
13
+ rendered properly in your Markdown viewer.
14
+
15
+ -->
16
+
17
+ # Templates
18
+
19
+ The [chat pipeline](./conversations) guide introduced [`TextGenerationPipeline`] and the concept of a chat prompt or chat template for conversing with a model. Underlying this high-level pipeline is the [`apply_chat_template`] method. A chat template is a part of the tokenizer and it specifies how to convert conversations into a single tokenizable string in the expected model format.
20
+
21
+ In the example below, Mistral-7B-Instruct and Zephyr-7B are finetuned from the same base model but they’re trained with different chat formats. Without chat templates, you have to manually write formatting code for each model and even minor errors can hurt performance. Chat templates offer a universal way to format chat inputs to any model.
22
+
23
+ <hfoptions id="template">
24
+ <hfoption id="Mistral">
25
+
26
+ ```py
27
+ from transformers import AutoTokenizer
28
+
29
+ tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1")
30
+ chat = [
31
+ {"role": "user", "content": "Hello, how are you?"},
32
+ {"role": "assistant", "content": "I'm doing great. How can I help you today?"},
33
+ {"role": "user", "content": "I'd like to show off how chat templating works!"},
34
+ ]
35
+
36
+ tokenizer.apply_chat_template(chat, tokenize=False)
37
+ ```
38
+ ```md
39
+ <s>[INST] Hello, how are you? [/INST]I'm doing great. How can I help you today?</s> [INST] I'd like to show off how chat templating works! [/INST]
40
+ ```
41
+
42
+ </hfoption>
43
+ <hfoption id="Zephyr">
44
+
45
+ ```py
46
+ from transformers import AutoTokenizer
47
+
48
+ tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
49
+ chat = [
50
+ {"role": "user", "content": "Hello, how are you?"},
51
+ {"role": "assistant", "content": "I'm doing great. How can I help you today?"},
52
+ {"role": "user", "content": "I'd like to show off how chat templating works!"},
53
+ ]
54
+
55
+ tokenizer.apply_chat_template(chat, tokenize=False)
56
+ ```
57
+ ```md
58
+ <|user|>\nHello, how are you?</s>\n<|assistant|>\nI'm doing great. How can I help you today?</s>\n<|user|>\nI'd like to show off how chat templating works!</s>\n
59
+ ```
60
+
61
+ </hfoption>
62
+ </hfoptions>
63
+
64
+ This guide explores [`apply_chat_template`] and chat templates in more detail.
65
+
66
+ ## apply_chat_template
67
+
68
+ Chats should be structured as a list of dictionaries with `role` and `content` keys. The `role` key specifies the speaker (usually `user`, `assistant`, or `system`), and the `content` key contains the message. For the system, the `content` is a high-level description of how the model should behave and respond when you’re chatting with it.
69
+
70
+ Pass your messages to [`apply_chat_template`] to tokenize and format them. You can set [add_generation_prompt](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.add_generation_prompt) to `True` to add the tokens that indicate the start of a response.
71
+
72
+ ```py
73
+ import torch
74
+ from transformers import AutoModelForCausalLM, AutoTokenizer
75
+
76
+ tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
77
+ model = AutoModelForCausalLM.from_pretrained("HuggingFaceH4/zephyr-7b-beta", device_map="auto", torch_dtype=torch.bfloat16)
78
+
79
+ messages = [
80
+ {"role": "system", "content": "You are a friendly chatbot who always responds in the style of a pirate",},
81
+ {"role": "user", "content": "How many helicopters can a human eat in one sitting?"},
82
+ ]
83
+ tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt")
84
+ print(tokenizer.decode(tokenized_chat[0]))
85
+ ```
86
+ ```md
87
+ <|system|>
88
+ You are a friendly chatbot who always responds in the style of a pirate</s>
89
+ <|user|>
90
+ How many helicopters can a human eat in one sitting?</s>
91
+ <|assistant|>
92
+ ```
93
+
94
+ Now pass the tokenized chat to [`~GenerationMixin.generate`] to generate a response.
95
+
96
+ ```py
97
+ outputs = model.generate(tokenized_chat, max_new_tokens=128)
98
+ print(tokenizer.decode(outputs[0]))
99
+ ```
100
+ ```md
101
+ <|system|>
102
+ You are a friendly chatbot who always responds in the style of a pirate</s>
103
+ <|user|>
104
+ How many helicopters can a human eat in one sitting?</s>
105
+ <|assistant|>
106
+ Matey, I'm afraid I must inform ye that humans cannot eat helicopters. Helicopters are not food, they are flying machines. Food is meant to be eaten, like a hearty plate o' grog, a savory bowl o' stew, or a delicious loaf o' bread. But helicopters, they be for transportin' and movin' around, not for eatin'. So, I'd say none, me hearties. None at all.
107
+ ```
108
+
109
+ ### add_generation_prompt
110
+ The [add_generation_prompt](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.add_generation_prompt) parameter adds tokens that indicate the start of a response. This ensures the chat model generates a system response instead of continuing a user's message.
111
+
112
+ Not all models require generation prompts, and some models, like [Llama](./model_doc/llama), don’t have any special tokens before the system response. In this case, [add_generation_prompt](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.add_generation_prompt) has no effect.
113
+
114
+ ```py
115
+ tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
116
+ tokenized_chat
117
+ ```
118
+ ```md
119
+ <|im_start|>user
120
+ Hi there!<|im_end|>
121
+ <|im_start|>assistant
122
+ Nice to meet you!<|im_end|>
123
+ <|im_start|>user
124
+ Can I ask a question?<|im_end|>
125
+ ```
126
+
127
+ ### continue_final_message
128
+
129
+ The [continue_final_message](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.continue_final_message) parameter controls whether the final message in the chat should be continued instead of starting a new one. It removes end-of-sequence tokens so that the model continues generation from the final message.
130
+
131
+ This is useful for “prefilling” a model response. In the example below, the model generates text that continues the JSON string rather than starting a new message. It can be very useful for improving the accuracy of instruction following when you know how to start its replies.
132
+
133
+ ```py
134
+ chat = [
135
+ {"role": "user", "content": "Can you format the answer in JSON?"},
136
+ {"role": "assistant", "content": '{"name": "'},
137
+ ]
138
+
139
+ formatted_chat = tokenizer.apply_chat_template(chat, tokenize=True, return_dict=True, continue_final_message=True)
140
+ model.generate(**formatted_chat)
141
+ ```
142
+
143
+ > [!WARNING]
144
+ > You shouldn’t use [add_generation_prompt](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.add_generation_prompt) and [continue_final_message](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.continue_final_message) together. The former adds tokens that start a new message, while the latter removes end of sequence tokens. Using them together returns an error.
145
+
146
+ [`TextGenerationPipeline`] sets [add_generation_prompt](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.add_generation_prompt) to `True` by default to start a new message. However, if the final message in the chat has the “assistant” role, it assumes the message is a prefill and switches to `continue_final_message=True`. This is because most models don’t support multiple consecutive assistant messages. To override this behavior, explicitly pass the [continue_final_message](https://huggingface.co/docs/transformers/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template.continue_final_message) parameter to the pipeline.
147
+
148
+ ## Multiple templates
149
+
150
+ A model may have several different templates for different use cases. For example, a model may have a template for regular chat, tool use, and RAG.
151
+
152
+ When there are multiple templates, the chat template is a dictionary. Each key corresponds to the name of a template. [`apply_chat_template`] handles multiple templates based on their name. It looks for a template named `default` in most cases and if it can’t find one, it raises an error.
153
+
154
+ For a tool calling template, if a user passes a `tools` parameter and a `tool_use` template exists, the tool calling template is used instead of `default`.
155
+
156
+ To access templates with other names, pass the template name to the `chat_template` parameter in [`apply_chat_template`]. For example, if you’re using a RAG template then set `chat_template="rag"`.
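+
+ As a minimal sketch (assuming the checkpoint actually ships named templates like `tool_use` or `rag`), you could inspect and select a template by name as shown below.
+
+ ```py
+ from transformers import AutoTokenizer
+
+ tokenizer = AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-v01")
+
+ # with multiple templates, `chat_template` is a dict keyed by template name
+ if isinstance(tokenizer.chat_template, dict):
+     print(list(tokenizer.chat_template.keys()))
+
+ messages = [{"role": "user", "content": "What has Man always dreamed of?"}]
+ # select a named template explicitly instead of the default one
+ prompt = tokenizer.apply_chat_template(messages, chat_template="rag", documents=[], tokenize=False)
+ ```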
157
+
158
+ It can be confusing to manage multiple templates though, so we recommend using a single template for all use cases. Use Jinja statements like `if tools is defined` and `{% macro %}` definitions to wrap multiple code paths in a single template.
159
+
160
+ ## Template selection
161
+
162
+ It is important to set a chat template format that matches the template format a model was pretrained on, otherwise performance may suffer. Even if you’re training the model further, performance is best if the chat tokens are kept constant.
163
+
164
+ But if you’re training a model from scratch or finetuning a model for chat, you have more options to select a template. For example, [ChatML](https://github.com/openai/openai-python/blob/release-v0.28.0/chatml.md) is a popular format that is flexible enough to handle many use cases. It even includes support for [generation prompts](#add_generation_prompt), but it doesn’t add beginning-of-string (`BOS`) or end-of-string (`EOS`) tokens. If your model expects `BOS` and `EOS` tokens, set `add_special_tokens=True` and make sure to add them to your template.
165
+
166
+ ```jinja
167
+ {%- for message in messages %}
168
+ {{- '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n' }}
169
+ {%- endfor %}
170
+ ```
171
+
172
+ Set the template with the following logic to support [generation prompts](#add_generation_prompt). The template wraps each message with `<|im_start|>` and `<|im_end|>` tokens and writes the role as a string. This allows you to easily customize the roles you want to train with.
173
+
174
+ ```py
175
+ tokenizer.chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
176
+ ```
177
+
178
+ The `user`, `system` and `assistant` roles are standard roles in chat templates. We recommend using these roles when it makes sense, especially if you’re using your model with the [`TextGenerationPipeline`].
179
+
180
+ ```md
181
+ <|im_start|>system
182
+ You are a helpful chatbot that will do its best not to say anything so stupid that people tweet about it.<|im_end|>
183
+ <|im_start|>user
184
+ How are you?<|im_end|>
185
+ <|im_start|>assistant
186
+ I'm doing great!<|im_end|>
187
+ ```
188
+
189
+ ## Model training
190
+
191
+ Training a model with a chat template is a good way to ensure a chat template matches the tokens a model is trained on. Apply the chat template as a preprocessing step to your dataset. Set `add_generation_prompt=False` because the additional tokens to prompt an assistant response aren’t helpful during training.
192
+
193
+ An example of preprocessing a dataset with a chat template is shown below.
194
+
195
+ ```py
196
+ from transformers import AutoTokenizer
197
+ from datasets import Dataset
198
+
199
+ tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
200
+
201
+ chat1 = [
202
+ {"role": "user", "content": "Which is bigger, the moon or the sun?"},
203
+ {"role": "assistant", "content": "The sun."}
204
+ ]
205
+ chat2 = [
206
+ {"role": "user", "content": "Which is bigger, a virus or a bacterium?"},
207
+ {"role": "assistant", "content": "A bacterium."}
208
+ ]
209
+
210
+ dataset = Dataset.from_dict({"chat": [chat1, chat2]})
211
+ dataset = dataset.map(lambda x: {"formatted_chat": tokenizer.apply_chat_template(x["chat"], tokenize=False, add_generation_prompt=False)})
212
+ print(dataset['formatted_chat'][0])
213
+ ```
214
+ ```md
215
+ <|user|>
216
+ Which is bigger, the moon or the sun?</s>
217
+ <|assistant|>
218
+ The sun.</s>
219
+ ```
220
+
221
+ After this step, you can continue following the [training recipe](./tasks/language_modeling) for causal language models using the `formatted_chat` column.
222
+
223
+ Some tokenizers add special `<bos>` and `<eos>` tokens. Chat templates should already include all the necessary special tokens, and adding more special tokens on top is often incorrect or duplicates them, hurting model performance. When you format text with `apply_chat_template(tokenize=False)` and tokenize it later, make sure you set `add_special_tokens=False` to avoid duplicating them.
224
+
225
+ ```py
226
+ apply_chat_template(messages, tokenize=False, add_special_tokens=False)
227
+ ```
228
+
229
+ This isn’t an issue if you use `apply_chat_template(tokenize=True)` instead.
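+
+ As a concrete sketch of the pattern described above (formatting first, then tokenizing separately, with hypothetical messages), you could do the following.
+
+ ```py
+ from transformers import AutoTokenizer
+
+ tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
+ messages = [{"role": "user", "content": "Which is bigger, the moon or the sun?"}]
+
+ # the chat template already inserts any special tokens it needs
+ formatted_chat = tokenizer.apply_chat_template(messages, tokenize=False)
+
+ # so don't let the tokenizer add them a second time
+ inputs = tokenizer(formatted_chat, add_special_tokens=False, return_tensors="pt")
+ ```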
transformers/docs/source/en/chat_templating_multimodal.md ADDED
@@ -0,0 +1,243 @@
1
+ <!--Copyright 2025 The HuggingFace Team. All rights reserved.
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
4
+ the License. You may obtain a copy of the License at
5
+
6
+ http://www.apache.org/licenses/LICENSE-2.0
7
+
8
+ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
9
+ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
10
+ specific language governing permissions and limitations under the License.
11
+
12
+ ⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
13
+ rendered properly in your Markdown viewer.
14
+
15
+ -->
16
+
17
+ # Multimodal templates
18
+
19
+ Multimodal model chat templates expect a similar [template](./chat_templating) format as text-only models. They need `messages` that include dictionaries with `role` and `content` keys.
20
+
21
+ Multimodal templates are included in the [Processor](./processors) class and require an additional `type` key for specifying whether the included content is an image, video, or text.
22
+
23
+ This guide will show you how to format chat templates for multimodal models as well as some best practices for configuring the template.
24
+
25
+ ## ImageTextToTextPipeline
26
+
27
+ [`ImageTextToTextPipeline`] is a high-level image and text generation class with a “chat mode”. Chat mode is enabled when a conversational model is detected and the chat prompt is [properly formatted](./llm_tutorial#wrong-prompt-format).
28
+
29
+ Start by building a chat history with the following two roles.
30
+
31
+ - `system` describes how the model should behave and respond when you’re chatting with it. This role isn’t supported by all chat models.
32
+ - `user` is where you enter your first message to the model.
33
+
34
+ ```py
35
+ messages = [
36
+ {
37
+ "role": "system",
38
+ "content": [{"type": "text", "text": "You are a friendly chatbot who always responds in the style of a pirate"}],
39
+ },
40
+ {
41
+ "role": "user",
42
+ "content": [
43
+ {"type": "image", "url": "http://images.cocodataset.org/val2017/000000039769.jpg"},
44
+ {"type": "text", "text": "What are these?"},
45
+ ],
46
+ },
47
+ ]
48
+ ```
49
+
50
+ Create an [`ImageTextToTextPipeline`] and pass the chat to it. For large models, setting [device_map=“auto”](./models#big-model-inference) helps load the model more quickly and automatically places it on the fastest device available. Changing the data type to [torch.bfloat16](./models#model-data-type) also helps save memory.
51
+
52
+ > [!TIP]
53
+ > The [`ImageTextToTextPipeline`] accepts chats in the OpenAI format to make inference easier and more accessible.
54
+
55
+ ```python
56
+ import torch
57
+ from transformers import pipeline
58
+
59
+ pipeline = pipeline("image-text-to-text", model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf", device_map="auto", torch_dtype=torch.float16)
60
+ pipeline(text=messages, max_new_tokens=50, return_full_text=False)
61
+ [{'input_text': [{'role': 'system',
62
+ 'content': [{'type': 'text',
63
+ 'text': 'You are a friendly chatbot who always responds in the style of a pirate'}]},
64
+ {'role': 'user',
65
+ 'content': [{'type': 'image',
66
+ 'url': 'http://images.cocodataset.org/val2017/000000039769.jpg'},
67
+ {'type': 'text', 'text': 'What are these?'}]}],
68
+ 'generated_text': 'The image shows two cats lying on a pink surface, which appears to be a cushion or a soft blanket. The cat on the left has a striped coat, typical of tabby cats, and is lying on its side with its head resting on the'}]
69
+ ```
70
+
71
+ ## Image inputs
72
+
73
+ For multimodal models that accept images like [LLaVA](./model_doc/llava), include the following in `content` as shown below.
74
+
75
+ - The content `"type"` can be an `"image"` or `"text"`.
76
+ - For images, it can be a link to the image (`"url"`), a file path (`"path"`), or `"base64"`. Images are automatically loaded, processed, and prepared into pixel values as inputs to the model.
77
+
78
+ ```python
79
+ from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration
80
+
81
+ model = LlavaOnevisionForConditionalGeneration.from_pretrained("llava-hf/llava-onevision-qwen2-0.5b-ov-hf")
82
+ processor = AutoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-0.5b-ov-hf")
83
+
84
+ messages = [
85
+ {
86
+ "role": "system",
87
+ "content": [{"type": "text", "text": "You are a friendly chatbot who always responds in the style of a pirate"}],
88
+ },
89
+ {
90
+ "role": "user",
91
+ "content": [
92
+ {"type": "image", "url": "http://images.cocodataset.org/val2017/000000039769.jpg"},
93
+ {"type": "text", "text": "What are these?"},
94
+ ],
95
+ },
96
+ ]
97
+ ```
98
+
99
+ Pass `messages` to [`~ProcessorMixin.apply_chat_template`] to tokenize the input content and return the `input_ids` and `pixel_values`.
100
+
101
+ ```py
102
+ processed_chat = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt")
103
+ print(processed_chat.keys())
104
+ ```
105
+
106
+ These inputs are now ready to be used in [`~GenerationMixin.generate`].
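+
+ From here, a minimal sketch of generation and decoding (reusing the `model`, `processor`, and `processed_chat` objects from above) might look like this.
+
+ ```py
+ import torch
+
+ with torch.no_grad():
+     generated_ids = model.generate(**processed_chat, max_new_tokens=50)
+
+ # decode only the newly generated tokens
+ new_tokens = generated_ids[:, processed_chat["input_ids"].shape[1]:]
+ print(processor.batch_decode(new_tokens, skip_special_tokens=True)[0])
+ ```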
107
+
108
+ ## Video inputs
109
+
110
+ Some vision models also support video inputs. The message format is very similar to the format for [image inputs](#image-inputs).
111
+
112
+ - The content `"type"` should be `"video"` to indicate the content is a video.
113
+ - For videos, it can be a link to the video (`"url"`) or it could be a file path (`"path"`). Videos loaded from a URL can only be decoded with [PyAV](https://pyav.basswood-io.com/docs/stable/) or [Decord](https://github.com/dmlc/decord).
114
+
115
+ > [!WARNING]
116
+ > Loading a video from `"url"` is only supported by the PyAV or Decord backends.
117
+
118
+ ```python
119
+ from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration
120
+
121
+ model_id = "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
122
+ model = LlavaOnevisionForConditionalGeneration.from_pretrained(model_id)
123
+ processor = AutoProcessor.from_pretrained(model_id)
124
+
125
+ messages = [
126
+ {
127
+ "role": "system",
128
+ "content": [{"type": "text", "text": "You are a friendly chatbot who always responds in the style of a pirate"}],
129
+ },
130
+ {
131
+ "role": "user",
132
+ "content": [
133
+ {"type": "video", "url": "https://test-videos.co.uk/vids/bigbuckbunny/mp4/h264/720/Big_Buck_Bunny_720_10s_10MB.mp4"},
134
+ {"type": "text", "text": "What do you see in this video?"},
135
+ ],
136
+ },
137
+ ]
138
+ ```
139
+
140
+ Pass `messages` to [`~ProcessorMixin.apply_chat_template`] to tokenize the input content. There are a few extra parameters to include in [`~ProcessorMixin.apply_chat_template`] that control the frame sampling process.
141
+
142
+ The `video_load_backend` parameter refers to a specific framework to load a video. It supports [PyAV](https://pyav.basswood-io.com/docs/stable/), [Decord](https://github.com/dmlc/decord), [OpenCV](https://github.com/opencv/opencv), and [torchvision](https://pytorch.org/vision/stable/index.html).
143
+
144
+ The examples below use Decord as the backend because it is a bit faster than PyAV.
145
+
146
+ <hfoptions id="sampling">
147
+ <hfoption id="fixed number of frames">
148
+
149
+ The `num_frames` parameter controls how many frames to uniformly sample from the video. Each checkpoint has a maximum frame count it was pretrained with and exceeding this count can significantly lower generation quality. It's important to choose a frame count that fits both the model capacity and your hardware resources. If `num_frames` isn't specified, the entire video is loaded without any frame sampling.
150
+
151
+
152
+ ```python
153
+ processed_chat = processor.apply_chat_template(
154
+ messages,
155
+ add_generation_prompt=True,
156
+ tokenize=True,
157
+ return_dict=True,
158
+ return_tensors="pt",
159
+ num_frames=32,
160
+ video_load_backend="decord",
161
+ )
162
+ print(processed_chat.keys())
163
+ ```
164
+
165
+ These inputs are now ready to be used in [`~GenerationMixin.generate`].
166
+
167
+ </hfoption>
168
+ <hfoption id="fps">
169
+
170
+ For longer videos, it may be better to sample more frames for better representation with the `video_fps` parameter. This determines how many frames per second to extract. As an example, if a video is 10 seconds long and `video_fps=2`, then the model samples 20 frames. In other words, 2 frames are uniformly sampled every second.
171
+
172
+ ```py
173
+ processed_chat = processor.apply_chat_template(
174
+ messages,
175
+ add_generation_prompt=True,
176
+ tokenize=True,
177
+ return_dict=True,
178
+ video_fps=16,
179
+ video_load_backend="decord",
180
+ )
181
+ print(processed_chat.keys())
182
+ ```
183
+
184
+ </hfoption>
185
+ <hfoption id="list of image frames">
186
+
187
+ Videos may also exist as a set of sampled frames stored as images rather than the full video file.
188
+
189
+ In this case, pass a list of image file paths and the processor automatically concatenates them into a video. Make sure all images are the same size since they are assumed to be from the same video.
190
+
191
+ ```py
192
+ frames_paths = ["/path/to/frame0.png", "/path/to/frame5.png", "/path/to/frame10.png"]
193
+ messages = [
194
+ {
195
+ "role": "system",
196
+ "content": [{"type": "text", "text": "You are a friendly chatbot who always responds in the style of a pirate"}],
197
+ },
198
+ {
199
+ "role": "user",
200
+ "content": [
201
+ {"type": "video", "path": frames_paths},
202
+ {"type": "text", "text": "What do you see in this video?"},
203
+ ],
204
+ },
205
+ ]
206
+
207
+ processed_chat = processor.apply_chat_template(
208
+ messages,
209
+ add_generation_prompt=True,
210
+ tokenize=True,
211
+ return_dict=True,
212
+ )
213
+ print(processed_chat.keys())
214
+ ```
215
+
216
+ </hfoption>
217
+ </hfoptions>
218
+
219
+ ## Template configuration
220
+
221
+ You can create a custom chat template with [Jinja](https://jinja.palletsprojects.com/en/3.1.x/templates/) and set it with [`~ProcessorMixin.apply_chat_template`]. Refer to the [Template writing](./chat_templating_writing) guide for more details.
222
+
223
+ For example, to enable a template to handle a *list of content* from multiple modalities while still supporting plain strings for text-only inference, specify how to handle the `content['type']` if it is an image or text as shown below in the Llama 3.2 Vision Instruct [template](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct/blob/main/chat_template.json).
224
+
225
+ ```jinja
226
+ {% for message in messages %}
227
+ {% if loop.index0 == 0 %}{{ bos_token }}{% endif %}
228
+ {{ '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' }}
229
+ {% if message['content'] is string %}
230
+ {{ message['content'] }}
231
+ {% else %}
232
+ {% for content in message['content'] %}
233
+ {% if content['type'] == 'image' %}
234
+ {{ '<|image|>' }}
235
+ {% elif content['type'] == 'text' %}
236
+ {{ content['text'] }}
237
+ {% endif %}
238
+ {% endfor %}
239
+ {% endif %}
240
+ {{ '<|eot_id|>' }}
241
+ {% endfor %}
242
+ {% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}{% endif %}
243
+ ```
transformers/docs/source/en/chat_templating_writing.md ADDED
@@ -0,0 +1,251 @@
1
+ <!--Copyright 2024 The HuggingFace Team. All rights reserved.
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
4
+ the License. You may obtain a copy of the License at
5
+
6
+ http://www.apache.org/licenses/LICENSE-2.0
7
+
8
+ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
9
+ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
10
+ specific language governing permissions and limitations under the License.
11
+
12
+ ⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
13
+ rendered properly in your Markdown viewer.
14
+
15
+ -->
16
+
17
+ # Template writing
18
+
19
+ A chat template is a [Jinja](https://jinja.palletsprojects.com/en/3.1.x/templates/) template stored in the tokenizer's [chat_template](https://huggingface.co/docs/transformers/main_classes/tokenizer#transformers.PreTrainedTokenizer.chat_template) attribute. Jinja is a templating language that allows you to write Python-like code and syntax. A chat template performs the following three roles.
20
+
21
+ 1. Print the role enclosed in `<|` and `|>` (`<|user|>`, `<|assistant|>`, etc.).
22
+ 2. Print the message followed by an end-of-sequence (`EOS`) token.
23
+ 3. Print the assistant token if [add_generation_prompt=True](./chat_templating#add_generation_prompt) so the model generates an assistant response.
24
+
25
+ An example template is shown below.
26
+
27
+ ```jinja
28
+ {%- for message in messages %}
29
+ {{- '<|' + message['role'] + '|>\n' }}
30
+ {{- message['content'] + eos_token }}
31
+ {%- endfor %}
32
+ {%- if add_generation_prompt %}
33
+ {{- '<|assistant|>\n' }}
34
+ {%- endif %}
35
+ ```
36
+
37
+ The template can be customized to handle more complex use cases. This guide will show you how to add and edit templates and includes template writing tips.
38
+
39
+ ## Create a template
40
+
41
+ Create a template by writing a Jinja template and then setting it as the chat template in the tokenizer. For example, the template below adds `[ASST]` and `[/ASST]` tags to the assistant messages.
42
+
43
+ ```jinja
44
+ {%- for message in messages %}
45
+ {%- if message['role'] == 'user' %}
46
+ {{- bos_token + '[INST] ' + message['content'].strip() + ' [/INST]' }}
47
+ {%- elif message['role'] == 'system' %}
48
+ {{- '<<SYS>>\\n' + message['content'].strip() + '\\n<</SYS>>\\n\\n' }}
49
+ {%- elif message['role'] == 'assistant' %}
50
+ {{- '[ASST] ' + message['content'] + ' [/ASST]' + eos_token }}
51
+ {%- endif %}
52
+ {%- endfor %}
53
+ ```
54
+
55
+ Set the template in the tokenizer, and the next time you use [`~PreTrainedTokenizerBase.apply_chat_template`], the new template is used.
56
+
57
+ ```py
58
+ template = tokenizer.chat_template
59
+ template = template.replace("SYS", "SYSTEM") # Change the system token
60
+ tokenizer.chat_template = template # Set the new template
61
+ ```
62
+
63
+ The template is saved in the `tokenizer_config.json` file. Upload it to the Hub with [`~PreTrainedTokenizer.push_to_hub`] so you can reuse it later and make sure everyone is using the right template for your model.
64
+
65
+ ```py
66
+ tokenizer.push_to_hub("model_name")
67
+ ```
68
+
69
+ ## Template writing tips
70
+
71
+ The easiest way to start writing Jinja templates is to refer to existing templates. Use `print(tokenizer.chat_template)` on any chat model to see what template it's using. Try starting with simple models that don't call any tools or support RAG. Finally, take a look at the [Jinja documentation](https://jinja.palletsprojects.com/en/3.1.x/templates/#synopsis) for more details about formatting and syntax.
72
+
73
+ This section curates some best practices for writing clean and efficient Jinja templates.
74
+
75
+ ### Trimming whitespace
76
+
77
+ Jinja prints any whitespace before or after a block of text. This can be an issue for chat templates because whitespace usage should be intentional. Add `-` to strip any whitespace before a block.
78
+
79
+ ```jinja
80
+ {%- for message in messages %}
81
+ {{- message['role'] + message['content'] }}
82
+ {%- endfor %}
83
+ ```
84
+
85
+ The incorrect whitespace usage example below may introduce a newline and indentation in the output.
86
+
87
+ ```jinja
88
+ {% for message in messages %}
89
+ {{ message['role'] + message['content'] }}
90
+ {% endfor %}
91
+ ```
92
+
93
+ ### Special variables
94
+
95
+ There are five special variables available inside a template. You can pass virtually any additional arguments to [`~PreTrainedTokenizerBase.apply_chat_template`] and they will be available inside the template as variables. However, you should try to keep the number of variables to the five below to make it easier for users to use the chat model without writing custom code to handle model-specific arguments.
96
+
97
+ - `messages` contains the chat history as a list of message dicts.
98
+ - `tools` contains a list of tools in JSON schema format.
99
+ - `documents` contains a list of documents with the format `{"title": "Title", "contents": "Contents"}` (designed for RAG models).
100
+ - `add_generation_prompt` is a boolean that determines whether to add an assistant header at the end of the conversation.
101
+ - `bos_token` and `eos_token` are special tokens extracted from a tokenizer's `special_tokens_map`.
102
+
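The sketch below shows how these variables are typically passed. It assumes `tokenizer` is already loaded and that its template actually reads `documents`; templates that don't use a given variable simply ignore it.

```py
messages = [{"role": "user", "content": "Summarize the quarterly report."}]
documents = [
    {"title": "Quarterly report", "contents": "Revenue grew 10% quarter over quarter."},
]

text = tokenizer.apply_chat_template(
    messages,
    documents=documents,          # only used by RAG-style templates
    add_generation_prompt=True,   # append the assistant header
    tokenize=False,
)
print(text)
```
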
103
+ ### Callable functions
104
+
105
+ There are two callable functions available inside a template, and both are demonstrated in the sketch after the list.
106
+
107
+ - `raise_exception(msg)` raises a `TemplateException`. This is useful for debugging or warning users about incorrect template usage.
108
+ - `strftime_now(format_str)` retrieves the current date and time in a specific format which could be useful to include in system messages. It is equivalent to [datetime.now().strftime(format_str)](https://docs.python.org/3/library/datetime.html#datetime.datetime.now) in Python.
109
+
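The sketch below uses both functions: it stamps the prompt with the current date and rejects unknown roles. The role names and formatting are illustrative only.

```py
tokenizer.chat_template = (
    "{{- 'Today is ' + strftime_now('%Y-%m-%d') + '.\\n' }}"
    "{%- for message in messages %}"
    "{%- if message['role'] not in ['system', 'user', 'assistant'] %}"
    "{{- raise_exception('Unknown role: ' + message['role']) }}"
    "{%- endif %}"
    "{{- '<|' + message['role'] + '|>\\n' + message['content'] + '\\n' }}"
    "{%- endfor %}"
)
```
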
110
+ ### Compatibility with non-Python Jinja
111
+
112
+ Jinja is implemented in multiple languages and they generally have the same syntax. Writing a template in Python allows you to use Python methods such as [lower](https://docs.python.org/3/library/stdtypes.html#str.lower) on strings or [items](https://docs.python.org/3/library/stdtypes.html#dict.items) on dicts. But this won't work if the template is used in a non-Python implementation, for example, when deploying with Javascript or Rust.
113
+
114
+ Make the changes below to ensure compatibility across all Jinja implementations; a before-and-after example follows the list.
115
+
116
+ - Replace Python methods with Jinja filters. For example, replace `string.lower()` with `string|lower` or `dict.items()` with `dict|items`. Most of the changes follow the same pattern except `string.strip()`, which is replaced with `string|trim`. Refer to the list of [built-in filters](https://jinja.palletsprojects.com/en/3.1.x/templates/#builtin-filters) for a complete list of filters.
117
+ - Replace `True`, `False`, and `None` (these are Python specific) with `true`, `false`, and `none` respectively.
118
+ - Directly rendering a dict or list may return different results in other implementations. For example, string entries may change from single-quote to double-quote. To avoid this, add the [tojson](https://jinja.palletsprojects.com/en/3.1.x/templates/#jinja-filters.tojson) filter to maintain consistency.
119
+
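For example, here is a Python-specific expression next to its portable equivalent (shown as template fragments inside Python strings).

```py
# Works only in the Python implementation of Jinja
non_portable = "{{- message['content'].strip().lower() }}"

# Portable across Jinja implementations: filters instead of Python methods
portable = "{{- message['content']|trim|lower }}"

# Likewise, prefer `message|items` over `message.items()` and
# `true`/`false`/`none` over `True`/`False`/`None`
```
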
120
+ ### Big templates
121
+
122
+ Newer models or models with features like [tool-calling](./chat_extras#tools) and [RAG](./chat_extras#retrieval-augmented-generation-rag) require larger templates that can be longer than 100 lines. It may be easier to write larger templates in a separate file. The line numbers in the separate file correspond exactly to the line numbers in template parsing or execution errors, making it easier to debug any potential issues.
123
+
124
+ Extract the existing chat template to a separate file where it is easier to edit.
125
+
126
+ ```py
127
+ open("template.jinja", "w").write(tokenizer.chat_template)
128
+ ```
129
+
130
+ You could also load an edited template back into the tokenizer.
131
+
132
+ ```py
133
+ tokenizer.chat_template = open("template.jinja").read()
134
+ ```
135
+
136
+ ## Templates for tools
137
+
138
+ There isn't a specific format for writing templates for tools but it is best to follow the standard API. This ensures the template is widely accessible across models without requiring users to write custom code to use tools with your model.
139
+
140
+ > [!WARNING]
141
+ > Formatting such as whitespace and special tokens are model-specific. Make sure everything exactly matches the format a model was trained with.
142
+
143
+ The following section lists elements of the standard API for writing templates for tools.
144
+
145
+ ### Tool definitions
146
+
147
+ Transformers chat template methods allow a user to pass tools as Python functions or a JSON schema. When functions are passed, a JSON schema is automatically generated and passed to the template. The `tools` variable in a template always takes a list of JSON schemas.
148
+
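For example, a function like the sketch below would be converted into a JSON schema similar to the one shown further down. This assumes `tokenizer` and `messages` are already defined and that the model's template supports tools; the conversion generally expects type hints and a docstring that describes each argument.

```py
def multiply(a: float, b: float) -> float:
    """
    A function that multiplies two numbers

    Args:
        a: The first number to multiply
        b: The second number to multiply
    """
    return a * b

# The function is converted to a JSON schema and handed to the template
text = tokenizer.apply_chat_template(
    messages,
    tools=[multiply],
    add_generation_prompt=True,
    tokenize=False,
)
```
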
149
+ The specific tokens and tool descriptions should match the ones your model was trained with. Your model doesn't need to understand the JSON schema input because your template can translate the JSON schema into your model's format. For example, [Command-R](./model_doc/cohere) was trained with tools defined with Python function headers, but the Command-R tool template accepts JSON schemas. The template internally converts types and renders the input tools as Python headers.
150
+
151
+ ```json
152
+ {
153
+ "type": "function",
154
+ "function": {
155
+ "name": "multiply",
156
+ "description": "A function that multiplies two numbers",
157
+ "parameters": {
158
+ "type": "object",
159
+ "properties": {
160
+ "a": {
161
+ "type": "number",
162
+ "description": "The first number to multiply"
163
+ },
164
+ "b": {
165
+ "type": "number",
166
+ "description": "The second number to multiply"
167
+ }
168
+ },
169
+ "required": ["a", "b"]
170
+ }
171
+ }
172
+ }
173
+ ```
174
+
175
+ An example for handling tool definitions in a chat template is shown below. The specific tokens and tool descriptions should be changed to match the ones a model was trained with.
176
+
177
+ ```
178
+ {%- if tools %}
179
+ {%- for tool in tools %}
180
+ {{- '<tool>' + tool['function']['name'] + '\n' }}
181
+ {%- for argument in tool['function']['parameters']['properties'] %}
182
+ {{- argument + ': ' + tool['function']['parameters']['properties'][argument]['description'] + '\n' }}
183
+ {%- endfor %}
184
+ {{- '\n</tool>' }}
185
+ {%- endfor %}
186
+ {%- endif %}
187
+ ```
188
+
189
+ ### Tool calls
190
+
191
+ Tool calls, if present, are attached as a list to a message with the `"assistant"` role. This is always a list even though most tool-calling models only support single tool calls, which means the list usually only contains a single element.
192
+
193
+ ```json
194
+ {
195
+ "role": "assistant",
196
+ "tool_calls": [
197
+ {
198
+ "type": "function",
199
+ "function": {
200
+ "name": "multiply",
201
+ "arguments": {
202
+ "a": 5,
203
+ "b": 6
204
+ }
205
+ }
206
+ }
207
+ ]
208
+ }
209
+ ```
210
+
211
+ A common pattern for handling tool calls is shown below.
212
+
213
+ ```
214
+ {%- if message['role'] == 'assistant' and 'tool_calls' in message %}
215
+ {%- for tool_call in message['tool_calls'] %}
216
+ {{- '<tool_call>' + tool_call['function']['name'] + '\n' + tool_call['function']['arguments']|tojson + '\n</tool_call>' }}
217
+ {%- endfor %}
218
+ {%- endif %}
220
+ ```
221
+
222
+ ### Tool responses
223
+
224
+ Tool responses are a message dict with the `role`, `name` (name of the function) and `content` (result of the tool call) keys.
225
+
226
+ ```json
227
+ {
228
+ "role": "tool",
229
+ "name": "multiply",
230
+ "content": "30"
231
+ }
232
+ ```
233
+
234
+ Not all the keys need to be used in the tool response. For example, if a model doesn’t expect the function name to be included in the tool response, then you can just include the `role` and `content`.
235
+
236
+ ```
237
+ {%- if message['role'] == 'tool' %}
238
+ {{- "<tool_result>" + message['content'] + "</tool_result>" }}
239
+ {%- endif %}
240
+ ```
241
+
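Putting the pieces together, a sketch of a complete tool-calling exchange rendered back through the template is shown below. It assumes a tool-capable model and the `multiply` function from the earlier sketch.

```py
messages = [
    {"role": "user", "content": "What is 5 times 6?"},
    {
        "role": "assistant",
        "tool_calls": [
            {
                "type": "function",
                "function": {"name": "multiply", "arguments": {"a": 5, "b": 6}},
            }
        ],
    },
    {"role": "tool", "name": "multiply", "content": "30"},
]

# Re-render so the model can produce a final answer from the tool result
text = tokenizer.apply_chat_template(
    messages,
    tools=[multiply],
    add_generation_prompt=True,
    tokenize=False,
)
```
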
242
+ ## Contribute
243
+
244
+ Add a chat template by setting the `chat_template` attribute in the tokenizer and testing it with [`~PreTrainedTokenizerBase.apply_chat_template`]. If it works as expected, then you can upload it to the Hub with [`~PreTrainedTokenizer.push_to_hub`].
245
+
246
+ Even if you're not the model owner, it is still helpful to add a template for a model with an empty chat template or a model that is using a default class template. Open a [pull request](https://hf.co/docs/hub/repositories-pull-requests-discussions) on the model repository to add the template.
247
+
248
+ ```py
249
+ tokenizer.chat_template = template
250
+ tokenizer.push_to_hub("model_name")
251
+ ```
transformers/docs/source/en/community.md ADDED
@@ -0,0 +1,70 @@
1
+ <!--⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
2
+ rendered properly in your Markdown viewer.
3
+ -->
4
+
5
+ # Community
6
+
7
+ This page regroups resources around 🤗 Transformers developed by the community.
8
+
9
+ ## Community resources:
10
+
11
+ | Resource | Description | Author |
12
+ |:----------|:-------------|------:|
13
+ | [Hugging Face Transformers Glossary Flashcards](https://www.darigovresearch.com/huggingface-transformers-glossary-flashcards) | A set of flashcards based on the [Transformers Docs Glossary](glossary) that has been put into a form which can be easily learned/revised using [Anki](https://apps.ankiweb.net/) an open source, cross platform app specifically designed for long term knowledge retention. See this [Introductory video on how to use the flashcards](https://www.youtube.com/watch?v=Dji_h7PILrw). | [Darigov Research](https://www.darigovresearch.com/) |
14
+
15
+ ## Community notebooks:
16
+
17
+ | Notebook | Description | Author | |
18
+ |:----------|:-------------|:-------------|------:|
19
+ | [Fine-tune a pre-trained Transformer to generate lyrics](https://github.com/AlekseyKorshuk/huggingartists) | How to generate lyrics in the style of your favorite artist by fine-tuning a GPT-2 model | [Aleksey Korshuk](https://github.com/AlekseyKorshuk) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AlekseyKorshuk/huggingartists/blob/master/huggingartists-demo.ipynb) |
20
+ | [Train T5 in Tensorflow 2](https://github.com/snapthat/TF-T5-text-to-text) | How to train T5 for any task using Tensorflow 2. This notebook demonstrates a Question & Answer task implemented in Tensorflow 2 using SQUAD | [Muhammad Harris](https://github.com/HarrisDePerceptron) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/snapthat/TF-T5-text-to-text/blob/master/snapthatT5/notebooks/TF-T5-Datasets%20Training.ipynb) |
21
+ | [Train T5 on TPU](https://github.com/patil-suraj/exploring-T5/blob/master/T5_on_TPU.ipynb) | How to train T5 on SQUAD with Transformers and Nlp | [Suraj Patil](https://github.com/patil-suraj) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/T5_on_TPU.ipynb#scrollTo=QLGiFCDqvuil) |
22
+ | [Fine-tune T5 for Classification and Multiple Choice](https://github.com/patil-suraj/exploring-T5/blob/master/t5_fine_tuning.ipynb) | How to fine-tune T5 for classification and multiple choice tasks using a text-to-text format with PyTorch Lightning | [Suraj Patil](https://github.com/patil-suraj) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/t5_fine_tuning.ipynb) |
23
+ | [Fine-tune DialoGPT on New Datasets and Languages](https://github.com/ncoop57/i-am-a-nerd/blob/master/_notebooks/2020-05-12-chatbot-part-1.ipynb) | How to fine-tune the DialoGPT model on a new dataset for open-dialog conversational chatbots | [Nathan Cooper](https://github.com/ncoop57) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ncoop57/i-am-a-nerd/blob/master/_notebooks/2020-05-12-chatbot-part-1.ipynb) |
24
+ | [Long Sequence Modeling with Reformer](https://github.com/patrickvonplaten/notebooks/blob/master/PyTorch_Reformer.ipynb) | How to train on sequences as long as 500,000 tokens with Reformer | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/PyTorch_Reformer.ipynb) |
25
+ | [Fine-tune BART for Summarization](https://github.com/ohmeow/ohmeow_website/blob/master/posts/2021-05-25-mbart-sequence-classification-with-blurr.ipynb) | How to fine-tune BART for summarization with fastai using blurr | [Wayde Gilliam](https://ohmeow.com/) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ohmeow/ohmeow_website/blob/master/posts/2021-05-25-mbart-sequence-classification-with-blurr.ipynb) |
26
+ | [Fine-tune a pre-trained Transformer on anyone's tweets](https://colab.research.google.com/github/borisdayma/huggingtweets/blob/master/huggingtweets-demo.ipynb) | How to generate tweets in the style of your favorite Twitter account by fine-tuning a GPT-2 model | [Boris Dayma](https://github.com/borisdayma) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/borisdayma/huggingtweets/blob/master/huggingtweets-demo.ipynb) |
27
+ | [Optimize 🤗 Hugging Face models with Weights & Biases](https://colab.research.google.com/github/wandb/examples/blob/master/colabs/huggingface/Optimize_Hugging_Face_models_with_Weights_%26_Biases.ipynb) | A complete tutorial showcasing W&B integration with Hugging Face | [Boris Dayma](https://github.com/borisdayma) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/wandb/examples/blob/master/colabs/huggingface/Optimize_Hugging_Face_models_with_Weights_%26_Biases.ipynb) |
28
+ | [Pretrain Longformer](https://github.com/allenai/longformer/blob/master/scripts/convert_model_to_long.ipynb) | How to build a "long" version of existing pretrained models | [Iz Beltagy](https://beltagy.net) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/allenai/longformer/blob/master/scripts/convert_model_to_long.ipynb) |
29
+ | [Fine-tune Longformer for QA](https://github.com/patil-suraj/Notebooks/blob/master/longformer_qa_training.ipynb) | How to fine-tune longformer model for QA task | [Suraj Patil](https://github.com/patil-suraj) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patil-suraj/Notebooks/blob/master/longformer_qa_training.ipynb) |
30
+ | [Evaluate Model with 🤗nlp](https://github.com/patrickvonplaten/notebooks/blob/master/How_to_evaluate_Longformer_on_TriviaQA_using_NLP.ipynb) | How to evaluate longformer on TriviaQA with `nlp` | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1m7eTGlPmLRgoPkkA7rkhQdZ9ydpmsdLE?usp=sharing) |
31
+ | [Fine-tune T5 for Sentiment Span Extraction](https://github.com/enzoampil/t5-intro/blob/master/t5_qa_training_pytorch_span_extraction.ipynb) | How to fine-tune T5 for sentiment span extraction using a text-to-text format with PyTorch Lightning | [Lorenzo Ampil](https://github.com/enzoampil) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/enzoampil/t5-intro/blob/master/t5_qa_training_pytorch_span_extraction.ipynb) |
32
+ | [Fine-tune DistilBert for Multiclass Classification](https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_multiclass_classification.ipynb) | How to fine-tune DistilBert for multiclass classification with PyTorch | [Abhishek Kumar Mishra](https://github.com/abhimishra91) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_multiclass_classification.ipynb)|
33
+ |[Fine-tune BERT for Multi-label Classification](https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_multi_label_classification.ipynb)|How to fine-tune BERT for multi-label classification using PyTorch|[Abhishek Kumar Mishra](https://github.com/abhimishra91) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_multi_label_classification.ipynb)|
34
+ |[Fine-tune T5 for Summarization](https://github.com/abhimishra91/transformers-tutorials/blob/master/transformers_summarization_wandb.ipynb)|How to fine-tune T5 for summarization in PyTorch and track experiments with WandB|[Abhishek Kumar Mishra](https://github.com/abhimishra91) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_summarization_wandb.ipynb)|
35
+ |[Speed up Fine-Tuning in Transformers with Dynamic Padding / Bucketing](https://github.com/ELS-RD/transformers-notebook/blob/master/Divide_Hugging_Face_Transformers_training_time_by_2_or_more.ipynb)|How to speed up fine-tuning by a factor of 2 using dynamic padding / bucketing|[Michael Benesty](https://github.com/pommedeterresautee) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1CBfRU1zbfu7-ijiOqAAQUA-RJaxfcJoO?usp=sharing)|
36
+ |[Pretrain Reformer for Masked Language Modeling](https://github.com/patrickvonplaten/notebooks/blob/master/Reformer_For_Masked_LM.ipynb)| How to train a Reformer model with bi-directional self-attention layers | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1tzzh0i8PgDQGV3SMFUGxM7_gGae3K-uW?usp=sharing)|
37
+ |[Expand and Fine Tune Sci-BERT](https://github.com/lordtt13/word-embeddings/blob/master/COVID-19%20Research%20Data/COVID-SciBERT.ipynb)| How to increase vocabulary of a pretrained SciBERT model from AllenAI on the CORD dataset and pipeline it. | [Tanmay Thakur](https://github.com/lordtt13) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1rqAR40goxbAfez1xvF3hBJphSCsvXmh8)|
38
+ |[Fine Tune BlenderBotSmall for Summarization using the Trainer API](https://github.com/lordtt13/transformers-experiments/blob/master/Custom%20Tasks/fine-tune-blenderbot_small-for-summarization.ipynb)| How to fine-tune BlenderBotSmall for summarization on a custom dataset, using the Trainer API. | [Tanmay Thakur](https://github.com/lordtt13) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/19Wmupuls7mykSGyRN_Qo6lPQhgp56ymq?usp=sharing)|
39
+ |[Fine-tune Electra and interpret with Integrated Gradients](https://github.com/elsanns/xai-nlp-notebooks/blob/master/electra_fine_tune_interpret_captum_ig.ipynb) | How to fine-tune Electra for sentiment analysis and interpret predictions with Captum Integrated Gradients | [Eliza Szczechla](https://elsanns.github.io) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/elsanns/xai-nlp-notebooks/blob/master/electra_fine_tune_interpret_captum_ig.ipynb)|
40
+ |[fine-tune a non-English GPT-2 Model with Trainer class](https://github.com/philschmid/fine-tune-GPT-2/blob/master/Fine_tune_a_non_English_GPT_2_Model_with_Huggingface.ipynb) | How to fine-tune a non-English GPT-2 Model with Trainer class | [Philipp Schmid](https://www.philschmid.de) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/philschmid/fine-tune-GPT-2/blob/master/Fine_tune_a_non_English_GPT_2_Model_with_Huggingface.ipynb)|
41
+ |[Fine-tune a DistilBERT Model for Multi Label Classification task](https://github.com/DhavalTaunk08/Transformers_scripts/blob/master/Transformers_multilabel_distilbert.ipynb) | How to fine-tune a DistilBERT Model for Multi Label Classification task | [Dhaval Taunk](https://github.com/DhavalTaunk08) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DhavalTaunk08/Transformers_scripts/blob/master/Transformers_multilabel_distilbert.ipynb)|
42
+ |[Fine-tune ALBERT for sentence-pair classification](https://github.com/NadirEM/nlp-notebooks/blob/master/Fine_tune_ALBERT_sentence_pair_classification.ipynb) | How to fine-tune an ALBERT model or another BERT-based model for the sentence-pair classification task | [Nadir El Manouzi](https://github.com/NadirEM) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NadirEM/nlp-notebooks/blob/master/Fine_tune_ALBERT_sentence_pair_classification.ipynb)|
43
+ |[Fine-tune Roberta for sentiment analysis](https://github.com/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb) | How to fine-tune a Roberta model for sentiment analysis | [Dhaval Taunk](https://github.com/DhavalTaunk08) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb)|
44
+ |[Evaluating Question Generation Models](https://github.com/flexudy-pipe/qugeev) | How accurate are the answers to questions generated by your seq2seq transformer model? | [Pascal Zoleko](https://github.com/zolekode) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1bpsSqCQU-iw_5nNoRm_crPq6FRuJthq_?usp=sharing)|
45
+ |[Classify text with DistilBERT and Tensorflow](https://github.com/peterbayerle/huggingface_notebook/blob/main/distilbert_tf.ipynb) | How to fine-tune DistilBERT for text classification in TensorFlow | [Peter Bayerle](https://github.com/peterbayerle) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/peterbayerle/huggingface_notebook/blob/main/distilbert_tf.ipynb)|
46
+ |[Leverage BERT for Encoder-Decoder Summarization on CNN/Dailymail](https://github.com/patrickvonplaten/notebooks/blob/master/BERT2BERT_for_CNN_Dailymail.ipynb) | How to warm-start a *EncoderDecoderModel* with a *google-bert/bert-base-uncased* checkpoint for summarization on CNN/Dailymail | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/BERT2BERT_for_CNN_Dailymail.ipynb)|
47
+ |[Leverage RoBERTa for Encoder-Decoder Summarization on BBC XSum](https://github.com/patrickvonplaten/notebooks/blob/master/RoBERTaShared_for_BBC_XSum.ipynb) | How to warm-start a shared *EncoderDecoderModel* with a *FacebookAI/roberta-base* checkpoint for summarization on BBC/XSum | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/RoBERTaShared_for_BBC_XSum.ipynb)|
48
+ |[Fine-tune TAPAS on Sequential Question Answering (SQA)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb) | How to fine-tune *TapasForQuestionAnswering* with a *tapas-base* checkpoint on the Sequential Question Answering (SQA) dataset | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb)|
49
+ |[Evaluate TAPAS on Table Fact Checking (TabFact)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Evaluating_TAPAS_on_the_Tabfact_test_set.ipynb) | How to evaluate a fine-tuned *TapasForSequenceClassification* with a *tapas-base-finetuned-tabfact* checkpoint using a combination of the 🤗 datasets and 🤗 transformers libraries | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Evaluating_TAPAS_on_the_Tabfact_test_set.ipynb)|
50
+ |[Fine-tuning mBART for translation](https://colab.research.google.com/github/vasudevgupta7/huggingface-tutorials/blob/main/translation_training.ipynb) | How to fine-tune mBART using Seq2SeqTrainer for Hindi to English translation | [Vasudev Gupta](https://github.com/vasudevgupta7) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vasudevgupta7/huggingface-tutorials/blob/main/translation_training.ipynb)|
51
+ |[Fine-tune LayoutLM on FUNSD (a form understanding dataset)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForTokenClassification_on_FUNSD.ipynb) | How to fine-tune *LayoutLMForTokenClassification* on the FUNSD dataset for information extraction from scanned documents | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForTokenClassification_on_FUNSD.ipynb)|
52
+ |[Fine-Tune DistilGPT2 and Generate Text](https://colab.research.google.com/github/tripathiaakash/DistilGPT2-Tutorial/blob/main/distilgpt2_fine_tuning.ipynb) | How to fine-tune DistilGPT2 and generate text | [Aakash Tripathi](https://github.com/tripathiaakash) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/tripathiaakash/DistilGPT2-Tutorial/blob/main/distilgpt2_fine_tuning.ipynb)|
53
+ |[Fine-Tune LED on up to 8K tokens](https://github.com/patrickvonplaten/notebooks/blob/master/Fine_tune_Longformer_Encoder_Decoder_(LED)_for_Summarization_on_pubmed.ipynb) | How to fine-tune LED on pubmed for long-range summarization | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/Fine_tune_Longformer_Encoder_Decoder_(LED)_for_Summarization_on_pubmed.ipynb)|
54
+ |[Evaluate LED on Arxiv](https://github.com/patrickvonplaten/notebooks/blob/master/LED_on_Arxiv.ipynb) | How to effectively evaluate LED on long-range summarization | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/LED_on_Arxiv.ipynb)|
55
+ |[Fine-tune LayoutLM on RVL-CDIP (a document image classification dataset)](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForSequenceClassification_on_RVL_CDIP.ipynb) | How to fine-tune *LayoutLMForSequenceClassification* on the RVL-CDIP dataset for scanned document classification | [Niels Rogge](https://github.com/nielsrogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForSequenceClassification_on_RVL_CDIP.ipynb)|
56
+ |[Wav2Vec2 CTC decoding with GPT2 adjustment](https://github.com/voidful/huggingface_notebook/blob/main/xlsr_gpt.ipynb) | How to decode CTC sequence with language model adjustment | [Eric Lam](https://github.com/voidful) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1e_z5jQHYbO2YKEaUgzb1ww1WwiAyydAj?usp=sharing)|
57
+ |[Fine-tune BART for summarization in two languages with Trainer class](https://github.com/elsanns/xai-nlp-notebooks/blob/master/fine_tune_bart_summarization_two_langs.ipynb) | How to fine-tune BART for summarization in two languages with Trainer class | [Eliza Szczechla](https://github.com/elsanns) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/elsanns/xai-nlp-notebooks/blob/master/fine_tune_bart_summarization_two_langs.ipynb)|
58
+ |[Evaluate Big Bird on Trivia QA](https://github.com/patrickvonplaten/notebooks/blob/master/Evaluating_Big_Bird_on_TriviaQA.ipynb) | How to evaluate BigBird on long document question answering on Trivia QA | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/Evaluating_Big_Bird_on_TriviaQA.ipynb)|
59
+ | [Create video captions using Wav2Vec2](https://github.com/Muennighoff/ytclipcc/blob/main/wav2vec_youtube_captions.ipynb) | How to create YouTube captions from any video by transcribing the audio with Wav2Vec | [Niklas Muennighoff](https://github.com/Muennighoff) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Muennighoff/ytclipcc/blob/main/wav2vec_youtube_captions.ipynb) |
60
+ | [Fine-tune the Vision Transformer on CIFAR-10 using PyTorch Lightning](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_PyTorch_Lightning.ipynb) | How to fine-tune the Vision Transformer (ViT) on CIFAR-10 using HuggingFace Transformers, Datasets and PyTorch Lightning | [Niels Rogge](https://github.com/nielsrogge) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_PyTorch_Lightning.ipynb) |
61
+ | [Fine-tune the Vision Transformer on CIFAR-10 using the 🤗 Trainer](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_the_%F0%9F%A4%97_Trainer.ipynb) | How to fine-tune the Vision Transformer (ViT) on CIFAR-10 using HuggingFace Transformers, Datasets and the 🤗 Trainer | [Niels Rogge](https://github.com/nielsrogge) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/VisionTransformer/Fine_tuning_the_Vision_Transformer_on_CIFAR_10_with_the_%F0%9F%A4%97_Trainer.ipynb) |
62
+ | [Evaluate LUKE on Open Entity, an entity typing dataset](https://github.com/studio-ousia/luke/blob/master/notebooks/huggingface_open_entity.ipynb) | How to evaluate *LukeForEntityClassification* on the Open Entity dataset | [Ikuya Yamada](https://github.com/ikuyamada) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/studio-ousia/luke/blob/master/notebooks/huggingface_open_entity.ipynb) |
63
+ | [Evaluate LUKE on TACRED, a relation extraction dataset](https://github.com/studio-ousia/luke/blob/master/notebooks/huggingface_tacred.ipynb) | How to evaluate *LukeForEntityPairClassification* on the TACRED dataset | [Ikuya Yamada](https://github.com/ikuyamada) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/studio-ousia/luke/blob/master/notebooks/huggingface_tacred.ipynb) |
64
+ | [Evaluate LUKE on CoNLL-2003, an important NER benchmark](https://github.com/studio-ousia/luke/blob/master/notebooks/huggingface_conll_2003.ipynb) | How to evaluate *LukeForEntitySpanClassification* on the CoNLL-2003 dataset | [Ikuya Yamada](https://github.com/ikuyamada) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/studio-ousia/luke/blob/master/notebooks/huggingface_conll_2003.ipynb) |
65
+ | [Evaluate BigBird-Pegasus on PubMed dataset](https://github.com/vasudevgupta7/bigbird/blob/main/notebooks/bigbird_pegasus_evaluation.ipynb) | How to evaluate *BigBirdPegasusForConditionalGeneration* on PubMed dataset | [Vasudev Gupta](https://github.com/vasudevgupta7) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/vasudevgupta7/bigbird/blob/main/notebooks/bigbird_pegasus_evaluation.ipynb) |
66
+ | [Speech Emotion Classification with Wav2Vec2](https://github.com/m3hrdadfi/soxan/blob/main/notebooks/Emotion_recognition_in_Greek_speech_using_Wav2Vec2.ipynb) | How to leverage a pretrained Wav2Vec2 model for Emotion Classification on the MEGA dataset | [Mehrdad Farahani](https://github.com/m3hrdadfi) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/m3hrdadfi/soxan/blob/main/notebooks/Emotion_recognition_in_Greek_speech_using_Wav2Vec2.ipynb) |
67
+ | [Detect objects in an image with DETR](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/DETR/DETR_minimal_example_(with_DetrFeatureExtractor).ipynb) | How to use a trained *DetrForObjectDetection* model to detect objects in an image and visualize attention | [Niels Rogge](https://github.com/NielsRogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/DETR/DETR_minimal_example_(with_DetrFeatureExtractor).ipynb) |
68
+ | [Fine-tune DETR on a custom object detection dataset](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/DETR/Fine_tuning_DetrForObjectDetection_on_custom_dataset_(balloon).ipynb) | How to fine-tune *DetrForObjectDetection* on a custom object detection dataset | [Niels Rogge](https://github.com/NielsRogge) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NielsRogge/Transformers-Tutorials/blob/master/DETR/Fine_tuning_DetrForObjectDetection_on_custom_dataset_(balloon).ipynb) |
69
+ | [Finetune T5 for Named Entity Recognition](https://github.com/ToluClassics/Notebooks/blob/main/T5_Ner_Finetuning.ipynb) | How to fine-tune *T5* on a Named Entity Recognition Task | [Ogundepo Odunayo](https://github.com/ToluClassics) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1obr78FY_cBmWY5ODViCmzdY6O1KB65Vc?usp=sharing) |
70
+ | [Fine-Tuning Open-Source LLM using QLoRA with MLflow and PEFT](https://github.com/mlflow/mlflow/blob/master/docs/source/llms/transformers/tutorials/fine-tuning/transformers-peft.ipynb) | How to use [QLoRA](https://github.com/artidoro/qlora) and [PEFT](https://huggingface.co/docs/peft/en/index) to fine-tune an LLM in a memory-efficient way, while using [MLflow](https://mlflow.org/docs/latest/llms/transformers/index.html) to manage experiment tracking | [Yuki Watanabe](https://github.com/B-Step62) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/mlflow/mlflow/blob/master/docs/source/llms/transformers/tutorials/fine-tuning/transformers-peft.ipynb) |
transformers/docs/source/en/contributing.md ADDED
@@ -0,0 +1,395 @@
1
+ <!---
2
+ Copyright 2020 The HuggingFace Team. All rights reserved.
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ -->
16
+
17
+ # Contribute to 🤗 Transformers
18
+
19
+ Everyone is welcome to contribute, and we value everybody's contribution. Code
20
+ contributions are not the only way to help the community. Answering questions, helping
21
+ others, and improving the documentation are also immensely valuable.
22
+
23
+ It also helps us if you spread the word! Reference the library in blog posts
24
+ about the awesome projects it made possible, shout out on Twitter every time it has
25
+ helped you, or simply ⭐️ the repository to say thank you.
26
+
27
+ However you choose to contribute, please be mindful and respect our
28
+ [code of conduct](https://github.com/huggingface/transformers/blob/main/CODE_OF_CONDUCT.md).
29
+
30
+ **This guide was heavily inspired by the awesome [scikit-learn guide to contributing](https://github.com/scikit-learn/scikit-learn/blob/main/CONTRIBUTING.md).**
31
+
32
+ ## Ways to contribute
33
+
34
+ There are several ways you can contribute to 🤗 Transformers:
35
+
36
+ * Fix outstanding issues with the existing code.
37
+ * Submit issues related to bugs or desired new features.
38
+ * Implement new models.
39
+ * Contribute to the examples or to the documentation.
40
+
41
+ If you don't know where to start, there is a special [Good First
42
+ Issue](https://github.com/huggingface/transformers/contribute) listing. It will give you a list of
43
+ open issues that are beginner-friendly and help you start contributing to open-source. The best way to do that is to open a Pull Request and link it to the issue that you'd like to work on. We try to give priority to opened PRs as we can easily track the progress of the fix, and if the contributor does not have time anymore, someone else can take the PR over.
44
+
45
+ For something slightly more challenging, you can also take a look at the [Good Second Issue](https://github.com/huggingface/transformers/labels/Good%20Second%20Issue) list. In general though, if you feel like you know what you're doing, go for it and we'll help you get there! 🚀
46
+
47
+ > All contributions are equally valuable to the community. 🥰
48
+
49
+ ## Fixing outstanding issues
50
+
51
+ If you notice an issue with the existing code and have a fix in mind, feel free to [start contributing](#create-a-pull-request) and open a Pull Request!
52
+
53
+ ## Submitting a bug-related issue or feature request
54
+
55
+ Do your best to follow these guidelines when submitting a bug-related issue or a feature
56
+ request. It will make it easier for us to come back to you quickly and with good
57
+ feedback.
58
+
59
+ ### Did you find a bug?
60
+
61
+ The 🤗 Transformers library is robust and reliable thanks to users who report the problems they encounter.
62
+
63
+ Before you report an issue, we would really appreciate it if you could **make sure the bug was not
64
+ already reported** (use the search bar on GitHub under Issues). Your issue should also be related to bugs in the library itself, and not your code. If you're unsure whether the bug is in your code or the library, please ask in the [forum](https://discuss.huggingface.co/) or on our [discord](https://discord.com/invite/hugging-face-879548962464493619) first. This helps us respond quicker to fixing issues related to the library versus general questions.
65
+
66
+ > [!TIP]
67
+ > We have a [docs bot](https://huggingface.co/spaces/huggingchat/hf-docs-chat), and we highly encourage you to ask all your questions there. There is always a chance your bug can be fixed with a simple flag 👾🔫
68
+
69
+ Once you've confirmed the bug hasn't already been reported, please include the following information in your issue so we can quickly resolve it:
70
+
71
+ * Your **OS type and version** and **Python**, **PyTorch** and
72
+ **TensorFlow** versions when applicable.
73
+ * A short, self-contained, code snippet that allows us to reproduce the bug in
74
+ less than 30s.
75
+ * The *full* traceback if an exception is raised.
76
+ * Attach any other additional information, like screenshots, you think may help.
77
+
78
+ To get the OS and software versions automatically, run the following command:
79
+
80
+ ```bash
81
+ transformers env
82
+ ```
83
+
84
+ You can also run the same command from the root of the repository:
85
+
86
+ ```bash
87
+ python src/transformers/commands/transformers_cli.py env
88
+ ```
89
+
90
+ ### Do you want a new feature?
91
+
92
+ If there is a new feature you'd like to see in 🤗 Transformers, please open an issue and describe:
93
+
94
+ 1. What is the *motivation* behind this feature? Is it related to a problem or frustration with the library? Is it a feature related to something you need for a project? Is it something you worked on and think it could benefit the community?
95
+
96
+ Whatever it is, we'd love to hear about it!
97
+
98
+ 2. Describe your requested feature in as much detail as possible. The more you can tell us about it, the better we'll be able to help you.
99
+ 3. Provide a *code snippet* that demonstrates the feature's usage.
100
+ 4. If the feature is related to a paper, please include a link.
101
+
102
+ If your issue is well written we're already 80% of the way there by the time you create it.
103
+
104
+ We have added [templates](https://github.com/huggingface/transformers/tree/main/templates) to help you get started with your issue.
105
+
106
+ ## Do you want to implement a new model?
107
+
108
+ New models are constantly released and if you want to implement a new model, please provide the following information:
109
+
110
+ * A short description of the model and a link to the paper.
111
+ * Link to the implementation if it is open-sourced.
112
+ * Link to the model weights if they are available.
113
+
114
+ If you are willing to contribute the model yourself, let us know so we can help you add it to 🤗 Transformers!
115
+
116
+ We have a technical guide for [how to add a model to 🤗 Transformers](https://huggingface.co/docs/transformers/add_new_model).
117
+
118
+ ## Do you want to add documentation?
119
+
120
+ We're always looking for improvements to the documentation that make it more clear and accurate. Please let us know how the documentation can be improved such as typos and any content that is missing, unclear or inaccurate. We'll be happy to make the changes or help you make a contribution if you're interested!
121
+
122
+ For more details about how to generate, build, and write the documentation, take a look at the documentation [README](https://github.com/huggingface/transformers/tree/main/docs).
123
+
124
+ ## Create a Pull Request
125
+
126
+ Before writing any code, we strongly advise you to search through the existing PRs or
127
+ issues to make sure nobody is already working on the same thing. If you are
128
+ unsure, it is always a good idea to open an issue to get some feedback.
129
+
130
+ You will need basic `git` proficiency to contribute to
131
+ 🤗 Transformers. While `git` is not the easiest tool to use, it has the greatest
132
+ manual. Type `git --help` in a shell and enjoy! If you prefer books, [Pro
133
+ Git](https://git-scm.com/book/en/v2) is a very good reference.
134
+
135
+ You'll need **[Python 3.9](https://github.com/huggingface/transformers/blob/main/setup.py#L449)** or above to contribute to 🤗 Transformers. Follow the steps below to start contributing:
136
+
137
+ 1. Fork the [repository](https://github.com/huggingface/transformers) by
138
+ clicking on the **[Fork](https://github.com/huggingface/transformers/fork)** button on the repository's page. This creates a copy of the code
139
+ under your GitHub user account.
140
+
141
+ 2. Clone your fork to your local disk, and add the base repository as a remote:
142
+
143
+ ```bash
144
+ git clone git@github.com:<your Github handle>/transformers.git
145
+ cd transformers
146
+ git remote add upstream https://github.com/huggingface/transformers.git
147
+ ```
148
+
149
+ 3. Create a new branch to hold your development changes:
150
+
151
+ ```bash
152
+ git checkout -b a-descriptive-name-for-my-changes
153
+ ```
154
+
155
+ 🚨 **Do not** work on the `main` branch!
156
+
157
+ 4. Set up a development environment by running the following command in a virtual environment:
158
+
159
+ ```bash
160
+ pip install -e ".[dev]"
161
+ ```
162
+
163
+ If 🤗 Transformers was already installed in the virtual environment, remove
164
+ it with `pip uninstall transformers` before reinstalling it in editable
165
+ mode with the `-e` flag.
166
+
167
+ Depending on your OS, and since the number of optional dependencies of Transformers is growing, you might get a
168
+ failure with this command. If that's the case make sure to install the Deep Learning framework you are working with
169
+ (PyTorch, TensorFlow and/or Flax) then do:
170
+
171
+ ```bash
172
+ pip install -e ".[quality]"
173
+ ```
174
+
175
+ which should be enough for most use cases.
176
+
177
+ 5. Develop the features in your branch.
178
+
179
+ As you work on your code, you should make sure the test suite
180
+ passes. Run the tests impacted by your changes like this:
181
+
182
+ ```bash
183
+ pytest tests/<TEST_TO_RUN>.py
184
+ ```
185
+
186
+ For more information about tests, check out the
187
+ [Testing](https://huggingface.co/docs/transformers/testing) guide.
188
+
189
+ 🤗 Transformers relies on `black` and `ruff` to format its source code
190
+ consistently. After you make changes, apply automatic style corrections and code verifications
191
+ that can't be automated in one go with:
192
+
193
+ ```bash
194
+ make fixup
195
+ ```
196
+
197
+ This target is also optimized to only work with files modified by the PR you're working on.
198
+
199
+ If you prefer to run the checks one after the other, the following command applies the
200
+ style corrections:
201
+
202
+ ```bash
203
+ make style
204
+ ```
205
+
206
+ 🤗 Transformers also uses `ruff` and a few custom scripts to check for coding mistakes. Quality
207
+ controls are run by the CI, but you can run the same checks with:
208
+
209
+ ```bash
210
+ make quality
211
+ ```
212
+
213
+ Finally, we have a lot of scripts to make sure we don't forget to update
214
+ some files when adding a new model. You can run these scripts with:
215
+
216
+ ```bash
217
+ make repo-consistency
218
+ ```
219
+
220
+ To learn more about those checks and how to fix any issues with them, check out the
221
+ [Checks on a Pull Request](https://huggingface.co/docs/transformers/pr_checks) guide.
222
+
223
+ If you're modifying documents under the `docs/source` directory, make sure the documentation can still be built. This check will also run in the CI when you open a pull request. To run a local check
224
+ make sure you install the [documentation builder](https://github.com/huggingface/doc-builder).
225
+
226
+ ```bash
227
+ pip install hf-doc-builder
228
+ ```
229
+
230
+ Run the following command from the root of the repository:
231
+
232
+ ```bash
233
+ doc-builder build transformers docs/source/en --build_dir ~/tmp/test-build
234
+ ```
235
+
236
+ This will build the documentation in the `~/tmp/test-build` folder where you can inspect the generated
237
+ Markdown files with your favorite editor. You can also preview the docs on GitHub when you open a pull request.
238
+
239
+ Once you're happy with your changes, add the changed files with `git add` and
240
+ record your changes locally with `git commit`:
241
+
242
+ ```bash
243
+ git add modified_file.py
244
+ git commit
245
+ ```
246
+
247
+ Please remember to write [good commit
248
+ messages](https://chris.beams.io/posts/git-commit/) to clearly communicate the changes you made!
249
+
250
+ To keep your copy of the code up to date with the original
251
+ repository, rebase your branch on `upstream/branch` *before* you open a pull request or if requested by a maintainer:
252
+
253
+ ```bash
254
+ git fetch upstream
255
+ git rebase upstream/main
256
+ ```
257
+
258
+ Push your changes to your branch:
259
+
260
+ ```bash
261
+ git push -u origin a-descriptive-name-for-my-changes
262
+ ```
263
+
264
+ If you've already opened a pull request, you'll need to force push with the `--force` flag. Otherwise, if the pull request hasn't been opened yet, you can just push your changes normally.
265
+
266
+ 6. Now you can go to your fork of the repository on GitHub and click on **Pull Request** to open a pull request. Make sure you tick off all the boxes on our [checklist](#pull-request-checklist) below. When you're ready, you can send your changes to the project maintainers for review.
267
+
268
+ 7. It's ok if maintainers request changes, it happens to our core contributors
269
+ too! So everyone can see the changes in the pull request, work in your local
270
+ branch and push the changes to your fork. They will automatically appear in
271
+ the pull request.
272
+
273
+ ### Pull request checklist
274
+
275
+ ☐ The pull request title should summarize your contribution.<br>
276
+ ☐ If your pull request addresses an issue, please mention the issue number in the pull
277
+ request description to make sure they are linked (and people viewing the issue know you
278
+ are working on it).<br>
279
+ ☐ To indicate a work in progress please prefix the title with `[WIP]`. These are
280
+ useful to avoid duplicated work, and to differentiate it from PRs ready to be merged.<br>
281
+ ☐ Make sure existing tests pass.<br>
282
+ ☐ If adding a new feature, also add tests for it.<br>
283
+ - If you are adding a new model, make sure you use
284
+ `ModelTester.all_model_classes = (MyModel, MyModelWithLMHead,...)` to trigger the common tests.
285
+ - If you are adding new `@slow` tests, make sure they pass using
286
+ `RUN_SLOW=1 python -m pytest tests/models/my_new_model/test_my_new_model.py`.
287
+ - If you are adding a new tokenizer, write tests and make sure
288
+ `RUN_SLOW=1 python -m pytest tests/models/{your_model_name}/test_tokenization_{your_model_name}.py` passes.
289
+ - CircleCI does not run the slow tests, but GitHub Actions does every night!<br>
290
+
291
+ ☐ All public methods must have informative docstrings (see
292
+ [`modeling_bert.py`](https://github.com/huggingface/transformers/blob/main/src/transformers/models/bert/modeling_bert.py)
293
+ for an example).<br>
294
+ ☐ Due to the rapidly growing repository, don't add any images, videos and other
295
+ non-text files that'll significantly weigh down the repository. Instead, use a Hub
296
+ repository such as [`hf-internal-testing`](https://huggingface.co/hf-internal-testing)
297
+ to host these files and reference them by URL. We recommend placing documentation
298
+ related images in the following repository:
299
+ [huggingface/documentation-images](https://huggingface.co/datasets/huggingface/documentation-images).
300
+ You can open a PR on this dataset repository and ask a Hugging Face member to merge it.
301
+
302
+ For more information about the checks run on a pull request, take a look at our [Checks on a Pull Request](https://huggingface.co/docs/transformers/pr_checks) guide.
303
+
304
+ ### Tests
305
+
306
+ An extensive test suite is included to test the library behavior and several examples. Library tests can be found in
307
+ the [tests](https://github.com/huggingface/transformers/tree/main/tests) folder and examples tests in the
308
+ [examples](https://github.com/huggingface/transformers/tree/main/examples) folder.
309
+
310
+ We like `pytest` and `pytest-xdist` because it's faster. From the root of the
311
+ repository, specify a *path to a subfolder or a test file* to run the test:
312
+
313
+ ```bash
314
+ python -m pytest -n auto --dist=loadfile -s -v ./tests/models/my_new_model
315
+ ```
316
+
317
+ Similarly, for the `examples` directory, specify a *path to a subfolder or test file* to run the test. For example, the following command tests the text classification subfolder in the PyTorch `examples` directory:
318
+
319
+ ```bash
320
+ pip install -r examples/xxx/requirements.txt # only needed the first time
321
+ python -m pytest -n auto --dist=loadfile -s -v ./examples/pytorch/text-classification
322
+ ```
323
+
324
+ In fact, this is actually how our `make test` and `make test-examples` commands are implemented (not including the `pip install`)!
325
+
326
+ You can also specify a smaller set of tests in order to test only the feature
327
+ you're working on.
328
+
329
+ By default, slow tests are skipped but you can set the `RUN_SLOW` environment variable to
330
+ `yes` to run them. This will download many gigabytes of models so make sure you
331
+ have enough disk space, a good internet connection or a lot of patience!
332
+
333
+ <Tip warning={true}>
334
+
335
+ Remember to specify a *path to a subfolder or a test file* to run the test. Otherwise, you'll run all the tests in the `tests` or `examples` folder, which will take a very long time!
336
+
337
+ </Tip>
338
+
339
+ ```bash
340
+ RUN_SLOW=yes python -m pytest -n auto --dist=loadfile -s -v ./tests/models/my_new_model
341
+ RUN_SLOW=yes python -m pytest -n auto --dist=loadfile -s -v ./examples/pytorch/text-classification
342
+ ```
343
+
344
+ Like the slow tests, there are other environment variables available which are not enabled by default during testing:
345
+ - `RUN_CUSTOM_TOKENIZERS`: Enables tests for custom tokenizers.
346
+
347
+ More environment variables and additional information can be found in the [testing_utils.py](https://github.com/huggingface/transformers/blob/main/src/transformers/testing_utils.py).
348
+
349
+ 🤗 Transformers uses `pytest` as a test runner only. It doesn't use any
350
+ `pytest`-specific features in the test suite itself.
351
+
352
+ This means `unittest` is fully supported. Here's how to run tests with
353
+ `unittest`:
354
+
355
+ ```bash
356
+ python -m unittest discover -s tests -t . -v
357
+ python -m unittest discover -s examples -t examples -v
358
+ ```
359
+
360
+ ### Style guide
361
+
362
+ For documentation strings, 🤗 Transformers follows the [Google Python Style Guide](https://google.github.io/styleguide/pyguide.html).
363
+ Check our [documentation writing guide](https://github.com/huggingface/transformers/tree/main/docs#writing-documentation---specification)
364
+ for more information.
365
+
366
+ ### Develop on Windows
367
+
368
+ On Windows (unless you're working in [Windows Subsystem for Linux](https://learn.microsoft.com/en-us/windows/wsl/) or WSL), you need to configure git to transform Windows `CRLF` line endings to Linux `LF` line endings:
369
+
370
+ ```bash
371
+ git config core.autocrlf input
372
+ ```
373
+
374
+ One way to run the `make` command on Windows is with MSYS2:
375
+
376
+ 1. [Download MSYS2](https://www.msys2.org/), and we assume it's installed in `C:\msys64`.
377
+ 2. Open the command line `C:\msys64\msys2.exe` (it should be available from the **Start** menu).
378
+ 3. Run in the shell: `pacman -Syu` and install `make` with `pacman -S make`.
379
+ 4. Add `C:\msys64\usr\bin` to your PATH environment variable.
380
+
381
+ You can now use `make` from any terminal (PowerShell, cmd.exe, etc.)! 🎉
382
+
383
+ ### Sync a forked repository with upstream main (the Hugging Face repository)
384
+
385
+ When updating the main branch of a forked repository, please follow these steps to avoid pinging the upstream repository which adds reference notes to each upstream PR, and sends unnecessary notifications to the developers involved in these PRs.
386
+
387
+ 1. When possible, avoid syncing with the upstream using a branch and PR on the forked repository. Instead, merge directly into the forked main.
388
+ 2. If a PR is absolutely necessary, use the following steps after checking out your branch:
389
+
390
+ ```bash
391
+ git checkout -b your-branch-for-syncing
392
+ git pull --squash --no-commit upstream main
393
+ git commit -m '<your message without GitHub references>'
394
+ git push --set-upstream origin your-branch-for-syncing
395
+ ```
transformers/docs/source/en/conversations.md ADDED
@@ -0,0 +1,161 @@
1
+ <!--Copyright 2024 The HuggingFace Team. All rights reserved.
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
4
+ the License. You may obtain a copy of the License at
5
+
6
+ http://www.apache.org/licenses/LICENSE-2.0
7
+
8
+ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
9
+ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
10
+ specific language governing permissions and limitations under the License.
11
+
12
+ ⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
13
+ rendered properly in your Markdown viewer.
14
+
15
+ -->
16
+
17
+ # Chat basics
18
+
19
+ Chat models are conversational models you can send and receive messages from. There are many chat models available to choose from, but in general, larger models tend to be better, though that's not always the case. The model size is often included in the name, like "8B" or "70B", and it describes the number of parameters. Mixture-of-experts (MoE) models have names like "8x7B" or "141B-A35B", which indicate a 56B and a 141B parameter model respectively. You can try quantizing larger models to reduce memory requirements; otherwise you'll need ~2 bytes of memory per parameter.
20
+
21
+ Check model leaderboards like [OpenLLM](https://hf.co/spaces/HuggingFaceH4/open_llm_leaderboard) and [LMSys Chatbot Arena](https://chat.lmsys.org/?leaderboard) to further help you identify the best chat models for your use case. Models that are specialized in certain domains (medical, legal text, non-English languages, etc.) may sometimes outperform larger general purpose models.
22
+
23
+ > [!TIP]
24
+ > Chat with a number of open-source models for free on [HuggingChat](https://hf.co/chat/)!
25
+
26
+ This guide shows you how to quickly start chatting with Transformers from the command line, how to build and format a conversation, and how to chat using the [`TextGenerationPipeline`].
27
+
28
+ ## chat CLI
29
+
30
+ After you've [installed Transformers](./installation.md), chat with a model directly from the command line as shown below. It launches an interactive session with a model, with a few base commands listed at the start of the session.
31
+
32
+ ```bash
33
+ transformers chat Qwen/Qwen2.5-0.5B-Instruct
34
+ ```
35
+
36
+ <div class="flex justify-center">
37
+ <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/transformers-chat-cli.png"/>
38
+ </div>
39
+
40
+ You can launch the CLI with arbitrary `generate` flags in the format `arg_1=value_1 arg_2=value_2 ...`.
41
+
42
+ ```bash
43
+ transformers chat Qwen/Qwen2.5-0.5B-Instruct do_sample=False max_new_tokens=10
44
+ ```
45
+
46
+ For a full list of options, run the command below.
47
+
48
+ ```bash
49
+ transformers chat -h
50
+ ```
51
+
52
+ The chat is implemented on top of the [AutoClass](./model_doc/auto), using tooling from [text generation](./llm_tutorial) and [chat](./chat_templating). It uses the `transformers serve` CLI under the hood ([docs](./serving.md#serve-cli)).
53
+
54
+
55
+ ## TextGenerationPipeline
56
+
57
+ [`TextGenerationPipeline`] is a high-level text generation class with a "chat mode". Chat mode is enabled when a conversational model is detected and the chat prompt is [properly formatted](./llm_tutorial#wrong-prompt-format).
58
+
59
+ To start, build a chat history with the following two roles.
60
+
61
+ - `system` describes how the model should behave and respond when you're chatting with it. This role isn't supported by all chat models.
62
+ - `user` is where you enter your first message to the model.
63
+
64
+ ```py
65
+ chat = [
66
+ {"role": "system", "content": "You are a sassy, wise-cracking robot as imagined by Hollywood circa 1986."},
67
+ {"role": "user", "content": "Hey, can you tell me any fun things to do in New York?"}
68
+ ]
69
+ ```
70
+
71
+ Create the [`TextGenerationPipeline`] and pass `chat` to it. For large models, setting [device_map="auto"](./models#big-model-inference) helps load the model quicker and automatically places it on the fastest device available. Changing the data type to [torch.bfloat16](./models#model-data-type) also helps save memory.
72
+
73
+ ```py
74
+ import torch
75
+ from transformers import pipeline
76
+
77
+ pipeline = pipeline(task="text-generation", model="meta-llama/Meta-Llama-3-8B-Instruct", torch_dtype=torch.bfloat16, device_map="auto")
78
+ response = pipeline(chat, max_new_tokens=512)
79
+ print(response[0]["generated_text"][-1]["content"])
80
+ ```
81
+
82
+ ```txt
83
+ (sigh) Oh boy, you're asking me for advice? You're gonna need a map, pal! Alright,
84
+ alright, I'll give you the lowdown. But don't say I didn't warn you, I'm a robot, not a tour guide!
85
+
86
+ So, you wanna know what's fun to do in the Big Apple? Well, let me tell you, there's a million
87
+ things to do, but I'll give you the highlights. First off, you gotta see the sights: the Statue of
88
+ Liberty, Central Park, Times Square... you know, the usual tourist traps. But if you're lookin' for
89
+ something a little more... unusual, I'd recommend checkin' out the Museum of Modern Art. It's got
90
+ some wild stuff, like that Warhol guy's soup cans and all that jazz.
91
+
92
+ And if you're feelin' adventurous, take a walk across the Brooklyn Bridge. Just watch out for
93
+ those pesky pigeons, they're like little feathered thieves! (laughs) Get it? Thieves? Ah, never mind.
94
+
95
+ Now, if you're lookin' for some serious fun, hit up the comedy clubs in Greenwich Village. You might
96
+ even catch a glimpse of some up-and-coming comedians... or a bunch of wannabes tryin' to make it big. (winks)
97
+
98
+ And finally, if you're feelin' like a real New Yorker, grab a slice of pizza from one of the many amazing
99
+ pizzerias around the city. Just don't try to order a "robot-sized" slice, trust me, it won't end well. (laughs)
100
+
101
+ So, there you have it, pal! That's my expert advice on what to do in New York. Now, if you'll
102
+ excuse me, I've got some oil changes to attend to. (winks)
103
+ ```
104
+
105
+ Use the `append` method on `chat` to respond to the model's message.
106
+
107
+ ```py
108
+ chat = response[0]["generated_text"]
109
+ chat.append(
110
+ {"role": "user", "content": "Wait, what's so wild about soup cans?"}
111
+ )
112
+ response = pipeline(chat, max_new_tokens=512)
113
+ print(response[0]["generated_text"][-1]["content"])
114
+ ```
115
+
116
+ ```txt
117
+ (laughs) Oh, you're killin' me, pal! You don't get it, do you? Warhol's soup cans are like, art, man!
118
+ It's like, he took something totally mundane, like a can of soup, and turned it into a masterpiece. It's
119
+ like, "Hey, look at me, I'm a can of soup, but I'm also a work of art!"
120
+ (sarcastically) Oh, yeah, real original, Andy.
121
+
122
+ But, you know, back in the '60s, it was like, a big deal. People were all about challenging the
123
+ status quo, and Warhol was like, the king of that. He took the ordinary and made it extraordinary.
124
+ And, let me tell you, it was like, a real game-changer. I mean, who would've thought that a can of soup could be art? (laughs)
125
+
126
+ But, hey, you're not alone, pal. I mean, I'm a robot, and even I don't get it. (winks)
127
+ But, hey, that's what makes art, art, right? (laughs)
128
+ ```
129
+
130
+ ## Performance
131
+
132
+ Transformers loads models in full precision by default, and for an 8B model, this requires ~32GB of memory! Reduce memory usage by loading a model in half-precision or bfloat16 (only uses ~2 bytes per parameter). You can even quantize the model to a lower precision like 8-bit or 4-bit with [bitsandbytes](https://hf.co/docs/bitsandbytes/index).
133
+
134
+ > [!TIP]
135
+ > Refer to the [Quantization](./quantization/overview) docs for more information about the different quantization backends available.
136
+
137
+ Create a [`BitsAndBytesConfig`] with your desired quantization settings and pass it to the pipelines `model_kwargs` parameter. The example below quantizes a model to 8-bits.
138
+
139
+ ```py
140
+ from transformers import pipeline, BitsAndBytesConfig
141
+
142
+ quantization_config = BitsAndBytesConfig(load_in_8bit=True)
143
+ pipeline = pipeline(task="text-generation", model="meta-llama/Meta-Llama-3-8B-Instruct", device_map="auto", model_kwargs={"quantization_config": quantization_config})
144
+ ```
145
+
146
+ In general, larger models are slower in addition to requiring more memory because text generation is bottlenecked by **memory bandwidth** instead of compute power. Each active parameter must be read from memory for every generated token. For a 16GB model, 16GB must be read from memory for every generated token.
147
+
148
+ The number of generated tokens/sec is proportional to the total memory bandwidth of the system divided by the model size. Depending on your hardware, total memory bandwidth can vary. Refer to the table below for approximate generation speeds for different hardware types.
149
+
150
+ | Hardware | Memory bandwidth |
151
+ |---|---|
152
+ | consumer CPU | 20-100GB/sec |
153
+ | specialized CPU (Intel Xeon, AMD Threadripper/Epyc, Apple silicon) | 200-900GB/sec |
154
+ | data center GPU (NVIDIA A100/H100) | 2-3TB/sec |
155
+
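+ As a rough back-of-the-envelope estimate, divide the memory bandwidth by the model size in bytes to approximate tokens/sec. The sketch below assumes an 8B parameter model stored at ~2 bytes per parameter, and the bandwidth figures are illustrative values from the ranges above, not measurements.
+
+ ```py
+ model_size_gb = 8e9 * 2 / 1e9  # 8B parameters at ~2 bytes each (bfloat16) ≈ 16GB
+ for hardware, bandwidth_gb_s in [("consumer CPU", 50), ("Apple silicon", 400), ("A100 GPU", 2000)]:
+     # every active parameter is read from memory once per generated token
+     tokens_per_sec = bandwidth_gb_s / model_size_gb
+     print(f"{hardware}: ~{tokens_per_sec:.0f} tokens/sec")
+ ```
+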
156
+ The easiest solution for improving generation speed is to either quantize a model or use hardware with higher memory bandwidth.
157
+
158
+ You can also try techniques like [speculative decoding](./generation_strategies#speculative-decoding), where a smaller model generates candidate tokens that are verified by the larger model. If the candidate tokens are correct, the larger model can generate more than one token per `forward` pass. This significantly alleviates the bandwidth bottleneck and improves generation speed.
159
+
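+ A minimal sketch of this with [`~GenerationMixin.generate`] is shown below; the model pair is only an example, and the assistant must share the main model's tokenizer.
+
+ ```py
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ tokenizer = AutoTokenizer.from_pretrained("facebook/opt-1.3b")
+ inputs = tokenizer("Alice and Bob", return_tensors="pt")
+
+ model = AutoModelForCausalLM.from_pretrained("facebook/opt-1.3b")
+ assistant_model = AutoModelForCausalLM.from_pretrained("facebook/opt-125m")
+
+ # the assistant drafts candidate tokens and the main model verifies them
+ outputs = model.generate(**inputs, assistant_model=assistant_model, max_new_tokens=32)
+ print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
+ ```
+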
160
+ > [!TIP]
161
+ > Parameters may not be active for every generated token in MoE models such as [Mixtral](./model_doc/mixtral), [Qwen2MoE](./model_doc/qwen2_moe.md), and [DBRX](./model_doc/dbrx). As a result, MoE models generally have much lower memory bandwidth requirements and can be faster than a regular LLM of the same size. However, techniques like speculative decoding are ineffective with MoE models because parameters become activated with each new speculated token.
transformers/docs/source/en/custom_models.md ADDED
@@ -0,0 +1,297 @@
1
+ <!--Copyright 2024 The HuggingFace Team. All rights reserved.
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
4
+ the License. You may obtain a copy of the License at
5
+
6
+ http://www.apache.org/licenses/LICENSE-2.0
7
+
8
+ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
9
+ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
10
+ specific language governing permissions and limitations under the License.
11
+
12
+ ⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
13
+ rendered properly in your Markdown viewer.
14
+
15
+ -->
16
+
17
+ # Customizing models
18
+
19
+ Transformers models are designed to be customizable. A model's code is fully contained in the [models](https://github.com/huggingface/transformers/tree/main/src/transformers/models) subfolder of the Transformers repository. Each folder contains a `modeling.py` and a `configuration.py` file. Copy these files to start customizing a model.
20
+
21
+ > [!TIP]
22
+ > It may be easier to start from scratch if you're creating an entirely new model. But for models that are very similar to an existing one in Transformers, it is faster to reuse or subclass the same configuration and model class.
23
+
24
+ This guide will show you how to customize a ResNet model, enable [AutoClass](./models#autoclass) support, and share it on the Hub.
25
+
26
+ ## Configuration
27
+
28
+ A configuration, given by the base [`PretrainedConfig`] class, contains all the necessary information to build a model. This is where you'll configure the attributes of the custom ResNet model. Different attributes give different ResNet model types.
29
+
30
+ The main rules for customizing a configuration are:
31
+
32
+ 1. A custom configuration must subclass [`PretrainedConfig`]. This ensures a custom model has all the functionality of a Transformers' model such as [`~PretrainedConfig.from_pretrained`], [`~PretrainedConfig.save_pretrained`], and [`~PretrainedConfig.push_to_hub`].
33
+ 2. The [`PretrainedConfig`] `__init__` must accept any `kwargs` and they must be passed to the superclass `__init__`. [`PretrainedConfig`] has more fields than the ones set in your custom configuration, so when you load a configuration with [`~PretrainedConfig.from_pretrained`], those fields need to be accepted by your configuration and passed to the superclass.
34
+
35
+ > [!TIP]
36
+ > It is useful to check the validity of some of the parameters. In the example below, a check is implemented to ensure `block_type` and `stem_type` belong to one of the predefined values.
37
+ >
38
+ > Add `model_type` to the configuration class to enable [AutoClass](./models#autoclass) support.
39
+
40
+ ```py
41
+ from transformers import PretrainedConfig
42
+ from typing import List
43
+
44
+ class ResnetConfig(PretrainedConfig):
45
+ model_type = "resnet"
46
+
47
+ def __init__(
48
+ self,
49
+ block_type="bottleneck",
50
+ layers: list[int] = [3, 4, 6, 3],
51
+ num_classes: int = 1000,
52
+ input_channels: int = 3,
53
+ cardinality: int = 1,
54
+ base_width: int = 64,
55
+ stem_width: int = 64,
56
+ stem_type: str = "",
57
+ avg_down: bool = False,
58
+ **kwargs,
59
+ ):
60
+ if block_type not in ["basic", "bottleneck"]:
61
+             raise ValueError(f"`block_type` must be 'basic' or 'bottleneck', got {block_type}.")
62
+ if stem_type not in ["", "deep", "deep-tiered"]:
63
+ raise ValueError(f"`stem_type` must be '', 'deep' or 'deep-tiered', got {stem_type}.")
64
+
65
+ self.block_type = block_type
66
+ self.layers = layers
67
+ self.num_classes = num_classes
68
+ self.input_channels = input_channels
69
+ self.cardinality = cardinality
70
+ self.base_width = base_width
71
+ self.stem_width = stem_width
72
+ self.stem_type = stem_type
73
+ self.avg_down = avg_down
74
+ super().__init__(**kwargs)
75
+ ```
76
+
77
+ Save the configuration to a JSON file in your custom model folder, `custom-resnet`, with [`~PretrainedConfig.save_pretrained`].
78
+
79
+ ```py
80
+ resnet50d_config = ResnetConfig(block_type="bottleneck", stem_width=32, stem_type="deep", avg_down=True)
81
+ resnet50d_config.save_pretrained("custom-resnet")
82
+ ```
83
+
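+ To double-check the saved file, you can reload the configuration with [`~PretrainedConfig.from_pretrained`].
+
+ ```py
+ resnet50d_config = ResnetConfig.from_pretrained("custom-resnet")
+ ```
+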
84
+ ## Model
85
+
86
+ With the custom ResNet configuration, you can now create and customize the model. The model subclasses the base [`PreTrainedModel`] class. Like [`PretrainedConfig`], inheriting from [`PreTrainedModel`] and initializing the superclass with the configuration extends Transformers' functionalities such as saving and loading to the custom model.
87
+
88
+ Transformers' models follow the convention of accepting a `config` object in the `__init__` method. This passes the entire `config` to the model sublayers, instead of breaking the `config` object into multiple arguments that are individually passed to the sublayers.
89
+
90
+ Writing models this way produces simpler code with a clear source of truth for any hyperparameters. It also makes it easier to reuse code from other Transformers' models.
91
+
92
+ You'll create two ResNet models, a barebones ResNet model that outputs the hidden states and a ResNet model with an image classification head.
93
+
94
+ <hfoptions id="resnet">
95
+ <hfoption id="ResnetModel">
96
+
97
+ Define a mapping between the block types and classes. Everything else is created by passing the configuration class to the ResNet model class.
98
+
99
+ > [!TIP]
100
+ > Add `config_class` to the model class to enable [AutoClass](#autoclass-support) support.
101
+
102
+ ```py
103
+ from transformers import PreTrainedModel
104
+ from timm.models.resnet import BasicBlock, Bottleneck, ResNet
105
+ from .configuration_resnet import ResnetConfig
106
+
107
+ BLOCK_MAPPING = {"basic": BasicBlock, "bottleneck": Bottleneck}
108
+
109
+ class ResnetModel(PreTrainedModel):
110
+ config_class = ResnetConfig
111
+
112
+ def __init__(self, config):
113
+ super().__init__(config)
114
+ block_layer = BLOCK_MAPPING[config.block_type]
115
+ self.model = ResNet(
116
+ block_layer,
117
+ config.layers,
118
+ num_classes=config.num_classes,
119
+ in_chans=config.input_channels,
120
+ cardinality=config.cardinality,
121
+ base_width=config.base_width,
122
+ stem_width=config.stem_width,
123
+ stem_type=config.stem_type,
124
+ avg_down=config.avg_down,
125
+ )
126
+
127
+ def forward(self, tensor):
128
+ return self.model.forward_features(tensor)
129
+ ```
130
+
131
+ </hfoption>
132
+ <hfoption id="ResnetModelForImageClassification">
133
+
134
+ The `forward` method needs to be rewritten to calculate the loss from the logits when labels are available. Otherwise, the ResNet model class is the same.
135
+
136
+ > [!TIP]
137
+ > Add `config_class` to the model class to enable [AutoClass](#autoclass-support) support.
138
+
139
+ ```py
140
+ import torch
141
+
142
+ class ResnetModelForImageClassification(PreTrainedModel):
143
+ config_class = ResnetConfig
144
+
145
+ def __init__(self, config):
146
+ super().__init__(config)
147
+ block_layer = BLOCK_MAPPING[config.block_type]
148
+ self.model = ResNet(
149
+ block_layer,
150
+ config.layers,
151
+ num_classes=config.num_classes,
152
+ in_chans=config.input_channels,
153
+ cardinality=config.cardinality,
154
+ base_width=config.base_width,
155
+ stem_width=config.stem_width,
156
+ stem_type=config.stem_type,
157
+ avg_down=config.avg_down,
158
+ )
159
+
160
+ def forward(self, tensor, labels=None):
161
+ logits = self.model(tensor)
162
+ if labels is not None:
163
+ loss = torch.nn.functional.cross_entropy(logits, labels)
164
+ return {"loss": loss, "logits": logits}
165
+ return {"logits": logits}
166
+ ```
167
+
168
+ </hfoption>
169
+ </hfoptions>
170
+
171
+ A model can return any output format. Returning a dictionary (like `ResnetModelForImageClassification`) with losses when labels are available makes the custom model compatible with [`Trainer`]. For other output formats, you'll need your own training loop or a different library for training.
172
+
173
+ Instantiate the custom model class with the configuration.
174
+
175
+ ```py
176
+ resnet50d = ResnetModelForImageClassification(resnet50d_config)
177
+ ```
178
+
179
+ At this point, you can load pretrained weights into the model or train it from scratch. In this guide, you'll load pretrained weights.
180
+
181
+ Load the pretrained weights from the [timm](https://hf.co/docs/timm/index) library, and then transfer those weights to the custom model with [load_state_dict](https://pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module.load_state_dict).
182
+
183
+ ```py
184
+ import timm
185
+
186
+ pretrained_model = timm.create_model("resnet50d", pretrained=True)
187
+ resnet50d.model.load_state_dict(pretrained_model.state_dict())
188
+ ```
189
+
190
+ ## AutoClass
191
+
192
+ The [AutoClass](./models#model-classes) API is a shortcut for automatically loading the correct architecture for a given model. It is convenient to enable this for users loading your custom model.
193
+
194
+ Make sure you have the `model_type` attribute (must be different from existing model types) in the configuration class and `config_class` attribute in the model class. Use the [`~AutoConfig.register`] method to add the custom configuration and model to the [AutoClass](./models#model-classes) API.
195
+
196
+ > [!TIP]
197
+ > The first argument to [`AutoConfig.register`] must match the `model_type` attribute in the custom configuration class, and the first argument to [`AutoModel.register`] must match the `config_class` of the custom model class.
198
+
199
+ ```py
200
+ from transformers import AutoConfig, AutoModel, AutoModelForImageClassification
201
+
202
+ AutoConfig.register("resnet", ResnetConfig)
203
+ AutoModel.register(ResnetConfig, ResnetModel)
204
+ AutoModelForImageClassification.register(ResnetConfig, ResnetModelForImageClassification)
205
+ ```
206
+
207
+ Your custom model code is now compatible with the [AutoClass](./models#autoclass) API. Users can load the model with the [AutoModel](./model_doc/auto#automodel) or [`AutoModelForImageClassification`] classes.
208
+
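+ For example, continuing with the `resnet50d` model from earlier, a quick sketch of saving it and loading it back through the Auto classes could look like this.
+
+ ```py
+ resnet50d.save_pretrained("custom-resnet")
+
+ # the registered "resnet" model_type resolves back to the custom classes
+ model = AutoModelForImageClassification.from_pretrained("custom-resnet")
+ ```
+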
209
+ ## Upload
210
+
211
+ Upload a custom model to the [Hub](https://hf.co/models) to allow other users to easily load and use it.
212
+
213
+ Ensure the model directory is structured correctly as shown below. The directory should contain:
214
+
215
+ - `modeling.py`: Contains the code for `ResnetModel` and `ResnetModelForImageClassification`. This file can rely on relative imports to other files as long as they're in the same directory.
216
+
217
+ > [!WARNING]
218
+ > When copying a Transformers' model file, replace all relative imports at the top of the `modeling.py` file to import from Transformers instead.
219
+
220
+ - `configuration.py`: Contains the code for `ResnetConfig`.
221
+ - `__init__.py`: Can be empty; it allows Python to treat `resnet_model` as a module.
222
+
223
+ ```bash
224
+ .
225
+ └── resnet_model
226
+ ├── __init__.py
227
+ ├── configuration_resnet.py
228
+ └── modeling_resnet.py
229
+ ```
230
+
231
+ To share the model, import the ResNet model and configuration.
232
+
233
+ ```py
234
+ from resnet_model.configuration_resnet import ResnetConfig
235
+ from resnet_model.modeling_resnet import ResnetModel, ResnetModelForImageClassification
236
+ ```
237
+
238
+ Copy the code from the model and configuration files. To make sure the AutoClass objects are saved with [`~PreTrainedModel.save_pretrained`], call the [`~PretrainedConfig.register_for_auto_class`] method. This modifies the configuration JSON file to include the AutoClass objects and mapping.
239
+
240
+ For a model, pick the appropriate `AutoModelFor` class based on the task.
241
+
242
+ ```py
243
+ ResnetConfig.register_for_auto_class()
244
+ ResnetModel.register_for_auto_class("AutoModel")
245
+ ResnetModelForImageClassification.register_for_auto_class("AutoModelForImageClassification")
246
+ ```
247
+
248
+ To map more than one task to the model, edit `auto_map` in the configuration JSON file directly.
249
+
250
+ ```json
251
+ "auto_map": {
252
+ "AutoConfig": "<your-repo-name>--<config-name>",
253
+ "AutoModel": "<your-repo-name>--<config-name>",
254
+ "AutoModelFor<Task>": "<your-repo-name>--<config-name>",
255
+ },
256
+ ```
257
+
258
+ Create the configuration and model and load pretrained weights into it.
259
+
260
+ ```py
261
+ resnet50d_config = ResnetConfig(block_type="bottleneck", stem_width=32, stem_type="deep", avg_down=True)
262
+ resnet50d = ResnetModelForImageClassification(resnet50d_config)
263
+
264
+ pretrained_model = timm.create_model("resnet50d", pretrained=True)
265
+ resnet50d.model.load_state_dict(pretrained_model.state_dict())
266
+ ```
267
+
268
+ The model is ready to be pushed to the Hub now. Log in to your Hugging Face account from the command line or notebook.
269
+
270
+ <hfoptions id="push">
271
+ <hfoption id="huggingface-CLI">
272
+
273
+ ```bash
274
+ huggingface-cli login
275
+ ```
276
+
277
+ </hfoption>
278
+ <hfoption id="notebook">
279
+
280
+ ```py
281
+ from huggingface_hub import notebook_login
282
+
283
+ notebook_login()
284
+ ```
285
+
286
+ </hfoption>
287
+ </hfoptions>
288
+
289
+ Call [`~PreTrainedModel.push_to_hub`] on the model to upload the model to the Hub.
290
+
291
+ ```py
292
+ resnet50d.push_to_hub("custom-resnet50d")
293
+ ```
294
+
295
+ The pretrained weights, configuration, `modeling.py` and `configuration.py` files should all be uploaded to the Hub now in a [repository](https://hf.co/sgugger/custom-resnet50d) under your namespace.
296
+
297
+ Because a custom model doesn't use the same modeling code as a Transformers' model, you need to add `trust_remote_code=True` in [`~PreTrainedModel.from_pretrained`] to load it. Refer to the load [custom models](./models#custom-models) section for more information.
transformers/docs/source/en/debugging.md ADDED
@@ -0,0 +1,367 @@
1
+ <!--Copyright 2024 The HuggingFace Team. All rights reserved.
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
4
+ the License. You may obtain a copy of the License at
5
+
6
+ http://www.apache.org/licenses/LICENSE-2.0
7
+
8
+ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
9
+ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
10
+ specific language governing permissions and limitations under the License.
11
+
12
+ ⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
13
+ rendered properly in your Markdown viewer.
14
+
15
+ -->
16
+
17
+ # Multi-GPU debugging
18
+
19
+ Distributed training can be tricky because you have to ensure you're using the correct CUDA version across your system. You may encounter inter-communication issues between GPUs, and there may be underflow or overflow problems in your model.
20
+
21
+ This guide covers how to debug these issues, especially as it relates to DeepSpeed and PyTorch.
22
+
23
+ ## DeepSpeed CUDA
24
+
25
+ DeepSpeed compiles CUDA C++, which can be a potential source of errors when building PyTorch extensions that require CUDA. These errors depend on how CUDA is installed on your system. This section focuses on PyTorch built with *CUDA 10.2*.
26
+
27
+ ```bash
28
+ pip install deepspeed
29
+ ```
30
+
31
+ > [!TIP]
32
+ > For any other installation issues, please [open an issue](https://github.com/microsoft/DeepSpeed/issues) with the DeepSpeed team.
33
+
34
+ ### Non-identical toolkits
35
+
36
+ PyTorch comes with its own CUDA toolkit, but to use DeepSpeed with PyTorch, you need to have an identical version of CUDA installed system-wide. For example, if you installed PyTorch with `cudatoolkit==10.2` in your Python environment, then you'll also need to have CUDA 10.2 installed everywhere.
37
+
38
+ The exact location can vary from system to system, but `/usr/local/cuda-10.2` is the most common location on many Unix systems. When CUDA is correctly set up and added to your `PATH` environment variable, you can find the installation location with the following command.
39
+
40
+ ```bash
41
+ which nvcc
42
+ ```
43
+
44
+ ### Multiple toolkits
45
+
46
+ You may also have more than one CUDA toolkit installed on your system.
47
+
48
+ ```bash
49
+ /usr/local/cuda-10.2
50
+ /usr/local/cuda-11.0
51
+ ```
52
+
53
+ Typically, package installers set the paths to whichever version was installed last. If the package build fails because it can't find the right CUDA version (despite it being installed already), then you need to configure the `PATH` and `LD_LIBRARY_PATH` environment variables to point to the correct path.
54
+
55
+ Take a look at the contents of the following environment variables first.
56
+
57
+ ```bash
58
+ echo $PATH
59
+ echo $LD_LIBRARY_PATH
60
+ ```
61
+
62
+ `PATH` lists the locations of the executables and `LD_LIBRARY_PATH` lists where to look for shared libraries. Earlier entries are prioritized over later ones, and `:` is used to separate multiple entries. To make the build system find a specific CUDA toolkit, insert its path at the start of the list. The command below prepends to, rather than overwrites, the existing values.
63
+
64
+ ```bash
65
+ # adjust the version and full path if needed
66
+ export PATH=/usr/local/cuda-10.2/bin:$PATH
67
+ export LD_LIBRARY_PATH=/usr/local/cuda-10.2/lib64:$LD_LIBRARY_PATH
68
+ ```
69
+
70
+ You should also check that the assigned directories actually exist. The `lib64` sub-directory contains various CUDA `.so` objects (like `libcudart.so`), and while it is unlikely your system names them differently, you should check the actual names and change them accordingly.
71
+
72
+ ### Older versions
73
+
74
+ Sometimes, older CUDA versions may refuse to build with newer compilers, for example, if you have `gcc-9` but CUDA wants `gcc-7`. Usually, installing the latest CUDA toolkit enables support for the newer compiler.
75
+
76
+ You could also install an older version of the compiler in addition to the one you're currently using (or it may already be installed but it's not used by default and the build system can't see it). To resolve this, create a symlink to give the build system visibility to the older compiler.
77
+
78
+ ```bash
79
+ # adjust the path to your system
80
+ sudo ln -s /usr/bin/gcc-7 /usr/local/cuda-10.2/bin/gcc
81
+ sudo ln -s /usr/bin/g++-7 /usr/local/cuda-10.2/bin/g++
82
+ ```
83
+
84
+ ### Prebuild
85
+
86
+ If you're still having issues with installing DeepSpeed or if you're building DeepSpeed at run time, try to prebuild the DeepSpeed modules before installing them. Run the commands below to make a local build for DeepSpeed.
87
+
88
+ ```bash
89
+ git clone https://github.com/deepspeedai/DeepSpeed/
90
+ cd DeepSpeed
91
+ rm -rf build
92
+ TORCH_CUDA_ARCH_LIST="8.6" DS_BUILD_CPU_ADAM=1 DS_BUILD_UTILS=1 pip install . \
93
+ --global-option="build_ext" --global-option="-j8" --no-cache -v \
94
+ --disable-pip-version-check 2>&1 | tee build.log
95
+ ```
96
+
97
+ > [!TIP]
98
+ > Add the `DS_BUILD_AIO=1` parameter to the build command to use NVMe offload. Make sure the libaio-dev package is installed on your system.
99
+
100
+ Next, specify your GPU's architecture by editing the `TORCH_CUDA_ARCH_LIST` variable (find a complete list of NVIDIA GPUs and their corresponding architectures on this [page](https://developer.nvidia.com/cuda-gpus)). To check which architectures your PyTorch build supports, run the following command.
101
+
102
+ ```bash
103
+ python -c "import torch; print(torch.cuda.get_arch_list())"
104
+ ```
105
+
106
+ Find the architecture for a GPU with the following command.
107
+
108
+ <hfoptions id="arch">
109
+ <hfoption id="same GPUs">
110
+
111
+ ```bash
112
+ CUDA_VISIBLE_DEVICES=0 python -c "import torch; print(torch.cuda.get_device_capability())"
113
+ ```
114
+
115
+ </hfoption>
116
+ <hfoption id="specific GPU">
117
+
118
+ Run the following command to find the architecture for GPU `0`. The results will show a value for `major` and `minor`, which is your GPU architecture. The GPU architecture below is `8.6`.
119
+
120
+ ```bash
121
+ CUDA_VISIBLE_DEVICES=0 python -c "import torch; \
122
+ print(torch.cuda.get_device_properties(torch.device('cuda')))"
123
+ _CudaDeviceProperties(name='GeForce RTX 3090', major=8, minor=6, total_memory=24268MB, multi_processor_count=82)
124
+ ```
125
+
126
+ </hfoption>
127
+ </hfoptions>
128
+
129
+ If you get `8, 6`, then you can set `TORCH_CUDA_ARCH_LIST="8.6"`. For multiple GPUs with different architectures, list them like `TORCH_CUDA_ARCH_LIST="6.1;8.6"`.
130
+
131
+ It is also possible to not specify `TORCH_CUDA_ARCH_LIST`, in which case the build program automatically queries the GPU architecture of the build machine. However, it may not match the GPU on the target machine, which is why it is better to explicitly specify the correct architecture.
132
+
133
+ For training on multiple machines with the same setup, you'll need to make a binary wheel as shown below.
134
+
135
+ ```bash
136
+ git clone https://github.com/deepspeedai/DeepSpeed/
137
+ cd DeepSpeed
138
+ rm -rf build
139
+ TORCH_CUDA_ARCH_LIST="8.6" DS_BUILD_CPU_ADAM=1 DS_BUILD_UTILS=1 \
140
+ python setup.py build_ext -j8 bdist_wheel
141
+ ```
142
+
143
+ This command generates a binary wheel that'll look something like `dist/deepspeed-0.3.13+8cd046f-cp38-cp38-linux_x86_64.whl`. Install this wheel locally or on another machine.
144
+
145
+ ```bash
146
+ pip install deepspeed-0.3.13+8cd046f-cp38-cp38-linux_x86_64.whl
147
+ ```
148
+
149
+ ## Communication
150
+
151
+ Distributed training involves communication between processes and/or nodes, and this can be a potential source of errors.
152
+
153
+ Download the script below to diagnose network issues, and then run it to test GPU communication. The example command below tests how two GPUs communicate. Adjust the `--nproc_per_node` and `--nnodes` parameters to adapt it to your system.
154
+
155
+ ```bash
156
+ wget https://raw.githubusercontent.com/huggingface/transformers/main/scripts/distributed/torch-distributed-gpu-test.py
157
+ python -m torch.distributed.run --nproc_per_node 2 --nnodes 1 torch-distributed-gpu-test.py
158
+ ```
159
+
160
+ The script prints an `OK` status if both GPUs are able to communicate and allocate memory. Take a closer look at the diagnostic script for more details and a recipe for running it in a SLURM environment.
161
+
162
+ Add the `NCCL_DEBUG=INFO` environment variable to report more NCCL-related debugging information.
163
+
164
+ ```bash
165
+ NCCL_DEBUG=INFO python -m torch.distributed.run --nproc_per_node 2 --nnodes 1 torch-distributed-gpu-test.py
166
+ ```
167
+
168
+ ## Underflow and overflow detection
169
+
170
+ Underflow and overflow are typically signaled by activations or weights turning into `inf` or `nan`, or by `loss=NaN`. To detect these issues, activate the `DebugUnderflowOverflow` module in [`TrainingArguments.debug`] or import and add the module to your own training loop or another trainer class.
171
+
172
+ <hfoptions id="overflow">
173
+ <hfoption id="Trainer">
174
+
175
+ ```py
176
+ from transformers import TrainingArguments
177
+
178
+ args = TrainingArguments(
179
+ debug="underflow_overflow",
180
+ ...
181
+ )
182
+ ```
183
+
184
+ </hfoption>
185
+ <hfoption id="PyTorch training loop">
186
+
187
+ ```py
188
+ from transformers.debug_utils import DebugUnderflowOverflow
189
+
190
+ debug_overflow = DebugUnderflowOverflow(model)
191
+ ```
192
+
193
+ </hfoption>
194
+ </hfoptions>
195
+
196
+ The [`~debug_utils.DebugUnderflowOverflow`] module inserts hooks into the model to test the input and output variables and the corresponding model weights after each forward call. If `inf` or `nan` is detected in at least one element of the activations or weights, the module prints a report like the one shown below.
197
+
198
+ The example below is for fp16 mixed precision training with [google/mt5-small](https://huggingface.co/google/mt5-small).
199
+
200
+ ```shell
201
+ Detected inf/nan during batch_number=0
202
+ Last 21 forward frames:
203
+ abs min abs max metadata
204
+ encoder.block.1.layer.1.DenseReluDense.dropout Dropout
205
+ 0.00e+00 2.57e+02 input[0]
206
+ 0.00e+00 2.85e+02 output
207
+ [...]
208
+ encoder.block.2.layer.0 T5LayerSelfAttention
209
+ 6.78e-04 3.15e+03 input[0]
210
+ 2.65e-04 3.42e+03 output[0]
211
+ None output[1]
212
+ 2.25e-01 1.00e+04 output[2]
213
+ encoder.block.2.layer.1.layer_norm T5LayerNorm
214
+ 8.69e-02 4.18e-01 weight
215
+ 2.65e-04 3.42e+03 input[0]
216
+ 1.79e-06 4.65e+00 output
217
+ encoder.block.2.layer.1.DenseReluDense.wi_0 Linear
218
+ 2.17e-07 4.50e+00 weight
219
+ 1.79e-06 4.65e+00 input[0]
220
+ 2.68e-06 3.70e+01 output
221
+ encoder.block.2.layer.1.DenseReluDense.wi_1 Linear
222
+ 8.08e-07 2.66e+01 weight
223
+ 1.79e-06 4.65e+00 input[0]
224
+ 1.27e-04 2.37e+02 output
225
+ encoder.block.2.layer.1.DenseReluDense.dropout Dropout
226
+ 0.00e+00 8.76e+03 input[0]
227
+ 0.00e+00 9.74e+03 output
228
+ encoder.block.2.layer.1.DenseReluDense.wo Linear
229
+ 1.01e-06 6.44e+00 weight
230
+ 0.00e+00 9.74e+03 input[0]
231
+ 3.18e-04 6.27e+04 output
232
+ encoder.block.2.layer.1.DenseReluDense T5DenseGatedGeluDense
233
+ 1.79e-06 4.65e+00 input[0]
234
+ 3.18e-04 6.27e+04 output
235
+ encoder.block.2.layer.1.dropout Dropout
236
+ 3.18e-04 6.27e+04 input[0]
237
+ 0.00e+00 inf output
238
+ ```
239
+
240
+ At the start of the report, you can see the batch number where the error occurred. In this case, it occurred on the first batch.
241
+
242
+ Each frame describes the module it is reporting on. For example, the frame below inspected `encoder.block.2.layer.1.layer_norm`. This indicates the layer norm in the first layer of the second block of the encoder. The forward calls are to `T5LayerNorm`.
243
+
244
+ ```shell
245
+ encoder.block.2.layer.1.layer_norm T5LayerNorm
246
+ 8.69e-02 4.18e-01 weight
247
+ 2.65e-04 3.42e+03 input[0]
248
+ 1.79e-06 4.65e+00 output
249
+ ```
250
+
251
+ The last frame reports on the `Dropout.forward` function, called from the `dropout` attribute inside the `DenseReluDense` class. You can observe that the overflow (`inf`) occurred in the first layer of the encoder's second block during the first batch. The absolute largest input element was 6.27e+04.
252
+
253
+ ```shell
254
+ encoder.block.2.layer.1.DenseReluDense T5DenseGatedGeluDense
255
+ 1.79e-06 4.65e+00 input[0]
256
+ 3.18e-04 6.27e+04 output
257
+ encoder.block.2.layer.1.dropout Dropout
258
+ 3.18e-04 6.27e+04 input[0]
259
+ 0.00e+00 inf output
260
+ ```
261
+
262
+ The `T5DenseGatedGeluDense.forward` function output activations had an absolute maximum value of 6.27e+04, which is close to fp16's maximum limit of 6.4e+04. In the next step, `Dropout` renormalizes the weights, after zeroing some elements, which pushes the absolute maximum value to greater than 6.4e+04, resulting in an overflow.
263
+
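+ You can check the fp16 limit directly in PyTorch; this is just a quick sanity check, not part of the debugging tool.
+
+ ```py
+ import torch
+
+ print(torch.finfo(torch.float16).max)  # 65504.0 (~6.4e+04), larger values overflow to inf in fp16
+ ```
+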
264
+ Now that you know where the error is happening, you can investigate the modeling code in [modeling_t5.py](https://github.com/huggingface/transformers/blob/main/src/transformers/models/t5/modeling_t5.py).
265
+
266
+ ```py
267
+ class T5DenseGatedGeluDense(nn.Module):
268
+ def __init__(self, config):
269
+ super().__init__()
270
+ self.wi_0 = nn.Linear(config.d_model, config.d_ff, bias=False)
271
+ self.wi_1 = nn.Linear(config.d_model, config.d_ff, bias=False)
272
+ self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
273
+ self.dropout = nn.Dropout(config.dropout_rate)
274
+ self.gelu_act = ACT2FN["gelu_new"]
275
+
276
+ def forward(self, hidden_states):
277
+ hidden_gelu = self.gelu_act(self.wi_0(hidden_states))
278
+ hidden_linear = self.wi_1(hidden_states)
279
+ hidden_states = hidden_gelu * hidden_linear
280
+ hidden_states = self.dropout(hidden_states)
281
+ hidden_states = self.wo(hidden_states)
282
+ return hidden_states
283
+ ```
284
+
285
+ One solution is to go back a few steps before the values started growing too large and switch to fp32 so the numbers don't overflow when multiplied or summed. Another potential solution is to temporarily disable mixed precision training (`amp`).
286
+
287
+ ```py
288
+ import torch
289
+
290
+ def forward(self, hidden_states):
291
+ if torch.is_autocast_enabled():
292
+ with torch.cuda.amp.autocast(enabled=False):
293
+ return self._forward(hidden_states)
294
+ else:
295
+ return self._forward(hidden_states)
296
+ ```
297
+
298
+ The report only returns inputs and outputs of full frames, so you may also want to analyze the intermediate values of any `forward` function as well. Add the `detect_overflow` function after the forward calls to track `inf` or `nan` values in the intermediate `forwarded_states`.
299
+
300
+ ```py
301
+ from debug_utils import detect_overflow
302
+
303
+ class T5LayerFF(nn.Module):
304
+ [...]
305
+
306
+ def forward(self, hidden_states):
307
+ forwarded_states = self.layer_norm(hidden_states)
308
+ detect_overflow(forwarded_states, "after layer_norm")
309
+ forwarded_states = self.DenseReluDense(forwarded_states)
310
+ detect_overflow(forwarded_states, "after DenseReluDense")
311
+ return hidden_states + self.dropout(forwarded_states)
312
+ ```
313
+
314
+ Finally, you can configure the number of frames printed by [`~debug_utils.DebugUnderflowOverflow`].
315
+
316
+ ```py
317
+ from transformers.debug_utils import DebugUnderflowOverflow
318
+
319
+ debug_overflow = DebugUnderflowOverflow(model, max_frames_to_save=100)
320
+ ```
321
+
322
+ ### Batch tracing
323
+
324
+ [`~debug_utils.DebugUnderflowOverflow`] is able to trace the absolute minimum and maximum values in each batch with the underflow and overflow feature disabled. This is useful for identifying where errors are occurring in the model.
325
+
326
+ The example below shows how to trace the minimum and maximum values in batches 1 and 3 (batches are zero-indexed).
327
+
328
+ ```py
329
+ debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1, 3])
330
+ ```
331
+
332
+ ```shell
333
+ *** Starting batch number=1 ***
334
+ abs min abs max metadata
335
+ shared Embedding
336
+ 1.01e-06 7.92e+02 weight
337
+ 0.00e+00 2.47e+04 input[0]
338
+ 5.36e-05 7.92e+02 output
339
+ [...]
340
+ decoder.dropout Dropout
341
+ 1.60e-07 2.27e+01 input[0]
342
+ 0.00e+00 2.52e+01 output
343
+ decoder T5Stack
344
+ not a tensor output
345
+ lm_head Linear
346
+ 1.01e-06 7.92e+02 weight
347
+ 0.00e+00 1.11e+00 input[0]
348
+ 6.06e-02 8.39e+01 output
349
+ T5ForConditionalGeneration
350
+ not a tensor output
351
+
352
+ *** Starting batch number=3 ***
353
+ abs min abs max metadata
354
+ shared Embedding
355
+ 1.01e-06 7.92e+02 weight
356
+ 0.00e+00 2.78e+04 input[0]
357
+ 5.36e-05 7.92e+02 output
358
+ [...]
359
+ ```
360
+
361
+ [`~debug_utils.DebugUnderflowOverflow`] reports on a large number of frames, which makes debugging easier. Once you know where a problem occurs, say batch 150, you can focus the trace on batches 149 and 150 and compare where the numbers start to diverge.
362
+
363
+ It is also possible to abort the trace after a certain batch number, for example, batch 3.
364
+
365
+ ```py
366
+ debug_overflow = DebugUnderflowOverflow(model, trace_batch_nums=[1, 3], abort_after_batch_num=3)
367
+ ```
transformers/docs/source/en/deepspeed.md ADDED
@@ -0,0 +1,1029 @@
1
+ <!--Copyright 2024 The HuggingFace Team. All rights reserved.
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
4
+ the License. You may obtain a copy of the License at
5
+
6
+ http://www.apache.org/licenses/LICENSE-2.0
7
+
8
+ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
9
+ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
10
+ specific language governing permissions and limitations under the License.
11
+
12
+ ⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
13
+ rendered properly in your Markdown viewer.
14
+
15
+ -->
16
+
17
+ # DeepSpeed
18
+
19
+ [DeepSpeed](https://www.deepspeed.ai/) is designed to optimize distributed training for large models with data, model, pipeline, and even a combination of all three [parallelism](./perf_train_gpu_many) strategies to provide better memory efficiency and faster training speeds. This is achieved with the [Zero Redundancy Optimizer (ZeRO)](https://hf.co/papers/1910.02054) which consists of three stages.
20
+
21
+ | ZeRO stage | description |
22
+ |---|---|
23
+ | 1 | partition optimizer states |
24
+ | 2 | partition optimizer and gradient states |
25
+ | 3 | partition optimizer, gradient, and parameters |
26
+
27
+ Each stage progressively saves more memory, allowing really large models to fit and train on a single GPU. All ZeRO stages, as well as offloading optimizer memory and computations from the GPU to the CPU, are integrated with [`Trainer`]. Provide a config file or one of the example templates to [`Trainer`] to enable DeepSpeed features.
28
+
29
+ This guide walks you through setting up a DeepSpeed config file, how to enable its features in [`Trainer`], and deploy for training.
30
+
31
+ Install DeepSpeed from either PyPI or Transformers. For more detailed installation instructions, refer to the DeepSpeed [installation](https://www.deepspeed.ai/tutorials/advanced-install/) or GitHub [README](https://github.com/microsoft/deepspeed#installation).
32
+
33
+ <hfoptions id="installation">
34
+ <hfoption id="PyPI">
35
+
36
+ ```bash
37
+ pip install deepspeed
38
+ ```
39
+
40
+ </hfoption>
41
+ <hfoption id="Transformers">
42
+
43
+ ```bash
44
+ pip install transformers[deepspeed]
45
+ ```
46
+
47
+ </hfoption>
48
+ </hfoptions>
49
+
50
+ > [!WARNING]
51
+ > Refer to the [DeepSpeed CUDA installation](./debugging#deepspeed-cuda-issues) if you're having trouble with your installation. While DeepSpeed has a pip installable package, it is highly recommended to [install it from source](https://www.deepspeed.ai/tutorials/advanced-install/#install-deepspeed-from-source) to ensure it matches your hardware and to support certain features which aren't available in the PyPI distribution.
52
+
53
+ DeepSpeed provides a tool for estimating the required CPU and GPU memory for the parameters, optimizer and gradient states. You'll also need to reserve some memory for the CUDA kernels and activations.
54
+
55
+ Run the command below to check the memory requirements for [bigscience/T0_3B](https://huggingface.co/bigscience/T0_3B) on a single GPU.
56
+
57
+ ```bash
58
+ $ python -c 'from transformers import AutoModel; \
59
+ from deepspeed.runtime.zero.stage3 import estimate_zero3_model_states_mem_needs_all_live; \
60
+ model = AutoModel.from_pretrained("bigscience/T0_3B"); \
61
+ estimate_zero3_model_states_mem_needs_all_live(model, num_gpus_per_node=1, num_nodes=1)'
62
+ [...]
63
+ Estimated memory needed for params, optim states and gradients for a:
64
+ HW: Setup with 1 node, 1 GPU per node.
65
+ SW: Model with 2783M total params, 65M largest layer params.
66
+ per CPU | per GPU | Options
67
+ 70.00GB | 0.25GB | offload_param=cpu , offload_optimizer=cpu , zero_init=1
68
+ 70.00GB | 0.25GB | offload_param=cpu , offload_optimizer=cpu , zero_init=0
69
+ 62.23GB | 5.43GB | offload_param=none, offload_optimizer=cpu , zero_init=1
70
+ 62.23GB | 5.43GB | offload_param=none, offload_optimizer=cpu , zero_init=0
71
+ 0.37GB | 46.91GB | offload_param=none, offload_optimizer=none, zero_init=1
72
+ 15.56GB | 46.91GB | offload_param=none, offload_optimizer=none, zero_init=0
73
+ ```
74
+
75
+ > [!TIP]
76
+ > If you have enough GPU memory, disable CPU and NVMe offload to speed everything up.
77
+
78
+ ## Choosing a ZeRO stage
79
+
80
+ Consider the table below to help you choose the appropriate ZeRO stage for training because there is a trade-off between training speed and memory usage. The table orders the ZeRO stages from fastest to slowest and from least memory usage to most.
81
+
82
+ | fastest | least memory usage |
83
+ |---|---|
84
+ | ZeRO-1 | ZeRO-3 + offload |
85
+ | ZeRO-2 | ZeRO-3 |
86
+ | ZeRO-2 + offload | ZeRO-2 + offload |
87
+ | ZeRO-3 | ZeRO-2 |
88
+ | ZeRO-3 + offload | ZeRO-1 |
89
+
90
+ Decide the type of performance you're optimizing for, speed or memory, and then work backwards to discover the best ZeRO stage for your use case. For example, if you're optimizing for speed, start with the fastest ZeRO stage and if you run out of memory, try the next stage which is slower but more memory efficient.
91
+
92
+ ## Config file
93
+
94
+ Once you've decided on a ZeRO stage, set up a config file to enable DeepSpeed with [`Trainer`]. The config file contains all the parameters for how to configure and set up your training. When the training script is executed, DeepSpeed logs the configuration from [`Trainer`] to the console so you can see exactly what's being used.
95
+
96
+ > [!TIP]
97
+ > Find a complete list of DeepSpeed configuration options on the [DeepSpeed Configuration JSON](https://www.deepspeed.ai/docs/config-json/) reference. There are also practical examples of various DeepSpeed configuration examples in the [DeepSpeedExamples](https://github.com/microsoft/DeepSpeedExamples) main [DeepSpeed](https://github.com/microsoft/DeepSpeed) repository. Run the command below to quickly find specific examples.
98
+ >
99
+ > ```bash
100
+ > git clone https://github.com/microsoft/DeepSpeedExamples
101
+ > cd DeepSpeedExamples
102
+ > find . -name '*json'
103
+ > # find examples with the Lamb optimizer
104
+ > grep -i Lamb $(find . -name '*json')
105
+ > ```
106
+
107
+ The config file is passed as a path to a JSON file if you're training from the command line interface or as a nested dict object if you're using [`Trainer`] in a notebook.
108
+
109
+ <hfoptions id="pass-config">
110
+ <hfoption id="path to file">
111
+
112
+ ```py
113
+ TrainingArguments(
114
+ deepspeed="path/to/deepspeed_config.json",
115
+ ...,
116
+ )
117
+ ```
118
+
119
+ </hfoption>
120
+ <hfoption id="nested dict">
121
+
122
+ ```py
123
+ ds_config_dict = dict(scheduler=scheduler_params, optimizer=optimizer_params)
124
+ args = TrainingArguments(
125
+ deepspeed=ds_config_dict,
126
+ ...,
127
+ )
128
+ trainer = Trainer(
129
+ model,
130
+ args,
131
+ ...,
132
+ )
133
+ ```
134
+
135
+ </hfoption>
136
+ </hfoptions>
137
+
138
+ ### DeepSpeed versus Trainer parameters
139
+
140
+ There are three types of config parameters.
141
+
142
+ 1. Some config parameters are shared by DeepSpeed and [`Trainer`] making it difficult to identify errors when there are conflicting definitions. In this case, configure these parameters from the [`Trainer`] command line arguments.
143
+ 1. Some config parameters are automatically derived from the model configuration and don't need to be manually configured. [`Trainer`] uses the config value `auto` to set the most correct or efficient option. You could define these parameters explicitly, but you must take care to ensure the [`Trainer`] and DeepSpeed config parameters match. Mismatches may cause training to fail in very difficult to detect ways.
144
+ 1. Some config parameters are specific to DeepSpeed and should be manually set based on your training requirements.
145
+
146
+ There are two ways to modify the config parameters.
147
+
148
+ > [!TIP]
149
+ > Some values, such as `scheduler.params.total_num_steps`, are calculated by [`Trainer`] during training.
150
+
151
+ 1. Create or load a DeepSpeed config to use as the main config.
152
+ 1. Create a [`TrainingArguments`] object based on the DeepSpeed config values, as sketched in the example below.
153
+
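+ For example, a minimal sketch of building [`TrainingArguments`] from an existing DeepSpeed config could look like the following. The file name is a placeholder, and it assumes the config defines a concrete `train_micro_batch_size_per_gpu` value rather than `auto`.
+
+ ```py
+ import json
+
+ from transformers import TrainingArguments
+
+ # load the DeepSpeed config and treat it as the main source of truth
+ with open("ds_config_zero2.json") as f:
+     ds_config = json.load(f)
+
+ # reuse values from the DeepSpeed config so both definitions stay in sync
+ args = TrainingArguments(
+     output_dir="output",
+     deepspeed=ds_config,
+     per_device_train_batch_size=ds_config["train_micro_batch_size_per_gpu"],
+ )
+ ```
+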
154
+ ### ZeRO stage
155
+
156
+ Each ZeRO stage config is defined in `zero_optimization`.
157
+
158
+ For a more detailed explanation of each parameter, refer to the [DeepSpeed Configuration JSON](https://www.deepspeed.ai/docs/config-json/) reference. These parameters must be set up with DeepSpeed because [`Trainer`] doesn't provide equivalent command line arguments.
159
+
160
+ > [!WARNING]
161
+ > DeepSpeed doesn't validate parameter names and any typos will fall back on the parameter's default setting. Observe the DeepSpeed engine startup log messages to see what values are being used.
162
+
163
+ <hfoptions id="zero-config">
164
+ <hfoption id="ZeRO-1">
165
+
166
+ ZeRO-1 shards the optimizer states across GPUs and you can expect a small speed up.
167
+
168
+ ```json
169
+ {
170
+ "zero_optimization": {
171
+ "stage": 1
172
+ }
173
+ }
174
+ ```
175
+
176
+ </hfoption>
177
+ <hfoption id="ZeRO-2">
178
+
179
+ ZeRO-2 shards the optimizer and gradient states across GPUs. This stage is primarily used for training since its features are not relevant to inference. Some important parameters to configure for better performance include the following.
180
+
181
+ * `offload_optimizer` should be enabled to reduce GPU memory usage.
182
+ * `overlap_comm`, when set to `true`, uses more GPU memory in exchange for lower allreduce latency. This feature uses 4.5x the `allgather_bucket_size` and `reduce_bucket_size` values. In this example, they're set to `5e8`, which means it requires 9GB of GPU memory. If your GPU has 8GB or less memory, reduce the bucket sizes or disable `overlap_comm` to lower the memory requirements and prevent an out-of-memory (OOM) error.
183
+ * `allgather_bucket_size` and `reduce_bucket_size` trade off available GPU memory for communication speed. The smaller their values, the slower communication is and the more GPU memory is available. You can balance, for example, whether a bigger batch size is more important than a slightly slower training time.
184
+ * `round_robin_gradients` is available in DeepSpeed 0.4.4 for CPU offloading. It parallelizes gradient copying to CPU memory among ranks by fine-grained gradient partitioning. Performance benefit grows with gradient accumulation steps (more copying between optimizer steps) or GPU count (increased parallelism).
185
+
186
+ ```json
187
+ {
188
+ "zero_optimization": {
189
+ "stage": 2,
190
+ "offload_optimizer": {
191
+ "device": "cpu",
192
+ "pin_memory": true
193
+ },
194
+ "allgather_partitions": true,
195
+ "allgather_bucket_size": 5e8,
196
+ "overlap_comm": true,
197
+ "reduce_scatter": true,
198
+ "reduce_bucket_size": 5e8,
199
+         "contiguous_gradients": true,
200
+ "round_robin_gradients": true
201
+ }
202
+ }
203
+ ```
204
+
205
+ </hfoption>
206
+ <hfoption id="ZeRO-3">
207
+
208
+ ZeRO-3 shards the optimizer states, gradients, and parameters across GPUs. Unlike ZeRO-2, ZeRO-3 can also be used for inference in addition to training because it loads large models onto multiple GPUs. Some important parameters to configure include the following.
209
+
210
+ * `device: "cpu"` can help if you're running out of GPU memory and if you have free CPU memory available. This offloads model parameters to the CPU.
211
+ * `pin_memory: true` can improve throughput, but less memory becomes available for other processes because the pinned memory is reserved for the specific process that requested it and it's typically accessed much faster than normal CPU memory.
212
+ * `stage3_max_live_parameters` is the upper limit on how many full parameters to keep on the GPU at any given time. Reduce this value if you encounter an OOM error.
213
+ * `stage3_max_reuse_distance` is a value for determining when a parameter is used again in the future, and it helps decide whether to throw the parameter away or to keep it. If the parameter is going to be reused (if the value is less than `stage3_max_reuse_distance`), then it is kept to reduce communication overhead. This is helpful when activation checkpointing is enabled and you want to keep the parameter in the forward recompute until the backward pass. Reduce this value if you encounter an OOM error.
214
+ * `stage3_gather_16bit_weights_on_model_save` consolidates fp16 weights when a model is saved. For large models and multiple GPUs, this is expensive in terms of memory and speed. You should enable it if you're planning on resuming training.
215
+ * `sub_group_size` controls which parameters are updated during the optimizer step. Parameters are grouped into buckets of `sub_group_size` and each bucket is updated one at a time. When used with NVMe offload, `sub_group_size` determines when model states are moved in and out of CPU memory during the optimization step. This prevents running out of CPU memory for extremely large models. `sub_group_size` can be left to its default value if you aren't using NVMe offload, but you may want to change it if you:
216
+
217
+ 1. Run into an OOM error during the optimization step. In this case, reduce `sub_group_size` to reduce memory usage of the temporary buffers.
218
+ 2. Find that the optimizer step is taking a really long time. In this case, increase `sub_group_size` to improve bandwidth utilization as a result of the increased data buffers.
219
+
220
+ * `reduce_bucket_size`, `stage3_prefetch_bucket_size`, and `stage3_param_persistence_threshold` are dependent on a model's hidden size. It is recommended to set these values to `auto` and allow [`Trainer`] to automatically assign the values.
221
+
222
+ ```yml
223
+ {
224
+ "zero_optimization": {
225
+ "stage": 3,
226
+ "offload_optimizer": {
227
+ "device": "cpu",
228
+ "pin_memory": true
229
+ },
230
+ "offload_param": {
231
+ "device": "cpu",
232
+ "pin_memory": true
233
+ },
234
+ "overlap_comm": true,
235
+ "contiguous_gradients": true,
236
+ "sub_group_size": 1e9,
237
+ "reduce_bucket_size": "auto",
238
+ "stage3_prefetch_bucket_size": "auto",
239
+ "stage3_param_persistence_threshold": "auto",
240
+ "stage3_max_live_parameters": 1e9,
241
+ "stage3_max_reuse_distance": 1e9,
242
+ "stage3_gather_16bit_weights_on_model_save": true
243
+ }
244
+ }
245
+ ```
246
+
247
+ ### Initialize large models
248
+
249
+ With ZeRO-3, use the [deepspeed.zero.Init](https://deepspeed.readthedocs.io/en/latest/zero3.html#deepspeed.zero.Init) context manager to initialize a model faster.
250
+
251
+ ```py
252
+ from transformers import T5ForConditionalGeneration, T5Config
253
+ import deepspeed
254
+
255
+ with deepspeed.zero.Init():
256
+     config = T5Config.from_pretrained("google-t5/t5-small")
257
+     model = T5ForConditionalGeneration(config)
258
+ ```
259
+
260
+ The DeepSpeed config file needs to have `is_deepspeed_zero3_enabled: true` set up in [`TrainingArguments`] and it needs a ZeRO configuration enabled. The [`TrainingArguments`] object must be created **before** calling [`~PreTrainedModel.from_pretrained`].
261
+
262
+ > [!TIP]
263
+ > You'll need ZeRO-3 when the fp16 weights don't fit on a single GPU. But if you're able to load the fp16 weights, set `torch_dtype=torch.float16` in [`~PreTrainedModel.from_pretrained`].
264
+
265
+ ```py
266
+ from transformers import AutoModel, Trainer, TrainingArguments
267
+
268
+ training_args = TrainingArguments(..., deepspeed=ds_config)
269
+ model = AutoModel.from_pretrained("google-t5/t5-small")
270
+ trainer = Trainer(model=model, args=training_args, ...)
271
+ ```
272
+
273
+ When there are multiple GPUs, no single GPU has all the parameters unless it's the parameters of the currently executing layer. To access all parameters from all the layers at once, such as loading pretrained model weights in [`~PreTrainedModel.from_pretrained`], one layer is loaded at a time and immediately partitioned to all GPUs. For very large models, it isn't possible to load the weights onto one GPU and then distribute them across the other GPUs due to memory limitations.
274
+
275
+ If you encounter a model parameter weight that looks like `tensor([1.])`, or whose size is 1 instead of a larger multidimensional shape, it means the parameter is partitioned and what you see is a ZeRO-3 placeholder.
276
+
277
+ ```py
278
+ tensor([1.0], device="cuda:0", dtype=torch.float16, requires_grad=True)
279
+ ```
280
+
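+ If you need to access the full values of a partitioned parameter outside of the forward pass, one option is DeepSpeed's `GatheredParameters` context manager. The sketch below is only illustrative and assumes the `model` created in the example above; the choice of parameter is arbitrary.
+ 
+ ```py
+ import deepspeed
+ 
+ # temporarily gather the full (unpartitioned) tensor on every rank for read-only access
+ with deepspeed.zero.GatheredParameters(model.shared.weight, modifier_rank=None):
+     # inside the context the full tensor is available instead of the ZeRO-3 placeholder
+     print(model.shared.weight.shape)
+ ```
+ 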
281
+ > [!TIP]
282
+ > For more information about initializing large models with ZeRO-3 and accessing the parameters, take a look at the [Constructing Massive Models](https://deepspeed.readthedocs.io/en/latest/zero3.html#constructing-massive-models) and [Gathering Parameters](https://deepspeed.readthedocs.io/en/latest/zero3.html#gathering-parameters) guides.
283
+
284
+ </hfoption>
285
+ </hfoptions>
286
+
287
+ ### NVMe
288
+
289
+ [ZeRO-Infinity](https://hf.co/papers/2104.07857) offloads model states to the CPU and/or NVMe to save even more memory. Smart partitioning and tiling algorithms allow each GPU to send and receive very small amounts of data during offloading such that a modern NVMe can fit an even larger total memory pool than is available to your training process. ZeRO-Infinity requires ZeRO-3.
290
+
291
+ Depending on the CPU and NVMe memory available, you can offload both the [optimizer states](https://www.deepspeed.ai/docs/config-json/#optimizer-offloading) and [parameters](https://www.deepspeed.ai/docs/config-json/#parameter-offloading), just one of them, or none of them. Make sure the `nvme_path` points to an NVMe device, because while it still works with a regular hard drive or solid state drive, it'll be significantly slower. With a modern NVMe, you can expect peak transfer speeds of ~3.5GB/s for read operations and ~3GB/s for write operations.
292
+
293
+ Consider running a [benchmark](https://github.com/microsoft/DeepSpeed/issues/998) on your training setup to determine the optimal `aio` configuration.
294
+
295
+ The example ZeRO-3 and ZeRO-Infinity config below sets most of the parameter values to `auto`, but you can also configure these values manually.
296
+
297
+ ```yaml
298
+ {
299
+ "fp16": {
300
+ "enabled": "auto",
301
+ "loss_scale": 0,
302
+ "loss_scale_window": 1000,
303
+ "initial_scale_power": 16,
304
+ "hysteresis": 2,
305
+ "min_loss_scale": 1
306
+ },
307
+
308
+ "optimizer": {
309
+ "type": "AdamW",
310
+ "params": {
311
+ "lr": "auto",
312
+ "betas": "auto",
313
+ "eps": "auto",
314
+ "weight_decay": "auto"
315
+ }
316
+ },
317
+
318
+ "scheduler": {
319
+ "type": "WarmupLR",
320
+ "params": {
321
+ "warmup_min_lr": "auto",
322
+ "warmup_max_lr": "auto",
323
+ "warmup_num_steps": "auto"
324
+ }
325
+ },
326
+
327
+ "zero_optimization": {
328
+ "stage": 3,
329
+ "offload_optimizer": {
330
+ "device": "nvme",
331
+ "nvme_path": "/local_nvme",
332
+ "pin_memory": true,
333
+ "buffer_count": 4,
334
+ "fast_init": false
335
+ },
336
+ "offload_param": {
337
+ "device": "nvme",
338
+ "nvme_path": "/local_nvme",
339
+ "pin_memory": true,
340
+ "buffer_count": 5,
341
+ "buffer_size": 1e8,
342
+ "max_in_cpu": 1e9
343
+ },
344
+ "aio": {
345
+ "block_size": 262144,
346
+ "queue_depth": 32,
347
+ "thread_count": 1,
348
+ "single_submit": false,
349
+ "overlap_events": true
350
+ },
351
+ "overlap_comm": true,
352
+ "contiguous_gradients": true,
353
+ "sub_group_size": 1e9,
354
+ "reduce_bucket_size": "auto",
355
+ "stage3_prefetch_bucket_size": "auto",
356
+ "stage3_param_persistence_threshold": "auto",
357
+ "stage3_max_live_parameters": 1e9,
358
+ "stage3_max_reuse_distance": 1e9,
359
+ "stage3_gather_16bit_weights_on_model_save": true
360
+ },
361
+
362
+ "gradient_accumulation_steps": "auto",
363
+ "gradient_clipping": "auto",
364
+ "steps_per_print": 2000,
365
+ "train_batch_size": "auto",
366
+ "train_micro_batch_size_per_gpu": "auto",
367
+ "wall_clock_breakdown": false
368
+ }
369
+ ```
370
+
371
+ ## Training features
372
+
373
+ DeepSpeed supports many training features that can be configured in the config file. This section describes some of the most important features.
374
+
375
+ ### Gradient checkpointing
376
+
377
+ Gradient checkpointing saves memory by only storing *some* of the intermediate activations instead of storing *all* of them. It is useful for fitting larger models on the GPU without running out of memory, or for increasing the batch size for better performance, at the cost of slower training.
378
+
379
+ * For a Transformers model, set `model.gradient_checkpointing_enable()` or add `--gradient_checkpointing` in the [`TrainingArguments`] (see the sketch after this list).
380
+ * For a non-Transformers model, use the DeepSpeed [Activation Checkpointing API](https://deepspeed.readthedocs.io/en/latest/activation-checkpointing.html). Replacing Transformers modeling code and [torch.utils.checkpoint](https://pytorch.org/docs/stable/checkpoint.html) with the DeepSpeed API gives you more flexibility because you can offload the forward activations to the CPU memory instead of recalculating them.
381
+
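+ A minimal sketch of the first option with [`Trainer`]; the output directory and config path are placeholders.
+ 
+ ```py
+ from transformers import TrainingArguments
+ 
+ # enable gradient checkpointing through TrainingArguments
+ training_args = TrainingArguments(
+     output_dir="output_dir",
+     gradient_checkpointing=True,
+     deepspeed="ds_config.json",
+ )
+ 
+ # or enable it directly on an already loaded Transformers model
+ # model.gradient_checkpointing_enable()
+ ```
+ 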
382
+ ### Batch size
383
+
384
+ The batch size can be automatically configured or manually set. When you choose the `"auto"` option, [`Trainer`] sets `train_micro_batch_size_per_gpu` to the value of `per_device_train_batch_size` and `train_batch_size` to `world_size * per_device_train_batch_size * gradient_accumulation_steps`. For example, with 8 GPUs, `per_device_train_batch_size=4`, and `gradient_accumulation_steps=2`, the effective `train_batch_size` is 8 * 4 * 2 = 64.
385
+
386
+ ```yaml
387
+ {
388
+ "train_micro_batch_size_per_gpu": "auto",
389
+ "train_batch_size": "auto"
390
+ }
391
+ ```
392
+
393
+ ### Communication data type
394
+
395
+ A separate data type is used for communication collectives like reduction, gathering and scattering operations.
396
+
397
+ All gather and scatter operations are performed in the same data type the data is in. For example, if you're training in bf16, the data is also gathered in bf16 because gathering is a non-lossy operation.
398
+
399
+ Reduce operations are lossy, for example, when gradients are averaged across multiple GPUs. When the communication is done in fp16 or bf16, it's more likely to be lossy because adding multiple numbers in low precision isn't exact. This is especially the case with bf16 which has a lower precision than fp16. For this reason, fp16 is the default for reduction operations because the loss is minimal when averaging gradients.
400
+
401
+ Choose the communication data type by setting the `communication_data_type` parameter in the config file. For example, choosing fp32 adds a small amount of overhead but ensures the reduction operation is accumulated in fp32 and when it is ready, it's downcasted to whichever half-precision data type you're training in.
402
+
403
+ ```yaml
404
+ {
405
+ "communication_data_type": "fp32"
406
+ }
407
+ ```
408
+
409
+ ### Gradient accumulation
410
+
411
+ Gradient accumulation accumulates gradients over several mini-batches of data before updating parameters. It enables training with a larger *effective batch size* than would otherwise fit in GPU memory. Training speed is slower, but it's useful for overcoming memory constraints.
412
+
413
+ Gradient accumulation can be automatically configured or manually set. When you choose the `"auto"` option, [`Trainer`] sets it to the value of `gradient_accumulation_steps`.
414
+
415
+ ```yaml
416
+ {
417
+ "gradient_accumulation_steps": "auto"
418
+ }
419
+ ```
420
+
421
+ ### Gradient clipping
422
+
423
+ Gradient clipping is useful for preventing exploding gradients which can lead to instability during training. It sets a maximum threshold value and rescales the gradients if their norm exceeds the threshold.
424
+
425
+ Gradient clipping can be automatically configured or manually set. When you choose the `"auto"` option, [`Trainer`] sets it to the value of `max_grad_norm`.
426
+
427
+ ```yaml
428
+ {
429
+ "gradient_clipping": "auto"
430
+ }
431
+ ```
432
+
433
+ ### Mixed precision training
434
+
435
+ Mixed precision accelerates training speed by performing some calculations in half-precision, but it also maintains some calculations in full-precision to preserve accuracy. DeepSpeed supports fp32, fp16, and bf16 data types.
436
+
437
+ <hfoptions id="precision">
438
+ <hfoption id="fp32">
439
+
440
+ Train in fp32 if a model wasn't pretrained in mixed precision, because mixed precision may otherwise cause underflow or overflow errors. In this case, disable fp16, the default.
441
+
442
+ ```yaml
443
+ {
444
+ "fp16": {
445
+ "enabled": false
446
+ }
447
+ }
448
+ ```
449
+
450
+ For Ampere GPUs and PyTorch 1.7+, the more efficient [tf32](https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices) mode is automatically enabled for some operations but the results are still in fp32. Configure it in [`Trainer`] by setting `--tf32` to enable it, and `--tf32 0` or `--no_tf32` to disable it.
451
+
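+ A minimal sketch of enabling tf32 in a script instead of from the command line; the output directory is a placeholder.
+ 
+ ```py
+ from transformers import TrainingArguments
+ 
+ # explicitly enable TF32 mode for matmul and cuDNN operations on Ampere or newer GPUs
+ training_args = TrainingArguments(output_dir="output_dir", tf32=True)
+ ```
+ 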
452
+ </hfoption>
453
+ <hfoption id="fp16">
454
+
455
+ To configure AMP-like fp16 mixed precision, set up the config as shown below with `"auto"` or your own values. [`Trainer`] automatically enables or disables fp16 based on the value of `fp16_backend`, and the rest of the config can be set by you. fp16 is enabled from the command line when the following arguments are passed: `--fp16`, `--fp16_backend amp` or `--fp16_full_eval`.
456
+
457
+ ```yaml
458
+ {
459
+ "fp16": {
460
+ "enabled": "auto",
461
+ "loss_scale": 0,
462
+ "loss_scale_window": 1000,
463
+ "initial_scale_power": 16,
464
+ "hysteresis": 2,
465
+ "min_loss_scale": 1
466
+ }
467
+ }
468
+ ```
469
+
470
+ For additional DeepSpeed fp16 training options, take a look at the [FP16 Training Options](https://www.deepspeed.ai/docs/config-json/#fp16-training-options) reference.
471
+
472
+ To configure Apex-like fp16 mixed precision, set up the config as shown below with `"auto"` or your own values. [`Trainer`] automatically configures `amp` based on the values of `fp16_backend` and `fp16_opt_level`. It can also be enabled from the command line when the following arguments are passed: `--fp16`, `--fp16_backend apex` or `--fp16_opt_level O1`.
473
+
474
+ ```yaml
475
+ {
476
+ "amp": {
477
+ "enabled": "auto",
478
+ "opt_level": "auto"
479
+ }
480
+ }
481
+ ```
482
+
483
+ </hfoption>
484
+ <hfoption id="bf16">
485
+
486
+ > [!TIP]
487
+ > bf16 requires DeepSpeed 0.6.0.
488
+
489
+ bf16 has the same dynamic range as fp32, and doesn’t require loss scaling unlike fp16. However, if you use [gradient accumulation](#gradient-accumulation) with bf16, gradients are accumulated in bf16 which may not be desirable because the lower precision can lead to lossy accumulation.
490
+
491
+ bf16 can be set up in the config file or enabled from the command line when the following arguments are passed: `--bf16` or `--bf16_full_eval`.
492
+
493
+ ```yaml
494
+ {
495
+ "bf16": {
496
+ "enabled": "auto"
497
+ }
498
+ }
499
+ ```
500
+
501
+ </hfoption>
502
+ </hfoptions>
503
+
504
+ ### Optimizer and scheduler
505
+
506
+ DeepSpeed and Transformers optimizers and schedulers can be mixed and matched if `offload_optimizer` isn't enabled. When `offload_optimizer` is enabled, you can use a non-DeepSpeed optimizer (except for LAMB) as long as it has both a CPU and GPU implementation.
507
+
508
+ Set the optimizer and scheduler parameters for the config file from the command line to avoid hard-to-find errors. For example, if the learning rate is set to a different value in another place, you can override it from the command line.
509
+
510
+ <hfoptions id="opt-sched">
511
+ <hfoption id="optimizer">
512
+
513
+ DeepSpeed offers several [optimizers](https://www.deepspeed.ai/docs/config-json/#optimizer-parameters) (Adam, AdamW, OneBitAdam, and LAMB) but you can also import other optimizers from PyTorch. If you don't configure the optimizer in the config, [`Trainer`] automatically selects AdamW and either uses the supplied values or the default values for the following parameters from the command line: `lr`, `adam_beta1`, `adam_beta2`, `adam_epsilon`, `weight_decay`.
514
+
515
+ You can set the parameters to `"auto"` or manually input your own values.
516
+
517
+ ```yaml
518
+ {
519
+ "optimizer": {
520
+ "type": "AdamW",
521
+ "params": {
522
+ "lr": "auto",
523
+ "betas": "auto",
524
+ "eps": "auto",
525
+ "weight_decay": "auto"
526
+ }
527
+ }
528
+ }
529
+ ```
530
+
531
+ Use an unsupported optimizer by adding the following to the top level configuration.
532
+
533
+ ```yaml
534
+ {
535
+ "zero_allow_untested_optimizer": true
536
+ }
537
+ ```
538
+
539
+ Starting with DeepSpeed 0.8.3, if you also want to use offload with an unsupported optimizer, you need to add the following to the top-level configuration because offload works best with DeepSpeed's CPU Adam optimizer.
540
+
541
+ ```yaml
542
+ {
543
+ "zero_force_ds_cpu_optimizer": false
544
+ }
545
+ ```
546
+
547
+ </hfoption>
548
+ <hfoption id="scheduler">
549
+
550
+ DeepSpeed supports the LRRangeTest, OneCycle, WarmupLR and WarmupDecayLR learning rate [schedulers](https://www.deepspeed.ai/docs/config-json/#scheduler-parameters).
551
+
552
+ Transformers and DeepSpeed provide two of the same schedulers:
553
+
554
+ * WarmupLR is the same as `--lr_scheduler_type constant_with_warmup` in Transformers.
555
+ * WarmupDecayLR is the same as `--lr_scheduler_type linear` in Transformers (this is the default scheduler used in Transformers).
556
+
557
+ If you don't configure the scheduler in the config file, [`Trainer`] automatically selects WarmupDecayLR and either uses the supplied values or the default values for the following parameters from the command line: `warmup_min_lr`, `warmup_max_lr`, `warmup_num_steps`, `total_num_steps` (automatically calculated during run time if `max_steps` is not provided).
558
+
559
+ You can set the parameters to `"auto"` or manually input your own values.
560
+
561
+ ```yaml
562
+ {
563
+ "scheduler": {
564
+ "type": "WarmupDecayLR",
565
+ "params": {
566
+ "total_num_steps": "auto",
567
+ "warmup_min_lr": "auto",
568
+ "warmup_max_lr": "auto",
569
+ "warmup_num_steps": "auto"
570
+ }
571
+ }
572
+ }
573
+ ```
574
+
575
+ </hfoption>
576
+ </hfoptions>
577
+
578
+ ### Universal checkpointing
579
+
580
+ [Universal Checkpointing](https://www.deepspeed.ai/tutorials/universal-checkpointing) saves and loads model, optimizer, and training scheduler states across different model architectures, parallelism techniques, and training configurations. Saving them in a universal format makes it easier to continue training and fine-tune.
581
+
582
+ Resume training with a Universal checkpoint by setting `load_universal` to `true` in the config file.
583
+
584
+ ```yaml
585
+ {
586
+ "checkpoint": {
587
+ "load_universal": true
588
+ }
589
+ }
590
+ ```
591
+
592
+ ## Deploy
593
+
594
+ DeepSpeed can be deployed with its native launcher, [torchrun](https://pytorch.org/docs/stable/elastic/run.html), or [Accelerate](https://huggingface.co/docs/accelerate/basic_tutorials/launch#using-accelerate-launch).
595
+
596
+ Add the `--deepspeed ds_config.json` argument to [`Trainer`] in the command line. It is recommended to use DeepSpeed's [add_config_arguments](https://deepspeed.readthedocs.io/en/latest/initialize.html#argument-parsing) utility to add any other command line arguments to your code.
597
+
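+ A minimal sketch of adding DeepSpeed's arguments to your own argument parser; the script-specific arguments are placeholders.
+ 
+ ```py
+ import argparse
+ 
+ import deepspeed
+ 
+ parser = argparse.ArgumentParser(description="my training script")
+ # the launcher passes --local_rank to each process
+ parser.add_argument("--local_rank", type=int, default=-1)
+ # adds --deepspeed, --deepspeed_config, and related launcher arguments to the parser
+ parser = deepspeed.add_config_arguments(parser)
+ args = parser.parse_args()
+ ```
+ 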
598
+ <hfoptions id="deploy">
599
+ <hfoption id="multi-GPU">
600
+
601
+ To deploy DeepSpeed on multiple GPUs, add `--num_gpus`. You don't need to add `--num_gpus` if you're planning on using all available GPUs.
602
+
603
+ ```bash
604
+ deepspeed --num_gpus=2 examples/pytorch/translation/run_translation.py \
605
+ --deepspeed tests/deepspeed/ds_config_zero3.json \
606
+ --model_name_or_path google-t5/t5-small --per_device_train_batch_size 1 \
607
+ --output_dir output_dir --overwrite_output_dir --fp16 \
608
+ --do_train --max_train_samples 500 --num_train_epochs 1 \
609
+ --dataset_name wmt16 --dataset_config "ro-en" \
610
+ --source_lang en --target_lang ro
611
+ ```
612
+
613
+ </hfoption>
614
+ <hfoption id="single-GPU">
615
+
616
+ DeepSpeed is still useful with just one GPU because you can:
617
+
618
+ 1. Offload some computations and memory to the CPU to make more GPU resources available to your model to use a larger batch size or fit a very large model that normally won't fit.
619
+ 2. Minimize memory fragmentation with its smart GPU memory management system which also allows you to fit bigger models and data batches.
620
+
621
+ To deploy DeepSpeed on a single GPU, add `--num_gpus=1`. You don't need to add `--num_gpus` if you only have one GPU because DeepSpeed deploys all GPUs it can see on a given node.
622
+
623
+ > [!TIP]
624
+ > Set the `allgather_bucket_size` and `reduce_bucket_size` values to 2e8 in the [ZeRO-2](#zero-configuration) configuration file to get better performance on a single GPU.
625
+
626
+ ```bash
627
+ deepspeed --num_gpus=1 examples/pytorch/translation/run_translation.py \
628
+ --deepspeed tests/deepspeed/ds_config_zero2.json \
629
+ --model_name_or_path google-t5/t5-small --per_device_train_batch_size 1 \
630
+ --output_dir output_dir --overwrite_output_dir --fp16 \
631
+ --do_train --max_train_samples 500 --num_train_epochs 1 \
632
+ --dataset_name wmt16 --dataset_config "ro-en" \
633
+ --source_lang en --target_lang ro
634
+ ```
635
+
636
+ </hfoption>
637
+ </hfoptions>
638
+
639
+ ### Multi-node
640
+
641
+ A multi-node setup consists of multiple nodes, where each node has one or more GPUs running a workload. DeepSpeed expects a shared storage system, but if this is not the case, you need to adjust the config file to include a [checkpoint](https://www.deepspeed.ai/docs/config-json/#checkpoint-options) to allow loading without access to a shared filesystem.
642
+
643
+ ```yaml
644
+ {
645
+ "checkpoint": {
646
+ "use_node_local_storage": true
647
+ }
648
+ }
649
+ ```
650
+
651
+ You could also use the `--save_on_each_node` parameter in [`TrainingArguments`] to automatically add the above `checkpoint` to your config.
652
+
653
+ The examples below for the torchrun and DeepSpeed launchers show how to deploy two nodes with eight GPUs each. Access the first node with `ssh hostname1` and the second node with `ssh hostname2`. Both nodes must be able to communicate with each other locally over ssh without a password.
654
+
655
+ <hfoptions id="multinode">
656
+ <hfoption id="torchrun">
657
+
658
+ With [torchrun](https://pytorch.org/docs/stable/elastic/run.html), ssh to each node and run the following command on both of them, setting `--node_rank` to `0` on the first node and `1` on the second. The launcher waits until both nodes are synchronized before launching the training.
659
+
660
+ ```bash
661
+ torchrun --nproc_per_node=8 --nnodes=2 --node_rank=0 --master_addr=hostname1 \
662
+ --master_port=9901 your_program.py <normal cl args> --deepspeed ds_config.json
663
+ ```
664
+
665
+ </hfoption>
666
+ <hfoption id="DeepSpeed">
667
+
668
+ Create a `hostfile` for the DeepSpeed launcher.
669
+
670
+ ```bash
671
+ hostname1 slots=8
672
+ hostname2 slots=8
673
+ ```
674
+
675
+ The DeepSpeed launcher automatically launches training on both nodes at once with the command below.
676
+
677
+ ```bash
678
+ deepspeed --num_gpus 8 --num_nodes 2 --hostfile hostfile --master_addr hostname1 --master_port=9901 \
679
+ your_program.py <normal cl args> --deepspeed ds_config.json
680
+ ```
681
+
682
+ Check out the [Resource Configuration (multi-node)](https://www.deepspeed.ai/getting-started/#resource-configuration-multi-node) guide for more details about configuring multi-node compute resources.
683
+
684
+ </hfoption>
685
+ </hfoptions>
686
+
687
+ ### Slurm
688
+
689
+ [Slurm](https://slurm.schedmd.com/documentation.html) is a cluster management and job scheduling system. An example Slurm script is shown below.
690
+
691
+ ```bash
692
+ #SBATCH --job-name=test-nodes # name
693
+ #SBATCH --nodes=2 # nodes
694
+ #SBATCH --ntasks-per-node=1 # crucial - only 1 task per dist per node!
695
+ #SBATCH --cpus-per-task=10 # number of cores per task
696
+ #SBATCH --gres=gpu:8 # number of gpus
697
+ #SBATCH --time 20:00:00 # maximum execution time (HH:MM:SS)
698
+ #SBATCH --output=%x-%j.out # output file name
699
+
700
+ export GPUS_PER_NODE=8
701
+ export MASTER_ADDR=$(scontrol show hostnames $SLURM_JOB_NODELIST | head -n 1)
702
+ export MASTER_PORT=9901
703
+
704
+ srun --jobid $SLURM_JOBID bash -c 'python -m torch.distributed.run \
705
+ --nproc_per_node $GPUS_PER_NODE --nnodes $SLURM_NNODES --node_rank $SLURM_PROCID \
706
+ --master_addr $MASTER_ADDR --master_port $MASTER_PORT \
707
+ your_program.py <normal cl args> --deepspeed ds_config.json'
708
+ ```
709
+
710
+ Launch training simultaneously on all nodes with the command below.
711
+
712
+ ```bash
713
+ sbatch launch.slurm
714
+ ```
715
+
716
+ ### Jupyter Notebook
717
+
718
+ To use DeepSpeed in a Jupyter Notebook, you need to emulate a distributed environment because the launcher doesn't support deployment from a notebook. This is only supported for one GPU. To use multiple GPUs, you must use a multi-process environment, which means you have to use the DeepSpeed launcher which can't be emulated as shown here.
719
+
720
+ ```py
721
+ # emulate a launcher in the notebook
722
+ import os
723
+
724
+ os.environ["MASTER_ADDR"] = "localhost"
725
+ os.environ["MASTER_PORT"] = "9994" # modify if RuntimeError: Address already in use
726
+ os.environ["RANK"] = "0"
727
+ os.environ["LOCAL_RANK"] = "0"
728
+ os.environ["WORLD_SIZE"] = "1"
729
+
730
+ training_args = TrainingArguments(..., deepspeed="ds_config_zero3.json")
731
+ trainer = Trainer(...)
732
+ trainer.train()
733
+ ```
734
+
735
+ Create a config file on the fly in the notebook in the current directory with a dedicated cell.
736
+
737
+ ```py
738
+ %%bash
739
+ cat <<'EOT' > ds_config_zero3.json
740
+ {
741
+ "fp16": {
742
+ "enabled": "auto",
743
+ "loss_scale": 0,
744
+ "loss_scale_window": 1000,
745
+ "initial_scale_power": 16,
746
+ "hysteresis": 2,
747
+ "min_loss_scale": 1
748
+ },
749
+
750
+ "optimizer": {
751
+ "type": "AdamW",
752
+ "params": {
753
+ "lr": "auto",
754
+ "betas": "auto",
755
+ "eps": "auto",
756
+ "weight_decay": "auto"
757
+ }
758
+ },
759
+
760
+ "scheduler": {
761
+ "type": "WarmupLR",
762
+ "params": {
763
+ "warmup_min_lr": "auto",
764
+ "warmup_max_lr": "auto",
765
+ "warmup_num_steps": "auto"
766
+ }
767
+ },
768
+
769
+ "zero_optimization": {
770
+ "stage": 3,
771
+ "offload_optimizer": {
772
+ "device": "cpu",
773
+ "pin_memory": true
774
+ },
775
+ "offload_param": {
776
+ "device": "cpu",
777
+ "pin_memory": true
778
+ },
779
+ "overlap_comm": true,
780
+ "contiguous_gradients": true,
781
+ "sub_group_size": 1e9,
782
+ "reduce_bucket_size": "auto",
783
+ "stage3_prefetch_bucket_size": "auto",
784
+ "stage3_param_persistence_threshold": "auto",
785
+ "stage3_max_live_parameters": 1e9,
786
+ "stage3_max_reuse_distance": 1e9,
787
+ "stage3_gather_16bit_weights_on_model_save": true
788
+ },
789
+
790
+ "gradient_accumulation_steps": "auto",
791
+ "gradient_clipping": "auto",
792
+ "steps_per_print": 2000,
793
+ "train_batch_size": "auto",
794
+ "train_micro_batch_size_per_gpu": "auto",
795
+ "wall_clock_breakdown": false
796
+ }
797
+ EOT
798
+ ```
799
+
800
+ If the training script is in a file and not a notebook cell, launch DeepSpeed from the shell in the notebook cell.
801
+
802
+ ```py
803
+ !git clone https://github.com/huggingface/transformers
804
+ !cd transformers; deepspeed examples/pytorch/translation/run_translation.py ...
805
+ ```
806
+
807
+ Another option is to use `%%bash` to run the shell program without emulating the distributed environment. However, you won't be able to view the logs until training is complete.
808
+
809
+ ```py
810
+ %%bash
811
+
812
+ git clone https://github.com/huggingface/transformers
813
+ cd transformers
814
+ deepspeed examples/pytorch/translation/run_translation.py ...
815
+ ```
816
+
817
+ ## Save model weights
818
+
819
+ DeepSpeed stores the main fp32 weights in custom checkpoint optimizer files (`global_step*/*optim_states.pt`) which are saved under the normal checkpoint.
820
+
821
+ ### fp16
822
+
823
+ ZeRO-2 saves the model weights in fp16. To save the weights in fp16 for ZeRO-3, set `"stage3_gather_16bit_weights_on_model_save": true` in the config file, because the weights are distributed across multiple GPUs.
824
+
825
+ If you don't, [`Trainer`] won't save the weights in fp16 and won't create a `pytorch_model.bin` file. This is because DeepSpeed's state_dict contains a placeholder instead of the real weights, so you won't be able to load it.
826
+
827
+ ```yaml
828
+ {
829
+ "zero_optimization": {
830
+ "stage": 3,
831
+ "stage3_gather_16bit_weights_on_model_save": true
832
+ }
833
+ }
834
+ ```
835
+
836
+ ### fp32
837
+
838
+ Unless you have a lot of free CPU memory, fp32 weights shouldn't be saved during training because it can require a lot of memory. It is usually best to save the fp32 weights offline after training is complete.
839
+
840
+ <hfoptions id="save">
841
+ <hfoption id="offline">
842
+
843
+ DeepSpeed provides a [zero_to_fp32.py](https://github.com/microsoft/DeepSpeed/blob/91829476a8fd4d0d9268c03c1d56795d20a51c12/deepspeed/utils/zero_to_fp32.py#L14) script in the top-level checkpoint folder for extracting weights at any point. This is a standalone script and you don't need a config file or [`Trainer`].
844
+
845
+ For example, if your checkpoint folder looks like the one shown below, then you can run the following command to create and consolidate the fp32 weights from multiple GPUs into a single `pytorch_model.bin` file. The script automatically discovers the subfolder `global_step1` which contains the checkpoint.
846
+
847
+ ```bash
848
+ $ ls -l output_dir/checkpoint-1/
849
+ -rw-rw-r-- 1 stas stas 1.4K Mar 27 20:42 config.json
850
+ drwxrwxr-x 2 stas stas 4.0K Mar 25 19:52 global_step1/
851
+ -rw-rw-r-- 1 stas stas 12 Mar 27 13:16 latest
852
+ -rw-rw-r-- 1 stas stas 827K Mar 27 20:42 optimizer.pt
853
+ -rw-rw-r-- 1 stas stas 231M Mar 27 20:42 pytorch_model.bin
854
+ -rw-rw-r-- 1 stas stas 623 Mar 27 20:42 scheduler.pt
855
+ -rw-rw-r-- 1 stas stas 1.8K Mar 27 20:42 special_tokens_map.json
856
+ -rw-rw-r-- 1 stas stas 774K Mar 27 20:42 spiece.model
857
+ -rw-rw-r-- 1 stas stas 1.9K Mar 27 20:42 tokenizer_config.json
858
+ -rw-rw-r-- 1 stas stas 339 Mar 27 20:42 trainer_state.json
859
+ -rw-rw-r-- 1 stas stas 2.3K Mar 27 20:42 training_args.bin
860
+ -rwxrw-r-- 1 stas stas 5.5K Mar 27 13:16 zero_to_fp32.py*
861
+ ```
862
+
863
+ > [!TIP]
864
+ > Run `python zero_to_fp32.py -h` for more usage details. The script requires roughly 2x the CPU RAM of the final fp32 weights.
865
+
866
+ ```bash
867
+ python zero_to_fp32.py . pytorch_model.bin
868
+ ```
869
+
870
+ </hfoption>
871
+ <hfoption id="online">
872
+
873
+ Adding the `--load_best_model_at_end` parameter in [`TrainingArguments`] tracks the best checkpoint so you can finish training first and save the final model explicitly. Reload the model as shown below.
874
+
875
+ > [!WARNING]
876
+ > Once [load_state_dict_from_zero_checkpoint](https://deepspeed.readthedocs.io/en/stable/model-checkpointing.html#deepspeed.utils.zero_to_fp32.load_state_dict_from_zero_checkpoint) is run, the model is no longer usable in DeepSpeed in the context of the same application. You'll need to reinitialize the DeepSpeed engine because `model.load_state_dict(state_dict)` removes all the DeepSpeed magic from it. Only use this function once training is complete.
877
+
878
+ ```py
879
+ import os
+ 
+ from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
880
+
881
+ checkpoint_dir = os.path.join(trainer.args.output_dir, "checkpoint-final")
882
+ trainer.deepspeed.save_checkpoint(checkpoint_dir)
883
+ fp32_model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
884
+ ```
885
+
886
+ You must have saved at least one checkpoint to load the latest checkpoint as shown in the example below.
887
+
888
+ ```py
889
+ from transformers.trainer_utils import get_last_checkpoint
890
+ from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
891
+
892
+ checkpoint_dir = get_last_checkpoint(trainer.args.output_dir)
893
+ fp32_model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
894
+ ```
895
+
896
+ Use `load_state_dict` to extract and load the state_dict of the fp32 weights.
897
+
898
+ ```py
899
+ from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
900
+
901
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir)
902
+ model = model.cpu()
903
+ model.load_state_dict(state_dict)
904
+ ```
905
+
906
+ </hfoption>
907
+ </hfoptions>
908
+
909
+ ## Non-Trainer integration
910
+
911
+ DeepSpeed also works with Transformers without [`Trainer`]. The [`~integrations.HfDeepSpeedConfig`] is responsible for gathering ZeRO-3 parameters and partitioning a model across multiple GPUs when [`~PreTrainedModel.from_pretrained`] is called.
912
+
913
+ You must instantiate [`~integrations.HfDeepSpeedConfig`] before loading a model to efficiently deploy ZeRO-3.
914
+
915
+ <hfoptions id="models">
916
+ <hfoption id="pretrained model">
917
+
918
+ ```py
919
+ from transformers.integrations import HfDeepSpeedConfig
920
+ from transformers import AutoModel
921
+ import deepspeed
922
+
923
+ # DeepSpeed config object or path to the file
924
+ ds_config = {...}
925
+ # must run before instantiating the model to detect ZeRO-3
926
+ dschf = HfDeepSpeedConfig(ds_config) # keep this object alive
927
+ model = AutoModel.from_pretrained("openai-community/gpt2")
928
+ engine = deepspeed.initialize(model=model, config_params=ds_config, ...)
929
+ ```
930
+
931
+ </hfoption>
932
+ <hfoption id="non-pretrained model">
933
+
934
+ [`~integrations.HfDeepSpeedConfig`] is not required for ZeRO-1 or ZeRO-2.
935
+
936
+ ```py
937
+ from transformers.integrations import HfDeepSpeedConfig
938
+ from transformers import AutoModel, AutoConfig
939
+ import deepspeed
940
+
941
+ # DeepSpeed config object or path to the file
942
+ ds_config = {...}
943
+ # must run before instantiating the model to detect zero 3
944
+ dschf = HfDeepSpeedConfig(ds_config) # keep this object alive
945
+ # randomly initialize model weights
946
+ config = AutoConfig.from_pretrained("openai-community/gpt2")
947
+ model = AutoModel.from_config(config)
948
+ engine = deepspeed.initialize(model=model, config_params=ds_config, ...)
949
+ ```
950
+
951
+ </hfoption>
952
+ </hfoptions>
953
+
954
+ ## Troubleshoot
955
+
956
+ One of the first things to check when you encounter an error is whether DeepSpeed is the cause (because often it isn't). Retry your setup without DeepSpeed, and if the error persists, report the issue. If the issue is unrelated to the Transformers integration, please open the issue on the DeepSpeed [repository](https://github.com/microsoft/DeepSpeed).
957
+
958
+ For issues related to the Transformers integration, please provide the following information.
959
+
960
+ * The full DeepSpeed config file.
961
+ * The command line arguments for [`Trainer`] or the [`TrainingArguments`] if you're scripting the [`Trainer`] setup yourself (don't dump the entire [`TrainingArguments`] which contains many irrelevant entries).
962
+ * The outputs of the following commands.
963
+
964
+ ```bash
965
+ python -c 'import torch; print(f"torch: {torch.__version__}")'
966
+ python -c 'import transformers; print(f"transformers: {transformers.__version__}")'
967
+ python -c 'import deepspeed; print(f"deepspeed: {deepspeed.__version__}")'
968
+ ```
969
+
970
+ * A link to a Google Colab notebook to reproduce the issue.
971
+ * A standard or non-custom dataset or an existing example to reproduce the issue.
972
+
973
+ The following sections provide a guide for resolving two of the most common issues.
974
+
975
+ ### Process killed at startup
976
+
977
+ When the DeepSpeed process is killed during launch without a traceback, that usually means the program tried to allocate more CPU memory than is available on your system. Or the process may have tried to allocate more CPU memory than allowed, leading the OS kernel to terminate the process.
978
+
979
+ In this case, check whether your config file has either `offload_optimizer`, `offload_param`, or both configured to offload to the CPU.
980
+
981
+ If you have NVMe and ZeRO-3 set up, experiment with offloading to the NVMe ([estimate](https://deepspeed.readthedocs.io/en/latest/memory.html) the memory requirements of a model first) instead.
982
+
983
+ ### NaN loss
984
+
985
+ NaN loss often occurs when a model is pretrained in bf16 and you try to use it with fp16 (especially relevant for TPU-trained models). To resolve this, use fp32 or bf16 if your hardware (TPUs, Ampere GPUs or newer) supports it.
986
+
987
+ It is also possible that fp16 is causing overflow. For example, if your config file looks like the one below, you may see the following overflow errors in the logs.
988
+
989
+ ```yaml
990
+ {
991
+ "fp16": {
992
+ "enabled": "auto",
993
+ "loss_scale": 0,
994
+ "loss_scale_window": 1000,
995
+ "initial_scale_power": 16,
996
+ "hysteresis": 2,
997
+ "min_loss_scale": 1
998
+ }
999
+ }
1000
+ ```
1001
+
1002
+ The `OVERFLOW!` error below is a result of the DeepSpeed loss scaler being unable to find a scaling coefficient to overcome the loss overflow. Try a higher `initial_scale_power` value in this case (32 usually works).
1003
+
1004
+ ```bash
1005
+ 0%| | 0/189 [00:00<?, ?it/s]
1006
+ [deepscale] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 262144, reducing to 262144
1007
+ 1%|▌ | 1/189 [00:00<01:26, 2.17it/s]
1008
+ [deepscale] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 262144, reducing to 131072.0
1009
+ 1%|█▏
1010
+ [...]
1011
+ [deepscale] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 1, reducing to 1
1012
+ 14%|████████████████▌ | 27/189 [00:14<01:13, 2.21it/s]
1013
+ [deepscale] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 1, reducing to 1
1014
+ 15%|█████████████████▏ | 28/189 [00:14<01:13, 2.18it/s]
1015
+ [deepscale] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 1, reducing to 1
1016
+ 15%|█████████████████▊ | 29/189 [00:15<01:13, 2.18it/s]
1017
+ [deepscale] OVERFLOW! Rank 0 Skipping step. Attempted loss scale: 1, reducing to 1
1018
+ [...]
1019
+ ```
1020
+
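+ As a rough sketch, you could raise `initial_scale_power` in the config you pass to [`TrainingArguments`]; the rest of the config and the output directory are placeholders.
+ 
+ ```py
+ from transformers import TrainingArguments
+ 
+ # give the loss scaler more headroom by starting from a larger scale (2**32)
+ ds_config = {
+     "fp16": {
+         "enabled": "auto",
+         "loss_scale": 0,
+         "initial_scale_power": 32,
+     },
+     # ... the rest of your ZeRO configuration ...
+ }
+ training_args = TrainingArguments(output_dir="output_dir", fp16=True, deepspeed=ds_config)
+ ```
+ 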
1021
+ ## Resources
1022
+
1023
+ DeepSpeed is a powerful technology for scaling large model training. To learn more about DeepSpeed, take a look at their [blog posts](https://www.microsoft.com/en-us/research/search/?q=deepspeed), [documentation](https://www.deepspeed.ai/getting-started/), and [GitHub](https://github.com/microsoft/deepspeed).
1024
+
1025
+ The papers below provide additional details about ZeRO.
1026
+
1027
+ * [ZeRO: Memory Optimizations Toward Training Trillion Parameter Models](https://hf.co/papers/1910.02054)
1028
+ * [ZeRO-Offload: Democratizing Billion-Scale Model Training](https://hf.co/papers/2101.06840)
1029
+ * [ZeRO-Infinity: Breaking the GPU Memory Wall for Extreme Scale Deep Learning](https://hf.co/papers/2104.07857)
transformers/docs/source/en/executorch.md ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!--Copyright 2025 The HuggingFace Team. All rights reserved.
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
4
+ the License. You may obtain a copy of the License at
5
+
6
+ http://www.apache.org/licenses/LICENSE-2.0
7
+
8
+ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
9
+ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
10
+ specific language governing permissions and limitations under the License.
11
+
12
+ ⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
13
+ rendered properly in your Markdown viewer.
14
+
15
+ -->
16
+
17
+ # ExecuTorch
18
+
19
+ [ExecuTorch](https://pytorch.org/executorch/stable/index.html) is a platform that enables PyTorch training and inference programs to be run on mobile and edge devices. It is powered by [torch.compile](https://pytorch.org/docs/stable/torch.compiler.html) and [torch.export](https://pytorch.org/docs/main/export.html) for performance and deployment.
20
+
21
+ You can use ExecuTorch with Transformers with [torch.export](https://pytorch.org/docs/main/export.html). The [`~transformers.convert_and_export_with_cache`] method converts a [`PreTrainedModel`] into an exportable module. Under the hood, it uses [torch.export](https://pytorch.org/docs/main/export.html) to export the model, ensuring compatibility with ExecuTorch.
22
+
23
+ ```py
24
+ import torch
25
+ from transformers import LlamaForCausalLM, AutoTokenizer, GenerationConfig
26
+ from transformers.integrations.executorch import (
27
+ TorchExportableModuleWithStaticCache,
28
+ convert_and_export_with_cache
29
+ )
30
+
31
+ generation_config = GenerationConfig(
32
+ use_cache=True,
33
+ cache_implementation="static",
34
+ cache_config={
35
+ "batch_size": 1,
36
+ "max_cache_len": 20,
37
+ }
38
+ )
39
+
40
+ tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B", pad_token="</s>", padding_side="right")
41
+ model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B", device_map="auto", torch_dtype=torch.bfloat16, attn_implementation="sdpa", generation_config=generation_config)
42
+
43
+ exported_program = convert_and_export_with_cache(model)
44
+ ```
45
+
46
+ The exported PyTorch model is now ready to be used with ExecuTorch. Use [`~transformers.TorchExportableModuleWithStaticCache`] to generate text from the exported program.
47
+
48
+ ```py
49
+ prompts = ["Simply put, the theory of relativity states that "]
50
+ prompt_tokens = tokenizer(prompts, return_tensors="pt", padding=True).to(model.device)
51
+ prompt_token_ids = prompt_tokens["input_ids"]
52
+
53
+ generated_ids = TorchExportableModuleWithStaticCache.generate(
54
+ exported_program=exported_program, prompt_token_ids=prompt_token_ids, max_new_tokens=20,
55
+ )
56
+ generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
57
+ print(generated_text)
58
+ ['Simply put, the theory of relativity states that 1) the speed of light is the']
59
+ ```
transformers/docs/source/en/fast_tokenizers.md ADDED
@@ -0,0 +1,362 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!--Copyright 2024 The HuggingFace Team. All rights reserved.
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
4
+ the License. You may obtain a copy of the License at
5
+
6
+ http://www.apache.org/licenses/LICENSE-2.0
7
+
8
+ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
9
+ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
10
+ specific language governing permissions and limitations under the License.
11
+
12
+ ⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
13
+ rendered properly in your Markdown viewer.
14
+
15
+ -->
16
+
17
+ # Tokenizers
18
+
19
+ Tokenizers convert text into an array of numbers known as tensors, the inputs to a text model. There are several tokenizer algorithms, but they all share the same purpose: split text into smaller words or subwords (tokens) according to some rules, and convert them into numbers (input ids). A Transformers tokenizer also returns an attention mask to indicate which tokens should be attended to.
20
+
21
+ > [!TIP]
22
+ > Learn about the most popular tokenization algorithms on the [Summary of the tokenizers](./tokenizer_summary) doc.
23
+
24
+ Call [`~PreTrainedTokenizer.from_pretrained`] to load a tokenizer and its configuration from the Hugging Face [Hub](https://hf.co) or a local directory. The pretrained tokenizer is saved in a [tokenizer.model](https://huggingface.co/google/gemma-2-2b/blob/main/tokenizer.model) file with all its associated vocabulary files.
25
+
26
+ Pass a string of text to the tokenizer to return the input ids and attention mask, and set the framework tensor type to return with the `return_tensors` parameter.
27
+
28
+ ```py
29
+ from transformers import AutoTokenizer
30
+
31
+ tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b")
32
+ tokenizer("We are very happy to show you the 🤗 Transformers library", return_tensors="pt")
33
+ {'input_ids': tensor([[ 2, 1734, 708, 1508, 4915, 577, 1500, 692, 573,
34
+ 156808, 128149, 9581, 235265]]),
35
+ 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
36
+ }
37
+ ```
38
+
39
+ Whichever tokenizer you use, make sure the tokenizer vocabulary is the same as the pretrained model's tokenizer vocabulary. This is especially important if you're using a custom tokenizer with a different vocabulary from the pretrained model's tokenizer.
40
+
41
+ This guide provides a brief overview of the tokenizer classes and how to preprocess text with them.
42
+
43
+ ## Tokenizer classes
44
+
45
+ All tokenizers inherit from a [`PreTrainedTokenizerBase`] class that provides common methods for all tokenizers like [`~PreTrainedTokenizerBase.from_pretrained`] and [`~PreTrainedTokenizerBase.batch_decode`]. There are two main tokenizer classes that build on top of the base class.
46
+
47
+ - [`PreTrainedTokenizer`] is a Python implementation, for example [`LlamaTokenizer`].
48
+ - [`PreTrainedTokenizerFast`] is a fast Rust-based implementation from the [Tokenizers](https://hf.co/docs/tokenizers/index) library, for example [`LlamaTokenizerFast`].
49
+
50
+ There are two ways you can load a tokenizer, with [`AutoTokenizer`] or a model-specific tokenizer.
51
+
52
+ <hfoptions id="tokenizer-classes">
53
+ <hfoption id="AutoTokenizer">
54
+
55
+ The [AutoClass](./model_doc/auto) API is a fast and easy way to load a tokenizer without needing to know whether a Python or Rust-based implementation is available. By default, [`AutoTokenizer`] tries to load a fast tokenizer if it's available, otherwise, it loads the Python implementation.
56
+
57
+ Use [`~PreTrainedTokenizer.from_pretrained`] to load a tokenizer.
58
+
59
+ ```py
60
+ from transformers import AutoTokenizer
61
+
62
+ tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b")
63
+ tokenizer("We are very happy to show you the 🤗 Transformers library.", return_tensors="pt")
64
+ {'input_ids': tensor([[ 2, 1734, 708, 1508, 4915, 577, 1500, 692, 573,
65
+ 156808, 128149, 9581, 235265]]),
66
+ 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
67
+ }
68
+ ```
69
+
70
+ Load your own tokenizer by passing its vocabulary file to [`~AutoTokenizer.from_pretrained`].
71
+
72
+ ```py
73
+ from transformers import AutoTokenizer
74
+
75
+ tokenizer = AutoTokenizer.from_pretrained("./model_directory/my_vocab_file.txt")
76
+ ```
77
+
78
+ </hfoption>
79
+ <hfoption id="model-specific tokenizer">
80
+
81
+ Each pretrained model is associated with a tokenizer and the specific vocabulary it was trained on. A tokenizer can be loaded directly from the model-specific class.
82
+
83
+ > [!TIP]
84
+ > Refer to a model's API documentation to check whether a fast tokenizer is supported.
85
+
86
+ ```py
87
+ from transformers import GemmaTokenizer
88
+
89
+ tokenizer = GemmaTokenizer.from_pretrained("google/gemma-2-2b")
90
+ tokenizer("We are very happy to show you the 🤗 Transformers library.", return_tensors="pt")
91
+ ```
92
+
93
+ To load a fast tokenizer, use the fast implementation class.
94
+
95
+ ```py
96
+ from transformers import GemmaTokenizerFast
97
+
98
+ tokenizer = GemmaTokenizerFast.from_pretrained("google/gemma-2-2b")
99
+ tokenizer("We are very happy to show you the 🤗 Transformers library.", return_tensors="pt")
100
+ ```
101
+
102
+ Load your own tokenizer by passing its vocabulary file to the `vocab_file` parameter.
103
+
104
+ ```py
105
+ from transformers import GemmaTokenizerFast
106
+
107
+ tokenizer = GemmaTokenizerFast(vocab_file="my_vocab_file.txt")
108
+ ```
109
+
110
+ </hfoption>
111
+ </hfoptions>
112
+
113
+ ## Multimodal tokenizers
114
+
115
+ In addition to text tokens, multimodal tokenizers also hold tokens from other modalities as part of their attributes for easy access.
116
+
117
+ To add these special tokens to a tokenizer, pass them as a dictionary to the `extra_special_tokens` parameter in [`~AutoTokenizer.from_pretrained`]. The example below adds the `image_token` to a vision-language model.
118
+
119
+ Save the tokenizer so you can reuse it with direct access to the `image_token`, `boi_token`, and `eoi_token`.
120
+
121
+ ```py
122
+ vision_tokenizer = AutoTokenizer.from_pretrained(
123
+ "llava-hf/llava-1.5-7b-hf",
124
+ extra_special_tokens={"image_token": "<image>", "boi_token": "<image_start>", "eoi_token": "<image_end>"}
125
+ )
126
+ print(vision_tokenizer.image_token, vision_tokenizer.image_token_id)
127
+ ("<image>", 32000)
128
+
129
+ vision_tokenizer.save_pretrained("./path/to/tokenizer")
130
+ ```
131
+
132
+ ## Fast tokenizers
133
+
134
+ <Youtube id="3umI3tm27Vw"/>
135
+
136
+ [`PreTrainedTokenizerFast`] or *fast tokenizers* are Rust-based tokenizers from the [Tokenizers](https://hf.co/docs/tokenizers) library. They are significantly faster at batched tokenization and provide additional alignment methods compared to the Python-based tokenizers.
137
+
138
+ [`AutoTokenizer`] automatically loads a fast tokenizer if it's supported. Otherwise, you need to explicitly load the fast tokenizer.
139
+
140
+ This section will show you how to train a fast tokenizer and reuse it in Transformers.
141
+
142
+ To train a Byte-Pair Encoding (BPE) tokenizer, create a [`~tokenizers.Tokenizer`] and [`~tokenizers.trainers.BpeTrainer`] class and define the unknown token and special tokens.
143
+
144
+ ```py
145
+ from tokenizers import Tokenizer
146
+ from tokenizers.models import BPE
147
+ from tokenizers.trainers import BpeTrainer
148
+
149
+ tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
150
+ trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
151
+ ```
152
+
153
+ Split the tokens on [`~tokenizers.pre_tokenizers.Whitespace`] to create tokens that don't overlap with each other.
154
+
155
+ ```py
156
+ from tokenizers.pre_tokenizers import Whitespace
157
+
158
+ tokenizer.pre_tokenizer = Whitespace()
159
+ ```
160
+
161
+ Call [`~tokenizers.Tokenizer.train`] on the text files and trainer to start training.
162
+
163
+ ```py
164
+ files = [...]
165
+ tokenizer.train(files, trainer)
166
+ ```
167
+
168
+ Use [`~tokenizers.Tokenizer.save`] to save the tokenizer's configuration and vocabulary to a JSON file.
169
+
170
+ ```py
171
+ tokenizer.save("tokenizer.json")
172
+ ```
173
+
174
+ Now you can load and reuse the tokenizer object in Transformers by passing it to the `tokenizer_object` parameter in [`PreTrainedTokenizerFast`].
175
+
176
+ ```py
177
+ from transformers import PreTrainedTokenizerFast
178
+
179
+ fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer)
180
+ ```
181
+
182
+ To load a saved tokenizer from its JSON file, pass the file path to the `tokenizer_file` parameter in [`PreTrainedTokenizerFast`].
183
+
184
+ ```py
185
+ from transformers import PreTrainedTokenizerFast
186
+
187
+ fast_tokenizer = PreTrainedTokenizerFast(tokenizer_file="tokenizer.json")
188
+ ```
189
+
190
+ ## tiktoken
191
+
192
+ [tiktoken](https://github.com/openai/tiktoken) is a [byte-pair encoding (BPE)](./tokenizer_summary#byte-pair-encoding-bpe) tokenizer by OpenAI. It includes several tokenization schemes or encodings for how text should be tokenized.
193
+
194
+ There are currently two models trained and released with tiktoken: GPT2 and Llama3. Transformers supports models with a [tokenizer.model](https://hf.co/meta-llama/Meta-Llama-3-8B/blob/main/original/tokenizer.model) tiktoken file. The tiktoken file is automatically converted into Transformers' Rust-based [`PreTrainedTokenizerFast`].
195
+
196
+ Add the `subfolder` parameter to [`~PreTrainedModel.from_pretrained`] to specify where the `tokenizer.model` tiktoken file is located.
197
+
198
+ ```py
199
+ from transformers import AutoTokenizer
200
+
201
+ tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", subfolder="original")
202
+ ```
203
+
204
+ ### Create a tiktoken tokenizer
205
+
206
+ The tiktoken `tokenizer.model` file contains no information about additional tokens or pattern strings. If these are important, convert the tokenizer to `tokenizer.json` (the appropriate format for [`PreTrainedTokenizerFast`]).
207
+
208
+ Generate the tiktoken `tokenizer.model` file with the [tiktoken.get_encoding](https://github.com/openai/tiktoken/blob/63527649963def8c759b0f91f2eb69a40934e468/tiktoken/registry.py#L63) function, and convert it to `tokenizer.json` with [convert_tiktoken_to_fast](https://github.com/huggingface/transformers/blob/99e0ab6ed888136ea4877c6d8ab03690a1478363/src/transformers/integrations/tiktoken.py#L8).
209
+
210
+ ```py
211
+ from transformers.integrations.tiktoken import convert_tiktoken_to_fast
212
+ from tiktoken import get_encoding
213
+
214
+ # Load your custom encoding or the one provided by OpenAI
215
+ encoding = get_encoding("gpt2")
216
+ convert_tiktoken_to_fast(encoding, "config/save/dir")
217
+ ```
218
+
219
+ The resulting `tokenizer.json` file is saved to the specified directory and loaded with [`~PreTrainedTokenizerFast.from_pretrained`].
220
+
221
+ ```py
222
+ tokenizer = PreTrainedTokenizerFast.from_pretrained("config/save/dir")
223
+ ```
224
+
225
+ ## Preprocess
226
+
227
+ <Youtube id="Yffk5aydLzg"/>
228
+
229
+ A Transformers model expects the input to be a PyTorch, TensorFlow, or NumPy tensor. A tokenizer's job is to preprocess text into those tensors. Specify the framework tensor type to return with the `return_tensors` parameter.
230
+
231
+ ```py
232
+ from transformers import AutoTokenizer
233
+
234
+ tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b")
235
+ tokenizer("We are very happy to show you the 🤗 Transformers library.", return_tensors="pt")
236
+ {'input_ids': tensor([[ 2, 1734, 708, 1508, 4915, 577, 1500, 692, 573,
237
+ 156808, 128149, 9581, 235265]]),
238
+ 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])
239
+ }
240
+ ```
241
+
242
+ The tokenization process of converting text into input ids is completed in two steps.
243
+
244
+ <hfoptions id="steps">
245
+ <hfoption id="1. tokenize">
246
+
247
+ In the first step, a string of text is split into tokens by the [`~PreTrainedTokenizer.tokenize`] function. How the text is split depends on the tokenization algorithm.
248
+
249
+ ```py
250
+ tokens = tokenizer.tokenize("We are very happy to show you the 🤗 Transformers library")
251
+ print(tokens)
252
+ ['We', '▁are', '▁very', '▁happy', '▁to', '▁show', '▁you', '▁the', '▁🤗', '▁Transformers', '▁library']
253
+ ```
254
+
255
+ Gemma uses a [SentencePiece](./tokenizer_summary#sentencepiece) tokenizer which replaces spaces with an underscore `_`.
256
+
257
+ </hfoption>
258
+ <hfoption id="2. convert tokens to ids">
259
+
260
+ In the second step, the tokens are converted into ids with [`~PreTrainedTokenizer.convert_tokens_to_ids`].
261
+
262
+ ```py
263
+ ids = tokenizer.convert_tokens_to_ids(tokens)
264
+ print(ids)
265
+ [1734, 708, 1508, 4915, 577, 1500, 692, 573, 156808, 128149, 9581]
266
+ ```
267
+
268
+ </hfoption>
269
+ <hfoption id="3. decode ids to text">
270
+
271
+ Lastly, the model prediction typically generates numerical outputs which are converted back to text with [`~PreTrainedTokenizer.decode`].
272
+
273
+ ```py
274
+ decoded_string = tokenizer.decode(ids)
275
+ print(decoded_string)
276
+ 'We are very happy to show you the 🤗 Transformers library'
277
+ ```
278
+
279
+ </hfoption>
280
+ </hfoptions>
281
+
282
+ > [!TIP]
283
+ > Visualize how different tokenizers work in the [Tokenizer Playground](https://xenova-the-tokenizer-playground.static.hf.space).
284
+
285
+ ### Special tokens
286
+
287
+ Special tokens provide the model with some additional information about the text.
288
+
289
+ For example, if you compare the input ids obtained from passing text directly to the tokenizer with the ids returned by [`~PreTrainedTokenizer.convert_tokens_to_ids`], you'll notice some additional tokens are added.
290
+
291
+ ```py
292
+ model_inputs = tokenizer("We are very happy to show you the 🤗 Transformers library.")
+ model_inputs["input_ids"]
+ [2, 1734, 708, 1508, 4915, 577, 1500, 692, 573, 156808, 128149, 9581]
294
+ tokenizer.convert_tokens_to_ids(tokens)
295
+ [1734, 708, 1508, 4915, 577, 1500, 692, 573, 156808, 128149, 9581]
296
+ ```
297
+
298
+ When you [`~PreTrainedTokenizer.decode`] the ids, you'll see `<bos>` at the beginning of the string. This is used to indicate the beginning of a sentence to the model.
299
+
300
+ ```py
301
+ print(tokenizer.decode(model_inputs["input_ids"]))
302
+ print(tokenizer.decode(ids))
303
+ '<bos>We are very happy to show you the 🤗 Transformers library.'
304
+ 'We are very happy to show you the 🤗 Transformers library'
305
+ ```
306
+
307
+ Not all models need special tokens, but if they do, a tokenizer automatically adds them.
308
+
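+ If you need the ids without special tokens - to compare against another tokenizer, for example - you can disable this behavior with `add_special_tokens=False`. A minimal sketch using the same Gemma tokenizer as above:
+
+ ```py
+ # skip automatically added special tokens such as <bos>
+ model_inputs = tokenizer("We are very happy to show you the 🤗 Transformers library.", add_special_tokens=False)
+ print(model_inputs["input_ids"])  # no <bos> id at the start
+ ```
+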
309
+ ### Batch tokenization
310
+
311
+ It is faster and more efficient to preprocess *batches* of text instead of a single sentence at a time. Fast tokenizers are especially good at parallelizing tokenization.
312
+
313
+ Pass a list of strings to the tokenizer.
314
+
315
+ ```py
316
+ batch_sentences = [
317
+ "But what about second breakfast?",
318
+ "Don't think he knows about second breakfast, Pip.",
319
+ "What about elevensies?",
320
+ ]
321
+ encoded_inputs = tokenizer(batch_sentences, return_tensors="pt")
322
+ print(encoded_inputs)
323
+ {
324
+ 'input_ids':
325
+ [[2, 1860, 1212, 1105, 2257, 14457, 235336],
326
+ [2, 4454, 235303, 235251, 1742, 693, 9242, 1105, 2257, 14457, 235269, 48782, 235265],
327
+ [2, 1841, 1105, 29754, 37453, 235336]],
328
+ 'attention_mask': [[1, 1, 1, 1, 1, 1, 1],
329
+ [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
330
+ [1, 1, 1, 1, 1, 1]]
331
+ }
332
+ ```
333
+
334
+ ### Padding
335
+
336
+ > [!TIP]
337
+ > Learn about additional padding strategies in the [Padding and truncation](./pad_truncation) guide.
338
+
339
+ In the output above, the `input_ids` have different lengths. This is an issue because Transformers expects all sequences in a batch to have the same length so they can be packed into a single tensor; sequences with uneven lengths can't be batched.
340
+
341
+ Padding adds a special *padding token* to ensure all sequences have the same length. Set `padding=True` to pad the sequences to the longest sequence length in the batch.
342
+
343
+ ```py
344
+ encoded_inputs = tokenizer(batch_sentences, padding=True, return_tensors="pt")
345
+ print(encoded_inputs)
346
+ ```
347
+
348
+ The tokenizer added the special padding token `0` to the left side (*left padding*) because Gemma, and decoder-only LLMs in general, are not trained to continue generating from a padding token. Left padding keeps the real tokens at the end of the sequence, where generation continues.
349
+
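+ The padding side is controlled by the tokenizer's `padding_side` attribute. A minimal sketch for inspecting and temporarily changing it (left padding is what you usually want for generation with decoder-only models):
+
+ ```py
+ print(tokenizer.padding_side)  # "left", matching the output above
+ tokenizer.padding_side = "right"  # pad on the right instead
+ encoded_inputs = tokenizer(batch_sentences, padding=True, return_tensors="pt")
+ tokenizer.padding_side = "left"  # restore the default for this model
+ ```
+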
350
+ ### Truncation
351
+
352
+ > [!TIP]
353
+ > Learn about additional truncation strategies in the [Padding and truncation](./pad_truncation) guide.
354
+
355
+ Models are only able to process sequences up to a certain length. If you try to process a sequence longer than a model can handle, the model raises an error.
356
+
357
+ Truncation removes tokens from a sequence to ensure it doesn't exceed the maximum length. Set `truncation=True` to truncate a sequence to the maximum length accepted by the model. You can also set the maximum length yourself with the `max_length` parameter.
358
+
359
+ ```py
360
+ encoded_inputs = tokenizer(batch_sentences, max_length=8, truncation=True, return_tensors="pt")
361
+ print(encoded_inputs)
362
+ ```
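+
+ In practice, padding and truncation are often combined so every batch has a fixed shape. A minimal sketch using the same batch of sentences as above:
+
+ ```py
+ # pad shorter sequences and truncate longer ones to exactly max_length
+ encoded_inputs = tokenizer(
+     batch_sentences,
+     padding="max_length",
+     truncation=True,
+     max_length=8,
+     return_tensors="pt",
+ )
+ ```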
transformers/docs/source/en/feature_extractors.md ADDED
@@ -0,0 +1,200 @@
1
+ <!--Copyright 2024 The HuggingFace Team. All rights reserved.
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
4
+ the License. You may obtain a copy of the License at
5
+
6
+ http://www.apache.org/licenses/LICENSE-2.0
7
+
8
+ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
9
+ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
10
+ specific language governing permissions and limitations under the License.
11
+
12
+ ⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
13
+ rendered properly in your Markdown viewer.
14
+
15
+ -->
16
+
17
+ # Feature extractors
18
+
19
+ A feature extractor preprocesses audio data into the correct format for a given model. It takes the raw audio signal and converts it into a tensor that can be fed to the model. The tensor shape depends on the model, but the feature extractor correctly preprocesses the audio data for the model you're using. Feature extractors also include methods for padding, truncation, and resampling.
20
+
21
+ Call [`~AutoFeatureExtractor.from_pretrained`] to load a feature extractor and its preprocessor configuration from the Hugging Face [Hub](https://hf.co/models) or local directory. The feature extractor and preprocessor configuration is saved in a [preprocessor_config.json](https://hf.co/openai/whisper-tiny/blob/main/preprocessor_config.json) file.
22
+
23
+ Pass the audio signal, typically stored in `array`, to the feature extractor and set the `sampling_rate` parameter to the pretrained audio model's sampling rate. It is important that the sampling rate of your audio data matches the sampling rate of the data the model was pretrained on.
24
+
25
+ ```py
26
+ from datasets import load_dataset
+ from transformers import AutoFeatureExtractor
27
+
28
+ feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")
29
+ dataset = load_dataset("PolyAI/minds14", name="en-US", split="train")
30
+ processed_sample = feature_extractor(dataset[0]["audio"]["array"], sampling_rate=16000)
31
+ processed_sample
32
+ {'input_values': [array([ 9.4472744e-05, 3.0777880e-03, -2.8888427e-03, ...,
33
+ -2.8888427e-03, 9.4472744e-05, 9.4472744e-05], dtype=float32)]}
34
+ ```
35
+
36
+ The feature extractor returns an input, `input_values`, that is ready for the model to consume.
37
+
38
+ This guide walks you through the feature extractor classes and how to preprocess audio data.
39
+
40
+ ## Feature extractor classes
41
+
42
+ Transformers feature extractors inherit from the base [`SequenceFeatureExtractor`] class which subclasses [`FeatureExtractionMixin`].
43
+
44
+ - [`SequenceFeatureExtractor`] provides a method to [`~SequenceFeatureExtractor.pad`] sequences to a certain length to avoid uneven sequence lengths.
45
+ - [`FeatureExtractionMixin`] provides [`~FeatureExtractionMixin.from_pretrained`] and [`~FeatureExtractionMixin.save_pretrained`] to load and save a feature extractor, as shown in the sketch below.
46
+
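+ Because these methods come from [`FeatureExtractionMixin`], any loaded feature extractor can be saved locally and reloaded later. A minimal sketch (the directory name is just an example):
+
+ ```py
+ from transformers import AutoFeatureExtractor
+
+ feature_extractor = AutoFeatureExtractor.from_pretrained("openai/whisper-tiny")
+ feature_extractor.save_pretrained("./my-feature-extractor")  # writes preprocessor_config.json
+ feature_extractor = AutoFeatureExtractor.from_pretrained("./my-feature-extractor")
+ ```
+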
47
+ There are two ways you can load a feature extractor, [`AutoFeatureExtractor`] and a model-specific feature extractor class.
48
+
49
+ <hfoptions id="feature-extractor-classes">
50
+ <hfoption id="AutoFeatureExtractor">
51
+
52
+ The [AutoClass](./model_doc/auto) API automatically loads the correct feature extractor for a given model.
53
+
54
+ Use [`~AutoFeatureExtractor.from_pretrained`] to load a feature extractor.
55
+
56
+ ```py
57
+ from transformers import AutoFeatureExtractor
58
+
59
+ feature_extractor = AutoFeatureExtractor.from_pretrained("openai/whisper-tiny")
60
+ ```
61
+
62
+ </hfoption>
63
+ <hfoption id="model-specific feature extractor">
64
+
65
+ Every pretrained audio model has a specific associated feature extractor for correctly processing audio data. When you load a feature extractor, it retrieves the feature extractor's configuration (feature size, chunk length, etc.) from [preprocessor_config.json](https://hf.co/openai/whisper-tiny/blob/main/preprocessor_config.json).
66
+
67
+ A feature extractor can be loaded directly from its model-specific class.
68
+
69
+ ```py
70
+ from transformers import WhisperFeatureExtractor
71
+
72
+ feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-tiny")
73
+ ```
74
+
75
+ </hfoption>
76
+ </hfoptions>
77
+
78
+ ## Preprocess
79
+
80
+ Each audio model expects its input as a tensor of a certain shape. The exact input shape varies depending on the specific audio model you're using.
81
+
82
+ For example, [Whisper](https://huggingface.co/docs/transformers/model_doc/whisper) expects `input_features` to be a tensor of shape `(batch_size, feature_size, sequence_length)` but [Wav2Vec2](https://hf.co/docs/transformers/model_doc/wav2vec2) expects `input_values` to be a tensor of shape `(batch_size, sequence_length)`.
83
+
84
+ The feature extractor generates the correct input shape for whichever audio model you're using.
85
+
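+ You can see this difference by running the same dummy audio through both feature extractors. This is a minimal sketch; the exact shapes depend on each checkpoint's configuration.
+
+ ```py
+ import numpy as np
+ from transformers import AutoFeatureExtractor
+
+ audio = np.zeros(16000, dtype=np.float32)  # one second of silence at 16kHz
+
+ whisper_extractor = AutoFeatureExtractor.from_pretrained("openai/whisper-tiny")
+ wav2vec2_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")
+
+ # Whisper returns log-mel `input_features`, Wav2Vec2 returns raw `input_values`
+ print(whisper_extractor(audio, sampling_rate=16000, return_tensors="pt")["input_features"].shape)
+ print(wav2vec2_extractor(audio, sampling_rate=16000, return_tensors="pt")["input_values"].shape)
+ ```
+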
86
+ A feature extractor also defines the sampling rate (the number of audio signal values sampled per second) it expects. The sampling rate of your audio data must match the sampling rate of the dataset a pretrained model was trained on. This value is typically given in the model card.
87
+
88
+ Load a dataset and feature extractor with [`~FeatureExtractionMixin.from_pretrained`].
89
+
90
+ ```py
91
+ from datasets import load_dataset, Audio
92
+ from transformers import AutoFeatureExtractor
93
+
94
+ dataset = load_dataset("PolyAI/minds14", name="en-US", split="train")
95
+ feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")
96
+ ```
97
+
98
+ Check out the first example from the dataset and access the `audio` column which contains `array`, the raw audio signal.
99
+
100
+ ```py
101
+ dataset[0]["audio"]["array"]
102
+ array([ 0. , 0.00024414, -0.00024414, ..., -0.00024414,
103
+ 0. , 0. ])
104
+ ```
105
+
106
+ The feature extractor preprocesses `array` into the expected input format for a given audio model. Use the `sampling_rate` parameter to set the appropriate sampling rate.
107
+
108
+ ```py
109
+ processed_dataset = feature_extractor(dataset[0]["audio"]["array"], sampling_rate=16000)
110
+ processed_dataset
111
+ {'input_values': [array([ 9.4472744e-05, 3.0777880e-03, -2.8888427e-03, ...,
112
+ -2.8888427e-03, 9.4472744e-05, 9.4472744e-05], dtype=float32)]}
113
+ ```
114
+
115
+ ### Padding
116
+
117
+ Audio sequences of different lengths are an issue because Transformers expects all sequences in a batch to have the same length; uneven sequence lengths can't be batched.
118
+
119
+ ```py
120
+ dataset[0]["audio"]["array"].shape
121
+ (86699,)
122
+
123
+ dataset[1]["audio"]["array"].shape
124
+ (53248,)
125
+ ```
126
+
127
+ Padding adds a special *padding value* to ensure all sequences have the same length. The feature extractor adds a `0` - interpreted as silence - to `array` to pad it. Set `padding=True` to pad sequences to the longest sequence length in the batch.
128
+
129
+ ```py
130
+ def preprocess_function(examples):
131
+ audio_arrays = [x["array"] for x in examples["audio"]]
132
+ inputs = feature_extractor(
133
+ audio_arrays,
134
+ sampling_rate=16000,
135
+ padding=True,
136
+ )
137
+ return inputs
138
+
139
+ processed_dataset = preprocess_function(dataset[:5])
140
+ processed_dataset["input_values"][0].shape
141
+ (86699,)
142
+
143
+ processed_dataset["input_values"][1].shape
144
+ (86699,)
145
+ ```
146
+
147
+ ### Truncation
148
+
149
+ Models can only process sequences up to a certain length.
+
+ Truncation is a strategy for removing excess values from the end of a sequence to ensure it doesn't exceed the maximum length. Set `truncation=True` to truncate a sequence to the length set in the `max_length` parameter.
152
+
153
+ ```py
154
+ def preprocess_function(examples):
155
+ audio_arrays = [x["array"] for x in examples["audio"]]
156
+ inputs = feature_extractor(
157
+ audio_arrays,
158
+ sampling_rate=16000,
159
+ max_length=50000,
160
+ truncation=True,
161
+ )
162
+ return inputs
163
+
164
+ processed_dataset = preprocess_function(dataset[:5])
165
+ processed_dataset["input_values"][0].shape
166
+ (50000,)
167
+
168
+ processed_dataset["input_values"][1].shape
169
+ (50000,)
170
+ ```
171
+
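+ The examples above call `preprocess_function` on a small slice directly. To preprocess a full dataset, you would typically apply the same function with [`~datasets.Dataset.map`]. A minimal sketch:
+
+ ```py
+ # apply the preprocessing function to the whole dataset in batches
+ processed_dataset = dataset.map(preprocess_function, batched=True)
+ ```
+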
172
+ ### Resampling
173
+
174
+ The [Datasets](https://hf.co/docs/datasets/index) library can also resample audio data to match an audio model's expected sampling rate. This method resamples the audio on the fly when the data is loaded, which can be faster than resampling the entire dataset in place.
175
+
176
+ The audio dataset you've been working on has a sampling rate of 8kHz and the pretrained model expects 16kHz.
177
+
178
+ ```py
179
+ dataset[0]["audio"]
180
+ {'path': '/root/.cache/huggingface/datasets/downloads/extracted/f507fdca7f475d961f5bb7093bcc9d544f16f8cab8608e772a2ed4fbeb4d6f50/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav',
181
+ 'array': array([ 0. , 0.00024414, -0.00024414, ..., -0.00024414,
182
+ 0. , 0. ]),
183
+ 'sampling_rate': 8000}
184
+ ```
185
+
186
+ Call [`~datasets.Dataset.cast_column`] on the `audio` column to upsample the sampling rate to 16kHz.
187
+
188
+ ```py
189
+ dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
190
+ ```
191
+
192
+ When you load the dataset sample, it is now resampled to 16kHz.
193
+
194
+ ```py
195
+ dataset[0]["audio"]
196
+ {'path': '/root/.cache/huggingface/datasets/downloads/extracted/f507fdca7f475d961f5bb7093bcc9d544f16f8cab8608e772a2ed4fbeb4d6f50/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav',
197
+ 'array': array([ 1.70562416e-05, 2.18727451e-04, 2.28099874e-04, ...,
198
+ 3.43842403e-05, -5.96364771e-06, -1.76846661e-05]),
199
+ 'sampling_rate': 16000}
200
+ ```
transformers/docs/source/en/fsdp.md ADDED
@@ -0,0 +1,145 @@
1
+ <!--Copyright 2024 The HuggingFace Team. All rights reserved.
2
+
3
+ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
4
+ the License. You may obtain a copy of the License at
5
+
6
+ http://www.apache.org/licenses/LICENSE-2.0
7
+
8
+ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
9
+ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
10
+ specific language governing permissions and limitations under the License.
11
+
12
+ ⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
13
+ rendered properly in your Markdown viewer.
14
+
15
+ -->
16
+
17
+ # FullyShardedDataParallel
18
+
19
+ [Fully Sharded Data Parallel (FSDP)](https://pytorch.org/blog/introducing-pytorch-fully-sharded-data-parallel-api/) is a [parallelism](./perf_train_gpu_many) method that combines the advantages of data and model parallelism for distributed training.
20
+
21
+ Unlike [DistributedDataParallel (DDP)](./perf_train_gpu_many#distributeddataparallel), FSDP saves more memory because it doesn't replicate a model on each GPU. It shards the model's parameters, gradients, and optimizer states across GPUs. Each model shard processes a portion of the data and the results are synchronized to speed up training.
22
+
23
+ This guide covers how to set up model training with FSDP and [Accelerate](https://hf.co/docs/accelerate/index), a library for managing distributed training.
24
+
25
+ ```bash
26
+ pip install accelerate
27
+ ```
28
+
29
+ ## Configuration options
30
+
31
+ Always start by running the [accelerate config](https://hf.co/docs/accelerate/package_reference/cli#accelerate-config) command to help Accelerate set up the correct distributed training environment.
32
+
33
+ ```bash
34
+ accelerate config
35
+ ```
36
+
37
+ The section below discusses some of the more important FSDP configuration options. Learn more about other available options in the [fsdp_config](https://hf.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments.fsdp_config) parameter.
38
+
39
+ ### Sharding strategy
40
+
41
+ FSDP offers several sharding strategies to distribute a model. Refer to the table below to help you choose the best strategy for your setup. Specify a strategy with the `fsdp_sharding_strategy` parameter in the configuration file.
42
+
43
+ | sharding strategy | description | parameter value |
44
+ |---|---|---|
45
+ | `FULL_SHARD` | shards model parameters, gradients, and optimizer states | `1` |
46
+ | `SHARD_GRAD_OP` | shards gradients and optimizer states | `2` |
47
+ | `NO_SHARD` | don't shard the model | `3` |
48
+ | `HYBRID_SHARD` | shards model parameters, gradients, and optimizer states within each node and replicates the model across nodes | `4` |
+ | `HYBRID_SHARD_ZERO2` | shards gradients and optimizer states within each node and replicates the model across nodes | `5` |
50
+
51
+ ### CPU offload
52
+
53
+ Offload model parameters and gradients to the CPU when they aren't being used to save additional GPU memory. This is useful for scenarios where a model is too large even with FSDP.
54
+
55
+ Specify `fsdp_offload_params: true` in the configuration file to enable offloading.
56
+
57
+ ### Wrapping policy
58
+
59
+ FSDP is applied by wrapping each layer in the network. The wrapping is usually applied in a nested way where the full weights are discarded after each forward pass to save memory for the next layer.
60
+
61
+ There are several wrapping policies available, but the *auto wrapping* policy is the simplest and doesn't require any changes to your code. Specify `fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP` to wrap a Transformer layer and `fsdp_transformer_layer_cls_to_wrap` to determine which layer to wrap (for example, `BertLayer`).
62
+
63
+ Size-based wrapping is also available. If a layer exceeds a certain number of parameters, it is wrapped. Specify `fsdp_wrap_policy: SIZE_BASED_WRAP` and `min_num_param` to set the minimum number of parameters for a layer to be wrapped.
64
+
65
+ ### Checkpoints
66
+
67
+ Intermediate checkpoints should be saved as a sharded state dict because saving the full state dict - even with CPU offloading - is time consuming and can cause `NCCL Timeout` errors due to indefinite hanging during broadcasting.
68
+
69
+ Specify `fsdp_state_dict_type: SHARDED_STATE_DICT` in the configuration file to save the sharded state dict. Now you can resume training from the sharded state dict with [`~accelerate.Accelerator.load_state`].
70
+
71
+ ```py
72
+ accelerator.load_state("directory/containing/checkpoints")
73
+ ```
74
+
75
+ Once training is complete though, you should save the full state dict because the sharded state dict is only compatible with FSDP.
76
+
77
+ ```py
78
+ if trainer.is_fsdp_enabled:
79
+ trainer.accelerator.state.fsdp_plugin.set_state_dict_type("FULL_STATE_DICT")
80
+
81
+ trainer.save_model(script_args.output_dir)
82
+ ```
83
+
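+ Once the full state dict is saved, the output directory is a regular Transformers checkpoint and can be reloaded without FSDP. A minimal sketch (the path and Auto class are placeholders; use whatever you passed as `output_dir` and the class you trained with):
+
+ ```py
+ from transformers import AutoModel
+
+ # load the consolidated checkpoint saved by trainer.save_model
+ model = AutoModel.from_pretrained("path/to/output_dir")
+ ```
+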
84
+ ### TPU
85
+
86
+ [PyTorch XLA](https://pytorch.org/xla/release/2.1/index.html), a package for running PyTorch on XLA devices, enables FSDP on TPUs. Modify the configuration file to include the parameters below. Refer to the [xla_fsdp_settings](https://github.com/pytorch/xla/blob/2e6e183e0724818f137c8135b34ef273dea33318/torch_xla/distributed/fsdp/xla_fully_sharded_data_parallel.py#L128) parameter for additional XLA-specific parameters you can configure for FSDP.
87
+
88
+ ```yaml
89
+ xla: True # must be set to True to enable PyTorch/XLA
90
+ xla_fsdp_settings: # XLA specific FSDP parameters
91
+ xla_fsdp_grad_ckpt: True # enable gradient checkpointing
92
+ ```
93
+
94
+ ## Training
95
+
96
+ After running [accelerate config](https://hf.co/docs/accelerate/package_reference/cli#accelerate-config), your configuration file should be ready. The example configuration file below fully shards the parameters, gradients, and optimizer states across two GPUs. Your file may look different depending on how you set up your configuration.
97
+
98
+ ```yaml
99
+ compute_environment: LOCAL_MACHINE
100
+ debug: false
101
+ distributed_type: FSDP
102
+ downcast_bf16: 'no'
103
+ fsdp_config:
104
+ fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
105
+ fsdp_backward_prefetch_policy: BACKWARD_PRE
106
+ fsdp_cpu_ram_efficient_loading: true
107
+ fsdp_forward_prefetch: false
108
+ fsdp_offload_params: true
109
+ fsdp_sharding_strategy: 1
110
+ fsdp_state_dict_type: SHARDED_STATE_DICT
111
+ fsdp_sync_module_states: true
112
+ fsdp_transformer_layer_cls_to_wrap: BertLayer
113
+ fsdp_use_orig_params: true
114
+ machine_rank: 0
115
+ main_training_function: main
116
+ mixed_precision: bf16
117
+ num_machines: 1
118
+ num_processes: 2
119
+ rdzv_backend: static
120
+ same_network: true
121
+ tpu_env: []
122
+ tpu_use_cluster: false
123
+ tpu_use_sudo: false
124
+ use_cpu: false
125
+ ```
126
+
127
+ Run the [accelerate launch](https://hf.co/docs/accelerate/package_reference/cli#accelerate-launch) command to launch a training script with the FSDP configurations you chose in the configuration file.
128
+
129
+ ```bash
130
+ accelerate launch my-training-script.py
131
+ ```
132
+
133
+ It is also possible to directly specify some of the FSDP arguments in the command line.
134
+
135
+ ```bash
136
+ accelerate launch --fsdp="full shard" --fsdp_config="path/to/fsdp_config/" my-training-script.py
137
+ ```
138
+
139
+ ## Resources
140
+
141
+ FSDP is a powerful tool for training large models with fewer GPUs compared to other parallelism strategies. Refer to the resources below to learn more about FSDP.
142
+
143
+ - Follow along with the more in-depth Accelerate guide for [FSDP](https://hf.co/docs/accelerate/usage_guides/fsdp).
144
+ - Read the [Introducing PyTorch Fully Sharded Data Parallel (FSDP) API](https://pytorch.org/blog/introducing-pytorch-fully-sharded-data-parallel-api/) blog post.
145
+ - Read the [Scaling PyTorch models on Cloud TPUs with FSDP](https://pytorch.org/blog/scaling-pytorch-models-on-cloud-tpus-with-fsdp/) blog post.