| - sections: |
| - local: index |
| title: Transformers |
| - local: installation |
| title: Installation |
| - local: quicktour |
| title: Quickstart |
| title: Get started |
| - isExpanded: false |
| sections: |
| - sections: |
| - local: models |
| title: Loading models |
| - local: custom_models |
| title: Customizing models |
| - local: how_to_hack_models |
| title: Customizing model components |
| - local: model_sharing |
| title: Sharing |
| - local: add_new_model |
| title: Adding a new model to Transformers |
| - local: modular_transformers |
| title: Modular Transformers |
| - local: auto_docstring |
| title: Document your models |
| - local: task_summary |
| title: What 🤗 Transformers can do |
| - local: tasks_explained |
| title: How 🤗 Transformers solve tasks |
| - local: model_summary |
| title: The Transformer model family |
| - local: attention |
| title: Attention mechanisms |
| - local: attention_interface |
| title: Customizing attention function |
| title: Models |
| - sections: |
| - local: fast_tokenizers |
| title: Tokenizers |
| - local: image_processors |
| title: Image processors |
| - local: video_processors |
| title: Video processors |
| - local: backbones |
| title: Backbones |
| - local: feature_extractors |
| title: Feature extractors |
| - local: processors |
| title: Processors |
| - local: tokenizer_summary |
| title: Summary of the tokenizers |
| - local: pad_truncation |
| title: Padding and truncation |
| title: Preprocessors |
| title: Base classes |
| - isExpanded: false |
| sections: |
| - sections: |
| - local: pipeline_tutorial |
| title: Pipeline |
| - local: pipeline_gradio |
| title: Machine learning apps |
| - local: pipeline_webserver |
| title: Web server inference |
| - local: add_new_pipeline |
| title: Adding a new pipeline |
| title: Pipeline API |
| - sections: |
| - local: llm_tutorial |
| title: Text generation |
| - local: generation_strategies |
| title: Generation strategies |
| - local: generation_features |
| title: Generation features |
| - local: tasks/prompting |
| title: Prompt engineering |
| - local: llm_optims |
| title: Optimizing inference |
| - local: kv_cache |
| title: KV cache strategies |
| - local: serving |
| title: Serving |
| - local: cache_explanation |
| title: Caching |
| - local: llm_tutorial_optimization |
| title: Getting the most out of LLMs |
| - local: perplexity |
| title: Perplexity of fixed-length models |
| title: LLMs |
| - sections: |
| - local: conversations |
| title: Chat basics |
| - local: chat_templating |
| title: Templates |
| - local: chat_templating_multimodal |
| title: Multimodal templates |
| - local: chat_templating_writing |
| title: Template writing |
| - local: chat_extras |
| title: Tools and RAG |
| title: Chat with models |
| - sections: |
| - local: perf_torch_compile |
| title: torch.compile |
| - local: perf_infer_gpu_one |
| title: GPU |
| - local: perf_infer_gpu_multi |
| title: Distributed GPU inference |
| - local: perf_infer_cpu |
| title: CPU |
| - local: tf_xla |
| title: XLA |
| title: Optimization |
| - local: agents |
| title: Agents |
| - local: tools |
| title: Tools |
| title: Inference |
| - isExpanded: false |
| sections: |
| - sections: |
| - local: trainer |
| title: Trainer |
| - local: training |
| title: Fine-tuning |
| - local: optimizers |
| title: Optimizers |
| - local: hpo_train |
| title: Hyperparameter search |
| title: Trainer API |
| - sections: |
| - local: gpu_selection |
| title: GPU selection |
| - local: accelerate |
| title: Accelerate |
| - local: fsdp |
| title: FullyShardedDataParallel |
| - local: deepspeed |
| title: DeepSpeed |
| - local: debugging |
| title: Multi-GPU debugging |
| - local: perf_train_cpu_many |
| title: Distributed CPUs |
| - local: perf_train_gpu_many |
| title: Parallelism methods |
| title: Distributed training |
| - sections: |
| - local: perf_train_gpu_one |
| title: GPU |
| - local: perf_train_cpu |
| title: CPU |
| - local: perf_train_tpu_tf |
| title: TPU |
| - local: perf_train_special |
| title: Apple Silicon |
| - local: perf_train_gaudi |
| title: Intel Gaudi |
| - local: perf_hardware |
| title: Build your own machine |
| title: Hardware |
| - local: peft |
| title: PEFT |
| - local: model_memory_anatomy |
| title: Model training anatomy |
| title: Training |
| - isExpanded: false |
| sections: |
| - local: quantization/overview |
| title: Overview |
| - local: quantization/selecting |
| title: Selecting a quantization method |
| - local: quantization/concept_guide |
| title: Quantization concepts |
| - local: quantization/aqlm |
| title: AQLM |
| - local: quantization/auto_round |
| title: AutoRound |
| - local: quantization/awq |
| title: AWQ |
| - local: quantization/bitnet |
| title: BitNet |
| - local: quantization/bitsandbytes |
| title: bitsandbytes |
| - local: quantization/compressed_tensors |
| title: compressed-tensors |
| - local: quantization/eetq |
| title: EETQ |
| - local: quantization/fbgemm_fp8 |
| title: FBGEMM |
| - local: quantization/finegrained_fp8 |
| title: Fine-grained FP8 |
| - local: gguf |
| title: GGUF |
| - local: quantization/gptq |
| title: GPTQ |
| - local: quantization/higgs |
| title: HIGGS |
| - local: quantization/hqq |
| title: HQQ |
| - local: quantization/optimum |
| title: Optimum |
| - local: quantization/quanto |
| title: Quanto |
| - local: quantization/quark |
| title: Quark |
| - local: quantization/torchao |
| title: torchao |
| - local: quantization/spqr |
| title: SpQR |
| - local: quantization/vptq |
| title: VPTQ |
| - local: quantization/contribute |
| title: Contribute |
| title: Quantization |
| - isExpanded: false |
| sections: |
| - local: serialization |
| title: ONNX |
| - local: tflite |
| title: LiteRT |
| - local: executorch |
| title: ExecuTorch |
| - local: torchscript |
| title: TorchScript |
| title: Export to production |
| - isExpanded: false |
| sections: |
| - sections: |
| - sections: |
| - local: tasks/sequence_classification |
| title: Text classification |
| - local: tasks/token_classification |
| title: Token classification |
| - local: tasks/question_answering |
| title: Question answering |
| - local: tasks/language_modeling |
| title: Causal language modeling |
| - local: tasks/masked_language_modeling |
| title: Masked language modeling |
| - local: tasks/translation |
| title: Translation |
| - local: tasks/summarization |
| title: Summarization |
| - local: tasks/multiple_choice |
| title: Multiple choice |
| title: Natural language processing |
| - sections: |
| - local: tasks/audio_classification |
| title: Audio classification |
| - local: tasks/asr |
| title: Automatic speech recognition |
| title: Audio |
| - sections: |
| - local: tasks/image_classification |
| title: Image classification |
| - local: tasks/semantic_segmentation |
| title: Image segmentation |
| - local: tasks/video_classification |
| title: Video classification |
| - local: tasks/object_detection |
| title: Object detection |
| - local: tasks/zero_shot_object_detection |
| title: Zero-shot object detection |
| - local: tasks/zero_shot_image_classification |
| title: Zero-shot image classification |
| - local: tasks/monocular_depth_estimation |
| title: Depth estimation |
| - local: tasks/image_to_image |
| title: Image-to-image |
| - local: tasks/image_feature_extraction |
| title: Image feature extraction |
| - local: tasks/mask_generation |
| title: Mask generation |
| - local: tasks/keypoint_detection |
| title: Keypoint detection |
| - local: tasks/knowledge_distillation_for_image_classification |
| title: Knowledge Distillation for Computer Vision |
| title: Computer vision |
| - sections: |
| - local: tasks/image_captioning |
| title: Image captioning |
| - local: tasks/document_question_answering |
| title: Document Question Answering |
| - local: tasks/visual_question_answering |
| title: Visual Question Answering |
| - local: tasks/text-to-speech |
| title: Text to speech |
| - local: tasks/idefics |
| title: Image tasks with IDEFICS |
| - local: tasks/image_text_to_text |
| title: Image-text-to-text |
| - local: tasks/video_text_to_text |
| title: Video-text-to-text |
| - local: tasks/visual_document_retrieval |
| title: Visual Document Retrieval |
| title: Multimodal |
| title: Task recipes |
| - local: run_scripts |
| title: Training scripts |
| - local: glossary |
| title: Glossary |
| - local: philosophy |
| title: Philosophy |
| - local: notebooks |
| title: Notebooks with examples |
| - local: community |
| title: Community resources |
| - local: troubleshooting |
| title: Troubleshoot |
| title: Resources |
| - isExpanded: false |
| sections: |
| - local: contributing |
| title: Contribute to Transformers |
| - local: testing |
| title: Transformers model tests |
| - local: pr_checks |
| title: Pull request checks |
| title: Contribute |
| - isExpanded: false |
| sections: |
| - sections: |
| - local: model_doc/auto |
| title: Auto Classes |
| - local: main_classes/backbones |
| title: Backbones |
| - local: main_classes/callback |
| title: Callbacks |
| - local: main_classes/configuration |
| title: Configuration |
| - local: main_classes/data_collator |
| title: Data Collator |
| - local: main_classes/keras_callbacks |
| title: Keras callbacks |
| - local: main_classes/logging |
| title: Logging |
| - local: main_classes/model |
| title: Models |
| - local: main_classes/text_generation |
| title: Text Generation |
| - local: main_classes/onnx |
| title: ONNX |
| - local: main_classes/optimizer_schedules |
| title: Optimization |
| - local: main_classes/output |
| title: Model outputs |
| - local: main_classes/peft |
| title: PEFT |
| - local: main_classes/pipelines |
| title: Pipelines |
| - local: main_classes/processors |
| title: Processors |
| - local: main_classes/quantization |
| title: Quantization |
| - local: main_classes/tokenizer |
| title: Tokenizer |
| - local: main_classes/trainer |
| title: Trainer |
| - local: main_classes/deepspeed |
| title: DeepSpeed |
| - local: main_classes/executorch |
| title: ExecuTorch |
| - local: main_classes/feature_extractor |
| title: Feature Extractor |
| - local: main_classes/image_processor |
| title: Image Processor |
| - local: main_classes/video_processor |
| title: Video Processor |
| title: Main Classes |
| - sections: |
| - sections: |
| - local: model_doc/albert |
| title: ALBERT |
| - local: model_doc/bamba |
| title: Bamba |
| - local: model_doc/bart |
| title: BART |
| - local: model_doc/barthez |
| title: BARThez |
| - local: model_doc/bartpho |
| title: BARTpho |
| - local: model_doc/bert |
| title: BERT |
| - local: model_doc/bert-generation |
| title: BertGeneration |
| - local: model_doc/bert-japanese |
| title: BertJapanese |
| - local: model_doc/bertweet |
| title: Bertweet |
| - local: model_doc/big_bird |
| title: BigBird |
| - local: model_doc/bigbird_pegasus |
| title: BigBirdPegasus |
| - local: model_doc/biogpt |
| title: BioGpt |
| - local: model_doc/bitnet |
| title: BitNet |
| - local: model_doc/blenderbot |
| title: Blenderbot |
| - local: model_doc/blenderbot-small |
| title: Blenderbot Small |
| - local: model_doc/bloom |
| title: BLOOM |
| - local: model_doc/bort |
| title: BORT |
| - local: model_doc/byt5 |
| title: ByT5 |
| - local: model_doc/camembert |
| title: CamemBERT |
| - local: model_doc/canine |
| title: CANINE |
| - local: model_doc/codegen |
| title: CodeGen |
| - local: model_doc/code_llama |
| title: CodeLlama |
| - local: model_doc/cohere |
| title: Cohere |
| - local: model_doc/cohere2 |
| title: Cohere2 |
| - local: model_doc/convbert |
| title: ConvBERT |
| - local: model_doc/cpm |
| title: CPM |
| - local: model_doc/cpmant |
| title: CPMANT |
| - local: model_doc/ctrl |
| title: CTRL |
| - local: model_doc/dbrx |
| title: DBRX |
| - local: model_doc/deberta |
| title: DeBERTa |
| - local: model_doc/deberta-v2 |
| title: DeBERTa-v2 |
| - local: model_doc/deepseek_v3 |
| title: DeepSeek-V3 |
| - local: model_doc/dialogpt |
| title: DialoGPT |
| - local: model_doc/diffllama |
| title: DiffLlama |
| - local: model_doc/distilbert |
| title: DistilBERT |
| - local: model_doc/dpr |
| title: DPR |
| - local: model_doc/electra |
| title: ELECTRA |
| - local: model_doc/encoder-decoder |
| title: Encoder Decoder Models |
| - local: model_doc/ernie |
| title: ERNIE |
| - local: model_doc/ernie_m |
| title: ErnieM |
| - local: model_doc/esm |
| title: ESM |
| - local: model_doc/falcon |
| title: Falcon |
| - local: model_doc/falcon3 |
| title: Falcon3 |
| - local: model_doc/falcon_mamba |
| title: FalconMamba |
| - local: model_doc/flan-t5 |
| title: FLAN-T5 |
| - local: model_doc/flan-ul2 |
| title: FLAN-UL2 |
| - local: model_doc/flaubert |
| title: FlauBERT |
| - local: model_doc/fnet |
| title: FNet |
| - local: model_doc/fsmt |
| title: FSMT |
| - local: model_doc/funnel |
| title: Funnel Transformer |
| - local: model_doc/fuyu |
| title: Fuyu |
| - local: model_doc/gemma |
| title: Gemma |
| - local: model_doc/gemma2 |
| title: Gemma2 |
| - local: model_doc/glm |
| title: GLM |
| - local: model_doc/glm4 |
| title: glm4 |
| - local: model_doc/openai-gpt |
| title: GPT |
| - local: model_doc/gpt_neo |
| title: GPT Neo |
| - local: model_doc/gpt_neox |
| title: GPT NeoX |
| - local: model_doc/gpt_neox_japanese |
| title: GPT NeoX Japanese |
| - local: model_doc/gptj |
| title: GPT-J |
| - local: model_doc/gpt2 |
| title: GPT2 |
| - local: model_doc/gpt_bigcode |
| title: GPTBigCode |
| - local: model_doc/gptsan-japanese |
| title: GPTSAN Japanese |
| - local: model_doc/gpt-sw3 |
| title: GPTSw3 |
| - local: model_doc/granite |
| title: Granite |
| - local: model_doc/granitemoe |
| title: GraniteMoe |
| - local: model_doc/granitemoehybrid |
| title: GraniteMoeHybrid |
| - local: model_doc/granitemoeshared |
| title: GraniteMoeShared |
| - local: model_doc/helium |
| title: Helium |
| - local: model_doc/herbert |
| title: HerBERT |
| - local: model_doc/hgnet_v2 |
| title: HGNet-V2 |
| - local: model_doc/ibert |
| title: I-BERT |
| - local: model_doc/jamba |
| title: Jamba |
| - local: model_doc/jetmoe |
| title: JetMoe |
| - local: model_doc/jukebox |
| title: Jukebox |
| - local: model_doc/led |
| title: LED |
| - local: model_doc/llama |
| title: LLaMA |
| - local: model_doc/llama2 |
| title: Llama2 |
| - local: model_doc/llama3 |
| title: Llama3 |
| - local: model_doc/longformer |
| title: Longformer |
| - local: model_doc/longt5 |
| title: LongT5 |
| - local: model_doc/luke |
| title: LUKE |
| - local: model_doc/m2m_100 |
| title: M2M100 |
| - local: model_doc/madlad-400 |
| title: MADLAD-400 |
| - local: model_doc/mamba |
| title: Mamba |
| - local: model_doc/mamba2 |
| title: mamba2 |
| - local: model_doc/marian |
| title: MarianMT |
| - local: model_doc/markuplm |
| title: MarkupLM |
| - local: model_doc/mbart |
| title: MBart and MBart-50 |
| - local: model_doc/mega |
| title: MEGA |
| - local: model_doc/megatron-bert |
| title: MegatronBERT |
| - local: model_doc/megatron_gpt2 |
| title: MegatronGPT2 |
| - local: model_doc/mistral |
| title: Mistral |
| - local: model_doc/mixtral |
| title: Mixtral |
| - local: model_doc/mluke |
| title: mLUKE |
| - local: model_doc/mobilebert |
| title: MobileBERT |
| - local: model_doc/modernbert |
| title: ModernBert |
| - local: model_doc/mpnet |
| title: MPNet |
| - local: model_doc/mpt |
| title: MPT |
| - local: model_doc/mra |
| title: MRA |
| - local: model_doc/mt5 |
| title: MT5 |
| - local: model_doc/mvp |
| title: MVP |
| - local: model_doc/myt5 |
| title: myt5 |
| - local: model_doc/nemotron |
| title: Nemotron |
| - local: model_doc/nezha |
| title: NEZHA |
| - local: model_doc/nllb |
| title: NLLB |
| - local: model_doc/nllb-moe |
| title: NLLB-MoE |
| - local: model_doc/nystromformer |
| title: Nyströmformer |
| - local: model_doc/olmo |
| title: OLMo |
| - local: model_doc/olmo2 |
| title: OLMo2 |
| - local: model_doc/olmoe |
| title: OLMoE |
| - local: model_doc/open-llama |
| title: Open-Llama |
| - local: model_doc/opt |
| title: OPT |
| - local: model_doc/pegasus |
| title: Pegasus |
| - local: model_doc/pegasus_x |
| title: PEGASUS-X |
| - local: model_doc/persimmon |
| title: Persimmon |
| - local: model_doc/phi |
| title: Phi |
| - local: model_doc/phi3 |
| title: Phi-3 |
| - local: model_doc/phimoe |
| title: PhiMoE |
| - local: model_doc/phobert |
| title: PhoBERT |
| - local: model_doc/plbart |
| title: PLBart |
| - local: model_doc/prophetnet |
| title: ProphetNet |
| - local: model_doc/qdqbert |
| title: QDQBert |
| - local: model_doc/qwen2 |
| title: Qwen2 |
| - local: model_doc/qwen2_moe |
| title: Qwen2MoE |
| - local: model_doc/qwen3 |
| title: Qwen3 |
| - local: model_doc/qwen3_moe |
| title: Qwen3MoE |
| - local: model_doc/rag |
| title: RAG |
| - local: model_doc/realm |
| title: REALM |
| - local: model_doc/recurrent_gemma |
| title: RecurrentGemma |
| - local: model_doc/reformer |
| title: Reformer |
| - local: model_doc/rembert |
| title: RemBERT |
| - local: model_doc/retribert |
| title: RetriBERT |
| - local: model_doc/roberta |
| title: RoBERTa |
| - local: model_doc/roberta-prelayernorm |
| title: RoBERTa-PreLayerNorm |
| - local: model_doc/roc_bert |
| title: RoCBert |
| - local: model_doc/roformer |
| title: RoFormer |
| - local: model_doc/rwkv |
| title: RWKV |
| - local: model_doc/splinter |
| title: Splinter |
| - local: model_doc/squeezebert |
| title: SqueezeBERT |
| - local: model_doc/stablelm |
| title: StableLm |
| - local: model_doc/starcoder2 |
| title: Starcoder2 |
| - local: model_doc/switch_transformers |
| title: SwitchTransformers |
| - local: model_doc/t5 |
| title: T5 |
| - local: model_doc/t5v1.1 |
| title: T5v1.1 |
| - local: model_doc/tapex |
| title: TAPEX |
| - local: model_doc/transfo-xl |
| title: Transformer XL |
| - local: model_doc/ul2 |
| title: UL2 |
| - local: model_doc/umt5 |
| title: UMT5 |
| - local: model_doc/xmod |
| title: X-MOD |
| - local: model_doc/xglm |
| title: XGLM |
| - local: model_doc/xlm |
| title: XLM |
| - local: model_doc/xlm-prophetnet |
| title: XLM-ProphetNet |
| - local: model_doc/xlm-roberta |
| title: XLM-RoBERTa |
| - local: model_doc/xlm-roberta-xl |
| title: XLM-RoBERTa-XL |
| - local: model_doc/xlm-v |
| title: XLM-V |
| - local: model_doc/xlnet |
| title: XLNet |
| - local: model_doc/yoso |
| title: YOSO |
| - local: model_doc/zamba |
| title: Zamba |
| - local: model_doc/zamba2 |
| title: Zamba2 |
| title: Text models |
| - sections: |
| - local: model_doc/beit |
| title: BEiT |
| - local: model_doc/bit |
| title: BiT |
| - local: model_doc/conditional_detr |
| title: Conditional DETR |
| - local: model_doc/convnext |
| title: ConvNeXT |
| - local: model_doc/convnextv2 |
| title: ConvNeXTV2 |
| - local: model_doc/cvt |
| title: CvT |
| - local: model_doc/d_fine |
| title: D-FINE |
| - local: model_doc/dab-detr |
| title: DAB-DETR |
| - local: model_doc/deformable_detr |
| title: Deformable DETR |
| - local: model_doc/deit |
| title: DeiT |
| - local: model_doc/depth_anything |
| title: Depth Anything |
| - local: model_doc/depth_anything_v2 |
| title: Depth Anything V2 |
| - local: model_doc/depth_pro |
| title: DepthPro |
| - local: model_doc/deta |
| title: DETA |
| - local: model_doc/detr |
| title: DETR |
| - local: model_doc/dinat |
| title: DiNAT |
| - local: model_doc/dinov2 |
| title: DINOv2 |
| - local: model_doc/dinov2_with_registers |
| title: DINOv2 with Registers |
| - local: model_doc/dit |
| title: DiT |
| - local: model_doc/dpt |
| title: DPT |
| - local: model_doc/efficientformer |
| title: EfficientFormer |
| - local: model_doc/efficientnet |
| title: EfficientNet |
| - local: model_doc/focalnet |
| title: FocalNet |
| - local: model_doc/glpn |
| title: GLPN |
| - local: model_doc/hiera |
| title: Hiera |
| - local: model_doc/ijepa |
| title: I-JEPA |
| - local: model_doc/imagegpt |
| title: ImageGPT |
| - local: model_doc/levit |
| title: LeViT |
| - local: model_doc/mask2former |
| title: Mask2Former |
| - local: model_doc/maskformer |
| title: MaskFormer |
| - local: model_doc/mlcd |
| title: MLCD |
| - local: model_doc/mobilenet_v1 |
| title: MobileNetV1 |
| - local: model_doc/mobilenet_v2 |
| title: MobileNetV2 |
| - local: model_doc/mobilevit |
| title: MobileViT |
| - local: model_doc/mobilevitv2 |
| title: MobileViTV2 |
| - local: model_doc/nat |
| title: NAT |
| - local: model_doc/poolformer |
| title: PoolFormer |
| - local: model_doc/prompt_depth_anything |
| title: Prompt Depth Anything |
| - local: model_doc/pvt |
| title: Pyramid Vision Transformer (PVT) |
| - local: model_doc/pvt_v2 |
| title: Pyramid Vision Transformer v2 (PVTv2) |
| - local: model_doc/regnet |
| title: RegNet |
| - local: model_doc/resnet |
| title: ResNet |
| - local: model_doc/rt_detr |
| title: RT-DETR |
| - local: model_doc/rt_detr_v2 |
| title: RT-DETRv2 |
| - local: model_doc/segformer |
| title: SegFormer |
| - local: model_doc/seggpt |
| title: SegGpt |
| - local: model_doc/superglue |
| title: SuperGlue |
| - local: model_doc/superpoint |
| title: SuperPoint |
| - local: model_doc/swiftformer |
| title: SwiftFormer |
| - local: model_doc/swin |
| title: Swin Transformer |
| - local: model_doc/swinv2 |
| title: Swin Transformer V2 |
| - local: model_doc/swin2sr |
| title: Swin2SR |
| - local: model_doc/table-transformer |
| title: Table Transformer |
| - local: model_doc/textnet |
| title: TextNet |
| - local: model_doc/timm_wrapper |
| title: Timm Wrapper |
| - local: model_doc/upernet |
| title: UperNet |
| - local: model_doc/van |
| title: VAN |
| - local: model_doc/vit |
| title: Vision Transformer (ViT) |
| - local: model_doc/vit_hybrid |
| title: ViT Hybrid |
| - local: model_doc/vitdet |
| title: ViTDet |
| - local: model_doc/vit_mae |
| title: ViTMAE |
| - local: model_doc/vitmatte |
| title: ViTMatte |
| - local: model_doc/vit_msn |
| title: ViTMSN |
| - local: model_doc/vitpose |
| title: ViTPose |
| - local: model_doc/yolos |
| title: YOLOS |
| - local: model_doc/zoedepth |
| title: ZoeDepth |
| title: Vision models |
| - sections: |
| - local: model_doc/audio-spectrogram-transformer |
| title: Audio Spectrogram Transformer |
| - local: model_doc/bark |
| title: Bark |
| - local: model_doc/clap |
| title: CLAP |
| - local: model_doc/csm |
| title: CSM |
| - local: model_doc/dac |
| title: dac |
| - local: model_doc/encodec |
| title: EnCodec |
| - local: model_doc/fastspeech2_conformer |
| title: FastSpeech2Conformer |
| - local: model_doc/granite_speech |
| title: GraniteSpeech |
| - local: model_doc/hubert |
| title: Hubert |
| - local: model_doc/mctct |
| title: MCTCT |
| - local: model_doc/mimi |
| title: Mimi |
| - local: model_doc/mms |
| title: MMS |
| - local: model_doc/moonshine |
| title: Moonshine |
| - local: model_doc/moshi |
| title: Moshi |
| - local: model_doc/musicgen |
| title: MusicGen |
| - local: model_doc/musicgen_melody |
| title: MusicGen Melody |
| - local: model_doc/pop2piano |
| title: Pop2Piano |
| - local: model_doc/seamless_m4t |
| title: Seamless-M4T |
| - local: model_doc/seamless_m4t_v2 |
| title: SeamlessM4T-v2 |
| - local: model_doc/sew |
| title: SEW |
| - local: model_doc/sew-d |
| title: SEW-D |
| - local: model_doc/speech_to_text |
| title: Speech2Text |
| - local: model_doc/speech_to_text_2 |
| title: Speech2Text2 |
| - local: model_doc/speecht5 |
| title: SpeechT5 |
| - local: model_doc/unispeech |
| title: UniSpeech |
| - local: model_doc/unispeech-sat |
| title: UniSpeech-SAT |
| - local: model_doc/univnet |
| title: UnivNet |
| - local: model_doc/vits |
| title: VITS |
| - local: model_doc/wav2vec2 |
| title: Wav2Vec2 |
| - local: model_doc/wav2vec2-bert |
| title: Wav2Vec2-BERT |
| - local: model_doc/wav2vec2-conformer |
| title: Wav2Vec2-Conformer |
| - local: model_doc/wav2vec2_phoneme |
| title: Wav2Vec2Phoneme |
| - local: model_doc/wavlm |
| title: WavLM |
| - local: model_doc/whisper |
| title: Whisper |
| - local: model_doc/xls_r |
| title: XLS-R |
| - local: model_doc/xlsr_wav2vec2 |
| title: XLSR-Wav2Vec2 |
| title: Audio models |
| - sections: |
| - local: model_doc/timesformer |
| title: TimeSformer |
| - local: model_doc/videomae |
| title: VideoMAE |
| - local: model_doc/vivit |
| title: ViViT |
| title: Video models |
| - sections: |
| - local: model_doc/align |
| title: ALIGN |
| - local: model_doc/altclip |
| title: AltCLIP |
| - local: model_doc/aria |
| title: Aria |
| - local: model_doc/aya_vision |
| title: AyaVision |
| - local: model_doc/blip |
| title: BLIP |
| - local: model_doc/blip-2 |
| title: BLIP-2 |
| - local: model_doc/bridgetower |
| title: BridgeTower |
| - local: model_doc/bros |
| title: BROS |
| - local: model_doc/chameleon |
| title: Chameleon |
| - local: model_doc/chinese_clip |
| title: Chinese-CLIP |
| - local: model_doc/clip |
| title: CLIP |
| - local: model_doc/clipseg |
| title: CLIPSeg |
| - local: model_doc/clvp |
| title: CLVP |
| - local: model_doc/colpali |
| title: ColPali |
| - local: model_doc/data2vec |
| title: Data2Vec |
| - local: model_doc/deplot |
| title: DePlot |
| - local: model_doc/donut |
| title: Donut |
| - local: model_doc/emu3 |
| title: Emu3 |
| - local: model_doc/flava |
| title: FLAVA |
| - local: model_doc/gemma3 |
| title: Gemma3 |
| - local: model_doc/git |
| title: GIT |
| - local: model_doc/got_ocr2 |
| title: GOT-OCR2 |
| - local: model_doc/granitevision |
| title: GraniteVision |
| - local: model_doc/grounding-dino |
| title: Grounding DINO |
| - local: model_doc/groupvit |
| title: GroupViT |
| - local: model_doc/idefics |
| title: IDEFICS |
| - local: model_doc/idefics2 |
| title: Idefics2 |
| - local: model_doc/idefics3 |
| title: Idefics3 |
| - local: model_doc/instructblip |
| title: InstructBLIP |
| - local: model_doc/instructblipvideo |
| title: InstructBlipVideo |
| - local: model_doc/internvl |
| title: InternVL |
| - local: model_doc/janus |
| title: Janus |
| - local: model_doc/kosmos-2 |
| title: KOSMOS-2 |
| - local: model_doc/layoutlm |
| title: LayoutLM |
| - local: model_doc/layoutlmv2 |
| title: LayoutLMV2 |
| - local: model_doc/layoutlmv3 |
| title: LayoutLMV3 |
| - local: model_doc/layoutxlm |
| title: LayoutXLM |
| - local: model_doc/lilt |
| title: LiLT |
| - local: model_doc/llama4 |
| title: Llama4 |
| - local: model_doc/llava |
| title: Llava |
| - local: model_doc/llava_next |
| title: LLaVA-NeXT |
| - local: model_doc/llava_next_video |
| title: LLaVA-NeXT-Video |
| - local: model_doc/llava_onevision |
| title: LLaVA-OneVision |
| - local: model_doc/lxmert |
| title: LXMERT |
| - local: model_doc/matcha |
| title: MatCha |
| - local: model_doc/mgp-str |
| title: MGP-STR |
| - local: model_doc/mistral3 |
| title: Mistral3 |
| - local: model_doc/mllama |
| title: mllama |
| - local: model_doc/nougat |
| title: Nougat |
| - local: model_doc/omdet-turbo |
| title: OmDet-Turbo |
| - local: model_doc/oneformer |
| title: OneFormer |
| - local: model_doc/owlvit |
| title: OWL-ViT |
| - local: model_doc/owlv2 |
| title: OWLv2 |
| - local: model_doc/paligemma |
| title: PaliGemma |
| - local: model_doc/perceiver |
| title: Perceiver |
| - local: model_doc/phi4_multimodal |
| title: Phi4 Multimodal |
| - local: model_doc/pix2struct |
| title: Pix2Struct |
| - local: model_doc/pixtral |
| title: Pixtral |
| - local: model_doc/qwen2_5_omni |
| title: Qwen2.5-Omni |
| - local: model_doc/qwen2_5_vl |
| title: Qwen2.5-VL |
| - local: model_doc/qwen2_audio |
| title: Qwen2Audio |
| - local: model_doc/qwen2_vl |
| title: Qwen2VL |
| - local: model_doc/sam |
| title: Segment Anything |
| - local: model_doc/sam_hq |
| title: Segment Anything High Quality |
| - local: model_doc/shieldgemma2 |
| title: ShieldGemma2 |
| - local: model_doc/siglip |
| title: SigLIP |
| - local: model_doc/siglip2 |
| title: SigLIP2 |
| - local: model_doc/smolvlm |
| title: SmolVLM |
| - local: model_doc/speech-encoder-decoder |
| title: Speech Encoder Decoder Models |
| - local: model_doc/tapas |
| title: TAPAS |
| - local: model_doc/trocr |
| title: TrOCR |
| - local: model_doc/tvlt |
| title: TVLT |
| - local: model_doc/tvp |
| title: TVP |
| - local: model_doc/udop |
| title: UDOP |
| - local: model_doc/video_llava |
| title: VideoLlava |
| - local: model_doc/vilt |
| title: ViLT |
| - local: model_doc/vipllava |
| title: VipLlava |
| - local: model_doc/vision-encoder-decoder |
| title: Vision Encoder Decoder Models |
| - local: model_doc/vision-text-dual-encoder |
| title: Vision Text Dual Encoder |
| - local: model_doc/visual_bert |
| title: VisualBERT |
| - local: model_doc/xclip |
| title: X-CLIP |
| title: Multimodal models |
| - sections: |
| - local: model_doc/decision_transformer |
| title: Decision Transformer |
| - local: model_doc/trajectory_transformer |
| title: Trajectory Transformer |
| title: Reinforcement learning models |
| - sections: |
| - local: model_doc/autoformer |
| title: Autoformer |
| - local: model_doc/informer |
| title: Informer |
| - local: model_doc/patchtsmixer |
| title: PatchTSMixer |
| - local: model_doc/patchtst |
| title: PatchTST |
| - local: model_doc/time_series_transformer |
| title: Time Series Transformer |
| - local: model_doc/timesfm |
| title: TimesFM |
| title: Time series models |
| - sections: |
| - local: model_doc/graphormer |
| title: Graphormer |
| title: Graph models |
| title: Models |
| - sections: |
| - local: internal/modeling_utils |
| title: Custom Layers and Utilities |
| - local: internal/model_debugging_utils |
| title: Utilities for Model Debugging |
| - local: internal/pipelines_utils |
| title: Utilities for pipelines |
| - local: internal/tokenization_utils |
| title: Utilities for Tokenizers |
| - local: internal/trainer_utils |
| title: Utilities for Trainer |
| - local: internal/generation_utils |
| title: Utilities for Generation |
| - local: internal/image_processing_utils |
| title: Utilities for Image Processors |
| - local: internal/audio_utils |
| title: Utilities for Audio processing |
| - local: internal/file_utils |
| title: General Utilities |
| - local: internal/import_utils |
| title: Importing Utilities |
| - local: internal/time_series_utils |
| title: Utilities for Time Series |
| title: Internal helpers |
| title: API |
|
|