diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/image_processing_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/image_processing_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a5725153d17701ba387c6681c3bdd6291bae9d9f
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/image_processing_utils.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/image_transforms.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/image_transforms.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3f8b31a738e3eae16ffb94bc75d499450fcebfb0
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/image_transforms.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/modelcard.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/modelcard.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c589a78197c19eb463bdec2f981bb82c58e6e431
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/modelcard.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/training_args_tf.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/training_args_tf.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f88c77252185ad0e4892fb712861eaad491e1c09
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/training_args_tf.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dadeab816cc49a8b6a3dcbfba77183f935428be1
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/beam_constraints.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/beam_constraints.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cac9a2ef5255abae0588c8e41bd69d4eef83e86b
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/beam_constraints.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/beam_search.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/beam_search.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9890b9a6e459e22a1bc8611dcb8031cf9f60cfa2
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/beam_search.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__init__.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..257f5689b0ed71afd8560aeb183f4e47beb03d47
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/__init__.py
@@ -0,0 +1,1178 @@
+# coding=utf-8
+# Copyright 2018 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import json
+import os
+import warnings
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
+
+from huggingface_hub import model_info
+
+from ..configuration_utils import PretrainedConfig
+from ..dynamic_module_utils import get_class_from_dynamic_module
+from ..feature_extraction_utils import PreTrainedFeatureExtractor
+from ..image_processing_utils import BaseImageProcessor
+from ..models.auto.configuration_auto import AutoConfig
+from ..models.auto.feature_extraction_auto import FEATURE_EXTRACTOR_MAPPING, AutoFeatureExtractor
+from ..models.auto.image_processing_auto import IMAGE_PROCESSOR_MAPPING, AutoImageProcessor
+from ..models.auto.modeling_auto import AutoModelForDepthEstimation, AutoModelForImageToImage
+from ..models.auto.processing_auto import PROCESSOR_MAPPING, AutoProcessor
+from ..models.auto.tokenization_auto import TOKENIZER_MAPPING, AutoTokenizer
+from ..processing_utils import ProcessorMixin
+from ..tokenization_utils import PreTrainedTokenizer
+from ..utils import (
+ CONFIG_NAME,
+ HUGGINGFACE_CO_RESOLVE_ENDPOINT,
+ cached_file,
+ extract_commit_hash,
+ find_adapter_config_file,
+ is_kenlm_available,
+ is_offline_mode,
+ is_peft_available,
+ is_pyctcdecode_available,
+ is_tf_available,
+ is_torch_available,
+ logging,
+)
+from .audio_classification import AudioClassificationPipeline
+from .automatic_speech_recognition import AutomaticSpeechRecognitionPipeline
+from .base import (
+ ArgumentHandler,
+ CsvPipelineDataFormat,
+ JsonPipelineDataFormat,
+ PipedPipelineDataFormat,
+ Pipeline,
+ PipelineDataFormat,
+ PipelineException,
+ PipelineRegistry,
+ get_default_model_and_revision,
+ infer_framework_load_model,
+)
+from .depth_estimation import DepthEstimationPipeline
+from .document_question_answering import DocumentQuestionAnsweringPipeline
+from .feature_extraction import FeatureExtractionPipeline
+from .fill_mask import FillMaskPipeline
+from .image_classification import ImageClassificationPipeline
+from .image_feature_extraction import ImageFeatureExtractionPipeline
+from .image_segmentation import ImageSegmentationPipeline
+from .image_text_to_text import ImageTextToTextPipeline
+from .image_to_image import ImageToImagePipeline
+from .image_to_text import ImageToTextPipeline
+from .mask_generation import MaskGenerationPipeline
+from .object_detection import ObjectDetectionPipeline
+from .question_answering import QuestionAnsweringArgumentHandler, QuestionAnsweringPipeline
+from .table_question_answering import TableQuestionAnsweringArgumentHandler, TableQuestionAnsweringPipeline
+from .text2text_generation import SummarizationPipeline, Text2TextGenerationPipeline, TranslationPipeline
+from .text_classification import TextClassificationPipeline
+from .text_generation import TextGenerationPipeline
+from .text_to_audio import TextToAudioPipeline
+from .token_classification import (
+ AggregationStrategy,
+ NerPipeline,
+ TokenClassificationArgumentHandler,
+ TokenClassificationPipeline,
+)
+from .video_classification import VideoClassificationPipeline
+from .visual_question_answering import VisualQuestionAnsweringPipeline
+from .zero_shot_audio_classification import ZeroShotAudioClassificationPipeline
+from .zero_shot_classification import ZeroShotClassificationArgumentHandler, ZeroShotClassificationPipeline
+from .zero_shot_image_classification import ZeroShotImageClassificationPipeline
+from .zero_shot_object_detection import ZeroShotObjectDetectionPipeline
+
+
+if is_tf_available():
+ import tensorflow as tf
+
+ from ..models.auto.modeling_tf_auto import (
+ TFAutoModel,
+ TFAutoModelForCausalLM,
+ TFAutoModelForImageClassification,
+ TFAutoModelForMaskedLM,
+ TFAutoModelForQuestionAnswering,
+ TFAutoModelForSeq2SeqLM,
+ TFAutoModelForSequenceClassification,
+ TFAutoModelForTableQuestionAnswering,
+ TFAutoModelForTokenClassification,
+ TFAutoModelForVision2Seq,
+ TFAutoModelForZeroShotImageClassification,
+ )
+
+if is_torch_available():
+ import torch
+
+ from ..models.auto.modeling_auto import (
+ AutoModel,
+ AutoModelForAudioClassification,
+ AutoModelForCausalLM,
+ AutoModelForCTC,
+ AutoModelForDocumentQuestionAnswering,
+ AutoModelForImageClassification,
+ AutoModelForImageSegmentation,
+ AutoModelForImageTextToText,
+ AutoModelForMaskedLM,
+ AutoModelForMaskGeneration,
+ AutoModelForObjectDetection,
+ AutoModelForQuestionAnswering,
+ AutoModelForSemanticSegmentation,
+ AutoModelForSeq2SeqLM,
+ AutoModelForSequenceClassification,
+ AutoModelForSpeechSeq2Seq,
+ AutoModelForTableQuestionAnswering,
+ AutoModelForTextToSpectrogram,
+ AutoModelForTextToWaveform,
+ AutoModelForTokenClassification,
+ AutoModelForVideoClassification,
+ AutoModelForVision2Seq,
+ AutoModelForVisualQuestionAnswering,
+ AutoModelForZeroShotImageClassification,
+ AutoModelForZeroShotObjectDetection,
+ )
+
+
+if TYPE_CHECKING:
+ from ..modeling_tf_utils import TFPreTrainedModel
+ from ..modeling_utils import PreTrainedModel
+ from ..tokenization_utils_fast import PreTrainedTokenizerFast
+
+
+logger = logging.get_logger(__name__)
+
+
+# Register all the supported tasks here
+TASK_ALIASES = {
+ "sentiment-analysis": "text-classification",
+ "ner": "token-classification",
+ "vqa": "visual-question-answering",
+ "text-to-speech": "text-to-audio",
+}
+SUPPORTED_TASKS = {
+ "audio-classification": {
+ "impl": AudioClassificationPipeline,
+ "tf": (),
+ "pt": (AutoModelForAudioClassification,) if is_torch_available() else (),
+ "default": {"model": {"pt": ("superb/wav2vec2-base-superb-ks", "372e048")}},
+ "type": "audio",
+ },
+ "automatic-speech-recognition": {
+ "impl": AutomaticSpeechRecognitionPipeline,
+ "tf": (),
+ "pt": (AutoModelForCTC, AutoModelForSpeechSeq2Seq) if is_torch_available() else (),
+ "default": {"model": {"pt": ("facebook/wav2vec2-base-960h", "22aad52")}},
+ "type": "multimodal",
+ },
+ "text-to-audio": {
+ "impl": TextToAudioPipeline,
+ "tf": (),
+ "pt": (AutoModelForTextToWaveform, AutoModelForTextToSpectrogram) if is_torch_available() else (),
+ "default": {"model": {"pt": ("suno/bark-small", "1dbd7a1")}},
+ "type": "text",
+ },
+ "feature-extraction": {
+ "impl": FeatureExtractionPipeline,
+ "tf": (TFAutoModel,) if is_tf_available() else (),
+ "pt": (AutoModel,) if is_torch_available() else (),
+ "default": {
+ "model": {
+ "pt": ("distilbert/distilbert-base-cased", "6ea8117"),
+ "tf": ("distilbert/distilbert-base-cased", "6ea8117"),
+ }
+ },
+ "type": "multimodal",
+ },
+ "text-classification": {
+ "impl": TextClassificationPipeline,
+ "tf": (TFAutoModelForSequenceClassification,) if is_tf_available() else (),
+ "pt": (AutoModelForSequenceClassification,) if is_torch_available() else (),
+ "default": {
+ "model": {
+ "pt": ("distilbert/distilbert-base-uncased-finetuned-sst-2-english", "714eb0f"),
+ "tf": ("distilbert/distilbert-base-uncased-finetuned-sst-2-english", "714eb0f"),
+ },
+ },
+ "type": "text",
+ },
+ "token-classification": {
+ "impl": TokenClassificationPipeline,
+ "tf": (TFAutoModelForTokenClassification,) if is_tf_available() else (),
+ "pt": (AutoModelForTokenClassification,) if is_torch_available() else (),
+ "default": {
+ "model": {
+ "pt": ("dbmdz/bert-large-cased-finetuned-conll03-english", "4c53496"),
+ "tf": ("dbmdz/bert-large-cased-finetuned-conll03-english", "4c53496"),
+ },
+ },
+ "type": "text",
+ },
+ "question-answering": {
+ "impl": QuestionAnsweringPipeline,
+ "tf": (TFAutoModelForQuestionAnswering,) if is_tf_available() else (),
+ "pt": (AutoModelForQuestionAnswering,) if is_torch_available() else (),
+ "default": {
+ "model": {
+ "pt": ("distilbert/distilbert-base-cased-distilled-squad", "564e9b5"),
+ "tf": ("distilbert/distilbert-base-cased-distilled-squad", "564e9b5"),
+ },
+ },
+ "type": "text",
+ },
+ "table-question-answering": {
+ "impl": TableQuestionAnsweringPipeline,
+ "pt": (AutoModelForTableQuestionAnswering,) if is_torch_available() else (),
+ "tf": (TFAutoModelForTableQuestionAnswering,) if is_tf_available() else (),
+ "default": {
+ "model": {
+ "pt": ("google/tapas-base-finetuned-wtq", "e3dde19"),
+ "tf": ("google/tapas-base-finetuned-wtq", "e3dde19"),
+ },
+ },
+ "type": "text",
+ },
+ "visual-question-answering": {
+ "impl": VisualQuestionAnsweringPipeline,
+ "pt": (AutoModelForVisualQuestionAnswering,) if is_torch_available() else (),
+ "tf": (),
+ "default": {
+ "model": {"pt": ("dandelin/vilt-b32-finetuned-vqa", "d0a1f6a")},
+ },
+ "type": "multimodal",
+ },
+ "document-question-answering": {
+ "impl": DocumentQuestionAnsweringPipeline,
+ "pt": (AutoModelForDocumentQuestionAnswering,) if is_torch_available() else (),
+ "tf": (),
+ "default": {
+ "model": {"pt": ("impira/layoutlm-document-qa", "beed3c4")},
+ },
+ "type": "multimodal",
+ },
+ "fill-mask": {
+ "impl": FillMaskPipeline,
+ "tf": (TFAutoModelForMaskedLM,) if is_tf_available() else (),
+ "pt": (AutoModelForMaskedLM,) if is_torch_available() else (),
+ "default": {
+ "model": {
+ "pt": ("distilbert/distilroberta-base", "fb53ab8"),
+ "tf": ("distilbert/distilroberta-base", "fb53ab8"),
+ }
+ },
+ "type": "text",
+ },
+ "summarization": {
+ "impl": SummarizationPipeline,
+ "tf": (TFAutoModelForSeq2SeqLM,) if is_tf_available() else (),
+ "pt": (AutoModelForSeq2SeqLM,) if is_torch_available() else (),
+ "default": {
+ "model": {"pt": ("sshleifer/distilbart-cnn-12-6", "a4f8f3e"), "tf": ("google-t5/t5-small", "df1b051")}
+ },
+ "type": "text",
+ },
+ # This task is a special case as it's parametrized by SRC, TGT languages.
+ "translation": {
+ "impl": TranslationPipeline,
+ "tf": (TFAutoModelForSeq2SeqLM,) if is_tf_available() else (),
+ "pt": (AutoModelForSeq2SeqLM,) if is_torch_available() else (),
+ "default": {
+ ("en", "fr"): {"model": {"pt": ("google-t5/t5-base", "a9723ea"), "tf": ("google-t5/t5-base", "a9723ea")}},
+ ("en", "de"): {"model": {"pt": ("google-t5/t5-base", "a9723ea"), "tf": ("google-t5/t5-base", "a9723ea")}},
+ ("en", "ro"): {"model": {"pt": ("google-t5/t5-base", "a9723ea"), "tf": ("google-t5/t5-base", "a9723ea")}},
+ },
+ "type": "text",
+ },
+ "text2text-generation": {
+ "impl": Text2TextGenerationPipeline,
+ "tf": (TFAutoModelForSeq2SeqLM,) if is_tf_available() else (),
+ "pt": (AutoModelForSeq2SeqLM,) if is_torch_available() else (),
+ "default": {"model": {"pt": ("google-t5/t5-base", "a9723ea"), "tf": ("google-t5/t5-base", "a9723ea")}},
+ "type": "text",
+ },
+ "text-generation": {
+ "impl": TextGenerationPipeline,
+ "tf": (TFAutoModelForCausalLM,) if is_tf_available() else (),
+ "pt": (AutoModelForCausalLM,) if is_torch_available() else (),
+ "default": {"model": {"pt": ("openai-community/gpt2", "607a30d"), "tf": ("openai-community/gpt2", "607a30d")}},
+ "type": "text",
+ },
+ "zero-shot-classification": {
+ "impl": ZeroShotClassificationPipeline,
+ "tf": (TFAutoModelForSequenceClassification,) if is_tf_available() else (),
+ "pt": (AutoModelForSequenceClassification,) if is_torch_available() else (),
+ "default": {
+ "model": {
+ "pt": ("facebook/bart-large-mnli", "d7645e1"),
+ "tf": ("FacebookAI/roberta-large-mnli", "2a8f12d"),
+ },
+ "config": {
+ "pt": ("facebook/bart-large-mnli", "d7645e1"),
+ "tf": ("FacebookAI/roberta-large-mnli", "2a8f12d"),
+ },
+ },
+ "type": "text",
+ },
+ "zero-shot-image-classification": {
+ "impl": ZeroShotImageClassificationPipeline,
+ "tf": (TFAutoModelForZeroShotImageClassification,) if is_tf_available() else (),
+ "pt": (AutoModelForZeroShotImageClassification,) if is_torch_available() else (),
+ "default": {
+ "model": {
+ "pt": ("openai/clip-vit-base-patch32", "3d74acf"),
+ "tf": ("openai/clip-vit-base-patch32", "3d74acf"),
+ }
+ },
+ "type": "multimodal",
+ },
+ "zero-shot-audio-classification": {
+ "impl": ZeroShotAudioClassificationPipeline,
+ "tf": (),
+ "pt": (AutoModel,) if is_torch_available() else (),
+ "default": {
+ "model": {
+ "pt": ("laion/clap-htsat-fused", "cca9e28"),
+ }
+ },
+ "type": "multimodal",
+ },
+ "image-classification": {
+ "impl": ImageClassificationPipeline,
+ "tf": (TFAutoModelForImageClassification,) if is_tf_available() else (),
+ "pt": (AutoModelForImageClassification,) if is_torch_available() else (),
+ "default": {
+ "model": {
+ "pt": ("google/vit-base-patch16-224", "3f49326"),
+ "tf": ("google/vit-base-patch16-224", "3f49326"),
+ }
+ },
+ "type": "image",
+ },
+ "image-feature-extraction": {
+ "impl": ImageFeatureExtractionPipeline,
+ "tf": (TFAutoModel,) if is_tf_available() else (),
+ "pt": (AutoModel,) if is_torch_available() else (),
+ "default": {
+ "model": {
+ "pt": ("google/vit-base-patch16-224", "3f49326"),
+ "tf": ("google/vit-base-patch16-224", "3f49326"),
+ }
+ },
+ "type": "image",
+ },
+ "image-segmentation": {
+ "impl": ImageSegmentationPipeline,
+ "tf": (),
+ "pt": (AutoModelForImageSegmentation, AutoModelForSemanticSegmentation) if is_torch_available() else (),
+ "default": {"model": {"pt": ("facebook/detr-resnet-50-panoptic", "d53b52a")}},
+ "type": "multimodal",
+ },
+ "image-to-text": {
+ "impl": ImageToTextPipeline,
+ "tf": (TFAutoModelForVision2Seq,) if is_tf_available() else (),
+ "pt": (AutoModelForVision2Seq,) if is_torch_available() else (),
+ "default": {
+ "model": {
+ "pt": ("ydshieh/vit-gpt2-coco-en", "5bebf1e"),
+ "tf": ("ydshieh/vit-gpt2-coco-en", "5bebf1e"),
+ }
+ },
+ "type": "multimodal",
+ },
+ "image-text-to-text": {
+ "impl": ImageTextToTextPipeline,
+ "tf": (),
+ "pt": (AutoModelForImageTextToText,) if is_torch_available() else (),
+ "default": {
+ "model": {
+ "pt": ("llava-hf/llava-onevision-qwen2-0.5b-ov-hf", "2c9ba3b"),
+ }
+ },
+ "type": "multimodal",
+ },
+ "object-detection": {
+ "impl": ObjectDetectionPipeline,
+ "tf": (),
+ "pt": (AutoModelForObjectDetection,) if is_torch_available() else (),
+ "default": {"model": {"pt": ("facebook/detr-resnet-50", "1d5f47b")}},
+ "type": "multimodal",
+ },
+ "zero-shot-object-detection": {
+ "impl": ZeroShotObjectDetectionPipeline,
+ "tf": (),
+ "pt": (AutoModelForZeroShotObjectDetection,) if is_torch_available() else (),
+ "default": {"model": {"pt": ("google/owlvit-base-patch32", "cbc355f")}},
+ "type": "multimodal",
+ },
+ "depth-estimation": {
+ "impl": DepthEstimationPipeline,
+ "tf": (),
+ "pt": (AutoModelForDepthEstimation,) if is_torch_available() else (),
+ "default": {"model": {"pt": ("Intel/dpt-large", "bc15f29")}},
+ "type": "image",
+ },
+ "video-classification": {
+ "impl": VideoClassificationPipeline,
+ "tf": (),
+ "pt": (AutoModelForVideoClassification,) if is_torch_available() else (),
+ "default": {"model": {"pt": ("MCG-NJU/videomae-base-finetuned-kinetics", "488eb9a")}},
+ "type": "video",
+ },
+ "mask-generation": {
+ "impl": MaskGenerationPipeline,
+ "tf": (),
+ "pt": (AutoModelForMaskGeneration,) if is_torch_available() else (),
+ "default": {"model": {"pt": ("facebook/sam-vit-huge", "87aecf0")}},
+ "type": "multimodal",
+ },
+ "image-to-image": {
+ "impl": ImageToImagePipeline,
+ "tf": (),
+ "pt": (AutoModelForImageToImage,) if is_torch_available() else (),
+ "default": {"model": {"pt": ("caidas/swin2SR-classical-sr-x2-64", "cee1c92")}},
+ "type": "image",
+ },
+}
+
+NO_FEATURE_EXTRACTOR_TASKS = set()
+NO_IMAGE_PROCESSOR_TASKS = set()
+NO_TOKENIZER_TASKS = set()
+
+# Those model configs are special: they are generic over their task, meaning
+# any tokenizer/feature_extractor might be used for a given model, so we cannot
+# use the statically defined TOKENIZER_MAPPING and FEATURE_EXTRACTOR_MAPPING to
+# see if the model defines such objects or not.
+MULTI_MODEL_AUDIO_CONFIGS = {"SpeechEncoderDecoderConfig"}
+MULTI_MODEL_VISION_CONFIGS = {"VisionEncoderDecoderConfig", "VisionTextDualEncoderConfig"}
+for task, values in SUPPORTED_TASKS.items():
+ if values["type"] == "text":
+ NO_FEATURE_EXTRACTOR_TASKS.add(task)
+ NO_IMAGE_PROCESSOR_TASKS.add(task)
+ elif values["type"] in {"image", "video"}:
+ NO_TOKENIZER_TASKS.add(task)
+ elif values["type"] in {"audio"}:
+ NO_TOKENIZER_TASKS.add(task)
+ NO_IMAGE_PROCESSOR_TASKS.add(task)
+ elif values["type"] != "multimodal":
+ raise ValueError(f"SUPPORTED_TASK {task} contains invalid type {values['type']}")
+
+PIPELINE_REGISTRY = PipelineRegistry(supported_tasks=SUPPORTED_TASKS, task_aliases=TASK_ALIASES)
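+
+# Illustrative sketch (not part of the library): the registry above resolves aliases declared in
+# TASK_ALIASES before looking up a task's defaults, so, assuming the registry behaves as the
+# `check_task` docstring below describes:
+#
+#     >>> normalized_task, task_defaults, task_options = PIPELINE_REGISTRY.check_task("sentiment-analysis")
+#     >>> normalized_task
+#     'text-classification'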
+
+
+def get_supported_tasks() -> List[str]:
+ """
+ Returns a list of supported task strings.
+ """
+ return PIPELINE_REGISTRY.get_supported_tasks()
+
+
+def get_task(model: str, token: Optional[str] = None, **deprecated_kwargs) -> str:
+ use_auth_token = deprecated_kwargs.pop("use_auth_token", None)
+ if use_auth_token is not None:
+ warnings.warn(
+ "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
+ FutureWarning,
+ )
+ if token is not None:
+ raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.")
+ token = use_auth_token
+
+ if is_offline_mode():
+ raise RuntimeError("You cannot infer task automatically within `pipeline` when using offline mode")
+ try:
+ info = model_info(model, token=token)
+ except Exception as e:
+ raise RuntimeError(f"Instantiating a pipeline without a task set raised an error: {e}")
+ if not info.pipeline_tag:
+ raise RuntimeError(
+ f"The model {model} does not seem to have a correct `pipeline_tag` set to infer the task automatically"
+ )
+ if getattr(info, "library_name", "transformers") not in {"transformers", "timm"}:
+ raise RuntimeError(f"This model is meant to be used with {info.library_name} not with transformers")
+ task = info.pipeline_tag
+ return task
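+
+
+# Illustrative sketch (not part of the library): `get_task` simply returns the `pipeline_tag`
+# advertised by the model repo on the Hub. For example, assuming the Hub metadata of the default
+# text-classification checkpoint registered above is unchanged:
+#
+#     >>> get_task("distilbert/distilbert-base-uncased-finetuned-sst-2-english")
+#     'text-classification'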
+
+
+def check_task(task: str) -> Tuple[str, Dict, Any]:
+ """
+    Checks an incoming task string to validate that it's correct, and returns the default Pipeline and Model classes
+    and default models if they exist.
+
+ Args:
+ task (`str`):
+ The task defining which pipeline will be returned. Currently accepted tasks are:
+
+ - `"audio-classification"`
+ - `"automatic-speech-recognition"`
+ - `"depth-estimation"`
+ - `"document-question-answering"`
+ - `"feature-extraction"`
+ - `"fill-mask"`
+ - `"image-classification"`
+ - `"image-feature-extraction"`
+            - `"image-segmentation"`
+            - `"image-text-to-text"`
+            - `"image-to-text"`
+            - `"image-to-image"`
+            - `"mask-generation"`
+ - `"object-detection"`
+ - `"question-answering"`
+ - `"summarization"`
+ - `"table-question-answering"`
+ - `"text2text-generation"`
+ - `"text-classification"` (alias `"sentiment-analysis"` available)
+ - `"text-generation"`
+ - `"text-to-audio"` (alias `"text-to-speech"` available)
+ - `"token-classification"` (alias `"ner"` available)
+ - `"translation"`
+ - `"translation_xx_to_yy"`
+ - `"video-classification"`
+ - `"visual-question-answering"` (alias `"vqa"` available)
+            - `"zero-shot-classification"`
+            - `"zero-shot-audio-classification"`
+            - `"zero-shot-image-classification"`
+ - `"zero-shot-object-detection"`
+
+ Returns:
+ (normalized_task: `str`, task_defaults: `dict`, task_options: (`tuple`, None)) The normalized task name
+ (removed alias and options). The actual dictionary required to initialize the pipeline and some extra task
+ options for parametrized tasks like "translation_XX_to_YY"
+
+ """
+ return PIPELINE_REGISTRY.check_task(task)
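+
+
+# Illustrative sketch (not part of the library): parametrized tasks keep their options in the third
+# element of the returned tuple, assuming the registry parses "translation_XX_to_YY" as documented above:
+#
+#     >>> normalized_task, task_defaults, task_options = check_task("translation_en_to_fr")
+#     >>> normalized_task, task_options
+#     ('translation', ('en', 'fr'))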
+
+
+def clean_custom_task(task_info):
+ import transformers
+
+ if "impl" not in task_info:
+ raise RuntimeError("This model introduces a custom pipeline without specifying its implementation.")
+ pt_class_names = task_info.get("pt", ())
+ if isinstance(pt_class_names, str):
+ pt_class_names = [pt_class_names]
+ task_info["pt"] = tuple(getattr(transformers, c) for c in pt_class_names)
+ tf_class_names = task_info.get("tf", ())
+ if isinstance(tf_class_names, str):
+ tf_class_names = [tf_class_names]
+ task_info["tf"] = tuple(getattr(transformers, c) for c in tf_class_names)
+ return task_info, None
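+
+
+# Illustrative sketch (not part of the library): a `config.custom_pipelines` entry consumed by
+# `clean_custom_task` is expected to look roughly like the dict below. The "pt"/"tf" values are class
+# *names* resolved against the `transformers` namespace; the task and pipeline names here are hypothetical.
+#
+#     {
+#         "my-new-task": {
+#             "impl": "my_pipeline.MyCustomPipeline",  # class reference loaded from the Hub repo
+#             "pt": "AutoModelForSequenceClassification",
+#             "tf": (),
+#         }
+#     }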
+
+
+def pipeline(
+ task: str = None,
+ model: Optional[Union[str, "PreTrainedModel", "TFPreTrainedModel"]] = None,
+ config: Optional[Union[str, PretrainedConfig]] = None,
+ tokenizer: Optional[Union[str, PreTrainedTokenizer, "PreTrainedTokenizerFast"]] = None,
+ feature_extractor: Optional[Union[str, PreTrainedFeatureExtractor]] = None,
+ image_processor: Optional[Union[str, BaseImageProcessor]] = None,
+ processor: Optional[Union[str, ProcessorMixin]] = None,
+ framework: Optional[str] = None,
+ revision: Optional[str] = None,
+ use_fast: bool = True,
+ token: Optional[Union[str, bool]] = None,
+ device: Optional[Union[int, str, "torch.device"]] = None,
+ device_map=None,
+ torch_dtype=None,
+ trust_remote_code: Optional[bool] = None,
+ model_kwargs: Dict[str, Any] = None,
+ pipeline_class: Optional[Any] = None,
+ **kwargs,
+) -> Pipeline:
+ """
+ Utility factory method to build a [`Pipeline`].
+
+ A pipeline consists of:
+
+ - One or more components for pre-processing model inputs, such as a [tokenizer](tokenizer),
+ [image_processor](image_processor), [feature_extractor](feature_extractor), or [processor](processors).
+ - A [model](model) that generates predictions from the inputs.
+ - Optional post-processing steps to refine the model's output, which can also be handled by processors.
+
+
+    Although `tokenizer`, `feature_extractor`, `image_processor`, and `processor` are all optional arguments, they
+    shouldn't be specified all at once. If these components are not provided, `pipeline` will try to load the
+    required ones automatically. If you want to provide these components explicitly, please refer to the
+    documentation of the specific pipeline for details on which components are required.
+
+
+ Args:
+ task (`str`):
+ The task defining which pipeline will be returned. Currently accepted tasks are:
+
+        - `"audio-classification"`: will return an [`AudioClassificationPipeline`].
+        - `"automatic-speech-recognition"`: will return an [`AutomaticSpeechRecognitionPipeline`].
+        - `"depth-estimation"`: will return a [`DepthEstimationPipeline`].
+        - `"document-question-answering"`: will return a [`DocumentQuestionAnsweringPipeline`].
+        - `"feature-extraction"`: will return a [`FeatureExtractionPipeline`].
+        - `"fill-mask"`: will return a [`FillMaskPipeline`].
+        - `"image-classification"`: will return an [`ImageClassificationPipeline`].
+        - `"image-feature-extraction"`: will return an [`ImageFeatureExtractionPipeline`].
+        - `"image-segmentation"`: will return an [`ImageSegmentationPipeline`].
+        - `"image-text-to-text"`: will return an [`ImageTextToTextPipeline`].
+        - `"image-to-image"`: will return an [`ImageToImagePipeline`].
+        - `"image-to-text"`: will return an [`ImageToTextPipeline`].
+        - `"mask-generation"`: will return a [`MaskGenerationPipeline`].
+        - `"object-detection"`: will return an [`ObjectDetectionPipeline`].
+ - `"question-answering"`: will return a [`QuestionAnsweringPipeline`].
+ - `"summarization"`: will return a [`SummarizationPipeline`].
+ - `"table-question-answering"`: will return a [`TableQuestionAnsweringPipeline`].
+ - `"text2text-generation"`: will return a [`Text2TextGenerationPipeline`].
+ - `"text-classification"` (alias `"sentiment-analysis"` available): will return a
+ [`TextClassificationPipeline`].
+        - `"text-generation"`: will return a [`TextGenerationPipeline`].
+        - `"text-to-audio"` (alias `"text-to-speech"` available): will return a [`TextToAudioPipeline`].
+ - `"token-classification"` (alias `"ner"` available): will return a [`TokenClassificationPipeline`].
+ - `"translation"`: will return a [`TranslationPipeline`].
+ - `"translation_xx_to_yy"`: will return a [`TranslationPipeline`].
+ - `"video-classification"`: will return a [`VideoClassificationPipeline`].
+ - `"visual-question-answering"`: will return a [`VisualQuestionAnsweringPipeline`].
+ - `"zero-shot-classification"`: will return a [`ZeroShotClassificationPipeline`].
+ - `"zero-shot-image-classification"`: will return a [`ZeroShotImageClassificationPipeline`].
+ - `"zero-shot-audio-classification"`: will return a [`ZeroShotAudioClassificationPipeline`].
+ - `"zero-shot-object-detection"`: will return a [`ZeroShotObjectDetectionPipeline`].
+
+ model (`str` or [`PreTrainedModel`] or [`TFPreTrainedModel`], *optional*):
+ The model that will be used by the pipeline to make predictions. This can be a model identifier or an
+ actual instance of a pretrained model inheriting from [`PreTrainedModel`] (for PyTorch) or
+ [`TFPreTrainedModel`] (for TensorFlow).
+
+ If not provided, the default for the `task` will be loaded.
+ config (`str` or [`PretrainedConfig`], *optional*):
+ The configuration that will be used by the pipeline to instantiate the model. This can be a model
+ identifier or an actual pretrained model configuration inheriting from [`PretrainedConfig`].
+
+ If not provided, the default configuration file for the requested model will be used. That means that if
+ `model` is given, its default configuration will be used. However, if `model` is not supplied, this
+ `task`'s default model's config is used instead.
+ tokenizer (`str` or [`PreTrainedTokenizer`], *optional*):
+ The tokenizer that will be used by the pipeline to encode data for the model. This can be a model
+ identifier or an actual pretrained tokenizer inheriting from [`PreTrainedTokenizer`].
+
+ If not provided, the default tokenizer for the given `model` will be loaded (if it is a string). If `model`
+ is not specified or not a string, then the default tokenizer for `config` is loaded (if it is a string).
+ However, if `config` is also not given or not a string, then the default tokenizer for the given `task`
+ will be loaded.
+ feature_extractor (`str` or [`PreTrainedFeatureExtractor`], *optional*):
+ The feature extractor that will be used by the pipeline to encode data for the model. This can be a model
+ identifier or an actual pretrained feature extractor inheriting from [`PreTrainedFeatureExtractor`].
+
+ Feature extractors are used for non-NLP models, such as Speech or Vision models as well as multi-modal
+ models. Multi-modal models will also require a tokenizer to be passed.
+
+ If not provided, the default feature extractor for the given `model` will be loaded (if it is a string). If
+ `model` is not specified or not a string, then the default feature extractor for `config` is loaded (if it
+ is a string). However, if `config` is also not given or not a string, then the default feature extractor
+ for the given `task` will be loaded.
+ image_processor (`str` or [`BaseImageProcessor`], *optional*):
+ The image processor that will be used by the pipeline to preprocess images for the model. This can be a
+ model identifier or an actual image processor inheriting from [`BaseImageProcessor`].
+
+ Image processors are used for Vision models and multi-modal models that require image inputs. Multi-modal
+ models will also require a tokenizer to be passed.
+
+ If not provided, the default image processor for the given `model` will be loaded (if it is a string). If
+ `model` is not specified or not a string, then the default image processor for `config` is loaded (if it is
+ a string).
+ processor (`str` or [`ProcessorMixin`], *optional*):
+ The processor that will be used by the pipeline to preprocess data for the model. This can be a model
+ identifier or an actual processor inheriting from [`ProcessorMixin`].
+
+ Processors are used for multi-modal models that require multi-modal inputs, for example, a model that
+ requires both text and image inputs.
+
+ If not provided, the default processor for the given `model` will be loaded (if it is a string). If `model`
+ is not specified or not a string, then the default processor for `config` is loaded (if it is a string).
+ framework (`str`, *optional*):
+ The framework to use, either `"pt"` for PyTorch or `"tf"` for TensorFlow. The specified framework must be
+ installed.
+
+ If no framework is specified, will default to the one currently installed. If no framework is specified and
+ both frameworks are installed, will default to the framework of the `model`, or to PyTorch if no model is
+ provided.
+ revision (`str`, *optional*, defaults to `"main"`):
+ When passing a task name or a string model identifier: The specific model version to use. It can be a
+ branch name, a tag name, or a commit id, since we use a git-based system for storing models and other
+ artifacts on huggingface.co, so `revision` can be any identifier allowed by git.
+ use_fast (`bool`, *optional*, defaults to `True`):
+ Whether or not to use a Fast tokenizer if possible (a [`PreTrainedTokenizerFast`]).
+        token (`str` or `bool`, *optional*):
+            The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
+            when running `huggingface-cli login` (stored in `~/.huggingface`).
+        device (`int` or `str` or `torch.device`, *optional*):
+ Defines the device (*e.g.*, `"cpu"`, `"cuda:1"`, `"mps"`, or a GPU ordinal rank like `1`) on which this
+ pipeline will be allocated.
+        device_map (`str` or `Dict[str, Union[int, str, torch.device]]`, *optional*):
+            Sent directly as `model_kwargs` (just a simpler shortcut). When the `accelerate` library is present, set
+            `device_map="auto"` to compute the most optimized `device_map` automatically (see
+            [here](https://huggingface.co/docs/accelerate/main/en/package_reference/big_modeling#accelerate.cpu_offload)
+            for more information).
+
+
+
+        Do not use `device_map` AND `device` at the same time as they will conflict.
+
+
+
+ torch_dtype (`str` or `torch.dtype`, *optional*):
+ Sent directly as `model_kwargs` (just a simpler shortcut) to use the available precision for this model
+ (`torch.float16`, `torch.bfloat16`, ... or `"auto"`).
+ trust_remote_code (`bool`, *optional*, defaults to `False`):
+ Whether or not to allow for custom code defined on the Hub in their own modeling, configuration,
+ tokenization or even pipeline files. This option should only be set to `True` for repositories you trust
+ and in which you have read the code, as it will execute code present on the Hub on your local machine.
+ model_kwargs (`Dict[str, Any]`, *optional*):
+ Additional dictionary of keyword arguments passed along to the model's `from_pretrained(...,
+ **model_kwargs)` function.
+ kwargs (`Dict[str, Any]`, *optional*):
+ Additional keyword arguments passed along to the specific pipeline init (see the documentation for the
+ corresponding pipeline class for possible values).
+
+ Returns:
+ [`Pipeline`]: A suitable pipeline for the task.
+
+ Examples:
+
+ ```python
+ >>> from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer
+
+ >>> # Sentiment analysis pipeline
+ >>> analyzer = pipeline("sentiment-analysis")
+
+ >>> # Question answering pipeline, specifying the checkpoint identifier
+ >>> oracle = pipeline(
+ ... "question-answering", model="distilbert/distilbert-base-cased-distilled-squad", tokenizer="google-bert/bert-base-cased"
+ ... )
+
+ >>> # Named entity recognition pipeline, passing in a specific model and tokenizer
+ >>> model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
+ >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
+ >>> recognizer = pipeline("ner", model=model, tokenizer=tokenizer)
+ ```"""
+ if model_kwargs is None:
+ model_kwargs = {}
+ # Make sure we only pass use_auth_token once as a kwarg (it used to be possible to pass it in model_kwargs,
+ # this is to keep BC).
+ use_auth_token = model_kwargs.pop("use_auth_token", None)
+ if use_auth_token is not None:
+ warnings.warn(
+ "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
+ FutureWarning,
+ )
+ if token is not None:
+ raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.")
+ token = use_auth_token
+
+ code_revision = kwargs.pop("code_revision", None)
+ commit_hash = kwargs.pop("_commit_hash", None)
+
+ hub_kwargs = {
+ "revision": revision,
+ "token": token,
+ "trust_remote_code": trust_remote_code,
+ "_commit_hash": commit_hash,
+ }
+
+ if task is None and model is None:
+ raise RuntimeError(
+ "Impossible to instantiate a pipeline without either a task or a model "
+ "being specified. "
+ "Please provide a task class or a model"
+ )
+
+ if model is None and tokenizer is not None:
+ raise RuntimeError(
+ "Impossible to instantiate a pipeline with tokenizer specified but not the model as the provided tokenizer"
+ " may not be compatible with the default model. Please provide a PreTrainedModel class or a"
+ " path/identifier to a pretrained model when providing tokenizer."
+ )
+ if model is None and feature_extractor is not None:
+ raise RuntimeError(
+ "Impossible to instantiate a pipeline with feature_extractor specified but not the model as the provided"
+ " feature_extractor may not be compatible with the default model. Please provide a PreTrainedModel class"
+ " or a path/identifier to a pretrained model when providing feature_extractor."
+ )
+ if isinstance(model, Path):
+ model = str(model)
+
+ if commit_hash is None:
+ pretrained_model_name_or_path = None
+ if isinstance(config, str):
+ pretrained_model_name_or_path = config
+ elif config is None and isinstance(model, str):
+ pretrained_model_name_or_path = model
+
+ if not isinstance(config, PretrainedConfig) and pretrained_model_name_or_path is not None:
+ # We make a call to the config file first (which may be absent) to get the commit hash as soon as possible
+ resolved_config_file = cached_file(
+ pretrained_model_name_or_path,
+ CONFIG_NAME,
+ _raise_exceptions_for_gated_repo=False,
+ _raise_exceptions_for_missing_entries=False,
+ _raise_exceptions_for_connection_errors=False,
+ cache_dir=model_kwargs.get("cache_dir"),
+ **hub_kwargs,
+ )
+ hub_kwargs["_commit_hash"] = extract_commit_hash(resolved_config_file, commit_hash)
+ else:
+ hub_kwargs["_commit_hash"] = getattr(config, "_commit_hash", None)
+
+ # Config is the primordial information item.
+ # Instantiate config if needed
+ if isinstance(config, str):
+ config = AutoConfig.from_pretrained(
+ config, _from_pipeline=task, code_revision=code_revision, **hub_kwargs, **model_kwargs
+ )
+ hub_kwargs["_commit_hash"] = config._commit_hash
+ elif config is None and isinstance(model, str):
+ # Check for an adapter file in the model path if PEFT is available
+ if is_peft_available():
+ # `find_adapter_config_file` doesn't accept `trust_remote_code`
+ _hub_kwargs = {k: v for k, v in hub_kwargs.items() if k != "trust_remote_code"}
+ maybe_adapter_path = find_adapter_config_file(
+ model,
+ token=hub_kwargs["token"],
+ revision=hub_kwargs["revision"],
+ _commit_hash=hub_kwargs["_commit_hash"],
+ )
+
+ if maybe_adapter_path is not None:
+ with open(maybe_adapter_path, "r", encoding="utf-8") as f:
+ adapter_config = json.load(f)
+ model = adapter_config["base_model_name_or_path"]
+
+ config = AutoConfig.from_pretrained(
+ model, _from_pipeline=task, code_revision=code_revision, **hub_kwargs, **model_kwargs
+ )
+ hub_kwargs["_commit_hash"] = config._commit_hash
+
+ custom_tasks = {}
+ if config is not None and len(getattr(config, "custom_pipelines", {})) > 0:
+ custom_tasks = config.custom_pipelines
+ if task is None and trust_remote_code is not False:
+ if len(custom_tasks) == 1:
+ task = list(custom_tasks.keys())[0]
+ else:
+ raise RuntimeError(
+ "We can't infer the task automatically for this model as there are multiple tasks available. Pick "
+ f"one in {', '.join(custom_tasks.keys())}"
+ )
+
+ if task is None and model is not None:
+ if not isinstance(model, str):
+ raise RuntimeError(
+                "Inferring the task automatically requires checking the hub with a model_id defined as a `str`. "
+                f"{model} is not a valid model_id."
+ )
+ task = get_task(model, token)
+
+ # Retrieve the task
+ if task in custom_tasks:
+ normalized_task = task
+ targeted_task, task_options = clean_custom_task(custom_tasks[task])
+ if pipeline_class is None:
+ if not trust_remote_code:
+ raise ValueError(
+ "Loading this pipeline requires you to execute the code in the pipeline file in that"
+ " repo on your local machine. Make sure you have read the code there to avoid malicious use, then"
+ " set the option `trust_remote_code=True` to remove this error."
+ )
+ class_ref = targeted_task["impl"]
+ pipeline_class = get_class_from_dynamic_module(
+ class_ref,
+ model,
+ code_revision=code_revision,
+ **hub_kwargs,
+ )
+ else:
+ normalized_task, targeted_task, task_options = check_task(task)
+ if pipeline_class is None:
+ pipeline_class = targeted_task["impl"]
+
+ # Use default model/config/tokenizer for the task if no model is provided
+ if model is None:
+ # At that point framework might still be undetermined
+ model, default_revision = get_default_model_and_revision(targeted_task, framework, task_options)
+ revision = revision if revision is not None else default_revision
+ logger.warning(
+ f"No model was supplied, defaulted to {model} and revision"
+ f" {revision} ({HUGGINGFACE_CO_RESOLVE_ENDPOINT}/{model}).\n"
+ "Using a pipeline without specifying a model name and revision in production is not recommended."
+ )
+ hub_kwargs["revision"] = revision
+ if config is None and isinstance(model, str):
+ config = AutoConfig.from_pretrained(model, _from_pipeline=task, **hub_kwargs, **model_kwargs)
+ hub_kwargs["_commit_hash"] = config._commit_hash
+
+ if device_map is not None:
+ if "device_map" in model_kwargs:
+ raise ValueError(
+                'You cannot use both `pipeline(..., device_map=...)` and `model_kwargs={"device_map": ...}` as those'
+                " arguments might conflict, use only one."
+ )
+ if device is not None:
+ logger.warning(
+ "Both `device` and `device_map` are specified. `device` will override `device_map`. You"
+ " will most likely encounter unexpected behavior. Please remove `device` and keep `device_map`."
+ )
+ model_kwargs["device_map"] = device_map
+ if torch_dtype is not None:
+ if "torch_dtype" in model_kwargs:
+ raise ValueError(
+                'You cannot use both `pipeline(..., torch_dtype=...)` and `model_kwargs={"torch_dtype": ...}` as those'
+                " arguments might conflict, use only one."
+ )
+ if isinstance(torch_dtype, str) and hasattr(torch, torch_dtype):
+ torch_dtype = getattr(torch, torch_dtype)
+ model_kwargs["torch_dtype"] = torch_dtype
+
+ model_name = model if isinstance(model, str) else None
+
+ # Load the correct model if possible
+ # Infer the framework from the model if not already defined
+ if isinstance(model, str) or framework is None:
+ model_classes = {"tf": targeted_task["tf"], "pt": targeted_task["pt"]}
+ framework, model = infer_framework_load_model(
+ model,
+ model_classes=model_classes,
+ config=config,
+ framework=framework,
+ task=task,
+ **hub_kwargs,
+ **model_kwargs,
+ )
+
+ model_config = model.config
+ hub_kwargs["_commit_hash"] = model.config._commit_hash
+
+ load_tokenizer = type(model_config) in TOKENIZER_MAPPING or model_config.tokenizer_class is not None
+ load_feature_extractor = type(model_config) in FEATURE_EXTRACTOR_MAPPING or feature_extractor is not None
+ load_image_processor = type(model_config) in IMAGE_PROCESSOR_MAPPING or image_processor is not None
+ load_processor = type(model_config) in PROCESSOR_MAPPING or processor is not None
+
+    # Check whether the pipeline class requires these components to be loaded
+ load_tokenizer = load_tokenizer and pipeline_class._load_tokenizer
+ load_feature_extractor = load_feature_extractor and pipeline_class._load_feature_extractor
+ load_image_processor = load_image_processor and pipeline_class._load_image_processor
+ load_processor = load_processor and pipeline_class._load_processor
+
+    # If `model` (an instance of `PreTrainedModel` instead of `str`) is passed (and/or the same for config), while
+    # `image_processor` or `feature_extractor` is `None`, the loading will fail. This happens particularly for some
+    # vision tasks when calling `pipeline()` with `model` and only one of `image_processor` and `feature_extractor`.
+    # TODO: we need to make `NO_IMAGE_PROCESSOR_TASKS` and `NO_FEATURE_EXTRACTOR_TASKS` more robust to avoid such issues.
+    # This block is only temporary, to make the CI green.
+ if load_image_processor and load_feature_extractor:
+ load_feature_extractor = False
+
+ if (
+ tokenizer is None
+ and not load_tokenizer
+ and normalized_task not in NO_TOKENIZER_TASKS
+ # Using class name to avoid importing the real class.
+ and (
+ model_config.__class__.__name__ in MULTI_MODEL_AUDIO_CONFIGS
+ or model_config.__class__.__name__ in MULTI_MODEL_VISION_CONFIGS
+ )
+ ):
+        # This is a special category of models that are fusions of multiple models,
+        # so the model_config might not define a tokenizer, but it seems to be
+        # necessary for the task, so we're force-trying to load it.
+ load_tokenizer = True
+ if (
+ image_processor is None
+ and not load_image_processor
+ and normalized_task not in NO_IMAGE_PROCESSOR_TASKS
+ # Using class name to avoid importing the real class.
+ and model_config.__class__.__name__ in MULTI_MODEL_VISION_CONFIGS
+ ):
+        # This is a special category of models that are fusions of multiple models,
+        # so the model_config might not define an image processor, but it seems to be
+        # necessary for the task, so we're force-trying to load it.
+ load_image_processor = True
+ if (
+ feature_extractor is None
+ and not load_feature_extractor
+ and normalized_task not in NO_FEATURE_EXTRACTOR_TASKS
+ # Using class name to avoid importing the real class.
+ and model_config.__class__.__name__ in MULTI_MODEL_AUDIO_CONFIGS
+ ):
+        # This is a special category of models that are fusions of multiple models,
+        # so the model_config might not define a feature extractor, but it seems to be
+        # necessary for the task, so we're force-trying to load it.
+ load_feature_extractor = True
+
+ if task in NO_TOKENIZER_TASKS:
+        # These tasks will never require a tokenizer.
+        # The model, on the other hand, might have a tokenizer, but
+        # the files could be missing from the hub; instead of failing
+        # on such repos, we simply force it not to be loaded.
+ load_tokenizer = False
+
+ if task in NO_FEATURE_EXTRACTOR_TASKS:
+ load_feature_extractor = False
+ if task in NO_IMAGE_PROCESSOR_TASKS:
+ load_image_processor = False
+
+ if load_tokenizer:
+ # Try to infer tokenizer from model or config name (if provided as str)
+ if tokenizer is None:
+ if isinstance(model_name, str):
+ tokenizer = model_name
+ elif isinstance(config, str):
+ tokenizer = config
+ else:
+ # Impossible to guess what is the right tokenizer here
+ raise Exception(
+ "Impossible to guess which tokenizer to use. "
+ "Please provide a PreTrainedTokenizer class or a path/identifier to a pretrained tokenizer."
+ )
+
+ # Instantiate tokenizer if needed
+ if isinstance(tokenizer, (str, tuple)):
+ if isinstance(tokenizer, tuple):
+ # For tuple we have (tokenizer name, {kwargs})
+ use_fast = tokenizer[1].pop("use_fast", use_fast)
+ tokenizer_identifier = tokenizer[0]
+ tokenizer_kwargs = tokenizer[1]
+ else:
+ tokenizer_identifier = tokenizer
+ tokenizer_kwargs = model_kwargs.copy()
+ tokenizer_kwargs.pop("torch_dtype", None)
+
+ tokenizer = AutoTokenizer.from_pretrained(
+ tokenizer_identifier, use_fast=use_fast, _from_pipeline=task, **hub_kwargs, **tokenizer_kwargs
+ )
+
+ if load_image_processor:
+ # Try to infer image processor from model or config name (if provided as str)
+ if image_processor is None:
+ if isinstance(model_name, str):
+ image_processor = model_name
+ elif isinstance(config, str):
+ image_processor = config
+ # Backward compatibility, as `feature_extractor` used to be the name
+ # for `ImageProcessor`.
+ elif feature_extractor is not None and isinstance(feature_extractor, BaseImageProcessor):
+ image_processor = feature_extractor
+ else:
+ # Impossible to guess what is the right image_processor here
+ raise Exception(
+ "Impossible to guess which image processor to use. "
+ "Please provide a PreTrainedImageProcessor class or a path/identifier "
+ "to a pretrained image processor."
+ )
+
+ # Instantiate image_processor if needed
+ if isinstance(image_processor, (str, tuple)):
+ image_processor = AutoImageProcessor.from_pretrained(
+ image_processor, _from_pipeline=task, **hub_kwargs, **model_kwargs
+ )
+
+ if load_feature_extractor:
+ # Try to infer feature extractor from model or config name (if provided as str)
+ if feature_extractor is None:
+ if isinstance(model_name, str):
+ feature_extractor = model_name
+ elif isinstance(config, str):
+ feature_extractor = config
+ else:
+ # Impossible to guess what is the right feature_extractor here
+ raise Exception(
+ "Impossible to guess which feature extractor to use. "
+ "Please provide a PreTrainedFeatureExtractor class or a path/identifier "
+ "to a pretrained feature extractor."
+ )
+
+ # Instantiate feature_extractor if needed
+ if isinstance(feature_extractor, (str, tuple)):
+ feature_extractor = AutoFeatureExtractor.from_pretrained(
+ feature_extractor, _from_pipeline=task, **hub_kwargs, **model_kwargs
+ )
+
+ if (
+ feature_extractor._processor_class
+ and feature_extractor._processor_class.endswith("WithLM")
+ and isinstance(model_name, str)
+ ):
+ try:
+ import kenlm # to trigger `ImportError` if not installed
+ from pyctcdecode import BeamSearchDecoderCTC
+
+ if os.path.isdir(model_name) or os.path.isfile(model_name):
+ decoder = BeamSearchDecoderCTC.load_from_dir(model_name)
+ else:
+ language_model_glob = os.path.join(
+ BeamSearchDecoderCTC._LANGUAGE_MODEL_SERIALIZED_DIRECTORY, "*"
+ )
+ alphabet_filename = BeamSearchDecoderCTC._ALPHABET_SERIALIZED_FILENAME
+ allow_patterns = [language_model_glob, alphabet_filename]
+ decoder = BeamSearchDecoderCTC.load_from_hf_hub(model_name, allow_patterns=allow_patterns)
+
+ kwargs["decoder"] = decoder
+ except ImportError as e:
+ logger.warning(f"Could not load the `decoder` for {model_name}. Defaulting to raw CTC. Error: {e}")
+ if not is_kenlm_available():
+                logger.warning("Try to install `kenlm`: `pip install kenlm`")
+
+ if not is_pyctcdecode_available():
+                logger.warning("Try to install `pyctcdecode`: `pip install pyctcdecode`")
+
+ if load_processor:
+ # Try to infer processor from model or config name (if provided as str)
+ if processor is None:
+ if isinstance(model_name, str):
+ processor = model_name
+ elif isinstance(config, str):
+ processor = config
+ else:
+ # Impossible to guess what is the right processor here
+ raise Exception(
+ "Impossible to guess which processor to use. "
+ "Please provide a processor instance or a path/identifier "
+ "to a processor."
+ )
+
+ # Instantiate processor if needed
+ if isinstance(processor, (str, tuple)):
+ processor = AutoProcessor.from_pretrained(processor, _from_pipeline=task, **hub_kwargs, **model_kwargs)
+ if not isinstance(processor, ProcessorMixin):
+ raise TypeError(
+ "Processor was loaded, but it is not an instance of `ProcessorMixin`. "
+ f"Got type `{type(processor)}` instead. Please check that you specified "
+                "the correct pipeline task for the model and that the model has a processor implemented and saved."
+ )
+
+ if task == "translation" and model.config.task_specific_params:
+ for key in model.config.task_specific_params:
+ if key.startswith("translation"):
+ task = key
+ warnings.warn(
+ f'"translation" task was used, instead of "translation_XX_to_YY", defaulting to "{task}"',
+ UserWarning,
+ )
+ break
+
+ if tokenizer is not None:
+ kwargs["tokenizer"] = tokenizer
+
+ if feature_extractor is not None:
+ kwargs["feature_extractor"] = feature_extractor
+
+ if torch_dtype is not None:
+ kwargs["torch_dtype"] = torch_dtype
+
+ if image_processor is not None:
+ kwargs["image_processor"] = image_processor
+
+ if device is not None:
+ kwargs["device"] = device
+
+ if processor is not None:
+ kwargs["processor"] = processor
+
+ return pipeline_class(model=model, framework=framework, task=task, **kwargs)
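+
+
+# Illustrative sketch (not part of the library): typical calls to the factory above. The model id is the
+# text-generation default registered in SUPPORTED_TASKS; `device_map="auto"` requires `accelerate` and,
+# as documented above, should not be combined with `device`.
+#
+#     >>> from transformers import pipeline
+#     >>> generator = pipeline("text-generation", model="openai-community/gpt2", torch_dtype="auto")
+#     >>> classifier = pipeline("text-classification", device_map="auto")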
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3c13de8d2f8416c40cc05909e6b4dc3a15c66706
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/audio_classification.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/audio_classification.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..78a6def32f2b9c4e5c4f398d1c4caa2317d9c6d0
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/audio_classification.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/audio_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/audio_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4ecc0f8845fbe145dfeb09070c6be150cc99b895
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/audio_utils.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/automatic_speech_recognition.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/automatic_speech_recognition.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..be7914fb35489ccfd95dee43f879e2436d43095c
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/automatic_speech_recognition.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/base.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/base.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1c254c1421872f69e5ea59af1fb57b4f90999947
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/base.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/depth_estimation.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/depth_estimation.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a952dd38e0ea7d65e8d39e69191a2ca651bd605c
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/depth_estimation.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/document_question_answering.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/document_question_answering.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4e8da27a53928f48b178da6848f65430ebf562cc
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/document_question_answering.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/feature_extraction.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/feature_extraction.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3c8cbfc27f17c71f2992710d1d01fa38a4103e4b
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/feature_extraction.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/fill_mask.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/fill_mask.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d48fdb57cc820d93b00c44aa67d069f79eaef3ce
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/fill_mask.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/image_classification.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/image_classification.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7ca87019d3bea003988f0d743a88fba6c2eb9432
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/image_classification.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/image_feature_extraction.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/image_feature_extraction.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1f26ad155baf5fad6d5ec5b16ce5cf31a37b4f97
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/image_feature_extraction.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/image_segmentation.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/image_segmentation.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a1597f2d4807c8ea0c7e2890c0a807bcf6394d0c
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/image_segmentation.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/image_text_to_text.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/image_text_to_text.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..55138504bdf575226a181dc4401e207ed1af9d83
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/image_text_to_text.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/image_to_image.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/image_to_image.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7117056ab6f1eed8e2ac85b608865bf1d8d1f381
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/image_to_image.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/image_to_text.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/image_to_text.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..282a5eab4a5afa2f511bf832c0f5c9f977f9a19e
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/image_to_text.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/mask_generation.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/mask_generation.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d81847e9085c6a700c2ed99df5a5ec3b485ec0fc
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/mask_generation.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/object_detection.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/object_detection.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..20d7177e8f35bc38403669156b96ff28c56cea8e
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/object_detection.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/pt_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/pt_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..eaa157f04981120aa846f4bff0a97ef0271359b0
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/pt_utils.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/question_answering.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/question_answering.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dadc7dbcbc6d2a1b1143894da48036f00ab71f41
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/question_answering.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/table_question_answering.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/table_question_answering.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6bbb4de92436af95784a53a396dfe1df5c89ed9c
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/table_question_answering.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/text2text_generation.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/text2text_generation.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8b0677e2f2c048d094bf1e3b1888005074e3b9b4
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/text2text_generation.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/text_classification.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/text_classification.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9226af6305246af4ac726bf575396c4ae5d0cbb3
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/text_classification.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/text_generation.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/text_generation.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c55bc926b9fc6b43b404c05ced61caea13ceb21e
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/text_generation.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/text_to_audio.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/text_to_audio.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6d005050223b6a943128b3a79ff044246a046f50
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/text_to_audio.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/token_classification.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/token_classification.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b35ba5e8975996d511e41dc35e813edd4d212992
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/token_classification.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/video_classification.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/video_classification.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f30a7f3b4ab8da22ab900cc036524ac69ddc7689
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/video_classification.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/visual_question_answering.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/visual_question_answering.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e2c83eff32e78865d7f9430829b8a7ae10416e3a
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/visual_question_answering.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/zero_shot_audio_classification.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/zero_shot_audio_classification.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9909e35082363c85c942b355399deb0f0e89e4f1
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/zero_shot_audio_classification.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/zero_shot_classification.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/zero_shot_classification.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8b2661528b79429f9ffd1421f3ba8135ab2c29bc
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/zero_shot_classification.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/zero_shot_image_classification.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/zero_shot_image_classification.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4db3ff00805b9daa790f19fdd193510846fdf908
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/zero_shot_image_classification.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/zero_shot_object_detection.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/zero_shot_object_detection.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bbe4539f856a5eb2d287d9a5556e54e280324f62
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/zero_shot_object_detection.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/audio_classification.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/audio_classification.py
new file mode 100644
index 0000000000000000000000000000000000000000..f6acbb3096e07d5b467ef4f2dccbda3ce1cd8e51
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/audio_classification.py
@@ -0,0 +1,234 @@
+# Copyright 2021 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import subprocess
+from typing import Union
+
+import numpy as np
+import requests
+
+from ..utils import add_end_docstrings, is_torch_available, is_torchaudio_available, logging
+from .base import Pipeline, build_pipeline_init_args
+
+
+if is_torch_available():
+ from ..models.auto.modeling_auto import MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES
+
+logger = logging.get_logger(__name__)
+
+
+def ffmpeg_read(bpayload: bytes, sampling_rate: int) -> np.ndarray:
+ """
+ Helper function to read an audio file through ffmpeg.
+ """
+ ar = f"{sampling_rate}"
+ ac = "1"
+ format_for_conversion = "f32le"
+ ffmpeg_command = [
+ "ffmpeg",
+ "-i",
+ "pipe:0",
+ "-ac",
+ ac,
+ "-ar",
+ ar,
+ "-f",
+ format_for_conversion,
+ "-hide_banner",
+ "-loglevel",
+ "quiet",
+ "pipe:1",
+ ]
+
+ try:
+ ffmpeg_process = subprocess.Popen(ffmpeg_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
+ except FileNotFoundError:
+ raise ValueError("ffmpeg was not found but is required to load audio files from filename")
+ output_stream = ffmpeg_process.communicate(bpayload)
+ out_bytes = output_stream[0]
+
+ audio = np.frombuffer(out_bytes, np.float32)
+ if audio.shape[0] == 0:
+ raise ValueError("Malformed soundfile")
+ return audio
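+
+# Usage sketch for `ffmpeg_read`, assuming ffmpeg is installed and a local
+# audio file named "sample.flac" exists (the filename is only an example):
+#
+#     with open("sample.flac", "rb") as f:
+#         waveform = ffmpeg_read(f.read(), sampling_rate=16_000)
+#     # `waveform` is a mono float32 numpy array resampled to 16 kHz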
+
+
+@add_end_docstrings(build_pipeline_init_args(has_feature_extractor=True))
+class AudioClassificationPipeline(Pipeline):
+ """
+ Audio classification pipeline using any `AutoModelForAudioClassification`. This pipeline predicts the class of a
+ raw waveform or an audio file. In case of an audio file, ffmpeg should be installed to support multiple audio
+ formats.
+
+ Example:
+
+ ```python
+ >>> from transformers import pipeline
+
+ >>> classifier = pipeline(model="superb/wav2vec2-base-superb-ks")
+ >>> classifier("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/1.flac")
+ [{'score': 0.997, 'label': '_unknown_'}, {'score': 0.002, 'label': 'left'}, {'score': 0.0, 'label': 'yes'}, {'score': 0.0, 'label': 'down'}, {'score': 0.0, 'label': 'stop'}]
+ ```
+
+ Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)
+
+
+ This pipeline can currently be loaded from [`pipeline`] using the following task identifier:
+ `"audio-classification"`.
+
+ See the list of available models on
+ [huggingface.co/models](https://huggingface.co/models?filter=audio-classification).
+ """
+
+ def __init__(self, *args, **kwargs):
+        # Default, might be overridden by the model.config.
+ kwargs["top_k"] = kwargs.get("top_k", 5)
+ super().__init__(*args, **kwargs)
+
+ if self.framework != "pt":
+ raise ValueError(f"The {self.__class__} is only available in PyTorch.")
+
+ self.check_model_type(MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES)
+
+ def __call__(
+ self,
+ inputs: Union[np.ndarray, bytes, str],
+ **kwargs,
+ ):
+ """
+        Classify the audio sequence(s) given as inputs. See the [`AudioClassificationPipeline`] documentation for more
+        information.
+
+ Args:
+ inputs (`np.ndarray` or `bytes` or `str` or `dict`):
+                The inputs are either:
+ - `str` that is the filename of the audio file, the file will be read at the correct sampling rate
+ to get the waveform using *ffmpeg*. This requires *ffmpeg* to be installed on the system.
+ - `bytes` it is supposed to be the content of an audio file and is interpreted by *ffmpeg* in the
+ same way.
+ - (`np.ndarray` of shape (n, ) of type `np.float32` or `np.float64`)
+ Raw audio at the correct sampling rate (no further check will be done)
+ - `dict` form can be used to pass raw audio sampled at arbitrary `sampling_rate` and let this
+                  pipeline do the resampling. The dict must either be in the format `{"sampling_rate": int,
+ "raw": np.array}`, or `{"sampling_rate": int, "array": np.array}`, where the key `"raw"` or
+ `"array"` is used to denote the raw audio waveform.
+ top_k (`int`, *optional*, defaults to None):
+ The number of top labels that will be returned by the pipeline. If the provided number is `None` or
+ higher than the number of labels available in the model configuration, it will default to the number of
+ labels.
+            function_to_apply (`str`, *optional*, defaults to "softmax"):
+ The function to apply to the model output. By default, the pipeline will apply the softmax function to
+ the output of the model. Valid options: ["softmax", "sigmoid", "none"]. Note that passing Python's
+ built-in `None` will default to "softmax", so you need to pass the string "none" to disable any
+ post-processing.
+
+ Return:
+ A list of `dict` with the following keys:
+
+ - **label** (`str`) -- The label predicted.
+ - **score** (`float`) -- The corresponding probability.
+ """
+ return super().__call__(inputs, **kwargs)
+
+ def _sanitize_parameters(self, top_k=None, function_to_apply=None, **kwargs):
+        # Only postprocessing parameters can be configured on this pipeline
+ postprocess_params = {}
+ if top_k is not None:
+ if top_k > self.model.config.num_labels:
+ top_k = self.model.config.num_labels
+ postprocess_params["top_k"] = top_k
+ if function_to_apply is not None:
+ if function_to_apply not in ["softmax", "sigmoid", "none"]:
+ raise ValueError(
+ f"Invalid value for `function_to_apply`: {function_to_apply}. "
+ "Valid options are ['softmax', 'sigmoid', 'none']"
+ )
+ postprocess_params["function_to_apply"] = function_to_apply
+ else:
+ postprocess_params["function_to_apply"] = "softmax"
+ return {}, {}, postprocess_params
+
+ def preprocess(self, inputs):
+ if isinstance(inputs, str):
+ if inputs.startswith("http://") or inputs.startswith("https://"):
+ # We need to actually check for a real protocol, otherwise it's impossible to use a local file
+ # like http_huggingface_co.png
+ inputs = requests.get(inputs).content
+ else:
+ with open(inputs, "rb") as f:
+ inputs = f.read()
+
+ if isinstance(inputs, bytes):
+ inputs = ffmpeg_read(inputs, self.feature_extractor.sampling_rate)
+
+ if isinstance(inputs, dict):
+ # Accepting `"array"` which is the key defined in `datasets` for
+ # better integration
+ if not ("sampling_rate" in inputs and ("raw" in inputs or "array" in inputs)):
+ raise ValueError(
+ "When passing a dictionary to AudioClassificationPipeline, the dict needs to contain a "
+ '"raw" key containing the numpy array representing the audio and a "sampling_rate" key, '
+ "containing the sampling_rate associated with that array"
+ )
+
+ _inputs = inputs.pop("raw", None)
+ if _inputs is None:
+ # Remove path which will not be used from `datasets`.
+ inputs.pop("path", None)
+ _inputs = inputs.pop("array", None)
+ in_sampling_rate = inputs.pop("sampling_rate")
+ inputs = _inputs
+ if in_sampling_rate != self.feature_extractor.sampling_rate:
+ import torch
+
+ if is_torchaudio_available():
+ from torchaudio import functional as F
+ else:
+ raise ImportError(
+ "torchaudio is required to resample audio samples in AudioClassificationPipeline. "
+ "The torchaudio package can be installed through: `pip install torchaudio`."
+ )
+
+ inputs = F.resample(
+ torch.from_numpy(inputs), in_sampling_rate, self.feature_extractor.sampling_rate
+ ).numpy()
+
+ if not isinstance(inputs, np.ndarray):
+ raise TypeError("We expect a numpy ndarray as input")
+ if len(inputs.shape) != 1:
+ raise ValueError("We expect a single channel audio input for AudioClassificationPipeline")
+
+ processed = self.feature_extractor(
+ inputs, sampling_rate=self.feature_extractor.sampling_rate, return_tensors="pt"
+ )
+ return processed
+
+ def _forward(self, model_inputs):
+ model_outputs = self.model(**model_inputs)
+ return model_outputs
+
+ def postprocess(self, model_outputs, top_k=5, function_to_apply="softmax"):
+ if function_to_apply == "softmax":
+ probs = model_outputs.logits[0].softmax(-1)
+ elif function_to_apply == "sigmoid":
+ probs = model_outputs.logits[0].sigmoid()
+ else:
+ probs = model_outputs.logits[0]
+ scores, ids = probs.topk(top_k)
+
+ scores = scores.tolist()
+ ids = ids.tolist()
+
+ labels = [{"score": score, "label": self.model.config.id2label[_id]} for score, _id in zip(scores, ids)]
+
+ return labels
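+
+# Usage sketch for the whole pipeline, reusing the "superb/wav2vec2-base-superb-ks"
+# checkpoint from the docstring example above and assuming "sample.flac" exists:
+#
+#     from transformers import pipeline
+#
+#     classifier = pipeline("audio-classification", model="superb/wav2vec2-base-superb-ks")
+#     # `top_k` and `function_to_apply` are routed through `_sanitize_parameters`
+#     preds = classifier("sample.flac", top_k=3, function_to_apply="sigmoid")
+#     # -> a list of at most 3 dicts, each with "score" and "label" keys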
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/audio_utils.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/audio_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..72a5f51db6129ae46939a5f2d640d286f479749f
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/audio_utils.py
@@ -0,0 +1,297 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+import datetime
+import platform
+import subprocess
+from typing import Optional, Tuple, Union
+
+import numpy as np
+
+
+def ffmpeg_read(bpayload: bytes, sampling_rate: int) -> np.ndarray:
+ """
+ Helper function to read an audio file through ffmpeg.
+ """
+ ar = f"{sampling_rate}"
+ ac = "1"
+ format_for_conversion = "f32le"
+ ffmpeg_command = [
+ "ffmpeg",
+ "-i",
+ "pipe:0",
+ "-ac",
+ ac,
+ "-ar",
+ ar,
+ "-f",
+ format_for_conversion,
+ "-hide_banner",
+ "-loglevel",
+ "quiet",
+ "pipe:1",
+ ]
+
+ try:
+ with subprocess.Popen(ffmpeg_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE) as ffmpeg_process:
+ output_stream = ffmpeg_process.communicate(bpayload)
+ except FileNotFoundError as error:
+ raise ValueError("ffmpeg was not found but is required to load audio files from filename") from error
+ out_bytes = output_stream[0]
+ audio = np.frombuffer(out_bytes, np.float32)
+ if audio.shape[0] == 0:
+ raise ValueError(
+ "Soundfile is either not in the correct format or is malformed. Ensure that the soundfile has "
+ "a valid audio file extension (e.g. wav, flac or mp3) and is not corrupted. If reading from a remote "
+ "URL, ensure that the URL is the full address to **download** the audio file."
+ )
+ return audio
+
+
+def ffmpeg_microphone(
+ sampling_rate: int,
+ chunk_length_s: float,
+ format_for_conversion: str = "f32le",
+ ffmpeg_input_device: Optional[str] = None,
+ ffmpeg_additional_args: Optional[list[str]] = None,
+):
+ """
+ Helper function to read audio from a microphone using ffmpeg. The default input device will be used unless another
+ input device is specified using the `ffmpeg_input_device` argument. Uses 'alsa' on Linux, 'avfoundation' on MacOS and
+ 'dshow' on Windows.
+
+ Arguments:
+ sampling_rate (`int`):
+ The sampling_rate to use when reading the data from the microphone. Try using the model's sampling_rate to
+ avoid resampling later.
+ chunk_length_s (`float` or `int`):
+            The length of the maximum chunk of audio to be returned.
+ format_for_conversion (`str`, defaults to `f32le`):
+ The name of the format of the audio samples to be returned by ffmpeg. The standard is `f32le`, `s16le`
+ could also be used.
+ ffmpeg_input_device (`str`, *optional*):
+ The identifier of the input device to be used by ffmpeg (i.e. ffmpeg's '-i' argument). If unset,
+ the default input device will be used. See `https://www.ffmpeg.org/ffmpeg-devices.html#Input-Devices`
+ for how to specify and list input devices.
+ ffmpeg_additional_args (`list[str]`, *optional*):
+ Additional arguments to pass to ffmpeg, can include arguments like -nostdin for running as a background
+ process. For example, to pass -nostdin to the ffmpeg process, pass in ["-nostdin"]. If passing in flags
+            with multiple arguments, use the following convention (e.g. ["flag", "arg1", "arg2"]).
+
+ Returns:
+ A generator yielding audio chunks of `chunk_length_s` seconds as `bytes` objects of length
+ `int(round(sampling_rate * chunk_length_s)) * size_of_sample`.
+ """
+ ar = f"{sampling_rate}"
+ ac = "1"
+ if format_for_conversion == "s16le":
+ size_of_sample = 2
+ elif format_for_conversion == "f32le":
+ size_of_sample = 4
+ else:
+ raise ValueError(f"Unhandled format `{format_for_conversion}`. Please use `s16le` or `f32le`")
+
+ system = platform.system()
+
+ if system == "Linux":
+ format_ = "alsa"
+ input_ = ffmpeg_input_device or "default"
+ elif system == "Darwin":
+ format_ = "avfoundation"
+ input_ = ffmpeg_input_device or ":default"
+ elif system == "Windows":
+ format_ = "dshow"
+ input_ = ffmpeg_input_device or _get_microphone_name()
+
+ ffmpeg_additional_args = [] if ffmpeg_additional_args is None else ffmpeg_additional_args
+
+ ffmpeg_command = [
+ "ffmpeg",
+ "-f",
+ format_,
+ "-i",
+ input_,
+ "-ac",
+ ac,
+ "-ar",
+ ar,
+ "-f",
+ format_for_conversion,
+ "-fflags",
+ "nobuffer",
+ "-hide_banner",
+ "-loglevel",
+ "quiet",
+ "pipe:1",
+ ]
+
+ ffmpeg_command.extend(ffmpeg_additional_args)
+
+ chunk_len = int(round(sampling_rate * chunk_length_s)) * size_of_sample
+ iterator = _ffmpeg_stream(ffmpeg_command, chunk_len)
+ for item in iterator:
+ yield item
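+
+# Usage sketch: pulling one-second raw chunks from the default microphone,
+# assuming ffmpeg and a working capture device are available on this machine:
+#
+#     for chunk in ffmpeg_microphone(sampling_rate=16_000, chunk_length_s=1.0):
+#         # each `chunk` is up to 16_000 * 4 bytes of f32le mono samples
+#         handle_chunk(chunk)  # `handle_chunk` is a placeholder for user code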
+
+
+def ffmpeg_microphone_live(
+ sampling_rate: int,
+ chunk_length_s: float,
+ stream_chunk_s: Optional[int] = None,
+ stride_length_s: Optional[Union[Tuple[float, float], float]] = None,
+ format_for_conversion: str = "f32le",
+ ffmpeg_input_device: Optional[str] = None,
+ ffmpeg_additional_args: Optional[list[str]] = None,
+):
+ """
+ Helper function to read audio from a microphone using ffmpeg. This will output `partial` overlapping chunks starting
+ from `stream_chunk_s` (if it is defined) until `chunk_length_s` is reached. It will make use of striding to avoid
+ errors on the "sides" of the various chunks. The default input device will be used unless another input device is
+ specified using the `ffmpeg_input_device` argument. Uses 'alsa' on Linux, 'avfoundation' on MacOS and 'dshow' on Windows.
+
+ Arguments:
+ sampling_rate (`int`):
+ The sampling_rate to use when reading the data from the microphone. Try using the model's sampling_rate to
+ avoid resampling later.
+ chunk_length_s (`float` or `int`):
+            The length of the maximum chunk of audio to be returned. This includes any striding.
+ stream_chunk_s (`float` or `int`):
+ The length of the minimal temporary audio to be returned.
+ stride_length_s (`float` or `int` or `(float, float)`, *optional*):
+ The length of the striding to be used. Stride is used to provide context to a model on the (left, right) of
+ an audio sample but without using that part to actually make the prediction. Setting this does not change
+ the length of the chunk.
+ format_for_conversion (`str`, *optional*, defaults to `f32le`):
+ The name of the format of the audio samples to be returned by ffmpeg. The standard is `f32le`, `s16le`
+ could also be used.
+ ffmpeg_input_device (`str`, *optional*):
+ The identifier of the input device to be used by ffmpeg (i.e. ffmpeg's '-i' argument). If unset,
+ the default input device will be used. See `https://www.ffmpeg.org/ffmpeg-devices.html#Input-Devices`
+ for how to specify and list input devices.
+ ffmpeg_additional_args (`list[str]`, *optional*):
+ Additional arguments to pass to ffmpeg, can include arguments like -nostdin for running as a background
+ process. For example, to pass -nostdin to the ffmpeg process, pass in ["-nostdin"]. If passing in flags
+            with multiple arguments, use the following convention (e.g. ["flag", "arg1", "arg2"]).
+
+ Return:
+ A generator yielding dictionaries of the following form
+
+    `{"sampling_rate": int, "raw": np.array(), "partial": bool}` with, optionally, a `"stride": (int, int)` key if
+ `stride_length_s` is defined.
+
+    `stride` and `raw` are both expressed in `samples`, and `partial` is a boolean indicating whether the current
+    yield item is a whole chunk or a partial temporary result to be later replaced by another, larger chunk.
+ """
+ if stream_chunk_s is not None:
+ chunk_s = stream_chunk_s
+ else:
+ chunk_s = chunk_length_s
+
+ microphone = ffmpeg_microphone(
+ sampling_rate,
+ chunk_s,
+ format_for_conversion=format_for_conversion,
+ ffmpeg_input_device=ffmpeg_input_device,
+ ffmpeg_additional_args=[] if ffmpeg_additional_args is None else ffmpeg_additional_args,
+ )
+
+ if format_for_conversion == "s16le":
+ dtype = np.int16
+ size_of_sample = 2
+ elif format_for_conversion == "f32le":
+ dtype = np.float32
+ size_of_sample = 4
+ else:
+ raise ValueError(f"Unhandled format `{format_for_conversion}`. Please use `s16le` or `f32le`")
+
+ if stride_length_s is None:
+ stride_length_s = chunk_length_s / 6
+ chunk_len = int(round(sampling_rate * chunk_length_s)) * size_of_sample
+ if isinstance(stride_length_s, (int, float)):
+ stride_length_s = [stride_length_s, stride_length_s]
+
+ stride_left = int(round(sampling_rate * stride_length_s[0])) * size_of_sample
+ stride_right = int(round(sampling_rate * stride_length_s[1])) * size_of_sample
+ audio_time = datetime.datetime.now()
+ delta = datetime.timedelta(seconds=chunk_s)
+ for item in chunk_bytes_iter(microphone, chunk_len, stride=(stride_left, stride_right), stream=True):
+ # Put everything back in numpy scale
+ item["raw"] = np.frombuffer(item["raw"], dtype=dtype)
+ item["stride"] = (
+ item["stride"][0] // size_of_sample,
+ item["stride"][1] // size_of_sample,
+ )
+ item["sampling_rate"] = sampling_rate
+ audio_time += delta
+ if datetime.datetime.now() > audio_time + 10 * delta:
+ # We're late !! SKIP
+ continue
+ yield item
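+
+# Usage sketch: feeding live microphone audio into an ASR pipeline, assuming
+# ffmpeg, a working microphone and the example checkpoint are available:
+#
+#     from transformers import pipeline
+#
+#     transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base")
+#     mic = ffmpeg_microphone_live(
+#         sampling_rate=transcriber.feature_extractor.sampling_rate,
+#         chunk_length_s=5.0,
+#         stream_chunk_s=1.0,
+#     )
+#     for result in transcriber(mic):
+#         print(result["text"])  # partial results are refined as chunks grow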
+
+
+def chunk_bytes_iter(iterator, chunk_len: int, stride: Tuple[int, int], stream: bool = False):
+ """
+ Reads raw bytes from an iterator and does chunks of length `chunk_len`. Optionally adds `stride` to each chunks to
+ get overlaps. `stream` is used to return partial results even if a full `chunk_len` is not yet available.
+ """
+ acc = b""
+ stride_left, stride_right = stride
+ if stride_left + stride_right >= chunk_len:
+ raise ValueError(
+ f"Stride needs to be strictly smaller than chunk_len: ({stride_left}, {stride_right}) vs {chunk_len}"
+ )
+ _stride_left = 0
+ for raw in iterator:
+ acc += raw
+ if stream and len(acc) < chunk_len:
+ stride = (_stride_left, 0)
+ yield {"raw": acc[:chunk_len], "stride": stride, "partial": True}
+ else:
+ while len(acc) >= chunk_len:
+ # We are flushing the accumulator
+ stride = (_stride_left, stride_right)
+ item = {"raw": acc[:chunk_len], "stride": stride}
+ if stream:
+ item["partial"] = False
+ yield item
+ _stride_left = stride_left
+ acc = acc[chunk_len - stride_left - stride_right :]
+ # Last chunk
+ if len(acc) > stride_left:
+ item = {"raw": acc, "stride": (_stride_left, 0)}
+ if stream:
+ item["partial"] = False
+ yield item
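+
+# Illustrative sketch of the chunking logic above, on tiny byte strings chosen
+# only to make the stride bookkeeping easy to follow:
+#
+#     list(chunk_bytes_iter(iter([b"abcdefgh", b"ijkl"]), chunk_len=6, stride=(1, 1)))
+#     # -> [{"raw": b"abcdef", "stride": (0, 1)},
+#     #     {"raw": b"efghij", "stride": (1, 1)},
+#     #     {"raw": b"ijkl", "stride": (1, 0)}]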
+
+
+def _ffmpeg_stream(ffmpeg_command, buflen: int):
+ """
+ Internal function to create the generator of data through ffmpeg
+ """
+    bufsize = 2**24  # 16MB
+ try:
+ with subprocess.Popen(ffmpeg_command, stdout=subprocess.PIPE, bufsize=bufsize) as ffmpeg_process:
+ while True:
+ raw = ffmpeg_process.stdout.read(buflen)
+ if raw == b"":
+ break
+ yield raw
+ except FileNotFoundError as error:
+ raise ValueError("ffmpeg was not found but is required to stream audio files from filename") from error
+
+
+def _get_microphone_name():
+ """
+    Retrieve the microphone name on Windows.
+ """
+ command = ["ffmpeg", "-list_devices", "true", "-f", "dshow", "-i", ""]
+
+ try:
+ ffmpeg_devices = subprocess.run(command, text=True, stderr=subprocess.PIPE, encoding="utf-8")
+ microphone_lines = [line for line in ffmpeg_devices.stderr.splitlines() if "(audio)" in line]
+
+ if microphone_lines:
+ microphone_name = microphone_lines[0].split('"')[1]
+ print(f"Using microphone: {microphone_name}")
+ return f"audio={microphone_name}"
+ except FileNotFoundError:
+ print("ffmpeg was not found. Please install it or make sure it is in your system PATH.")
+
+ return "default"
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/automatic_speech_recognition.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/automatic_speech_recognition.py
new file mode 100644
index 0000000000000000000000000000000000000000..66a9c49ea5f3516053fa9d5835109dcd53e3ff1a
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/automatic_speech_recognition.py
@@ -0,0 +1,766 @@
+# Copyright 2021 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import warnings
+from collections import defaultdict
+from typing import TYPE_CHECKING, Dict, Optional, Union
+
+import numpy as np
+import requests
+
+from ..tokenization_utils import PreTrainedTokenizer
+from ..utils import is_torch_available, is_torchaudio_available, logging
+from .audio_utils import ffmpeg_read
+from .base import ChunkPipeline
+
+
+if TYPE_CHECKING:
+ from pyctcdecode import BeamSearchDecoderCTC
+
+ from ..feature_extraction_sequence_utils import SequenceFeatureExtractor
+ from ..modeling_utils import PreTrainedModel
+
+logger = logging.get_logger(__name__)
+
+if is_torch_available():
+ import torch
+
+ from ..models.auto.modeling_auto import MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES
+
+
+def rescale_stride(stride, ratio):
+ """
+ Rescales the stride values from audio space to tokens/logits space.
+
+ (160_000, 16_000, 16_000) -> (2000, 200, 200) for instance.
+ """
+ # Shape is [B, SEQ] for tokens
+ # [B, SEQ, V] for logits
+
+ new_strides = []
+ for input_n, left, right in stride:
+ token_n = int(round(input_n * ratio))
+ left = int(round(left / input_n * token_n))
+ right = int(round(right / input_n * token_n))
+ new_stride = (token_n, left, right)
+ new_strides.append(new_stride)
+
+ return new_strides
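+
+# Illustrative sketch of the docstring example above: 160_000 audio samples
+# mapping onto 2_000 tokens/logits gives ratio = 2_000 / 160_000:
+#
+#     rescale_stride([(160_000, 16_000, 16_000)], ratio=2_000 / 160_000)
+#     # -> [(2000, 200, 200)]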
+
+
+def chunk_iter(inputs, feature_extractor, chunk_len, stride_left, stride_right, dtype=None):
+ inputs_len = inputs.shape[0]
+ step = chunk_len - stride_left - stride_right
+ for chunk_start_idx in range(0, inputs_len, step):
+ chunk_end_idx = chunk_start_idx + chunk_len
+ chunk = inputs[chunk_start_idx:chunk_end_idx]
+ processed = feature_extractor(chunk, sampling_rate=feature_extractor.sampling_rate, return_tensors="pt")
+ if dtype is not None:
+ processed = processed.to(dtype=dtype)
+ _stride_left = 0 if chunk_start_idx == 0 else stride_left
+ is_last = chunk_end_idx >= inputs_len
+ _stride_right = 0 if is_last else stride_right
+
+ chunk_len = chunk.shape[0]
+ stride = (chunk_len, _stride_left, _stride_right)
+ if chunk.shape[0] > _stride_left:
+ yield {"is_last": is_last, "stride": stride, **processed}
+ if is_last:
+ break
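+
+# Illustrative sketch of how `chunk_iter` walks a waveform: with inputs_len=10,
+# chunk_len=6, stride_left=2 and stride_right=2 the step is 6 - 2 - 2 = 2, so
+# three chunks are yielded with strides
+#
+#     (6, 0, 2)  # samples 0..6, first chunk has no left stride
+#     (6, 2, 2)  # samples 2..8
+#     (6, 2, 0)  # samples 4..10, is_last=True, right stride dropped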
+
+
+def _fast_find_longest_common_sequence(sequence_left, sequence_right):
+ seq_len_left = len(sequence_left)
+ seq_len_right = len(sequence_right)
+ counter = [[0] * (seq_len_right + 1) for _ in range(seq_len_left + 1)]
+ longest = 0
+ for i in range(seq_len_left):
+ for j in range(seq_len_right):
+ if sequence_left[i] == sequence_right[j]:
+ previous_counter = counter[i][j] + 1
+ counter[i + 1][j + 1] = previous_counter
+ if previous_counter > longest:
+ longest = previous_counter
+
+ counter = np.array(counter)
+ # we return the idx of the first element of the longest common sequence in the left sequence
+ index_left = np.argwhere(counter == longest)[-1][0] - longest if longest != 0 else -1
+ index_right = np.argwhere(counter == longest)[-1][1] - longest if longest != 0 else -1
+ return index_left, index_right, longest
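+
+# Illustrative sketch: the longest common run between [1, 2, 3, 4] and
+# [3, 4, 5] is [3, 4], starting at index 2 on the left and index 0 on the right:
+#
+#     _fast_find_longest_common_sequence([1, 2, 3, 4], [3, 4, 5])
+#     # -> (2, 0, 2)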
+
+
+def _find_longest_common_sequence(sequences, tokenizer):
+ # TODO Use a faster algorithm this can probably be done in O(n)
+ # using suffix array.
+ # It might be tedious to do because of fault tolerance.
+ # We actually have a really good property which is that the total sequence
+ # MUST be those subsequences in order.
+ # Also the algorithm should be more tolerant to errors.
+ sequence = [tok_id for tok_id in sequences[0][0].tolist() if tok_id not in tokenizer.all_special_ids]
+ for new_seq in sequences[1:]:
+ new_sequence = [tok_id for tok_id in new_seq[0].tolist() if tok_id not in tokenizer.all_special_ids]
+
+ index = 0
+ max_ = 0.0
+ for i in range(1, len(new_sequence) + 1):
+ # epsilon to favor long perfect matches
+ eps = i / 10000.0
+ matches = np.sum(np.array(sequence[-i:]) == np.array(new_sequence[:i]))
+ matching = matches / i + eps
+ if matches > 1 and matching > max_:
+ index = i
+ max_ = matching
+ sequence.extend(new_sequence[index:])
+ return np.array(sequence)
+
+
+class AutomaticSpeechRecognitionPipeline(ChunkPipeline):
+ """
+ Pipeline that aims at extracting spoken text contained within some audio.
+
+    The input can be either a raw waveform or an audio file. In the case of an audio file, ffmpeg should be installed
+    to support multiple audio formats.
+
+ Example:
+
+ ```python
+ >>> from transformers import pipeline
+
+ >>> transcriber = pipeline(model="openai/whisper-base")
+ >>> transcriber("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/1.flac")
+ {'text': ' He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered flour-fatten sauce.'}
+ ```
+
+ Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)
+
+ Arguments:
+ model ([`PreTrainedModel`] or [`TFPreTrainedModel`]):
+ The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from
+ [`PreTrainedModel`] for PyTorch and [`TFPreTrainedModel`] for TensorFlow.
+ feature_extractor ([`SequenceFeatureExtractor`]):
+ The feature extractor that will be used by the pipeline to encode waveform for the model.
+ tokenizer ([`PreTrainedTokenizer`]):
+ The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from
+ [`PreTrainedTokenizer`].
+ decoder (`pyctcdecode.BeamSearchDecoderCTC`, *optional*):
+ [PyCTCDecode's
+ BeamSearchDecoderCTC](https://github.com/kensho-technologies/pyctcdecode/blob/2fd33dc37c4111417e08d89ccd23d28e9b308d19/pyctcdecode/decoder.py#L180)
+ can be passed for language model boosted decoding. See [`Wav2Vec2ProcessorWithLM`] for more information.
+ chunk_length_s (`float`, *optional*, defaults to 0):
+            The input length for each chunk. If `chunk_length_s = 0` then chunking is disabled (default).
+
+
+
+ For more information on how to effectively use `chunk_length_s`, please have a look at the [ASR chunking
+ blog post](https://huggingface.co/blog/asr-chunking).
+
+
+
+ stride_length_s (`float`, *optional*, defaults to `chunk_length_s / 6`):
+ The length of stride on the left and right of each chunk. Used only with `chunk_length_s > 0`. This enables
+ the model to *see* more context and infer letters better than without this context but the pipeline
+ discards the stride bits at the end to make the final reconstitution as perfect as possible.
+
+
+
+ For more information on how to effectively use `stride_length_s`, please have a look at the [ASR chunking
+ blog post](https://huggingface.co/blog/asr-chunking).
+
+
+
+ framework (`str`, *optional*):
+ The framework to use, either `"pt"` for PyTorch or `"tf"` for TensorFlow. The specified framework must be
+ installed. If no framework is specified, will default to the one currently installed. If no framework is
+ specified and both frameworks are installed, will default to the framework of the `model`, or to PyTorch if
+ no model is provided.
+ device (Union[`int`, `torch.device`], *optional*):
+            Device ordinal for CPU/GPU support. Setting this to `None` will leverage CPU; a positive integer will run
+            the model on the associated CUDA device id.
+ torch_dtype (Union[`int`, `torch.dtype`], *optional*):
+ The data-type (dtype) of the computation. Setting this to `None` will use float32 precision. Set to
+ `torch.float16` or `torch.bfloat16` to use half-precision in the respective dtypes.
+
+ """
+
+ def __init__(
+ self,
+ model: "PreTrainedModel",
+ feature_extractor: Union["SequenceFeatureExtractor", str] = None,
+ tokenizer: Optional[PreTrainedTokenizer] = None,
+ decoder: Optional[Union["BeamSearchDecoderCTC", str]] = None,
+ device: Union[int, "torch.device"] = None,
+ torch_dtype: Optional[Union[str, "torch.dtype"]] = None,
+ **kwargs,
+ ):
+ # set the model type so we can check we have the right pre- and post-processing parameters
+ if model.config.model_type == "whisper":
+ self.type = "seq2seq_whisper"
+ elif model.__class__.__name__ in MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES.values():
+ self.type = "seq2seq"
+ elif (
+ feature_extractor._processor_class
+ and feature_extractor._processor_class.endswith("WithLM")
+ and decoder is not None
+ ):
+ self.decoder = decoder
+ self.type = "ctc_with_lm"
+ else:
+ self.type = "ctc"
+
+ super().__init__(model, tokenizer, feature_extractor, device=device, torch_dtype=torch_dtype, **kwargs)
+
+ def __call__(
+ self,
+ inputs: Union[np.ndarray, bytes, str],
+ **kwargs,
+ ):
+ """
+ Transcribe the audio sequence(s) given as inputs to text. See the [`AutomaticSpeechRecognitionPipeline`]
+ documentation for more information.
+
+ Args:
+ inputs (`np.ndarray` or `bytes` or `str` or `dict`):
+                The inputs are either:
+ - `str` that is either the filename of a local audio file, or a public URL address to download the
+ audio file. The file will be read at the correct sampling rate to get the waveform using
+ *ffmpeg*. This requires *ffmpeg* to be installed on the system.
+ - `bytes` it is supposed to be the content of an audio file and is interpreted by *ffmpeg* in the
+ same way.
+ - (`np.ndarray` of shape (n, ) of type `np.float32` or `np.float64`)
+ Raw audio at the correct sampling rate (no further check will be done)
+ - `dict` form can be used to pass raw audio sampled at arbitrary `sampling_rate` and let this
+ pipeline do the resampling. The dict must be in the format `{"sampling_rate": int, "raw":
+                  np.array}` with optionally a `"stride": (left: int, right: int)` that asks the pipeline to ignore
+                  the first `left` samples and last `right` samples in decoding (but use them at inference to provide
+                  more context to the model). Only use `stride` with CTC models.
+ return_timestamps (*optional*, `str` or `bool`):
+ Only available for pure CTC models (Wav2Vec2, HuBERT, etc) and the Whisper model. Not available for
+ other sequence-to-sequence models.
+
+ For CTC models, timestamps can take one of two formats:
+ - `"char"`: the pipeline will return timestamps along the text for every character in the text. For
+ instance, if you get `[{"text": "h", "timestamp": (0.5, 0.6)}, {"text": "i", "timestamp": (0.7,
+ 0.9)}]`, then it means the model predicts that the letter "h" was spoken after `0.5` and before
+ `0.6` seconds.
+ - `"word"`: the pipeline will return timestamps along the text for every word in the text. For
+ instance, if you get `[{"text": "hi ", "timestamp": (0.5, 0.9)}, {"text": "there", "timestamp":
+ (1.0, 1.5)}]`, then it means the model predicts that the word "hi" was spoken after `0.5` and
+ before `0.9` seconds.
+
+ For the Whisper model, timestamps can take one of two formats:
+ - `"word"`: same as above for word-level CTC timestamps. Word-level timestamps are predicted
+ through the *dynamic-time warping (DTW)* algorithm, an approximation to word-level timestamps
+ by inspecting the cross-attention weights.
+ - `True`: the pipeline will return timestamps along the text for *segments* of words in the text.
+ For instance, if you get `[{"text": " Hi there!", "timestamp": (0.5, 1.5)}]`, then it means the
+ model predicts that the segment "Hi there!" was spoken after `0.5` and before `1.5` seconds.
+ Note that a segment of text refers to a sequence of one or more words, rather than individual
+ words as with word-level timestamps.
+ generate_kwargs (`dict`, *optional*):
+ The dictionary of ad-hoc parametrization of `generate_config` to be used for the generation call. For a
+ complete overview of generate, check the [following
+ guide](https://huggingface.co/docs/transformers/en/main_classes/text_generation).
+
+ Return:
+ `Dict`: A dictionary with the following keys:
+ - **text** (`str`): The recognized text.
+            - **chunks** (*optional*, `List[Dict]`)
+ When using `return_timestamps`, the `chunks` will become a list containing all the various text
+ chunks identified by the model, *e.g.* `[{"text": "hi ", "timestamp": (0.5, 0.9)}, {"text":
+ "there", "timestamp": (1.0, 1.5)}]`. The original full text can roughly be recovered by doing
+ `"".join(chunk["text"] for chunk in output["chunks"])`.
+ """
+ return super().__call__(inputs, **kwargs)
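+
+    # Usage sketch for the call options documented above, assuming a Whisper
+    # checkpoint and a local "sample.flac" file (both names are only examples):
+    #
+    #     from transformers import pipeline
+    #
+    #     transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base")
+    #     out = transcriber("sample.flac", return_timestamps="word", chunk_length_s=30)
+    #     # out["text"] is the transcript; out["chunks"] holds per-word timestamps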
+
+ def _sanitize_parameters(
+ self,
+ chunk_length_s=None,
+ stride_length_s=None,
+ ignore_warning=None,
+ decoder_kwargs=None,
+ return_timestamps=None,
+ return_language=None,
+ generate_kwargs=None,
+ max_new_tokens=None,
+ ):
+        # Split the call kwargs into preprocess, forward and postprocess parameters
+ preprocess_params = {}
+ if chunk_length_s is not None:
+ if self.type == "seq2seq" and not ignore_warning:
+ logger.warning(
+ "Using `chunk_length_s` is very experimental with seq2seq models. The results will not necessarily"
+ " be entirely accurate and will have caveats. More information:"
+ " https://github.com/huggingface/transformers/pull/20104. Ignore this warning with pipeline(...,"
+ " ignore_warning=True)"
+ )
+ preprocess_params["chunk_length_s"] = chunk_length_s
+ if stride_length_s is not None:
+ preprocess_params["stride_length_s"] = stride_length_s
+
+ forward_params = defaultdict(dict)
+ if max_new_tokens is not None:
+ warnings.warn(
+ "`max_new_tokens` is deprecated and will be removed in version 4.49 of Transformers. To remove this warning, pass `max_new_tokens` as a key inside `generate_kwargs` instead.",
+ FutureWarning,
+ )
+ forward_params["max_new_tokens"] = max_new_tokens
+ if generate_kwargs is not None:
+ if max_new_tokens is not None and "max_new_tokens" in generate_kwargs:
+ raise ValueError(
+ "`max_new_tokens` is defined both as an argument and inside `generate_kwargs` argument, please use"
+ " only 1 version"
+ )
+ forward_params.update(generate_kwargs)
+
+ postprocess_params = {}
+ if decoder_kwargs is not None:
+ postprocess_params["decoder_kwargs"] = decoder_kwargs
+ if return_timestamps is not None:
+ # Check whether we have a valid setting for return_timestamps and throw an error before we perform a forward pass
+ if self.type == "seq2seq" and return_timestamps:
+ raise ValueError("We cannot return_timestamps yet on non-CTC models apart from Whisper!")
+ if self.type == "ctc_with_lm" and return_timestamps != "word":
+ raise ValueError("CTC with LM can only predict word level timestamps, set `return_timestamps='word'`")
+ if self.type == "ctc" and return_timestamps not in ["char", "word"]:
+ raise ValueError(
+ "CTC can either predict character level timestamps, or word level timestamps. "
+ "Set `return_timestamps='char'` or `return_timestamps='word'` as required."
+ )
+ if self.type == "seq2seq_whisper" and return_timestamps == "char":
+ raise ValueError(
+ "Whisper cannot return `char` timestamps, only word level or segment level timestamps. "
+ "Use `return_timestamps='word'` or `return_timestamps=True` respectively."
+ )
+ forward_params["return_timestamps"] = return_timestamps
+ postprocess_params["return_timestamps"] = return_timestamps
+ if return_language is not None:
+ if self.type != "seq2seq_whisper":
+ raise ValueError("Only Whisper can return language for now.")
+ postprocess_params["return_language"] = return_language
+
+ if self.assistant_model is not None:
+ forward_params["assistant_model"] = self.assistant_model
+ if self.assistant_tokenizer is not None:
+ forward_params["tokenizer"] = self.tokenizer
+ forward_params["assistant_tokenizer"] = self.assistant_tokenizer
+
+ return preprocess_params, forward_params, postprocess_params
+
+ def preprocess(self, inputs, chunk_length_s=0, stride_length_s=None):
+ if isinstance(inputs, str):
+ if inputs.startswith("http://") or inputs.startswith("https://"):
+ # We need to actually check for a real protocol, otherwise it's impossible to use a local file
+ # like http_huggingface_co.png
+ inputs = requests.get(inputs).content
+ else:
+ with open(inputs, "rb") as f:
+ inputs = f.read()
+
+ if isinstance(inputs, bytes):
+ inputs = ffmpeg_read(inputs, self.feature_extractor.sampling_rate)
+
+ stride = None
+ extra = {}
+ if isinstance(inputs, dict):
+ stride = inputs.pop("stride", None)
+ # Accepting `"array"` which is the key defined in `datasets` for
+ # better integration
+ if not ("sampling_rate" in inputs and ("raw" in inputs or "array" in inputs)):
+ raise ValueError(
+ "When passing a dictionary to AutomaticSpeechRecognitionPipeline, the dict needs to contain a "
+ '"raw" key containing the numpy array representing the audio and a "sampling_rate" key, '
+ "containing the sampling_rate associated with that array"
+ )
+
+ _inputs = inputs.pop("raw", None)
+ if _inputs is None:
+ # Remove path which will not be used from `datasets`.
+ inputs.pop("path", None)
+ _inputs = inputs.pop("array", None)
+ in_sampling_rate = inputs.pop("sampling_rate")
+ extra = inputs
+ inputs = _inputs
+ if in_sampling_rate != self.feature_extractor.sampling_rate:
+ if is_torchaudio_available():
+ from torchaudio import functional as F
+ else:
+ raise ImportError(
+ "torchaudio is required to resample audio samples in AutomaticSpeechRecognitionPipeline. "
+ "The torchaudio package can be installed through: `pip install torchaudio`."
+ )
+
+ inputs = F.resample(
+ torch.from_numpy(inputs), in_sampling_rate, self.feature_extractor.sampling_rate
+ ).numpy()
+ ratio = self.feature_extractor.sampling_rate / in_sampling_rate
+ else:
+ ratio = 1
+ if stride is not None:
+ if stride[0] + stride[1] > inputs.shape[0]:
+ raise ValueError("Stride is too large for input")
+
+ # Stride needs to get the chunk length here, it's going to get
+ # swallowed by the `feature_extractor` later, and then batching
+ # can add extra data in the inputs, so we need to keep track
+ # of the original length in the stride so we can cut properly.
+ stride = (inputs.shape[0], int(round(stride[0] * ratio)), int(round(stride[1] * ratio)))
+ if not isinstance(inputs, np.ndarray):
+ raise TypeError(f"We expect a numpy ndarray as input, got `{type(inputs)}`")
+ if len(inputs.shape) != 1:
+ raise ValueError("We expect a single channel audio input for AutomaticSpeechRecognitionPipeline")
+
+ if chunk_length_s:
+ if stride_length_s is None:
+ stride_length_s = chunk_length_s / 6
+
+ if isinstance(stride_length_s, (int, float)):
+ stride_length_s = [stride_length_s, stride_length_s]
+
+            # XXX: Careful, this variable will not exist in the `seq2seq` setting.
+ # Currently chunking is not possible at this level for `seq2seq` so
+ # it's ok.
+ align_to = getattr(self.model.config, "inputs_to_logits_ratio", 1)
+ chunk_len = int(round(chunk_length_s * self.feature_extractor.sampling_rate / align_to) * align_to)
+ stride_left = int(round(stride_length_s[0] * self.feature_extractor.sampling_rate / align_to) * align_to)
+ stride_right = int(round(stride_length_s[1] * self.feature_extractor.sampling_rate / align_to) * align_to)
+
+ if chunk_len < stride_left + stride_right:
+                raise ValueError("Chunk length must be greater than stride length")
+
+ for item in chunk_iter(
+ inputs, self.feature_extractor, chunk_len, stride_left, stride_right, self.torch_dtype
+ ):
+ yield {**item, **extra}
+ else:
+ if self.type == "seq2seq_whisper" and inputs.shape[0] > self.feature_extractor.n_samples:
+ processed = self.feature_extractor(
+ inputs,
+ sampling_rate=self.feature_extractor.sampling_rate,
+ truncation=False,
+ padding="longest",
+ return_tensors="pt",
+ return_attention_mask=True,
+ )
+ else:
+ if self.type == "seq2seq_whisper" and stride is None:
+ processed = self.feature_extractor(
+ inputs,
+ sampling_rate=self.feature_extractor.sampling_rate,
+ return_tensors="pt",
+ return_token_timestamps=True,
+ return_attention_mask=True,
+ )
+ extra["num_frames"] = processed.pop("num_frames")
+ else:
+ processed = self.feature_extractor(
+ inputs,
+ sampling_rate=self.feature_extractor.sampling_rate,
+ return_tensors="pt",
+ return_attention_mask=True,
+ )
+ if self.torch_dtype is not None:
+ processed = processed.to(dtype=self.torch_dtype)
+ if stride is not None:
+ if self.type == "seq2seq":
+ raise ValueError("Stride is only usable with CTC models, try removing it !")
+
+ processed["stride"] = stride
+ yield {"is_last": True, **processed, **extra}
+
+ def _forward(self, model_inputs, return_timestamps=False, **generate_kwargs):
+ attention_mask = model_inputs.pop("attention_mask", None)
+ stride = model_inputs.pop("stride", None)
+ num_frames = model_inputs.pop("num_frames", None)
+ is_last = model_inputs.pop("is_last")
+
+ if stride is not None and num_frames is not None:
+ raise ValueError("num_frames must be used only when stride is None")
+
+ if self.type in {"seq2seq", "seq2seq_whisper"}:
+ # Consume values so we can let extra information flow freely through
+ # the pipeline (important for `partial` in microphone)
+ if "input_features" in model_inputs:
+ inputs = model_inputs.pop("input_features")
+ elif "input_values" in model_inputs:
+ inputs = model_inputs.pop("input_values")
+ else:
+ raise ValueError(
+ "Seq2Seq speech recognition model requires either a "
+ f"`input_features` or `input_values` key, but only has {model_inputs.keys()}"
+ )
+
+ # custom processing for Whisper timestamps and word-level timestamps
+ if return_timestamps and self.type == "seq2seq_whisper":
+ generate_kwargs["return_timestamps"] = return_timestamps
+ if return_timestamps == "word":
+ generate_kwargs["return_token_timestamps"] = True
+ generate_kwargs["return_segments"] = True
+
+ if stride is not None:
+ if isinstance(stride, tuple):
+ generate_kwargs["num_frames"] = stride[0] // self.feature_extractor.hop_length
+ else:
+ generate_kwargs["num_frames"] = [s[0] // self.feature_extractor.hop_length for s in stride]
+ else:
+ generate_kwargs["num_frames"] = num_frames
+
+ # User-defined `generation_config` passed to the pipeline call take precedence
+ if "generation_config" not in generate_kwargs:
+ generate_kwargs["generation_config"] = self.generation_config
+
+ tokens = self.model.generate(
+ inputs=inputs,
+ attention_mask=attention_mask,
+ **generate_kwargs,
+ )
+ # whisper longform generation stores timestamps in "segments"
+ if return_timestamps == "word" and self.type == "seq2seq_whisper":
+ if "segments" not in tokens:
+ out = {"tokens": tokens["sequences"], "token_timestamps": tokens["token_timestamps"]}
+ else:
+ token_timestamps = [
+ torch.cat([segment["token_timestamps"] for segment in segment_list])
+ for segment_list in tokens["segments"]
+ ]
+ out = {"tokens": tokens["sequences"], "token_timestamps": token_timestamps}
+ else:
+ out = {"tokens": tokens}
+ if self.type == "seq2seq_whisper":
+ if stride is not None:
+ out["stride"] = stride
+
+ else:
+ inputs = {
+ self.model.main_input_name: model_inputs.pop(self.model.main_input_name),
+ "attention_mask": attention_mask,
+ }
+ outputs = self.model(**inputs)
+ logits = outputs.logits
+
+ if self.type == "ctc_with_lm":
+ out = {"logits": logits}
+ else:
+ out = {"tokens": logits.argmax(dim=-1)}
+ if stride is not None:
+ # Send stride to `postprocess`.
+ # it needs to be handled there where
+ # the pieces are to be concatenated.
+ ratio = 1 / self.model.config.inputs_to_logits_ratio
+ if isinstance(stride, tuple):
+ out["stride"] = rescale_stride([stride], ratio)[0]
+ else:
+ out["stride"] = rescale_stride(stride, ratio)
+ # Leftover
+ extra = model_inputs
+ return {"is_last": is_last, **out, **extra}
+
+ def postprocess(
+ self, model_outputs, decoder_kwargs: Optional[Dict] = None, return_timestamps=None, return_language=None
+ ):
+ # Optional return types
+ optional = {}
+
+ final_items = []
+ key = "logits" if self.type == "ctc_with_lm" else "tokens"
+ stride = None
+ for outputs in model_outputs:
+ if self.framework == "pt" and outputs[key].dtype in (torch.bfloat16, torch.float16):
+ items = outputs[key].to(torch.float32).numpy()
+ else:
+ items = outputs[key].numpy()
+ stride = outputs.get("stride", None)
+ if stride is not None and self.type in {"ctc", "ctc_with_lm"}:
+ total_n, left, right = stride
+ # Total_n might be < logits.shape[1]
+ # because of padding, that's why
+ # we need to reconstruct this information
+ # This won't work with left padding (which doesn't exist right now)
+ right_n = total_n - right
+ items = items[:, left:right_n]
+ final_items.append(items)
+
+ if stride and self.type == "seq2seq":
+ items = _find_longest_common_sequence(final_items, self.tokenizer)
+ elif self.type == "seq2seq_whisper":
+ time_precision = self.feature_extractor.chunk_length / self.model.config.max_source_positions
+ # Send the chunking back to seconds, it's easier to handle in whisper
+ sampling_rate = self.feature_extractor.sampling_rate
+ for output in model_outputs:
+ if "stride" in output:
+ chunk_len, stride_left, stride_right = output["stride"]
+ # Go back in seconds
+ chunk_len /= sampling_rate
+ stride_left /= sampling_rate
+ stride_right /= sampling_rate
+ output["stride"] = chunk_len, stride_left, stride_right
+
+ text, optional = self.tokenizer._decode_asr(
+ model_outputs,
+ return_timestamps=return_timestamps,
+ return_language=return_language,
+ time_precision=time_precision,
+ )
+ else:
+ items = np.concatenate(final_items, axis=1)
+ items = items.squeeze(0)
+
+ if self.type == "ctc_with_lm":
+ if decoder_kwargs is None:
+ decoder_kwargs = {}
+ beams = self.decoder.decode_beams(items, **decoder_kwargs)
+ text = beams[0][0]
+ if return_timestamps:
+ # Simply cast from pyctcdecode format to wav2vec2 format to leverage
+ # pre-existing code later
+ chunk_offset = beams[0][2]
+ offsets = []
+ for word, (start_offset, end_offset) in chunk_offset:
+ offsets.append({"word": word, "start_offset": start_offset, "end_offset": end_offset})
+ elif self.type != "seq2seq_whisper":
+ skip_special_tokens = self.type != "ctc"
+ text = self.tokenizer.decode(items, skip_special_tokens=skip_special_tokens)
+ if return_timestamps:
+ offsets = self.tokenizer.decode(
+ items, skip_special_tokens=skip_special_tokens, output_char_offsets=True
+ )["char_offsets"]
+ if return_timestamps == "word":
+ offsets = self.tokenizer._get_word_offsets(offsets, self.tokenizer.replace_word_delimiter_char)
+
+ if return_timestamps and self.type not in {"seq2seq", "seq2seq_whisper"}:
+ chunks = []
+ for item in offsets:
+ start = item["start_offset"] * self.model.config.inputs_to_logits_ratio
+ start /= self.feature_extractor.sampling_rate
+
+ stop = item["end_offset"] * self.model.config.inputs_to_logits_ratio
+ stop /= self.feature_extractor.sampling_rate
+
+ chunks.append({"text": item[return_timestamps], "timestamp": (start, stop)})
+ optional["chunks"] = chunks
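+ # Editor's sketch (illustrative, the numbers are made-up assumptions): a CTC char
+ # offset is converted to seconds as offset * inputs_to_logits_ratio / sampling_rate.
+ # E.g. a start_offset of 50 with a hypothetical ratio of 320 and a 16 kHz sampling
+ # rate gives 50 * 320 / 16000 = 1.0 second.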
+
+ extra = defaultdict(list)
+ for output in model_outputs:
+ output.pop("tokens", None)
+ output.pop("logits", None)
+ output.pop("is_last", None)
+ output.pop("stride", None)
+ output.pop("token_timestamps", None)
+ for k, v in output.items():
+ extra[k].append(v)
+ return {"text": text, **optional, **extra}
+
+
+def _find_timestamp_sequence(sequences, tokenizer, feature_extractor, max_source_positions):
+ """
+ Computes the final sequences by merging the end of the nth sequence with the beginning of the n+1th sequence. Since
+ `WhisperForConditionalGeneration` produces the timestamps pairwise, we filter the consecutive timestamps and only
+ iterate over them. We keep track of the `time` which indicates the actual starting time of the chunk that is
+ processed. We need to make sure to offset the timestamp tokens by the `time` in order for the tokenizer to
+ properly compute the final `offset`.
+ """
+ # index of the first timestamp token
+ timestamp_begin = tokenizer.convert_tokens_to_ids("<|notimestamps|>") + 1
+ items = []
+ # approximation of the token-to-time ratio: ~0.02 seconds (chunk_length / max_source_positions)
+ time_precision = feature_extractor.chunk_length / max_source_positions
+ time = 0
+ for seq_idx, item in enumerate(sequences):
+ sequence, stride = item
+ if isinstance(sequence, list):
+ sequence = np.array(sequence)
+ chunk_len, stride_left, stride_right = stride
+ sequence = sequence.squeeze(0)
+ # get rid of the `forced_decoder_idx` that are used to parametrize the generation
+ begin_idx = np.where(sequence == timestamp_begin)[0][0] if timestamp_begin in sequence else 0
+ sequence = sequence[begin_idx:]
+
+ timestamp_tokens = sequence >= timestamp_begin
+ if seq_idx != 0 and sum(timestamp_tokens) > 0:
+ consecutive = np.where(timestamp_tokens[:-1] & timestamp_tokens[1:])[0] + 1
+ last_timestamp = np.where(timestamp_tokens)[0][-1]
+ consecutive = np.append(consecutive, last_timestamp) if last_timestamp not in consecutive else consecutive
+ time -= stride_left + stride_right
+ offset = int((time / feature_extractor.sampling_rate) / time_precision)
+ overlap_time = int((stride_left / feature_extractor.sampling_rate) / time_precision)
+ # relevant timestamps are in the overlapping part
+ relevant_timestamp = np.where(sequence[consecutive] >= timestamp_begin + overlap_time)[0]
+ if relevant_timestamp.shape[0] > 0:
+ relevant_timestamp = (
+ consecutive[relevant_timestamp[0] - 1] if relevant_timestamp[0] > 0 else consecutive[0]
+ )
+ # if a big stride is used, we need to check some of the previous items for the best overlap
+ best_match = 0
+ sliced_sequence = []
+ for idx, previous_sequence in enumerate(reversed(items)):
+ previous_tokens = previous_sequence[1:-1]
+ if previous_sequence[0] < (timestamp_begin + offset - overlap_time) and idx != 0:
+ break # the previous sequence is too far in the past
+ if len(previous_tokens) > 0:
+ # find the longest common sequence between the overlapping parts
+ index_left, index_right, match_length = _fast_find_longest_common_sequence(
+ sequence[1:relevant_timestamp], previous_tokens
+ )
+ # don't do anything if only 1 token was matched
+ if match_length > 1 and match_length > best_match:
+ best_match = match_length
+ best_idx = idx
+ end_of_curr_sequence_idx = (
+ np.where(sequence[index_left + 1 :] >= timestamp_begin)[0][0] + 1
+ )
+ end_of_curr_sequence_idx = end_of_curr_sequence_idx + 1 + index_left
+ # if all the tokens are matched, suffix
+ if index_left == 0 and match_length == len(previous_tokens):
+ sliced_sequence = np.insert(
+ sequence[index_left + 1 : end_of_curr_sequence_idx], 0, previous_sequence[0]
+ )
+ sliced_sequence[-1] = previous_sequence[-1]
+ # if part of the previous sequence is not taken
+ elif index_left >= 0:
+ sliced_sequence = sequence[index_left + 1 : end_of_curr_sequence_idx]
+ # let's insert the missing part of the previous sequence
+ previous_slice = (
+ previous_sequence[: index_right + 1] if index_right > 0 else [previous_sequence[0]]
+ )
+ sliced_sequence = np.insert(sliced_sequence, 0, previous_slice)
+ sliced_sequence[-1] += offset
+
+ if len(sliced_sequence) > 0:
+ items[len(items) - best_idx - 1] = sliced_sequence
+ items = items[: len(items) - best_idx]
+ sequence = sequence[end_of_curr_sequence_idx:]
+
+ # sequence might have changed
+ timestamp_tokens = sequence >= timestamp_begin
+ consecutive = np.where(timestamp_tokens[:-1] & timestamp_tokens[1:])[0] + 1
+ if sum(timestamp_tokens) > 0:
+ last_timestamp = np.where(timestamp_tokens)[0][-1]
+ consecutive = (
+ np.append(consecutive, last_timestamp + 1) if last_timestamp not in consecutive else consecutive
+ )
+
+ if len(consecutive) > 0:
+ last_slice = 0
+ for current_slice in consecutive:
+ actual_offset = items[-1][-1] if seq_idx != 0 or last_slice != 0 else sequence[0]
+ sliced_tokens = sequence[last_slice:current_slice]
+ duration = sliced_tokens[-1] - sliced_tokens[0]
+ sliced_tokens[0] = actual_offset
+ sliced_tokens[-1] = actual_offset + duration
+ items.append(sliced_tokens)
+ last_slice = current_slice
+
+ time += chunk_len
+ result = []
+ for i in range(len(items)):
+ result += items[i].tolist()
+ return result
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/base.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..a24e9c3f69787849de363dc501666d511e84ee13
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/base.py
@@ -0,0 +1,1484 @@
+# coding=utf-8
+# Copyright 2018 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import collections
+import copy
+import csv
+import importlib
+import json
+import os
+import pickle
+import sys
+import traceback
+import types
+import warnings
+from abc import ABC, abstractmethod
+from collections import UserDict
+from contextlib import contextmanager
+from os.path import abspath, exists
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
+
+from ..dynamic_module_utils import custom_object_save
+from ..feature_extraction_utils import PreTrainedFeatureExtractor
+from ..image_processing_utils import BaseImageProcessor
+from ..modelcard import ModelCard
+from ..models.auto import AutoConfig, AutoTokenizer
+from ..processing_utils import ProcessorMixin
+from ..tokenization_utils import PreTrainedTokenizer
+from ..utils import (
+ ModelOutput,
+ PushToHubMixin,
+ add_end_docstrings,
+ copy_func,
+ infer_framework,
+ is_tf_available,
+ is_torch_available,
+ is_torch_cuda_available,
+ is_torch_mlu_available,
+ is_torch_mps_available,
+ is_torch_musa_available,
+ is_torch_npu_available,
+ is_torch_xpu_available,
+ logging,
+)
+
+
+GenericTensor = Union[List["GenericTensor"], "torch.Tensor", "tf.Tensor"]
+
+if is_tf_available():
+ import tensorflow as tf
+
+ from ..models.auto.modeling_tf_auto import TFAutoModel
+
+if is_torch_available():
+ import torch
+ from torch.utils.data import DataLoader, Dataset
+
+ from ..models.auto.modeling_auto import AutoModel
+
+ # Re-export for backward compatibility
+ from .pt_utils import KeyDataset
+else:
+ Dataset = None
+ KeyDataset = None
+
+if TYPE_CHECKING:
+ from ..modeling_tf_utils import TFPreTrainedModel
+ from ..modeling_utils import PreTrainedModel
+
+
+logger = logging.get_logger(__name__)
+
+
+def no_collate_fn(items):
+ if len(items) != 1:
+ raise ValueError("This collate_fn is meant to be used with batch_size=1")
+ return items[0]
+
+
+def _pad(items, key, padding_value, padding_side):
+ batch_size = len(items)
+ if isinstance(items[0][key], torch.Tensor):
+ # Others include `attention_mask` etc...
+ shape = items[0][key].shape
+ dim = len(shape)
+ if dim == 1:
+ # We have a list of 1-dim torch tensors, which can be stacked without padding
+ return torch.cat([item[key] for item in items], dim=0)
+ if key in ["pixel_values", "image"]:
+ # This is probably an image, so padding shouldn't be necessary
+ # B, C, H, W
+ return torch.cat([item[key] for item in items], dim=0)
+ elif dim == 4 and key == "input_features":
+ # this is probably a batched mel spectrogram
+ return torch.cat([item[key] for item in items], dim=0)
+ max_length = max(item[key].shape[1] for item in items)
+ min_length = min(item[key].shape[1] for item in items)
+ dtype = items[0][key].dtype
+
+ if dim == 2:
+ if max_length == min_length:
+ # Bypass for `ImageGPT` which doesn't provide a padding value, yet
+ # we can consistently pad since the size should be matching
+ return torch.cat([item[key] for item in items], dim=0)
+ tensor = torch.zeros((batch_size, max_length), dtype=dtype) + padding_value
+ elif dim == 3:
+ tensor = torch.zeros((batch_size, max_length, shape[-1]), dtype=dtype) + padding_value
+ elif dim == 4:
+ tensor = torch.zeros((batch_size, max_length, shape[-2], shape[-1]), dtype=dtype) + padding_value
+
+ for i, item in enumerate(items):
+ if dim == 2:
+ if padding_side == "left":
+ tensor[i, -len(item[key][0]) :] = item[key][0].clone()
+ else:
+ tensor[i, : len(item[key][0])] = item[key][0].clone()
+ elif dim == 3:
+ if padding_side == "left":
+ tensor[i, -len(item[key][0]) :, :] = item[key][0].clone()
+ else:
+ tensor[i, : len(item[key][0]), :] = item[key][0].clone()
+ elif dim == 4:
+ if padding_side == "left":
+ tensor[i, -len(item[key][0]) :, :, :] = item[key][0].clone()
+ else:
+ tensor[i, : len(item[key][0]), :, :] = item[key][0].clone()
+
+ return tensor
+ else:
+ return [item[key] for item in items]
+
+
+def pad_collate_fn(tokenizer, feature_extractor):
+ # Tokenizer
+ t_padding_side = None
+ # Feature extractor
+ f_padding_side = None
+ if tokenizer is None and feature_extractor is None:
+ raise ValueError("Pipeline without tokenizer or feature_extractor cannot do batching")
+ if tokenizer is not None:
+ if tokenizer.pad_token_id is None:
+ raise ValueError(
+ "Pipeline with tokenizer without pad_token cannot do batching. You can try to set it with "
+ "`pipe.tokenizer.pad_token_id = model.config.eos_token_id`."
+ )
+ else:
+ t_padding_value = tokenizer.pad_token_id
+ t_padding_side = tokenizer.padding_side
+ if feature_extractor is not None:
+ # The feature extractor can be for images, where no padding is expected
+ f_padding_value = getattr(feature_extractor, "padding_value", None)
+ f_padding_side = getattr(feature_extractor, "padding_side", None)
+
+ if t_padding_side is not None and f_padding_side is not None and t_padding_side != f_padding_side:
+ raise ValueError(
+ f"The feature extractor and tokenizer don't agree on padding side {t_padding_side} != {f_padding_side}"
+ )
+ padding_side = "right"
+ if t_padding_side is not None:
+ padding_side = t_padding_side
+ if f_padding_side is not None:
+ padding_side = f_padding_side
+
+ def inner(items):
+ keys = set(items[0].keys())
+ for item in items:
+ if set(item.keys()) != keys:
+ raise ValueError(
+ f"The elements of the batch contain different keys. Cannot batch them ({set(item.keys())} !="
+ f" {keys})"
+ )
+ # input_values, input_pixels, input_ids, ...
+ padded = {}
+ for key in keys:
+ if key in {"input_ids"}:
+ # ImageGPT uses a feature extractor
+ if tokenizer is None and feature_extractor is not None:
+ _padding_value = f_padding_value
+ else:
+ _padding_value = t_padding_value
+ elif key in {"input_values", "pixel_values", "input_features"}:
+ _padding_value = f_padding_value
+ elif key in {"p_mask", "special_tokens_mask"}:
+ _padding_value = 1
+ elif key in {"attention_mask", "token_type_ids"}:
+ _padding_value = 0
+ else:
+ # This is likely another random key, maybe even user-provided
+ _padding_value = 0
+ padded[key] = _pad(items, key, _padding_value, padding_side)
+ return padded
+
+ return inner
+
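+ # Editor's sketch (illustrative usage, not part of the library; the tensor values
+ # are made up): `pad_collate_fn` returns a collate function that pads each key with
+ # the appropriate padding value on the resolved padding side, e.g.
+ #
+ #     collate = pad_collate_fn(tokenizer, None)
+ #     batch = collate([
+ #         {"input_ids": torch.tensor([[1, 2, 3]]), "attention_mask": torch.tensor([[1, 1, 1]])},
+ #         {"input_ids": torch.tensor([[4, 5]]), "attention_mask": torch.tensor([[1, 1]])},
+ #     ])
+ #     # batch["input_ids"] -> shape (2, 3), second row padded with tokenizer.pad_token_id
+ #     # batch["attention_mask"] -> shape (2, 3), second row padded with 0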
+
+def infer_framework_load_model(
+ model,
+ config: AutoConfig,
+ model_classes: Optional[Dict[str, Tuple[type]]] = None,
+ task: Optional[str] = None,
+ framework: Optional[str] = None,
+ **model_kwargs,
+):
+ """
+ Select framework (TensorFlow or PyTorch) to use from the `model` passed. Returns a tuple (framework, model).
+
+ If `model` is instantiated, this function will just infer the framework from the model class. Otherwise `model` is
+ actually a checkpoint name and this method will try to instantiate it using `model_classes`. Since we don't want to
+ instantiate the model twice, this model is returned for use by the pipeline.
+
+ If both frameworks are installed and available for `model`, PyTorch is selected.
+
+ Args:
+ model (`str`, [`PreTrainedModel`] or [`TFPreTrainedModel]`):
+ The model to infer the framework from. If `str`, a checkpoint name.
+ config ([`AutoConfig`]):
+ The config associated with the model, used to help select the correct model class.
+ model_classes (dictionary `str` to `type`, *optional*):
+ A mapping from framework name ("pt" or "tf") to model classes.
+ task (`str`):
+ The task defining which pipeline will be returned.
+ model_kwargs:
+ Additional dictionary of keyword arguments passed along to the model's `from_pretrained(...,
+ **model_kwargs)` function.
+
+ Returns:
+ `Tuple`: A tuple (framework, model).
+ """
+ if not is_tf_available() and not is_torch_available():
+ raise RuntimeError(
+ "At least one of TensorFlow 2.0 or PyTorch should be installed. "
+ "To install TensorFlow 2.0, read the instructions at https://www.tensorflow.org/install/ "
+ "To install PyTorch, read the instructions at https://pytorch.org/."
+ )
+ if isinstance(model, str):
+ model_kwargs["_from_pipeline"] = task
+ class_tuple = ()
+ look_pt = is_torch_available() and framework in {"pt", None}
+ look_tf = is_tf_available() and framework in {"tf", None}
+ if model_classes:
+ if look_pt:
+ class_tuple = class_tuple + model_classes.get("pt", (AutoModel,))
+ if look_tf:
+ class_tuple = class_tuple + model_classes.get("tf", (TFAutoModel,))
+ if config.architectures:
+ classes = []
+ for architecture in config.architectures:
+ transformers_module = importlib.import_module("transformers")
+ if look_pt:
+ _class = getattr(transformers_module, architecture, None)
+ if _class is not None:
+ classes.append(_class)
+ if look_tf:
+ _class = getattr(transformers_module, f"TF{architecture}", None)
+ if _class is not None:
+ classes.append(_class)
+ class_tuple = class_tuple + tuple(classes)
+
+ if len(class_tuple) == 0:
+ raise ValueError(f"Pipeline cannot infer suitable model classes from {model}")
+
+ all_traceback = {}
+ for model_class in class_tuple:
+ kwargs = model_kwargs.copy()
+ if framework == "pt" and model.endswith(".h5"):
+ kwargs["from_tf"] = True
+ logger.warning(
+ "Model might be a TensorFlow model (ending with `.h5`) but TensorFlow is not available. "
+ "Trying to load the model with PyTorch."
+ )
+ elif framework == "tf" and model.endswith(".bin"):
+ kwargs["from_pt"] = True
+ logger.warning(
+ "Model might be a PyTorch model (ending with `.bin`) but PyTorch is not available. "
+ "Trying to load the model with Tensorflow."
+ )
+
+ try:
+ model = model_class.from_pretrained(model, **kwargs)
+ if hasattr(model, "eval"):
+ model = model.eval()
+ # Stop loading on the first successful load.
+ break
+ except (OSError, ValueError):
+ all_traceback[model_class.__name__] = traceback.format_exc()
+ continue
+
+ if isinstance(model, str):
+ error = ""
+ for class_name, trace in all_traceback.items():
+ error += f"while loading with {class_name}, an error is thrown:\n{trace}\n"
+ raise ValueError(
+ f"Could not load model {model} with any of the following classes: {class_tuple}. See the original errors:\n\n{error}\n"
+ )
+
+ if framework is None:
+ framework = infer_framework(model.__class__)
+ return framework, model
+
+
+def infer_framework_from_model(
+ model,
+ model_classes: Optional[Dict[str, Tuple[type]]] = None,
+ task: Optional[str] = None,
+ framework: Optional[str] = None,
+ **model_kwargs,
+):
+ """
+ Select framework (TensorFlow or PyTorch) to use from the `model` passed. Returns a tuple (framework, model).
+
+ If `model` is instantiated, this function will just infer the framework from the model class. Otherwise `model` is
+ actually a checkpoint name and this method will try to instantiate it using `model_classes`. Since we don't want to
+ instantiate the model twice, this model is returned for use by the pipeline.
+
+ If both frameworks are installed and available for `model`, PyTorch is selected.
+
+ Args:
+ model (`str`, [`PreTrainedModel`] or [`TFPreTrainedModel]`):
+ The model to infer the framework from. If `str`, a checkpoint name.
+ model_classes (dictionary `str` to `type`, *optional*):
+ A mapping from framework name ("pt" or "tf") to model classes.
+ task (`str`):
+ The task defining which pipeline will be returned.
+ model_kwargs:
+ Additional dictionary of keyword arguments passed along to the model's `from_pretrained(...,
+ **model_kwargs)` function.
+
+ Returns:
+ `Tuple`: A tuple (framework, model).
+ """
+ if isinstance(model, str):
+ config = AutoConfig.from_pretrained(model, _from_pipeline=task, **model_kwargs)
+ else:
+ config = model.config
+ return infer_framework_load_model(
+ model, config, model_classes=model_classes, _from_pipeline=task, task=task, framework=framework, **model_kwargs
+ )
+
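+ # Editor's sketch (illustrative usage, not part of the library; the checkpoint name
+ # is an arbitrary example): `infer_framework_from_model` accepts either a checkpoint
+ # name or an already-instantiated model, e.g.
+ #
+ #     framework, model = infer_framework_from_model("hf-internal-testing/tiny-random-bert")
+ #     # framework -> "pt" when only PyTorch is installed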
+
+def get_framework(model, revision: Optional[str] = None):
+ """
+ Select framework (TensorFlow or PyTorch) to use.
+
+ Args:
+ model (`str`, [`PreTrainedModel`] or [`TFPreTrainedModel]`):
+ If both frameworks are installed, picks the one corresponding to the model passed (either a model class or
+ the model name). If no specific model is provided, defaults to using PyTorch.
+ """
+ warnings.warn(
+ "`get_framework` is deprecated and will be removed in v5, use `infer_framework_from_model` instead.",
+ FutureWarning,
+ )
+ if not is_tf_available() and not is_torch_available():
+ raise RuntimeError(
+ "At least one of TensorFlow 2.0 or PyTorch should be installed. "
+ "To install TensorFlow 2.0, read the instructions at https://www.tensorflow.org/install/ "
+ "To install PyTorch, read the instructions at https://pytorch.org/."
+ )
+ if isinstance(model, str):
+ if is_torch_available() and not is_tf_available():
+ model = AutoModel.from_pretrained(model, revision=revision)
+ elif is_tf_available() and not is_torch_available():
+ model = TFAutoModel.from_pretrained(model, revision=revision)
+ else:
+ try:
+ model = AutoModel.from_pretrained(model, revision=revision)
+ except OSError:
+ model = TFAutoModel.from_pretrained(model, revision=revision)
+
+ framework = infer_framework(model.__class__)
+ return framework
+
+
+def get_default_model_and_revision(
+ targeted_task: Dict, framework: Optional[str], task_options: Optional[Any]
+) -> Union[str, Tuple[str, str]]:
+ """
+ Select a default model to use for a given task. Defaults to pytorch if ambiguous.
+
+ Args:
+ targeted_task (`Dict`):
+ Dictionary representing the given task, which should contain the default models
+
+ framework (`str`, None)
+ "pt", "tf" or None, representing a specific framework if it was specified, or None if we don't know yet.
+
+ task_options (`Any`, None)
+ Any further value required by the task to get fully specified, for instance (SRC, TGT) languages for
+ translation task.
+
+ Returns:
+
+ `str` or `Tuple[str, str]`: The model identifier (and optionally the revision) representing the default model
+ for this pipeline.
+ """
+ if is_torch_available() and not is_tf_available():
+ framework = "pt"
+ elif is_tf_available() and not is_torch_available():
+ framework = "tf"
+
+ defaults = targeted_task["default"]
+ if task_options:
+ if task_options not in defaults:
+ raise ValueError(f"The task does not provide any default models for options {task_options}")
+ default_models = defaults[task_options]["model"]
+ elif "model" in defaults:
+ default_models = targeted_task["default"]["model"]
+ else:
+ # XXX This error message needs to be updated to be more generic if more tasks are going to become
+ # parametrized
+ raise ValueError('The task defaults can\'t be correctly selected. You probably meant "translation_XX_to_YY"')
+
+ if framework is None:
+ framework = "pt"
+
+ return default_models[framework]
+
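+ # Editor's sketch (illustrative, the dictionary content is a made-up example): a
+ # `targeted_task` entry as consumed above typically looks like
+ #
+ #     targeted_task = {"default": {"model": {"pt": ("some-org/some-model", "main")}}}
+ #     get_default_model_and_revision(targeted_task, framework=None, task_options=None)
+ #     # -> ("some-org/some-model", "main")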
+
+def load_assistant_model(
+ model: "PreTrainedModel",
+ assistant_model: Optional[Union[str, "PreTrainedModel"]],
+ assistant_tokenizer: Optional[PreTrainedTokenizer],
+) -> Tuple[Optional["PreTrainedModel"], Optional[PreTrainedTokenizer]]:
+ """
+ Prepares the assistant model and the assistant tokenizer for a pipeline whose model can call `generate`.
+
+ Args:
+ model ([`PreTrainedModel`]):
+ The main model that will be used by the pipeline to make predictions.
+ assistant_model (`str` or [`PreTrainedModel`], *optional*):
+ The assistant model that will be used by the pipeline to make predictions.
+ assistant_tokenizer ([`PreTrainedTokenizer`], *optional*):
+ The assistant tokenizer that will be used by the pipeline to encode data for the model.
+
+ Returns:
+ Tuple: The loaded assistant model and (optionally) the loaded tokenizer.
+ """
+ if not model.can_generate() or assistant_model is None:
+ return None, None
+
+ if not isinstance(model, PreTrainedModel):
+ raise ValueError(
+ "Assisted generation, triggered by the `assistant_model` argument, is only available for "
+ "`PreTrainedModel` model instances. For instance, TF or JAX models are not supported."
+ )
+
+ # If the model is passed as a string, load the model and the corresponding tokenizer
+ if isinstance(assistant_model, str):
+ assistant_config = AutoConfig.from_pretrained(assistant_model)
+ _, loaded_assistant_model = infer_framework_load_model(assistant_model, config=assistant_config)
+ loaded_assistant_model = loaded_assistant_model.to(device=model.device, dtype=model.dtype)
+ loaded_assistant_tokenizer = AutoTokenizer.from_pretrained(assistant_model)
+ else:
+ loaded_assistant_model = assistant_model
+ loaded_assistant_tokenizer = assistant_tokenizer
+
+ # Finally, let's check the tokenizers: if the two models have different tokenizers, we need to keep the assistant
+ # tokenizer
+ same_vocab_size = model.config.vocab_size == loaded_assistant_model.config.vocab_size
+ same_special_tokens = all(
+ getattr(model.config, token) == getattr(loaded_assistant_model.config, token)
+ for token in ("eos_token_id", "pad_token_id", "bos_token_id")
+ )
+ if same_vocab_size and same_special_tokens:
+ loaded_assistant_tokenizer = None
+ elif loaded_assistant_tokenizer is None:
+ raise ValueError(
+ "The assistant model has a different tokenizer than the main model. You should pass the assistant "
+ "tokenizer."
+ )
+
+ return loaded_assistant_model, loaded_assistant_tokenizer
+
+
+class PipelineException(Exception):
+ """
+ Raised by a [`Pipeline`] when handling __call__.
+
+ Args:
+ task (`str`): The task of the pipeline.
+ model (`str`): The model used by the pipeline.
+ reason (`str`): The error message to display.
+ """
+
+ def __init__(self, task: str, model: str, reason: str):
+ super().__init__(reason)
+
+ self.task = task
+ self.model = model
+
+
+class ArgumentHandler(ABC):
+ """
+ Base interface for handling arguments for each [`~pipelines.Pipeline`].
+ """
+
+ @abstractmethod
+ def __call__(self, *args, **kwargs):
+ raise NotImplementedError()
+
+
+class PipelineDataFormat:
+ """
+ Base class for all the pipeline-supported data formats, both for reading and writing. Supported data formats
+ currently include:
+
+ - JSON
+ - CSV
+ - stdin/stdout (pipe)
+
+ `PipelineDataFormat` also includes some utilities to work with multi-column data, like mapping from dataset columns
+ to pipeline keyword arguments through the `dataset_kwarg_1=dataset_column_1` format.
+
+ Args:
+ output_path (`str`): Where to save the outgoing data.
+ input_path (`str`): Where to look for the input data.
+ column (`str`): The column to read.
+ overwrite (`bool`, *optional*, defaults to `False`):
+ Whether or not to overwrite the `output_path`.
+ """
+
+ SUPPORTED_FORMATS = ["json", "csv", "pipe"]
+
+ def __init__(
+ self,
+ output_path: Optional[str],
+ input_path: Optional[str],
+ column: Optional[str],
+ overwrite: bool = False,
+ ):
+ self.output_path = output_path
+ self.input_path = input_path
+ self.column = column.split(",") if column is not None else [""]
+ self.is_multi_columns = len(self.column) > 1
+
+ if self.is_multi_columns:
+ self.column = [tuple(c.split("=")) if "=" in c else (c, c) for c in self.column]
+
+ if output_path is not None and not overwrite:
+ if exists(abspath(self.output_path)):
+ raise OSError(f"{self.output_path} already exists on disk")
+
+ if input_path is not None:
+ if not exists(abspath(self.input_path)):
+ raise OSError(f"{self.input_path} doesn't exist on disk")
+
+ @abstractmethod
+ def __iter__(self):
+ raise NotImplementedError()
+
+ @abstractmethod
+ def save(self, data: Union[dict, List[dict]]):
+ """
+ Save the provided data object with the representation for the current [`~pipelines.PipelineDataFormat`].
+
+ Args:
+ data (`dict` or list of `dict`): The data to store.
+ """
+ raise NotImplementedError()
+
+ def save_binary(self, data: Union[dict, List[dict]]) -> str:
+ """
+ Save the provided data object as a pickle-formatted binary data on the disk.
+
+ Args:
+ data (`dict` or list of `dict`): The data to store.
+
+ Returns:
+ `str`: Path where the data has been saved.
+ """
+ path, _ = os.path.splitext(self.output_path)
+ binary_path = os.path.extsep.join((path, "pickle"))
+
+ with open(binary_path, "wb+") as f_output:
+ pickle.dump(data, f_output)
+
+ return binary_path
+
+ @staticmethod
+ def from_str(
+ format: str,
+ output_path: Optional[str],
+ input_path: Optional[str],
+ column: Optional[str],
+ overwrite=False,
+ ) -> "PipelineDataFormat":
+ """
+ Creates an instance of the right subclass of [`~pipelines.PipelineDataFormat`] depending on `format`.
+
+ Args:
+ format (`str`):
+ The format of the desired pipeline. Acceptable values are `"json"`, `"csv"` or `"pipe"`.
+ output_path (`str`, *optional*):
+ Where to save the outgoing data.
+ input_path (`str`, *optional*):
+ Where to look for the input data.
+ column (`str`, *optional*):
+ The column to read.
+ overwrite (`bool`, *optional*, defaults to `False`):
+ Whether or not to overwrite the `output_path`.
+
+ Returns:
+ [`~pipelines.PipelineDataFormat`]: The proper data format.
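+
+ Example (illustrative sketch; the file paths are hypothetical):
+
+ ```python
+ reader = PipelineDataFormat.from_str("csv", output_path="out.csv", input_path="in.csv", column="text")
+ ```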
+ """
+ if format == "json":
+ return JsonPipelineDataFormat(output_path, input_path, column, overwrite=overwrite)
+ elif format == "csv":
+ return CsvPipelineDataFormat(output_path, input_path, column, overwrite=overwrite)
+ elif format == "pipe":
+ return PipedPipelineDataFormat(output_path, input_path, column, overwrite=overwrite)
+ else:
+ raise KeyError(f"Unknown reader {format} (Available reader are json/csv/pipe)")
+
+
+class CsvPipelineDataFormat(PipelineDataFormat):
+ """
+ Support for pipelines using CSV data format.
+
+ Args:
+ output_path (`str`): Where to save the outgoing data.
+ input_path (`str`): Where to look for the input data.
+ column (`str`): The column to read.
+ overwrite (`bool`, *optional*, defaults to `False`):
+ Whether or not to overwrite the `output_path`.
+ """
+
+ def __init__(
+ self,
+ output_path: Optional[str],
+ input_path: Optional[str],
+ column: Optional[str],
+ overwrite=False,
+ ):
+ super().__init__(output_path, input_path, column, overwrite=overwrite)
+
+ def __iter__(self):
+ with open(self.input_path, "r") as f:
+ reader = csv.DictReader(f)
+ for row in reader:
+ if self.is_multi_columns:
+ yield {k: row[c] for k, c in self.column}
+ else:
+ yield row[self.column[0]]
+
+ def save(self, data: List[dict]):
+ """
+ Save the provided data object with the representation for the current [`~pipelines.PipelineDataFormat`].
+
+ Args:
+ data (`List[dict]`): The data to store.
+ """
+ with open(self.output_path, "w") as f:
+ if len(data) > 0:
+ writer = csv.DictWriter(f, list(data[0].keys()))
+ writer.writeheader()
+ writer.writerows(data)
+
+
+class JsonPipelineDataFormat(PipelineDataFormat):
+ """
+ Support for pipelines using JSON file format.
+
+ Args:
+ output_path (`str`): Where to save the outgoing data.
+ input_path (`str`): Where to look for the input data.
+ column (`str`): The column to read.
+ overwrite (`bool`, *optional*, defaults to `False`):
+ Whether or not to overwrite the `output_path`.
+ """
+
+ def __init__(
+ self,
+ output_path: Optional[str],
+ input_path: Optional[str],
+ column: Optional[str],
+ overwrite=False,
+ ):
+ super().__init__(output_path, input_path, column, overwrite=overwrite)
+
+ with open(input_path, "r") as f:
+ self._entries = json.load(f)
+
+ def __iter__(self):
+ for entry in self._entries:
+ if self.is_multi_columns:
+ yield {k: entry[c] for k, c in self.column}
+ else:
+ yield entry[self.column[0]]
+
+ def save(self, data: dict):
+ """
+ Save the provided data object in a json file.
+
+ Args:
+ data (`dict`): The data to store.
+ """
+ with open(self.output_path, "w") as f:
+ json.dump(data, f)
+
+
+class PipedPipelineDataFormat(PipelineDataFormat):
+ """
+ Read data from piped input to the python process. For multi-column data, columns should be separated by \t
+
+ If columns are provided, then the output will be a dictionary with {column_x: value_x}
+
+ Args:
+ output_path (`str`): Where to save the outgoing data.
+ input_path (`str`): Where to look for the input data.
+ column (`str`): The column to read.
+ overwrite (`bool`, *optional*, defaults to `False`):
+ Whether or not to overwrite the `output_path`.
+ """
+
+ def __iter__(self):
+ for line in sys.stdin:
+ # Split for multi-columns
+ if "\t" in line:
+ line = line.split("\t")
+ if self.column:
+ # Dictionary to map arguments
+ yield {kwargs: l for (kwargs, _), l in zip(self.column, line)}
+ else:
+ yield tuple(line)
+
+ # No dictionary to map arguments
+ else:
+ yield line
+
+ def save(self, data: dict):
+ """
+ Print the data.
+
+ Args:
+ data (`dict`): The data to store.
+ """
+ print(data)
+
+ def save_binary(self, data: Union[dict, List[dict]]) -> str:
+ if self.output_path is None:
+ raise KeyError(
+ "Using piped input with a pipeline that outputs large objects requires an output file path. "
+ "Please provide such an output path through the --output argument."
+ )
+
+ return super().save_binary(data)
+
+
+class _ScikitCompat(ABC):
+ """
+ Interface layer for Scikit-learn and Keras compatibility.
+ """
+
+ @abstractmethod
+ def transform(self, X):
+ raise NotImplementedError()
+
+ @abstractmethod
+ def predict(self, X):
+ raise NotImplementedError()
+
+
+def build_pipeline_init_args(
+ has_tokenizer: bool = False,
+ has_feature_extractor: bool = False,
+ has_image_processor: bool = False,
+ has_processor: bool = False,
+ supports_binary_output: bool = True,
+) -> str:
+ docstring = r"""
+ Arguments:
+ model ([`PreTrainedModel`] or [`TFPreTrainedModel`]):
+ The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from
+ [`PreTrainedModel`] for PyTorch and [`TFPreTrainedModel`] for TensorFlow."""
+ if has_tokenizer:
+ docstring += r"""
+ tokenizer ([`PreTrainedTokenizer`]):
+ The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from
+ [`PreTrainedTokenizer`]."""
+ if has_feature_extractor:
+ docstring += r"""
+ feature_extractor ([`SequenceFeatureExtractor`]):
+ The feature extractor that will be used by the pipeline to encode data for the model. This object inherits from
+ [`SequenceFeatureExtractor`]."""
+ if has_image_processor:
+ docstring += r"""
+ image_processor ([`BaseImageProcessor`]):
+ The image processor that will be used by the pipeline to encode data for the model. This object inherits from
+ [`BaseImageProcessor`]."""
+ if has_processor:
+ docstring += r"""
+ processor ([`ProcessorMixin`]):
+ The processor that will be used by the pipeline to encode data for the model. This object inherits from
+ [`ProcessorMixin`]. Processor is a composite object that might contain `tokenizer`, `feature_extractor`, and
+ `image_processor`."""
+ docstring += r"""
+ modelcard (`str` or [`ModelCard`], *optional*):
+ Model card attributed to the model for this pipeline.
+ framework (`str`, *optional*):
+ The framework to use, either `"pt"` for PyTorch or `"tf"` for TensorFlow. The specified framework must be
+ installed.
+
+ If no framework is specified, will default to the one currently installed. If no framework is specified and
+ both frameworks are installed, will default to the framework of the `model`, or to PyTorch if no model is
+ provided.
+ task (`str`, defaults to `""`):
+ A task-identifier for the pipeline.
+ num_workers (`int`, *optional*, defaults to 8):
+ When the pipeline will use *DataLoader* (when passing a dataset, on GPU for a Pytorch model), the number of
+ workers to be used.
+ batch_size (`int`, *optional*, defaults to 1):
+ When the pipeline will use *DataLoader* (when passing a dataset, on GPU for a Pytorch model), the size of
+ the batch to use; for inference this is not always beneficial, please read [Batching with
+ pipelines](https://huggingface.co/transformers/main_classes/pipelines.html#pipeline-batching).
+ args_parser ([`~pipelines.ArgumentHandler`], *optional*):
+ Reference to the object in charge of parsing supplied pipeline parameters.
+ device (`int`, *optional*, defaults to -1):
+ Device ordinal for CPU/GPU support. Setting this to -1 will leverage CPU, a positive value will run the model
+ on the associated CUDA device id. You can pass a native `torch.device` or a `str` too.
+ torch_dtype (`str` or `torch.dtype`, *optional*):
+ Sent directly as `model_kwargs` (just a simpler shortcut) to use the available precision for this model
+ (`torch.float16`, `torch.bfloat16`, ... or `"auto"`)"""
+ if supports_binary_output:
+ docstring += r"""
+ binary_output (`bool`, *optional*, defaults to `False`):
+ Flag indicating whether the output of the pipeline should happen in a serialized format (i.e., pickle) or as
+ the raw output data (e.g. text).
+ return docstring
+
+
+PIPELINE_INIT_ARGS = build_pipeline_init_args(
+ has_tokenizer=True,
+ has_feature_extractor=True,
+ has_image_processor=True,
+ has_processor=True,
+ supports_binary_output=True,
+)
+
+
+if is_torch_available():
+ from transformers.pipelines.pt_utils import (
+ PipelineChunkIterator,
+ PipelineDataset,
+ PipelineIterator,
+ PipelinePackIterator,
+ )
+
+
+@add_end_docstrings(
+ build_pipeline_init_args(
+ has_tokenizer=True, has_feature_extractor=True, has_image_processor=True, has_processor=True
+ )
+)
+class Pipeline(_ScikitCompat, PushToHubMixin):
+ """
+ The Pipeline class is the class from which all pipelines inherit. Refer to this class for methods shared across
+ different pipelines.
+
+ Base class implementing pipelined operations. Pipeline workflow is defined as a sequence of the following
+ operations:
+
+ Input -> Tokenization -> Model Inference -> Post-Processing (task dependent) -> Output
+
+ Pipeline supports running on CPU or GPU through the device argument (see below).
+
+ Some pipelines, like [`FeatureExtractionPipeline`] (`'feature-extraction'`), output large tensor objects
+ as nested lists. In order to avoid dumping such large structures as textual data we provide the `binary_output`
+ constructor argument. If set to `True`, the output will be stored in the pickle format.
+ """
+
+ # Historically we have pipelines working with `tokenizer`, `feature_extractor`, and `image_processor`
+ # as separate processing components. While we have `processor` class that combines them, some pipelines
+ # might still operate with these components separately.
+ # With the addition of `processor` to `pipeline`, we want to avoid:
+ # - loading `processor` for pipelines that still work with `image_processor` and `tokenizer` separately;
+ # - loading `image_processor`/`tokenizer` as a separate component while we operate only with `processor`,
+ # because `processor` will load required sub-components by itself.
+ # The flags below allow granular control over loading components and are set to be backward compatible with the
+ # current pipelines logic. You may override these flags when creating your pipeline. For example, for
+ # `zero-shot-object-detection` pipeline which operates with `processor` you should set `_load_processor=True`
+ # and all the rest flags to `False` to avoid unnecessary loading of the components.
+ _load_processor = False
+ _load_image_processor = True
+ _load_feature_extractor = True
+ _load_tokenizer = True
+
+ default_input_names = None
+
+ def __init__(
+ self,
+ model: Union["PreTrainedModel", "TFPreTrainedModel"],
+ tokenizer: Optional[PreTrainedTokenizer] = None,
+ feature_extractor: Optional[PreTrainedFeatureExtractor] = None,
+ image_processor: Optional[BaseImageProcessor] = None,
+ processor: Optional[ProcessorMixin] = None,
+ modelcard: Optional[ModelCard] = None,
+ framework: Optional[str] = None,
+ task: str = "",
+ args_parser: ArgumentHandler = None,
+ device: Union[int, "torch.device"] = None,
+ torch_dtype: Optional[Union[str, "torch.dtype"]] = None,
+ binary_output: bool = False,
+ **kwargs,
+ ):
+ if framework is None:
+ framework, model = infer_framework_load_model(model, config=model.config)
+
+ self.task = task
+ self.model = model
+ self.tokenizer = tokenizer
+ self.feature_extractor = feature_extractor
+ self.image_processor = image_processor
+ self.processor = processor
+ self.modelcard = modelcard
+ self.framework = framework
+
+ # `accelerate` device map
+ hf_device_map = getattr(self.model, "hf_device_map", None)
+
+ if hf_device_map is not None and device is not None:
+ raise ValueError(
+ "The model has been loaded with `accelerate` and therefore cannot be moved to a specific device. Please "
+ "discard the `device` argument when creating your pipeline object."
+ )
+
+ if device is None:
+ if hf_device_map is not None:
+ # Take the first device used by `accelerate`.
+ device = next(iter(hf_device_map.values()))
+ else:
+ device = 0
+
+ if is_torch_available() and self.framework == "pt":
+ if device == -1 and self.model.device is not None:
+ device = self.model.device
+ if isinstance(device, torch.device):
+ if device.type == "xpu" and not is_torch_xpu_available(check_device=True):
+ raise ValueError(f'{device} is not available, you should use device="cpu" instead')
+ self.device = device
+ elif isinstance(device, str):
+ if "xpu" in device and not is_torch_xpu_available(check_device=True):
+ raise ValueError(f'{device} is not available, you should use device="cpu" instead')
+ self.device = torch.device(device)
+ elif device < 0:
+ self.device = torch.device("cpu")
+ elif is_torch_mlu_available():
+ self.device = torch.device(f"mlu:{device}")
+ elif is_torch_musa_available():
+ self.device = torch.device(f"musa:{device}")
+ elif is_torch_cuda_available():
+ self.device = torch.device(f"cuda:{device}")
+ elif is_torch_npu_available():
+ self.device = torch.device(f"npu:{device}")
+ elif is_torch_xpu_available(check_device=True):
+ self.device = torch.device(f"xpu:{device}")
+ elif is_torch_mps_available():
+ self.device = torch.device(f"mps:{device}")
+ else:
+ self.device = torch.device("cpu")
+ else:
+ self.device = device if device is not None else -1
+
+ logger.warning(f"Device set to use {self.device}")
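+ # Editor's note (illustrative, summarizing the branches above): `device` may be an
+ # `int` (-1 for CPU, >= 0 for an accelerator ordinal), a `str` such as "cpu" or
+ # "cuda:0", or a `torch.device`; e.g. pipeline(..., device="cuda:0") and
+ # pipeline(..., device=0) end up on the same CUDA device when CUDA is available.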
+
+ self.binary_output = binary_output
+ # We shouldn't call `model.to()` for models loaded with accelerate as well as the case that model is already on device
+ if (
+ self.framework == "pt"
+ and self.model.device != self.device
+ and not (isinstance(self.device, int) and self.device < 0)
+ and hf_device_map is None
+ ):
+ self.model.to(self.device)
+
+ # If the model can generate:
+ # 1 - create a local generation config. This is done to avoid side-effects on the model as we apply local
+ # tweaks to the generation config.
+ # 2 - load the assistant model if it is passed.
+ self.assistant_model, self.assistant_tokenizer = load_assistant_model(
+ self.model, kwargs.pop("assistant_model", None), kwargs.pop("assistant_tokenizer", None)
+ )
+ if self.model.can_generate():
+ self.prefix = self.model.config.prefix if hasattr(self.model.config, "prefix") else None
+ self.generation_config = copy.deepcopy(self.model.generation_config)
+ # Update the generation config with task specific params if they exist
+ # NOTE: `prefix` is pipeline-specific and doesn't exist in the generation config.
+ task_specific_params = self.model.config.task_specific_params
+ if task_specific_params is not None and task in task_specific_params:
+ this_task_params = task_specific_params.get(task)
+ if "prefix" in this_task_params:
+ self.prefix = this_task_params.pop("prefix")
+ self.generation_config.update(**this_task_params)
+ # If the tokenizer has a pad token but the model doesn't, set it so that `generate` is aware of it.
+ if (
+ self.tokenizer is not None
+ and self.tokenizer.pad_token_id is not None
+ and self.generation_config.pad_token_id is None
+ ):
+ self.generation_config.pad_token_id = self.tokenizer.pad_token_id
+
+ self.call_count = 0
+ self._batch_size = kwargs.pop("batch_size", None)
+ self._num_workers = kwargs.pop("num_workers", None)
+ self._preprocess_params, self._forward_params, self._postprocess_params = self._sanitize_parameters(**kwargs)
+
+ # In processor only mode, we can get the modality processors from the processor
+ if self.processor is not None and all(
+ [self.tokenizer is None, self.feature_extractor is None, self.image_processor is None]
+ ):
+ self.tokenizer = getattr(self.processor, "tokenizer", None)
+ self.feature_extractor = getattr(self.processor, "feature_extractor", None)
+ self.image_processor = getattr(self.processor, "image_processor", None)
+
+ if self.image_processor is None and self.feature_extractor is not None:
+ if isinstance(self.feature_extractor, BaseImageProcessor):
+ # Backward compatible change, if users called
+ # ImageSegmentationPipeline(.., feature_extractor=MyFeatureExtractor())
+ # then we should keep working
+ self.image_processor = self.feature_extractor
+
+ def save_pretrained(
+ self,
+ save_directory: Union[str, os.PathLike],
+ safe_serialization: bool = True,
+ **kwargs,
+ ):
+ """
+ Save the pipeline's model and tokenizer.
+
+ Args:
+ save_directory (`str` or `os.PathLike`):
+ A path to the directory where the pipeline will be saved. It will be created if it doesn't exist.
+ safe_serialization (`bool`):
+ Whether to save the model using `safetensors` or the traditional way for PyTorch or TensorFlow.
+ kwargs (`Dict[str, Any]`, *optional*):
+ Additional keyword arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
+ """
+ use_auth_token = kwargs.pop("use_auth_token", None)
+
+ if use_auth_token is not None:
+ warnings.warn(
+ "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
+ FutureWarning,
+ )
+ if kwargs.get("token", None) is not None:
+ raise ValueError(
+ "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
+ )
+ kwargs["token"] = use_auth_token
+
+ if os.path.isfile(save_directory):
+ logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
+ return
+ os.makedirs(save_directory, exist_ok=True)
+
+ if hasattr(self, "_registered_impl"):
+ # Add info to the config
+ pipeline_info = self._registered_impl.copy()
+ custom_pipelines = {}
+ for task, info in pipeline_info.items():
+ if info["impl"] != self.__class__:
+ continue
+
+ info = info.copy()
+ module_name = info["impl"].__module__
+ last_module = module_name.split(".")[-1]
+ # Change classes into their names/full names
+ info["impl"] = f"{last_module}.{info['impl'].__name__}"
+ info["pt"] = tuple(c.__name__ for c in info["pt"])
+ info["tf"] = tuple(c.__name__ for c in info["tf"])
+
+ custom_pipelines[task] = info
+ self.model.config.custom_pipelines = custom_pipelines
+ # Save the pipeline custom code
+ custom_object_save(self, save_directory)
+
+ kwargs["safe_serialization"] = safe_serialization
+ self.model.save_pretrained(save_directory, **kwargs)
+
+ if self.tokenizer is not None:
+ self.tokenizer.save_pretrained(save_directory, **kwargs)
+
+ if self.feature_extractor is not None:
+ self.feature_extractor.save_pretrained(save_directory, **kwargs)
+
+ if self.image_processor is not None:
+ self.image_processor.save_pretrained(save_directory, **kwargs)
+
+ if self.modelcard is not None:
+ self.modelcard.save_pretrained(save_directory)
+
+ def transform(self, X):
+ """
+ Scikit / Keras interface to transformers' pipelines. This method will forward to __call__().
+ """
+ return self(X)
+
+ def predict(self, X):
+ """
+ Scikit / Keras interface to transformers' pipelines. This method will forward to __call__().
+ """
+ return self(X)
+
+ @property
+ def torch_dtype(self) -> Optional["torch.dtype"]:
+ """
+ Torch dtype of the model (if it's Pytorch model), `None` otherwise.
+ """
+ return getattr(self.model, "dtype", None)
+
+ @contextmanager
+ def device_placement(self):
+ """
+ Context Manager allowing tensor allocation on the user-specified device in a framework-agnostic way.
+
+ Returns:
+ Context manager
+
+ Examples:
+
+ ```python
+ # Explicitly ask for tensor allocation on CUDA device :0
+ pipe = pipeline(..., device=0)
+ with pipe.device_placement():
+ # Every framework specific tensor allocation will be done on the request device
+ output = pipe(...)
+ ```"""
+ if self.framework == "tf":
+ with tf.device("/CPU:0" if self.device == -1 else f"/device:GPU:{self.device}"):
+ yield
+ else:
+ if self.device.type == "cuda":
+ with torch.cuda.device(self.device):
+ yield
+ elif self.device.type == "mlu":
+ with torch.mlu.device(self.device):
+ yield
+ elif self.device.type == "musa":
+ with torch.musa.device(self.device):
+ yield
+ else:
+ yield
+
+ def ensure_tensor_on_device(self, **inputs):
+ """
+ Ensure PyTorch tensors are on the specified device.
+
+ Args:
+ inputs (keyword arguments that should be `torch.Tensor`, the rest is ignored):
+ The tensors to place on `self.device`.
+ Recursive on lists **only**.
+
+ Return:
+ `Dict[str, torch.Tensor]`: The same as `inputs` but on the proper device.
+ """
+ return self._ensure_tensor_on_device(inputs, self.device)
+
+ def _ensure_tensor_on_device(self, inputs, device):
+ if isinstance(inputs, ModelOutput):
+ return ModelOutput(
+ {name: self._ensure_tensor_on_device(tensor, device) for name, tensor in inputs.items()}
+ )
+ elif isinstance(inputs, dict):
+ return {name: self._ensure_tensor_on_device(tensor, device) for name, tensor in inputs.items()}
+ elif isinstance(inputs, UserDict):
+ return UserDict({name: self._ensure_tensor_on_device(tensor, device) for name, tensor in inputs.items()})
+ elif isinstance(inputs, list):
+ return [self._ensure_tensor_on_device(item, device) for item in inputs]
+ elif isinstance(inputs, tuple):
+ return tuple([self._ensure_tensor_on_device(item, device) for item in inputs])
+ elif isinstance(inputs, torch.Tensor):
+ return inputs.to(device)
+ else:
+ return inputs
+
+ def check_model_type(self, supported_models: Union[List[str], dict]):
+ """
+ Check if the model class is supported by the pipeline.
+
+ Args:
+ supported_models (`List[str]` or `dict`):
+ The list of models supported by the pipeline, or a dictionary with model class values.
+ """
+ if not isinstance(supported_models, list): # Create from a model mapping
+ supported_models_names = []
+ for _, model_name in supported_models.items():
+ # Mapping can now contain tuples of models for the same configuration.
+ if isinstance(model_name, tuple):
+ supported_models_names.extend(list(model_name))
+ else:
+ supported_models_names.append(model_name)
+ if hasattr(supported_models, "_model_mapping"):
+ for _, model in supported_models._model_mapping._extra_content.items():
+ if isinstance(model_name, tuple):
+ supported_models_names.extend([m.__name__ for m in model])
+ else:
+ supported_models_names.append(model.__name__)
+ supported_models = supported_models_names
+ if self.model.__class__.__name__ not in supported_models:
+ logger.error(
+ f"The model '{self.model.__class__.__name__}' is not supported for {self.task}. Supported models are"
+ f" {supported_models}."
+ )
+
+ @abstractmethod
+ def _sanitize_parameters(self, **pipeline_parameters):
+ """
+ _sanitize_parameters will be called with any excess named arguments from either the `__init__` or `__call__`
+ methods. It should return 3 dictionaries of the resolved parameters used by the various `preprocess`,
+ `forward` and `postprocess` methods. Do not fill the dictionaries if the caller didn't specify the kwargs. This
+ lets you keep defaults in function signatures, which is more "natural".
+
+ It is not meant to be called directly, it will be automatically called and the final parameters resolved by
+ `__init__` and `__call__`
+ """
+ raise NotImplementedError("_sanitize_parameters not implemented")
+
+ @abstractmethod
+ def preprocess(self, input_: Any, **preprocess_parameters: Dict) -> Dict[str, GenericTensor]:
+ """
+ Preprocess will take the `input_` of a specific pipeline and return a dictionary of everything necessary for
+ `_forward` to run properly. It should contain at least one tensor, but might have arbitrary other items.
+ """
+ raise NotImplementedError("preprocess not implemented")
+
+ @abstractmethod
+ def _forward(self, input_tensors: Dict[str, GenericTensor], **forward_parameters: Dict) -> ModelOutput:
+ """
+ _forward will receive the prepared dictionary from `preprocess` and run it on the model. This method might
+ involve the GPU or the CPU and should be agnostic to it. Isolating this function is the reason for `preprocess`
+ and `postprocess` to exist, so that the hot path (this method) can generally run as fast as possible.
+
+ It is not meant to be called directly, `forward` is preferred. It is basically the same but contains additional
+ code surrounding `_forward` making sure tensors and models are on the same device, disabling the training part
+ of the code (leading to faster inference).
+ """
+ raise NotImplementedError("_forward not implemented")
+
+ @abstractmethod
+ def postprocess(self, model_outputs: ModelOutput, **postprocess_parameters: Dict) -> Any:
+ """
+ Postprocess will receive the raw outputs of the `_forward` method, generally tensors, and reformat them into
+ something more friendly. Generally it will output a list or a dict or results (containing just strings and
+ numbers).
+ """
+ raise NotImplementedError("postprocess not implemented")
+
+ def get_inference_context(self):
+ return torch.no_grad
+
+ def forward(self, model_inputs, **forward_params):
+ with self.device_placement():
+ if self.framework == "tf":
+ model_inputs["training"] = False
+ model_outputs = self._forward(model_inputs, **forward_params)
+ elif self.framework == "pt":
+ inference_context = self.get_inference_context()
+ with inference_context():
+ model_inputs = self._ensure_tensor_on_device(model_inputs, device=self.device)
+ model_outputs = self._forward(model_inputs, **forward_params)
+ model_outputs = self._ensure_tensor_on_device(model_outputs, device=torch.device("cpu"))
+ else:
+ raise ValueError(f"Framework {self.framework} is not supported")
+ return model_outputs
+
+ def get_iterator(
+ self, inputs, num_workers: int, batch_size: int, preprocess_params, forward_params, postprocess_params
+ ):
+ if isinstance(inputs, collections.abc.Sized):
+ dataset = PipelineDataset(inputs, self.preprocess, preprocess_params)
+ else:
+ if num_workers > 1:
+ logger.warning(
+ "For iterable dataset using num_workers>1 is likely to result"
+ " in errors since everything is iterable, setting `num_workers=1`"
+ " to guarantee correctness."
+ )
+ num_workers = 1
+ dataset = PipelineIterator(inputs, self.preprocess, preprocess_params)
+ if "TOKENIZERS_PARALLELISM" not in os.environ:
+ logger.info("Disabling tokenizer parallelism, we're using DataLoader multithreading already")
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
+ # TODO hack by collating feature_extractor and image_processor
+ feature_extractor = self.feature_extractor if self.feature_extractor is not None else self.image_processor
+ collate_fn = no_collate_fn if batch_size == 1 else pad_collate_fn(self.tokenizer, feature_extractor)
+ dataloader = DataLoader(dataset, num_workers=num_workers, batch_size=batch_size, collate_fn=collate_fn)
+ model_iterator = PipelineIterator(dataloader, self.forward, forward_params, loader_batch_size=batch_size)
+ final_iterator = PipelineIterator(model_iterator, self.postprocess, postprocess_params)
+ return final_iterator
+
+ def __call__(self, inputs, *args, num_workers=None, batch_size=None, **kwargs):
+ if args:
+ logger.warning(f"Ignoring args : {args}")
+
+ if num_workers is None:
+ if self._num_workers is None:
+ num_workers = 0
+ else:
+ num_workers = self._num_workers
+ if batch_size is None:
+ if self._batch_size is None:
+ batch_size = 1
+ else:
+ batch_size = self._batch_size
+
+ preprocess_params, forward_params, postprocess_params = self._sanitize_parameters(**kwargs)
+
+ # Fuse __init__ params and __call__ params without modifying the __init__ ones.
+ preprocess_params = {**self._preprocess_params, **preprocess_params}
+ forward_params = {**self._forward_params, **forward_params}
+ postprocess_params = {**self._postprocess_params, **postprocess_params}
+
+ self.call_count += 1
+ if self.call_count > 10 and self.framework == "pt" and self.device.type == "cuda":
+ logger.warning_once(
+ "You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a"
+ " dataset",
+ )
+
+ is_dataset = Dataset is not None and isinstance(inputs, Dataset)
+ is_generator = isinstance(inputs, types.GeneratorType)
+ is_list = isinstance(inputs, list)
+
+ is_iterable = is_dataset or is_generator or is_list
+
+ # TODO make the get_iterator work also for `tf` (and `flax`).
+ can_use_iterator = self.framework == "pt" and (is_dataset or is_generator or is_list)
+
+ if is_list:
+ if can_use_iterator:
+ final_iterator = self.get_iterator(
+ inputs, num_workers, batch_size, preprocess_params, forward_params, postprocess_params
+ )
+ outputs = list(final_iterator)
+ return outputs
+ else:
+ return self.run_multi(inputs, preprocess_params, forward_params, postprocess_params)
+ elif can_use_iterator:
+ return self.get_iterator(
+ inputs, num_workers, batch_size, preprocess_params, forward_params, postprocess_params
+ )
+ elif is_iterable:
+ return self.iterate(inputs, preprocess_params, forward_params, postprocess_params)
+ elif self.framework == "pt" and isinstance(self, ChunkPipeline):
+ return next(
+ iter(
+ self.get_iterator(
+ [inputs], num_workers, batch_size, preprocess_params, forward_params, postprocess_params
+ )
+ )
+ )
+ else:
+ return self.run_single(inputs, preprocess_params, forward_params, postprocess_params)
+
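+ # Editor's sketch (illustrative usage, not part of the library; `pipe` and
+ # `list_of_texts` are hypothetical): when a list or a dataset is passed with the
+ # PyTorch framework, `__call__` builds a DataLoader-backed iterator, e.g.
+ #
+ #     outputs = pipe(list_of_texts, batch_size=8, num_workers=2)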
+ def run_multi(self, inputs, preprocess_params, forward_params, postprocess_params):
+ return [self.run_single(item, preprocess_params, forward_params, postprocess_params) for item in inputs]
+
+ def run_single(self, inputs, preprocess_params, forward_params, postprocess_params):
+ model_inputs = self.preprocess(inputs, **preprocess_params)
+ model_outputs = self.forward(model_inputs, **forward_params)
+ outputs = self.postprocess(model_outputs, **postprocess_params)
+ return outputs
+
+ def iterate(self, inputs, preprocess_params, forward_params, postprocess_params):
+ # This function should become `get_iterator` again, this is a temporary
+ # easy solution.
+ for input_ in inputs:
+ yield self.run_single(input_, preprocess_params, forward_params, postprocess_params)
+
+
+Pipeline.push_to_hub = copy_func(Pipeline.push_to_hub)
+if Pipeline.push_to_hub.__doc__ is not None:
+ Pipeline.push_to_hub.__doc__ = Pipeline.push_to_hub.__doc__.format(
+ object="pipe", object_class="pipeline", object_files="pipeline file"
+ ).replace(".from_pretrained", "")
+
+
+class ChunkPipeline(Pipeline):
+ def run_single(self, inputs, preprocess_params, forward_params, postprocess_params):
+ all_outputs = []
+ for model_inputs in self.preprocess(inputs, **preprocess_params):
+ model_outputs = self.forward(model_inputs, **forward_params)
+ all_outputs.append(model_outputs)
+ outputs = self.postprocess(all_outputs, **postprocess_params)
+ return outputs
+
+ def get_iterator(
+ self, inputs, num_workers: int, batch_size: int, preprocess_params, forward_params, postprocess_params
+ ):
+ if "TOKENIZERS_PARALLELISM" not in os.environ:
+ logger.info("Disabling tokenizer parallelism, we're using DataLoader multithreading already")
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
+ if num_workers > 1:
+ logger.warning(
+ "For ChunkPipeline using num_workers>0 is likely to result in errors since everything is iterable,"
+ " setting `num_workers=1` to guarantee correctness."
+ )
+ num_workers = 1
+ dataset = PipelineChunkIterator(inputs, self.preprocess, preprocess_params)
+
+ # TODO hack by collating feature_extractor and image_processor
+ feature_extractor = self.feature_extractor if self.feature_extractor is not None else self.image_processor
+ collate_fn = no_collate_fn if batch_size == 1 else pad_collate_fn(self.tokenizer, feature_extractor)
+ dataloader = DataLoader(dataset, num_workers=num_workers, batch_size=batch_size, collate_fn=collate_fn)
+ model_iterator = PipelinePackIterator(dataloader, self.forward, forward_params, loader_batch_size=batch_size)
+ final_iterator = PipelineIterator(model_iterator, self.postprocess, postprocess_params)
+ return final_iterator
+
+
+class PipelineRegistry:
+ def __init__(self, supported_tasks: Dict[str, Any], task_aliases: Dict[str, str]) -> None:
+ self.supported_tasks = supported_tasks
+ self.task_aliases = task_aliases
+
+ def get_supported_tasks(self) -> List[str]:
+ supported_task = list(self.supported_tasks.keys()) + list(self.task_aliases.keys())
+ supported_task.sort()
+ return supported_task
+
+ def check_task(self, task: str) -> Tuple[str, Dict, Any]:
+ if task in self.task_aliases:
+ task = self.task_aliases[task]
+ if task in self.supported_tasks:
+ targeted_task = self.supported_tasks[task]
+ return task, targeted_task, None
+
+ if task.startswith("translation"):
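+            # e.g. "translation_en_to_fr" resolves to the "translation" task with ("en", "fr") as options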
+ tokens = task.split("_")
+ if len(tokens) == 4 and tokens[0] == "translation" and tokens[2] == "to":
+ targeted_task = self.supported_tasks["translation"]
+ task = "translation"
+ return task, targeted_task, (tokens[1], tokens[3])
+ raise KeyError(f"Invalid translation task {task}, use 'translation_XX_to_YY' format")
+
+ raise KeyError(
+ f"Unknown task {task}, available tasks are {self.get_supported_tasks() + ['translation_XX_to_YY']}"
+ )
+
+ def register_pipeline(
+ self,
+ task: str,
+ pipeline_class: type,
+ pt_model: Optional[Union[type, Tuple[type]]] = None,
+ tf_model: Optional[Union[type, Tuple[type]]] = None,
+ default: Optional[Dict] = None,
+ type: Optional[str] = None,
+ ) -> None:
+ if task in self.supported_tasks:
+ logger.warning(f"{task} is already registered. Overwriting pipeline for task {task}...")
+
+ if pt_model is None:
+ pt_model = ()
+ elif not isinstance(pt_model, tuple):
+ pt_model = (pt_model,)
+
+ if tf_model is None:
+ tf_model = ()
+ elif not isinstance(tf_model, tuple):
+ tf_model = (tf_model,)
+
+ task_impl = {"impl": pipeline_class, "pt": pt_model, "tf": tf_model}
+
+ if default is not None:
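+            # Allow shorthand defaults such as {"pt": (...), "tf": (...)} by nesting them under "model".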
+ if "model" not in default and ("pt" in default or "tf" in default):
+ default = {"model": default}
+ task_impl["default"] = default
+
+ if type is not None:
+ task_impl["type"] = type
+
+ self.supported_tasks[task] = task_impl
+ pipeline_class._registered_impl = {task: task_impl}
+
+ def to_dict(self):
+ return self.supported_tasks
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/depth_estimation.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/depth_estimation.py
new file mode 100644
index 0000000000000000000000000000000000000000..2203ac09c9cf9b6e9a51055c60678f5266ddd439
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/depth_estimation.py
@@ -0,0 +1,133 @@
+from typing import List, Union
+
+from ..utils import (
+ add_end_docstrings,
+ is_torch_available,
+ is_vision_available,
+ logging,
+ requires_backends,
+)
+from .base import Pipeline, build_pipeline_init_args
+
+
+if is_vision_available():
+ from PIL import Image
+
+ from ..image_utils import load_image
+
+if is_torch_available():
+ from ..models.auto.modeling_auto import MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES
+
+logger = logging.get_logger(__name__)
+
+
+@add_end_docstrings(build_pipeline_init_args(has_image_processor=True))
+class DepthEstimationPipeline(Pipeline):
+ """
+ Depth estimation pipeline using any `AutoModelForDepthEstimation`. This pipeline predicts the depth of an image.
+
+ Example:
+
+ ```python
+ >>> from transformers import pipeline
+
+ >>> depth_estimator = pipeline(task="depth-estimation", model="LiheYoung/depth-anything-base-hf")
+ >>> output = depth_estimator("http://images.cocodataset.org/val2017/000000039769.jpg")
+ >>> # This is a tensor with the values being the depth expressed in meters for each pixel
+ >>> output["predicted_depth"].shape
+ torch.Size([1, 384, 384])
+ ```
+
+ Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)
+
+
+ This depth estimation pipeline can currently be loaded from [`pipeline`] using the following task identifier:
+ `"depth-estimation"`.
+
+ See the list of available models on [huggingface.co/models](https://huggingface.co/models?filter=depth-estimation).
+ """
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ requires_backends(self, "vision")
+ self.check_model_type(MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES)
+
+ def __call__(self, inputs: Union[str, List[str], "Image.Image", List["Image.Image"]] = None, **kwargs):
+ """
+ Predict the depth(s) of the image(s) passed as inputs.
+
+ Args:
+ inputs (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`):
+ The pipeline handles three types of images:
+
+ - A string containing a http link pointing to an image
+ - A string containing a local path to an image
+ - An image loaded in PIL directly
+
+                The pipeline accepts either a single image or a batch of images, which must then be passed as a list.
+ Images in a batch must all be in the same format: all as http links, all as local paths, or all as PIL
+ images.
+ parameters (`Dict`, *optional*):
+ A dictionary of argument names to parameter values, to control pipeline behaviour.
+ The only parameter available right now is `timeout`, which is the length of time, in seconds,
+ that the pipeline should wait before giving up on trying to download an image.
+ timeout (`float`, *optional*, defaults to None):
+ The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
+ the call may block forever.
+
+ Return:
+ A dictionary or a list of dictionaries containing result. If the input is a single image, will return a
+ dictionary, if the input is a list of several images, will return a list of dictionaries corresponding to
+ the images.
+
+ The dictionaries contain the following keys:
+
+ - **predicted_depth** (`torch.Tensor`) -- The predicted depth by the model as a `torch.Tensor`.
+ - **depth** (`PIL.Image`) -- The predicted depth by the model as a `PIL.Image`.
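+
+        Example of setting a download timeout (a minimal sketch; the two calls below are equivalent
+        ways of passing the same value):
+
+        ```python
+        >>> from transformers import pipeline
+
+        >>> depth_estimator = pipeline(task="depth-estimation", model="LiheYoung/depth-anything-base-hf")
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> _ = depth_estimator(url, timeout=10.0)
+        >>> _ = depth_estimator(url, parameters={"timeout": 10.0})
+        ```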
+ """
+ # After deprecation of this is completed, remove the default `None` value for `images`
+ if "images" in kwargs:
+ inputs = kwargs.pop("images")
+ if inputs is None:
+ raise ValueError("Cannot call the depth-estimation pipeline without an inputs argument!")
+ return super().__call__(inputs, **kwargs)
+
+ def _sanitize_parameters(self, timeout=None, parameters=None, **kwargs):
+ preprocess_params = {}
+ if timeout is not None:
+ preprocess_params["timeout"] = timeout
+ if isinstance(parameters, dict) and "timeout" in parameters:
+ preprocess_params["timeout"] = parameters["timeout"]
+ return preprocess_params, {}, {}
+
+ def preprocess(self, image, timeout=None):
+ image = load_image(image, timeout)
+ model_inputs = self.image_processor(images=image, return_tensors=self.framework)
+ if self.framework == "pt":
+ model_inputs = model_inputs.to(self.torch_dtype)
+ model_inputs["target_size"] = image.size[::-1]
+ return model_inputs
+
+ def _forward(self, model_inputs):
+ target_size = model_inputs.pop("target_size")
+ model_outputs = self.model(**model_inputs)
+ model_outputs["target_size"] = target_size
+ return model_outputs
+
+ def postprocess(self, model_outputs):
+ outputs = self.image_processor.post_process_depth_estimation(
+ model_outputs,
+ # this acts as `source_sizes` for ZoeDepth and as `target_sizes` for the rest of the models so do *not*
+ # replace with `target_sizes = [model_outputs["target_size"]]`
+ [model_outputs["target_size"]],
+ )
+
+ formatted_outputs = []
+ for output in outputs:
+ depth = output["predicted_depth"].detach().cpu().numpy()
+ depth = (depth - depth.min()) / (depth.max() - depth.min())
+ depth = Image.fromarray((depth * 255).astype("uint8"))
+
+ formatted_outputs.append({"predicted_depth": output["predicted_depth"], "depth": depth})
+
+ return formatted_outputs[0] if len(outputs) == 1 else formatted_outputs
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/document_question_answering.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/document_question_answering.py
new file mode 100644
index 0000000000000000000000000000000000000000..c176d841e29fa6c6bb8c6867562f985d181c7138
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/document_question_answering.py
@@ -0,0 +1,516 @@
+# Copyright 2022 The Impira Team and the HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import re
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+
+from ..utils import (
+ ExplicitEnum,
+ add_end_docstrings,
+ is_pytesseract_available,
+ is_torch_available,
+ is_vision_available,
+ logging,
+)
+from .base import ChunkPipeline, build_pipeline_init_args
+from .question_answering import select_starts_ends
+
+
+if is_vision_available():
+ from PIL import Image
+
+ from ..image_utils import load_image
+
+if is_torch_available():
+ import torch
+
+ from ..models.auto.modeling_auto import MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES
+
+TESSERACT_LOADED = False
+if is_pytesseract_available():
+ TESSERACT_LOADED = True
+ import pytesseract
+
+logger = logging.get_logger(__name__)
+
+
+# normalize_bbox() and apply_tesseract() are derived from apply_tesseract in models/layoutlmv3/feature_extraction_layoutlmv3.py.
+# However, because the pipeline may evolve from what layoutlmv3 currently does, it's copied (vs. imported) to avoid creating an
+# unnecessary dependency.
+def normalize_box(box, width, height):
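+    # Rescale absolute pixel coordinates to the 0-1000 integer grid used by LayoutLM-style models,
+    # making the boxes independent of the original image resolution.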
+ return [
+ int(1000 * (box[0] / width)),
+ int(1000 * (box[1] / height)),
+ int(1000 * (box[2] / width)),
+ int(1000 * (box[3] / height)),
+ ]
+
+
+def apply_tesseract(image: "Image.Image", lang: Optional[str], tesseract_config: Optional[str]):
+ """Applies Tesseract OCR on a document image, and returns recognized words + normalized bounding boxes."""
+ # apply OCR
+ data = pytesseract.image_to_data(image, lang=lang, output_type="dict", config=tesseract_config)
+ words, left, top, width, height = data["text"], data["left"], data["top"], data["width"], data["height"]
+
+ # filter empty words and corresponding coordinates
+ irrelevant_indices = [idx for idx, word in enumerate(words) if not word.strip()]
+ words = [word for idx, word in enumerate(words) if idx not in irrelevant_indices]
+ left = [coord for idx, coord in enumerate(left) if idx not in irrelevant_indices]
+ top = [coord for idx, coord in enumerate(top) if idx not in irrelevant_indices]
+ width = [coord for idx, coord in enumerate(width) if idx not in irrelevant_indices]
+ height = [coord for idx, coord in enumerate(height) if idx not in irrelevant_indices]
+
+ # turn coordinates into (left, top, left+width, top+height) format
+ actual_boxes = []
+ for x, y, w, h in zip(left, top, width, height):
+ actual_box = [x, y, x + w, y + h]
+ actual_boxes.append(actual_box)
+
+ image_width, image_height = image.size
+
+ # finally, normalize the bounding boxes
+ normalized_boxes = []
+ for box in actual_boxes:
+ normalized_boxes.append(normalize_box(box, image_width, image_height))
+
+ if len(words) != len(normalized_boxes):
+ raise ValueError("Not as many words as there are bounding boxes")
+
+ return words, normalized_boxes
+
+
+class ModelType(ExplicitEnum):
+ LayoutLM = "layoutlm"
+ LayoutLMv2andv3 = "layoutlmv2andv3"
+ VisionEncoderDecoder = "vision_encoder_decoder"
+
+
+@add_end_docstrings(build_pipeline_init_args(has_image_processor=True, has_tokenizer=True))
+class DocumentQuestionAnsweringPipeline(ChunkPipeline):
+ # TODO: Update task_summary docs to include an example with document QA and then update the first sentence
+ """
+ Document Question Answering pipeline using any `AutoModelForDocumentQuestionAnswering`. The inputs/outputs are
+ similar to the (extractive) question answering pipeline; however, the pipeline takes an image (and optional OCR'd
+ words/boxes) as input instead of text context.
+
+ Example:
+
+ ```python
+ >>> from transformers import pipeline
+
+ >>> document_qa = pipeline(model="impira/layoutlm-document-qa")
+ >>> document_qa(
+ ... image="https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/invoice.png",
+ ... question="What is the invoice number?",
+ ... )
+ [{'score': 0.425, 'answer': 'us-001', 'start': 16, 'end': 16}]
+ ```
+
+ Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)
+
+ This document question answering pipeline can currently be loaded from [`pipeline`] using the following task
+ identifier: `"document-question-answering"`.
+
+ The models that this pipeline can use are models that have been fine-tuned on a document question answering task.
+ See the up-to-date list of available models on
+ [huggingface.co/models](https://huggingface.co/models?filter=document-question-answering).
+ """
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ if self.tokenizer is not None and not self.tokenizer.__class__.__name__.endswith("Fast"):
+ raise ValueError(
+ "`DocumentQuestionAnsweringPipeline` requires a fast tokenizer, but a slow tokenizer "
+ f"(`{self.tokenizer.__class__.__name__}`) is provided."
+ )
+
+ if self.model.config.__class__.__name__ == "VisionEncoderDecoderConfig":
+ self.model_type = ModelType.VisionEncoderDecoder
+ if self.model.config.encoder.model_type != "donut-swin":
+ raise ValueError("Currently, the only supported VisionEncoderDecoder model is Donut")
+ else:
+ self.check_model_type(MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES)
+ if self.model.config.__class__.__name__ == "LayoutLMConfig":
+ self.model_type = ModelType.LayoutLM
+ else:
+ self.model_type = ModelType.LayoutLMv2andv3
+
+ def _sanitize_parameters(
+ self,
+ padding=None,
+ doc_stride=None,
+ max_question_len=None,
+ lang: Optional[str] = None,
+ tesseract_config: Optional[str] = None,
+ max_answer_len=None,
+ max_seq_len=None,
+ top_k=None,
+ handle_impossible_answer=None,
+ timeout=None,
+ **kwargs,
+ ):
+ preprocess_params, postprocess_params = {}, {}
+ if padding is not None:
+ preprocess_params["padding"] = padding
+ if doc_stride is not None:
+ preprocess_params["doc_stride"] = doc_stride
+ if max_question_len is not None:
+ preprocess_params["max_question_len"] = max_question_len
+ if max_seq_len is not None:
+ preprocess_params["max_seq_len"] = max_seq_len
+ if lang is not None:
+ preprocess_params["lang"] = lang
+ if tesseract_config is not None:
+ preprocess_params["tesseract_config"] = tesseract_config
+ if timeout is not None:
+ preprocess_params["timeout"] = timeout
+
+ if top_k is not None:
+ if top_k < 1:
+ raise ValueError(f"top_k parameter should be >= 1 (got {top_k})")
+ postprocess_params["top_k"] = top_k
+ if max_answer_len is not None:
+ if max_answer_len < 1:
+ raise ValueError(f"max_answer_len parameter should be >= 1 (got {max_answer_len}")
+ postprocess_params["max_answer_len"] = max_answer_len
+ if handle_impossible_answer is not None:
+ postprocess_params["handle_impossible_answer"] = handle_impossible_answer
+
+ forward_params = {}
+ if self.assistant_model is not None:
+ forward_params["assistant_model"] = self.assistant_model
+ if self.assistant_tokenizer is not None:
+ forward_params["tokenizer"] = self.tokenizer
+ forward_params["assistant_tokenizer"] = self.assistant_tokenizer
+
+ return preprocess_params, forward_params, postprocess_params
+
+ def __call__(
+ self,
+ image: Union["Image.Image", str],
+ question: Optional[str] = None,
+ word_boxes: Tuple[str, List[float]] = None,
+ **kwargs,
+ ):
+ """
+ Answer the question(s) given as inputs by using the document(s). A document is defined as an image and an
+ optional list of (word, box) tuples which represent the text in the document. If the `word_boxes` are not
+ provided, it will use the Tesseract OCR engine (if available) to extract the words and boxes automatically for
+ LayoutLM-like models which require them as input. For Donut, no OCR is run.
+
+ You can invoke the pipeline several ways:
+
+ - `pipeline(image=image, question=question)`
+ - `pipeline(image=image, question=question, word_boxes=word_boxes)`
+ - `pipeline([{"image": image, "question": question}])`
+ - `pipeline([{"image": image, "question": question, "word_boxes": word_boxes}])`
+
+ Args:
+ image (`str` or `PIL.Image`):
+ The pipeline handles three types of images:
+
+ - A string containing a http link pointing to an image
+ - A string containing a local path to an image
+ - An image loaded in PIL directly
+
+ The pipeline accepts either a single image or a batch of images. If given a single image, it can be
+ broadcasted to multiple questions.
+ question (`str`):
+ A question to ask of the document.
+ word_boxes (`List[str, Tuple[float, float, float, float]]`, *optional*):
+ A list of words and bounding boxes (normalized 0->1000). If you provide this optional input, then the
+ pipeline will use these words and boxes instead of running OCR on the image to derive them for models
+ that need them (e.g. LayoutLM). This allows you to reuse OCR'd results across many invocations of the
+ pipeline without having to re-run it each time.
+ top_k (`int`, *optional*, defaults to 1):
+ The number of answers to return (will be chosen by order of likelihood). Note that we return less than
+ top_k answers if there are not enough options available within the context.
+ doc_stride (`int`, *optional*, defaults to 128):
+ If the words in the document are too long to fit with the question for the model, it will be split in
+ several chunks with some overlap. This argument controls the size of that overlap.
+ max_answer_len (`int`, *optional*, defaults to 15):
+ The maximum length of predicted answers (e.g., only answers with a shorter length are considered).
+ max_seq_len (`int`, *optional*, defaults to 384):
+ The maximum length of the total sentence (context + question) in tokens of each chunk passed to the
+ model. The context will be split in several chunks (using `doc_stride` as overlap) if needed.
+ max_question_len (`int`, *optional*, defaults to 64):
+ The maximum length of the question after tokenization. It will be truncated if needed.
+ handle_impossible_answer (`bool`, *optional*, defaults to `False`):
+ Whether or not we accept impossible as an answer.
+ lang (`str`, *optional*):
+                Language to use while running OCR. Defaults to English.
+ tesseract_config (`str`, *optional*):
+ Additional flags to pass to tesseract while running OCR.
+ timeout (`float`, *optional*, defaults to None):
+ The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
+ the call may block forever.
+
+ Return:
+ A `dict` or a list of `dict`: Each result comes as a dictionary with the following keys:
+
+ - **score** (`float`) -- The probability associated to the answer.
+ - **start** (`int`) -- The start word index of the answer (in the OCR'd version of the input or provided
+ `word_boxes`).
+ - **end** (`int`) -- The end word index of the answer (in the OCR'd version of the input or provided
+ `word_boxes`).
+ - **answer** (`str`) -- The answer to the question.
+ - **words** (`list[int]`) -- The index of each word/box pair that is in the answer
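+
+        Example of reusing OCR results across calls (a minimal sketch; the boxes below are made-up
+        values in the normalized 0->1000 coordinate space):
+
+        ```python
+        >>> from transformers import pipeline
+
+        >>> document_qa = pipeline(model="impira/layoutlm-document-qa")
+        >>> image = "https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/invoice.png"
+        >>> word_boxes = [("Invoice", [65, 30, 230, 60]), ("us-001", [240, 30, 330, 60])]
+        >>> document_qa(image=image, question="What is the invoice number?", word_boxes=word_boxes)
+        ```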
+ """
+ if isinstance(question, str):
+ inputs = {"question": question, "image": image}
+ if word_boxes is not None:
+ inputs["word_boxes"] = word_boxes
+ else:
+ inputs = image
+ return super().__call__(inputs, **kwargs)
+
+ def preprocess(
+ self,
+ input,
+ padding="do_not_pad",
+ doc_stride=None,
+ max_seq_len=None,
+ word_boxes: Tuple[str, List[float]] = None,
+ lang=None,
+ tesseract_config="",
+ timeout=None,
+ ):
+ # NOTE: This code mirrors the code in question answering and will be implemented in a follow up PR
+ # to support documents with enough tokens that overflow the model's window
+ if max_seq_len is None:
+ max_seq_len = self.tokenizer.model_max_length
+
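+        # Stride (overlap) between chunks defaults to half the window, capped at 256 tokens, so an
+        # answer that straddles a chunk boundary still appears fully inside at least one chunk.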
+ if doc_stride is None:
+ doc_stride = min(max_seq_len // 2, 256)
+
+ image = None
+ image_features = {}
+ if input.get("image", None) is not None:
+ image = load_image(input["image"], timeout=timeout)
+ if self.image_processor is not None:
+ image_inputs = self.image_processor(images=image, return_tensors=self.framework)
+ if self.framework == "pt":
+ image_inputs = image_inputs.to(self.torch_dtype)
+ image_features.update(image_inputs)
+ elif self.feature_extractor is not None:
+ image_features.update(self.feature_extractor(images=image, return_tensors=self.framework))
+ elif self.model_type == ModelType.VisionEncoderDecoder:
+ raise ValueError("If you are using a VisionEncoderDecoderModel, you must provide a feature extractor")
+
+ words, boxes = None, None
+ if not self.model_type == ModelType.VisionEncoderDecoder:
+ if "word_boxes" in input:
+ words = [x[0] for x in input["word_boxes"]]
+ boxes = [x[1] for x in input["word_boxes"]]
+ elif "words" in image_features and "boxes" in image_features:
+ words = image_features.pop("words")[0]
+ boxes = image_features.pop("boxes")[0]
+ elif image is not None:
+ if not TESSERACT_LOADED:
+ raise ValueError(
+ "If you provide an image without word_boxes, then the pipeline will run OCR using Tesseract,"
+ " but pytesseract is not available"
+ )
+ if TESSERACT_LOADED:
+ words, boxes = apply_tesseract(image, lang=lang, tesseract_config=tesseract_config)
+ else:
+ raise ValueError(
+ "You must provide an image or word_boxes. If you provide an image, the pipeline will automatically"
+ " run OCR to derive words and boxes"
+ )
+
+ if self.tokenizer.padding_side != "right":
+ raise ValueError(
+ "Document question answering only supports tokenizers whose padding side is 'right', not"
+ f" {self.tokenizer.padding_side}"
+ )
+
+ if self.model_type == ModelType.VisionEncoderDecoder:
+            task_prompt = f'<s_docvqa><s_question>{input["question"]}</s_question><s_answer>'
+ # Adapted from https://huggingface.co/spaces/nielsr/donut-docvqa/blob/main/app.py
+ encoding = {
+ "inputs": image_features["pixel_values"],
+ "decoder_input_ids": self.tokenizer(
+ task_prompt, add_special_tokens=False, return_tensors=self.framework
+ ).input_ids,
+ "return_dict_in_generate": True,
+ }
+ yield {
+ **encoding,
+ "p_mask": None,
+ "word_ids": None,
+ "words": None,
+ "output_attentions": True,
+ "is_last": True,
+ }
+ else:
+ tokenizer_kwargs = {}
+ if self.model_type == ModelType.LayoutLM:
+ tokenizer_kwargs["text"] = input["question"].split()
+ tokenizer_kwargs["text_pair"] = words
+ tokenizer_kwargs["is_split_into_words"] = True
+ else:
+ tokenizer_kwargs["text"] = [input["question"]]
+ tokenizer_kwargs["text_pair"] = [words]
+ tokenizer_kwargs["boxes"] = [boxes]
+
+ encoding = self.tokenizer(
+ padding=padding,
+ max_length=max_seq_len,
+ stride=doc_stride,
+ return_token_type_ids=True,
+ truncation="only_second",
+ return_overflowing_tokens=True,
+ **tokenizer_kwargs,
+ )
+ # TODO: check why slower `LayoutLMTokenizer` and `LayoutLMv2Tokenizer` don't have this key in outputs
+ # FIXME: ydshieh and/or Narsil
+ encoding.pop("overflow_to_sample_mapping", None) # We do not use this
+
+ num_spans = len(encoding["input_ids"])
+
+            # p_mask: mask with 1 for tokens that cannot be in the answer (0 for tokens which can be in an answer)
+ # We put 0 on the tokens from the context and 1 everywhere else (question and special tokens)
+ # This logic mirrors the logic in the question_answering pipeline
+ p_mask = np.array([[tok != 1 for tok in encoding.sequence_ids(span_id)] for span_id in range(num_spans)])
+ for span_idx in range(num_spans):
+ if self.framework == "pt":
+ span_encoding = {k: torch.tensor(v[span_idx : span_idx + 1]) for (k, v) in encoding.items()}
+ if "pixel_values" in image_features:
+ span_encoding["image"] = image_features["pixel_values"]
+ else:
+ raise ValueError("Unsupported: Tensorflow preprocessing for DocumentQuestionAnsweringPipeline")
+
+ input_ids_span_idx = encoding["input_ids"][span_idx]
+ # keep the cls_token unmasked (some models use it to indicate unanswerable questions)
+ if self.tokenizer.cls_token_id is not None:
+ cls_indices = np.nonzero(np.array(input_ids_span_idx) == self.tokenizer.cls_token_id)[0]
+ for cls_index in cls_indices:
+ p_mask[span_idx][cls_index] = 0
+
+ # For each span, place a bounding box [0,0,0,0] for question and CLS tokens, [1000,1000,1000,1000]
+ # for SEP tokens, and the word's bounding box for words in the original document.
+ if "boxes" not in tokenizer_kwargs:
+ bbox = []
+ for input_id, sequence_id, word_id in zip(
+ encoding.input_ids[span_idx],
+ encoding.sequence_ids(span_idx),
+ encoding.word_ids(span_idx),
+ ):
+ if sequence_id == 1:
+ bbox.append(boxes[word_id])
+ elif input_id == self.tokenizer.sep_token_id:
+ bbox.append([1000] * 4)
+ else:
+ bbox.append([0] * 4)
+
+ if self.framework == "pt":
+ span_encoding["bbox"] = torch.tensor(bbox).unsqueeze(0)
+ elif self.framework == "tf":
+ raise ValueError("Unsupported: Tensorflow preprocessing for DocumentQuestionAnsweringPipeline")
+ yield {
+ **span_encoding,
+ "p_mask": p_mask[span_idx],
+ "word_ids": encoding.word_ids(span_idx),
+ "words": words,
+ "is_last": span_idx == num_spans - 1,
+ }
+
+ def _forward(self, model_inputs, **generate_kwargs):
+ p_mask = model_inputs.pop("p_mask", None)
+ word_ids = model_inputs.pop("word_ids", None)
+ words = model_inputs.pop("words", None)
+ is_last = model_inputs.pop("is_last", False)
+
+ if self.model_type == ModelType.VisionEncoderDecoder:
+ # User-defined `generation_config` passed to the pipeline call take precedence
+ if "generation_config" not in generate_kwargs:
+ generate_kwargs["generation_config"] = self.generation_config
+
+ model_outputs = self.model.generate(**model_inputs, **generate_kwargs)
+ else:
+ model_outputs = self.model(**model_inputs)
+
+ model_outputs = dict(model_outputs.items())
+ model_outputs["p_mask"] = p_mask
+ model_outputs["word_ids"] = word_ids
+ model_outputs["words"] = words
+ model_outputs["attention_mask"] = model_inputs.get("attention_mask", None)
+ model_outputs["is_last"] = is_last
+ return model_outputs
+
+ def postprocess(self, model_outputs, top_k=1, **kwargs):
+ if self.model_type == ModelType.VisionEncoderDecoder:
+ answers = [self.postprocess_encoder_decoder_single(o) for o in model_outputs]
+ else:
+ answers = self.postprocess_extractive_qa(model_outputs, top_k=top_k, **kwargs)
+
+ answers = sorted(answers, key=lambda x: x.get("score", 0), reverse=True)[:top_k]
+ return answers
+
+ def postprocess_encoder_decoder_single(self, model_outputs, **kwargs):
+ sequence = self.tokenizer.batch_decode(model_outputs["sequences"])[0]
+
+ # TODO: A lot of this logic is specific to Donut and should probably be handled in the tokenizer
+ # (see https://github.com/huggingface/transformers/pull/18414/files#r961747408 for more context).
+ sequence = sequence.replace(self.tokenizer.eos_token, "").replace(self.tokenizer.pad_token, "")
+ sequence = re.sub(r"<.*?>", "", sequence, count=1).strip() # remove first task start token
+ ret = {
+ "answer": None,
+ }
+
+        answer = re.search(r"<s_answer>(.*)</s_answer>", sequence)
+ if answer is not None:
+ ret["answer"] = answer.group(1).strip()
+ return ret
+
+ def postprocess_extractive_qa(
+ self, model_outputs, top_k=1, handle_impossible_answer=False, max_answer_len=15, **kwargs
+ ):
+ min_null_score = 1000000 # large and positive
+ answers = []
+ for output in model_outputs:
+ words = output["words"]
+
+ starts, ends, scores, min_null_score = select_starts_ends(
+ start=output["start_logits"],
+ end=output["end_logits"],
+ p_mask=output["p_mask"],
+ attention_mask=output["attention_mask"].numpy()
+ if output.get("attention_mask", None) is not None
+ else None,
+ min_null_score=min_null_score,
+ top_k=top_k,
+ handle_impossible_answer=handle_impossible_answer,
+ max_answer_len=max_answer_len,
+ )
+ word_ids = output["word_ids"]
+ for start, end, score in zip(starts, ends, scores):
+ word_start, word_end = word_ids[start], word_ids[end]
+ if word_start is not None and word_end is not None:
+ answers.append(
+ {
+ "score": float(score),
+ "answer": " ".join(words[word_start : word_end + 1]),
+ "start": word_start,
+ "end": word_end,
+ }
+ )
+
+ if handle_impossible_answer:
+ answers.append({"score": min_null_score, "answer": "", "start": 0, "end": 0})
+
+ return answers
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/feature_extraction.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/feature_extraction.py
new file mode 100644
index 0000000000000000000000000000000000000000..7d67a615ac02d29625f51242e1f747b39e6118bd
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/feature_extraction.py
@@ -0,0 +1,86 @@
+from typing import Dict
+
+from ..utils import add_end_docstrings
+from .base import GenericTensor, Pipeline, build_pipeline_init_args
+
+
+@add_end_docstrings(
+ build_pipeline_init_args(has_tokenizer=True, supports_binary_output=False),
+ r"""
+ tokenize_kwargs (`dict`, *optional*):
+ Additional dictionary of keyword arguments passed along to the tokenizer.
+ return_tensors (`bool`, *optional*):
+ If `True`, returns a tensor according to the specified framework, otherwise returns a list.""",
+)
+class FeatureExtractionPipeline(Pipeline):
+ """
+ Feature extraction pipeline uses no model head. This pipeline extracts the hidden states from the base
+ transformer, which can be used as features in downstream tasks.
+
+ Example:
+
+ ```python
+ >>> from transformers import pipeline
+
+ >>> extractor = pipeline(model="google-bert/bert-base-uncased", task="feature-extraction")
+ >>> result = extractor("This is a simple test.", return_tensors=True)
+ >>> result.shape # This is a tensor of shape [1, sequence_length, hidden_dimension] representing the input string.
+ torch.Size([1, 8, 768])
+ ```
+
+ Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)
+
+ This feature extraction pipeline can currently be loaded from [`pipeline`] using the task identifier:
+ `"feature-extraction"`.
+
+ All models may be used for this pipeline. See a list of all models, including community-contributed models on
+ [huggingface.co/models](https://huggingface.co/models).
+ """
+
+ def _sanitize_parameters(self, truncation=None, tokenize_kwargs=None, return_tensors=None, **kwargs):
+ if tokenize_kwargs is None:
+ tokenize_kwargs = {}
+
+ if truncation is not None:
+ if "truncation" in tokenize_kwargs:
+ raise ValueError(
+ "truncation parameter defined twice (given as keyword argument as well as in tokenize_kwargs)"
+ )
+ tokenize_kwargs["truncation"] = truncation
+
+ preprocess_params = tokenize_kwargs
+
+ postprocess_params = {}
+ if return_tensors is not None:
+ postprocess_params["return_tensors"] = return_tensors
+
+ return preprocess_params, {}, postprocess_params
+
+ def preprocess(self, inputs, **tokenize_kwargs) -> Dict[str, GenericTensor]:
+ model_inputs = self.tokenizer(inputs, return_tensors=self.framework, **tokenize_kwargs)
+ return model_inputs
+
+ def _forward(self, model_inputs):
+ model_outputs = self.model(**model_inputs)
+ return model_outputs
+
+ def postprocess(self, model_outputs, return_tensors=False):
+ # [0] is the first available tensor, logits or last_hidden_state.
+ if return_tensors:
+ return model_outputs[0]
+ if self.framework == "pt":
+ return model_outputs[0].tolist()
+ elif self.framework == "tf":
+ return model_outputs[0].numpy().tolist()
+
+ def __call__(self, *args, **kwargs):
+ """
+ Extract the features of the input(s).
+
+ Args:
+ args (`str` or `List[str]`): One or several texts (or one list of texts) to get the features of.
+
+ Return:
+ A nested list of `float`: The features computed by the model.
+ """
+ return super().__call__(*args, **kwargs)
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/fill_mask.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/fill_mask.py
new file mode 100644
index 0000000000000000000000000000000000000000..c14f54118486b971f64b0985fe2dc688de52f863
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/fill_mask.py
@@ -0,0 +1,273 @@
+from typing import Dict
+
+import numpy as np
+
+from ..utils import add_end_docstrings, is_tf_available, is_torch_available, logging
+from .base import GenericTensor, Pipeline, PipelineException, build_pipeline_init_args
+
+
+if is_tf_available():
+ import tensorflow as tf
+
+ from ..tf_utils import stable_softmax
+
+
+if is_torch_available():
+ import torch
+
+
+logger = logging.get_logger(__name__)
+
+
+@add_end_docstrings(
+ build_pipeline_init_args(has_tokenizer=True),
+ r"""
+ top_k (`int`, *optional*, defaults to 5):
+ The number of predictions to return.
+ targets (`str` or `List[str]`, *optional*):
+ When passed, the model will limit the scores to the passed targets instead of looking up in the whole
+ vocab. If the provided targets are not in the model vocab, they will be tokenized and the first resulting
+ token will be used (with a warning, and that might be slower).
+ tokenizer_kwargs (`dict`, *optional*):
+ Additional dictionary of keyword arguments passed along to the tokenizer.""",
+)
+class FillMaskPipeline(Pipeline):
+ """
+ Masked language modeling prediction pipeline using any `ModelWithLMHead`. See the [masked language modeling
+ examples](../task_summary#masked-language-modeling) for more information.
+
+ Example:
+
+ ```python
+ >>> from transformers import pipeline
+
+ >>> fill_masker = pipeline(model="google-bert/bert-base-uncased")
+ >>> fill_masker("This is a simple [MASK].")
+ [{'score': 0.042, 'token': 3291, 'token_str': 'problem', 'sequence': 'this is a simple problem.'}, {'score': 0.031, 'token': 3160, 'token_str': 'question', 'sequence': 'this is a simple question.'}, {'score': 0.03, 'token': 8522, 'token_str': 'equation', 'sequence': 'this is a simple equation.'}, {'score': 0.027, 'token': 2028, 'token_str': 'one', 'sequence': 'this is a simple one.'}, {'score': 0.024, 'token': 3627, 'token_str': 'rule', 'sequence': 'this is a simple rule.'}]
+ ```
+
+ Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)
+
+ This mask filling pipeline can currently be loaded from [`pipeline`] using the following task identifier:
+ `"fill-mask"`.
+
+ The models that this pipeline can use are models that have been trained with a masked language modeling objective,
+ which includes the bi-directional models in the library. See the up-to-date list of available models on
+ [huggingface.co/models](https://huggingface.co/models?filter=fill-mask).
+
+    <Tip>
+
+ This pipeline only works for inputs with exactly one token masked. Experimental: We added support for multiple
+ masks. The returned values are raw model output, and correspond to disjoint probabilities where one might expect
+ joint probabilities (See [discussion](https://github.com/huggingface/transformers/pull/10222)).
+
+    </Tip>
+
+    <Tip>
+
+ This pipeline now supports tokenizer_kwargs. For example try:
+
+ ```python
+ >>> from transformers import pipeline
+
+ >>> fill_masker = pipeline(model="google-bert/bert-base-uncased")
+ >>> tokenizer_kwargs = {"truncation": True}
+ >>> fill_masker(
+ ... "This is a simple [MASK]. " + "...with a large amount of repeated text appended. " * 100,
+ ... tokenizer_kwargs=tokenizer_kwargs,
+ ... )
+ ```
+
+    </Tip>
+
+
+ """
+
+ def get_masked_index(self, input_ids: GenericTensor) -> np.ndarray:
+ if self.framework == "tf":
+ masked_index = tf.where(input_ids == self.tokenizer.mask_token_id).numpy()
+ elif self.framework == "pt":
+ masked_index = torch.nonzero(input_ids == self.tokenizer.mask_token_id, as_tuple=False)
+ else:
+ raise ValueError("Unsupported framework")
+ return masked_index
+
+ def _ensure_exactly_one_mask_token(self, input_ids: GenericTensor) -> np.ndarray:
+ masked_index = self.get_masked_index(input_ids)
+ numel = np.prod(masked_index.shape)
+ if numel < 1:
+ raise PipelineException(
+ "fill-mask",
+ self.model.base_model_prefix,
+ f"No mask_token ({self.tokenizer.mask_token}) found on the input",
+ )
+
+ def ensure_exactly_one_mask_token(self, model_inputs: GenericTensor):
+ if isinstance(model_inputs, list):
+ for model_input in model_inputs:
+ self._ensure_exactly_one_mask_token(model_input["input_ids"][0])
+ else:
+ for input_ids in model_inputs["input_ids"]:
+ self._ensure_exactly_one_mask_token(input_ids)
+
+ def preprocess(
+ self, inputs, return_tensors=None, tokenizer_kwargs=None, **preprocess_parameters
+ ) -> Dict[str, GenericTensor]:
+ if return_tensors is None:
+ return_tensors = self.framework
+ if tokenizer_kwargs is None:
+ tokenizer_kwargs = {}
+
+ model_inputs = self.tokenizer(inputs, return_tensors=return_tensors, **tokenizer_kwargs)
+ self.ensure_exactly_one_mask_token(model_inputs)
+ return model_inputs
+
+ def _forward(self, model_inputs):
+ model_outputs = self.model(**model_inputs)
+ model_outputs["input_ids"] = model_inputs["input_ids"]
+ return model_outputs
+
+ def postprocess(self, model_outputs, top_k=5, target_ids=None):
+ # Cap top_k if there are targets
+ if target_ids is not None and target_ids.shape[0] < top_k:
+ top_k = target_ids.shape[0]
+ input_ids = model_outputs["input_ids"][0]
+ outputs = model_outputs["logits"]
+
+ if self.framework == "tf":
+ masked_index = tf.where(input_ids == self.tokenizer.mask_token_id).numpy()[:, 0]
+
+ outputs = outputs.numpy()
+
+ logits = outputs[0, masked_index, :]
+ probs = stable_softmax(logits, axis=-1)
+ if target_ids is not None:
+ probs = tf.gather_nd(tf.squeeze(probs, 0), target_ids.reshape(-1, 1))
+ probs = tf.expand_dims(probs, 0)
+
+ topk = tf.math.top_k(probs, k=top_k)
+ values, predictions = topk.values.numpy(), topk.indices.numpy()
+ else:
+ masked_index = torch.nonzero(input_ids == self.tokenizer.mask_token_id, as_tuple=False).squeeze(-1)
+ # Fill mask pipeline supports only one ${mask_token} per sample
+
+ logits = outputs[0, masked_index, :]
+ probs = logits.softmax(dim=-1)
+ if target_ids is not None:
+ probs = probs[..., target_ids]
+
+ values, predictions = probs.topk(top_k)
+
+ result = []
+ single_mask = values.shape[0] == 1
+ for i, (_values, _predictions) in enumerate(zip(values.tolist(), predictions.tolist())):
+ row = []
+ for v, p in zip(_values, _predictions):
+ # Copy is important since we're going to modify this array in place
+ tokens = input_ids.numpy().copy()
+ if target_ids is not None:
+ p = target_ids[p].tolist()
+
+ tokens[masked_index[i]] = p
+ # Filter padding out:
+ tokens = tokens[np.where(tokens != self.tokenizer.pad_token_id)]
+ # Originally we skip special tokens to give readable output.
+ # For multi masks though, the other [MASK] would be removed otherwise
+ # making the output look odd, so we add them back
+ sequence = self.tokenizer.decode(tokens, skip_special_tokens=single_mask)
+ proposition = {"score": v, "token": p, "token_str": self.tokenizer.decode([p]), "sequence": sequence}
+ row.append(proposition)
+ result.append(row)
+ if single_mask:
+ return result[0]
+ return result
+
+ def get_target_ids(self, targets, top_k=None):
+ if isinstance(targets, str):
+ targets = [targets]
+ try:
+ vocab = self.tokenizer.get_vocab()
+ except Exception:
+ vocab = {}
+ target_ids = []
+ for target in targets:
+ id_ = vocab.get(target, None)
+ if id_ is None:
+ input_ids = self.tokenizer(
+ target,
+ add_special_tokens=False,
+ return_attention_mask=False,
+ return_token_type_ids=False,
+ max_length=1,
+ truncation=True,
+ )["input_ids"]
+ if len(input_ids) == 0:
+ logger.warning(
+ f"The specified target token `{target}` does not exist in the model vocabulary. "
+ "We cannot replace it with anything meaningful, ignoring it"
+ )
+ continue
+ id_ = input_ids[0]
+ # XXX: If users encounter this pass
+ # it becomes pretty slow, so let's make sure
+ # The warning enables them to fix the input to
+ # get faster performance.
+ logger.warning(
+ f"The specified target token `{target}` does not exist in the model vocabulary. "
+ f"Replacing with `{self.tokenizer.convert_ids_to_tokens(id_)}`."
+ )
+ target_ids.append(id_)
+ target_ids = list(set(target_ids))
+ if len(target_ids) == 0:
+ raise ValueError("At least one target must be provided when passed.")
+ target_ids = np.array(target_ids)
+ return target_ids
+
+ def _sanitize_parameters(self, top_k=None, targets=None, tokenizer_kwargs=None):
+ preprocess_params = {}
+
+ if tokenizer_kwargs is not None:
+ preprocess_params["tokenizer_kwargs"] = tokenizer_kwargs
+
+ postprocess_params = {}
+
+ if targets is not None:
+ target_ids = self.get_target_ids(targets, top_k)
+ postprocess_params["target_ids"] = target_ids
+
+ if top_k is not None:
+ postprocess_params["top_k"] = top_k
+
+ if self.tokenizer.mask_token_id is None:
+ raise PipelineException(
+ "fill-mask", self.model.base_model_prefix, "The tokenizer does not define a `mask_token`."
+ )
+ return preprocess_params, {}, postprocess_params
+
+ def __call__(self, inputs, **kwargs):
+ """
+ Fill the masked token in the text(s) given as inputs.
+
+ Args:
+ inputs (`str` or `List[str]`):
+ One or several texts (or one list of prompts) with masked tokens.
+ targets (`str` or `List[str]`, *optional*):
+ When passed, the model will limit the scores to the passed targets instead of looking up in the whole
+ vocab. If the provided targets are not in the model vocab, they will be tokenized and the first
+ resulting token will be used (with a warning, and that might be slower).
+ top_k (`int`, *optional*):
+ When passed, overrides the number of predictions to return.
+
+ Return:
+ A list or a list of list of `dict`: Each result comes as list of dictionaries with the following keys:
+
+ - **sequence** (`str`) -- The corresponding input with the mask token prediction.
+ - **score** (`float`) -- The corresponding probability.
+ - **token** (`int`) -- The predicted token id (to replace the masked one).
+ - **token_str** (`str`) -- The predicted token (to replace the masked one).
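+
+        Example of restricting predictions with `targets` (a minimal sketch; actual scores depend on
+        the model and are not shown):
+
+        ```python
+        >>> from transformers import pipeline
+
+        >>> fill_masker = pipeline(model="google-bert/bert-base-uncased")
+        >>> fill_masker("The capital of France is [MASK].", targets=["paris", "london"], top_k=2)
+        ```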
+ """
+ outputs = super().__call__(inputs, **kwargs)
+ if isinstance(inputs, list) and len(inputs) == 1:
+ return outputs[0]
+ return outputs
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/image_classification.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/image_classification.py
new file mode 100644
index 0000000000000000000000000000000000000000..0085e5eb73f826598dae8461a15431e3e5ef8f80
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/image_classification.py
@@ -0,0 +1,226 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import List, Union
+
+import numpy as np
+
+from ..utils import (
+ ExplicitEnum,
+ add_end_docstrings,
+ is_tf_available,
+ is_torch_available,
+ is_vision_available,
+ logging,
+ requires_backends,
+)
+from .base import Pipeline, build_pipeline_init_args
+
+
+if is_vision_available():
+ from PIL import Image
+
+ from ..image_utils import load_image
+
+if is_tf_available():
+ from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES
+
+if is_torch_available():
+ import torch
+
+ from ..models.auto.modeling_auto import MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES
+
+logger = logging.get_logger(__name__)
+
+
+# Copied from transformers.pipelines.text_classification.sigmoid
+def sigmoid(_outputs):
+ return 1.0 / (1.0 + np.exp(-_outputs))
+
+
+# Copied from transformers.pipelines.text_classification.softmax
+def softmax(_outputs):
+ maxes = np.max(_outputs, axis=-1, keepdims=True)
+ shifted_exp = np.exp(_outputs - maxes)
+ return shifted_exp / shifted_exp.sum(axis=-1, keepdims=True)
+
+
+# Copied from transformers.pipelines.text_classification.ClassificationFunction
+class ClassificationFunction(ExplicitEnum):
+ SIGMOID = "sigmoid"
+ SOFTMAX = "softmax"
+ NONE = "none"
+
+
+@add_end_docstrings(
+ build_pipeline_init_args(has_image_processor=True),
+ r"""
+ function_to_apply (`str`, *optional*, defaults to `"default"`):
+ The function to apply to the model outputs in order to retrieve the scores. Accepts four different values:
+
+ - `"default"`: if the model has a single label, will apply the sigmoid function on the output. If the model
+ has several labels, will apply the softmax function on the output.
+ - `"sigmoid"`: Applies the sigmoid function on the output.
+ - `"softmax"`: Applies the softmax function on the output.
+ - `"none"`: Does not apply any function on the output.""",
+)
+class ImageClassificationPipeline(Pipeline):
+ """
+ Image classification pipeline using any `AutoModelForImageClassification`. This pipeline predicts the class of an
+ image.
+
+ Example:
+
+ ```python
+ >>> from transformers import pipeline
+
+ >>> classifier = pipeline(model="microsoft/beit-base-patch16-224-pt22k-ft22k")
+ >>> classifier("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png")
+ [{'score': 0.442, 'label': 'macaw'}, {'score': 0.088, 'label': 'popinjay'}, {'score': 0.075, 'label': 'parrot'}, {'score': 0.073, 'label': 'parodist, lampooner'}, {'score': 0.046, 'label': 'poll, poll_parrot'}]
+ ```
+
+ Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)
+
+ This image classification pipeline can currently be loaded from [`pipeline`] using the following task identifier:
+ `"image-classification"`.
+
+ See the list of available models on
+ [huggingface.co/models](https://huggingface.co/models?filter=image-classification).
+ """
+
+ function_to_apply: ClassificationFunction = ClassificationFunction.NONE
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ requires_backends(self, "vision")
+ self.check_model_type(
+ TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES
+ if self.framework == "tf"
+ else MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES
+ )
+
+ def _sanitize_parameters(self, top_k=None, function_to_apply=None, timeout=None):
+ preprocess_params = {}
+ if timeout is not None:
+ preprocess_params["timeout"] = timeout
+ postprocess_params = {}
+ if top_k is not None:
+ postprocess_params["top_k"] = top_k
+ if isinstance(function_to_apply, str):
+ function_to_apply = ClassificationFunction(function_to_apply.lower())
+ if function_to_apply is not None:
+ postprocess_params["function_to_apply"] = function_to_apply
+ return preprocess_params, {}, postprocess_params
+
+ def __call__(self, inputs: Union[str, List[str], "Image.Image", List["Image.Image"]] = None, **kwargs):
+ """
+ Assign labels to the image(s) passed as inputs.
+
+ Args:
+ inputs (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`):
+ The pipeline handles three types of images:
+
+ - A string containing a http link pointing to an image
+ - A string containing a local path to an image
+ - An image loaded in PIL directly
+
+                The pipeline accepts either a single image or a batch of images, which must then be passed as a list.
+ Images in a batch must all be in the same format: all as http links, all as local paths, or all as PIL
+ images.
+ function_to_apply (`str`, *optional*, defaults to `"default"`):
+ The function to apply to the model outputs in order to retrieve the scores. Accepts four different
+ values:
+
+ If this argument is not specified, then it will apply the following functions according to the number
+ of labels:
+
+ - If the model has a single label, will apply the sigmoid function on the output.
+ - If the model has several labels, will apply the softmax function on the output.
+
+ Possible values are:
+
+ - `"sigmoid"`: Applies the sigmoid function on the output.
+ - `"softmax"`: Applies the softmax function on the output.
+ - `"none"`: Does not apply any function on the output.
+ top_k (`int`, *optional*, defaults to 5):
+ The number of top labels that will be returned by the pipeline. If the provided number is higher than
+ the number of labels available in the model configuration, it will default to the number of labels.
+ timeout (`float`, *optional*, defaults to None):
+ The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
+ the call may block forever.
+
+ Return:
+ A dictionary or a list of dictionaries containing result. If the input is a single image, will return a
+ dictionary, if the input is a list of several images, will return a list of dictionaries corresponding to
+ the images.
+
+ The dictionaries contain the following keys:
+
+ - **label** (`str`) -- The label identified by the model.
+ - **score** (`int`) -- The score attributed by the model for that label.
+ """
+ # After deprecation of this is completed, remove the default `None` value for `images`
+ if "images" in kwargs:
+ inputs = kwargs.pop("images")
+ if inputs is None:
+ raise ValueError("Cannot call the image-classification pipeline without an inputs argument!")
+ return super().__call__(inputs, **kwargs)
+
+ def preprocess(self, image, timeout=None):
+ image = load_image(image, timeout=timeout)
+ model_inputs = self.image_processor(images=image, return_tensors=self.framework)
+ if self.framework == "pt":
+ model_inputs = model_inputs.to(self.torch_dtype)
+ return model_inputs
+
+ def _forward(self, model_inputs):
+ model_outputs = self.model(**model_inputs)
+ return model_outputs
+
+ def postprocess(self, model_outputs, function_to_apply=None, top_k=5):
+ if function_to_apply is None:
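+            # Multi-label (or single-output) heads need independent per-label probabilities, hence
+            # sigmoid; single-label multi-class heads get a softmax distribution instead.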
+            if self.model.config.problem_type == "multi_label_classification" or self.model.config.num_labels == 1:
+                function_to_apply = ClassificationFunction.SIGMOID
+            elif self.model.config.problem_type == "single_label_classification" or self.model.config.num_labels > 1:
+                function_to_apply = ClassificationFunction.SOFTMAX
+ elif hasattr(self.model.config, "function_to_apply") and function_to_apply is None:
+ function_to_apply = self.model.config.function_to_apply
+ else:
+ function_to_apply = ClassificationFunction.NONE
+
+ if top_k > self.model.config.num_labels:
+ top_k = self.model.config.num_labels
+
+ outputs = model_outputs["logits"][0]
+ if self.framework == "pt" and outputs.dtype in (torch.bfloat16, torch.float16):
+ outputs = outputs.to(torch.float32).numpy()
+ else:
+ outputs = outputs.numpy()
+
+ if function_to_apply == ClassificationFunction.SIGMOID:
+ scores = sigmoid(outputs)
+ elif function_to_apply == ClassificationFunction.SOFTMAX:
+ scores = softmax(outputs)
+ elif function_to_apply == ClassificationFunction.NONE:
+ scores = outputs
+ else:
+ raise ValueError(f"Unrecognized `function_to_apply` argument: {function_to_apply}")
+
+ dict_scores = [
+ {"label": self.model.config.id2label[i], "score": score.item()} for i, score in enumerate(scores)
+ ]
+ dict_scores.sort(key=lambda x: x["score"], reverse=True)
+ if top_k is not None:
+ dict_scores = dict_scores[:top_k]
+
+ return dict_scores
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/image_feature_extraction.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/image_feature_extraction.py
new file mode 100644
index 0000000000000000000000000000000000000000..391eb2b3aec714dbac61fe46bddc7ee74f10cd2f
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/image_feature_extraction.py
@@ -0,0 +1,112 @@
+from typing import Dict
+
+from ..utils import add_end_docstrings, is_vision_available
+from .base import GenericTensor, Pipeline, build_pipeline_init_args
+
+
+if is_vision_available():
+ from ..image_utils import load_image
+
+
+@add_end_docstrings(
+ build_pipeline_init_args(has_image_processor=True),
+ """
+ image_processor_kwargs (`dict`, *optional*):
+ Additional dictionary of keyword arguments passed along to the image processor e.g.
+ {"size": {"height": 100, "width": 100}}
+ pool (`bool`, *optional*, defaults to `False`):
+ Whether or not to return the pooled output. If `False`, the model will return the raw hidden states.
+ """,
+)
+class ImageFeatureExtractionPipeline(Pipeline):
+ """
+ Image feature extraction pipeline uses no model head. This pipeline extracts the hidden states from the base
+ transformer, which can be used as features in downstream tasks.
+
+ Example:
+
+ ```python
+ >>> from transformers import pipeline
+
+ >>> extractor = pipeline(model="google/vit-base-patch16-224", task="image-feature-extraction")
+ >>> result = extractor("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png", return_tensors=True)
+    >>> result.shape # This is a tensor of shape [1, sequence_length, hidden_dimension] representing the input image.
+ torch.Size([1, 197, 768])
+ ```
+
+ Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)
+
+ This image feature extraction pipeline can currently be loaded from [`pipeline`] using the task identifier:
+ `"image-feature-extraction"`.
+
+ All vision models may be used for this pipeline. See a list of all models, including community-contributed models on
+ [huggingface.co/models](https://huggingface.co/models).
+ """
+
+ def _sanitize_parameters(self, image_processor_kwargs=None, return_tensors=None, pool=None, **kwargs):
+ preprocess_params = {} if image_processor_kwargs is None else image_processor_kwargs
+
+ postprocess_params = {}
+ if pool is not None:
+ postprocess_params["pool"] = pool
+ if return_tensors is not None:
+ postprocess_params["return_tensors"] = return_tensors
+
+ if "timeout" in kwargs:
+ preprocess_params["timeout"] = kwargs["timeout"]
+
+ return preprocess_params, {}, postprocess_params
+
+ def preprocess(self, image, timeout=None, **image_processor_kwargs) -> Dict[str, GenericTensor]:
+ image = load_image(image, timeout=timeout)
+ model_inputs = self.image_processor(image, return_tensors=self.framework, **image_processor_kwargs)
+ if self.framework == "pt":
+ model_inputs = model_inputs.to(self.torch_dtype)
+ return model_inputs
+
+ def _forward(self, model_inputs):
+ model_outputs = self.model(**model_inputs)
+ return model_outputs
+
+ def postprocess(self, model_outputs, pool=None, return_tensors=False):
+ pool = pool if pool is not None else False
+
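+        # `pooler_output` is one pooled vector per image (e.g. a CLS-token pooling head), whereas
+        # the default branch below returns the first output tensor (typically the full hidden states).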
+ if pool:
+ if "pooler_output" not in model_outputs:
+ raise ValueError(
+ "No pooled output was returned. Make sure the model has a `pooler` layer when using the `pool` option."
+ )
+ outputs = model_outputs["pooler_output"]
+ else:
+ # [0] is the first available tensor, logits or last_hidden_state.
+ outputs = model_outputs[0]
+
+ if return_tensors:
+ return outputs
+ if self.framework == "pt":
+ return outputs.tolist()
+ elif self.framework == "tf":
+ return outputs.numpy().tolist()
+
+ def __call__(self, *args, **kwargs):
+ """
+ Extract the features of the input(s).
+
+ Args:
+ images (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`):
+ The pipeline handles three types of images:
+
+ - A string containing a http link pointing to an image
+ - A string containing a local path to an image
+ - An image loaded in PIL directly
+
+                The pipeline accepts either a single image or a batch of images, which must then be passed as a list.
+ Images in a batch must all be in the same format: all as http links, all as local paths, or all as PIL
+ images.
+ timeout (`float`, *optional*, defaults to None):
+ The maximum time in seconds to wait for fetching images from the web. If None, no timeout is used and
+ the call may block forever.
+ Return:
+ A nested list of `float`: The features computed by the model.
+ """
+ return super().__call__(*args, **kwargs)
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/image_segmentation.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/image_segmentation.py
new file mode 100644
index 0000000000000000000000000000000000000000..d388e591bf9df45c4905a6c8ff86fdce1e123906
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/image_segmentation.py
@@ -0,0 +1,220 @@
+from typing import Any, Dict, List, Union
+
+import numpy as np
+
+from ..utils import add_end_docstrings, is_torch_available, is_vision_available, logging, requires_backends
+from .base import Pipeline, build_pipeline_init_args
+
+
+if is_vision_available():
+ from PIL import Image
+
+ from ..image_utils import load_image
+
+if is_torch_available():
+ from ..models.auto.modeling_auto import (
+ MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES,
+ MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING_NAMES,
+ MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES,
+ MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES,
+ )
+
+
+logger = logging.get_logger(__name__)
+
+
+Prediction = Dict[str, Any]
+Predictions = List[Prediction]
+
+
+@add_end_docstrings(build_pipeline_init_args(has_image_processor=True))
+class ImageSegmentationPipeline(Pipeline):
+ """
+ Image segmentation pipeline using any `AutoModelForXXXSegmentation`. This pipeline predicts masks of objects and
+ their classes.
+
+ Example:
+
+ ```python
+ >>> from transformers import pipeline
+
+ >>> segmenter = pipeline(model="facebook/detr-resnet-50-panoptic")
+ >>> segments = segmenter("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png")
+ >>> len(segments)
+ 2
+
+ >>> segments[0]["label"]
+ 'bird'
+
+ >>> segments[1]["label"]
+ 'bird'
+
+ >>> type(segments[0]["mask"]) # This is a black and white mask showing where is the bird on the original image.
+
+
+ >>> segments[0]["mask"].size
+ (768, 512)
+ ```
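+
+ A hedged sketch (not from the original docstring) showing explicit control of the subtask and the score threshold, reusing the `segmenter` above:
+
+ ```python
+ >>> segments = segmenter(
+ ...     "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png",
+ ...     subtask="panoptic",
+ ...     threshold=0.95,
+ ... )
+ >>> # Each entry carries "label", "score" and a PIL "mask"; a higher threshold keeps fewer, more confident segments.
+ ```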
+
+
+ This image segmentation pipeline can currently be loaded from [`pipeline`] using the following task identifier:
+ `"image-segmentation"`.
+
+ See the list of available models on
+ [huggingface.co/models](https://huggingface.co/models?filter=image-segmentation).
+ """
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+
+ if self.framework == "tf":
+ raise ValueError(f"The {self.__class__} is only available in PyTorch.")
+
+ requires_backends(self, "vision")
+ mapping = MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES.copy()
+ mapping.update(MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES)
+ mapping.update(MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING_NAMES)
+ mapping.update(MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES)
+ self.check_model_type(mapping)
+
+ def _sanitize_parameters(self, **kwargs):
+ preprocess_kwargs = {}
+ postprocess_kwargs = {}
+ if "subtask" in kwargs:
+ postprocess_kwargs["subtask"] = kwargs["subtask"]
+ preprocess_kwargs["subtask"] = kwargs["subtask"]
+ if "threshold" in kwargs:
+ postprocess_kwargs["threshold"] = kwargs["threshold"]
+ if "mask_threshold" in kwargs:
+ postprocess_kwargs["mask_threshold"] = kwargs["mask_threshold"]
+ if "overlap_mask_area_threshold" in kwargs:
+ postprocess_kwargs["overlap_mask_area_threshold"] = kwargs["overlap_mask_area_threshold"]
+ if "timeout" in kwargs:
+ preprocess_kwargs["timeout"] = kwargs["timeout"]
+
+ return preprocess_kwargs, {}, postprocess_kwargs
+
+ def __call__(self, inputs=None, **kwargs) -> Union[Predictions, List[Prediction]]:
+ """
+ Perform segmentation (detect masks & classes) in the image(s) passed as inputs.
+
+ Args:
+ inputs (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`):
+ The pipeline handles three types of images:
+
+ - A string containing an HTTP(S) link pointing to an image
+ - A string containing a local path to an image
+ - An image loaded in PIL directly
+
+ The pipeline accepts either a single image or a batch of images. Images in a batch must all be in the
+ same format: all as HTTP(S) links, all as local paths, or all as PIL images.
+ subtask (`str`, *optional*):
+ Segmentation task to be performed, chosen from `"semantic"`, `"instance"` and `"panoptic"` depending on
+ model capabilities. If not set, the pipeline will attempt to resolve it in the following order:
+ `panoptic`, `instance`, `semantic`.
+ threshold (`float`, *optional*, defaults to 0.9):
+ Probability threshold to filter out predicted masks.
+ mask_threshold (`float`, *optional*, defaults to 0.5):
+ Threshold to use when turning the predicted masks into binary values.
+ overlap_mask_area_threshold (`float`, *optional*, defaults to 0.5):
+ Mask overlap threshold to eliminate small, disconnected segments.
+ timeout (`float`, *optional*, defaults to None):
+ The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
+ the call may block forever.
+
+ Return:
+ A list of dictionaries containing the result. If the input is a single image, a list of dictionaries is
+ returned; if the input is a list of several images, a list of lists of dictionaries is returned, one list
+ per image.
+
+ The dictionaries contain the mask, label and score (where applicable) of each detected object and have
+ the following keys:
+
+ - **label** (`str`) -- The class label identified by the model.
+ - **mask** (`PIL.Image`) -- A binary mask of the detected object as a PIL Image of shape (width, height) of
+ the original image. Returns a mask filled with zeros if no object is found.
+ - **score** (*optional* `float`) -- The score attributed by the model to the detected object, when the model
+ is capable of estimating a confidence for the "object" described by the label and the mask.
+ """
+ # After this deprecation is completed, remove the default `None` value for `inputs`
+ if "images" in kwargs:
+ inputs = kwargs.pop("images")
+ if inputs is None:
+ raise ValueError("Cannot call the image-classification pipeline without an inputs argument!")
+ return super().__call__(inputs, **kwargs)
+
+ def preprocess(self, image, subtask=None, timeout=None):
+ image = load_image(image, timeout=timeout)
+ target_size = [(image.height, image.width)]
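+ # OneFormer expects an additional text prompt ("task_inputs") naming the requested subtask; it is tokenized into ids below.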
+ if self.model.config.__class__.__name__ == "OneFormerConfig":
+ if subtask is None:
+ kwargs = {}
+ else:
+ kwargs = {"task_inputs": [subtask]}
+ inputs = self.image_processor(images=[image], return_tensors="pt", **kwargs)
+ if self.framework == "pt":
+ inputs = inputs.to(self.torch_dtype)
+ inputs["task_inputs"] = self.tokenizer(
+ inputs["task_inputs"],
+ padding="max_length",
+ max_length=self.model.config.task_seq_len,
+ return_tensors=self.framework,
+ )["input_ids"]
+ else:
+ inputs = self.image_processor(images=[image], return_tensors="pt")
+ if self.framework == "pt":
+ inputs = inputs.to(self.torch_dtype)
+ inputs["target_size"] = target_size
+ return inputs
+
+ def _forward(self, model_inputs):
+ target_size = model_inputs.pop("target_size")
+ model_outputs = self.model(**model_inputs)
+ model_outputs["target_size"] = target_size
+ return model_outputs
+
+ def postprocess(
+ self, model_outputs, subtask=None, threshold=0.9, mask_threshold=0.5, overlap_mask_area_threshold=0.5
+ ):
+ fn = None
+ if subtask in {"panoptic", None} and hasattr(self.image_processor, "post_process_panoptic_segmentation"):
+ fn = self.image_processor.post_process_panoptic_segmentation
+ elif subtask in {"instance", None} and hasattr(self.image_processor, "post_process_instance_segmentation"):
+ fn = self.image_processor.post_process_instance_segmentation
+
+ if fn is not None:
+ outputs = fn(
+ model_outputs,
+ threshold=threshold,
+ mask_threshold=mask_threshold,
+ overlap_mask_area_threshold=overlap_mask_area_threshold,
+ target_sizes=model_outputs["target_size"],
+ )[0]
+
+ annotation = []
+ segmentation = outputs["segmentation"]
+
+ for segment in outputs["segments_info"]:
+ mask = (segmentation == segment["id"]) * 255
+ mask = Image.fromarray(mask.numpy().astype(np.uint8), mode="L")
+ label = self.model.config.id2label[segment["label_id"]]
+ score = segment["score"]
+ annotation.append({"score": score, "label": label, "mask": mask})
+
+ elif subtask in {"semantic", None} and hasattr(self.image_processor, "post_process_semantic_segmentation"):
+ outputs = self.image_processor.post_process_semantic_segmentation(
+ model_outputs, target_sizes=model_outputs["target_size"]
+ )[0]
+
+ annotation = []
+ segmentation = outputs.numpy()
+ labels = np.unique(segmentation)
+
+ for label in labels:
+ mask = (segmentation == label) * 255
+ mask = Image.fromarray(mask.astype(np.uint8), mode="L")
+ label = self.model.config.id2label[label]
+ annotation.append({"score": None, "label": label, "mask": mask})
+ else:
+ raise ValueError(f"Subtask {subtask} is not supported for model {type(self.model)}")
+ return annotation
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/image_text_to_text.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/image_text_to_text.py
new file mode 100644
index 0000000000000000000000000000000000000000..5afba0d7c0410ed5ee7a0f4d53d0f791b43c6f8c
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/image_text_to_text.py
@@ -0,0 +1,432 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import enum
+from typing import Dict, List, Optional, Union
+
+from ..processing_utils import ProcessingKwargs, Unpack
+from ..utils import (
+ add_end_docstrings,
+ is_torch_available,
+ is_vision_available,
+ logging,
+ requires_backends,
+)
+from .base import Pipeline, build_pipeline_init_args
+
+
+if is_vision_available():
+ from PIL import Image
+
+ from ..image_utils import load_images, valid_images
+
+
+if is_torch_available():
+ from ..models.auto.modeling_auto import MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES
+ from .pt_utils import KeyDataset
+
+logger = logging.get_logger(__name__)
+
+IMAGE_TOKEN = ""
+
+
+class ReturnType(enum.Enum):
+ TENSORS = 0
+ NEW_TEXT = 1
+ FULL_TEXT = 2
+
+
+class Chat:
+ """This class is intended to just be used internally in this pipeline and not exposed to users. We convert chats
+ to this format because the rest of the pipeline code tends to assume that lists of messages are
+ actually a batch of samples rather than messages in the same conversation."""
+
+ def __init__(self, messages: Dict, images: Union[str, List[str], "Image.Image", List["Image.Image"]]):
+ for message in messages:
+ if not ("role" in message and "content" in message):
+ raise ValueError("When passing chat dicts as input, each dict must have a 'role' and 'content' key.")
+ images = retrieve_images_in_messages(messages, images)
+
+ self.messages = messages
+ self.images = images
+
+
+def retrieve_images_in_messages(
+ messages: dict, images: Optional[Union[str, List[str], "Image.Image", List["Image.Image"]]]
+):
+ """
+ Retrieve and combine images from the chat and the images passed as input.
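+
+ As a rough illustration (not part of the original docstring): an image entry in a message may either embed the
+ image itself (e.g. `{"type": "image", "url": "..."}`), in which case it is collected directly, or be a bare
+ `{"type": "image"}`, in which case the next image from the `images` argument is used instead.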
+ """
+ if images is None:
+ images = []
+ idx_images = 0
+ retrieved_images = []
+ for message in messages:
+ for content in message["content"]:
+ if isinstance(content, dict):
+ if content.get("type") == "image":
+ for key in ["image", "url", "path", "base64"]:
+ if key in content:
+ retrieved_images.append(content[key])
+ break
+ else:
+ if idx_images < len(images):
+ retrieved_images.append(images[idx_images])
+ idx_images += 1
+ else:
+ raise ValueError(
+ "The number of images in the chat messages should be the same as the number of images passed to the pipeline."
+ )
+ # Add support for OpenAI/TGI chat format
+ elif content.get("type") == "image_url":
+ if isinstance(content.get("image_url"), dict) and "url" in content["image_url"]:
+ retrieved_images.append(content["image_url"]["url"])
+ # Rewrite content to be in the Transformers chat format
+ content["type"] = "image"
+ content["image"] = content["image_url"]["url"]
+ del content["image_url"]
+ else:
+ raise ValueError(
+ "Wrong format for 'image_url' content type. The content should have an 'image_url' dict with a 'url' key."
+ )
+
+ # The number of images passed should be consistent with the number of images in the chat without an image key
+ if idx_images != len(images):
+ raise ValueError(
+ "The number of images in the chat messages should be the same as the number of images passed to the pipeline."
+ )
+
+ return retrieved_images
+
+
+@add_end_docstrings(build_pipeline_init_args(has_processor=True))
+class ImageTextToTextPipeline(Pipeline):
+ """
+ Image-text-to-text pipeline using an `AutoModelForImageTextToText`. This pipeline generates text given an image and text.
+ When the underlying model is a conversational model, it can also accept one or more chats,
+ in which case the pipeline will operate in chat mode and will continue the chat(s) by adding its response(s).
+ Each chat takes the form of a list of dicts, where each dict contains "role" and "content" keys.
+
+ Example:
+
+ ```python
+ >>> from transformers import pipeline
+
+ >>> pipe = pipeline(task="image-text-to-text", model="Salesforce/blip-image-captioning-base")
+ >>> pipe("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png", text="A photo of")
+ [{'generated_text': 'a photo of two birds'}]
+ ```
+
+ ```python
+ >>> from transformers import pipeline
+
+ >>> pipe = pipeline("image-text-to-text", model="llava-hf/llava-interleave-qwen-0.5b-hf")
+ >>> messages = [
+ >>> {
+ >>> "role": "user",
+ >>> "content": [
+ >>> {
+ >>> "type": "image",
+ >>> "url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
+ >>> },
+ >>> {"type": "text", "text": "Describe this image."},
+ >>> ],
+ >>> },
+ >>> {
+ >>> "role": "assistant",
+ >>> "content": [
+ >>> {"type": "text", "text": "There is a dog and"},
+ >>> ],
+ >>> },
+ >>> ]
+ >>> pipe(text=messages, max_new_tokens=20, return_full_text=False)
+ [{'input_text': [{'role': 'user',
+ 'content': [{'type': 'image',
+ 'url': 'https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg'},
+ {'type': 'text', 'text': 'Describe this image.'}]},
+ {'role': 'assistant',
+ 'content': [{'type': 'text', 'text': 'There is a dog and'}]}],
+ 'generated_text': ' a person in the image. The dog is sitting on the sand, and the person is sitting on'}]
+ ```
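+
+ A hedged sketch (not part of the original docs): the image can also be supplied through the `images` argument, as a list, when the chat's image entry does not carry an "image", "url", "path" or "base64" key:
+
+ ```python
+ >>> messages = [
+ >>>     {
+ >>>         "role": "user",
+ >>>         "content": [
+ >>>             {"type": "image"},
+ >>>             {"type": "text", "text": "Describe this image."},
+ >>>         ],
+ >>>     },
+ >>> ]
+ >>> pipe(images=["https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"], text=messages, max_new_tokens=20)
+ ```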
+
+ Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)
+
+ This image-text-to-text pipeline can currently be loaded from pipeline() using the following task identifier:
+ "image-text-to-text".
+
+ See the list of available models on
+ [huggingface.co/models](https://huggingface.co/models?pipeline_tag=image-text-to-text).
+ """
+
+ _load_processor = True
+ _load_image_processor = False
+ _load_feature_extractor = False
+ _load_tokenizer = False
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ requires_backends(self, "vision")
+ self.check_model_type(MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES)
+
+ def _sanitize_parameters(
+ self,
+ max_new_tokens=None,
+ generate_kwargs=None,
+ timeout=None,
+ return_full_text=None,
+ return_tensors=None,
+ return_type=None,
+ continue_final_message=None,
+ **kwargs: Unpack[ProcessingKwargs],
+ ):
+ forward_kwargs = {}
+ preprocess_params = {}
+ postprocess_params = {}
+
+ preprocess_params["processing_kwargs"] = kwargs
+
+ if timeout is not None:
+ preprocess_params["timeout"] = timeout
+
+ if continue_final_message is not None:
+ preprocess_params["continue_final_message"] = continue_final_message
+
+ if generate_kwargs is not None:
+ forward_kwargs["generate_kwargs"] = generate_kwargs
+
+ if max_new_tokens is not None:
+ if "generate_kwargs" not in forward_kwargs:
+ forward_kwargs["generate_kwargs"] = {}
+ if "max_new_tokens" in forward_kwargs["generate_kwargs"]:
+ raise ValueError(
+ "'max_new_tokens' is defined twice, once in 'generate_kwargs' and once as a direct parameter,"
+ " please use only one"
+ )
+ forward_kwargs["generate_kwargs"]["max_new_tokens"] = max_new_tokens
+
+ if return_full_text is not None and return_type is None:
+ if return_tensors is not None:
+ raise ValueError("`return_full_text` is mutually exclusive with `return_tensors`")
+ return_type = ReturnType.FULL_TEXT if return_full_text else ReturnType.NEW_TEXT
+ if return_tensors is not None and return_type is None:
+ return_type = ReturnType.TENSORS
+ if return_type is not None:
+ postprocess_params["return_type"] = return_type
+ if continue_final_message is not None:
+ postprocess_params["continue_final_message"] = continue_final_message
+
+ return preprocess_params, forward_kwargs, postprocess_params
+
+ def __call__(
+ self,
+ images: Optional[
+ Union[str, List[str], List[List[str]], "Image.Image", List["Image.Image"], List[List["Image.Image"]]]
+ ] = None,
+ text: Optional[Union[str, List[str], List[dict]]] = None,
+ **kwargs,
+ ):
+ """
+ Generate a text given text and the image(s) passed as inputs.
+
+ Args:
+ images (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`):
+ The pipeline handles three types of images:
+
+ - A string containing an HTTP(S) link pointing to an image
+ - A string containing a local path to an image
+ - An image loaded in PIL directly
+
+ The pipeline accepts either a single image or a batch of images.
+ text (`str`, `List[str]`, `List[Dict[str, Union[str, PIL.Image]]]`):
+ The text to be used for generation. If a list of strings is passed, the length of the list should be the
+ same as the number of images. Text can also follow the chat format: a list of dictionaries where each
+ dictionary represents a message in a conversation. Each dictionary should have two keys: 'role' and
+ 'content'. 'role' should be one of 'user', 'system' or 'assistant'. 'content' should be a list of
+ dictionaries containing the text of the message and the type of the message. The type of the message can
+ be either 'text' or 'image'. If the type is 'image', no text is needed.
+ return_tensors (`bool`, *optional*, defaults to `False`):
+ Returns the tensors of predictions (as token indices) in the outputs. If set to
+ `True`, the decoded text is not returned.
+ return_text (`bool`, *optional*):
+ Returns the decoded texts in the outputs.
+ return_full_text (`bool`, *optional*, defaults to `True`):
+ If set to `False` only added text is returned, otherwise the full text is returned. Cannot be
+ specified at the same time as `return_text`.
+ continue_final_message (`bool`, *optional*):
+ This indicates that you want the model to continue the last message in the input chat rather than starting
+ a new one, allowing you to "prefill" its response. By default this is `True` when the final message in the
+ input chat has the `assistant` role and `False` otherwise, but you can manually override that behaviour by
+ setting this flag.
+
+ Return:
+ A list or a list of lists of `dict`: Each result comes as a dictionary with the following keys (it cannot
+ return a combination of both `generated_text` and `generated_token_ids`):
+
+ - **generated_text** (`str`, present when `return_text=True`) -- The generated text.
+ - **generated_token_ids** (`torch.Tensor`, present when `return_tensors=True`) -- The token
+ ids of the generated text.
+ - **input_text** (`str`) -- The input text.
+ """
+ if images is None and text is None:
+ raise ValueError("You must at least provide either text or images.")
+ if images is not None and text is None and not valid_images(images):
+ """
+ Supports the following formats:
+ - {"image": image, "text": text}
+ - [{"image": image, "text": text}]
+ - Generator and datasets
+ This is a common pattern in other multimodal pipelines, so we support it here as well.
+ """
+ return super().__call__(images, **kwargs)
+
+ if isinstance(text, (list, tuple, KeyDataset)) and isinstance(text[0], (list, tuple, dict)):
+ # We have one or more prompts in list-of-dicts format, so this is chat mode
+ if isinstance(text[0], dict):
+ return super().__call__(Chat(text, images), **kwargs)
+ else:
+ if images is None:
+ images = [None] * len(text)
+ chats = [Chat(chat, image) for chat, image in zip(text, images)] # 🐈 🐈 🐈
+ return super().__call__(chats, **kwargs)
+
+ # encourage the user to use the chat format if supported
+ if getattr(self.processor, "chat_template", None) is not None:
+ logger.warning_once(
+ "The input data was not formatted as a chat with dicts containing 'role' and 'content' keys, even though this model supports chat. "
+ "Consider using the chat format for better results. For more information, see https://huggingface.co/docs/transformers/en/chat_templating"
+ )
+
+ # support text only generation
+ if images is None:
+ return super().__call__(text, **kwargs)
+ if text is None:
+ raise ValueError("You must provide text for this pipeline.")
+
+ return super().__call__({"images": images, "text": text}, **kwargs)
+
+ def preprocess(self, inputs=None, timeout=None, continue_final_message=None, processing_kwargs=None):
+ # In case we only have text inputs
+ if isinstance(inputs, (list, tuple, str)):
+ images = None
+ text = inputs
+ inputs_text = inputs
+ else:
+ if isinstance(inputs, Chat):
+ # If the user passes a chat that ends in an assistant message, we treat it as a prefill by default
+ # because very few models support multiple separate, consecutive assistant messages
+ if continue_final_message is None:
+ continue_final_message = inputs.messages[-1]["role"] == "assistant"
+ text = self.processor.apply_chat_template(
+ inputs.messages,
+ add_generation_prompt=not continue_final_message,
+ continue_final_message=continue_final_message,
+ return_tensors=self.framework,
+ )
+ inputs_text = inputs
+ images = inputs.images
+ else:
+ text = inputs["text"]
+ inputs_text = inputs["text"]
+ images = inputs["images"]
+
+ images = load_images(images)
+
+ # if batched text inputs, we set padding to True unless specified otherwise
+ if isinstance(text, (list, tuple)) and len(text) > 1:
+ processing_kwargs.setdefault("padding", True)
+ model_inputs = self.processor(
+ images=images, text=text, return_tensors=self.framework, legacy=False, **processing_kwargs
+ ).to(dtype=self.torch_dtype)
+
+ model_inputs["text"] = inputs_text
+
+ return model_inputs
+
+ def _forward(self, model_inputs, generate_kwargs=None):
+ generate_kwargs = {} if generate_kwargs is None else generate_kwargs
+ prompt_text = model_inputs.pop("text")
+ input_ids = (
+ model_inputs["input_ids"] if "input_ids" in model_inputs else model_inputs["decoder_input_ids"]
+ ) # for decoder-only models
+ generated_sequence = self.model.generate(**model_inputs, **generate_kwargs)
+
+ return {"generated_sequence": generated_sequence, "prompt_text": prompt_text, "input_ids": input_ids}
+
+ def postprocess(self, model_outputs, return_type=ReturnType.FULL_TEXT, continue_final_message=None):
+ input_texts = model_outputs["prompt_text"]
+ input_texts = [input_texts] if isinstance(input_texts, (str, Chat)) else input_texts
+ generated_sequence = model_outputs["generated_sequence"]
+ input_ids = model_outputs["input_ids"]
+ if return_type == ReturnType.TENSORS:
+ return [
+ {"input_text": input_texts[i], "generated_token_ids": generated_sequence[i]}
+ for i in range(len(input_texts))
+ ]
+
+ # Decode inputs and outputs the same way to remove input text from generated text if present
+ generated_texts = self.processor.post_process_image_text_to_text(generated_sequence)
+ decoded_inputs = self.processor.post_process_image_text_to_text(input_ids)
+
+ # Force consistent behavior for including the input text in the output
+ if return_type in {ReturnType.NEW_TEXT, ReturnType.FULL_TEXT}:
+ # Remove the input text from the generated text if the generated text starts with the input text
+ # (accounting for the possibility of a space between the input and generated text)
+ new_generated_texts = []
+ for text_generated, decoded_input in zip(generated_texts, decoded_inputs):
+ # There can be added characters before the input text, so we need to find the beginning of the input text in the generated text
+ index_input_text = text_generated.find(decoded_input)
+ # Limit the search to 2 residual characters, like spaces or new lines, to avoid removing a large part of the answer
+ if 0 <= index_input_text <= 2:
+ # If the input text is found, we remove it
+ new_generated_texts.append(text_generated[index_input_text + len(decoded_input) :])
+ else:
+ new_generated_texts.append(text_generated)
+ generated_texts = new_generated_texts
+ if return_type == ReturnType.FULL_TEXT:
+ full_texts = []
+ for prompt_text, generated_text in zip(input_texts, generated_texts):
+ if isinstance(prompt_text, str):
+ generated_text = prompt_text + generated_text
+ elif isinstance(prompt_text, Chat):
+ if continue_final_message is None:
+ # If the user passes a chat ending in an assistant message, we treat it as a prefill by
+ # default because very few models support multiple separate, consecutive assistant messages
+ continue_final_message = prompt_text.messages[-1]["role"] == "assistant"
+ if continue_final_message:
+ # With assistant prefill, concat onto the end of the last message
+ new_text = dict(prompt_text.messages[-1]["content"][-1].items())
+ new_text["text"] += generated_text
+ generated_text = list(prompt_text.messages)[:-1] + [
+ {
+ "role": prompt_text.messages[-1]["role"],
+ "content": prompt_text.messages[-1]["content"][:-1] + [new_text],
+ }
+ ]
+ else:
+ # When we're not starting from a prefill, the output is a new assistant message
+ generated_text = list(prompt_text.messages) + [
+ {"role": "assistant", "content": generated_text}
+ ]
+ full_texts.append(generated_text)
+ generated_texts = full_texts
+
+ records = [
+ {
+ "input_text": input_text.messages if isinstance(input_text, Chat) else input_text,
+ "generated_text": generated_text,
+ }
+ for input_text, generated_text in zip(input_texts, generated_texts)
+ ]
+
+ return records
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/image_to_image.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/image_to_image.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb66359a4dddea48519f2de2dc69e86cd4ac5645
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/image_to_image.py
@@ -0,0 +1,136 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import List, Union
+
+import numpy as np
+
+from ..utils import (
+ add_end_docstrings,
+ is_torch_available,
+ is_vision_available,
+ logging,
+ requires_backends,
+)
+from .base import Pipeline, build_pipeline_init_args
+
+
+if is_vision_available():
+ from PIL import Image
+
+ from ..image_utils import load_image
+
+if is_torch_available():
+ from ..models.auto.modeling_auto import MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES
+
+logger = logging.get_logger(__name__)
+
+
+@add_end_docstrings(build_pipeline_init_args(has_image_processor=True))
+class ImageToImagePipeline(Pipeline):
+ """
+ Image to Image pipeline using any `AutoModelForImageToImage`. This pipeline generates an image based on a previous
+ image input.
+
+ Example:
+
+ ```python
+ >>> from PIL import Image
+ >>> import requests
+
+ >>> from transformers import pipeline
+
+ >>> upscaler = pipeline("image-to-image", model="caidas/swin2SR-classical-sr-x2-64")
+ >>> img = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
+ >>> img = img.resize((64, 64))
+ >>> upscaled_img = upscaler(img)
+ >>> img.size
+ (64, 64)
+
+ >>> upscaled_img.size
+ (144, 144)
+ ```
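+
+ A small sketch (not from the original docstring): a list of images is processed one by one and a list of `PIL.Image` results is returned.
+
+ ```python
+ >>> upscaled = upscaler([img, img])  # upscaled is expected to be a list of two PIL.Image results
+ ```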
+
+ This image to image pipeline can currently be loaded from [`pipeline`] using the following task identifier:
+ `"image-to-image"`.
+
+ See the list of available models on [huggingface.co/models](https://huggingface.co/models?filter=image-to-image).
+ """
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ requires_backends(self, "vision")
+ self.check_model_type(MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES)
+
+ def _sanitize_parameters(self, **kwargs):
+ preprocess_params = {}
+ postprocess_params = {}
+ forward_params = {}
+
+ if "timeout" in kwargs:
+ preprocess_params["timeout"] = kwargs["timeout"]
+ if "head_mask" in kwargs:
+ forward_params["head_mask"] = kwargs["head_mask"]
+
+ return preprocess_params, forward_params, postprocess_params
+
+ def __call__(
+ self, images: Union[str, List[str], "Image.Image", List["Image.Image"]], **kwargs
+ ) -> Union["Image.Image", List["Image.Image"]]:
+ """
+ Transform the image(s) passed as inputs.
+
+ Args:
+ images (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`):
+ The pipeline handles three types of images:
+
+ - A string containing an HTTP(S) link pointing to an image
+ - A string containing a local path to an image
+ - An image loaded in PIL directly
+
+ The pipeline accepts either a single image or a batch of images. Images in a batch must all be in the same
+ format: all as HTTP(S) links, all as local paths, or all as PIL images.
+ timeout (`float`, *optional*, defaults to None):
+ The maximum time in seconds to wait for fetching images from the web. If None, no timeout is used and
+ the call may block forever.
+
+ Return:
+ An image (`Image.Image`) or a list of images (`List[Image.Image]`) containing the result(s). If the input
+ is a single image, the return value is a single image; if the input is a list of several images, a list of
+ transformed images is returned.
+ """
+ return super().__call__(images, **kwargs)
+
+ def _forward(self, model_inputs):
+ model_outputs = self.model(**model_inputs)
+ return model_outputs
+
+ def preprocess(self, image, timeout=None):
+ image = load_image(image, timeout=timeout)
+ inputs = self.image_processor(images=[image], return_tensors="pt")
+ if self.framework == "pt":
+ inputs = inputs.to(self.torch_dtype)
+ return inputs
+
+ def postprocess(self, model_outputs):
+ images = []
+ if "reconstruction" in model_outputs.keys():
+ outputs = model_outputs.reconstruction
+ for output in outputs:
+ output = output.data.squeeze().float().cpu().clamp_(0, 1).numpy()
+ output = np.moveaxis(output, source=0, destination=-1)
+ output = (output * 255.0).round().astype(np.uint8) # float32 to uint8
+ images.append(Image.fromarray(output))
+
+ return images if len(images) > 1 else images[0]
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/image_to_text.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/image_to_text.py
new file mode 100644
index 0000000000000000000000000000000000000000..32a3ec218dac305f93d8e41959200a78c590c8df
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/image_to_text.py
@@ -0,0 +1,222 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import List, Union
+
+from ..utils import (
+ add_end_docstrings,
+ is_tf_available,
+ is_torch_available,
+ is_vision_available,
+ logging,
+ requires_backends,
+)
+from .base import Pipeline, build_pipeline_init_args
+
+
+if is_vision_available():
+ from PIL import Image
+
+ from ..image_utils import load_image
+
+if is_tf_available():
+ from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES
+
+if is_torch_available():
+ import torch
+
+ from ..models.auto.modeling_auto import MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES
+
+logger = logging.get_logger(__name__)
+
+
+@add_end_docstrings(build_pipeline_init_args(has_tokenizer=True, has_image_processor=True))
+class ImageToTextPipeline(Pipeline):
+ """
+ Image To Text pipeline using an `AutoModelForVision2Seq`. This pipeline predicts a caption for a given image.
+
+ Example:
+
+ ```python
+ >>> from transformers import pipeline
+
+ >>> captioner = pipeline(model="ydshieh/vit-gpt2-coco-en")
+ >>> captioner("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png")
+ [{'generated_text': 'two birds are standing next to each other '}]
+ ```
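+
+ A hedged sketch (not part of the original docstring), reusing the same `captioner`, showing how generation can be controlled:
+
+ ```python
+ >>> captioner(
+ ...     "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png",
+ ...     max_new_tokens=10,
+ ...     generate_kwargs={"do_sample": False},
+ ... )
+ ```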
+
+ Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)
+
+ This image to text pipeline can currently be loaded from pipeline() using the following task identifier:
+ "image-to-text".
+
+ See the list of available models on
+ [huggingface.co/models](https://huggingface.co/models?pipeline_tag=image-to-text).
+ """
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ requires_backends(self, "vision")
+ self.check_model_type(
+ TF_MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES if self.framework == "tf" else MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES
+ )
+
+ def _sanitize_parameters(self, max_new_tokens=None, generate_kwargs=None, prompt=None, timeout=None):
+ forward_params = {}
+ preprocess_params = {}
+
+ if prompt is not None:
+ preprocess_params["prompt"] = prompt
+ if timeout is not None:
+ preprocess_params["timeout"] = timeout
+
+ if max_new_tokens is not None:
+ forward_params["max_new_tokens"] = max_new_tokens
+ if generate_kwargs is not None:
+ if max_new_tokens is not None and "max_new_tokens" in generate_kwargs:
+ raise ValueError(
+ "`max_new_tokens` is defined both as an argument and inside `generate_kwargs` argument, please use"
+ " only 1 version"
+ )
+ forward_params.update(generate_kwargs)
+
+ if self.assistant_model is not None:
+ forward_params["assistant_model"] = self.assistant_model
+ if self.assistant_tokenizer is not None:
+ forward_params["tokenizer"] = self.tokenizer
+ forward_params["assistant_tokenizer"] = self.assistant_tokenizer
+
+ return preprocess_params, forward_params, {}
+
+ def __call__(self, inputs: Union[str, List[str], "Image.Image", List["Image.Image"]] = None, **kwargs):
+ """
+ Generate a caption for the image(s) passed as inputs.
+
+ Args:
+ inputs (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`):
+ The pipeline handles three types of images:
+
+ - A string containing an HTTP(S) link pointing to an image
+ - A string containing a local path to an image
+ - An image loaded in PIL directly
+
+ The pipeline accepts either a single image or a batch of images.
+
+ max_new_tokens (`int`, *optional*):
+ The maximum number of tokens to generate. By default, the `generate` default is used.
+
+ generate_kwargs (`Dict`, *optional*):
+ A dictionary of keyword arguments passed directly to `generate`, allowing full control of that function.
+
+ timeout (`float`, *optional*, defaults to None):
+ The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
+ the call may block forever.
+
+ Return:
+ A list or a list of list of `dict`: Each result comes as a dictionary with the following key:
+
+ - **generated_text** (`str`) -- The generated text.
+ """
+ # After this deprecation is completed, remove the default `None` value for `inputs`
+ if "images" in kwargs:
+ inputs = kwargs.pop("images")
+ if inputs is None:
+ raise ValueError("Cannot call the image-to-text pipeline without an inputs argument!")
+ return super().__call__(inputs, **kwargs)
+
+ def preprocess(self, image, prompt=None, timeout=None):
+ image = load_image(image, timeout=timeout)
+
+ if prompt is not None:
+ logger.warning_once(
+ "Passing `prompt` to the `image-to-text` pipeline is deprecated and will be removed in version 4.48"
+ " of 🤗 Transformers. Use the `image-text-to-text` pipeline instead",
+ )
+ if not isinstance(prompt, str):
+ raise ValueError(
+ f"Received an invalid text input, got - {type(prompt)} - but expected a single string. "
+ "Note also that one single text can be provided for conditional image to text generation."
+ )
+
+ model_type = self.model.config.model_type
+
+ if model_type == "git":
+ model_inputs = self.image_processor(images=image, return_tensors=self.framework)
+ if self.framework == "pt":
+ model_inputs = model_inputs.to(self.torch_dtype)
+ input_ids = self.tokenizer(text=prompt, add_special_tokens=False).input_ids
+ input_ids = [self.tokenizer.cls_token_id] + input_ids
+ input_ids = torch.tensor(input_ids).unsqueeze(0)
+ model_inputs.update({"input_ids": input_ids})
+
+ elif model_type == "pix2struct":
+ model_inputs = self.image_processor(images=image, header_text=prompt, return_tensors=self.framework)
+ if self.framework == "pt":
+ model_inputs = model_inputs.to(self.torch_dtype)
+
+ elif model_type != "vision-encoder-decoder":
+ # vision-encoder-decoder does not support conditional generation
+ model_inputs = self.image_processor(images=image, return_tensors=self.framework)
+ if self.framework == "pt":
+ model_inputs = model_inputs.to(self.torch_dtype)
+ text_inputs = self.tokenizer(prompt, return_tensors=self.framework)
+ model_inputs.update(text_inputs)
+
+ else:
+ raise ValueError(f"Model type {model_type} does not support conditional text generation")
+
+ else:
+ model_inputs = self.image_processor(images=image, return_tensors=self.framework)
+ if self.framework == "pt":
+ model_inputs = model_inputs.to(self.torch_dtype)
+
+ if self.model.config.model_type == "git" and prompt is None:
+ model_inputs["input_ids"] = None
+
+ return model_inputs
+
+ def _forward(self, model_inputs, **generate_kwargs):
+ # Git model sets `model_inputs["input_ids"] = None` in `preprocess` (when `prompt=None`). In batch mode, the
+ # pipeline will group them into a list of `None`, which fails `_forward`. Avoid this by checking it first.
+ if (
+ "input_ids" in model_inputs
+ and isinstance(model_inputs["input_ids"], list)
+ and all(x is None for x in model_inputs["input_ids"])
+ ):
+ model_inputs["input_ids"] = None
+
+ # User-defined `generation_config` passed to the pipeline call take precedence
+ if "generation_config" not in generate_kwargs:
+ generate_kwargs["generation_config"] = self.generation_config
+
+ # FIXME: We need to pop here due to a difference in how `generation.py` and `generation.tf_utils.py`
+ # parse inputs. In the Tensorflow version, `generate` raises an error if we don't use `input_ids` whereas
+ # the PyTorch version matches it with `self.model.main_input_name` or `self.model.encoder.main_input_name`
+ # in the `_prepare_model_inputs` method.
+ inputs = model_inputs.pop(self.model.main_input_name)
+ model_outputs = self.model.generate(inputs, **model_inputs, **generate_kwargs)
+ return model_outputs
+
+ def postprocess(self, model_outputs):
+ records = []
+ for output_ids in model_outputs:
+ record = {
+ "generated_text": self.tokenizer.decode(
+ output_ids,
+ skip_special_tokens=True,
+ )
+ }
+ records.append(record)
+ return records
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/mask_generation.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/mask_generation.py
new file mode 100644
index 0000000000000000000000000000000000000000..f87e45b7f8ecb410ba5d0a088188256d59290f0f
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/mask_generation.py
@@ -0,0 +1,287 @@
+from collections import defaultdict
+from typing import Optional
+
+from ..image_utils import load_image
+from ..utils import (
+ add_end_docstrings,
+ is_torch_available,
+ logging,
+ requires_backends,
+)
+from .base import ChunkPipeline, build_pipeline_init_args
+
+
+if is_torch_available():
+ import torch
+
+ from ..models.auto.modeling_auto import MODEL_FOR_MASK_GENERATION_MAPPING_NAMES
+
+logger = logging.get_logger(__name__)
+
+
+@add_end_docstrings(
+ build_pipeline_init_args(has_image_processor=True),
+ r"""
+ points_per_batch (`int`, *optional*, defaults to 64):
+ Sets the number of points run simultaneously by the model. Higher numbers may be faster but use more GPU
+ memory.
+ output_bboxes_mask (`bool`, *optional*, defaults to `False`):
+ Whether or not to output the bounding box predictions.
+ output_rle_masks (`bool`, *optional*, defaults to `False`):
+ Whether or not to output the masks in `RLE` format""",
+)
+class MaskGenerationPipeline(ChunkPipeline):
+ """
+ Automatic mask generation for images using `SamForMaskGeneration`. This pipeline predicts binary masks for a
+ given image. It is a `ChunkPipeline` because the points can be separated into mini-batches in order to
+ avoid OOM issues. Use the `points_per_batch` argument to control the number of points that will be processed
+ at the same time. Default is `64`.
+
+ The pipeline works in 3 steps:
+ 1. `preprocess`: A grid of 1024 evenly separated points is generated along with bounding boxes and point
+ labels. For more details on how the points and bounding boxes are created, check the `_generate_crop_boxes`
+ function. The image is also preprocessed using the `image_processor`. This function `yields` a minibatch of
+ `points_per_batch` points.
+
+ 2. `forward`: feeds the outputs of `preprocess` to the model. The image embedding is computed only once.
+ It calls `self.model.get_image_embeddings` and makes sure that the gradients are not computed, and that the
+ tensors and models are on the same device.
+
+ 3. `postprocess`: The most important part of the automatic mask generation happens here. Three steps
+ are performed:
+ - image_processor.postprocess_masks (run on each minibatch loop): takes in the raw output masks,
+ resizes them according to the image size, and transforms them to binary masks.
+ - image_processor.filter_masks (on each minibatch loop): uses both `pred_iou_thresh` and
+ `stability_scores`, and also applies a variety of filters based on non-maximum suppression to remove
+ bad masks.
+ - image_processor.postprocess_masks_for_amg: applies NMS on the masks to only keep relevant ones.
+
+ Example:
+
+ ```python
+ >>> from transformers import pipeline
+
+ >>> generator = pipeline(model="facebook/sam-vit-base", task="mask-generation")
+ >>> outputs = generator(
+ ... "http://images.cocodataset.org/val2017/000000039769.jpg",
+ ... )
+
+ >>> outputs = generator(
+ ... "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png", points_per_batch=128
+ ... )
+ ```
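+
+ A hedged sketch (not part of the original docstring) tightening the quality filters and requesting RLE masks:
+
+ ```python
+ >>> outputs = generator(
+ ...     "http://images.cocodataset.org/val2017/000000039769.jpg",
+ ...     points_per_batch=64,
+ ...     pred_iou_thresh=0.9,
+ ...     stability_score_thresh=0.97,
+ ...     output_rle_mask=True,
+ ... )
+ >>> # `outputs["masks"]`, `outputs["scores"]` and, here, `outputs["rle_mask"]` hold the filtered predictions.
+ ```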
+
+ Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)
+
+ This segmentation pipeline can currently be loaded from [`pipeline`] using the following task identifier:
+ `"mask-generation"`.
+
+ See the list of available models on [huggingface.co/models](https://huggingface.co/models?filter=mask-generation).
+ """
+
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+ requires_backends(self, "vision")
+ requires_backends(self, "torch")
+
+ if self.framework != "pt":
+ raise ValueError(f"The {self.__class__} is only available in PyTorch.")
+
+ self.check_model_type(MODEL_FOR_MASK_GENERATION_MAPPING_NAMES)
+
+ def _sanitize_parameters(self, **kwargs):
+ preprocess_kwargs = {}
+ postprocess_kwargs = {}
+ forward_params = {}
+ # preprocess args
+ if "points_per_batch" in kwargs:
+ preprocess_kwargs["points_per_batch"] = kwargs["points_per_batch"]
+ if "points_per_crop" in kwargs:
+ preprocess_kwargs["points_per_crop"] = kwargs["points_per_crop"]
+ if "crops_n_layers" in kwargs:
+ preprocess_kwargs["crops_n_layers"] = kwargs["crops_n_layers"]
+ if "crop_overlap_ratio" in kwargs:
+ preprocess_kwargs["crop_overlap_ratio"] = kwargs["crop_overlap_ratio"]
+ if "crop_n_points_downscale_factor" in kwargs:
+ preprocess_kwargs["crop_n_points_downscale_factor"] = kwargs["crop_n_points_downscale_factor"]
+ if "timeout" in kwargs:
+ preprocess_kwargs["timeout"] = kwargs["timeout"]
+ # postprocess args
+ if "pred_iou_thresh" in kwargs:
+ forward_params["pred_iou_thresh"] = kwargs["pred_iou_thresh"]
+ if "stability_score_offset" in kwargs:
+ forward_params["stability_score_offset"] = kwargs["stability_score_offset"]
+ if "mask_threshold" in kwargs:
+ forward_params["mask_threshold"] = kwargs["mask_threshold"]
+ if "stability_score_thresh" in kwargs:
+ forward_params["stability_score_thresh"] = kwargs["stability_score_thresh"]
+ if "crops_nms_thresh" in kwargs:
+ postprocess_kwargs["crops_nms_thresh"] = kwargs["crops_nms_thresh"]
+ if "output_rle_mask" in kwargs:
+ postprocess_kwargs["output_rle_mask"] = kwargs["output_rle_mask"]
+ if "output_bboxes_mask" in kwargs:
+ postprocess_kwargs["output_bboxes_mask"] = kwargs["output_bboxes_mask"]
+ return preprocess_kwargs, forward_params, postprocess_kwargs
+
+ def __call__(self, image, *args, num_workers=None, batch_size=None, **kwargs):
+ """
+ Generates binary segmentation masks
+
+ Args:
+ inputs (`np.ndarray` or `bytes` or `str` or `dict`):
+ Image or list of images.
+ mask_threshold (`float`, *optional*, defaults to 0.0):
+ Threshold to use when turning the predicted masks into binary values.
+ pred_iou_thresh (`float`, *optional*, defaults to 0.88):
+ A filtering threshold in `[0,1]` applied on the model's predicted mask quality.
+ stability_score_thresh (`float`, *optional*, defaults to 0.95):
+ A filtering threshold in `[0,1]`, using the stability of the mask under changes to the cutoff used to
+ binarize the model's mask predictions.
+ stability_score_offset (`int`, *optional*, defaults to 1):
+ The amount to shift the cutoff when calculating the stability score.
+ crops_nms_thresh (`float`, *optional*, defaults to 0.7):
+ The box IoU cutoff used by non-maximal suppression to filter duplicate masks.
+ crops_n_layers (`int`, *optional*, defaults to 0):
+ If `crops_n_layers>0`, mask prediction will be run again on crops of the image. Sets the number of
+ layers to run, where each layer has 2**i_layer number of image crops.
+ crop_overlap_ratio (`float`, *optional*, defaults to `512 / 1500`):
+ Sets the degree to which crops overlap. In the first crop layer, crops will overlap by this fraction of
+ the image length. Later layers with more crops scale down this overlap.
+ crop_n_points_downscale_factor (`int`, *optional*, defaults to `1`):
+ The number of points-per-side sampled in layer n is scaled down by crop_n_points_downscale_factor**n.
+ timeout (`float`, *optional*, defaults to None):
+ The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
+ the call may block forever.
+
+ Return:
+ `Dict`: A dictionary with the following keys:
+ - **mask** (`PIL.Image`) -- A binary mask of the detected object as a PIL Image of shape `(width,
+ height)` of the original image. Returns a mask filled with zeros if no object is found.
+ - **score** (*optional* `float`) -- The score attributed by the model to the mask, when the model is
+ capable of estimating a confidence for the "object" described by the mask.
+
+ """
+ return super().__call__(image, *args, num_workers=num_workers, batch_size=batch_size, **kwargs)
+
+ def preprocess(
+ self,
+ image,
+ points_per_batch=64,
+ crops_n_layers: int = 0,
+ crop_overlap_ratio: float = 512 / 1500,
+ points_per_crop: Optional[int] = 32,
+ crop_n_points_downscale_factor: Optional[int] = 1,
+ timeout: Optional[float] = None,
+ ):
+ image = load_image(image, timeout=timeout)
+ target_size = self.image_processor.size["longest_edge"]
+ crop_boxes, grid_points, cropped_images, input_labels = self.image_processor.generate_crop_boxes(
+ image, target_size, crops_n_layers, crop_overlap_ratio, points_per_crop, crop_n_points_downscale_factor
+ )
+ model_inputs = self.image_processor(images=cropped_images, return_tensors="pt")
+ if self.framework == "pt":
+ model_inputs = model_inputs.to(self.torch_dtype)
+
+ with self.device_placement():
+ if self.framework == "pt":
+ inference_context = self.get_inference_context()
+ with inference_context():
+ model_inputs = self._ensure_tensor_on_device(model_inputs, device=self.device)
+ image_embeddings = self.model.get_image_embeddings(model_inputs.pop("pixel_values"))
+ model_inputs["image_embeddings"] = image_embeddings
+
+ n_points = grid_points.shape[1]
+ points_per_batch = points_per_batch if points_per_batch is not None else n_points
+
+ if points_per_batch <= 0:
+ raise ValueError(
+ "Cannot have points_per_batch<=0. Must be >=1 to returned batched outputs. "
+ "To return all points at once, set points_per_batch to None"
+ )
+
+ for i in range(0, n_points, points_per_batch):
+ batched_points = grid_points[:, i : i + points_per_batch, :, :]
+ labels = input_labels[:, i : i + points_per_batch]
+ is_last = i == n_points - points_per_batch
+ yield {
+ "input_points": batched_points,
+ "input_labels": labels,
+ "input_boxes": crop_boxes,
+ "is_last": is_last,
+ **model_inputs,
+ }
+
+ def _forward(
+ self,
+ model_inputs,
+ pred_iou_thresh=0.88,
+ stability_score_thresh=0.95,
+ mask_threshold=0,
+ stability_score_offset=1,
+ ):
+ input_boxes = model_inputs.pop("input_boxes")
+ is_last = model_inputs.pop("is_last")
+ original_sizes = model_inputs.pop("original_sizes").tolist()
+ reshaped_input_sizes = model_inputs.pop("reshaped_input_sizes").tolist()
+
+ model_outputs = self.model(**model_inputs)
+
+ # post processing happens here in order to avoid CPU GPU copies of ALL the masks
+ low_resolution_masks = model_outputs["pred_masks"]
+ masks = self.image_processor.post_process_masks(
+ low_resolution_masks, original_sizes, reshaped_input_sizes, mask_threshold, binarize=False
+ )
+ iou_scores = model_outputs["iou_scores"]
+ masks, iou_scores, boxes = self.image_processor.filter_masks(
+ masks[0],
+ iou_scores[0],
+ original_sizes[0],
+ input_boxes[0],
+ pred_iou_thresh,
+ stability_score_thresh,
+ mask_threshold,
+ stability_score_offset,
+ )
+ return {
+ "masks": masks,
+ "is_last": is_last,
+ "boxes": boxes,
+ "iou_scores": iou_scores,
+ }
+
+ def postprocess(
+ self,
+ model_outputs,
+ output_rle_mask=False,
+ output_bboxes_mask=False,
+ crops_nms_thresh=0.7,
+ ):
+ all_scores = []
+ all_masks = []
+ all_boxes = []
+ for model_output in model_outputs:
+ all_scores.append(model_output.pop("iou_scores"))
+ all_masks.extend(model_output.pop("masks"))
+ all_boxes.append(model_output.pop("boxes"))
+
+ all_scores = torch.cat(all_scores)
+ all_boxes = torch.cat(all_boxes)
+ output_masks, iou_scores, rle_mask, bounding_boxes = self.image_processor.post_process_for_mask_generation(
+ all_masks, all_scores, all_boxes, crops_nms_thresh
+ )
+
+ extra = defaultdict(list)
+ for output in model_outputs:
+ for k, v in output.items():
+ extra[k].append(v)
+
+ optional = {}
+ if output_rle_mask:
+ optional["rle_mask"] = rle_mask
+
+ if output_bboxes_mask:
+ optional["bounding_boxes"] = bounding_boxes
+
+ return {"masks": output_masks, "scores": iou_scores, **optional, **extra}
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/object_detection.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/object_detection.py
new file mode 100644
index 0000000000000000000000000000000000000000..c84f17b2bd6ad0ac2bbbe95a3421e7197a5744c6
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/object_detection.py
@@ -0,0 +1,191 @@
+from typing import Any, Dict, List, Union
+
+from ..utils import add_end_docstrings, is_torch_available, is_vision_available, logging, requires_backends
+from .base import Pipeline, build_pipeline_init_args
+
+
+if is_vision_available():
+ from ..image_utils import load_image
+
+
+if is_torch_available():
+ import torch
+
+ from ..models.auto.modeling_auto import (
+ MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES,
+ MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES,
+ )
+
+logger = logging.get_logger(__name__)
+
+
+Prediction = Dict[str, Any]
+Predictions = List[Prediction]
+
+
+@add_end_docstrings(build_pipeline_init_args(has_image_processor=True))
+class ObjectDetectionPipeline(Pipeline):
+ """
+ Object detection pipeline using any `AutoModelForObjectDetection`. This pipeline predicts bounding boxes of objects
+ and their classes.
+
+ Example:
+
+ ```python
+ >>> from transformers import pipeline
+
+ >>> detector = pipeline(model="facebook/detr-resnet-50")
+ >>> detector("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png")
+ [{'score': 0.997, 'label': 'bird', 'box': {'xmin': 69, 'ymin': 171, 'xmax': 396, 'ymax': 507}}, {'score': 0.999, 'label': 'bird', 'box': {'xmin': 398, 'ymin': 105, 'xmax': 767, 'ymax': 507}}]
+
+ >>> # x, y are expressed relative to the top left hand corner.
+ ```
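+
+ A hedged sketch (not from the original docstring): raise the score threshold to keep only very confident detections.
+
+ ```python
+ >>> detector(
+ ...     "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png",
+ ...     threshold=0.99,
+ ... )
+ ```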
+
+ Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)
+
+ This object detection pipeline can currently be loaded from [`pipeline`] using the following task identifier:
+ `"object-detection"`.
+
+ See the list of available models on [huggingface.co/models](https://huggingface.co/models?filter=object-detection).
+ """
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+
+ if self.framework == "tf":
+ raise ValueError(f"The {self.__class__} is only available in PyTorch.")
+
+ requires_backends(self, "vision")
+ mapping = MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES.copy()
+ mapping.update(MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES)
+ self.check_model_type(mapping)
+
+ def _sanitize_parameters(self, **kwargs):
+ preprocess_params = {}
+ if "timeout" in kwargs:
+ preprocess_params["timeout"] = kwargs["timeout"]
+ postprocess_kwargs = {}
+ if "threshold" in kwargs:
+ postprocess_kwargs["threshold"] = kwargs["threshold"]
+ return preprocess_params, {}, postprocess_kwargs
+
+ def __call__(self, *args, **kwargs) -> Union[Predictions, List[Prediction]]:
+ """
+ Detect objects (bounding boxes & classes) in the image(s) passed as inputs.
+
+ Args:
+ inputs (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`):
+ The pipeline handles three types of images:
+
+ - A string containing an HTTP(S) link pointing to an image
+ - A string containing a local path to an image
+ - An image loaded in PIL directly
+
+ The pipeline accepts either a single image or a batch of images. Images in a batch must all be in the
+ same format: all as HTTP(S) links, all as local paths, or all as PIL images.
+ threshold (`float`, *optional*, defaults to 0.5):
+ The minimum probability required for a detection to be kept.
+ timeout (`float`, *optional*, defaults to None):
+ The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
+ the call may block forever.
+
+ Return:
+ A list of dictionaries or a list of lists of dictionaries containing the result. If the input is a single
+ image, a list of dictionaries is returned; if the input is a list of several images, a list of lists of
+ dictionaries is returned, one list per image.
+
+ The dictionaries contain the following keys:
+
+ - **label** (`str`) -- The class label identified by the model.
+ - **score** (`float`) -- The score attributed by the model for that label.
+ - **box** (`Dict[str, int]`) -- The bounding box of the detected object, in the image's original size.
+ """
+ # After this deprecation is completed, remove support for the legacy `images` kwarg
+ if "images" in kwargs and "inputs" not in kwargs:
+ kwargs["inputs"] = kwargs.pop("images")
+ return super().__call__(*args, **kwargs)
+
+ def preprocess(self, image, timeout=None):
+ image = load_image(image, timeout=timeout)
+ target_size = torch.IntTensor([[image.height, image.width]])
+ inputs = self.image_processor(images=[image], return_tensors="pt")
+ if self.framework == "pt":
+ inputs = inputs.to(self.torch_dtype)
+ if self.tokenizer is not None:
+ inputs = self.tokenizer(text=inputs["words"], boxes=inputs["boxes"], return_tensors="pt")
+ inputs["target_size"] = target_size
+ return inputs
+
+ def _forward(self, model_inputs):
+ target_size = model_inputs.pop("target_size")
+ outputs = self.model(**model_inputs)
+ model_outputs = outputs.__class__({"target_size": target_size, **outputs})
+ if self.tokenizer is not None:
+ model_outputs["bbox"] = model_inputs["bbox"]
+ return model_outputs
+
+ def postprocess(self, model_outputs, threshold=0.5):
+ target_size = model_outputs["target_size"]
+ if self.tokenizer is not None:
+ # This is a LayoutLMForTokenClassification variant.
+ # The OCR got the boxes and the model classified the words.
+ height, width = target_size[0].tolist()
+
+ def unnormalize(bbox):
+ return self._get_bounding_box(
+ torch.Tensor(
+ [
+ (width * bbox[0] / 1000),
+ (height * bbox[1] / 1000),
+ (width * bbox[2] / 1000),
+ (height * bbox[3] / 1000),
+ ]
+ )
+ )
+
+ scores, classes = model_outputs["logits"].squeeze(0).softmax(dim=-1).max(dim=-1)
+ labels = [self.model.config.id2label[prediction] for prediction in classes.tolist()]
+ boxes = [unnormalize(bbox) for bbox in model_outputs["bbox"].squeeze(0)]
+ keys = ["score", "label", "box"]
+ annotation = [dict(zip(keys, vals)) for vals in zip(scores.tolist(), labels, boxes) if vals[0] > threshold]
+ else:
+ # This is a regular ForObjectDetectionModel
+ raw_annotations = self.image_processor.post_process_object_detection(model_outputs, threshold, target_size)
+ raw_annotation = raw_annotations[0]
+ scores = raw_annotation["scores"]
+ labels = raw_annotation["labels"]
+ boxes = raw_annotation["boxes"]
+
+ raw_annotation["scores"] = scores.tolist()
+ raw_annotation["labels"] = [self.model.config.id2label[label.item()] for label in labels]
+ raw_annotation["boxes"] = [self._get_bounding_box(box) for box in boxes]
+
+ # {"scores": [...], ...} --> [{"score":x, ...}, ...]
+ keys = ["score", "label", "box"]
+ annotation = [
+ dict(zip(keys, vals))
+ for vals in zip(raw_annotation["scores"], raw_annotation["labels"], raw_annotation["boxes"])
+ ]
+
+ return annotation
+
+ def _get_bounding_box(self, box: "torch.Tensor") -> Dict[str, int]:
+ """
+        Turns list [xmin, ymin, xmax, ymax] into dict { "xmin": xmin, ... }
+
+ Args:
+ box (`torch.Tensor`): Tensor containing the coordinates in corners format.
+
+ Returns:
+ bbox (`Dict[str, int]`): Dict containing the coordinates in corners format.
+ """
+ if self.framework != "pt":
+ raise ValueError("The ObjectDetectionPipeline is only available in PyTorch.")
+ xmin, ymin, xmax, ymax = box.int().tolist()
+ bbox = {
+ "xmin": xmin,
+ "ymin": ymin,
+ "xmax": xmax,
+ "ymax": ymax,
+ }
+ return bbox
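+
+
+def _object_detection_usage_sketch():
+    # Minimal usage sketch (illustration only, not part of the upstream module). The
+    # checkpoint and image URL below are assumptions: any object-detection checkpoint
+    # and any reachable image can be substituted.
+    from transformers import pipeline
+
+    detector = pipeline("object-detection", model="facebook/detr-resnet-50")
+    image_url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+    # `threshold` drops low-confidence boxes, `timeout` bounds the image download.
+    return detector(image_url, threshold=0.9, timeout=30.0)
+    # -> [{'score': ..., 'label': ..., 'box': {'xmin': ..., 'ymin': ..., 'xmax': ..., 'ymax': ...}}, ...]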
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/pt_utils.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/pt_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..19663437cd691efb265770ae007871cafe1275ed
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/pt_utils.py
@@ -0,0 +1,321 @@
+import numpy as np
+import torch
+from torch.utils.data import Dataset, IterableDataset
+
+from ..utils.generic import ModelOutput
+
+
+class PipelineDataset(Dataset):
+ def __init__(self, dataset, process, params):
+ self.dataset = dataset
+ self.process = process
+ self.params = params
+
+ def __len__(self):
+ return len(self.dataset)
+
+ def __getitem__(self, i):
+ item = self.dataset[i]
+ processed = self.process(item, **self.params)
+ return processed
+
+
+class PipelineIterator(IterableDataset):
+ def __init__(self, loader, infer, params, loader_batch_size=None):
+ """
+ Roughly equivalent to
+
+ ```
+ for item in loader:
+ yield infer(item, **params)
+ ```
+
+ Arguments:
+ loader (`torch.utils.data.DataLoader` or `Iterable`):
+ The iterator that will be used to apply `infer` on.
+ infer (any function):
+                The function to apply to each element of `loader`.
+            params (`dict`):
+                The parameters passed to `infer` along with every item.
+ loader_batch_size (`int`, *optional*):
+                If specified, the items of `loader` are expected to come batched, and are unrolled into individual
+                items here, making it roughly behave as
+
+
+ ```
+ for items in loader:
+                for i in range(loader_batch_size):
+ item = items[i]
+ yield infer(item, **params)
+ ```"""
+ self.loader = loader
+ self.infer = infer
+ self.params = params
+ if loader_batch_size == 1:
+ # Let's spare some time by deactivating altogether
+ loader_batch_size = None
+ self.loader_batch_size = loader_batch_size
+
+ # Internal bookkeeping
+ self._loader_batch_index = None
+ self._loader_batch_data = None
+
+ def __len__(self):
+ return len(self.loader)
+
+ def __iter__(self):
+ self.iterator = iter(self.loader)
+ return self
+
+ def loader_batch_item(self):
+ """
+ Return item located at `loader_batch_index` within the current `loader_batch_data`.
+ """
+ if isinstance(self._loader_batch_data, torch.Tensor):
+ # Batch data is simple tensor, just fetch the slice
+ result = self._loader_batch_data[self._loader_batch_index].unsqueeze(0)
+ else:
+ # Batch data is assumed to be BaseModelOutput (or dict)
+ loader_batched = {}
+ for k, element in self._loader_batch_data.items():
+ if isinstance(element, ModelOutput):
+ # Convert ModelOutput to tuple first
+ element = element.to_tuple()
+ if isinstance(element[0], torch.Tensor):
+ loader_batched[k] = tuple(el[self._loader_batch_index].unsqueeze(0) for el in element)
+ elif isinstance(element[0], np.ndarray):
+ loader_batched[k] = tuple(np.expand_dims(el[self._loader_batch_index], 0) for el in element)
+ continue
+ if k in {"hidden_states", "past_key_values", "attentions"} and isinstance(element, tuple):
+ # Those are stored as lists of tensors so need specific unbatching.
+ if isinstance(element[0], torch.Tensor):
+ loader_batched[k] = tuple(el[self._loader_batch_index].unsqueeze(0) for el in element)
+ elif isinstance(element[0], np.ndarray):
+ loader_batched[k] = tuple(np.expand_dims(el[self._loader_batch_index], 0) for el in element)
+ continue
+ if element is None:
+ # This can happen for optional data that get passed around
+ loader_batched[k] = None
+ elif isinstance(element[self._loader_batch_index], torch.Tensor):
+                    # Take the correct batch data, but make it look like batch_size=1
+                    # For compatibility with other methods within transformers
+
+ loader_batched[k] = element[self._loader_batch_index].unsqueeze(0)
+ elif isinstance(element[self._loader_batch_index], np.ndarray):
+                    # Take the correct batch data, but make it look like batch_size=1
+                    # For compatibility with other methods within transformers
+ loader_batched[k] = np.expand_dims(element[self._loader_batch_index], 0)
+ else:
+ # This is typically a list, so no need to `unsqueeze`.
+ loader_batched[k] = element[self._loader_batch_index]
+ # Recreate the element by reusing the original class to make it look
+ # batch_size=1
+ result = self._loader_batch_data.__class__(loader_batched)
+ self._loader_batch_index += 1
+ return result
+
+ def __next__(self):
+ if self._loader_batch_index is not None and self._loader_batch_index < self.loader_batch_size:
+ # We are currently unrolling a batch so we just need to return
+ # the current item within a batch
+ return self.loader_batch_item()
+
+ # We're out of items within a batch
+ item = next(self.iterator)
+ processed = self.infer(item, **self.params)
+ # We now have a batch of "inferred things".
+ if self.loader_batch_size is not None:
+ # Try to infer the size of the batch
+ if isinstance(processed, torch.Tensor):
+ first_tensor = processed
+ elif isinstance(processed, tuple):
+ first_tensor = processed[0]
+ else:
+ key = list(processed.keys())[0]
+ first_tensor = processed[key]
+
+ if isinstance(first_tensor, list):
+ observed_batch_size = len(first_tensor)
+ else:
+ observed_batch_size = first_tensor.shape[0]
+ if 0 < observed_batch_size < self.loader_batch_size:
+ # could be last batch so we can't unroll as many
+ # elements.
+ self.loader_batch_size = observed_batch_size
+ # Setting internal index to unwrap the batch
+ self._loader_batch_data = processed[0] if isinstance(processed, tuple) else processed
+ self._loader_batch_index = 0
+ return self.loader_batch_item()
+ else:
+ # We're not unrolling batches
+ return processed
+
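+
+def _pipeline_iterator_sketch():
+    # Minimal sketch (illustration only, not part of the upstream module): shows how
+    # `loader_batch_size` unrolls batched outputs back into per-item outputs. All
+    # names below are made up for the example.
+    loader = [{"input": torch.tensor([[1.0], [2.0]])}, {"input": torch.tensor([[3.0], [4.0]])}]
+
+    def fake_infer(item, scale=10.0):
+        # Stand-in "model" returning a batched dict of tensors.
+        return {"output": item["input"] * scale}
+
+    iterator = PipelineIterator(loader, fake_infer, {"scale": 10.0}, loader_batch_size=2)
+    # Two batches of size 2 are unrolled into four tensors of shape (1, 1).
+    return [out["output"] for out in iterator]
+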
+
+class PipelineChunkIterator(PipelineIterator):
+ def __init__(self, loader, infer, params, loader_batch_size=None):
+ """
+ Roughly equivalent to
+
+ ```
+ for iterator in loader:
+ for item in iterator:
+ yield infer(item, **params)
+ ```
+
+ Arguments:
+ loader (`torch.utils.data.DataLoader` or `Iterable`):
+ The iterator that will be used to apply `infer` on.
+ infer (any function):
+                The function to apply to each element of `loader`.
+            params (`dict`):
+                The parameters passed to `infer` along with every item.
+ """
+ super().__init__(loader, infer, params)
+
+ def __iter__(self):
+ self.iterator = iter(self.loader)
+ self.subiterator = None
+ return self
+
+ def __next__(self):
+ if self.subiterator is None:
+            # Subiterator being None means we haven't started a `preprocess` iterator yet, so start it
+ self.subiterator = self.infer(next(self.iterator), **self.params)
+ try:
+ # Try to return next item
+ processed = next(self.subiterator)
+ except StopIteration:
+            # When a preprocess iterator ends, we can start looking at the next item.
+            # ChunkIterator will keep feeding until ALL elements of the iterator
+            # have created their subiterator and have been iterated over.
+ #
+ # Another way to look at it, is we're basically flattening lists of lists
+ # into a single list, but with generators
+ self.subiterator = self.infer(next(self.iterator), **self.params)
+ processed = next(self.subiterator)
+ return processed
+
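+
+def _pipeline_chunk_iterator_sketch():
+    # Minimal sketch (illustration only, not part of the upstream module): the chunk
+    # iterator flattens a generator-of-generators, which is how `ChunkPipeline`
+    # spreads one input over several forward passes.
+    loader = ["ab", "cde"]
+
+    def fake_infer(item, prefix=""):
+        for char in item:
+            yield prefix + char
+
+    iterator = PipelineChunkIterator(loader, fake_infer, {"prefix": ">"})
+    return list(iterator)  # -> ['>a', '>b', '>c', '>d', '>e']
+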
+
+class PipelinePackIterator(PipelineIterator):
+ """
+ Roughly equivalent to
+
+ ```
+ packed = []
+ for item in loader:
+ packed.append(item)
+ if item["is_last"]:
+ yield packed
+ packed = []
+ ```
+
+    but it also handles cases where `item` is batched (meaning it's a dict of Tensors with first dimension > 1). In
+    that case it does
+
+ ```
+ packed = []
+ for batch in loader:
+ # item is batched
+ for item in batch:
+ packed.append(item)
+ if item["is_last"]:
+ yield packed
+ packed = []
+ ```
+
+ Arguments:
+ loader (`torch.utils.data.DataLoader` or `Iterable`):
+ The iterator that will be used to apply `infer` on.
+ infer (any function):
+            The function to apply to each element of `loader`.
+        params (`dict`):
+            The parameters passed to `infer` along with every item.
+ loader_batch_size (`int`, *optional*):
+            If specified, the items of `loader` are expected to come batched, and are unrolled into individual items
+            here, making it roughly behave as
+
+
+ ```
+ for items in loader:
+        for i in range(loader_batch_size):
+ item = items[i]
+ yield infer(item, **params)
+ ```"""
+
+ def __iter__(self):
+ self.iterator = iter(self.loader)
+ return self
+
+ def __next__(self):
+ # Extremely similar to PipelineIterator in its unpacking mechanism
+ # BUT, we have an extra required item which is the presence of `is_last`
+        # That is because everything is flattened by `PipelineChunkIterator`, and we
+        # need to keep track of how to regroup here in the original `process`
+        # boundaries so that `process` and `postprocess` see the same data.
+
+        # This iterator accumulates items (possibly while unbatching) until it
+        # hits an `is_last` and then just passes it on to the caller.
+ is_last = False
+ accumulator = []
+ if self._loader_batch_index is not None and self._loader_batch_index < self.loader_batch_size:
+ while self._loader_batch_index < self.loader_batch_size:
+ item = self.loader_batch_item()
+ is_last = item.pop("is_last")
+ accumulator.append(item)
+ if is_last:
+ return accumulator
+
+ while not is_last:
+ processed = self.infer(next(self.iterator), **self.params)
+ if self.loader_batch_size is not None:
+ if isinstance(processed, torch.Tensor):
+ first_tensor = processed
+ else:
+ key = list(processed.keys())[0]
+ first_tensor = processed[key]
+ if isinstance(first_tensor, list):
+ observed_batch_size = len(first_tensor)
+ else:
+ observed_batch_size = first_tensor.shape[0]
+ if 0 < observed_batch_size < self.loader_batch_size:
+ # could be last batch so we can't unroll as many
+ # elements.
+ self.loader_batch_size = observed_batch_size
+ self._loader_batch_data = processed
+ self._loader_batch_index = 0
+ while self._loader_batch_index < self.loader_batch_size:
+ item = self.loader_batch_item()
+ is_last = item.pop("is_last")
+ accumulator.append(item)
+ if is_last:
+ return accumulator
+ else:
+ item = processed
+ is_last = item.pop("is_last")
+ accumulator.append(item)
+ return accumulator
+
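+
+def _pipeline_pack_iterator_sketch():
+    # Minimal sketch (illustration only, not part of the upstream module): items are
+    # regrouped on the `is_last` marker so that one logical input yields one packed list.
+    loader = [
+        {"value": 1, "is_last": False},
+        {"value": 2, "is_last": True},
+        {"value": 3, "is_last": True},
+    ]
+
+    def fake_infer(item):
+        return dict(item)
+
+    iterator = PipelinePackIterator(loader, fake_infer, {})
+    return list(iterator)  # -> [[{'value': 1}, {'value': 2}], [{'value': 3}]]
+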
+
+class KeyDataset(Dataset):
+ def __init__(self, dataset: Dataset, key: str):
+ self.dataset = dataset
+ self.key = key
+
+ def __len__(self):
+ return len(self.dataset)
+
+ def __getitem__(self, i):
+ return self.dataset[i][self.key]
+
+
+class KeyPairDataset(Dataset):
+ def __init__(self, dataset: Dataset, key1: str, key2: str):
+ self.dataset = dataset
+ self.key1 = key1
+ self.key2 = key2
+
+ def __len__(self):
+ return len(self.dataset)
+
+ def __getitem__(self, i):
+ return {"text": self.dataset[i][self.key1], "text_pair": self.dataset[i][self.key2]}
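+
+
+def _key_dataset_sketch():
+    # Minimal sketch (illustration only, not part of the upstream module): a plain list of
+    # dicts stands in for a `datasets.Dataset`. KeyDataset exposes one column, while
+    # KeyPairDataset pairs two columns as `text`/`text_pair` for sentence-pair pipelines.
+    rows = [
+        {"premise": "It rains", "hypothesis": "The ground is wet"},
+        {"premise": "It rains", "hypothesis": "The sun is shining"},
+    ]
+    premises = KeyDataset(rows, "premise")
+    pairs = KeyPairDataset(rows, "premise", "hypothesis")
+    return premises[0], pairs[0]
+    # -> ('It rains', {'text': 'It rains', 'text_pair': 'The ground is wet'})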
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/question_answering.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/question_answering.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b876eefc492793087871602f51fcd6fb55f5244
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/question_answering.py
@@ -0,0 +1,682 @@
+import inspect
+import types
+import warnings
+from collections.abc import Iterable
+from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+
+from ..data import SquadExample, SquadFeatures, squad_convert_examples_to_features
+from ..modelcard import ModelCard
+from ..tokenization_utils import PreTrainedTokenizer
+from ..utils import (
+ PaddingStrategy,
+ add_end_docstrings,
+ is_tf_available,
+ is_tokenizers_available,
+ is_torch_available,
+ logging,
+)
+from .base import ArgumentHandler, ChunkPipeline, build_pipeline_init_args
+
+
+logger = logging.get_logger(__name__)
+
+if TYPE_CHECKING:
+ from ..modeling_tf_utils import TFPreTrainedModel
+ from ..modeling_utils import PreTrainedModel
+
+ if is_tokenizers_available():
+ import tokenizers
+
+if is_tf_available():
+ import tensorflow as tf
+
+ from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES
+
+ Dataset = None
+
+if is_torch_available():
+ import torch
+ from torch.utils.data import Dataset
+
+ from ..models.auto.modeling_auto import MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES
+
+
+def decode_spans(
+ start: np.ndarray, end: np.ndarray, topk: int, max_answer_len: int, undesired_tokens: np.ndarray
+) -> Tuple:
+ """
+    Take the output of any `ModelForQuestionAnswering` and generate probabilities for each span to be the actual
+    answer.
+
+    In addition, it filters out some unwanted/impossible cases like the answer length being greater than
+    max_answer_len or the answer end position being before the starting position. The method supports outputting the
+    k-best answers through the topk argument.
+
+ Args:
+ start (`np.ndarray`): Individual start probabilities for each token.
+ end (`np.ndarray`): Individual end probabilities for each token.
+ topk (`int`): Indicates how many possible answer span(s) to extract from the model output.
+ max_answer_len (`int`): Maximum size of the answer to extract from the model's output.
+ undesired_tokens (`np.ndarray`): Mask determining tokens that can be part of the answer
+ """
+ # Ensure we have batch axis
+ if start.ndim == 1:
+ start = start[None]
+
+ if end.ndim == 1:
+ end = end[None]
+
+ # Compute the score of each tuple(start, end) to be the real answer
+ outer = np.matmul(np.expand_dims(start, -1), np.expand_dims(end, 1))
+
+ # Remove candidate with end < start and end - start > max_answer_len
+ candidates = np.tril(np.triu(outer), max_answer_len - 1)
+
+ # Inspired by Chen & al. (https://github.com/facebookresearch/DrQA)
+ scores_flat = candidates.flatten()
+ if topk == 1:
+ idx_sort = [np.argmax(scores_flat)]
+ elif len(scores_flat) < topk:
+ idx_sort = np.argsort(-scores_flat)
+ else:
+ idx = np.argpartition(-scores_flat, topk)[0:topk]
+ idx_sort = idx[np.argsort(-scores_flat[idx])]
+
+ starts, ends = np.unravel_index(idx_sort, candidates.shape)[1:]
+ desired_spans = np.isin(starts, undesired_tokens.nonzero()) & np.isin(ends, undesired_tokens.nonzero())
+ starts = starts[desired_spans]
+ ends = ends[desired_spans]
+ scores = candidates[0, starts, ends]
+
+ return starts, ends, scores
+
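+
+def _decode_spans_sketch():
+    # Minimal sketch (illustration only, not part of the upstream module): toy start/end
+    # probabilities over 4 tokens, where token 0 (e.g. a special token) is excluded from
+    # the answer through `undesired_tokens`.
+    start = np.array([0.0, 0.1, 0.8, 0.1])
+    end = np.array([0.0, 0.1, 0.2, 0.7])
+    allowed = np.array([0, 1, 1, 1])  # 1 = token may be part of the answer
+    starts, ends, scores = decode_spans(start, end, topk=1, max_answer_len=3, undesired_tokens=allowed)
+    return starts, ends, scores  # best span covers tokens 2..3 with score ~0.56
+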
+
+def select_starts_ends(
+ start,
+ end,
+ p_mask,
+ attention_mask,
+ min_null_score=1000000,
+ top_k=1,
+ handle_impossible_answer=False,
+ max_answer_len=15,
+):
+ """
+ Takes the raw output of any `ModelForQuestionAnswering` and first normalizes its outputs and then uses
+ `decode_spans()` to generate probabilities for each span to be the actual answer.
+
+ Args:
+ start (`np.ndarray`): Individual start logits for each token.
+ end (`np.ndarray`): Individual end logits for each token.
+ p_mask (`np.ndarray`): A mask with 1 for values that cannot be in the answer
+ attention_mask (`np.ndarray`): The attention mask generated by the tokenizer
+        min_null_score (`float`): The minimum null (empty) answer score seen so far.
+        top_k (`int`): Indicates how many possible answer span(s) to extract from the model output.
+        handle_impossible_answer (`bool`): Whether to allow null (empty) answers.
+ max_answer_len (`int`): Maximum size of the answer to extract from the model's output.
+ """
+ # Ensure padded tokens & question tokens cannot belong to the set of candidate answers.
+ undesired_tokens = np.abs(np.array(p_mask) - 1)
+
+ if attention_mask is not None:
+ undesired_tokens = undesired_tokens & attention_mask
+
+ # Generate mask
+ undesired_tokens_mask = undesired_tokens == 0.0
+
+ # Make sure non-context indexes in the tensor cannot contribute to the softmax
+ start = np.where(undesired_tokens_mask, -10000.0, start)
+ end = np.where(undesired_tokens_mask, -10000.0, end)
+
+ # Normalize logits and spans to retrieve the answer
+ start = np.exp(start - start.max(axis=-1, keepdims=True))
+ start = start / start.sum()
+
+ end = np.exp(end - end.max(axis=-1, keepdims=True))
+ end = end / end.sum()
+
+ if handle_impossible_answer:
+ min_null_score = min(min_null_score, (start[0, 0] * end[0, 0]).item())
+
+ # Mask CLS
+ start[0, 0] = end[0, 0] = 0.0
+
+ starts, ends, scores = decode_spans(start, end, top_k, max_answer_len, undesired_tokens)
+ return starts, ends, scores, min_null_score
+
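+
+def _select_starts_ends_sketch():
+    # Minimal sketch (illustration only, not part of the upstream module): raw logits for a
+    # 5-token input where the first two tokens are question/special tokens (p_mask == 1) and
+    # therefore cannot be part of the answer.
+    start_logits = np.array([[0.1, 0.2, 2.0, 0.3, 0.1]])
+    end_logits = np.array([[0.1, 0.1, 0.2, 2.5, 0.3]])
+    p_mask = np.array([[1, 1, 0, 0, 0]])
+    attention_mask = np.array([[1, 1, 1, 1, 1]])
+    starts, ends, scores, min_null_score = select_starts_ends(
+        start_logits, end_logits, p_mask, attention_mask, top_k=1, max_answer_len=3
+    )
+    return starts, ends, scores, min_null_score  # best span covers tokens 2..3
+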
+
+class QuestionAnsweringArgumentHandler(ArgumentHandler):
+ """
+ QuestionAnsweringPipeline requires the user to provide multiple arguments (i.e. question & context) to be mapped to
+ internal [`SquadExample`].
+
+    QuestionAnsweringArgumentHandler manages all the possible ways to create a [`SquadExample`] from the
+    command-line supplied arguments.
+ """
+
+ def normalize(self, item):
+ if isinstance(item, SquadExample):
+ return item
+ elif isinstance(item, dict):
+ for k in ["question", "context"]:
+ if k not in item:
+ raise KeyError("You need to provide a dictionary with keys {question:..., context:...}")
+ elif item[k] is None:
+ raise ValueError(f"`{k}` cannot be None")
+ elif isinstance(item[k], str) and len(item[k]) == 0:
+ raise ValueError(f"`{k}` cannot be empty")
+
+ return QuestionAnsweringPipeline.create_sample(**item)
+ raise ValueError(f"{item} argument needs to be of type (SquadExample, dict)")
+
+ def __call__(self, *args, **kwargs):
+ # Detect where the actual inputs are
+ if args is not None and len(args) > 0:
+ if len(args) == 1:
+ inputs = args[0]
+ elif len(args) == 2 and {type(el) for el in args} == {str}:
+ inputs = [{"question": args[0], "context": args[1]}]
+ else:
+ inputs = list(args)
+ # Generic compatibility with sklearn and Keras
+ # Batched data
+ elif "X" in kwargs:
+ warnings.warn(
+ "Passing the `X` argument to the pipeline is deprecated and will be removed in v5. Inputs should be passed using the `question` and `context` keyword arguments instead.",
+ FutureWarning,
+ )
+ inputs = kwargs["X"]
+ elif "data" in kwargs:
+ warnings.warn(
+ "Passing the `data` argument to the pipeline is deprecated and will be removed in v5. Inputs should be passed using the `question` and `context` keyword arguments instead.",
+ FutureWarning,
+ )
+ inputs = kwargs["data"]
+ elif "question" in kwargs and "context" in kwargs:
+ if isinstance(kwargs["question"], list) and isinstance(kwargs["context"], str):
+ inputs = [{"question": Q, "context": kwargs["context"]} for Q in kwargs["question"]]
+ elif isinstance(kwargs["question"], list) and isinstance(kwargs["context"], list):
+ if len(kwargs["question"]) != len(kwargs["context"]):
+ raise ValueError("Questions and contexts don't have the same lengths")
+
+ inputs = [{"question": Q, "context": C} for Q, C in zip(kwargs["question"], kwargs["context"])]
+ elif isinstance(kwargs["question"], str) and isinstance(kwargs["context"], str):
+ inputs = [{"question": kwargs["question"], "context": kwargs["context"]}]
+ else:
+ raise ValueError("Arguments can't be understood")
+ else:
+ raise ValueError(f"Unknown arguments {kwargs}")
+
+        # When the user is sending a generator, we need to trust it's a valid example
+ generator_types = (types.GeneratorType, Dataset) if Dataset is not None else (types.GeneratorType,)
+ if isinstance(inputs, generator_types):
+ return inputs
+
+ # Normalize inputs
+ if isinstance(inputs, dict):
+ inputs = [inputs]
+ elif isinstance(inputs, Iterable):
+ # Copy to avoid overriding arguments
+ inputs = list(inputs)
+ else:
+ raise ValueError(f"Invalid arguments {kwargs}")
+
+ for i, item in enumerate(inputs):
+ inputs[i] = self.normalize(item)
+
+ return inputs
+
+
+@add_end_docstrings(build_pipeline_init_args(has_tokenizer=True))
+class QuestionAnsweringPipeline(ChunkPipeline):
+ """
+ Question Answering pipeline using any `ModelForQuestionAnswering`. See the [question answering
+ examples](../task_summary#question-answering) for more information.
+
+ Example:
+
+ ```python
+ >>> from transformers import pipeline
+
+ >>> oracle = pipeline(model="deepset/roberta-base-squad2")
+ >>> oracle(question="Where do I live?", context="My name is Wolfgang and I live in Berlin")
+ {'score': 0.9191, 'start': 34, 'end': 40, 'answer': 'Berlin'}
+ ```
+
+ Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)
+
+ This question answering pipeline can currently be loaded from [`pipeline`] using the following task identifier:
+ `"question-answering"`.
+
+ The models that this pipeline can use are models that have been fine-tuned on a question answering task. See the
+ up-to-date list of available models on
+ [huggingface.co/models](https://huggingface.co/models?filter=question-answering).
+ """
+
+ default_input_names = "question,context"
+ handle_impossible_answer = False
+
+ def __init__(
+ self,
+ model: Union["PreTrainedModel", "TFPreTrainedModel"],
+ tokenizer: PreTrainedTokenizer,
+ modelcard: Optional[ModelCard] = None,
+ framework: Optional[str] = None,
+ task: str = "",
+ **kwargs,
+ ):
+ super().__init__(
+ model=model,
+ tokenizer=tokenizer,
+ modelcard=modelcard,
+ framework=framework,
+ task=task,
+ **kwargs,
+ )
+
+ self._args_parser = QuestionAnsweringArgumentHandler()
+ self.check_model_type(
+ TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES
+ if self.framework == "tf"
+ else MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES
+ )
+
+ @staticmethod
+ def create_sample(
+ question: Union[str, List[str]], context: Union[str, List[str]]
+ ) -> Union[SquadExample, List[SquadExample]]:
+ """
+        QuestionAnsweringPipeline leverages the [`SquadExample`] internally. This helper method encapsulates all the
+        logic for converting question(s) and context(s) to [`SquadExample`].
+
+ We currently support extractive question answering.
+
+ Arguments:
+ question (`str` or `List[str]`): The question(s) asked.
+ context (`str` or `List[str]`): The context(s) in which we will look for the answer.
+
+ Returns:
+ One or a list of [`SquadExample`]: The corresponding [`SquadExample`] grouping question and context.
+ """
+ if isinstance(question, list):
+ return [SquadExample(None, q, c, None, None, None) for q, c in zip(question, context)]
+ else:
+ return SquadExample(None, question, context, None, None, None)
+
+ def _sanitize_parameters(
+ self,
+ padding=None,
+ topk=None,
+ top_k=None,
+ doc_stride=None,
+ max_answer_len=None,
+ max_seq_len=None,
+ max_question_len=None,
+ handle_impossible_answer=None,
+ align_to_words=None,
+ **kwargs,
+ ):
+ # Set defaults values
+ preprocess_params = {}
+ if padding is not None:
+ preprocess_params["padding"] = padding
+ if doc_stride is not None:
+ preprocess_params["doc_stride"] = doc_stride
+ if max_question_len is not None:
+ preprocess_params["max_question_len"] = max_question_len
+ if max_seq_len is not None:
+ preprocess_params["max_seq_len"] = max_seq_len
+
+ postprocess_params = {}
+ if topk is not None and top_k is None:
+ warnings.warn("topk parameter is deprecated, use top_k instead", UserWarning)
+ top_k = topk
+ if top_k is not None:
+ if top_k < 1:
+ raise ValueError(f"top_k parameter should be >= 1 (got {top_k})")
+ postprocess_params["top_k"] = top_k
+        if max_answer_len is not None:
+            if max_answer_len < 1:
+                raise ValueError(f"max_answer_len parameter should be >= 1 (got {max_answer_len})")
+            postprocess_params["max_answer_len"] = max_answer_len
+ if handle_impossible_answer is not None:
+ postprocess_params["handle_impossible_answer"] = handle_impossible_answer
+ if align_to_words is not None:
+ postprocess_params["align_to_words"] = align_to_words
+ return preprocess_params, {}, postprocess_params
+
+ def __call__(self, *args, **kwargs):
+ """
+ Answer the question(s) given as inputs by using the context(s).
+
+ Args:
+ question (`str` or `List[str]`):
+ One or several question(s) (must be used in conjunction with the `context` argument).
+ context (`str` or `List[str]`):
+ One or several context(s) associated with the question(s) (must be used in conjunction with the
+ `question` argument).
+ top_k (`int`, *optional*, defaults to 1):
+                The number of answers to return (will be chosen by order of likelihood). Note that we return fewer
+                than top_k answers if there are not enough options available within the context.
+ doc_stride (`int`, *optional*, defaults to 128):
+ If the context is too long to fit with the question for the model, it will be split in several chunks
+ with some overlap. This argument controls the size of that overlap.
+ max_answer_len (`int`, *optional*, defaults to 15):
+ The maximum length of predicted answers (e.g., only answers with a shorter length are considered).
+ max_seq_len (`int`, *optional*, defaults to 384):
+ The maximum length of the total sentence (context + question) in tokens of each chunk passed to the
+ model. The context will be split in several chunks (using `doc_stride` as overlap) if needed.
+ max_question_len (`int`, *optional*, defaults to 64):
+ The maximum length of the question after tokenization. It will be truncated if needed.
+ handle_impossible_answer (`bool`, *optional*, defaults to `False`):
+ Whether or not we accept impossible as an answer.
+ align_to_words (`bool`, *optional*, defaults to `True`):
+ Attempts to align the answer to real words. Improves quality on space separated languages. Might hurt on
+ non-space-separated languages (like Japanese or Chinese)
+
+ Return:
+ A `dict` or a list of `dict`: Each result comes as a dictionary with the following keys:
+
+ - **score** (`float`) -- The probability associated to the answer.
+ - **start** (`int`) -- The character start index of the answer (in the tokenized version of the input).
+ - **end** (`int`) -- The character end index of the answer (in the tokenized version of the input).
+ - **answer** (`str`) -- The answer to the question.
+ """
+
+ # Convert inputs to features
+ if args:
+ warnings.warn(
+ "Passing a list of SQuAD examples to the pipeline is deprecated and will be removed in v5. Inputs should be passed using the `question` and `context` keyword arguments instead.",
+ FutureWarning,
+ )
+
+ examples = self._args_parser(*args, **kwargs)
+ if isinstance(examples, (list, tuple)) and len(examples) == 1:
+ return super().__call__(examples[0], **kwargs)
+ return super().__call__(examples, **kwargs)
+
+ def preprocess(self, example, padding="do_not_pad", doc_stride=None, max_question_len=64, max_seq_len=None):
+        # XXX: This is special, args_parser will not handle anything generator or dataset like
+ # For those we expect user to send a simple valid example either directly as a SquadExample or simple dict.
+ # So we still need a little sanitation here.
+ if isinstance(example, dict):
+ example = SquadExample(None, example["question"], example["context"], None, None, None)
+
+ if max_seq_len is None:
+ max_seq_len = min(self.tokenizer.model_max_length, 384)
+ if doc_stride is None:
+ doc_stride = min(max_seq_len // 2, 128)
+
+ if doc_stride > max_seq_len:
+ raise ValueError(f"`doc_stride` ({doc_stride}) is larger than `max_seq_len` ({max_seq_len})")
+
+ if not self.tokenizer.is_fast:
+ features = squad_convert_examples_to_features(
+ examples=[example],
+ tokenizer=self.tokenizer,
+ max_seq_length=max_seq_len,
+ doc_stride=doc_stride,
+ max_query_length=max_question_len,
+ padding_strategy=PaddingStrategy.MAX_LENGTH,
+ is_training=False,
+ tqdm_enabled=False,
+ )
+ else:
+ # Define the side we want to truncate / pad and the text/pair sorting
+ question_first = self.tokenizer.padding_side == "right"
+
+ encoded_inputs = self.tokenizer(
+ text=example.question_text if question_first else example.context_text,
+ text_pair=example.context_text if question_first else example.question_text,
+ padding=padding,
+ truncation="only_second" if question_first else "only_first",
+ max_length=max_seq_len,
+ stride=doc_stride,
+ return_token_type_ids=True,
+ return_overflowing_tokens=True,
+ return_offsets_mapping=True,
+ return_special_tokens_mask=True,
+ )
+            # When the input is too long, it's converted into a batch of inputs with overflowing tokens
+            # and a stride of overlap between the inputs. If a batch of inputs is given, a special output
+            # "overflow_to_sample_mapping" indicates which member of the encoded batch belongs to which original batch sample.
+ # Here we tokenize examples one-by-one so we don't need to use "overflow_to_sample_mapping".
+ # "num_span" is the number of output samples generated from the overflowing tokens.
+ num_spans = len(encoded_inputs["input_ids"])
+
+            # p_mask: mask with 1 for tokens that cannot be in the answer (0 for tokens which can be in an answer)
+ # We put 0 on the tokens from the context and 1 everywhere else (question and special tokens)
+ p_mask = [
+ [tok != 1 if question_first else 0 for tok in encoded_inputs.sequence_ids(span_id)]
+ for span_id in range(num_spans)
+ ]
+
+ features = []
+ for span_idx in range(num_spans):
+ input_ids_span_idx = encoded_inputs["input_ids"][span_idx]
+ attention_mask_span_idx = (
+ encoded_inputs["attention_mask"][span_idx] if "attention_mask" in encoded_inputs else None
+ )
+ token_type_ids_span_idx = (
+ encoded_inputs["token_type_ids"][span_idx] if "token_type_ids" in encoded_inputs else None
+ )
+ # keep the cls_token unmasked (some models use it to indicate unanswerable questions)
+ if self.tokenizer.cls_token_id is not None:
+ cls_indices = np.nonzero(np.array(input_ids_span_idx) == self.tokenizer.cls_token_id)[0]
+ for cls_index in cls_indices:
+ p_mask[span_idx][cls_index] = 0
+ submask = p_mask[span_idx]
+ features.append(
+ SquadFeatures(
+ input_ids=input_ids_span_idx,
+ attention_mask=attention_mask_span_idx,
+ token_type_ids=token_type_ids_span_idx,
+ p_mask=submask,
+ encoding=encoded_inputs[span_idx],
+ # We don't use the rest of the values - and actually
+ # for Fast tokenizer we could totally avoid using SquadFeatures and SquadExample
+ cls_index=None,
+ token_to_orig_map={},
+ example_index=0,
+ unique_id=0,
+ paragraph_len=0,
+ token_is_max_context=0,
+ tokens=[],
+ start_position=0,
+ end_position=0,
+ is_impossible=False,
+ qas_id=None,
+ )
+ )
+
+ for i, feature in enumerate(features):
+ fw_args = {}
+ others = {}
+ model_input_names = self.tokenizer.model_input_names + ["p_mask", "token_type_ids"]
+
+ for k, v in feature.__dict__.items():
+ if k in model_input_names:
+ if self.framework == "tf":
+ tensor = tf.constant(v)
+ if tensor.dtype == tf.int64:
+ tensor = tf.cast(tensor, tf.int32)
+ fw_args[k] = tf.expand_dims(tensor, 0)
+ elif self.framework == "pt":
+ tensor = torch.tensor(v)
+ if tensor.dtype == torch.int32:
+ tensor = tensor.long()
+ fw_args[k] = tensor.unsqueeze(0)
+ else:
+ others[k] = v
+
+ is_last = i == len(features) - 1
+ yield {"example": example, "is_last": is_last, **fw_args, **others}
+
+ def _forward(self, inputs):
+ example = inputs["example"]
+ model_inputs = {k: inputs[k] for k in self.tokenizer.model_input_names}
+        # `XXXForQuestionAnswering` models should not use `use_cache=True` even if it's supported
+ model_forward = self.model.forward if self.framework == "pt" else self.model.call
+ if "use_cache" in inspect.signature(model_forward).parameters.keys():
+ model_inputs["use_cache"] = False
+ output = self.model(**model_inputs)
+ if isinstance(output, dict):
+ return {"start": output["start_logits"], "end": output["end_logits"], "example": example, **inputs}
+ else:
+ start, end = output[:2]
+ return {"start": start, "end": end, "example": example, **inputs}
+
+ def postprocess(
+ self,
+ model_outputs,
+ top_k=1,
+ handle_impossible_answer=False,
+ max_answer_len=15,
+ align_to_words=True,
+ ):
+ min_null_score = 1000000 # large and positive
+ answers = []
+ for output in model_outputs:
+ if self.framework == "pt" and output["start"].dtype == torch.bfloat16:
+ start_ = output["start"].to(torch.float32)
+ else:
+ start_ = output["start"]
+            if self.framework == "pt" and output["end"].dtype == torch.bfloat16:
+ end_ = output["end"].to(torch.float32)
+ else:
+ end_ = output["end"]
+ example = output["example"]
+ p_mask = output["p_mask"]
+ attention_mask = (
+ output["attention_mask"].numpy() if output.get("attention_mask", None) is not None else None
+ )
+
+ starts, ends, scores, min_null_score = select_starts_ends(
+ start_, end_, p_mask, attention_mask, min_null_score, top_k, handle_impossible_answer, max_answer_len
+ )
+
+ if not self.tokenizer.is_fast:
+ char_to_word = np.array(example.char_to_word_offset)
+
+ # Convert the answer (tokens) back to the original text
+ # Score: score from the model
+ # Start: Index of the first character of the answer in the context string
+ # End: Index of the character following the last character of the answer in the context string
+ # Answer: Plain text of the answer
+ for s, e, score in zip(starts, ends, scores):
+ token_to_orig_map = output["token_to_orig_map"]
+ answers.append(
+ {
+ "score": score.item(),
+ "start": np.where(char_to_word == token_to_orig_map[s])[0][0].item(),
+ "end": np.where(char_to_word == token_to_orig_map[e])[0][-1].item(),
+ "answer": " ".join(example.doc_tokens[token_to_orig_map[s] : token_to_orig_map[e] + 1]),
+ }
+ )
+ else:
+ # Convert the answer (tokens) back to the original text
+ # Score: score from the model
+ # Start: Index of the first character of the answer in the context string
+ # End: Index of the character following the last character of the answer in the context string
+ # Answer: Plain text of the answer
+ question_first = bool(self.tokenizer.padding_side == "right")
+ enc = output["encoding"]
+
+                # Encoding was *not* padded, input_ids *might* be.
+ # It doesn't make a difference unless we're padding on
+ # the left hand side, since now we have different offsets
+ # everywhere.
+ if self.tokenizer.padding_side == "left":
+ offset = (output["input_ids"] == self.tokenizer.pad_token_id).numpy().sum()
+ else:
+ offset = 0
+
+ # Sometimes the max probability token is in the middle of a word so:
+ # - we start by finding the right word containing the token with `token_to_word`
+ # - then we convert this word in a character span with `word_to_chars`
+ sequence_index = 1 if question_first else 0
+ for s, e, score in zip(starts, ends, scores):
+ s = s - offset
+ e = e - offset
+
+ start_index, end_index = self.get_indices(enc, s, e, sequence_index, align_to_words)
+
+ answers.append(
+ {
+ "score": score.item(),
+ "start": start_index,
+ "end": end_index,
+ "answer": example.context_text[start_index:end_index],
+ }
+ )
+
+ if handle_impossible_answer:
+ answers.append({"score": min_null_score, "start": 0, "end": 0, "answer": ""})
+ answers = sorted(answers, key=lambda x: x["score"], reverse=True)[:top_k]
+ if len(answers) == 1:
+ return answers[0]
+ return answers
+
+ def get_indices(
+ self, enc: "tokenizers.Encoding", s: int, e: int, sequence_index: int, align_to_words: bool
+ ) -> Tuple[int, int]:
+ if align_to_words:
+ try:
+ start_word = enc.token_to_word(s)
+ end_word = enc.token_to_word(e)
+ start_index = enc.word_to_chars(start_word, sequence_index=sequence_index)[0]
+ end_index = enc.word_to_chars(end_word, sequence_index=sequence_index)[1]
+ except Exception:
+ # Some tokenizers don't really handle words. Keep to offsets then.
+ start_index = enc.offsets[s][0]
+ end_index = enc.offsets[e][1]
+ else:
+ start_index = enc.offsets[s][0]
+ end_index = enc.offsets[e][1]
+ return start_index, end_index
+
+ def span_to_answer(self, text: str, start: int, end: int) -> Dict[str, Union[str, int]]:
+ """
+ When decoding from token probabilities, this method maps token indexes to actual word in the initial context.
+
+ Args:
+ text (`str`): The actual context to extract the answer from.
+ start (`int`): The answer starting token index.
+ end (`int`): The answer end token index.
+
+ Returns:
+ Dictionary like `{'answer': str, 'start': int, 'end': int}`
+ """
+ words = []
+ token_idx = char_start_idx = char_end_idx = chars_idx = 0
+
+ for i, word in enumerate(text.split(" ")):
+ token = self.tokenizer.tokenize(word)
+
+ # Append words if they are in the span
+ if start <= token_idx <= end:
+ if token_idx == start:
+ char_start_idx = chars_idx
+
+ if token_idx == end:
+ char_end_idx = chars_idx + len(word)
+
+ words += [word]
+
+ # Stop if we went over the end of the answer
+ if token_idx > end:
+ break
+
+ # Append the subtokenization length to the running index
+ token_idx += len(token)
+ chars_idx += len(word) + 1
+
+ # Join text with spaces
+ return {
+ "answer": " ".join(words),
+ "start": max(0, char_start_idx),
+ "end": min(len(text), char_end_idx),
+ }
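+
+
+def _question_answering_usage_sketch():
+    # Minimal usage sketch (illustration only, not part of the upstream module). The
+    # checkpoint is the one used in the class docstring; exact scores vary slightly
+    # between model and library versions.
+    from transformers import pipeline
+
+    qa = pipeline("question-answering", model="deepset/roberta-base-squad2")
+    context = "My name is Wolfgang and I live in Berlin"
+    best = qa(question="Where do I live?", context=context)
+    # `top_k` returns several candidate spans, `max_answer_len` caps the span length.
+    candidates = qa(question="Where do I live?", context=context, top_k=2, max_answer_len=10)
+    return best, candidates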
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/table_question_answering.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/table_question_answering.py
new file mode 100644
index 0000000000000000000000000000000000000000..10ea7170fed40cbc6f14c8b712741ce570fbf3f7
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/table_question_answering.py
@@ -0,0 +1,443 @@
+import collections
+import types
+
+import numpy as np
+
+from ..utils import (
+ add_end_docstrings,
+ is_tf_available,
+ is_torch_available,
+ requires_backends,
+)
+from .base import ArgumentHandler, Dataset, Pipeline, PipelineException, build_pipeline_init_args
+
+
+if is_torch_available():
+ import torch
+
+ from ..models.auto.modeling_auto import (
+ MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES,
+ MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING_NAMES,
+ )
+
+if is_tf_available():
+ import tensorflow as tf
+
+ from ..models.auto.modeling_tf_auto import (
+ TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES,
+ TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING_NAMES,
+ )
+
+
+class TableQuestionAnsweringArgumentHandler(ArgumentHandler):
+ """
+ Handles arguments for the TableQuestionAnsweringPipeline
+ """
+
+ def __call__(self, table=None, query=None, **kwargs):
+ # Returns tqa_pipeline_inputs of shape:
+ # [
+ # {"table": pd.DataFrame, "query": List[str]},
+ # ...,
+ # {"table": pd.DataFrame, "query" : List[str]}
+ # ]
+ requires_backends(self, "pandas")
+ import pandas as pd
+
+ if table is None:
+ raise ValueError("Keyword argument `table` cannot be None.")
+ elif query is None:
+ if isinstance(table, dict) and table.get("query") is not None and table.get("table") is not None:
+ tqa_pipeline_inputs = [table]
+ elif isinstance(table, list) and len(table) > 0:
+ if not all(isinstance(d, dict) for d in table):
+ raise ValueError(
+                        f"Keyword argument `table` should be a list of dict, but is {[type(d) for d in table]}"
+ )
+
+ if table[0].get("query") is not None and table[0].get("table") is not None:
+ tqa_pipeline_inputs = table
+ else:
+ raise ValueError(
+ "If keyword argument `table` is a list of dictionaries, each dictionary should have a `table`"
+                        f" and `query` key, but the first dictionary has keys {table[0].keys()}."
+ )
+            elif (Dataset is not None and isinstance(table, Dataset)) or isinstance(table, types.GeneratorType):
+ return table
+ else:
+ raise ValueError(
+ "Invalid input. Keyword argument `table` should be either of type `dict` or `list`, but "
+ f"is {type(table)})"
+ )
+ else:
+ tqa_pipeline_inputs = [{"table": table, "query": query}]
+
+ for tqa_pipeline_input in tqa_pipeline_inputs:
+ if not isinstance(tqa_pipeline_input["table"], pd.DataFrame):
+ if tqa_pipeline_input["table"] is None:
+ raise ValueError("Table cannot be None.")
+
+ tqa_pipeline_input["table"] = pd.DataFrame(tqa_pipeline_input["table"])
+
+ return tqa_pipeline_inputs
+
+
+@add_end_docstrings(build_pipeline_init_args(has_tokenizer=True))
+class TableQuestionAnsweringPipeline(Pipeline):
+ """
+ Table Question Answering pipeline using a `ModelForTableQuestionAnswering`. This pipeline is only available in
+ PyTorch.
+
+ Example:
+
+ ```python
+ >>> from transformers import pipeline
+
+ >>> oracle = pipeline(model="google/tapas-base-finetuned-wtq")
+ >>> table = {
+ ... "Repository": ["Transformers", "Datasets", "Tokenizers"],
+ ... "Stars": ["36542", "4512", "3934"],
+ ... "Contributors": ["651", "77", "34"],
+ ... "Programming language": ["Python", "Python", "Rust, Python and NodeJS"],
+ ... }
+ >>> oracle(query="How many stars does the transformers repository have?", table=table)
+ {'answer': 'AVERAGE > 36542', 'coordinates': [(0, 1)], 'cells': ['36542'], 'aggregator': 'AVERAGE'}
+ ```
+
+ Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)
+
+ This tabular question answering pipeline can currently be loaded from [`pipeline`] using the following task
+ identifier: `"table-question-answering"`.
+
+ The models that this pipeline can use are models that have been fine-tuned on a tabular question answering task.
+ See the up-to-date list of available models on
+ [huggingface.co/models](https://huggingface.co/models?filter=table-question-answering).
+ """
+
+ default_input_names = "table,query"
+
+ def __init__(self, args_parser=TableQuestionAnsweringArgumentHandler(), *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self._args_parser = args_parser
+
+ if self.framework == "tf":
+ mapping = TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING_NAMES.copy()
+ mapping.update(TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES)
+ else:
+ mapping = MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING_NAMES.copy()
+ mapping.update(MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES)
+ self.check_model_type(mapping)
+
+ self.aggregate = bool(getattr(self.model.config, "aggregation_labels", None)) and bool(
+ getattr(self.model.config, "num_aggregation_labels", None)
+ )
+ self.type = "tapas" if hasattr(self.model.config, "aggregation_labels") else None
+
+ def batch_inference(self, **inputs):
+ return self.model(**inputs)
+
+ def sequential_inference(self, **inputs):
+ """
+ Inference used for models that need to process sequences in a sequential fashion, like the SQA models which
+ handle conversational query related to a table.
+        handle conversational queries related to a table.
+ if self.framework == "pt":
+ all_logits = []
+ all_aggregations = []
+ prev_answers = None
+ batch_size = inputs["input_ids"].shape[0]
+
+ input_ids = inputs["input_ids"].to(self.device)
+ attention_mask = inputs["attention_mask"].to(self.device)
+ token_type_ids = inputs["token_type_ids"].to(self.device)
+ token_type_ids_example = None
+
+ for index in range(batch_size):
+ # If sequences have already been processed, the token type IDs will be created according to the previous
+ # answer.
+ if prev_answers is not None:
+ prev_labels_example = token_type_ids_example[:, 3] # shape (seq_len,)
+ model_labels = np.zeros_like(prev_labels_example.cpu().numpy()) # shape (seq_len,)
+
+ token_type_ids_example = token_type_ids[index] # shape (seq_len, 7)
+ for i in range(model_labels.shape[0]):
+ segment_id = token_type_ids_example[:, 0].tolist()[i]
+ col_id = token_type_ids_example[:, 1].tolist()[i] - 1
+ row_id = token_type_ids_example[:, 2].tolist()[i] - 1
+
+ if row_id >= 0 and col_id >= 0 and segment_id == 1:
+ model_labels[i] = int(prev_answers[(col_id, row_id)])
+
+ token_type_ids_example[:, 3] = torch.from_numpy(model_labels).type(torch.long).to(self.device)
+
+ input_ids_example = input_ids[index]
+ attention_mask_example = attention_mask[index] # shape (seq_len,)
+ token_type_ids_example = token_type_ids[index] # shape (seq_len, 7)
+ outputs = self.model(
+ input_ids=input_ids_example.unsqueeze(0),
+ attention_mask=attention_mask_example.unsqueeze(0),
+ token_type_ids=token_type_ids_example.unsqueeze(0),
+ )
+ logits = outputs.logits
+
+ if self.aggregate:
+ all_aggregations.append(outputs.logits_aggregation)
+
+ all_logits.append(logits)
+
+ dist_per_token = torch.distributions.Bernoulli(logits=logits)
+ probabilities = dist_per_token.probs * attention_mask_example.type(torch.float32).to(
+ dist_per_token.probs.device
+ )
+
+ coords_to_probs = collections.defaultdict(list)
+ for i, p in enumerate(probabilities.squeeze().tolist()):
+ segment_id = token_type_ids_example[:, 0].tolist()[i]
+ col = token_type_ids_example[:, 1].tolist()[i] - 1
+ row = token_type_ids_example[:, 2].tolist()[i] - 1
+ if col >= 0 and row >= 0 and segment_id == 1:
+ coords_to_probs[(col, row)].append(p)
+
+ prev_answers = {key: np.array(coords_to_probs[key]).mean() > 0.5 for key in coords_to_probs}
+
+ logits_batch = torch.cat(tuple(all_logits), 0)
+
+ return (logits_batch,) if not self.aggregate else (logits_batch, torch.cat(tuple(all_aggregations), 0))
+ else:
+ all_logits = []
+ all_aggregations = []
+ prev_answers = None
+ batch_size = inputs["input_ids"].shape[0]
+
+ input_ids = inputs["input_ids"]
+ attention_mask = inputs["attention_mask"]
+ token_type_ids = inputs["token_type_ids"].numpy()
+ token_type_ids_example = None
+
+ for index in range(batch_size):
+ # If sequences have already been processed, the token type IDs will be created according to the previous
+ # answer.
+ if prev_answers is not None:
+ prev_labels_example = token_type_ids_example[:, 3] # shape (seq_len,)
+ model_labels = np.zeros_like(prev_labels_example, dtype=np.int32) # shape (seq_len,)
+
+ token_type_ids_example = token_type_ids[index] # shape (seq_len, 7)
+ for i in range(model_labels.shape[0]):
+ segment_id = token_type_ids_example[:, 0].tolist()[i]
+ col_id = token_type_ids_example[:, 1].tolist()[i] - 1
+ row_id = token_type_ids_example[:, 2].tolist()[i] - 1
+
+ if row_id >= 0 and col_id >= 0 and segment_id == 1:
+ model_labels[i] = int(prev_answers[(col_id, row_id)])
+
+ token_type_ids_example[:, 3] = model_labels
+
+ input_ids_example = input_ids[index]
+ attention_mask_example = attention_mask[index] # shape (seq_len,)
+ token_type_ids_example = token_type_ids[index] # shape (seq_len, 7)
+ outputs = self.model(
+ input_ids=np.expand_dims(input_ids_example, axis=0),
+ attention_mask=np.expand_dims(attention_mask_example, axis=0),
+ token_type_ids=np.expand_dims(token_type_ids_example, axis=0),
+ )
+ logits = outputs.logits
+
+ if self.aggregate:
+ all_aggregations.append(outputs.logits_aggregation)
+
+ all_logits.append(logits)
+
+ probabilities = tf.math.sigmoid(tf.cast(logits, tf.float32)) * tf.cast(
+ attention_mask_example, tf.float32
+ )
+
+ coords_to_probs = collections.defaultdict(list)
+ for i, p in enumerate(tf.squeeze(probabilities).numpy().tolist()):
+ segment_id = token_type_ids_example[:, 0].tolist()[i]
+ col = token_type_ids_example[:, 1].tolist()[i] - 1
+ row = token_type_ids_example[:, 2].tolist()[i] - 1
+ if col >= 0 and row >= 0 and segment_id == 1:
+ coords_to_probs[(col, row)].append(p)
+
+ prev_answers = {key: np.array(coords_to_probs[key]).mean() > 0.5 for key in coords_to_probs}
+
+ logits_batch = tf.concat(tuple(all_logits), 0)
+
+ return (logits_batch,) if not self.aggregate else (logits_batch, tf.concat(tuple(all_aggregations), 0))
+
+ def __call__(self, *args, **kwargs):
+ r"""
+ Answers queries according to a table. The pipeline accepts several types of inputs which are detailed below:
+
+ - `pipeline(table, query)`
+ - `pipeline(table, [query])`
+ - `pipeline(table=table, query=query)`
+ - `pipeline(table=table, query=[query])`
+ - `pipeline({"table": table, "query": query})`
+ - `pipeline({"table": table, "query": [query]})`
+ - `pipeline([{"table": table, "query": query}, {"table": table, "query": query}])`
+
+ The `table` argument should be a dict or a DataFrame built from that dict, containing the whole table:
+
+ Example:
+
+ ```python
+ data = {
+ "actors": ["brad pitt", "leonardo di caprio", "george clooney"],
+ "age": ["56", "45", "59"],
+ "number of movies": ["87", "53", "69"],
+ "date of birth": ["7 february 1967", "10 june 1996", "28 november 1967"],
+ }
+ ```
+
+ This dictionary can be passed in as such, or can be converted to a pandas DataFrame:
+
+ Example:
+
+ ```python
+ import pandas as pd
+
+ table = pd.DataFrame.from_dict(data)
+ ```
+
+ Args:
+ table (`pd.DataFrame` or `Dict`):
+ Pandas DataFrame or dictionary that will be converted to a DataFrame containing all the table values.
+ See above for an example of dictionary.
+ query (`str` or `List[str]`):
+ Query or list of queries that will be sent to the model alongside the table.
+ sequential (`bool`, *optional*, defaults to `False`):
+ Whether to do inference sequentially or as a batch. Batching is faster, but models like SQA require the
+ inference to be done sequentially to extract relations within sequences, given their conversational
+ nature.
+ padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
+ Activates and controls padding. Accepts the following values:
+
+                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+                  sequence is provided).
+ - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
+ acceptable input length for the model if that argument is not provided.
+ - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
+ lengths).
+
+ truncation (`bool`, `str` or [`TapasTruncationStrategy`], *optional*, defaults to `False`):
+ Activates and controls truncation. Accepts the following values:
+
+ - `True` or `'drop_rows_to_fit'`: Truncate to a maximum length specified with the argument `max_length`
+ or to the maximum acceptable input length for the model if that argument is not provided. This will
+ truncate row by row, removing rows from the table.
+ - `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths
+ greater than the model maximum admissible input size).
+
+
+ Return:
+ A dictionary or a list of dictionaries containing results: Each result is a dictionary with the following
+ keys:
+
+ - **answer** (`str`) -- The answer of the query given the table. If there is an aggregator, the answer will
+ be preceded by `AGGREGATOR >`.
+ - **coordinates** (`List[Tuple[int, int]]`) -- Coordinates of the cells of the answers.
+ - **cells** (`List[str]`) -- List of strings made up of the answer cell values.
+ - **aggregator** (`str`) -- If the model has an aggregator, this returns the aggregator.
+ """
+ pipeline_inputs = self._args_parser(*args, **kwargs)
+
+ results = super().__call__(pipeline_inputs, **kwargs)
+ if len(results) == 1:
+ return results[0]
+ return results
+
+ def _sanitize_parameters(self, sequential=None, padding=None, truncation=None, **kwargs):
+ preprocess_params = {}
+ if padding is not None:
+ preprocess_params["padding"] = padding
+ if truncation is not None:
+ preprocess_params["truncation"] = truncation
+
+ forward_params = {}
+ if sequential is not None:
+ forward_params["sequential"] = sequential
+
+ if self.assistant_model is not None:
+ forward_params["assistant_model"] = self.assistant_model
+ if self.assistant_tokenizer is not None:
+ forward_params["tokenizer"] = self.tokenizer
+ forward_params["assistant_tokenizer"] = self.assistant_tokenizer
+
+ return preprocess_params, forward_params, {}
+
+ def preprocess(self, pipeline_input, sequential=None, padding=True, truncation=None):
+ if truncation is None:
+ if self.type == "tapas":
+ truncation = "drop_rows_to_fit"
+ else:
+ truncation = "do_not_truncate"
+
+ table, query = pipeline_input["table"], pipeline_input["query"]
+ if table.empty:
+ raise ValueError("table is empty")
+ if query is None or query == "":
+ raise ValueError("query is empty")
+ inputs = self.tokenizer(table, query, return_tensors=self.framework, truncation=truncation, padding=padding)
+ inputs["table"] = table
+ return inputs
+
+ def _forward(self, model_inputs, sequential=False, **generate_kwargs):
+ table = model_inputs.pop("table")
+
+ if self.type == "tapas":
+ if sequential:
+ outputs = self.sequential_inference(**model_inputs)
+ else:
+ outputs = self.batch_inference(**model_inputs)
+ else:
+            # A user-defined `generation_config` passed to the pipeline call takes precedence
+ if "generation_config" not in generate_kwargs:
+ generate_kwargs["generation_config"] = self.generation_config
+
+ outputs = self.model.generate(**model_inputs, **generate_kwargs)
+ model_outputs = {"model_inputs": model_inputs, "table": table, "outputs": outputs}
+ return model_outputs
+
+ def postprocess(self, model_outputs):
+ inputs = model_outputs["model_inputs"]
+ table = model_outputs["table"]
+ outputs = model_outputs["outputs"]
+ if self.type == "tapas":
+ if self.aggregate:
+ logits, logits_agg = outputs[:2]
+ predictions = self.tokenizer.convert_logits_to_predictions(inputs, logits, logits_agg)
+ answer_coordinates_batch, agg_predictions = predictions
+ aggregators = {i: self.model.config.aggregation_labels[pred] for i, pred in enumerate(agg_predictions)}
+
+ no_agg_label_index = self.model.config.no_aggregation_label_index
+ aggregators_prefix = {
+ i: aggregators[i] + " > " for i, pred in enumerate(agg_predictions) if pred != no_agg_label_index
+ }
+ else:
+ logits = outputs[0]
+ predictions = self.tokenizer.convert_logits_to_predictions(inputs, logits)
+ answer_coordinates_batch = predictions[0]
+ aggregators = {}
+ aggregators_prefix = {}
+ answers = []
+ for index, coordinates in enumerate(answer_coordinates_batch):
+ cells = [table.iat[coordinate] for coordinate in coordinates]
+ aggregator = aggregators.get(index, "")
+ aggregator_prefix = aggregators_prefix.get(index, "")
+ answer = {
+ "answer": aggregator_prefix + ", ".join(cells),
+ "coordinates": coordinates,
+                    "cells": cells,
+ }
+ if aggregator:
+ answer["aggregator"] = aggregator
+
+ answers.append(answer)
+            if len(answers) == 0:
+ raise PipelineException("Empty answer")
+ else:
+ answers = [{"answer": answer} for answer in self.tokenizer.batch_decode(outputs, skip_special_tokens=True)]
+
+ return answers if len(answers) > 1 else answers[0]
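+
+
+def _table_question_answering_usage_sketch():
+    # Minimal usage sketch (illustration only, not part of the upstream module). It reuses
+    # the checkpoint and table from the class docstring and requires the `pandas` backend.
+    from transformers import pipeline
+
+    tqa = pipeline("table-question-answering", model="google/tapas-base-finetuned-wtq")
+    table = {
+        "Repository": ["Transformers", "Datasets", "Tokenizers"],
+        "Stars": ["36542", "4512", "3934"],
+    }
+    queries = [
+        "How many stars does the transformers repository have?",
+        "Which repository has the most stars?",
+    ]
+    # One result dict per query, each with `answer`, `coordinates`, `cells` and, when the
+    # model predicts one, `aggregator`.
+    return tqa(table=table, query=queries)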
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/text2text_generation.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/text2text_generation.py
new file mode 100644
index 0000000000000000000000000000000000000000..9bc7544550286ecb2ad2108d7dffb142cc123877
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/text2text_generation.py
@@ -0,0 +1,382 @@
+import enum
+import warnings
+
+from ..tokenization_utils import TruncationStrategy
+from ..utils import add_end_docstrings, is_tf_available, is_torch_available, logging
+from .base import Pipeline, build_pipeline_init_args
+
+
+if is_tf_available():
+ import tensorflow as tf
+
+ from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES
+
+if is_torch_available():
+ from ..models.auto.modeling_auto import MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES
+
+logger = logging.get_logger(__name__)
+
+
+class ReturnType(enum.Enum):
+ TENSORS = 0
+ TEXT = 1
+
+
+@add_end_docstrings(build_pipeline_init_args(has_tokenizer=True))
+class Text2TextGenerationPipeline(Pipeline):
+ """
+ Pipeline for text to text generation using seq2seq models.
+
+ Example:
+
+ ```python
+ >>> from transformers import pipeline
+
+ >>> generator = pipeline(model="mrm8488/t5-base-finetuned-question-generation-ap")
+ >>> generator(
+ ... "answer: Manuel context: Manuel has created RuPERTa-base with the support of HF-Transformers and Google"
+ ... )
+ [{'generated_text': 'question: Who created the RuPERTa-base?'}]
+ ```
+
+ Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial). You can pass text
+ generation parameters to this pipeline to control stopping criteria, decoding strategy, and more. Learn more about
+ text generation parameters in [Text generation strategies](../generation_strategies) and [Text
+ generation](text_generation).
+
+ This Text2TextGenerationPipeline pipeline can currently be loaded from [`pipeline`] using the following task
+ identifier: `"text2text-generation"`.
+
+ The models that this pipeline can use are models that have been fine-tuned on a translation task. See the
+ up-to-date list of available models on
+ [huggingface.co/models](https://huggingface.co/models?filter=text2text-generation). For a list of available
+ parameters, see the [following
+ documentation](https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.generation.GenerationMixin.generate)
+
+ Usage:
+
+ ```python
+ text2text_generator = pipeline("text2text-generation")
+ text2text_generator("question: What is 42 ? context: 42 is the answer to life, the universe and everything")
+ ```"""
+
+ # Used in the return key of the pipeline.
+ return_name = "generated"
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+
+ self.check_model_type(
+ TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES
+ if self.framework == "tf"
+ else MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES
+ )
+
+ def _sanitize_parameters(
+ self,
+ return_tensors=None,
+ return_text=None,
+ return_type=None,
+ clean_up_tokenization_spaces=None,
+ truncation=None,
+ stop_sequence=None,
+ **generate_kwargs,
+ ):
+ preprocess_params = {}
+ if truncation is not None:
+ preprocess_params["truncation"] = truncation
+
+ forward_params = generate_kwargs
+
+ postprocess_params = {}
+ if return_tensors is not None and return_type is None:
+ return_type = ReturnType.TENSORS if return_tensors else ReturnType.TEXT
+ if return_type is not None:
+ postprocess_params["return_type"] = return_type
+
+ if clean_up_tokenization_spaces is not None:
+ postprocess_params["clean_up_tokenization_spaces"] = clean_up_tokenization_spaces
+
+ if stop_sequence is not None:
+ stop_sequence_ids = self.tokenizer.encode(stop_sequence, add_special_tokens=False)
+ if len(stop_sequence_ids) > 1:
+ warnings.warn(
+ "Stopping on a multiple-token sequence is not yet supported in transformers. The first token of"
+ " the stop sequence will be used as the stop sequence string in the interim."
+ )
+ generate_kwargs["eos_token_id"] = stop_sequence_ids[0]
+
+ if self.assistant_model is not None:
+ forward_params["assistant_model"] = self.assistant_model
+ if self.assistant_tokenizer is not None:
+ forward_params["tokenizer"] = self.tokenizer
+ forward_params["assistant_tokenizer"] = self.assistant_tokenizer
+
+ return preprocess_params, forward_params, postprocess_params
+
+ def check_inputs(self, input_length: int, min_length: int, max_length: int):
+ """
+ Checks whether there might be something wrong with given input with regard to the model.
+ """
+ return True
+
+ def _parse_and_tokenize(self, *args, truncation):
+ prefix = self.prefix if self.prefix is not None else ""
+ if isinstance(args[0], list):
+ if self.tokenizer.pad_token_id is None:
+ raise ValueError("Please make sure that the tokenizer has a pad_token_id when using a batch input")
+ args = ([prefix + arg for arg in args[0]],)
+ padding = True
+
+ elif isinstance(args[0], str):
+ args = (prefix + args[0],)
+ padding = False
+ else:
+ raise ValueError(
+ f" `args[0]`: {args[0]} has the wrong format. It should be either of type `str` or type `list`"
+ )
+ inputs = self.tokenizer(*args, padding=padding, truncation=truncation, return_tensors=self.framework)
+ # This is produced by tokenizers but is not a valid generate kwarg
+ if "token_type_ids" in inputs:
+ del inputs["token_type_ids"]
+ return inputs
+
+ def __call__(self, *args, **kwargs):
+ r"""
+ Generate the output text(s) using text(s) given as inputs.
+
+ Args:
+ args (`str` or `List[str]`):
+ Input text for the encoder.
+ return_tensors (`bool`, *optional*, defaults to `False`):
+ Whether or not to include the tensors of predictions (as token indices) in the outputs.
+ return_text (`bool`, *optional*, defaults to `True`):
+ Whether or not to include the decoded texts in the outputs.
+ clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
+ Whether or not to clean up the potential extra spaces in the text output.
+ truncation (`TruncationStrategy`, *optional*, defaults to `TruncationStrategy.DO_NOT_TRUNCATE`):
+ The truncation strategy for the tokenization within the pipeline. `TruncationStrategy.DO_NOT_TRUNCATE`
+ (default) will never truncate, but it is sometimes desirable to truncate the input to fit the model's
+ max_length instead of throwing an error down the line.
+ generate_kwargs:
+ Additional keyword arguments to pass along to the generate method of the model (see the generate method
+ corresponding to your framework [here](./text_generation)).
+
+ Return:
+ A list or a list of list of `dict`: Each result comes as a dictionary with the following keys:
+
+ - **generated_text** (`str`, present when `return_text=True`) -- The generated text.
+ - **generated_token_ids** (`torch.Tensor` or `tf.Tensor`, present when `return_tensors=True`) -- The token
+ ids of the generated text.
+ """
+
+ result = super().__call__(*args, **kwargs)
+ if (
+ isinstance(args[0], list)
+ and all(isinstance(el, str) for el in args[0])
+ and all(len(res) == 1 for res in result)
+ ):
+ return [res[0] for res in result]
+ return result
+
+ def preprocess(self, inputs, truncation=TruncationStrategy.DO_NOT_TRUNCATE, **kwargs):
+ inputs = self._parse_and_tokenize(inputs, truncation=truncation, **kwargs)
+ return inputs
+
+ def _forward(self, model_inputs, **generate_kwargs):
+ if self.framework == "pt":
+ in_b, input_length = model_inputs["input_ids"].shape
+ elif self.framework == "tf":
+ in_b, input_length = tf.shape(model_inputs["input_ids"]).numpy()
+
+ self.check_inputs(
+ input_length,
+ generate_kwargs.get("min_length", self.generation_config.min_length),
+ generate_kwargs.get("max_length", self.generation_config.max_length),
+ )
+
+ # A user-defined `generation_config` passed to the pipeline call takes precedence
+ if "generation_config" not in generate_kwargs:
+ generate_kwargs["generation_config"] = self.generation_config
+
+ output_ids = self.model.generate(**model_inputs, **generate_kwargs)
+ out_b = output_ids.shape[0]
+ if self.framework == "pt":
+ output_ids = output_ids.reshape(in_b, out_b // in_b, *output_ids.shape[1:])
+ elif self.framework == "tf":
+ output_ids = tf.reshape(output_ids, (in_b, out_b // in_b, *output_ids.shape[1:]))
+ return {"output_ids": output_ids}
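+
+ # Illustrative note (not part of the upstream file): the reshape above groups the
+ # generated sequences by input so that `postprocess` can iterate over one prompt at
+ # a time. For example, with a batch of 2 prompts and `num_return_sequences=3`,
+ # `generate` returns 6 sequences and `output_ids` ends up with shape (2, 3, seq_len).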
+
+ def postprocess(self, model_outputs, return_type=ReturnType.TEXT, clean_up_tokenization_spaces=False):
+ records = []
+ for output_ids in model_outputs["output_ids"][0]:
+ if return_type == ReturnType.TENSORS:
+ record = {f"{self.return_name}_token_ids": output_ids}
+ elif return_type == ReturnType.TEXT:
+ record = {
+ f"{self.return_name}_text": self.tokenizer.decode(
+ output_ids,
+ skip_special_tokens=True,
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+ )
+ }
+ records.append(record)
+ return records
+
+
+@add_end_docstrings(build_pipeline_init_args(has_tokenizer=True))
+class SummarizationPipeline(Text2TextGenerationPipeline):
+ """
+ Summarize news articles and other documents.
+
+ This summarizing pipeline can currently be loaded from [`pipeline`] using the following task identifier:
+ `"summarization"`.
+
+ The models that this pipeline can use are models that have been fine-tuned on a summarization task, which
+ currently includes '*bart-large-cnn*', '*google-t5/t5-small*', '*google-t5/t5-base*', '*google-t5/t5-large*', '*google-t5/t5-3b*', '*google-t5/t5-11b*'. See the up-to-date
+ list of available models on [huggingface.co/models](https://huggingface.co/models?filter=summarization). For a list
+ of available parameters, see the [following
+ documentation](https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.generation.GenerationMixin.generate)
+
+ Usage:
+
+ ```python
+ # use bart in pytorch
+ summarizer = pipeline("summarization")
+ summarizer("An apple a day, keeps the doctor away", min_length=5, max_length=20)
+
+ # use t5 in tf
+ summarizer = pipeline("summarization", model="google-t5/t5-base", tokenizer="google-t5/t5-base", framework="tf")
+ summarizer("An apple a day, keeps the doctor away", min_length=5, max_length=20)
+ ```"""
+
+ # Used in the return key of the pipeline.
+ return_name = "summary"
+
+ def __call__(self, *args, **kwargs):
+ r"""
+ Summarize the text(s) given as inputs.
+
+ Args:
+ documents (*str* or `List[str]`):
+ One or several articles (or one list of articles) to summarize.
+ return_text (`bool`, *optional*, defaults to `True`):
+ Whether or not to include the decoded texts in the outputs
+ return_tensors (`bool`, *optional*, defaults to `False`):
+ Whether or not to include the tensors of predictions (as token indices) in the outputs.
+ clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
+ Whether or not to clean up the potential extra spaces in the text output.
+ generate_kwargs:
+ Additional keyword arguments to pass along to the generate method of the model (see the generate method
+ corresponding to your framework [here](./text_generation)).
+
+ Return:
+ A list or a list of list of `dict`: Each result comes as a dictionary with the following keys:
+
+ - **summary_text** (`str`, present when `return_text=True`) -- The summary of the corresponding input.
+ - **summary_token_ids** (`torch.Tensor` or `tf.Tensor`, present when `return_tensors=True`) -- The token
+ ids of the summary.
+ """
+ return super().__call__(*args, **kwargs)
+
+ def check_inputs(self, input_length: int, min_length: int, max_length: int) -> bool:
+ """
+ Checks whether there might be something wrong with given input with regard to the model.
+ """
+ if max_length < min_length:
+ logger.warning(f"Your min_length={min_length} must be smaller than your max_length={max_length}.")
+
+ if input_length < max_length:
+ logger.warning(
+ f"Your max_length is set to {max_length}, but your input_length is only {input_length}. Since this is "
+ "a summarization task, where outputs shorter than the input are typically wanted, you might "
+ f"consider decreasing max_length manually, e.g. summarizer('...', max_length={input_length//2})"
+ )
+
+
+@add_end_docstrings(build_pipeline_init_args(has_tokenizer=True))
+class TranslationPipeline(Text2TextGenerationPipeline):
+ """
+ Translates from one language to another.
+
+ This translation pipeline can currently be loaded from [`pipeline`] using the following task identifier:
+ `"translation_xx_to_yy"`.
+
+ The models that this pipeline can use are models that have been fine-tuned on a translation task. See the
+ up-to-date list of available models on [huggingface.co/models](https://huggingface.co/models?filter=translation).
+ For a list of available parameters, see the [following
+ documentation](https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.generation.GenerationMixin.generate)
+
+ Usage:
+
+ ```python
+ en_fr_translator = pipeline("translation_en_to_fr")
+ en_fr_translator("How old are you?")
+ ```"""
+
+ # Used in the return key of the pipeline.
+ return_name = "translation"
+
+ def check_inputs(self, input_length: int, min_length: int, max_length: int):
+ if input_length > 0.9 * max_length:
+ logger.warning(
+ f"Your input_length: {input_length} is bigger than 0.9 * max_length: {max_length}. You might consider "
+ "increasing your max_length manually, e.g. translator('...', max_length=400)"
+ )
+ return True
+
+ def preprocess(self, *args, truncation=TruncationStrategy.DO_NOT_TRUNCATE, src_lang=None, tgt_lang=None):
+ if getattr(self.tokenizer, "_build_translation_inputs", None):
+ return self.tokenizer._build_translation_inputs(
+ *args, return_tensors=self.framework, truncation=truncation, src_lang=src_lang, tgt_lang=tgt_lang
+ )
+ else:
+ return super()._parse_and_tokenize(*args, truncation=truncation)
+
+ def _sanitize_parameters(self, src_lang=None, tgt_lang=None, **kwargs):
+ preprocess_params, forward_params, postprocess_params = super()._sanitize_parameters(**kwargs)
+ if src_lang is not None:
+ preprocess_params["src_lang"] = src_lang
+ if tgt_lang is not None:
+ preprocess_params["tgt_lang"] = tgt_lang
+ if src_lang is None and tgt_lang is None:
+ # Backward compatibility, direct arguments use is preferred.
+ task = kwargs.get("task", self.task)
+ items = task.split("_")
+ if task and len(items) == 4:
+ # translation, XX, to YY
+ preprocess_params["src_lang"] = items[1]
+ preprocess_params["tgt_lang"] = items[3]
+ return preprocess_params, forward_params, postprocess_params
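+
+ # Illustrative note (not part of the upstream file): the backward-compatibility branch
+ # above recovers the language pair from the task name, e.g.
+ # "translation_en_to_fr".split("_") == ["translation", "en", "to", "fr"],
+ # so `src_lang="en"` and `tgt_lang="fr"` are inferred when neither argument is passed.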
+
+ def __call__(self, *args, **kwargs):
+ r"""
+ Translate the text(s) given as inputs.
+
+ Args:
+ args (`str` or `List[str]`):
+ Texts to be translated.
+ return_tensors (`bool`, *optional*, defaults to `False`):
+ Whether or not to include the tensors of predictions (as token indices) in the outputs.
+ return_text (`bool`, *optional*, defaults to `True`):
+ Whether or not to include the decoded texts in the outputs.
+ clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
+ Whether or not to clean up the potential extra spaces in the text output.
+ src_lang (`str`, *optional*):
+ The language of the input. Might be required for multilingual models. Will not have any effect for
+ single pair translation models
+ tgt_lang (`str`, *optional*):
+ The language of the desired output. Might be required for multilingual models. Will not have any effect
+ for single pair translation models
+ generate_kwargs:
+ Additional keyword arguments to pass along to the generate method of the model (see the generate method
+ corresponding to your framework [here](./text_generation)).
+
+ Return:
+ A list or a list of list of `dict`: Each result comes as a dictionary with the following keys:
+
+ - **translation_text** (`str`, present when `return_text=True`) -- The translation.
+ - **translation_token_ids** (`torch.Tensor` or `tf.Tensor`, present when `return_tensors=True`) -- The
+ token ids of the translation.
+ """
+ return super().__call__(*args, **kwargs)
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/text_classification.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/text_classification.py
new file mode 100644
index 0000000000000000000000000000000000000000..dadb29c386b41e4ca3bd1a49ee103308c3f02174
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/text_classification.py
@@ -0,0 +1,236 @@
+import inspect
+import warnings
+from typing import Dict
+
+import numpy as np
+
+from ..utils import ExplicitEnum, add_end_docstrings, is_tf_available, is_torch_available
+from .base import GenericTensor, Pipeline, build_pipeline_init_args
+
+
+if is_tf_available():
+ from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES
+
+if is_torch_available():
+ from ..models.auto.modeling_auto import MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES
+
+
+def sigmoid(_outputs):
+ return 1.0 / (1.0 + np.exp(-_outputs))
+
+
+def softmax(_outputs):
+ maxes = np.max(_outputs, axis=-1, keepdims=True)
+ shifted_exp = np.exp(_outputs - maxes)
+ return shifted_exp / shifted_exp.sum(axis=-1, keepdims=True)
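+
+ # Illustrative note (not part of the upstream file): subtracting the row-wise maximum
+ # before exponentiating keeps the softmax numerically stable, e.g.
+ # softmax(np.array([[1000.0, 1001.0]])) ~= array([[0.269, 0.731]])
+ # instead of overflowing np.exp and returning nan.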
+
+
+class ClassificationFunction(ExplicitEnum):
+ SIGMOID = "sigmoid"
+ SOFTMAX = "softmax"
+ NONE = "none"
+
+
+@add_end_docstrings(
+ build_pipeline_init_args(has_tokenizer=True),
+ r"""
+ return_all_scores (`bool`, *optional*, defaults to `False`):
+ Whether to return all prediction scores or just the one of the predicted class.
+ function_to_apply (`str`, *optional*, defaults to `"default"`):
+ The function to apply to the model outputs in order to retrieve the scores. Accepts four different values:
+
+ - `"default"`: if the model has a single label, will apply the sigmoid function on the output. If the model
+ has several labels, will apply the softmax function on the output. In case of regression tasks, will not
+ apply any function on the output.
+ - `"sigmoid"`: Applies the sigmoid function on the output.
+ - `"softmax"`: Applies the softmax function on the output.
+ - `"none"`: Does not apply any function on the output.""",
+)
+class TextClassificationPipeline(Pipeline):
+ """
+ Text classification pipeline using any `ModelForSequenceClassification`. See the [sequence classification
+ examples](../task_summary#sequence-classification) for more information.
+
+ Example:
+
+ ```python
+ >>> from transformers import pipeline
+
+ >>> classifier = pipeline(model="distilbert/distilbert-base-uncased-finetuned-sst-2-english")
+ >>> classifier("This movie is disgustingly good !")
+ [{'label': 'POSITIVE', 'score': 1.0}]
+
+ >>> classifier("Director tried too much.")
+ [{'label': 'NEGATIVE', 'score': 0.996}]
+ ```
+
+ Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)
+
+ This text classification pipeline can currently be loaded from [`pipeline`] using the following task identifier:
+ `"sentiment-analysis"` (for classifying sequences according to positive or negative sentiments).
+
+ If multiple classification labels are available (`model.config.num_labels >= 2`), the pipeline will run a softmax
+ over the results. If there is a single label, the pipeline will run a sigmoid over the result. In the case of regression
+ tasks (`model.config.problem_type == "regression"`), no function will be applied to the output.
+
+ The models that this pipeline can use are models that have been fine-tuned on a sequence classification task. See
+ the up-to-date list of available models on
+ [huggingface.co/models](https://huggingface.co/models?filter=text-classification).
+ """
+
+ return_all_scores = False
+ function_to_apply = ClassificationFunction.NONE
+
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+
+ self.check_model_type(
+ TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES
+ if self.framework == "tf"
+ else MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES
+ )
+
+ def _sanitize_parameters(self, return_all_scores=None, function_to_apply=None, top_k="", **tokenizer_kwargs):
+ # Using "" as default argument because we're going to use `top_k=None` in user code to declare
+ # "No top_k"
+ preprocess_params = tokenizer_kwargs
+
+ postprocess_params = {}
+ if hasattr(self.model.config, "return_all_scores") and return_all_scores is None:
+ return_all_scores = self.model.config.return_all_scores
+
+ if isinstance(top_k, int) or top_k is None:
+ postprocess_params["top_k"] = top_k
+ postprocess_params["_legacy"] = False
+ elif return_all_scores is not None:
+ warnings.warn(
+ "`return_all_scores` is now deprecated; if you want similar functionality, use `top_k=None` instead of"
+ " `return_all_scores=True`, or `top_k=1` instead of `return_all_scores=False`.",
+ UserWarning,
+ )
+ if return_all_scores:
+ postprocess_params["top_k"] = None
+ else:
+ postprocess_params["top_k"] = 1
+
+ if isinstance(function_to_apply, str):
+ function_to_apply = ClassificationFunction[function_to_apply.upper()]
+
+ if function_to_apply is not None:
+ postprocess_params["function_to_apply"] = function_to_apply
+ return preprocess_params, {}, postprocess_params
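+
+ # Illustrative note (not part of the upstream file): the "" sentinel above lets the
+ # pipeline distinguish "top_k was never passed" (legacy output) from an explicit
+ # `top_k=None` (all labels) or `top_k=<int>`. For a hypothetical two-label sentiment
+ # model, the difference looks like:
+ # pipe("great movie")             -> [{"label": "POSITIVE", "score": 0.99}]
+ # pipe("great movie", top_k=None) -> [{"label": "POSITIVE", "score": 0.99}, {"label": "NEGATIVE", "score": 0.01}]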
+
+ def __call__(self, inputs, **kwargs):
+ """
+ Classify the text(s) given as inputs.
+
+ Args:
+ inputs (`str` or `List[str]` or `Dict[str]`, or `List[Dict[str]]`):
+ One or several texts to classify. In order to use text pairs for your classification, you can send a
+ dictionary containing `{"text", "text_pair"}` keys, or a list of those.
+ top_k (`int`, *optional*, defaults to `1`):
+ How many results to return.
+ function_to_apply (`str`, *optional*, defaults to `"default"`):
+ The function to apply to the model outputs in order to retrieve the scores. Accepts four different
+ values:
+
+ If this argument is not specified, then it will apply the following functions according to the number
+ of labels:
+
+ - If problem type is regression, will not apply any function on the output.
+ - If the model has a single label, will apply the sigmoid function on the output.
+ - If the model has several labels, will apply the softmax function on the output.
+
+ Possible values are:
+
+ - `"sigmoid"`: Applies the sigmoid function on the output.
+ - `"softmax"`: Applies the softmax function on the output.
+ - `"none"`: Does not apply any function on the output.
+
+ Return:
+ A list or a list of list of `dict`: Each result comes as list of dictionaries with the following keys:
+
+ - **label** (`str`) -- The label predicted.
+ - **score** (`float`) -- The corresponding probability.
+
+ If `top_k` is used, one such dictionary is returned per label.
+ """
+ inputs = (inputs,)
+ result = super().__call__(*inputs, **kwargs)
+ # TODO try and retrieve it in a nicer way from _sanitize_parameters.
+ _legacy = "top_k" not in kwargs
+ if isinstance(inputs[0], str) and _legacy:
+ # This pipeline is odd, and returns a list when a single item is run
+ return [result]
+ else:
+ return result
+
+ def preprocess(self, inputs, **tokenizer_kwargs) -> Dict[str, GenericTensor]:
+ return_tensors = self.framework
+ if isinstance(inputs, dict):
+ return self.tokenizer(**inputs, return_tensors=return_tensors, **tokenizer_kwargs)
+ elif isinstance(inputs, list) and len(inputs) == 1 and isinstance(inputs[0], list) and len(inputs[0]) == 2:
+ # It used to be valid to use a list of list of list for text pairs, keeping this path for BC
+ return self.tokenizer(
+ text=inputs[0][0], text_pair=inputs[0][1], return_tensors=return_tensors, **tokenizer_kwargs
+ )
+ elif isinstance(inputs, list):
+ # This is likely an invalid usage of the pipeline attempting to pass text pairs.
+ raise ValueError(
+ "The pipeline received invalid inputs, if you are trying to send text pairs, you can try to send a"
+ ' dictionary `{"text": "My text", "text_pair": "My pair"}` in order to send a text pair.'
+ )
+ return self.tokenizer(inputs, return_tensors=return_tensors, **tokenizer_kwargs)
+
+ def _forward(self, model_inputs):
+ # `XXXForSequenceClassification` models should not use `use_cache=True` even if it's supported
+ model_forward = self.model.forward if self.framework == "pt" else self.model.call
+ if "use_cache" in inspect.signature(model_forward).parameters.keys():
+ model_inputs["use_cache"] = False
+ return self.model(**model_inputs)
+
+ def postprocess(self, model_outputs, function_to_apply=None, top_k=1, _legacy=True):
+ # `_legacy` distinguishes the bare, backward-compatible pipeline (no `top_k` passed) from a
+ # call with an explicit `top_k`, in which case we return the more natural result: a list of
+ # score dicts.
+ # Default value before `set_parameters`
+ if function_to_apply is None:
+ if self.model.config.problem_type == "regression":
+ function_to_apply = ClassificationFunction.NONE
+ elif self.model.config.problem_type == "multi_label_classification" or self.model.config.num_labels == 1:
+ function_to_apply = ClassificationFunction.SIGMOID
+ elif self.model.config.problem_type == "single_label_classification" or self.model.config.num_labels > 1:
+ function_to_apply = ClassificationFunction.SOFTMAX
+ elif hasattr(self.model.config, "function_to_apply") and function_to_apply is None:
+ function_to_apply = self.model.config.function_to_apply
+ else:
+ function_to_apply = ClassificationFunction.NONE
+
+ outputs = model_outputs["logits"][0]
+
+ if self.framework == "pt":
+ # To enable using fp16 and bf16
+ outputs = outputs.float().numpy()
+ else:
+ outputs = outputs.numpy()
+
+ if function_to_apply == ClassificationFunction.SIGMOID:
+ scores = sigmoid(outputs)
+ elif function_to_apply == ClassificationFunction.SOFTMAX:
+ scores = softmax(outputs)
+ elif function_to_apply == ClassificationFunction.NONE:
+ scores = outputs
+ else:
+ raise ValueError(f"Unrecognized `function_to_apply` argument: {function_to_apply}")
+
+ if top_k == 1 and _legacy:
+ return {"label": self.model.config.id2label[scores.argmax().item()], "score": scores.max().item()}
+
+ dict_scores = [
+ {"label": self.model.config.id2label[i], "score": score.item()} for i, score in enumerate(scores)
+ ]
+ if not _legacy:
+ dict_scores.sort(key=lambda x: x["score"], reverse=True)
+ if top_k is not None:
+ dict_scores = dict_scores[:top_k]
+ return dict_scores
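+
+ # Illustrative note (not part of the upstream file): when `function_to_apply` is not
+ # given, a two-label classifier falls into the softmax branch above. With made-up
+ # logits np.array([-1.2, 3.4]), softmax gives roughly [0.01, 0.99], and the legacy
+ # `top_k=1` path returns {"label": id2label[1], "score": 0.99}.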
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/text_generation.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/text_generation.py
new file mode 100644
index 0000000000000000000000000000000000000000..c0f14663ffdf5876d1aa4612cf54432974049606
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/text_generation.py
@@ -0,0 +1,449 @@
+import enum
+import itertools
+import types
+from typing import Dict
+
+from ..utils import add_end_docstrings, is_tf_available, is_torch_available
+from .base import Pipeline, build_pipeline_init_args
+
+
+if is_torch_available():
+ from ..models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
+ from .pt_utils import KeyDataset
+
+if is_tf_available():
+ import tensorflow as tf
+
+ from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
+
+
+class ReturnType(enum.Enum):
+ TENSORS = 0
+ NEW_TEXT = 1
+ FULL_TEXT = 2
+
+
+class Chat:
+ """This class is intended to just be used internally in this pipeline and not exposed to users. We convert chats
+ to this format because the rest of the pipeline code tends to assume that lists of messages are
+ actually a batch of samples rather than messages in the same conversation."""
+
+ def __init__(self, messages: Dict):
+ for message in messages:
+ if not ("role" in message and "content" in message):
+ raise ValueError("When passing chat dicts as input, each dict must have a 'role' and 'content' key.")
+ self.messages = messages
+
+
+@add_end_docstrings(build_pipeline_init_args(has_tokenizer=True))
+class TextGenerationPipeline(Pipeline):
+ """
+ Language generation pipeline using any `ModelWithLMHead`. This pipeline predicts the words that will follow a
+ specified text prompt. When the underlying model is a conversational model, it can also accept one or more chats,
+ in which case the pipeline will operate in chat mode and will continue the chat(s) by adding its response(s).
+ Each chat takes the form of a list of dicts, where each dict contains "role" and "content" keys.
+
+ Examples:
+
+ ```python
+ >>> from transformers import pipeline
+
+ >>> generator = pipeline(model="openai-community/gpt2")
+ >>> generator("I can't believe you did such a ", do_sample=False)
+ [{'generated_text': "I can't believe you did such a icky thing to me. I'm so sorry. I'm so sorry. I'm so sorry. I'm so sorry. I'm so sorry. I'm so sorry. I'm so sorry. I"}]
+
+ >>> # These parameters return several suggestions, and only the newly created text, making the output easier to use for prompting.
+ >>> outputs = generator("My tart needs some", num_return_sequences=4, return_full_text=False)
+ ```
+
+ ```python
+ >>> from transformers import pipeline
+
+ >>> generator = pipeline(model="HuggingFaceH4/zephyr-7b-beta")
+ >>> # Zephyr-beta is a conversational model, so let's pass it a chat instead of a single string
+ >>> generator([{"role": "user", "content": "What is the capital of France? Answer in one word."}], do_sample=False, max_new_tokens=2)
+ [{'generated_text': [{'role': 'user', 'content': 'What is the capital of France? Answer in one word.'}, {'role': 'assistant', 'content': 'Paris'}]}]
+ ```
+
+ Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial). You can pass text
+ generation parameters to this pipeline to control stopping criteria, decoding strategy, and more. Learn more about
+ text generation parameters in [Text generation strategies](../generation_strategies) and [Text
+ generation](text_generation).
+
+ This language generation pipeline can currently be loaded from [`pipeline`] using the following task identifier:
+ `"text-generation"`.
+
+ The models that this pipeline can use are models that have been trained with an autoregressive language modeling
+ objective. See the list of available [text completion models](https://huggingface.co/models?filter=text-generation)
+ and the list of [conversational models](https://huggingface.co/models?other=conversational)
+ on [huggingface.co/models](https://huggingface.co/models).
+ """
+
+ # Prefix text to help Transformer-XL and XLNet with short prompts as proposed by Aman Rusia
+ # in https://github.com/rusiaaman/XLNet-gen#methodology
+ # and https://medium.com/@amanrusia/xlnet-speaks-comparison-to-gpt-2-ea1a4e9ba39e
+
+ XL_PREFIX = """
+ In 1991, the remains of Russian Tsar Nicholas II and his family (except for Alexei and Maria) are discovered. The
+ voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the remainder of the story. 1883 Western
+ Siberia, a young Grigori Rasputin is asked by his father and a group of men to perform magic. Rasputin has a vision
+ and denounces one of the men as a horse thief. Although his father initially slaps him for making such an
+ accusation, Rasputin watches as the man is chased outside and beaten. Twenty years later, Rasputin sees a vision of
+ the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous, with people, even a bishop,
+ begging for his blessing.
+ """
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.check_model_type(
+ TF_MODEL_FOR_CAUSAL_LM_MAPPING_NAMES if self.framework == "tf" else MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
+ )
+ if "prefix" not in self._preprocess_params:
+ # This is very specific. The logic is quite complex and needs to be done
+ # as a "default".
+ # It also defines both some preprocess_kwargs and generate_kwargs
+ # which is why we cannot put them in their respective methods.
+ prefix = None
+ if self.prefix is not None:
+ prefix = self.prefix
+ if prefix is None and self.model.__class__.__name__ in [
+ "XLNetLMHeadModel",
+ "TransfoXLLMHeadModel",
+ "TFXLNetLMHeadModel",
+ "TFTransfoXLLMHeadModel",
+ ]:
+ # For XLNet and TransformerXL we add an article to the prompt to give more state to the model.
+ prefix = self.XL_PREFIX
+ if prefix is not None:
+ # Recalculate some generate_kwargs linked to prefix.
+ preprocess_params, forward_params, _ = self._sanitize_parameters(prefix=prefix, **self._forward_params)
+ self._preprocess_params = {**self._preprocess_params, **preprocess_params}
+ self._forward_params = {**self._forward_params, **forward_params}
+
+ def _sanitize_parameters(
+ self,
+ return_full_text=None,
+ return_tensors=None,
+ return_text=None,
+ return_type=None,
+ clean_up_tokenization_spaces=None,
+ prefix=None,
+ handle_long_generation=None,
+ stop_sequence=None,
+ truncation=None,
+ max_length=None,
+ continue_final_message=None,
+ **generate_kwargs,
+ ):
+ preprocess_params = {}
+
+ add_special_tokens = False
+ if "add_special_tokens" in generate_kwargs:
+ add_special_tokens = preprocess_params["add_special_tokens"] = generate_kwargs.pop("add_special_tokens")
+
+ if "padding" in generate_kwargs:
+ preprocess_params["padding"] = generate_kwargs.pop("padding")
+
+ if truncation is not None:
+ preprocess_params["truncation"] = truncation
+
+ if max_length is not None:
+ preprocess_params["max_length"] = max_length
+ generate_kwargs["max_length"] = max_length
+
+ if prefix is not None:
+ preprocess_params["prefix"] = prefix
+ if prefix:
+ prefix_inputs = self.tokenizer(
+ prefix, padding=False, add_special_tokens=add_special_tokens, return_tensors=self.framework
+ )
+ generate_kwargs["prefix_length"] = prefix_inputs["input_ids"].shape[-1]
+
+ if handle_long_generation is not None:
+ if handle_long_generation not in {"hole"}:
+ raise ValueError(
+ f"{handle_long_generation} is not a valid value for `handle_long_generation` parameter expected"
+ " [None, 'hole']"
+ )
+ preprocess_params["handle_long_generation"] = handle_long_generation
+
+ if continue_final_message is not None:
+ preprocess_params["continue_final_message"] = continue_final_message
+
+ preprocess_params.update(generate_kwargs)
+ forward_params = generate_kwargs
+
+ postprocess_params = {}
+ if return_full_text is not None and return_type is None:
+ if return_text is not None:
+ raise ValueError("`return_text` is mutually exclusive with `return_full_text`")
+ if return_tensors is not None:
+ raise ValueError("`return_full_text` is mutually exclusive with `return_tensors`")
+ return_type = ReturnType.FULL_TEXT if return_full_text else ReturnType.NEW_TEXT
+ if return_tensors is not None and return_type is None:
+ if return_text is not None:
+ raise ValueError("`return_text` is mutually exclusive with `return_tensors`")
+ return_type = ReturnType.TENSORS
+ if return_type is not None:
+ postprocess_params["return_type"] = return_type
+ if clean_up_tokenization_spaces is not None:
+ postprocess_params["clean_up_tokenization_spaces"] = clean_up_tokenization_spaces
+ if continue_final_message is not None:
+ postprocess_params["continue_final_message"] = continue_final_message
+
+ if stop_sequence is not None:
+ stop_sequence_ids = self.tokenizer.encode(stop_sequence, add_special_tokens=False)
+ generate_kwargs["eos_token_id"] = stop_sequence_ids
+
+ if self.assistant_model is not None:
+ forward_params["assistant_model"] = self.assistant_model
+ if self.assistant_tokenizer is not None:
+ forward_params["tokenizer"] = self.tokenizer
+ forward_params["assistant_tokenizer"] = self.assistant_tokenizer
+
+ return preprocess_params, forward_params, postprocess_params
+
+ # overriding _parse_and_tokenize to allow for unusual language-modeling tokenizer arguments
+ def _parse_and_tokenize(self, *args, **kwargs):
+ """
+ Parse arguments and tokenize
+ """
+ # Parse arguments
+ if self.model.__class__.__name__ in ["TransfoXLLMHeadModel"]:
+ kwargs.update({"add_space_before_punct_symbol": True})
+
+ return super()._parse_and_tokenize(*args, **kwargs)
+
+ def __call__(self, text_inputs, **kwargs):
+ """
+ Complete the prompt(s) given as inputs.
+
+ Args:
+ text_inputs (`str`, `List[str]`, List[Dict[str, str]], or `List[List[Dict[str, str]]]`):
+ One or several prompts (or one list of prompts) to complete. If strings or a list of string are
+ passed, this pipeline will continue each prompt. Alternatively, a "chat", in the form of a list
+ of dicts with "role" and "content" keys, can be passed, or a list of such chats. When chats are passed,
+ the model's chat template will be used to format them before passing them to the model.
+ return_tensors (`bool`, *optional*, defaults to `False`):
+ Returns the tensors of predictions (as token indices) in the outputs. If set to
+ `True`, the decoded text is not returned.
+ return_text (`bool`, *optional*):
+ Returns the decoded texts in the outputs.
+ return_full_text (`bool`, *optional*, defaults to `True`):
+ If set to `False` only added text is returned, otherwise the full text is returned. Cannot be
+ specified at the same time as `return_text`.
+ clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
+ Whether or not to clean up the potential extra spaces in the text output.
+ continue_final_message (`bool`, *optional*): This indicates that you want the model to continue the
+ last message in the input chat rather than starting a new one, allowing you to "prefill" its response.
+ By default this is `True` when the final message in the input chat has the `assistant` role and
+ `False` otherwise, but you can manually override that behaviour by setting this flag.
+ prefix (`str`, *optional*):
+ Prefix added to prompt.
+ handle_long_generation (`str`, *optional*):
+ By default, this pipeline does not handle long generation (inputs that, in one form or another, exceed
+ the model's maximum length). There is no perfect way to address this (more info:
+ https://github.com/huggingface/transformers/issues/14033#issuecomment-948385227). This provides common
+ strategies to work around the problem depending on your use case.
+
+ - `None` : default strategy where nothing in particular happens
+ - `"hole"`: Truncates the left of the input and leaves a gap wide enough to let generation happen (this
+ might truncate a lot of the prompt, and is not suitable when generation exceeds the model capacity)
+ generate_kwargs (`dict`, *optional*):
+ Additional keyword arguments to pass along to the generate method of the model (see the generate method
+ corresponding to your framework [here](./text_generation)).
+
+ Return:
+ A list or a list of lists of `dict`: Returns one of the following dictionaries (cannot return a combination
+ of both `generated_text` and `generated_token_ids`):
+
+ - **generated_text** (`str`, present when `return_text=True`) -- The generated text.
+ - **generated_token_ids** (`torch.Tensor` or `tf.Tensor`, present when `return_tensors=True`) -- The token
+ ids of the generated text.
+ """
+ if isinstance(
+ text_inputs,
+ (list, tuple, types.GeneratorType, KeyDataset)
+ if is_torch_available()
+ else (list, tuple, types.GeneratorType),
+ ):
+ if isinstance(text_inputs, types.GeneratorType):
+ text_inputs, _ = itertools.tee(text_inputs)
+ text_inputs, first_item = (x for x in text_inputs), next(_)
+ else:
+ first_item = text_inputs[0]
+ if isinstance(first_item, (list, tuple, dict)):
+ # We have one or more prompts in list-of-dicts format, so this is chat mode
+ if isinstance(first_item, dict):
+ return super().__call__(Chat(text_inputs), **kwargs)
+ else:
+ chats = (Chat(chat) for chat in text_inputs) # 🐈 🐈 🐈
+ if isinstance(text_inputs, types.GeneratorType):
+ return super().__call__(chats, **kwargs)
+ else:
+ return super().__call__(list(chats), **kwargs)
+ return super().__call__(text_inputs, **kwargs)
+
+ def preprocess(
+ self,
+ prompt_text,
+ prefix="",
+ handle_long_generation=None,
+ add_special_tokens=None,
+ truncation=None,
+ padding=None,
+ max_length=None,
+ continue_final_message=None,
+ **generate_kwargs,
+ ):
+ # Only set non-None tokenizer kwargs, so as to rely on the tokenizer's defaults
+ tokenizer_kwargs = {
+ "add_special_tokens": add_special_tokens,
+ "truncation": truncation,
+ "padding": padding,
+ "max_length": max_length,
+ }
+ tokenizer_kwargs = {key: value for key, value in tokenizer_kwargs.items() if value is not None}
+
+ if isinstance(prompt_text, Chat):
+ tokenizer_kwargs.pop("add_special_tokens", None) # ignore add_special_tokens on chats
+ # If the user passes a chat that ends in an assistant message, we treat it as a prefill by default
+ # because very few models support multiple separate, consecutive assistant messages
+ if continue_final_message is None:
+ continue_final_message = prompt_text.messages[-1]["role"] == "assistant"
+ inputs = self.tokenizer.apply_chat_template(
+ prompt_text.messages,
+ add_generation_prompt=not continue_final_message,
+ continue_final_message=continue_final_message,
+ return_dict=True,
+ return_tensors=self.framework,
+ **tokenizer_kwargs,
+ )
+ else:
+ inputs = self.tokenizer(prefix + prompt_text, return_tensors=self.framework, **tokenizer_kwargs)
+
+ inputs["prompt_text"] = prompt_text
+
+ if handle_long_generation == "hole":
+ cur_len = inputs["input_ids"].shape[-1]
+ if "max_new_tokens" in generate_kwargs:
+ new_tokens = generate_kwargs["max_new_tokens"]
+ else:
+ new_tokens = generate_kwargs.get("max_length", self.generation_config.max_length) - cur_len
+ if new_tokens < 0:
+ raise ValueError("We cannot infer how many new tokens are expected")
+ if cur_len + new_tokens > self.tokenizer.model_max_length:
+ keep_length = self.tokenizer.model_max_length - new_tokens
+ if keep_length <= 0:
+ raise ValueError(
+ "We cannot use `hole` to handle this generation: the number of desired tokens exceeds the"
+ " model's max length"
+ )
+
+ inputs["input_ids"] = inputs["input_ids"][:, -keep_length:]
+ if "attention_mask" in inputs:
+ inputs["attention_mask"] = inputs["attention_mask"][:, -keep_length:]
+
+ return inputs
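+
+ # Illustrative note (not part of the upstream file): a worked example of the "hole"
+ # strategy above, assuming `tokenizer.model_max_length == 1024`, a 900-token prompt
+ # and `max_new_tokens=200`: cur_len + new_tokens = 1100 > 1024, so
+ # keep_length = 1024 - 200 = 824 and only the last 824 prompt tokens are kept,
+ # leaving room for the 200 new tokens.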
+
+ def _forward(self, model_inputs, **generate_kwargs):
+ input_ids = model_inputs["input_ids"]
+ attention_mask = model_inputs.get("attention_mask", None)
+ # Allow empty prompts
+ if input_ids.shape[1] == 0:
+ input_ids = None
+ attention_mask = None
+ in_b = 1
+ else:
+ in_b = input_ids.shape[0]
+ prompt_text = model_inputs.pop("prompt_text")
+
+ # If there is a prefix, we may need to adjust the generation length. Do so without permanently modifying
+ # generate_kwargs, as some of the parameterization may come from the initialization of the pipeline.
+ prefix_length = generate_kwargs.pop("prefix_length", 0)
+ if prefix_length > 0:
+ has_max_new_tokens = "max_new_tokens" in generate_kwargs or (
+ "generation_config" in generate_kwargs
+ and generate_kwargs["generation_config"].max_new_tokens is not None
+ )
+ if not has_max_new_tokens:
+ generate_kwargs["max_length"] = generate_kwargs.get("max_length") or self.generation_config.max_length
+ generate_kwargs["max_length"] += prefix_length
+ has_min_new_tokens = "min_new_tokens" in generate_kwargs or (
+ "generation_config" in generate_kwargs
+ and generate_kwargs["generation_config"].min_new_tokens is not None
+ )
+ if not has_min_new_tokens and "min_length" in generate_kwargs:
+ generate_kwargs["min_length"] += prefix_length
+
+ # A user-defined `generation_config` passed to the pipeline call takes precedence
+ if "generation_config" not in generate_kwargs:
+ generate_kwargs["generation_config"] = self.generation_config
+
+ generated_sequence = self.model.generate(input_ids=input_ids, attention_mask=attention_mask, **generate_kwargs)
+ out_b = generated_sequence.shape[0]
+ if self.framework == "pt":
+ generated_sequence = generated_sequence.reshape(in_b, out_b // in_b, *generated_sequence.shape[1:])
+ elif self.framework == "tf":
+ generated_sequence = tf.reshape(generated_sequence, (in_b, out_b // in_b, *generated_sequence.shape[1:]))
+ return {"generated_sequence": generated_sequence, "input_ids": input_ids, "prompt_text": prompt_text}
+
+ def postprocess(
+ self,
+ model_outputs,
+ return_type=ReturnType.FULL_TEXT,
+ clean_up_tokenization_spaces=True,
+ continue_final_message=None,
+ ):
+ generated_sequence = model_outputs["generated_sequence"][0]
+ input_ids = model_outputs["input_ids"]
+ prompt_text = model_outputs["prompt_text"]
+ generated_sequence = generated_sequence.numpy().tolist()
+ records = []
+ for sequence in generated_sequence:
+ if return_type == ReturnType.TENSORS:
+ record = {"generated_token_ids": sequence}
+ elif return_type in {ReturnType.NEW_TEXT, ReturnType.FULL_TEXT}:
+ # Decode text
+ text = self.tokenizer.decode(
+ sequence,
+ skip_special_tokens=True,
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+ )
+
+ # Remove PADDING prompt of the sequence if XLNet or Transfo-XL model is used
+ if input_ids is None:
+ prompt_length = 0
+ else:
+ prompt_length = len(
+ self.tokenizer.decode(
+ input_ids[0],
+ skip_special_tokens=True,
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+ )
+ )
+
+ all_text = text[prompt_length:]
+ if return_type == ReturnType.FULL_TEXT:
+ if isinstance(prompt_text, str):
+ all_text = prompt_text + all_text
+ elif isinstance(prompt_text, Chat):
+ if continue_final_message is None:
+ # If the user passes a chat ending in an assistant message, we treat it as a prefill by
+ # default because very few models support multiple separate, consecutive assistant messages
+ continue_final_message = prompt_text.messages[-1]["role"] == "assistant"
+ if continue_final_message:
+ # With assistant prefill, concat onto the end of the last message
+ all_text = list(prompt_text.messages)[:-1] + [
+ {
+ "role": prompt_text.messages[-1]["role"],
+ "content": prompt_text.messages[-1]["content"] + all_text,
+ }
+ ]
+ else:
+ # When we're not starting from a prefill, the output is a new assistant message
+ all_text = list(prompt_text.messages) + [{"role": "assistant", "content": all_text}]
+ record = {"generated_text": all_text}
+ records.append(record)
+
+ return records
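+
+ # Illustrative note (not part of the upstream file): when the input chat ends with an
+ # assistant message, e.g.
+ # [{"role": "user", "content": "Knock knock."}, {"role": "assistant", "content": "Who's"}],
+ # `continue_final_message` defaults to True and the generated text is appended to that
+ # last message instead of a new {"role": "assistant", ...} entry being added.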
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/text_to_audio.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/text_to_audio.py
new file mode 100644
index 0000000000000000000000000000000000000000..b7beca586d21957b2eb3ec2dbb7daa2c49453970
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/text_to_audio.py
@@ -0,0 +1,219 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import List, Union
+
+from ..utils import is_torch_available
+from .base import Pipeline
+
+
+if is_torch_available():
+ from ..models.auto.modeling_auto import MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING
+ from ..models.speecht5.modeling_speecht5 import SpeechT5HifiGan
+
+DEFAULT_VOCODER_ID = "microsoft/speecht5_hifigan"
+
+
+class TextToAudioPipeline(Pipeline):
+ """
+ Text-to-audio generation pipeline using any `AutoModelForTextToWaveform` or `AutoModelForTextToSpectrogram`. This
+ pipeline generates an audio file from an input text and optional other conditional inputs.
+
+ Example:
+
+ ```python
+ >>> from transformers import pipeline
+
+ >>> pipe = pipeline(model="suno/bark-small")
+ >>> output = pipe("Hey it's HuggingFace on the phone!")
+
+ >>> audio = output["audio"]
+ >>> sampling_rate = output["sampling_rate"]
+ ```
+
+ Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)
+
+
+
+ You can specify parameters passed to the model by using [`TextToAudioPipeline.__call__.forward_params`] or
+ [`TextToAudioPipeline.__call__.generate_kwargs`].
+
+ Example:
+
+ ```python
+ >>> from transformers import pipeline
+
+ >>> music_generator = pipeline(task="text-to-audio", model="facebook/musicgen-small", framework="pt")
+
+ >>> # diversify the music generation by adding randomness with a high temperature and set a maximum music length
+ >>> generate_kwargs = {
+ ... "do_sample": True,
+ ... "temperature": 0.7,
+ ... "max_new_tokens": 35,
+ ... }
+
+ >>> outputs = music_generator("Techno music with high melodic riffs", generate_kwargs=generate_kwargs)
+ ```
+
+
+
+ This pipeline can currently be loaded from [`pipeline`] using the following task identifiers: `"text-to-speech"` or
+ `"text-to-audio"`.
+
+ See the list of available models on [huggingface.co/models](https://huggingface.co/models?filter=text-to-speech).
+ """
+
+ def __init__(self, *args, vocoder=None, sampling_rate=None, **kwargs):
+ super().__init__(*args, **kwargs)
+
+ if self.framework == "tf":
+ raise ValueError("The TextToAudioPipeline is only available in PyTorch.")
+
+ self.vocoder = None
+ if self.model.__class__ in MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING.values():
+ self.vocoder = (
+ SpeechT5HifiGan.from_pretrained(DEFAULT_VOCODER_ID).to(self.model.device)
+ if vocoder is None
+ else vocoder
+ )
+
+ self.sampling_rate = sampling_rate
+ if self.vocoder is not None:
+ self.sampling_rate = self.vocoder.config.sampling_rate
+
+ if self.sampling_rate is None:
+ # get sampling_rate from config and generation config
+
+ config = self.model.config
+ gen_config = self.model.__dict__.get("generation_config", None)
+ if gen_config is not None:
+ config.update(gen_config.to_dict())
+
+ for sampling_rate_name in ["sample_rate", "sampling_rate"]:
+ sampling_rate = getattr(config, sampling_rate_name, None)
+ if sampling_rate is not None:
+ self.sampling_rate = sampling_rate
+
+ def preprocess(self, text, **kwargs):
+ if isinstance(text, str):
+ text = [text]
+
+ if self.model.config.model_type == "bark":
+ # bark Tokenizer is called with BarkProcessor which uses those kwargs
+ new_kwargs = {
+ "max_length": self.generation_config.semantic_config.get("max_input_semantic_length", 256),
+ "add_special_tokens": False,
+ "return_attention_mask": True,
+ "return_token_type_ids": False,
+ "padding": "max_length",
+ }
+
+ # priority is given to kwargs
+ new_kwargs.update(kwargs)
+
+ kwargs = new_kwargs
+
+ output = self.tokenizer(text, **kwargs, return_tensors="pt")
+
+ return output
+
+ def _forward(self, model_inputs, **kwargs):
+ # we expect some kwargs to be additional tensors which need to be on the right device
+ kwargs = self._ensure_tensor_on_device(kwargs, device=self.device)
+ forward_params = kwargs["forward_params"]
+ generate_kwargs = kwargs["generate_kwargs"]
+
+ if self.model.can_generate():
+ # we expect some kwargs to be additional tensors which need to be on the right device
+ generate_kwargs = self._ensure_tensor_on_device(generate_kwargs, device=self.device)
+
+ # A user-defined `generation_config` passed to the pipeline call takes precedence
+ if "generation_config" not in generate_kwargs:
+ generate_kwargs["generation_config"] = self.generation_config
+
+ # generate_kwargs get priority over forward_params
+ forward_params.update(generate_kwargs)
+
+ output = self.model.generate(**model_inputs, **forward_params)
+ else:
+ if len(generate_kwargs):
+ raise ValueError(
+ "You're using the `TextToAudioPipeline` with a forward-only model, but `generate_kwargs` is "
+ "non-empty. For forward-only TTA models, please use `forward_params` instead of `generate_kwargs`. "
+ f"For reference, the `generate_kwargs` used here are: {generate_kwargs.keys()}"
+ )
+ output = self.model(**model_inputs, **forward_params)[0]
+
+ if self.vocoder is not None:
+ # in that case, the output is a spectrogram that needs to be converted into a waveform
+ output = self.vocoder(output)
+
+ return output
+
+ def __call__(self, text_inputs: Union[str, List[str]], **forward_params):
+ """
+ Generates speech/audio from the inputs. See the [`TextToAudioPipeline`] documentation for more information.
+
+ Args:
+ text_inputs (`str` or `List[str]`):
+ The text(s) to generate.
+ forward_params (`dict`, *optional*):
+ Parameters passed to the model generation/forward method. `forward_params` are always passed to the
+ underlying model.
+ generate_kwargs (`dict`, *optional*):
+ The dictionary of ad-hoc parametrization of `generate_config` to be used for the generation call. For a
+ complete overview of generate, check the [following
+ guide](https://huggingface.co/docs/transformers/en/main_classes/text_generation). `generate_kwargs` are
+ only passed to the underlying model if the latter is a generative model.
+
+ Return:
+ A `dict` or a list of `dict`: The dictionaries have two keys:
+
+ - **audio** (`np.ndarray` of shape `(nb_channels, audio_length)`) -- The generated audio waveform.
+ - **sampling_rate** (`int`) -- The sampling rate of the generated audio waveform.
+ """
+ return super().__call__(text_inputs, **forward_params)
+
+ def _sanitize_parameters(
+ self,
+ preprocess_params=None,
+ forward_params=None,
+ generate_kwargs=None,
+ ):
+ if self.assistant_model is not None:
+ generate_kwargs["assistant_model"] = self.assistant_model
+ if self.assistant_tokenizer is not None:
+ generate_kwargs["tokenizer"] = self.tokenizer
+ generate_kwargs["assistant_tokenizer"] = self.assistant_tokenizer
+
+ params = {
+ "forward_params": forward_params if forward_params else {},
+ "generate_kwargs": generate_kwargs if generate_kwargs else {},
+ }
+
+ if preprocess_params is None:
+ preprocess_params = {}
+ postprocess_params = {}
+
+ return preprocess_params, params, postprocess_params
+
+ def postprocess(self, waveform):
+ output_dict = {}
+ if isinstance(waveform, dict):
+ waveform = waveform["waveform"]
+ elif isinstance(waveform, tuple):
+ waveform = waveform[0]
+ output_dict["audio"] = waveform.cpu().float().numpy()
+ output_dict["sampling_rate"] = self.sampling_rate
+
+ return output_dict
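+
+ # Illustrative note (not part of the upstream file): the returned dict can be written to
+ # disk with any audio library, for instance (assuming scipy is installed and the waveform
+ # is mono; multi-channel output may need transposing):
+ # >>> import scipy.io.wavfile
+ # >>> out = pipe("Hello, world!")
+ # >>> scipy.io.wavfile.write("speech.wav", rate=out["sampling_rate"], data=out["audio"].squeeze())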
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/token_classification.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/token_classification.py
new file mode 100644
index 0000000000000000000000000000000000000000..9256f238148476b4d923c84f884156b4564c93a7
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/token_classification.py
@@ -0,0 +1,576 @@
+import types
+import warnings
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+
+from ..models.bert.tokenization_bert import BasicTokenizer
+from ..utils import (
+ ExplicitEnum,
+ add_end_docstrings,
+ is_tf_available,
+ is_torch_available,
+)
+from .base import ArgumentHandler, ChunkPipeline, Dataset, build_pipeline_init_args
+
+
+if is_tf_available():
+ import tensorflow as tf
+
+ from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES
+if is_torch_available():
+ import torch
+
+ from ..models.auto.modeling_auto import MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES
+
+
+class TokenClassificationArgumentHandler(ArgumentHandler):
+ """
+ Handles arguments for token classification.
+ """
+
+ def __call__(self, inputs: Union[str, List[str]], **kwargs):
+ if inputs is not None and isinstance(inputs, (list, tuple)) and len(inputs) > 0:
+ inputs = list(inputs)
+ batch_size = len(inputs)
+ elif isinstance(inputs, str):
+ inputs = [inputs]
+ batch_size = 1
+ elif Dataset is not None and isinstance(inputs, Dataset) or isinstance(inputs, types.GeneratorType):
+ return inputs, None
+ else:
+ raise ValueError("At least one input is required.")
+
+ offset_mapping = kwargs.get("offset_mapping")
+ if offset_mapping:
+ if isinstance(offset_mapping, list) and isinstance(offset_mapping[0], tuple):
+ offset_mapping = [offset_mapping]
+ if len(offset_mapping) != batch_size:
+ raise ValueError("offset_mapping should have the same batch size as the input")
+ return inputs, offset_mapping
+
+
+class AggregationStrategy(ExplicitEnum):
+ """All the valid aggregation strategies for TokenClassificationPipeline"""
+
+ NONE = "none"
+ SIMPLE = "simple"
+ FIRST = "first"
+ AVERAGE = "average"
+ MAX = "max"
+
+
+@add_end_docstrings(
+ build_pipeline_init_args(has_tokenizer=True),
+ r"""
+ ignore_labels (`List[str]`, defaults to `["O"]`):
+ A list of labels to ignore.
+ grouped_entities (`bool`, *optional*, defaults to `False`):
+ DEPRECATED, use `aggregation_strategy` instead. Whether or not to group the tokens corresponding to the
+ same entity together in the predictions or not.
+ stride (`int`, *optional*):
+ If stride is provided, the pipeline is applied on all the text. The text is split into chunks of size
+ model_max_length. Works only with fast tokenizers and `aggregation_strategy` different from `NONE`. The
+ value of this argument defines the number of overlapping tokens between chunks. In other words, the model
+ will shift forward by `tokenizer.model_max_length - stride` tokens each step.
+ aggregation_strategy (`str`, *optional*, defaults to `"none"`):
+ The strategy to fuse (or not) tokens based on the model prediction.
+
+ - "none" : Will not do any aggregation and will simply return the raw results from the model
+ - "simple" : Will attempt to group entities following the default schema. (A, B-TAG), (B, I-TAG), (C,
+ I-TAG), (D, B-TAG2) (E, B-TAG2) will end up being [{"word": ABC, "entity": "TAG"}, {"word": "D",
+ "entity": "TAG2"}, {"word": "E", "entity": "TAG2"}] Notice that two consecutive B tags will end up as
+ different entities. On word based languages, we might end up splitting words undesirably: imagine
+ Microsoft being tagged as [{"word": "Micro", "entity": "ENTERPRISE"}, {"word": "soft", "entity":
+ "NAME"}]. Look for FIRST, MAX, AVERAGE for ways to mitigate that and disambiguate words (on languages
+ that support that meaning, which is basically tokens separated by a space). These mitigations will
+ only work on real words, "New york" might still be tagged with two different entities.
+ - "first" : (works only on word based models) Will use the `SIMPLE` strategy except that words cannot
+ end up with different tags. Words will simply use the tag of the first token of the word when there
+ is ambiguity.
+ - "average" : (works only on word based models) Will use the `SIMPLE` strategy except that words,
+ cannot end up with different tags. scores will be averaged first across tokens, and then the maximum
+ label is applied.
+ - "max" : (works only on word based models) Will use the `SIMPLE` strategy except that words, cannot
+ end up with different tags. Word entity will simply be the token with the maximum score.""",
+)
+class TokenClassificationPipeline(ChunkPipeline):
+ """
+ Named Entity Recognition pipeline using any `ModelForTokenClassification`. See the [named entity recognition
+ examples](../task_summary#named-entity-recognition) for more information.
+
+ Example:
+
+ ```python
+ >>> from transformers import pipeline
+
+ >>> token_classifier = pipeline(model="Jean-Baptiste/camembert-ner", aggregation_strategy="simple")
+ >>> sentence = "Je m'appelle jean-baptiste et je vis à montréal"
+ >>> tokens = token_classifier(sentence)
+ >>> tokens
+ [{'entity_group': 'PER', 'score': 0.9931, 'word': 'jean-baptiste', 'start': 12, 'end': 26}, {'entity_group': 'LOC', 'score': 0.998, 'word': 'montréal', 'start': 38, 'end': 47}]
+
+ >>> token = tokens[0]
+ >>> # Start and end provide an easy way to highlight words in the original text.
+ >>> sentence[token["start"] : token["end"]]
+ ' jean-baptiste'
+
+ >>> # Some models use the same idea to do part of speech.
+ >>> syntaxer = pipeline(model="vblagoje/bert-english-uncased-finetuned-pos", aggregation_strategy="simple")
+ >>> syntaxer("My name is Sarah and I live in London")
+ [{'entity_group': 'PRON', 'score': 0.999, 'word': 'my', 'start': 0, 'end': 2}, {'entity_group': 'NOUN', 'score': 0.997, 'word': 'name', 'start': 3, 'end': 7}, {'entity_group': 'AUX', 'score': 0.994, 'word': 'is', 'start': 8, 'end': 10}, {'entity_group': 'PROPN', 'score': 0.999, 'word': 'sarah', 'start': 11, 'end': 16}, {'entity_group': 'CCONJ', 'score': 0.999, 'word': 'and', 'start': 17, 'end': 20}, {'entity_group': 'PRON', 'score': 0.999, 'word': 'i', 'start': 21, 'end': 22}, {'entity_group': 'VERB', 'score': 0.998, 'word': 'live', 'start': 23, 'end': 27}, {'entity_group': 'ADP', 'score': 0.999, 'word': 'in', 'start': 28, 'end': 30}, {'entity_group': 'PROPN', 'score': 0.999, 'word': 'london', 'start': 31, 'end': 37}]
+ ```
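+
+ For long texts that exceed the model's maximum length, the `stride` argument processes the text in
+ overlapping chunks (a minimal sketch; the checkpoint is reused from above, `stride=128` is an arbitrary
+ choice and the outputs are omitted because they depend on the model):
+
+ ```python
+ >>> # Requires a fast tokenizer and an aggregation_strategy other than "none".
+ >>> long_classifier = pipeline(
+ ...     model="Jean-Baptiste/camembert-ner", aggregation_strategy="first", stride=128
+ ... )
+ >>> long_classifier("Je m'appelle jean-baptiste et je vis à montréal. " * 50)  # doctest: +SKIP
+ ```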
+
+ Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)
+
+ This token recognition pipeline can currently be loaded from [`pipeline`] using the following task identifier:
+ `"ner"` (for predicting the classes of tokens in a sequence: person, organisation, location or miscellaneous).
+
+ The models that this pipeline can use are models that have been fine-tuned on a token classification task. See the
+ up-to-date list of available models on
+ [huggingface.co/models](https://huggingface.co/models?filter=token-classification).
+ """
+
+ default_input_names = "sequences"
+
+ def __init__(self, args_parser=TokenClassificationArgumentHandler(), *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.check_model_type(
+ TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES
+ if self.framework == "tf"
+ else MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES
+ )
+
+ self._basic_tokenizer = BasicTokenizer(do_lower_case=False)
+ self._args_parser = args_parser
+
+ def _sanitize_parameters(
+ self,
+ ignore_labels=None,
+ grouped_entities: Optional[bool] = None,
+ ignore_subwords: Optional[bool] = None,
+ aggregation_strategy: Optional[AggregationStrategy] = None,
+ offset_mapping: Optional[List[Tuple[int, int]]] = None,
+ stride: Optional[int] = None,
+ ):
+ preprocess_params = {}
+ if offset_mapping is not None:
+ preprocess_params["offset_mapping"] = offset_mapping
+
+ postprocess_params = {}
+ if grouped_entities is not None or ignore_subwords is not None:
+ if grouped_entities and ignore_subwords:
+ aggregation_strategy = AggregationStrategy.FIRST
+ elif grouped_entities and not ignore_subwords:
+ aggregation_strategy = AggregationStrategy.SIMPLE
+ else:
+ aggregation_strategy = AggregationStrategy.NONE
+
+ if grouped_entities is not None:
+ warnings.warn(
+ "`grouped_entities` is deprecated and will be removed in version v5.0.0, defaulted to"
+ f' `aggregation_strategy="{aggregation_strategy}"` instead.'
+ )
+ if ignore_subwords is not None:
+ warnings.warn(
+ "`ignore_subwords` is deprecated and will be removed in version v5.0.0, defaulted to"
+ f' `aggregation_strategy="{aggregation_strategy}"` instead.'
+ )
+
+ if aggregation_strategy is not None:
+ if isinstance(aggregation_strategy, str):
+ aggregation_strategy = AggregationStrategy[aggregation_strategy.upper()]
+ if (
+ aggregation_strategy
+ in {AggregationStrategy.FIRST, AggregationStrategy.MAX, AggregationStrategy.AVERAGE}
+ and not self.tokenizer.is_fast
+ ):
+ raise ValueError(
+ "Slow tokenizers cannot handle subwords. Please set the `aggregation_strategy` option"
+ ' to `"simple"` or use a fast tokenizer.'
+ )
+ postprocess_params["aggregation_strategy"] = aggregation_strategy
+ if ignore_labels is not None:
+ postprocess_params["ignore_labels"] = ignore_labels
+ if stride is not None:
+ if stride >= self.tokenizer.model_max_length:
+ raise ValueError(
+ "`stride` must be less than `tokenizer.model_max_length` (or even lower if the tokenizer adds special tokens)"
+ )
+ if aggregation_strategy == AggregationStrategy.NONE:
+ raise ValueError(
+ "`stride` was provided to process all the text but `aggregation_strategy="
+ f'"{aggregation_strategy}"`, please select another one instead.'
+ )
+ else:
+ if self.tokenizer.is_fast:
+ tokenizer_params = {
+ "return_overflowing_tokens": True,
+ "padding": True,
+ "stride": stride,
+ }
+ preprocess_params["tokenizer_params"] = tokenizer_params
+ else:
+ raise ValueError(
+ "`stride` was provided to process all the text but you're using a slow tokenizer."
+ " Please use a fast tokenizer."
+ )
+ return preprocess_params, {}, postprocess_params
+
+ def __call__(self, inputs: Union[str, List[str]], **kwargs):
+ """
+ Classify each token of the text(s) given as inputs.
+
+ Args:
+ inputs (`str` or `List[str]`):
+ One or several texts (or one list of texts) for token classification.
+
+ Return:
+ A list or a list of lists of `dict`: Each result comes as a list of dictionaries (one for each token in the
+ corresponding input, or one for each entity if this pipeline was instantiated with an `aggregation_strategy`) with
+ the following keys:
+
+ - **word** (`str`) -- The token/word classified. This is obtained by decoding the selected tokens. If you
+ want to have the exact string in the original sentence, use `start` and `end`.
+ - **score** (`float`) -- The corresponding probability for `entity`.
+ - **entity** (`str`) -- The entity predicted for that token/word (it is named *entity_group* when
+ *aggregation_strategy* is not `"none"`).
+ - **index** (`int`, only present when `aggregation_strategy="none"`) -- The index of the corresponding
+ token in the sentence.
+ - **start** (`int`, *optional*) -- The index of the start of the corresponding entity in the sentence. Only
+ exists if the offsets are available within the tokenizer.
+ - **end** (`int`, *optional*) -- The index of the end of the corresponding entity in the sentence. Only
+ exists if the offsets are available within the tokenizer.
+ """
+
+ _inputs, offset_mapping = self._args_parser(inputs, **kwargs)
+ if offset_mapping:
+ kwargs["offset_mapping"] = offset_mapping
+
+ return super().__call__(inputs, **kwargs)
+
+ def preprocess(self, sentence, offset_mapping=None, **preprocess_params):
+ tokenizer_params = preprocess_params.pop("tokenizer_params", {})
+ truncation = True if self.tokenizer.model_max_length and self.tokenizer.model_max_length > 0 else False
+ inputs = self.tokenizer(
+ sentence,
+ return_tensors=self.framework,
+ truncation=truncation,
+ return_special_tokens_mask=True,
+ return_offsets_mapping=self.tokenizer.is_fast,
+ **tokenizer_params,
+ )
+ inputs.pop("overflow_to_sample_mapping", None)
+ num_chunks = len(inputs["input_ids"])
+
+ for i in range(num_chunks):
+ if self.framework == "tf":
+ model_inputs = {k: tf.expand_dims(v[i], 0) for k, v in inputs.items()}
+ else:
+ model_inputs = {k: v[i].unsqueeze(0) for k, v in inputs.items()}
+ if offset_mapping is not None:
+ model_inputs["offset_mapping"] = offset_mapping
+ model_inputs["sentence"] = sentence if i == 0 else None
+ model_inputs["is_last"] = i == num_chunks - 1
+
+ yield model_inputs
+
+ def _forward(self, model_inputs):
+ # Forward
+ special_tokens_mask = model_inputs.pop("special_tokens_mask")
+ offset_mapping = model_inputs.pop("offset_mapping", None)
+ sentence = model_inputs.pop("sentence")
+ is_last = model_inputs.pop("is_last")
+ if self.framework == "tf":
+ logits = self.model(**model_inputs)[0]
+ else:
+ output = self.model(**model_inputs)
+ logits = output["logits"] if isinstance(output, dict) else output[0]
+
+ return {
+ "logits": logits,
+ "special_tokens_mask": special_tokens_mask,
+ "offset_mapping": offset_mapping,
+ "sentence": sentence,
+ "is_last": is_last,
+ **model_inputs,
+ }
+
+ def postprocess(self, all_outputs, aggregation_strategy=AggregationStrategy.NONE, ignore_labels=None):
+ if ignore_labels is None:
+ ignore_labels = ["O"]
+ all_entities = []
+ for model_outputs in all_outputs:
+ if self.framework == "pt" and model_outputs["logits"][0].dtype in (torch.bfloat16, torch.float16):
+ logits = model_outputs["logits"][0].to(torch.float32).numpy()
+ else:
+ logits = model_outputs["logits"][0].numpy()
+
+ sentence = all_outputs[0]["sentence"]
+ input_ids = model_outputs["input_ids"][0]
+ offset_mapping = (
+ model_outputs["offset_mapping"][0] if model_outputs["offset_mapping"] is not None else None
+ )
+ special_tokens_mask = model_outputs["special_tokens_mask"][0].numpy()
+
+ maxes = np.max(logits, axis=-1, keepdims=True)
+ shifted_exp = np.exp(logits - maxes)
+ scores = shifted_exp / shifted_exp.sum(axis=-1, keepdims=True)
+
+ if self.framework == "tf":
+ input_ids = input_ids.numpy()
+ offset_mapping = offset_mapping.numpy() if offset_mapping is not None else None
+
+ pre_entities = self.gather_pre_entities(
+ sentence, input_ids, scores, offset_mapping, special_tokens_mask, aggregation_strategy
+ )
+ grouped_entities = self.aggregate(pre_entities, aggregation_strategy)
+ # Filter anything that is in self.ignore_labels
+ entities = [
+ entity
+ for entity in grouped_entities
+ if entity.get("entity", None) not in ignore_labels
+ and entity.get("entity_group", None) not in ignore_labels
+ ]
+ all_entities.extend(entities)
+ num_chunks = len(all_outputs)
+ if num_chunks > 1:
+ all_entities = self.aggregate_overlapping_entities(all_entities)
+ return all_entities
+
+ def aggregate_overlapping_entities(self, entities):
+ if len(entities) == 0:
+ return entities
+ entities = sorted(entities, key=lambda x: x["start"])
+ aggregated_entities = []
+ previous_entity = entities[0]
+ for entity in entities:
+ if previous_entity["start"] <= entity["start"] < previous_entity["end"]:
+ current_length = entity["end"] - entity["start"]
+ previous_length = previous_entity["end"] - previous_entity["start"]
+ if current_length > previous_length:
+ previous_entity = entity
+ elif current_length == previous_length and entity["score"] > previous_entity["score"]:
+ previous_entity = entity
+ else:
+ aggregated_entities.append(previous_entity)
+ previous_entity = entity
+ aggregated_entities.append(previous_entity)
+ return aggregated_entities
+
+ def gather_pre_entities(
+ self,
+ sentence: str,
+ input_ids: np.ndarray,
+ scores: np.ndarray,
+ offset_mapping: Optional[List[Tuple[int, int]]],
+ special_tokens_mask: np.ndarray,
+ aggregation_strategy: AggregationStrategy,
+ ) -> List[dict]:
+ """Fuse various numpy arrays into dicts with all the information needed for aggregation"""
+ pre_entities = []
+ for idx, token_scores in enumerate(scores):
+ # Filter special_tokens
+ if special_tokens_mask[idx]:
+ continue
+
+ word = self.tokenizer.convert_ids_to_tokens(int(input_ids[idx]))
+ if offset_mapping is not None:
+ start_ind, end_ind = offset_mapping[idx]
+ if not isinstance(start_ind, int):
+ if self.framework == "pt":
+ start_ind = start_ind.item()
+ end_ind = end_ind.item()
+ word_ref = sentence[start_ind:end_ind]
+ if getattr(self.tokenizer, "_tokenizer", None) and getattr(
+ self.tokenizer._tokenizer.model, "continuing_subword_prefix", None
+ ):
+ # This is a BPE, word-aware tokenizer; there is a correct way
+ # to fuse tokens
+ is_subword = len(word) != len(word_ref)
+ else:
+ # This is a fallback heuristic. It will most likely fail on any kind of text + punctuation
+ # mixture that gets considered a "word". Non word-aware models unfortunately cannot do better
+ # than this.
+ if aggregation_strategy in {
+ AggregationStrategy.FIRST,
+ AggregationStrategy.AVERAGE,
+ AggregationStrategy.MAX,
+ }:
+ warnings.warn(
+ "Tokenizer does not support real words, using fallback heuristic",
+ UserWarning,
+ )
+ is_subword = start_ind > 0 and " " not in sentence[start_ind - 1 : start_ind + 1]
+
+ if int(input_ids[idx]) == self.tokenizer.unk_token_id:
+ word = word_ref
+ is_subword = False
+ else:
+ start_ind = None
+ end_ind = None
+ is_subword = False
+
+ pre_entity = {
+ "word": word,
+ "scores": token_scores,
+ "start": start_ind,
+ "end": end_ind,
+ "index": idx,
+ "is_subword": is_subword,
+ }
+ pre_entities.append(pre_entity)
+ return pre_entities
+
+ def aggregate(self, pre_entities: List[dict], aggregation_strategy: AggregationStrategy) -> List[dict]:
+ if aggregation_strategy in {AggregationStrategy.NONE, AggregationStrategy.SIMPLE}:
+ entities = []
+ for pre_entity in pre_entities:
+ entity_idx = pre_entity["scores"].argmax()
+ score = pre_entity["scores"][entity_idx]
+ entity = {
+ "entity": self.model.config.id2label[entity_idx],
+ "score": score,
+ "index": pre_entity["index"],
+ "word": pre_entity["word"],
+ "start": pre_entity["start"],
+ "end": pre_entity["end"],
+ }
+ entities.append(entity)
+ else:
+ entities = self.aggregate_words(pre_entities, aggregation_strategy)
+
+ if aggregation_strategy == AggregationStrategy.NONE:
+ return entities
+ return self.group_entities(entities)
+
+ def aggregate_word(self, entities: List[dict], aggregation_strategy: AggregationStrategy) -> dict:
+ word = self.tokenizer.convert_tokens_to_string([entity["word"] for entity in entities])
+ if aggregation_strategy == AggregationStrategy.FIRST:
+ scores = entities[0]["scores"]
+ idx = scores.argmax()
+ score = scores[idx]
+ entity = self.model.config.id2label[idx]
+ elif aggregation_strategy == AggregationStrategy.MAX:
+ max_entity = max(entities, key=lambda entity: entity["scores"].max())
+ scores = max_entity["scores"]
+ idx = scores.argmax()
+ score = scores[idx]
+ entity = self.model.config.id2label[idx]
+ elif aggregation_strategy == AggregationStrategy.AVERAGE:
+ scores = np.stack([entity["scores"] for entity in entities])
+ average_scores = np.nanmean(scores, axis=0)
+ entity_idx = average_scores.argmax()
+ entity = self.model.config.id2label[entity_idx]
+ score = average_scores[entity_idx]
+ else:
+ raise ValueError("Invalid aggregation_strategy")
+ new_entity = {
+ "entity": entity,
+ "score": score,
+ "word": word,
+ "start": entities[0]["start"],
+ "end": entities[-1]["end"],
+ }
+ return new_entity
+
+ def aggregate_words(self, entities: List[dict], aggregation_strategy: AggregationStrategy) -> List[dict]:
+ """
+ Override tokens from a given word that disagree to force agreement on word boundaries.
+
+ Example: micro|soft| com|pany| B-ENT I-NAME I-ENT I-ENT will be rewritten with first strategy as microsoft|
+ company| B-ENT I-ENT
+ """
+ if aggregation_strategy in {
+ AggregationStrategy.NONE,
+ AggregationStrategy.SIMPLE,
+ }:
+ raise ValueError("NONE and SIMPLE strategies are invalid for word aggregation")
+
+ word_entities = []
+ word_group = None
+ for entity in entities:
+ if word_group is None:
+ word_group = [entity]
+ elif entity["is_subword"]:
+ word_group.append(entity)
+ else:
+ word_entities.append(self.aggregate_word(word_group, aggregation_strategy))
+ word_group = [entity]
+ # Last item
+ if word_group is not None:
+ word_entities.append(self.aggregate_word(word_group, aggregation_strategy))
+ return word_entities
+
+ def group_sub_entities(self, entities: List[dict]) -> dict:
+ """
+ Group together the adjacent tokens with the same entity predicted.
+
+ Args:
+ entities (`List[dict]`): The entities predicted by the pipeline.
+ """
+ # Get the first entity in the entity group
+ entity = entities[0]["entity"].split("-", 1)[-1]
+ scores = np.nanmean([entity["score"] for entity in entities])
+ tokens = [entity["word"] for entity in entities]
+
+ entity_group = {
+ "entity_group": entity,
+ "score": np.mean(scores),
+ "word": self.tokenizer.convert_tokens_to_string(tokens),
+ "start": entities[0]["start"],
+ "end": entities[-1]["end"],
+ }
+ return entity_group
+
+ def get_tag(self, entity_name: str) -> Tuple[str, str]:
+ if entity_name.startswith("B-"):
+ bi = "B"
+ tag = entity_name[2:]
+ elif entity_name.startswith("I-"):
+ bi = "I"
+ tag = entity_name[2:]
+ else:
+ # It's not in B-, I- format
+ # Default to I- for continuation.
+ bi = "I"
+ tag = entity_name
+ return bi, tag
+
+ def group_entities(self, entities: List[dict]) -> List[dict]:
+ """
+ Find and group together the adjacent tokens with the same entity predicted.
+
+ Args:
+ entities (`List[dict]`): The entities predicted by the pipeline.
+ """
+
+ entity_groups = []
+ entity_group_disagg = []
+
+ for entity in entities:
+ if not entity_group_disagg:
+ entity_group_disagg.append(entity)
+ continue
+
+ # If the current entity is similar and adjacent to the previous entity,
+ # append it to the disaggregated entity group
+ # The split is meant to account for the "B" and "I" prefixes
+ # Shouldn't merge if both entities are B-type
+ bi, tag = self.get_tag(entity["entity"])
+ last_bi, last_tag = self.get_tag(entity_group_disagg[-1]["entity"])
+
+ if tag == last_tag and bi != "B":
+ # Modify subword type to be previous_type
+ entity_group_disagg.append(entity)
+ else:
+ # If the current entity is different from the previous entity
+ # aggregate the disaggregated entity group
+ entity_groups.append(self.group_sub_entities(entity_group_disagg))
+ entity_group_disagg = [entity]
+ if entity_group_disagg:
+ # it's the last entity, add it to the entity groups
+ entity_groups.append(self.group_sub_entities(entity_group_disagg))
+
+ return entity_groups
+
+
+NerPipeline = TokenClassificationPipeline
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/video_classification.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/video_classification.py
new file mode 100644
index 0000000000000000000000000000000000000000..057910098da20a1dfc02bf0d8b041e2d7af8cd09
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/video_classification.py
@@ -0,0 +1,184 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import warnings
+from io import BytesIO
+from typing import List, Union
+
+import requests
+
+from ..utils import (
+ add_end_docstrings,
+ is_av_available,
+ is_torch_available,
+ logging,
+ requires_backends,
+)
+from .base import Pipeline, build_pipeline_init_args
+
+
+if is_av_available():
+ import av
+ import numpy as np
+
+
+if is_torch_available():
+ from ..models.auto.modeling_auto import MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING_NAMES
+
+logger = logging.get_logger(__name__)
+
+
+@add_end_docstrings(build_pipeline_init_args(has_image_processor=True))
+class VideoClassificationPipeline(Pipeline):
+ """
+ Video classification pipeline using any `AutoModelForVideoClassification`. This pipeline predicts the class of a
+ video.
+
+ This video classification pipeline can currently be loaded from [`pipeline`] using the following task identifier:
+ `"video-classification"`.
+
+ See the list of available models on
+ [huggingface.co/models](https://huggingface.co/models?filter=video-classification).
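+
+ Example (a minimal sketch; the checkpoint name is an assumption, the video path is a placeholder, and
+ the outputs are omitted because they depend on the model):
+
+ ```python
+ >>> from transformers import pipeline
+
+ >>> video_classifier = pipeline(
+ ...     task="video-classification", model="MCG-NJU/videomae-base-finetuned-kinetics"
+ ... )
+ >>> video_classifier("path/to/video.mp4", top_k=2)  # doctest: +SKIP
+ ```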
+ """
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ requires_backends(self, "av")
+ self.check_model_type(MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING_NAMES)
+
+ def _sanitize_parameters(self, top_k=None, num_frames=None, frame_sampling_rate=None, function_to_apply=None):
+ preprocess_params = {}
+ if frame_sampling_rate is not None:
+ preprocess_params["frame_sampling_rate"] = frame_sampling_rate
+ if num_frames is not None:
+ preprocess_params["num_frames"] = num_frames
+
+ postprocess_params = {}
+ if top_k is not None:
+ postprocess_params["top_k"] = top_k
+ if function_to_apply is not None:
+ if function_to_apply not in ["softmax", "sigmoid", "none"]:
+ raise ValueError(
+ f"Invalid value for `function_to_apply`: {function_to_apply}. "
+ "Valid options are ['softmax', 'sigmoid', 'none']"
+ )
+ postprocess_params["function_to_apply"] = function_to_apply
+ else:
+ postprocess_params["function_to_apply"] = "softmax"
+ return preprocess_params, {}, postprocess_params
+
+ def __call__(self, inputs: Union[str, List[str]] = None, **kwargs):
+ """
+ Assign labels to the video(s) passed as inputs.
+
+ Args:
+ inputs (`str`, `List[str]`):
+ The pipeline handles two types of videos:
+
+ - A string containing a http link pointing to a video
+ - A string containing a local path to a video
+
+ The pipeline accepts either a single video or a batch of videos, which must then be passed as strings.
+ Videos in a batch must all be in the same format: all as http links or all as local paths.
+ top_k (`int`, *optional*, defaults to 5):
+ The number of top labels that will be returned by the pipeline. If the provided number is higher than
+ the number of labels available in the model configuration, it will default to the number of labels.
+ num_frames (`int`, *optional*, defaults to `self.model.config.num_frames`):
+ The number of frames sampled from the video to run the classification on. If not provided, will default
+ to the number of frames specified in the model configuration.
+ frame_sampling_rate (`int`, *optional*, defaults to 1):
+ The sampling rate used to select frames from the video. If not provided, will default to 1, i.e. every
+ frame will be used.
+ function_to_apply(`str`, *optional*, defaults to "softmax"):
+ The function to apply to the model output. By default, the pipeline will apply the softmax function to
+ the output of the model. Valid options: ["softmax", "sigmoid", "none"]. Note that passing Python's
+ built-in `None` will default to "softmax", so you need to pass the string "none" to disable any
+ post-processing.
+
+ Return:
+ A dictionary or a list of dictionaries containing the result. If the input is a single video, will return a
+ dictionary, if the input is a list of several videos, will return a list of dictionaries corresponding to
+ the videos.
+
+ The dictionaries contain the following keys:
+
+ - **label** (`str`) -- The label identified by the model.
+ - **score** (`float`) -- The score attributed by the model for that label.
+ """
+ # Once the `videos` deprecation is complete, remove the default `None` value for `inputs`
+ if "videos" in kwargs:
+ warnings.warn(
+ "The `videos` argument has been renamed to `inputs`. In version 5 of Transformers, `videos` will no longer be accepted",
+ FutureWarning,
+ )
+ inputs = kwargs.pop("videos")
+ if inputs is None:
+ raise ValueError("Cannot call the video-classification pipeline without an inputs argument!")
+ return super().__call__(inputs, **kwargs)
+
+ def preprocess(self, video, num_frames=None, frame_sampling_rate=1):
+ if num_frames is None:
+ num_frames = self.model.config.num_frames
+
+ if video.startswith("http://") or video.startswith("https://"):
+ video = BytesIO(requests.get(video).content)
+
+ container = av.open(video)
+
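+ # Sample `num_frames` frame indices evenly from the first `num_frames * frame_sampling_rate` frames,
+ # e.g. num_frames=16 and frame_sampling_rate=4 selects 16 indices spread over frames 0..63.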
+ start_idx = 0
+ end_idx = num_frames * frame_sampling_rate - 1
+ indices = np.linspace(start_idx, end_idx, num=num_frames, dtype=np.int64)
+
+ video = read_video_pyav(container, indices)
+ video = list(video)
+
+ model_inputs = self.image_processor(video, return_tensors=self.framework)
+ if self.framework == "pt":
+ model_inputs = model_inputs.to(self.torch_dtype)
+ return model_inputs
+
+ def _forward(self, model_inputs):
+ model_outputs = self.model(**model_inputs)
+ return model_outputs
+
+ def postprocess(self, model_outputs, top_k=5, function_to_apply="softmax"):
+ if top_k > self.model.config.num_labels:
+ top_k = self.model.config.num_labels
+
+ if self.framework == "pt":
+ if function_to_apply == "softmax":
+ probs = model_outputs.logits[0].softmax(-1)
+ elif function_to_apply == "sigmoid":
+ probs = model_outputs.logits[0].sigmoid()
+ else:
+ probs = model_outputs.logits[0]
+ scores, ids = probs.topk(top_k)
+ else:
+ raise ValueError(f"Unsupported framework: {self.framework}")
+
+ scores = scores.tolist()
+ ids = ids.tolist()
+ return [{"score": score, "label": self.model.config.id2label[_id]} for score, _id in zip(scores, ids)]
+
+
+def read_video_pyav(container, indices):
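+ """
+ Decode the video stream of a PyAV container at the given (ascending) frame indices and return the
+ selected frames as a (num_frames, height, width, 3) RGB numpy array.
+ """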
+ frames = []
+ container.seek(0)
+ start_index = indices[0]
+ end_index = indices[-1]
+ for i, frame in enumerate(container.decode(video=0)):
+ if i > end_index:
+ break
+ if i >= start_index and i in indices:
+ frames.append(frame)
+ return np.stack([x.to_ndarray(format="rgb24") for x in frames])
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/visual_question_answering.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/visual_question_answering.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d600c9eaf50bc99f6810b0c2836b154cd62ed51
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/visual_question_answering.py
@@ -0,0 +1,200 @@
+from typing import List, Union
+
+from ..utils import add_end_docstrings, is_torch_available, is_vision_available, logging
+from .base import Pipeline, build_pipeline_init_args
+
+
+if is_vision_available():
+ from PIL import Image
+
+ from ..image_utils import load_image
+
+if is_torch_available():
+ from ..models.auto.modeling_auto import MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING_NAMES
+ from .pt_utils import KeyDataset
+
+logger = logging.get_logger(__name__)
+
+
+@add_end_docstrings(build_pipeline_init_args(has_tokenizer=True, has_image_processor=True))
+class VisualQuestionAnsweringPipeline(Pipeline):
+ """
+ Visual Question Answering pipeline using a `AutoModelForVisualQuestionAnswering`. This pipeline is currently only
+ available in PyTorch.
+
+ Example:
+
+ ```python
+ >>> from transformers import pipeline
+
+ >>> oracle = pipeline(model="dandelin/vilt-b32-finetuned-vqa")
+ >>> image_url = "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/lena.png"
+ >>> oracle(question="What is she wearing ?", image=image_url)
+ [{'score': 0.948, 'answer': 'hat'}, {'score': 0.009, 'answer': 'fedora'}, {'score': 0.003, 'answer': 'clothes'}, {'score': 0.003, 'answer': 'sun hat'}, {'score': 0.002, 'answer': 'nothing'}]
+
+ >>> oracle(question="What is she wearing ?", image=image_url, top_k=1)
+ [{'score': 0.948, 'answer': 'hat'}]
+
+ >>> oracle(question="Is this a person ?", image=image_url, top_k=1)
+ [{'score': 0.993, 'answer': 'yes'}]
+
+ >>> oracle(question="Is this a man ?", image=image_url, top_k=1)
+ [{'score': 0.996, 'answer': 'no'}]
+ ```
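+
+ The same pipeline also accepts dict-style and batched inputs (a minimal sketch reusing the image URL
+ above; outputs are omitted because they depend on the model):
+
+ ```python
+ >>> oracle({"image": image_url, "question": "What is she wearing ?"})  # doctest: +SKIP
+ >>> oracle(
+ ...     [
+ ...         {"image": image_url, "question": "What is she wearing ?"},
+ ...         {"image": image_url, "question": "Is this a person ?"},
+ ...     ],
+ ...     top_k=1,
+ ... )  # doctest: +SKIP
+ ```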
+
+ Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)
+
+ This visual question answering pipeline can currently be loaded from [`pipeline`] using the following task
+ identifiers: `"visual-question-answering", "vqa"`.
+
+ The models that this pipeline can use are models that have been fine-tuned on a visual question answering task. See
+ the up-to-date list of available models on
+ [huggingface.co/models](https://huggingface.co/models?filter=visual-question-answering).
+ """
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.check_model_type(MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING_NAMES)
+
+ def _sanitize_parameters(self, top_k=None, padding=None, truncation=None, timeout=None, **kwargs):
+ preprocess_params, postprocess_params = {}, {}
+ if padding is not None:
+ preprocess_params["padding"] = padding
+ if truncation is not None:
+ preprocess_params["truncation"] = truncation
+ if timeout is not None:
+ preprocess_params["timeout"] = timeout
+ if top_k is not None:
+ postprocess_params["top_k"] = top_k
+
+ forward_params = {}
+ if self.assistant_model is not None:
+ forward_params["assistant_model"] = self.assistant_model
+ if self.assistant_tokenizer is not None:
+ forward_params["tokenizer"] = self.tokenizer
+ forward_params["assistant_tokenizer"] = self.assistant_tokenizer
+
+ return preprocess_params, forward_params, postprocess_params
+
+ def __call__(
+ self,
+ image: Union["Image.Image", str, List["Image.Image"], List[str], "KeyDataset"],
+ question: Union[str, List[str]] = None,
+ **kwargs,
+ ):
+ r"""
+ Answers open-ended questions about images. The pipeline accepts several types of inputs which are detailed
+ below:
+
+ - `pipeline(image=image, question=question)`
+ - `pipeline({"image": image, "question": question})`
+ - `pipeline([{"image": image, "question": question}])`
+ - `pipeline([{"image": image, "question": question}, {"image": image, "question": question}])`
+
+ Args:
+ image (`str`, `List[str]`, `PIL.Image`, `List[PIL.Image]` or `KeyDataset`):
+ The pipeline handles three types of images:
+
+ - A string containing a http link pointing to an image
+ - A string containing a local path to an image
+ - An image loaded in PIL directly
+
+ The pipeline accepts either a single image or a batch of images. If given a single image, it can be
+ broadcasted to multiple questions.
+ For dataset: the passed in dataset must be of type `transformers.pipelines.pt_utils.KeyDataset`
+ Example:
+ ```python
+ >>> from transformers.pipelines.pt_utils import KeyDataset
+ >>> from datasets import load_dataset
+
+ >>> dataset = load_dataset("detection-datasets/coco")
+ >>> oracle(image=KeyDataset(dataset, "image"), question="What's in this image?")
+
+ ```
+ question (`str`, `List[str]`):
+ The question(s) asked. If given a single question, it can be broadcasted to multiple images.
+ If multiple images and questions are given, each and every question will be broadcasted to all images
+ (same effect as a Cartesian product)
+ top_k (`int`, *optional*, defaults to 5):
+ The number of top labels that will be returned by the pipeline. If the provided number is higher than
+ the number of labels available in the model configuration, it will default to the number of labels.
+ timeout (`float`, *optional*, defaults to None):
+ The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
+ the call may block forever.
+ Return:
+ A dictionary or a list of dictionaries containing the result. The dictionaries contain the following keys:
+
+ - **answer** (`str`) -- The answer predicted by the model.
+ - **score** (`float`) -- The score attributed by the model to that answer (only returned when the
+ underlying model is a classification model rather than a generative one).
+ """
+ is_dataset = isinstance(image, KeyDataset)
+ is_image_batch = isinstance(image, list) and all(isinstance(item, (Image.Image, str)) for item in image)
+ is_question_batch = isinstance(question, list) and all(isinstance(item, str) for item in question)
+
+ if isinstance(image, (Image.Image, str)) and isinstance(question, str):
+ inputs = {"image": image, "question": question}
+ elif (is_image_batch or is_dataset) and isinstance(question, str):
+ inputs = [{"image": im, "question": question} for im in image]
+ elif isinstance(image, (Image.Image, str)) and is_question_batch:
+ inputs = [{"image": image, "question": q} for q in question]
+ elif (is_image_batch or is_dataset) and is_question_batch:
+ question_image_pairs = []
+ for q in question:
+ for im in image:
+ question_image_pairs.append({"image": im, "question": q})
+ inputs = question_image_pairs
+ else:
+ """
+ Supports the following format
+ - {"image": image, "question": question}
+ - [{"image": image, "question": question}]
+ - Generator and datasets
+ """
+ inputs = image
+ results = super().__call__(inputs, **kwargs)
+ return results
+
+ def preprocess(self, inputs, padding=False, truncation=False, timeout=None):
+ image = load_image(inputs["image"], timeout=timeout)
+ model_inputs = self.tokenizer(
+ inputs["question"],
+ return_tensors=self.framework,
+ padding=padding,
+ truncation=truncation,
+ )
+ image_features = self.image_processor(images=image, return_tensors=self.framework)
+ if self.framework == "pt":
+ image_features = image_features.to(self.torch_dtype)
+ model_inputs.update(image_features)
+ return model_inputs
+
+ def _forward(self, model_inputs, **generate_kwargs):
+ if self.model.can_generate():
+ # User-defined `generation_config` passed to the pipeline call take precedence
+ if "generation_config" not in generate_kwargs:
+ generate_kwargs["generation_config"] = self.generation_config
+
+ model_outputs = self.model.generate(**model_inputs, **generate_kwargs)
+ else:
+ model_outputs = self.model(**model_inputs)
+ return model_outputs
+
+ def postprocess(self, model_outputs, top_k=5):
+ if self.model.can_generate():
+ return [
+ {"answer": self.tokenizer.decode(output_ids, skip_special_tokens=True).strip()}
+ for output_ids in model_outputs
+ ]
+ else:
+ if top_k > self.model.config.num_labels:
+ top_k = self.model.config.num_labels
+
+ if self.framework == "pt":
+ probs = model_outputs.logits.sigmoid()[0]
+ scores, ids = probs.topk(top_k)
+ else:
+ raise ValueError(f"Unsupported framework: {self.framework}")
+
+ scores = scores.tolist()
+ ids = ids.tolist()
+ return [{"score": score, "answer": self.model.config.id2label[_id]} for score, _id in zip(scores, ids)]
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/zero_shot_audio_classification.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/zero_shot_audio_classification.py
new file mode 100644
index 0000000000000000000000000000000000000000..8ed339a5b7f889c21991eaec6901887ce97d90cd
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/zero_shot_audio_classification.py
@@ -0,0 +1,164 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from collections import UserDict
+from typing import Union
+
+import numpy as np
+import requests
+
+from ..utils import (
+ add_end_docstrings,
+ logging,
+)
+from .audio_classification import ffmpeg_read
+from .base import Pipeline, build_pipeline_init_args
+
+
+logger = logging.get_logger(__name__)
+
+
+@add_end_docstrings(build_pipeline_init_args(has_feature_extractor=True, has_tokenizer=True))
+class ZeroShotAudioClassificationPipeline(Pipeline):
+ """
+ Zero shot audio classification pipeline using `ClapModel`. This pipeline predicts the class of an audio when you
+ provide an audio and a set of `candidate_labels`.
+
+
+
+ The default `hypothesis_template` is `"This is a sound of {}."`. Make sure to update it for your use case.
+
+
+
+ Example:
+ ```python
+ >>> from transformers import pipeline
+ >>> from datasets import load_dataset
+
+ >>> dataset = load_dataset("ashraq/esc50")
+ >>> audio = next(iter(dataset["train"]["audio"]))["array"]
+ >>> classifier = pipeline(task="zero-shot-audio-classification", model="laion/clap-htsat-unfused")
+ >>> classifier(audio, candidate_labels=["Sound of a dog", "Sound of vaccum cleaner"])
+ [{'score': 0.9996, 'label': 'Sound of a dog'}, {'score': 0.0004, 'label': 'Sound of vaccum cleaner'}]
+ ```
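+
+ A custom `hypothesis_template` can be passed as well (a minimal sketch reusing the classifier and audio
+ above; scores are omitted because they depend on the checkpoint):
+
+ ```python
+ >>> classifier(
+ ...     audio,
+ ...     candidate_labels=["dog", "vacuum cleaner"],
+ ...     hypothesis_template="This is the sound of a {}.",
+ ... )  # doctest: +SKIP
+ ```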
+
+
+ Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial) This audio
+ classification pipeline can currently be loaded from [`pipeline`] using the following task identifier:
+ `"zero-shot-audio-classification"`. See the list of available models on
+ [huggingface.co/models](https://huggingface.co/models?filter=zero-shot-audio-classification).
+ """
+
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+
+ if self.framework != "pt":
+ raise ValueError(f"The {self.__class__} is only available in PyTorch.")
+ # No specific FOR_XXX available yet
+
+ def __call__(self, audios: Union[np.ndarray, bytes, str], **kwargs):
+ """
+ Assign labels to the audio(s) passed as inputs.
+
+ Args:
+ audios (`str`, `List[str]`, `np.array` or `List[np.array]`):
+ The pipeline handles three types of inputs:
+ - A string containing a http link pointing to an audio
+ - A string containing a local path to an audio
+ - An audio loaded in numpy
+ candidate_labels (`List[str]`):
+ The candidate labels for this audio. They will be formatted using *hypothesis_template*.
+ hypothesis_template (`str`, *optional*, defaults to `"This is a sound of {}."`):
+ The format used in conjunction with *candidate_labels* to attempt the audio classification by
+ replacing the placeholder with the candidate_labels. Pass "{}" if *candidate_labels* are
+ already formatted.
+ Return:
+ A list of dictionaries containing one entry per proposed label. Each dictionary contains the
+ following keys:
+ - **label** (`str`) -- One of the suggested *candidate_labels*.
+ - **score** (`float`) -- The score attributed by the model to that label. It is a value between
+ 0 and 1, computed as the `softmax` of `logits_per_audio`.
+ """
+ return super().__call__(audios, **kwargs)
+
+ def _sanitize_parameters(self, **kwargs):
+ preprocess_params = {}
+ if "candidate_labels" in kwargs:
+ preprocess_params["candidate_labels"] = kwargs["candidate_labels"]
+ if "hypothesis_template" in kwargs:
+ preprocess_params["hypothesis_template"] = kwargs["hypothesis_template"]
+
+ return preprocess_params, {}, {}
+
+ def preprocess(self, audio, candidate_labels=None, hypothesis_template="This is a sound of {}."):
+ if isinstance(audio, str):
+ if audio.startswith("http://") or audio.startswith("https://"):
+ # We need to actually check for a real protocol, otherwise it's impossible to use a local file
+ # like http_huggingface_co.png
+ audio = requests.get(audio).content
+ else:
+ with open(audio, "rb") as f:
+ audio = f.read()
+
+ if isinstance(audio, bytes):
+ audio = ffmpeg_read(audio, self.feature_extractor.sampling_rate)
+
+ if not isinstance(audio, np.ndarray):
+ raise TypeError("We expect a numpy ndarray as input")
+ if len(audio.shape) != 1:
+ raise ValueError("We expect a single channel audio input for ZeroShotAudioClassificationPipeline")
+
+ inputs = self.feature_extractor(
+ [audio], sampling_rate=self.feature_extractor.sampling_rate, return_tensors="pt"
+ )
+ if self.framework == "pt":
+ inputs = inputs.to(self.torch_dtype)
+ inputs["candidate_labels"] = candidate_labels
+ sequences = [hypothesis_template.format(x) for x in candidate_labels]
+ text_inputs = self.tokenizer(sequences, return_tensors=self.framework, padding=True)
+ inputs["text_inputs"] = [text_inputs]
+ return inputs
+
+ def _forward(self, model_inputs):
+ candidate_labels = model_inputs.pop("candidate_labels")
+ text_inputs = model_inputs.pop("text_inputs")
+ if isinstance(text_inputs[0], UserDict):
+ text_inputs = text_inputs[0]
+ else:
+ # Batching case.
+ text_inputs = text_inputs[0][0]
+
+ outputs = self.model(**text_inputs, **model_inputs)
+
+ model_outputs = {
+ "candidate_labels": candidate_labels,
+ "logits": outputs.logits_per_audio,
+ }
+ return model_outputs
+
+ def postprocess(self, model_outputs):
+ candidate_labels = model_outputs.pop("candidate_labels")
+ logits = model_outputs["logits"][0]
+
+ if self.framework == "pt":
+ probs = logits.softmax(dim=0)
+ scores = probs.tolist()
+ else:
+ raise ValueError("`tf` framework not supported.")
+
+ result = [
+ {"score": score, "label": candidate_label}
+ for score, candidate_label in sorted(zip(scores, candidate_labels), key=lambda x: -x[0])
+ ]
+ return result
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/zero_shot_classification.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/zero_shot_classification.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4aee3341e30d55691ea74d0e90dd00ba4567c8b
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/zero_shot_classification.py
@@ -0,0 +1,268 @@
+import inspect
+from typing import List, Union
+
+import numpy as np
+
+from ..tokenization_utils import TruncationStrategy
+from ..utils import add_end_docstrings, logging
+from .base import ArgumentHandler, ChunkPipeline, build_pipeline_init_args
+
+
+logger = logging.get_logger(__name__)
+
+
+class ZeroShotClassificationArgumentHandler(ArgumentHandler):
+ """
+ Handles arguments for zero-shot for text classification by turning each possible label into an NLI
+ premise/hypothesis pair.
+ """
+
+ def _parse_labels(self, labels):
+ if isinstance(labels, str):
+ labels = [label.strip() for label in labels.split(",") if label.strip()]
+ return labels
+
+ def __call__(self, sequences, labels, hypothesis_template):
+ if len(labels) == 0 or len(sequences) == 0:
+ raise ValueError("You must include at least one label and at least one sequence.")
+ if hypothesis_template.format(labels[0]) == hypothesis_template:
+ raise ValueError(
+ (
+ 'The provided hypothesis_template "{}" was not able to be formatted with the target labels. '
+ "Make sure the passed template includes formatting syntax such as {{}} where the label should go."
+ ).format(hypothesis_template)
+ )
+
+ if isinstance(sequences, str):
+ sequences = [sequences]
+
+ sequence_pairs = []
+ for sequence in sequences:
+ sequence_pairs.extend([[sequence, hypothesis_template.format(label)] for label in labels])
+
+ return sequence_pairs, sequences
+
+
+@add_end_docstrings(build_pipeline_init_args(has_tokenizer=True))
+class ZeroShotClassificationPipeline(ChunkPipeline):
+ """
+ NLI-based zero-shot classification pipeline using a `ModelForSequenceClassification` trained on NLI (natural
+ language inference) tasks. Equivalent of `text-classification` pipelines, but these models don't require a
+ hardcoded number of potential classes, they can be chosen at runtime. It usually means it's slower but it is
+ **much** more flexible.
+
+ Any combination of sequences and labels can be passed and each combination will be posed as a premise/hypothesis
+ pair and passed to the pretrained model. Then, the logit for *entailment* is taken as the logit for the candidate
+ label being valid. Any NLI model can be used, but the id of the *entailment* label must be included in the model
+ config's [`~transformers.PretrainedConfig.label2id`].
+
+ Example:
+
+ ```python
+ >>> from transformers import pipeline
+
+ >>> oracle = pipeline(model="facebook/bart-large-mnli")
+ >>> oracle(
+ ... "I have a problem with my iphone that needs to be resolved asap!!",
+ ... candidate_labels=["urgent", "not urgent", "phone", "tablet", "computer"],
+ ... )
+ {'sequence': 'I have a problem with my iphone that needs to be resolved asap!!', 'labels': ['urgent', 'phone', 'computer', 'not urgent', 'tablet'], 'scores': [0.504, 0.479, 0.013, 0.003, 0.002]}
+
+ >>> oracle(
+ ... "I have a problem with my iphone that needs to be resolved asap!!",
+ ... candidate_labels=["english", "german"],
+ ... )
+ {'sequence': 'I have a problem with my iphone that needs to be resolved asap!!', 'labels': ['english', 'german'], 'scores': [0.814, 0.186]}
+ ```
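+
+ When several labels can be true at once, `multi_label=True` scores each candidate independently (a
+ minimal sketch reusing the oracle above; scores are omitted because they depend on the checkpoint):
+
+ ```python
+ >>> oracle(
+ ...     "I have a problem with my iphone that needs to be resolved asap!!",
+ ...     candidate_labels=["urgent", "phone", "tablet"],
+ ...     multi_label=True,
+ ... )  # doctest: +SKIP
+ ```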
+
+ Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)
+
+ This NLI pipeline can currently be loaded from [`pipeline`] using the following task identifier:
+ `"zero-shot-classification"`.
+
+ The models that this pipeline can use are models that have been fine-tuned on an NLI task. See the up-to-date list
+ of available models on [huggingface.co/models](https://huggingface.co/models?search=nli).
+ """
+
+ def __init__(self, args_parser=ZeroShotClassificationArgumentHandler(), *args, **kwargs):
+ self._args_parser = args_parser
+ super().__init__(*args, **kwargs)
+ if self.entailment_id == -1:
+ logger.warning(
+ "Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to "
+ "-1. Define a descriptive label2id mapping in the model config to ensure correct outputs."
+ )
+
+ @property
+ def entailment_id(self):
+ for label, ind in self.model.config.label2id.items():
+ if label.lower().startswith("entail"):
+ return ind
+ return -1
+
+ def _parse_and_tokenize(
+ self, sequence_pairs, padding=True, add_special_tokens=True, truncation=TruncationStrategy.ONLY_FIRST, **kwargs
+ ):
+ """
+ Parse arguments and tokenize only_first so that hypothesis (label) is not truncated
+ """
+ return_tensors = self.framework
+ if self.tokenizer.pad_token is None:
+ # Override for tokenizers not supporting padding
+ logger.error(
+ "Tokenizer was not supporting padding necessary for zero-shot, attempting to use "
+ " `pad_token=eos_token`"
+ )
+ self.tokenizer.pad_token = self.tokenizer.eos_token
+ try:
+ inputs = self.tokenizer(
+ sequence_pairs,
+ add_special_tokens=add_special_tokens,
+ return_tensors=return_tensors,
+ padding=padding,
+ truncation=truncation,
+ )
+ except Exception as e:
+ if "too short" in str(e):
+ # tokenizers might yell that we want to truncate
+ # to a value that is not even reached by the input.
+ # In that case we don't want to truncate.
+ # It seems there's not a really better way to catch that
+ # exception.
+
+ inputs = self.tokenizer(
+ sequence_pairs,
+ add_special_tokens=add_special_tokens,
+ return_tensors=return_tensors,
+ padding=padding,
+ truncation=TruncationStrategy.DO_NOT_TRUNCATE,
+ )
+ else:
+ raise e
+
+ return inputs
+
+ def _sanitize_parameters(self, **kwargs):
+ if kwargs.get("multi_class", None) is not None:
+ kwargs["multi_label"] = kwargs["multi_class"]
+ logger.warning(
+ "The `multi_class` argument has been deprecated and renamed to `multi_label`. "
+ "`multi_class` will be removed in a future version of Transformers."
+ )
+ preprocess_params = {}
+ if "candidate_labels" in kwargs:
+ preprocess_params["candidate_labels"] = self._args_parser._parse_labels(kwargs["candidate_labels"])
+ if "hypothesis_template" in kwargs:
+ preprocess_params["hypothesis_template"] = kwargs["hypothesis_template"]
+
+ postprocess_params = {}
+ if "multi_label" in kwargs:
+ postprocess_params["multi_label"] = kwargs["multi_label"]
+ return preprocess_params, {}, postprocess_params
+
+ def __call__(
+ self,
+ sequences: Union[str, List[str]],
+ *args,
+ **kwargs,
+ ):
+ """
+ Classify the sequence(s) given as inputs. See the [`ZeroShotClassificationPipeline`] documentation for more
+ information.
+
+ Args:
+ sequences (`str` or `List[str]`):
+ The sequence(s) to classify, will be truncated if the model input is too large.
+ candidate_labels (`str` or `List[str]`):
+ The set of possible class labels to classify each sequence into. Can be a single label, a string of
+ comma-separated labels, or a list of labels.
+ hypothesis_template (`str`, *optional*, defaults to `"This example is {}."`):
+ The template used to turn each label into an NLI-style hypothesis. This template must include a {} or
+ similar syntax for the candidate label to be inserted into the template. For example, the default
+ template is `"This example is {}."` With the candidate label `"sports"`, this would be fed into the
+ model like `"<cls> sequence to classify <sep> This example is sports . <sep>"`. The default template
+ works well in many cases, but it may be worthwhile to experiment with different templates depending on
+ the task setting.
+ multi_label (`bool`, *optional*, defaults to `False`):
+ Whether or not multiple candidate labels can be true. If `False`, the scores are normalized such that
+ the sum of the label likelihoods for each sequence is 1. If `True`, the labels are considered
+ independent and probabilities are normalized for each candidate by doing a softmax of the entailment
+ score vs. the contradiction score.
+
+ Return:
+ A `dict` or a list of `dict`: Each result comes as a dictionary with the following keys:
+
+ - **sequence** (`str`) -- The sequence for which this is the output.
+ - **labels** (`List[str]`) -- The labels sorted by order of likelihood.
+ - **scores** (`List[float]`) -- The probabilities for each of the labels.
+ """
+ if len(args) == 0:
+ pass
+ elif len(args) == 1 and "candidate_labels" not in kwargs:
+ kwargs["candidate_labels"] = args[0]
+ else:
+ raise ValueError(f"Unable to understand extra arguments {args}")
+
+ return super().__call__(sequences, **kwargs)
+
+ def preprocess(self, inputs, candidate_labels=None, hypothesis_template="This example is {}."):
+ sequence_pairs, sequences = self._args_parser(inputs, candidate_labels, hypothesis_template)
+
+ for i, (candidate_label, sequence_pair) in enumerate(zip(candidate_labels, sequence_pairs)):
+ model_input = self._parse_and_tokenize([sequence_pair])
+
+ yield {
+ "candidate_label": candidate_label,
+ "sequence": sequences[0],
+ "is_last": i == len(candidate_labels) - 1,
+ **model_input,
+ }
+
+ def _forward(self, inputs):
+ candidate_label = inputs["candidate_label"]
+ sequence = inputs["sequence"]
+ model_inputs = {k: inputs[k] for k in self.tokenizer.model_input_names}
+ # `XXXForSequenceClassification` models should not use `use_cache=True` even if it's supported
+ model_forward = self.model.forward if self.framework == "pt" else self.model.call
+ if "use_cache" in inspect.signature(model_forward).parameters.keys():
+ model_inputs["use_cache"] = False
+ outputs = self.model(**model_inputs)
+
+ model_outputs = {
+ "candidate_label": candidate_label,
+ "sequence": sequence,
+ "is_last": inputs["is_last"],
+ **outputs,
+ }
+ return model_outputs
+
+ def postprocess(self, model_outputs, multi_label=False):
+ candidate_labels = [outputs["candidate_label"] for outputs in model_outputs]
+ sequences = [outputs["sequence"] for outputs in model_outputs]
+ if self.framework == "pt":
+ logits = np.concatenate([output["logits"].float().numpy() for output in model_outputs])
+ else:
+ logits = np.concatenate([output["logits"].numpy() for output in model_outputs])
+ N = logits.shape[0]
+ n = len(candidate_labels)
+ num_sequences = N // n
+ reshaped_outputs = logits.reshape((num_sequences, n, -1))
+
+ if multi_label or len(candidate_labels) == 1:
+ # softmax over the entailment vs. contradiction dim for each label independently
+ entailment_id = self.entailment_id
+ contradiction_id = -1 if entailment_id == 0 else 0
+ entail_contr_logits = reshaped_outputs[..., [contradiction_id, entailment_id]]
+ scores = np.exp(entail_contr_logits) / np.exp(entail_contr_logits).sum(-1, keepdims=True)
+ scores = scores[..., 1]
+ else:
+ # softmax the "entailment" logits over all candidate labels
+ entail_logits = reshaped_outputs[..., self.entailment_id]
+ scores = np.exp(entail_logits) / np.exp(entail_logits).sum(-1, keepdims=True)
+
+ top_inds = list(reversed(scores[0].argsort()))
+ return {
+ "sequence": sequences[0],
+ "labels": [candidate_labels[i] for i in top_inds],
+ "scores": scores[0, top_inds].tolist(),
+ }
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/zero_shot_image_classification.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/zero_shot_image_classification.py
new file mode 100644
index 0000000000000000000000000000000000000000..c53b515dcccd9c1f277a3f8a8871be08661e7a1c
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/zero_shot_image_classification.py
@@ -0,0 +1,193 @@
+import warnings
+from collections import UserDict
+from typing import List, Union
+
+from ..utils import (
+ add_end_docstrings,
+ is_tf_available,
+ is_torch_available,
+ is_vision_available,
+ logging,
+ requires_backends,
+)
+from .base import Pipeline, build_pipeline_init_args
+
+
+if is_vision_available():
+ from PIL import Image
+
+ from ..image_utils import load_image
+
+if is_torch_available():
+ import torch
+
+ from ..models.auto.modeling_auto import MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING_NAMES
+
+if is_tf_available():
+ from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING_NAMES
+ from ..tf_utils import stable_softmax
+
+logger = logging.get_logger(__name__)
+
+
+@add_end_docstrings(build_pipeline_init_args(has_image_processor=True))
+class ZeroShotImageClassificationPipeline(Pipeline):
+ """
+ Zero shot image classification pipeline using `CLIPModel`. This pipeline predicts the class of an image when you
+ provide an image and a set of `candidate_labels`.
+
+ Example:
+
+ ```python
+ >>> from transformers import pipeline
+
+ >>> classifier = pipeline(model="google/siglip-so400m-patch14-384")
+ >>> classifier(
+ ... "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png",
+ ... candidate_labels=["animals", "humans", "landscape"],
+ ... )
+ [{'score': 0.965, 'label': 'animals'}, {'score': 0.03, 'label': 'humans'}, {'score': 0.005, 'label': 'landscape'}]
+
+ >>> classifier(
+ ... "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png",
+ ... candidate_labels=["black and white", "photorealist", "painting"],
+ ... )
+ [{'score': 0.996, 'label': 'black and white'}, {'score': 0.003, 'label': 'photorealist'}, {'score': 0.0, 'label': 'painting'}]
+ ```
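+
+ A custom `hypothesis_template` can also be provided (a minimal sketch reusing the classifier above;
+ scores are omitted because they depend on the checkpoint):
+
+ ```python
+ >>> classifier(
+ ...     "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png",
+ ...     candidate_labels=["animals", "humans"],
+ ...     hypothesis_template="A photo containing {}.",
+ ... )  # doctest: +SKIP
+ ```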
+
+ Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)
+
+ This image classification pipeline can currently be loaded from [`pipeline`] using the following task identifier:
+ `"zero-shot-image-classification"`.
+
+ See the list of available models on
+ [huggingface.co/models](https://huggingface.co/models?filter=zero-shot-image-classification).
+ """
+
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+
+ requires_backends(self, "vision")
+ self.check_model_type(
+ TF_MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING_NAMES
+ if self.framework == "tf"
+ else MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING_NAMES
+ )
+
+ def __call__(self, image: Union[str, List[str], "Image", List["Image"]] = None, **kwargs):
+ """
+ Assign labels to the image(s) passed as inputs.
+
+ Args:
+ image (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`):
+ The pipeline handles three types of images:
+
+ - A string containing a http link pointing to an image
+ - A string containing a local path to an image
+ - An image loaded in PIL directly
+
+ candidate_labels (`List[str]`):
+ The candidate labels for this image. They will be formatted using *hypothesis_template*.
+
+ hypothesis_template (`str`, *optional*, defaults to `"This is a photo of {}."`):
+ The format used in conjunction with *candidate_labels* to attempt the image classification by
+ replacing the placeholder with the candidate_labels. Pass "{}" if *candidate_labels* are
+ already formatted.
+
+ timeout (`float`, *optional*, defaults to None):
+ The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
+ the call may block forever.
+
+ Return:
+ A list of dictionaries containing one entry per proposed label. Each dictionary contains the
+ following keys:
+ - **label** (`str`) -- One of the suggested *candidate_labels*.
+ - **score** (`float`) -- The score attributed by the model to that label. It is a value between
+ 0 and 1, computed as the `softmax` of `logits_per_image`.
+ """
+ # Once the `images` deprecation is complete, remove the default `None` value for `image`
+ if "images" in kwargs:
+ image = kwargs.pop("images")
+ if image is None:
+ raise ValueError("Cannot call the zero-shot-image-classification pipeline without an images argument!")
+ return super().__call__(image, **kwargs)
+
+ def _sanitize_parameters(self, tokenizer_kwargs=None, **kwargs):
+ preprocess_params = {}
+ if "candidate_labels" in kwargs:
+ preprocess_params["candidate_labels"] = kwargs["candidate_labels"]
+ if "timeout" in kwargs:
+ preprocess_params["timeout"] = kwargs["timeout"]
+ if "hypothesis_template" in kwargs:
+ preprocess_params["hypothesis_template"] = kwargs["hypothesis_template"]
+ if tokenizer_kwargs is not None:
+ warnings.warn(
+ "The `tokenizer_kwargs` argument is deprecated and will be removed in version 5 of Transformers",
+ FutureWarning,
+ )
+ preprocess_params["tokenizer_kwargs"] = tokenizer_kwargs
+
+ return preprocess_params, {}, {}
+
+ def preprocess(
+ self,
+ image,
+ candidate_labels=None,
+ hypothesis_template="This is a photo of {}.",
+ timeout=None,
+ tokenizer_kwargs=None,
+ ):
+ if tokenizer_kwargs is None:
+ tokenizer_kwargs = {}
+ image = load_image(image, timeout=timeout)
+ inputs = self.image_processor(images=[image], return_tensors=self.framework)
+ if self.framework == "pt":
+ inputs = inputs.to(self.torch_dtype)
+ inputs["candidate_labels"] = candidate_labels
+ sequences = [hypothesis_template.format(x) for x in candidate_labels]
+ padding = "max_length" if self.model.config.model_type == "siglip" else True
+ text_inputs = self.tokenizer(sequences, return_tensors=self.framework, padding=padding, **tokenizer_kwargs)
+ inputs["text_inputs"] = [text_inputs]
+ return inputs
+
+ def _forward(self, model_inputs):
+ candidate_labels = model_inputs.pop("candidate_labels")
+ text_inputs = model_inputs.pop("text_inputs")
+ if isinstance(text_inputs[0], UserDict):
+ text_inputs = text_inputs[0]
+ else:
+ # Batching case.
+ text_inputs = text_inputs[0][0]
+
+ outputs = self.model(**text_inputs, **model_inputs)
+
+ model_outputs = {
+ "candidate_labels": candidate_labels,
+ "logits": outputs.logits_per_image,
+ }
+ return model_outputs
+
+ def postprocess(self, model_outputs):
+ candidate_labels = model_outputs.pop("candidate_labels")
+ logits = model_outputs["logits"][0]
+ if self.framework == "pt" and self.model.config.model_type == "siglip":
+ probs = torch.sigmoid(logits).squeeze(-1)
+ scores = probs.tolist()
+ if not isinstance(scores, list):
+ scores = [scores]
+ elif self.framework == "pt":
+ probs = logits.softmax(dim=-1).squeeze(-1)
+ scores = probs.tolist()
+ if not isinstance(scores, list):
+ scores = [scores]
+ elif self.framework == "tf":
+ probs = stable_softmax(logits, axis=-1)
+ scores = probs.numpy().tolist()
+ else:
+ raise ValueError(f"Unsupported framework: {self.framework}")
+
+ result = [
+ {"score": score, "label": candidate_label}
+ for score, candidate_label in sorted(zip(scores, candidate_labels), key=lambda x: -x[0])
+ ]
+ return result
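The `preprocess`/`_forward`/`postprocess` steps above are roughly equivalent to scoring the image-text pairs with CLIP directly. Below is a minimal sketch of that manual path, assuming the publicly available `openai/clip-vit-base-patch32` checkpoint (any CLIP-style checkpoint should behave similarly); it reuses the parrots image from the docstring example.

```python
import requests
import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

url = "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png"
image = Image.open(requests.get(url, stream=True).raw)
candidate_labels = ["animals", "humans", "landscape"]
# Apply the default hypothesis_template by hand.
texts = [f"This is a photo of {label}." for label in candidate_labels]

inputs = processor(text=texts, images=image, return_tensors="pt", padding=True)
with torch.no_grad():
    logits_per_image = model(**inputs).logits_per_image  # shape: (1, num_labels)

# CLIP-style checkpoints use a softmax over the labels; SigLIP checkpoints use a sigmoid instead.
scores = logits_per_image.softmax(dim=-1)[0].tolist()
for score, label in sorted(zip(scores, candidate_labels), reverse=True):
    print(f"{label}: {score:.3f}")
```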
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/zero_shot_object_detection.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/zero_shot_object_detection.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce8da7340bcce527f6ef8c013f1f609c341f9857
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/zero_shot_object_detection.py
@@ -0,0 +1,235 @@
+from typing import Any, Dict, List, Union
+
+from ..utils import add_end_docstrings, is_torch_available, is_vision_available, logging, requires_backends
+from .base import ChunkPipeline, build_pipeline_init_args
+
+
+if is_vision_available():
+ from PIL import Image
+
+ from ..image_utils import load_image, valid_images
+
+if is_torch_available():
+ import torch
+
+ from transformers.modeling_outputs import BaseModelOutput
+
+ from ..models.auto.modeling_auto import MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES
+
+logger = logging.get_logger(__name__)
+
+
+@add_end_docstrings(build_pipeline_init_args(has_image_processor=True))
+class ZeroShotObjectDetectionPipeline(ChunkPipeline):
+ """
+ Zero-shot object detection pipeline using `OwlViTForObjectDetection`. This pipeline predicts bounding boxes of
+ objects when you provide an image and a set of `candidate_labels`.
+
+ Example:
+
+ ```python
+ >>> from transformers import pipeline
+
+ >>> detector = pipeline(model="google/owlvit-base-patch32", task="zero-shot-object-detection")
+ >>> detector(
+ ... "http://images.cocodataset.org/val2017/000000039769.jpg",
+ ... candidate_labels=["cat", "couch"],
+ ... )
+ [{'score': 0.287, 'label': 'cat', 'box': {'xmin': 324, 'ymin': 20, 'xmax': 640, 'ymax': 373}}, {'score': 0.254, 'label': 'cat', 'box': {'xmin': 1, 'ymin': 55, 'xmax': 315, 'ymax': 472}}, {'score': 0.121, 'label': 'couch', 'box': {'xmin': 4, 'ymin': 0, 'xmax': 642, 'ymax': 476}}]
+
+ >>> detector(
+ ... "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png",
+ ... candidate_labels=["head", "bird"],
+ ... )
+ [{'score': 0.119, 'label': 'bird', 'box': {'xmin': 71, 'ymin': 170, 'xmax': 410, 'ymax': 508}}]
+ ```
+
+ Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)
+
+ This object detection pipeline can currently be loaded from [`pipeline`] using the following task identifier:
+ `"zero-shot-object-detection"`.
+
+ See the list of available models on
+ [huggingface.co/models](https://huggingface.co/models?filter=zero-shot-object-detection).
+ """
+
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+
+ if self.framework == "tf":
+ raise ValueError(f"The {self.__class__} is only available in PyTorch.")
+
+ requires_backends(self, "vision")
+ self.check_model_type(MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES)
+
+ def __call__(
+ self,
+ image: Union[str, "Image.Image", List[Dict[str, Any]]],
+ candidate_labels: Union[str, List[str]] = None,
+ **kwargs,
+ ):
+ """
+ Detect objects (bounding boxes & classes) in the image(s) passed as inputs.
+
+ Args:
+ image (`str`, `PIL.Image` or `List[Dict[str, Any]]`):
+ The pipeline handles three types of images:
+
+ - A string containing an HTTP URL pointing to an image
+ - A string containing a local path to an image
+ - An image loaded in PIL directly
+
+ You can use this parameter to send a list of images, a dataset, or a generator directly, like so:
+
+ ```python
+ >>> from transformers import pipeline
+
+ >>> detector = pipeline(model="google/owlvit-base-patch32", task="zero-shot-object-detection")
+ >>> detector(
+ ... [
+ ... {
+ ... "image": "http://images.cocodataset.org/val2017/000000039769.jpg",
+ ... "candidate_labels": ["cat", "couch"],
+ ... },
+ ... {
+ ... "image": "http://images.cocodataset.org/val2017/000000039769.jpg",
+ ... "candidate_labels": ["cat", "couch"],
+ ... },
+ ... ]
+ ... )
+ [[{'score': 0.287, 'label': 'cat', 'box': {'xmin': 324, 'ymin': 20, 'xmax': 640, 'ymax': 373}}, {'score': 0.25, 'label': 'cat', 'box': {'xmin': 1, 'ymin': 55, 'xmax': 315, 'ymax': 472}}, {'score': 0.121, 'label': 'couch', 'box': {'xmin': 4, 'ymin': 0, 'xmax': 642, 'ymax': 476}}], [{'score': 0.287, 'label': 'cat', 'box': {'xmin': 324, 'ymin': 20, 'xmax': 640, 'ymax': 373}}, {'score': 0.254, 'label': 'cat', 'box': {'xmin': 1, 'ymin': 55, 'xmax': 315, 'ymax': 472}}, {'score': 0.121, 'label': 'couch', 'box': {'xmin': 4, 'ymin': 0, 'xmax': 642, 'ymax': 476}}]]
+ ```
+
+
+ candidate_labels (`str` or `List[str]` or `List[List[str]]`):
+ What the model should recognize in the image.
+
+ threshold (`float`, *optional*, defaults to 0.1):
+ The minimum score required for a predicted box to be returned.
+
+ top_k (`int`, *optional*, defaults to None):
+ The number of top predictions that will be returned by the pipeline. If the provided number is `None`
+ or higher than the number of predictions available, it will default to the number of predictions.
+
+ timeout (`float`, *optional*, defaults to None):
+ The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
+ the call may block forever.
+
+
+ Return:
+ A list of dictionaries containing prediction results for the image (or a list of such lists, one per
+ input image, when several images are passed). Each dictionary contains the following keys:
+
+ - **label** (`str`) -- Text query corresponding to the found object.
+ - **score** (`float`) -- Score corresponding to the object (between 0 and 1).
+ - **box** (`Dict[str, int]`) -- Bounding box of the detected object in the image's original size. It is a
+ dictionary with `xmin`, `ymin`, `xmax`, `ymax` keys.
+ """
+ if "text_queries" in kwargs:
+ candidate_labels = kwargs.pop("text_queries")
+
+ if isinstance(image, (str, Image.Image)):
+ inputs = {"image": image, "candidate_labels": candidate_labels}
+ elif isinstance(image, (list, tuple)) and valid_images(image):
+ return list(
+ super().__call__(
+ ({"image": img, "candidate_labels": labels} for img, labels in zip(image, candidate_labels)),
+ **kwargs,
+ )
+ )
+ else:
+ """
+ Supports the following formats
+ - {"image": image, "candidate_labels": candidate_labels}
+ - [{"image": image, "candidate_labels": candidate_labels}]
+ - Generator and datasets
+ This is a common pattern in other multimodal pipelines, so we support it here as well.
+ """
+ inputs = image
+
+ results = super().__call__(inputs, **kwargs)
+ return results
+
+ def _sanitize_parameters(self, **kwargs):
+ preprocess_params = {}
+ if "timeout" in kwargs:
+ preprocess_params["timeout"] = kwargs["timeout"]
+ postprocess_params = {}
+ if "threshold" in kwargs:
+ postprocess_params["threshold"] = kwargs["threshold"]
+ if "top_k" in kwargs:
+ postprocess_params["top_k"] = kwargs["top_k"]
+ return preprocess_params, {}, postprocess_params
+
+ def preprocess(self, inputs, timeout=None):
+ image = load_image(inputs["image"], timeout=timeout)
+ candidate_labels = inputs["candidate_labels"]
+ if isinstance(candidate_labels, str):
+ candidate_labels = candidate_labels.split(",")
+
+ target_size = torch.tensor([[image.height, image.width]], dtype=torch.int32)
+ for i, candidate_label in enumerate(candidate_labels):
+ text_inputs = self.tokenizer(candidate_label, return_tensors=self.framework)
+ image_features = self.image_processor(image, return_tensors=self.framework)
+ if self.framework == "pt":
+ image_features = image_features.to(self.torch_dtype)
+ yield {
+ "is_last": i == len(candidate_labels) - 1,
+ "target_size": target_size,
+ "candidate_label": candidate_label,
+ **text_inputs,
+ **image_features,
+ }
+
+ def _forward(self, model_inputs):
+ target_size = model_inputs.pop("target_size")
+ candidate_label = model_inputs.pop("candidate_label")
+ is_last = model_inputs.pop("is_last")
+
+ outputs = self.model(**model_inputs)
+
+ model_outputs = {"target_size": target_size, "candidate_label": candidate_label, "is_last": is_last, **outputs}
+ return model_outputs
+
+ def postprocess(self, model_outputs, threshold=0.1, top_k=None):
+ results = []
+ for model_output in model_outputs:
+ label = model_output["candidate_label"]
+ model_output = BaseModelOutput(model_output)
+ outputs = self.image_processor.post_process_object_detection(
+ outputs=model_output, threshold=threshold, target_sizes=model_output["target_size"]
+ )[0]
+
+ for index in outputs["scores"].nonzero():
+ score = outputs["scores"][index].item()
+ box = self._get_bounding_box(outputs["boxes"][index][0])
+
+ result = {"score": score, "label": label, "box": box}
+ results.append(result)
+
+ results = sorted(results, key=lambda x: x["score"], reverse=True)
+ if top_k:
+ results = results[:top_k]
+
+ return results
+
+ def _get_bounding_box(self, box: "torch.Tensor") -> Dict[str, int]:
+ """
+ Turns list [xmin, ymin, xmax, ymax] into dict { "xmin": xmin, ... }
+
+ Args:
+ box (`torch.Tensor`): Tensor containing the coordinates in corners format.
+
+ Returns:
+ bbox (`Dict[str, int]`): Dict containing the coordinates in corners format.
+ """
+ if self.framework != "pt":
+ raise ValueError("The ZeroShotObjectDetectionPipeline is only available in PyTorch.")
+ xmin, ymin, xmax, ymax = box.int().tolist()
+ bbox = {
+ "xmin": xmin,
+ "ymin": ymin,
+ "xmax": xmax,
+ "ymax": ymax,
+ }
+ return bbox
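For reference, the `threshold` and `top_k` knobs documented above can be combined in a single call. A minimal usage sketch reusing the COCO image from the docstring; the label set and the concrete threshold/top_k values are illustrative, not prescribed.

```python
from transformers import pipeline

detector = pipeline(model="google/owlvit-base-patch32", task="zero-shot-object-detection")
predictions = detector(
    "http://images.cocodataset.org/val2017/000000039769.jpg",
    candidate_labels=["cat", "remote control", "couch"],
    threshold=0.2,  # drop boxes scored below 0.2
    top_k=5,        # keep at most the five highest-scoring boxes
)
for prediction in predictions:
    box = prediction["box"]
    print(
        f'{prediction["label"]} ({prediction["score"]:.2f}): '
        f'[{box["xmin"]}, {box["ymin"]}, {box["xmax"]}, {box["ymax"]}]'
    )
```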
diff --git a/.venv/lib/python3.11/site-packages/transformers/sagemaker/__init__.py b/.venv/lib/python3.11/site-packages/transformers/sagemaker/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..98fe38de89cd025911d03669f9e22b03ab0768bd
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/sagemaker/__init__.py
@@ -0,0 +1,16 @@
+# Copyright 2021 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .trainer_sm import SageMakerTrainer
+from .training_args_sm import SageMakerTrainingArguments, is_sagemaker_dp_enabled
diff --git a/.venv/lib/python3.11/site-packages/transformers/sagemaker/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/sagemaker/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..099e01a2157fc76f0966eba131749abed573c936
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/sagemaker/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/sagemaker/__pycache__/trainer_sm.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/sagemaker/__pycache__/trainer_sm.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a88bfd9fa5aaca8b9e4ab2a9039c24821a8f6931
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/sagemaker/__pycache__/trainer_sm.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/sagemaker/__pycache__/training_args_sm.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/sagemaker/__pycache__/training_args_sm.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..364e39f0f6299f4340252c5fea617553c18a8087
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/sagemaker/__pycache__/training_args_sm.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/sagemaker/trainer_sm.py b/.venv/lib/python3.11/site-packages/transformers/sagemaker/trainer_sm.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ab4e01acdbcd3ade1afc2339a75850bc538bd7a
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/sagemaker/trainer_sm.py
@@ -0,0 +1,30 @@
+# Copyright 2021 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import warnings
+
+from ..trainer import Trainer
+from ..utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class SageMakerTrainer(Trainer):
+ def __init__(self, args=None, **kwargs):
+ warnings.warn(
+ "`SageMakerTrainer` is deprecated and will be removed in v5 of Transformers. You can use `Trainer` "
+ "instead.",
+ FutureWarning,
+ )
+ super().__init__(args=args, **kwargs)
diff --git a/.venv/lib/python3.11/site-packages/transformers/sagemaker/training_args_sm.py b/.venv/lib/python3.11/site-packages/transformers/sagemaker/training_args_sm.py
new file mode 100644
index 0000000000000000000000000000000000000000..3daac7859b550de31f211a5e7c9938d8d557fc4c
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/sagemaker/training_args_sm.py
@@ -0,0 +1,136 @@
+# Copyright 2021 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import importlib.util
+import json
+import os
+import warnings
+from dataclasses import dataclass, field
+
+import torch
+
+from ..training_args import TrainingArguments
+from ..utils import cached_property, is_sagemaker_dp_enabled, logging
+
+
+logger = logging.get_logger(__name__)
+
+# TODO: should be moved to `utils` after refactoring of SageMakerTrainer
+
+
+def is_sagemaker_model_parallel_available():
+ # Get the sagemaker specific mp parameters from smp_options variable.
+ smp_options = os.getenv("SM_HP_MP_PARAMETERS", "{}")
+ try:
+ # Parse it and check the field "partitions" is included, it is required for model parallel.
+ smp_options = json.loads(smp_options)
+ if "partitions" not in smp_options:
+ return False
+ except json.JSONDecodeError:
+ return False
+
+ # Get the sagemaker specific framework parameters from mpi_options variable.
+ mpi_options = os.getenv("SM_FRAMEWORK_PARAMS", "{}")
+ try:
+ # Parse it and check the field "sagemaker_mpi_enabled".
+ mpi_options = json.loads(mpi_options)
+ if not mpi_options.get("sagemaker_mpi_enabled", False):
+ return False
+ except json.JSONDecodeError:
+ return False
+ # Lastly, check if the `smdistributed` module is present.
+ return importlib.util.find_spec("smdistributed") is not None
+
+
+if is_sagemaker_model_parallel_available():
+ import smdistributed.modelparallel.torch as smp
+
+ smp.init()
+
+
+@dataclass
+class SageMakerTrainingArguments(TrainingArguments):
+ mp_parameters: str = field(
+ default="",
+ metadata={"help": "Used by the SageMaker launcher to send mp-specific args. Ignored in SageMakerTrainer"},
+ )
+
+ def __post_init__(self):
+ super().__post_init__()
+ warnings.warn(
+ "`SageMakerTrainingArguments` is deprecated and will be removed in v5 of Transformers. You can use "
+ "`TrainingArguments` instead.",
+ FutureWarning,
+ )
+
+ @cached_property
+ def _setup_devices(self) -> "torch.device":
+ logger.info("PyTorch: setting up devices")
+ if torch.distributed.is_available() and torch.distributed.is_initialized() and self.local_rank == -1:
+ logger.warning(
+ "torch.distributed process group is initialized, but local_rank == -1. "
+ "In order to use Torch DDP, launch your script with `python -m torch.distributed.launch"
+ )
+ if self.no_cuda:
+ device = torch.device("cpu")
+ self._n_gpu = 0
+ elif is_sagemaker_model_parallel_available():
+ local_rank = smp.local_rank()
+ device = torch.device("cuda", local_rank)
+ self._n_gpu = 1
+ elif is_sagemaker_dp_enabled():
+ import smdistributed.dataparallel.torch.torch_smddp # noqa: F401
+
+ torch.distributed.init_process_group(backend="smddp", timeout=self.ddp_timeout_delta)
+ self.local_rank = int(os.getenv("SMDATAPARALLEL_LOCAL_RANK"))
+ device = torch.device("cuda", self.local_rank)
+ self._n_gpu = 1
+ elif self.local_rank == -1:
+ # if n_gpu is > 1 we'll use nn.DataParallel.
+ # If you only want to use a specific subset of GPUs use `CUDA_VISIBLE_DEVICES=0`
+ # Explicitly set CUDA to the first (index 0) CUDA device, otherwise `set_device` will
+ # trigger an error that a device index is missing. Index 0 takes into account the
+ # GPUs available in the environment, so `CUDA_VISIBLE_DEVICES=1,2` with `cuda:0`
+ # will use the first GPU in that env, i.e. GPU#1
+ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+ # Sometimes the line in the postinit has not been run before we end up here, so just checking we're not at
+ # the default value.
+ self._n_gpu = torch.cuda.device_count()
+ else:
+ # Here, we'll use torch.distributed.
+ # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
+ if not torch.distributed.is_initialized():
+ torch.distributed.init_process_group(backend="nccl", timeout=self.ddp_timeout_delta)
+ device = torch.device("cuda", self.local_rank)
+ self._n_gpu = 1
+
+ if device.type == "cuda":
+ torch.cuda.set_device(device)
+
+ return device
+
+ @property
+ def world_size(self):
+ if is_sagemaker_model_parallel_available():
+ return smp.dp_size()
+
+ return super().world_size
+
+ @property
+ def place_model_on_device(self):
+ return not is_sagemaker_model_parallel_available()
+
+ @property
+ def _no_sync_in_gradient_accumulation(self):
+ return False
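The two environment variables parsed by `is_sagemaker_model_parallel_available` carry small JSON blobs. Below is a minimal sketch of the shape the helper looks for; the concrete values ("partitions": 2, etc.) are illustrative only, and the real helper additionally requires the `smdistributed` package to be importable.

```python
import json
import os

# Illustrative environment resembling what the SageMaker launcher would set.
os.environ["SM_HP_MP_PARAMETERS"] = json.dumps({"partitions": 2, "microbatches": 4})
os.environ["SM_FRAMEWORK_PARAMS"] = json.dumps({"sagemaker_mpi_enabled": True})

# Re-implementing the two JSON checks from above for illustration:
mp_ok = "partitions" in json.loads(os.environ["SM_HP_MP_PARAMETERS"])
mpi_ok = json.loads(os.environ["SM_FRAMEWORK_PARAMS"]).get("sagemaker_mpi_enabled", False)
print(mp_ok and mpi_ok)  # True here; the real helper still returns False without `smdistributed`
```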