diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/image_processing_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/image_processing_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a5725153d17701ba387c6681c3bdd6291bae9d9f
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/image_processing_utils.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/image_transforms.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/image_transforms.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3f8b31a738e3eae16ffb94bc75d499450fcebfb0
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/image_transforms.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/modelcard.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/modelcard.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c589a78197c19eb463bdec2f981bb82c58e6e431
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/modelcard.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/training_args_tf.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/training_args_tf.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f88c77252185ad0e4892fb712861eaad491e1c09
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/training_args_tf.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dadeab816cc49a8b6a3dcbfba77183f935428be1
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/beam_constraints.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/beam_constraints.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cac9a2ef5255abae0588c8e41bd69d4eef83e86b
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/beam_constraints.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/beam_search.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/beam_search.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9890b9a6e459e22a1bc8611dcb8031cf9f60cfa2
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/beam_search.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__init__.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..257f5689b0ed71afd8560aeb183f4e47beb03d47
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/__init__.py
@@ -0,0 +1,1178 @@
+# coding=utf-8
+# Copyright 2018 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import json
+import os
+import warnings
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
+
+from huggingface_hub import model_info
+
+from ..configuration_utils import PretrainedConfig
+from ..dynamic_module_utils import get_class_from_dynamic_module
+from ..feature_extraction_utils import PreTrainedFeatureExtractor
+from ..image_processing_utils import BaseImageProcessor
+from ..models.auto.configuration_auto import AutoConfig
+from ..models.auto.feature_extraction_auto import FEATURE_EXTRACTOR_MAPPING, AutoFeatureExtractor
+from ..models.auto.image_processing_auto import IMAGE_PROCESSOR_MAPPING, AutoImageProcessor
+from ..models.auto.modeling_auto import AutoModelForDepthEstimation, AutoModelForImageToImage
+from ..models.auto.processing_auto import PROCESSOR_MAPPING, AutoProcessor
+from ..models.auto.tokenization_auto import TOKENIZER_MAPPING, AutoTokenizer
+from ..processing_utils import ProcessorMixin
+from ..tokenization_utils import PreTrainedTokenizer
+from ..utils import (
+ CONFIG_NAME,
+ HUGGINGFACE_CO_RESOLVE_ENDPOINT,
+ cached_file,
+ extract_commit_hash,
+ find_adapter_config_file,
+ is_kenlm_available,
+ is_offline_mode,
+ is_peft_available,
+ is_pyctcdecode_available,
+ is_tf_available,
+ is_torch_available,
+ logging,
+)
+from .audio_classification import AudioClassificationPipeline
+from .automatic_speech_recognition import AutomaticSpeechRecognitionPipeline
+from .base import (
+ ArgumentHandler,
+ CsvPipelineDataFormat,
+ JsonPipelineDataFormat,
+ PipedPipelineDataFormat,
+ Pipeline,
+ PipelineDataFormat,
+ PipelineException,
+ PipelineRegistry,
+ get_default_model_and_revision,
+ infer_framework_load_model,
+)
+from .depth_estimation import DepthEstimationPipeline
+from .document_question_answering import DocumentQuestionAnsweringPipeline
+from .feature_extraction import FeatureExtractionPipeline
+from .fill_mask import FillMaskPipeline
+from .image_classification import ImageClassificationPipeline
+from .image_feature_extraction import ImageFeatureExtractionPipeline
+from .image_segmentation import ImageSegmentationPipeline
+from .image_text_to_text import ImageTextToTextPipeline
+from .image_to_image import ImageToImagePipeline
+from .image_to_text import ImageToTextPipeline
+from .mask_generation import MaskGenerationPipeline
+from .object_detection import ObjectDetectionPipeline
+from .question_answering import QuestionAnsweringArgumentHandler, QuestionAnsweringPipeline
+from .table_question_answering import TableQuestionAnsweringArgumentHandler, TableQuestionAnsweringPipeline
+from .text2text_generation import SummarizationPipeline, Text2TextGenerationPipeline, TranslationPipeline
+from .text_classification import TextClassificationPipeline
+from .text_generation import TextGenerationPipeline
+from .text_to_audio import TextToAudioPipeline
+from .token_classification import (
+ AggregationStrategy,
+ NerPipeline,
+ TokenClassificationArgumentHandler,
+ TokenClassificationPipeline,
+)
+from .video_classification import VideoClassificationPipeline
+from .visual_question_answering import VisualQuestionAnsweringPipeline
+from .zero_shot_audio_classification import ZeroShotAudioClassificationPipeline
+from .zero_shot_classification import ZeroShotClassificationArgumentHandler, ZeroShotClassificationPipeline
+from .zero_shot_image_classification import ZeroShotImageClassificationPipeline
+from .zero_shot_object_detection import ZeroShotObjectDetectionPipeline
+
+
+if is_tf_available():
+ import tensorflow as tf
+
+ from ..models.auto.modeling_tf_auto import (
+ TFAutoModel,
+ TFAutoModelForCausalLM,
+ TFAutoModelForImageClassification,
+ TFAutoModelForMaskedLM,
+ TFAutoModelForQuestionAnswering,
+ TFAutoModelForSeq2SeqLM,
+ TFAutoModelForSequenceClassification,
+ TFAutoModelForTableQuestionAnswering,
+ TFAutoModelForTokenClassification,
+ TFAutoModelForVision2Seq,
+ TFAutoModelForZeroShotImageClassification,
+ )
+
+if is_torch_available():
+ import torch
+
+ from ..models.auto.modeling_auto import (
+ AutoModel,
+ AutoModelForAudioClassification,
+ AutoModelForCausalLM,
+ AutoModelForCTC,
+ AutoModelForDocumentQuestionAnswering,
+ AutoModelForImageClassification,
+ AutoModelForImageSegmentation,
+ AutoModelForImageTextToText,
+ AutoModelForMaskedLM,
+ AutoModelForMaskGeneration,
+ AutoModelForObjectDetection,
+ AutoModelForQuestionAnswering,
+ AutoModelForSemanticSegmentation,
+ AutoModelForSeq2SeqLM,
+ AutoModelForSequenceClassification,
+ AutoModelForSpeechSeq2Seq,
+ AutoModelForTableQuestionAnswering,
+ AutoModelForTextToSpectrogram,
+ AutoModelForTextToWaveform,
+ AutoModelForTokenClassification,
+ AutoModelForVideoClassification,
+ AutoModelForVision2Seq,
+ AutoModelForVisualQuestionAnswering,
+ AutoModelForZeroShotImageClassification,
+ AutoModelForZeroShotObjectDetection,
+ )
+
+
+if TYPE_CHECKING:
+ from ..modeling_tf_utils import TFPreTrainedModel
+ from ..modeling_utils import PreTrainedModel
+ from ..tokenization_utils_fast import PreTrainedTokenizerFast
+
+
+logger = logging.get_logger(__name__)
+
+
+# Register all the supported tasks here
+TASK_ALIASES = {
+ "sentiment-analysis": "text-classification",
+ "ner": "token-classification",
+ "vqa": "visual-question-answering",
+ "text-to-speech": "text-to-audio",
+}
+SUPPORTED_TASKS = {
+ "audio-classification": {
+ "impl": AudioClassificationPipeline,
+ "tf": (),
+ "pt": (AutoModelForAudioClassification,) if is_torch_available() else (),
+ "default": {"model": {"pt": ("superb/wav2vec2-base-superb-ks", "372e048")}},
+ "type": "audio",
+ },
+ "automatic-speech-recognition": {
+ "impl": AutomaticSpeechRecognitionPipeline,
+ "tf": (),
+ "pt": (AutoModelForCTC, AutoModelForSpeechSeq2Seq) if is_torch_available() else (),
+ "default": {"model": {"pt": ("facebook/wav2vec2-base-960h", "22aad52")}},
+ "type": "multimodal",
+ },
+ "text-to-audio": {
+ "impl": TextToAudioPipeline,
+ "tf": (),
+ "pt": (AutoModelForTextToWaveform, AutoModelForTextToSpectrogram) if is_torch_available() else (),
+ "default": {"model": {"pt": ("suno/bark-small", "1dbd7a1")}},
+ "type": "text",
+ },
+ "feature-extraction": {
+ "impl": FeatureExtractionPipeline,
+ "tf": (TFAutoModel,) if is_tf_available() else (),
+ "pt": (AutoModel,) if is_torch_available() else (),
+ "default": {
+ "model": {
+ "pt": ("distilbert/distilbert-base-cased", "6ea8117"),
+ "tf": ("distilbert/distilbert-base-cased", "6ea8117"),
+ }
+ },
+ "type": "multimodal",
+ },
+ "text-classification": {
+ "impl": TextClassificationPipeline,
+ "tf": (TFAutoModelForSequenceClassification,) if is_tf_available() else (),
+ "pt": (AutoModelForSequenceClassification,) if is_torch_available() else (),
+ "default": {
+ "model": {
+ "pt": ("distilbert/distilbert-base-uncased-finetuned-sst-2-english", "714eb0f"),
+ "tf": ("distilbert/distilbert-base-uncased-finetuned-sst-2-english", "714eb0f"),
+ },
+ },
+ "type": "text",
+ },
+ "token-classification": {
+ "impl": TokenClassificationPipeline,
+ "tf": (TFAutoModelForTokenClassification,) if is_tf_available() else (),
+ "pt": (AutoModelForTokenClassification,) if is_torch_available() else (),
+ "default": {
+ "model": {
+ "pt": ("dbmdz/bert-large-cased-finetuned-conll03-english", "4c53496"),
+ "tf": ("dbmdz/bert-large-cased-finetuned-conll03-english", "4c53496"),
+ },
+ },
+ "type": "text",
+ },
+ "question-answering": {
+ "impl": QuestionAnsweringPipeline,
+ "tf": (TFAutoModelForQuestionAnswering,) if is_tf_available() else (),
+ "pt": (AutoModelForQuestionAnswering,) if is_torch_available() else (),
+ "default": {
+ "model": {
+ "pt": ("distilbert/distilbert-base-cased-distilled-squad", "564e9b5"),
+ "tf": ("distilbert/distilbert-base-cased-distilled-squad", "564e9b5"),
+ },
+ },
+ "type": "text",
+ },
+ "table-question-answering": {
+ "impl": TableQuestionAnsweringPipeline,
+ "pt": (AutoModelForTableQuestionAnswering,) if is_torch_available() else (),
+ "tf": (TFAutoModelForTableQuestionAnswering,) if is_tf_available() else (),
+ "default": {
+ "model": {
+ "pt": ("google/tapas-base-finetuned-wtq", "e3dde19"),
+ "tf": ("google/tapas-base-finetuned-wtq", "e3dde19"),
+ },
+ },
+ "type": "text",
+ },
+ "visual-question-answering": {
+ "impl": VisualQuestionAnsweringPipeline,
+ "pt": (AutoModelForVisualQuestionAnswering,) if is_torch_available() else (),
+ "tf": (),
+ "default": {
+ "model": {"pt": ("dandelin/vilt-b32-finetuned-vqa", "d0a1f6a")},
+ },
+ "type": "multimodal",
+ },
+ "document-question-answering": {
+ "impl": DocumentQuestionAnsweringPipeline,
+ "pt": (AutoModelForDocumentQuestionAnswering,) if is_torch_available() else (),
+ "tf": (),
+ "default": {
+ "model": {"pt": ("impira/layoutlm-document-qa", "beed3c4")},
+ },
+ "type": "multimodal",
+ },
+ "fill-mask": {
+ "impl": FillMaskPipeline,
+ "tf": (TFAutoModelForMaskedLM,) if is_tf_available() else (),
+ "pt": (AutoModelForMaskedLM,) if is_torch_available() else (),
+ "default": {
+ "model": {
+ "pt": ("distilbert/distilroberta-base", "fb53ab8"),
+ "tf": ("distilbert/distilroberta-base", "fb53ab8"),
+ }
+ },
+ "type": "text",
+ },
+ "summarization": {
+ "impl": SummarizationPipeline,
+ "tf": (TFAutoModelForSeq2SeqLM,) if is_tf_available() else (),
+ "pt": (AutoModelForSeq2SeqLM,) if is_torch_available() else (),
+ "default": {
+ "model": {"pt": ("sshleifer/distilbart-cnn-12-6", "a4f8f3e"), "tf": ("google-t5/t5-small", "df1b051")}
+ },
+ "type": "text",
+ },
+ # This task is a special case as it's parametrized by SRC, TGT languages.
+ "translation": {
+ "impl": TranslationPipeline,
+ "tf": (TFAutoModelForSeq2SeqLM,) if is_tf_available() else (),
+ "pt": (AutoModelForSeq2SeqLM,) if is_torch_available() else (),
+ "default": {
+ ("en", "fr"): {"model": {"pt": ("google-t5/t5-base", "a9723ea"), "tf": ("google-t5/t5-base", "a9723ea")}},
+ ("en", "de"): {"model": {"pt": ("google-t5/t5-base", "a9723ea"), "tf": ("google-t5/t5-base", "a9723ea")}},
+ ("en", "ro"): {"model": {"pt": ("google-t5/t5-base", "a9723ea"), "tf": ("google-t5/t5-base", "a9723ea")}},
+ },
+ "type": "text",
+ },
+ "text2text-generation": {
+ "impl": Text2TextGenerationPipeline,
+ "tf": (TFAutoModelForSeq2SeqLM,) if is_tf_available() else (),
+ "pt": (AutoModelForSeq2SeqLM,) if is_torch_available() else (),
+ "default": {"model": {"pt": ("google-t5/t5-base", "a9723ea"), "tf": ("google-t5/t5-base", "a9723ea")}},
+ "type": "text",
+ },
+ "text-generation": {
+ "impl": TextGenerationPipeline,
+ "tf": (TFAutoModelForCausalLM,) if is_tf_available() else (),
+ "pt": (AutoModelForCausalLM,) if is_torch_available() else (),
+ "default": {"model": {"pt": ("openai-community/gpt2", "607a30d"), "tf": ("openai-community/gpt2", "607a30d")}},
+ "type": "text",
+ },
+ "zero-shot-classification": {
+ "impl": ZeroShotClassificationPipeline,
+ "tf": (TFAutoModelForSequenceClassification,) if is_tf_available() else (),
+ "pt": (AutoModelForSequenceClassification,) if is_torch_available() else (),
+ "default": {
+ "model": {
+ "pt": ("facebook/bart-large-mnli", "d7645e1"),
+ "tf": ("FacebookAI/roberta-large-mnli", "2a8f12d"),
+ },
+ "config": {
+ "pt": ("facebook/bart-large-mnli", "d7645e1"),
+ "tf": ("FacebookAI/roberta-large-mnli", "2a8f12d"),
+ },
+ },
+ "type": "text",
+ },
+ "zero-shot-image-classification": {
+ "impl": ZeroShotImageClassificationPipeline,
+ "tf": (TFAutoModelForZeroShotImageClassification,) if is_tf_available() else (),
+ "pt": (AutoModelForZeroShotImageClassification,) if is_torch_available() else (),
+ "default": {
+ "model": {
+ "pt": ("openai/clip-vit-base-patch32", "3d74acf"),
+ "tf": ("openai/clip-vit-base-patch32", "3d74acf"),
+ }
+ },
+ "type": "multimodal",
+ },
+ "zero-shot-audio-classification": {
+ "impl": ZeroShotAudioClassificationPipeline,
+ "tf": (),
+ "pt": (AutoModel,) if is_torch_available() else (),
+ "default": {
+ "model": {
+ "pt": ("laion/clap-htsat-fused", "cca9e28"),
+ }
+ },
+ "type": "multimodal",
+ },
+ "image-classification": {
+ "impl": ImageClassificationPipeline,
+ "tf": (TFAutoModelForImageClassification,) if is_tf_available() else (),
+ "pt": (AutoModelForImageClassification,) if is_torch_available() else (),
+ "default": {
+ "model": {
+ "pt": ("google/vit-base-patch16-224", "3f49326"),
+ "tf": ("google/vit-base-patch16-224", "3f49326"),
+ }
+ },
+ "type": "image",
+ },
+ "image-feature-extraction": {
+ "impl": ImageFeatureExtractionPipeline,
+ "tf": (TFAutoModel,) if is_tf_available() else (),
+ "pt": (AutoModel,) if is_torch_available() else (),
+ "default": {
+ "model": {
+ "pt": ("google/vit-base-patch16-224", "3f49326"),
+ "tf": ("google/vit-base-patch16-224", "3f49326"),
+ }
+ },
+ "type": "image",
+ },
+ "image-segmentation": {
+ "impl": ImageSegmentationPipeline,
+ "tf": (),
+ "pt": (AutoModelForImageSegmentation, AutoModelForSemanticSegmentation) if is_torch_available() else (),
+ "default": {"model": {"pt": ("facebook/detr-resnet-50-panoptic", "d53b52a")}},
+ "type": "multimodal",
+ },
+ "image-to-text": {
+ "impl": ImageToTextPipeline,
+ "tf": (TFAutoModelForVision2Seq,) if is_tf_available() else (),
+ "pt": (AutoModelForVision2Seq,) if is_torch_available() else (),
+ "default": {
+ "model": {
+ "pt": ("ydshieh/vit-gpt2-coco-en", "5bebf1e"),
+ "tf": ("ydshieh/vit-gpt2-coco-en", "5bebf1e"),
+ }
+ },
+ "type": "multimodal",
+ },
+ "image-text-to-text": {
+ "impl": ImageTextToTextPipeline,
+ "tf": (),
+ "pt": (AutoModelForImageTextToText,) if is_torch_available() else (),
+ "default": {
+ "model": {
+ "pt": ("llava-hf/llava-onevision-qwen2-0.5b-ov-hf", "2c9ba3b"),
+ }
+ },
+ "type": "multimodal",
+ },
+ "object-detection": {
+ "impl": ObjectDetectionPipeline,
+ "tf": (),
+ "pt": (AutoModelForObjectDetection,) if is_torch_available() else (),
+ "default": {"model": {"pt": ("facebook/detr-resnet-50", "1d5f47b")}},
+ "type": "multimodal",
+ },
+ "zero-shot-object-detection": {
+ "impl": ZeroShotObjectDetectionPipeline,
+ "tf": (),
+ "pt": (AutoModelForZeroShotObjectDetection,) if is_torch_available() else (),
+ "default": {"model": {"pt": ("google/owlvit-base-patch32", "cbc355f")}},
+ "type": "multimodal",
+ },
+ "depth-estimation": {
+ "impl": DepthEstimationPipeline,
+ "tf": (),
+ "pt": (AutoModelForDepthEstimation,) if is_torch_available() else (),
+ "default": {"model": {"pt": ("Intel/dpt-large", "bc15f29")}},
+ "type": "image",
+ },
+ "video-classification": {
+ "impl": VideoClassificationPipeline,
+ "tf": (),
+ "pt": (AutoModelForVideoClassification,) if is_torch_available() else (),
+ "default": {"model": {"pt": ("MCG-NJU/videomae-base-finetuned-kinetics", "488eb9a")}},
+ "type": "video",
+ },
+ "mask-generation": {
+ "impl": MaskGenerationPipeline,
+ "tf": (),
+ "pt": (AutoModelForMaskGeneration,) if is_torch_available() else (),
+ "default": {"model": {"pt": ("facebook/sam-vit-huge", "87aecf0")}},
+ "type": "multimodal",
+ },
+ "image-to-image": {
+ "impl": ImageToImagePipeline,
+ "tf": (),
+ "pt": (AutoModelForImageToImage,) if is_torch_available() else (),
+ "default": {"model": {"pt": ("caidas/swin2SR-classical-sr-x2-64", "cee1c92")}},
+ "type": "image",
+ },
+}
+
+NO_FEATURE_EXTRACTOR_TASKS = set()
+NO_IMAGE_PROCESSOR_TASKS = set()
+NO_TOKENIZER_TASKS = set()
+
+# Those model configs are special: they are generic over their task, meaning
+# any tokenizer/feature_extractor might be used for a given model, so we cannot
+# use the statically defined TOKENIZER_MAPPING and FEATURE_EXTRACTOR_MAPPING to
+# see if the model defines such objects or not.
+MULTI_MODEL_AUDIO_CONFIGS = {"SpeechEncoderDecoderConfig"}
+MULTI_MODEL_VISION_CONFIGS = {"VisionEncoderDecoderConfig", "VisionTextDualEncoderConfig"}
+for task, values in SUPPORTED_TASKS.items():
+ if values["type"] == "text":
+ NO_FEATURE_EXTRACTOR_TASKS.add(task)
+ NO_IMAGE_PROCESSOR_TASKS.add(task)
+ elif values["type"] in {"image", "video"}:
+ NO_TOKENIZER_TASKS.add(task)
+ elif values["type"] in {"audio"}:
+ NO_TOKENIZER_TASKS.add(task)
+ NO_IMAGE_PROCESSOR_TASKS.add(task)
+ elif values["type"] != "multimodal":
+ raise ValueError(f"SUPPORTED_TASK {task} contains invalid type {values['type']}")
+
+PIPELINE_REGISTRY = PipelineRegistry(supported_tasks=SUPPORTED_TASKS, task_aliases=TASK_ALIASES)
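+
+# Illustrative sketch (not part of the library): the registry above resolves aliases declared in
+# TASK_ALIASES before looking up a task's defaults, so, assuming the registry behaves as the
+# `check_task` docstring below describes:
+#
+#     >>> normalized_task, task_defaults, task_options = PIPELINE_REGISTRY.check_task("sentiment-analysis")
+#     >>> normalized_task
+#     'text-classification'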
+
+
+def get_supported_tasks() -> List[str]:
+ """
+ Returns a list of supported task strings.
+ """
+ return PIPELINE_REGISTRY.get_supported_tasks()
+
+
+def get_task(model: str, token: Optional[str] = None, **deprecated_kwargs) -> str:
+ use_auth_token = deprecated_kwargs.pop("use_auth_token", None)
+ if use_auth_token is not None:
+ warnings.warn(
+ "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
+ FutureWarning,
+ )
+ if token is not None:
+ raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.")
+ token = use_auth_token
+
+ if is_offline_mode():
+ raise RuntimeError("You cannot infer task automatically within `pipeline` when using offline mode")
+ try:
+ info = model_info(model, token=token)
+ except Exception as e:
+ raise RuntimeError(f"Instantiating a pipeline without a task set raised an error: {e}")
+ if not info.pipeline_tag:
+ raise RuntimeError(
+ f"The model {model} does not seem to have a correct `pipeline_tag` set to infer the task automatically"
+ )
+ if getattr(info, "library_name", "transformers") not in {"transformers", "timm"}:
+ raise RuntimeError(f"This model is meant to be used with {info.library_name} not with transformers")
+ task = info.pipeline_tag
+ return task
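+
+
+# Illustrative sketch (not part of the library): `get_task` simply returns the `pipeline_tag`
+# advertised by the model repo on the Hub. For example, assuming the Hub metadata of the default
+# text-classification checkpoint registered above is unchanged:
+#
+#     >>> get_task("distilbert/distilbert-base-uncased-finetuned-sst-2-english")
+#     'text-classification'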
+
+
+def check_task(task: str) -> Tuple[str, Dict, Any]:
+ """
+    Checks an incoming task string to validate that it's correct, and returns the default Pipeline and Model classes
+    and default models if they exist.
+
+ Args:
+ task (`str`):
+ The task defining which pipeline will be returned. Currently accepted tasks are:
+
+ - `"audio-classification"`
+ - `"automatic-speech-recognition"`
+ - `"depth-estimation"`
+ - `"document-question-answering"`
+ - `"feature-extraction"`
+ - `"fill-mask"`
+ - `"image-classification"`
+ - `"image-feature-extraction"`
+            - `"image-segmentation"`
+            - `"image-text-to-text"`
+            - `"image-to-text"`
+            - `"image-to-image"`
+            - `"mask-generation"`
+ - `"object-detection"`
+ - `"question-answering"`
+ - `"summarization"`
+ - `"table-question-answering"`
+ - `"text2text-generation"`
+ - `"text-classification"` (alias `"sentiment-analysis"` available)
+ - `"text-generation"`
+ - `"text-to-audio"` (alias `"text-to-speech"` available)
+ - `"token-classification"` (alias `"ner"` available)
+ - `"translation"`
+ - `"translation_xx_to_yy"`
+ - `"video-classification"`
+ - `"visual-question-answering"` (alias `"vqa"` available)
+            - `"zero-shot-classification"`
+            - `"zero-shot-audio-classification"`
+            - `"zero-shot-image-classification"`
+ - `"zero-shot-object-detection"`
+
+ Returns:
+ (normalized_task: `str`, task_defaults: `dict`, task_options: (`tuple`, None)) The normalized task name
+ (removed alias and options). The actual dictionary required to initialize the pipeline and some extra task
+ options for parametrized tasks like "translation_XX_to_YY"
+
+ """
+ return PIPELINE_REGISTRY.check_task(task)
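+
+
+# Illustrative sketch (not part of the library): parametrized tasks keep their options in the third
+# element of the returned tuple, assuming the registry parses "translation_XX_to_YY" as documented above:
+#
+#     >>> normalized_task, task_defaults, task_options = check_task("translation_en_to_fr")
+#     >>> normalized_task, task_options
+#     ('translation', ('en', 'fr'))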
+
+
+def clean_custom_task(task_info):
+ import transformers
+
+ if "impl" not in task_info:
+ raise RuntimeError("This model introduces a custom pipeline without specifying its implementation.")
+ pt_class_names = task_info.get("pt", ())
+ if isinstance(pt_class_names, str):
+ pt_class_names = [pt_class_names]
+ task_info["pt"] = tuple(getattr(transformers, c) for c in pt_class_names)
+ tf_class_names = task_info.get("tf", ())
+ if isinstance(tf_class_names, str):
+ tf_class_names = [tf_class_names]
+ task_info["tf"] = tuple(getattr(transformers, c) for c in tf_class_names)
+ return task_info, None
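+
+
+# Illustrative sketch (not part of the library): a `config.custom_pipelines` entry consumed by
+# `clean_custom_task` is expected to look roughly like the dict below. The "pt"/"tf" values are class
+# *names* resolved against the `transformers` namespace; the task and pipeline names here are hypothetical.
+#
+#     {
+#         "my-new-task": {
+#             "impl": "my_pipeline.MyCustomPipeline",  # class reference loaded from the Hub repo
+#             "pt": "AutoModelForSequenceClassification",
+#             "tf": (),
+#         }
+#     }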
+
+
+def pipeline(
+ task: str = None,
+ model: Optional[Union[str, "PreTrainedModel", "TFPreTrainedModel"]] = None,
+ config: Optional[Union[str, PretrainedConfig]] = None,
+ tokenizer: Optional[Union[str, PreTrainedTokenizer, "PreTrainedTokenizerFast"]] = None,
+ feature_extractor: Optional[Union[str, PreTrainedFeatureExtractor]] = None,
+ image_processor: Optional[Union[str, BaseImageProcessor]] = None,
+ processor: Optional[Union[str, ProcessorMixin]] = None,
+ framework: Optional[str] = None,
+ revision: Optional[str] = None,
+ use_fast: bool = True,
+ token: Optional[Union[str, bool]] = None,
+ device: Optional[Union[int, str, "torch.device"]] = None,
+ device_map=None,
+ torch_dtype=None,
+ trust_remote_code: Optional[bool] = None,
+ model_kwargs: Dict[str, Any] = None,
+ pipeline_class: Optional[Any] = None,
+ **kwargs,
+) -> Pipeline:
+ """
+ Utility factory method to build a [`Pipeline`].
+
+ A pipeline consists of:
+
+ - One or more components for pre-processing model inputs, such as a [tokenizer](tokenizer),
+ [image_processor](image_processor), [feature_extractor](feature_extractor), or [processor](processors).
+ - A [model](model) that generates predictions from the inputs.
+ - Optional post-processing steps to refine the model's output, which can also be handled by processors.
+
+
+    Although `tokenizer`, `feature_extractor`, `image_processor`, and `processor` are all optional arguments, they
+    shouldn't be specified all at once. If these components are not provided, `pipeline` will try to load the
+    required ones automatically. If you want to provide these components explicitly, please refer to the
+    documentation of the specific pipeline for details on which components are required.
+
+
+ Args:
+ task (`str`):
+ The task defining which pipeline will be returned. Currently accepted tasks are:
+
+        - `"audio-classification"`: will return an [`AudioClassificationPipeline`].
+        - `"automatic-speech-recognition"`: will return an [`AutomaticSpeechRecognitionPipeline`].
+        - `"depth-estimation"`: will return a [`DepthEstimationPipeline`].
+        - `"document-question-answering"`: will return a [`DocumentQuestionAnsweringPipeline`].
+        - `"feature-extraction"`: will return a [`FeatureExtractionPipeline`].
+        - `"fill-mask"`: will return a [`FillMaskPipeline`].
+        - `"image-classification"`: will return an [`ImageClassificationPipeline`].
+        - `"image-feature-extraction"`: will return an [`ImageFeatureExtractionPipeline`].
+        - `"image-segmentation"`: will return an [`ImageSegmentationPipeline`].
+        - `"image-text-to-text"`: will return an [`ImageTextToTextPipeline`].
+        - `"image-to-image"`: will return an [`ImageToImagePipeline`].
+        - `"image-to-text"`: will return an [`ImageToTextPipeline`].
+        - `"mask-generation"`: will return a [`MaskGenerationPipeline`].
+        - `"object-detection"`: will return an [`ObjectDetectionPipeline`].
+ - `"question-answering"`: will return a [`QuestionAnsweringPipeline`].
+ - `"summarization"`: will return a [`SummarizationPipeline`].
+ - `"table-question-answering"`: will return a [`TableQuestionAnsweringPipeline`].
+ - `"text2text-generation"`: will return a [`Text2TextGenerationPipeline`].
+ - `"text-classification"` (alias `"sentiment-analysis"` available): will return a
+ [`TextClassificationPipeline`].
+        - `"text-generation"`: will return a [`TextGenerationPipeline`].
+        - `"text-to-audio"` (alias `"text-to-speech"` available): will return a [`TextToAudioPipeline`].
+ - `"token-classification"` (alias `"ner"` available): will return a [`TokenClassificationPipeline`].
+ - `"translation"`: will return a [`TranslationPipeline`].
+ - `"translation_xx_to_yy"`: will return a [`TranslationPipeline`].
+ - `"video-classification"`: will return a [`VideoClassificationPipeline`].
+ - `"visual-question-answering"`: will return a [`VisualQuestionAnsweringPipeline`].
+ - `"zero-shot-classification"`: will return a [`ZeroShotClassificationPipeline`].
+ - `"zero-shot-image-classification"`: will return a [`ZeroShotImageClassificationPipeline`].
+ - `"zero-shot-audio-classification"`: will return a [`ZeroShotAudioClassificationPipeline`].
+ - `"zero-shot-object-detection"`: will return a [`ZeroShotObjectDetectionPipeline`].
+
+ model (`str` or [`PreTrainedModel`] or [`TFPreTrainedModel`], *optional*):
+ The model that will be used by the pipeline to make predictions. This can be a model identifier or an
+ actual instance of a pretrained model inheriting from [`PreTrainedModel`] (for PyTorch) or
+ [`TFPreTrainedModel`] (for TensorFlow).
+
+ If not provided, the default for the `task` will be loaded.
+ config (`str` or [`PretrainedConfig`], *optional*):
+ The configuration that will be used by the pipeline to instantiate the model. This can be a model
+ identifier or an actual pretrained model configuration inheriting from [`PretrainedConfig`].
+
+ If not provided, the default configuration file for the requested model will be used. That means that if
+ `model` is given, its default configuration will be used. However, if `model` is not supplied, this
+ `task`'s default model's config is used instead.
+ tokenizer (`str` or [`PreTrainedTokenizer`], *optional*):
+ The tokenizer that will be used by the pipeline to encode data for the model. This can be a model
+ identifier or an actual pretrained tokenizer inheriting from [`PreTrainedTokenizer`].
+
+ If not provided, the default tokenizer for the given `model` will be loaded (if it is a string). If `model`
+ is not specified or not a string, then the default tokenizer for `config` is loaded (if it is a string).
+ However, if `config` is also not given or not a string, then the default tokenizer for the given `task`
+ will be loaded.
+ feature_extractor (`str` or [`PreTrainedFeatureExtractor`], *optional*):
+ The feature extractor that will be used by the pipeline to encode data for the model. This can be a model
+ identifier or an actual pretrained feature extractor inheriting from [`PreTrainedFeatureExtractor`].
+
+ Feature extractors are used for non-NLP models, such as Speech or Vision models as well as multi-modal
+ models. Multi-modal models will also require a tokenizer to be passed.
+
+ If not provided, the default feature extractor for the given `model` will be loaded (if it is a string). If
+ `model` is not specified or not a string, then the default feature extractor for `config` is loaded (if it
+ is a string). However, if `config` is also not given or not a string, then the default feature extractor
+ for the given `task` will be loaded.
+ image_processor (`str` or [`BaseImageProcessor`], *optional*):
+ The image processor that will be used by the pipeline to preprocess images for the model. This can be a
+ model identifier or an actual image processor inheriting from [`BaseImageProcessor`].
+
+ Image processors are used for Vision models and multi-modal models that require image inputs. Multi-modal
+ models will also require a tokenizer to be passed.
+
+ If not provided, the default image processor for the given `model` will be loaded (if it is a string). If
+ `model` is not specified or not a string, then the default image processor for `config` is loaded (if it is
+ a string).
+ processor (`str` or [`ProcessorMixin`], *optional*):
+ The processor that will be used by the pipeline to preprocess data for the model. This can be a model
+ identifier or an actual processor inheriting from [`ProcessorMixin`].
+
+ Processors are used for multi-modal models that require multi-modal inputs, for example, a model that
+ requires both text and image inputs.
+
+ If not provided, the default processor for the given `model` will be loaded (if it is a string). If `model`
+ is not specified or not a string, then the default processor for `config` is loaded (if it is a string).
+ framework (`str`, *optional*):
+ The framework to use, either `"pt"` for PyTorch or `"tf"` for TensorFlow. The specified framework must be
+ installed.
+
+ If no framework is specified, will default to the one currently installed. If no framework is specified and
+ both frameworks are installed, will default to the framework of the `model`, or to PyTorch if no model is
+ provided.
+ revision (`str`, *optional*, defaults to `"main"`):
+ When passing a task name or a string model identifier: The specific model version to use. It can be a
+ branch name, a tag name, or a commit id, since we use a git-based system for storing models and other
+ artifacts on huggingface.co, so `revision` can be any identifier allowed by git.
+ use_fast (`bool`, *optional*, defaults to `True`):
+ Whether or not to use a Fast tokenizer if possible (a [`PreTrainedTokenizerFast`]).
+        token (`str` or `bool`, *optional*):
+            The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
+            when running `huggingface-cli login` (stored in `~/.huggingface`).
+        device (`int` or `str` or `torch.device`, *optional*):
+ Defines the device (*e.g.*, `"cpu"`, `"cuda:1"`, `"mps"`, or a GPU ordinal rank like `1`) on which this
+ pipeline will be allocated.
+        device_map (`str` or `Dict[str, Union[int, str, torch.device]]`, *optional*):
+            Sent directly as `model_kwargs` (just a simpler shortcut). When the `accelerate` library is present, set
+            `device_map="auto"` to compute the most optimized `device_map` automatically (see
+            [here](https://huggingface.co/docs/accelerate/main/en/package_reference/big_modeling#accelerate.cpu_offload)
+            for more information).
+
+
+
+        Do not use `device_map` AND `device` at the same time as they will conflict.
+
+
+
+ torch_dtype (`str` or `torch.dtype`, *optional*):
+ Sent directly as `model_kwargs` (just a simpler shortcut) to use the available precision for this model
+ (`torch.float16`, `torch.bfloat16`, ... or `"auto"`).
+ trust_remote_code (`bool`, *optional*, defaults to `False`):
+ Whether or not to allow for custom code defined on the Hub in their own modeling, configuration,
+ tokenization or even pipeline files. This option should only be set to `True` for repositories you trust
+ and in which you have read the code, as it will execute code present on the Hub on your local machine.
+ model_kwargs (`Dict[str, Any]`, *optional*):
+ Additional dictionary of keyword arguments passed along to the model's `from_pretrained(...,
+ **model_kwargs)` function.
+ kwargs (`Dict[str, Any]`, *optional*):
+ Additional keyword arguments passed along to the specific pipeline init (see the documentation for the
+ corresponding pipeline class for possible values).
+
+ Returns:
+ [`Pipeline`]: A suitable pipeline for the task.
+
+ Examples:
+
+ ```python
+ >>> from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer
+
+ >>> # Sentiment analysis pipeline
+ >>> analyzer = pipeline("sentiment-analysis")
+
+ >>> # Question answering pipeline, specifying the checkpoint identifier
+ >>> oracle = pipeline(
+ ... "question-answering", model="distilbert/distilbert-base-cased-distilled-squad", tokenizer="google-bert/bert-base-cased"
+ ... )
+
+ >>> # Named entity recognition pipeline, passing in a specific model and tokenizer
+ >>> model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
+ >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
+ >>> recognizer = pipeline("ner", model=model, tokenizer=tokenizer)
+ ```"""
+ if model_kwargs is None:
+ model_kwargs = {}
+ # Make sure we only pass use_auth_token once as a kwarg (it used to be possible to pass it in model_kwargs,
+ # this is to keep BC).
+ use_auth_token = model_kwargs.pop("use_auth_token", None)
+ if use_auth_token is not None:
+ warnings.warn(
+ "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
+ FutureWarning,
+ )
+ if token is not None:
+ raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.")
+ token = use_auth_token
+
+ code_revision = kwargs.pop("code_revision", None)
+ commit_hash = kwargs.pop("_commit_hash", None)
+
+ hub_kwargs = {
+ "revision": revision,
+ "token": token,
+ "trust_remote_code": trust_remote_code,
+ "_commit_hash": commit_hash,
+ }
+
+ if task is None and model is None:
+ raise RuntimeError(
+ "Impossible to instantiate a pipeline without either a task or a model "
+ "being specified. "
+ "Please provide a task class or a model"
+ )
+
+ if model is None and tokenizer is not None:
+ raise RuntimeError(
+ "Impossible to instantiate a pipeline with tokenizer specified but not the model as the provided tokenizer"
+ " may not be compatible with the default model. Please provide a PreTrainedModel class or a"
+ " path/identifier to a pretrained model when providing tokenizer."
+ )
+ if model is None and feature_extractor is not None:
+ raise RuntimeError(
+ "Impossible to instantiate a pipeline with feature_extractor specified but not the model as the provided"
+ " feature_extractor may not be compatible with the default model. Please provide a PreTrainedModel class"
+ " or a path/identifier to a pretrained model when providing feature_extractor."
+ )
+ if isinstance(model, Path):
+ model = str(model)
+
+ if commit_hash is None:
+ pretrained_model_name_or_path = None
+ if isinstance(config, str):
+ pretrained_model_name_or_path = config
+ elif config is None and isinstance(model, str):
+ pretrained_model_name_or_path = model
+
+ if not isinstance(config, PretrainedConfig) and pretrained_model_name_or_path is not None:
+ # We make a call to the config file first (which may be absent) to get the commit hash as soon as possible
+ resolved_config_file = cached_file(
+ pretrained_model_name_or_path,
+ CONFIG_NAME,
+ _raise_exceptions_for_gated_repo=False,
+ _raise_exceptions_for_missing_entries=False,
+ _raise_exceptions_for_connection_errors=False,
+ cache_dir=model_kwargs.get("cache_dir"),
+ **hub_kwargs,
+ )
+ hub_kwargs["_commit_hash"] = extract_commit_hash(resolved_config_file, commit_hash)
+ else:
+ hub_kwargs["_commit_hash"] = getattr(config, "_commit_hash", None)
+
+ # Config is the primordial information item.
+ # Instantiate config if needed
+ if isinstance(config, str):
+ config = AutoConfig.from_pretrained(
+ config, _from_pipeline=task, code_revision=code_revision, **hub_kwargs, **model_kwargs
+ )
+ hub_kwargs["_commit_hash"] = config._commit_hash
+ elif config is None and isinstance(model, str):
+ # Check for an adapter file in the model path if PEFT is available
+ if is_peft_available():
+ # `find_adapter_config_file` doesn't accept `trust_remote_code`
+ _hub_kwargs = {k: v for k, v in hub_kwargs.items() if k != "trust_remote_code"}
+ maybe_adapter_path = find_adapter_config_file(
+ model,
+ token=hub_kwargs["token"],
+ revision=hub_kwargs["revision"],
+ _commit_hash=hub_kwargs["_commit_hash"],
+ )
+
+ if maybe_adapter_path is not None:
+ with open(maybe_adapter_path, "r", encoding="utf-8") as f:
+ adapter_config = json.load(f)
+ model = adapter_config["base_model_name_or_path"]
+
+ config = AutoConfig.from_pretrained(
+ model, _from_pipeline=task, code_revision=code_revision, **hub_kwargs, **model_kwargs
+ )
+ hub_kwargs["_commit_hash"] = config._commit_hash
+
+ custom_tasks = {}
+ if config is not None and len(getattr(config, "custom_pipelines", {})) > 0:
+ custom_tasks = config.custom_pipelines
+ if task is None and trust_remote_code is not False:
+ if len(custom_tasks) == 1:
+ task = list(custom_tasks.keys())[0]
+ else:
+ raise RuntimeError(
+ "We can't infer the task automatically for this model as there are multiple tasks available. Pick "
+ f"one in {', '.join(custom_tasks.keys())}"
+ )
+
+ if task is None and model is not None:
+ if not isinstance(model, str):
+ raise RuntimeError(
+                "Inferring the task automatically requires checking the hub with a model_id defined as a `str`. "
+                f"{model} is not a valid model_id."
+ )
+ task = get_task(model, token)
+
+ # Retrieve the task
+ if task in custom_tasks:
+ normalized_task = task
+ targeted_task, task_options = clean_custom_task(custom_tasks[task])
+ if pipeline_class is None:
+ if not trust_remote_code:
+ raise ValueError(
+ "Loading this pipeline requires you to execute the code in the pipeline file in that"
+ " repo on your local machine. Make sure you have read the code there to avoid malicious use, then"
+ " set the option `trust_remote_code=True` to remove this error."
+ )
+ class_ref = targeted_task["impl"]
+ pipeline_class = get_class_from_dynamic_module(
+ class_ref,
+ model,
+ code_revision=code_revision,
+ **hub_kwargs,
+ )
+ else:
+ normalized_task, targeted_task, task_options = check_task(task)
+ if pipeline_class is None:
+ pipeline_class = targeted_task["impl"]
+
+ # Use default model/config/tokenizer for the task if no model is provided
+ if model is None:
+ # At that point framework might still be undetermined
+ model, default_revision = get_default_model_and_revision(targeted_task, framework, task_options)
+ revision = revision if revision is not None else default_revision
+ logger.warning(
+ f"No model was supplied, defaulted to {model} and revision"
+ f" {revision} ({HUGGINGFACE_CO_RESOLVE_ENDPOINT}/{model}).\n"
+ "Using a pipeline without specifying a model name and revision in production is not recommended."
+ )
+ hub_kwargs["revision"] = revision
+ if config is None and isinstance(model, str):
+ config = AutoConfig.from_pretrained(model, _from_pipeline=task, **hub_kwargs, **model_kwargs)
+ hub_kwargs["_commit_hash"] = config._commit_hash
+
+ if device_map is not None:
+ if "device_map" in model_kwargs:
+ raise ValueError(
+                'You cannot use both `pipeline(..., device_map=...)` and `model_kwargs={"device_map": ...}` as those'
+                " arguments might conflict, use only one."
+ )
+ if device is not None:
+ logger.warning(
+ "Both `device` and `device_map` are specified. `device` will override `device_map`. You"
+ " will most likely encounter unexpected behavior. Please remove `device` and keep `device_map`."
+ )
+ model_kwargs["device_map"] = device_map
+ if torch_dtype is not None:
+ if "torch_dtype" in model_kwargs:
+ raise ValueError(
+                'You cannot use both `pipeline(..., torch_dtype=...)` and `model_kwargs={"torch_dtype": ...}` as those'
+                " arguments might conflict, use only one."
+ )
+ if isinstance(torch_dtype, str) and hasattr(torch, torch_dtype):
+ torch_dtype = getattr(torch, torch_dtype)
+ model_kwargs["torch_dtype"] = torch_dtype
+
+ model_name = model if isinstance(model, str) else None
+
+ # Load the correct model if possible
+ # Infer the framework from the model if not already defined
+ if isinstance(model, str) or framework is None:
+ model_classes = {"tf": targeted_task["tf"], "pt": targeted_task["pt"]}
+ framework, model = infer_framework_load_model(
+ model,
+ model_classes=model_classes,
+ config=config,
+ framework=framework,
+ task=task,
+ **hub_kwargs,
+ **model_kwargs,
+ )
+
+ model_config = model.config
+ hub_kwargs["_commit_hash"] = model.config._commit_hash
+
+ load_tokenizer = type(model_config) in TOKENIZER_MAPPING or model_config.tokenizer_class is not None
+ load_feature_extractor = type(model_config) in FEATURE_EXTRACTOR_MAPPING or feature_extractor is not None
+ load_image_processor = type(model_config) in IMAGE_PROCESSOR_MAPPING or image_processor is not None
+ load_processor = type(model_config) in PROCESSOR_MAPPING or processor is not None
+
+    # Check whether the pipeline class requires these components to be loaded
+ load_tokenizer = load_tokenizer and pipeline_class._load_tokenizer
+ load_feature_extractor = load_feature_extractor and pipeline_class._load_feature_extractor
+ load_image_processor = load_image_processor and pipeline_class._load_image_processor
+ load_processor = load_processor and pipeline_class._load_processor
+
+    # If `model` (an instance of `PreTrainedModel` instead of `str`) is passed (and/or the same for config), while
+    # `image_processor` or `feature_extractor` is `None`, the loading will fail. This happens particularly for some
+    # vision tasks when calling `pipeline()` with `model` and only one of `image_processor` and `feature_extractor`.
+    # TODO: we need to make `NO_IMAGE_PROCESSOR_TASKS` and `NO_FEATURE_EXTRACTOR_TASKS` more robust to avoid such issues.
+    # This block is only temporary, to make the CI green.
+ if load_image_processor and load_feature_extractor:
+ load_feature_extractor = False
+
+ if (
+ tokenizer is None
+ and not load_tokenizer
+ and normalized_task not in NO_TOKENIZER_TASKS
+ # Using class name to avoid importing the real class.
+ and (
+ model_config.__class__.__name__ in MULTI_MODEL_AUDIO_CONFIGS
+ or model_config.__class__.__name__ in MULTI_MODEL_VISION_CONFIGS
+ )
+ ):
+        # This is a special category of models that are fusions of multiple models,
+        # so the model_config might not define a tokenizer, but it seems to be
+        # necessary for the task, so we're force-trying to load it.
+ load_tokenizer = True
+ if (
+ image_processor is None
+ and not load_image_processor
+ and normalized_task not in NO_IMAGE_PROCESSOR_TASKS
+ # Using class name to avoid importing the real class.
+ and model_config.__class__.__name__ in MULTI_MODEL_VISION_CONFIGS
+ ):
+        # This is a special category of models that are fusions of multiple models,
+        # so the model_config might not define an image processor, but it seems to be
+        # necessary for the task, so we're force-trying to load it.
+ load_image_processor = True
+ if (
+ feature_extractor is None
+ and not load_feature_extractor
+ and normalized_task not in NO_FEATURE_EXTRACTOR_TASKS
+ # Using class name to avoid importing the real class.
+ and model_config.__class__.__name__ in MULTI_MODEL_AUDIO_CONFIGS
+ ):
+        # This is a special category of models that are fusions of multiple models,
+        # so the model_config might not define a feature extractor, but it seems to be
+        # necessary for the task, so we're force-trying to load it.
+ load_feature_extractor = True
+
+ if task in NO_TOKENIZER_TASKS:
+        # These tasks will never require a tokenizer.
+        # The model, on the other hand, might have a tokenizer, but
+        # the files could be missing from the hub; instead of failing
+        # on such repos, we simply force it not to be loaded.
+ load_tokenizer = False
+
+ if task in NO_FEATURE_EXTRACTOR_TASKS:
+ load_feature_extractor = False
+ if task in NO_IMAGE_PROCESSOR_TASKS:
+ load_image_processor = False
+
+ if load_tokenizer:
+ # Try to infer tokenizer from model or config name (if provided as str)
+ if tokenizer is None:
+ if isinstance(model_name, str):
+ tokenizer = model_name
+ elif isinstance(config, str):
+ tokenizer = config
+ else:
+ # Impossible to guess what is the right tokenizer here
+ raise Exception(
+ "Impossible to guess which tokenizer to use. "
+ "Please provide a PreTrainedTokenizer class or a path/identifier to a pretrained tokenizer."
+ )
+
+ # Instantiate tokenizer if needed
+ if isinstance(tokenizer, (str, tuple)):
+ if isinstance(tokenizer, tuple):
+ # For tuple we have (tokenizer name, {kwargs})
+ use_fast = tokenizer[1].pop("use_fast", use_fast)
+ tokenizer_identifier = tokenizer[0]
+ tokenizer_kwargs = tokenizer[1]
+ else:
+ tokenizer_identifier = tokenizer
+ tokenizer_kwargs = model_kwargs.copy()
+ tokenizer_kwargs.pop("torch_dtype", None)
+
+ tokenizer = AutoTokenizer.from_pretrained(
+ tokenizer_identifier, use_fast=use_fast, _from_pipeline=task, **hub_kwargs, **tokenizer_kwargs
+ )
+
+ if load_image_processor:
+ # Try to infer image processor from model or config name (if provided as str)
+ if image_processor is None:
+ if isinstance(model_name, str):
+ image_processor = model_name
+ elif isinstance(config, str):
+ image_processor = config
+ # Backward compatibility, as `feature_extractor` used to be the name
+ # for `ImageProcessor`.
+ elif feature_extractor is not None and isinstance(feature_extractor, BaseImageProcessor):
+ image_processor = feature_extractor
+ else:
+ # Impossible to guess what is the right image_processor here
+ raise Exception(
+ "Impossible to guess which image processor to use. "
+ "Please provide a PreTrainedImageProcessor class or a path/identifier "
+ "to a pretrained image processor."
+ )
+
+ # Instantiate image_processor if needed
+ if isinstance(image_processor, (str, tuple)):
+ image_processor = AutoImageProcessor.from_pretrained(
+ image_processor, _from_pipeline=task, **hub_kwargs, **model_kwargs
+ )
+
+ if load_feature_extractor:
+ # Try to infer feature extractor from model or config name (if provided as str)
+ if feature_extractor is None:
+ if isinstance(model_name, str):
+ feature_extractor = model_name
+ elif isinstance(config, str):
+ feature_extractor = config
+ else:
+ # Impossible to guess what is the right feature_extractor here
+ raise Exception(
+ "Impossible to guess which feature extractor to use. "
+ "Please provide a PreTrainedFeatureExtractor class or a path/identifier "
+ "to a pretrained feature extractor."
+ )
+
+ # Instantiate feature_extractor if needed
+ if isinstance(feature_extractor, (str, tuple)):
+ feature_extractor = AutoFeatureExtractor.from_pretrained(
+ feature_extractor, _from_pipeline=task, **hub_kwargs, **model_kwargs
+ )
+
+ if (
+ feature_extractor._processor_class
+ and feature_extractor._processor_class.endswith("WithLM")
+ and isinstance(model_name, str)
+ ):
+ try:
+ import kenlm # to trigger `ImportError` if not installed
+ from pyctcdecode import BeamSearchDecoderCTC
+
+ if os.path.isdir(model_name) or os.path.isfile(model_name):
+ decoder = BeamSearchDecoderCTC.load_from_dir(model_name)
+ else:
+ language_model_glob = os.path.join(
+ BeamSearchDecoderCTC._LANGUAGE_MODEL_SERIALIZED_DIRECTORY, "*"
+ )
+ alphabet_filename = BeamSearchDecoderCTC._ALPHABET_SERIALIZED_FILENAME
+ allow_patterns = [language_model_glob, alphabet_filename]
+ decoder = BeamSearchDecoderCTC.load_from_hf_hub(model_name, allow_patterns=allow_patterns)
+
+ kwargs["decoder"] = decoder
+ except ImportError as e:
+ logger.warning(f"Could not load the `decoder` for {model_name}. Defaulting to raw CTC. Error: {e}")
+ if not is_kenlm_available():
+                logger.warning("Try to install `kenlm`: `pip install kenlm`")
+
+ if not is_pyctcdecode_available():
+                logger.warning("Try to install `pyctcdecode`: `pip install pyctcdecode`")
+
+ if load_processor:
+ # Try to infer processor from model or config name (if provided as str)
+ if processor is None:
+ if isinstance(model_name, str):
+ processor = model_name
+ elif isinstance(config, str):
+ processor = config
+ else:
+ # Impossible to guess what is the right processor here
+ raise Exception(
+ "Impossible to guess which processor to use. "
+ "Please provide a processor instance or a path/identifier "
+ "to a processor."
+ )
+
+ # Instantiate processor if needed
+ if isinstance(processor, (str, tuple)):
+ processor = AutoProcessor.from_pretrained(processor, _from_pipeline=task, **hub_kwargs, **model_kwargs)
+ if not isinstance(processor, ProcessorMixin):
+ raise TypeError(
+ "Processor was loaded, but it is not an instance of `ProcessorMixin`. "
+ f"Got type `{type(processor)}` instead. Please check that you specified "
+                "the correct pipeline task for the model and that the model has a processor implemented and saved."
+ )
+
+ if task == "translation" and model.config.task_specific_params:
+ for key in model.config.task_specific_params:
+ if key.startswith("translation"):
+ task = key
+ warnings.warn(
+ f'"translation" task was used, instead of "translation_XX_to_YY", defaulting to "{task}"',
+ UserWarning,
+ )
+ break
+
+ if tokenizer is not None:
+ kwargs["tokenizer"] = tokenizer
+
+ if feature_extractor is not None:
+ kwargs["feature_extractor"] = feature_extractor
+
+ if torch_dtype is not None:
+ kwargs["torch_dtype"] = torch_dtype
+
+ if image_processor is not None:
+ kwargs["image_processor"] = image_processor
+
+ if device is not None:
+ kwargs["device"] = device
+
+ if processor is not None:
+ kwargs["processor"] = processor
+
+ return pipeline_class(model=model, framework=framework, task=task, **kwargs)
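+
+
+# Illustrative sketch (not part of the library): typical calls to the factory above. The model id is the
+# text-generation default registered in SUPPORTED_TASKS; `device_map="auto"` requires `accelerate` and,
+# as documented above, should not be combined with `device`.
+#
+#     >>> from transformers import pipeline
+#     >>> generator = pipeline("text-generation", model="openai-community/gpt2", torch_dtype="auto")
+#     >>> classifier = pipeline("text-classification", device_map="auto")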
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3c13de8d2f8416c40cc05909e6b4dc3a15c66706
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/audio_classification.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/audio_classification.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..78a6def32f2b9c4e5c4f398d1c4caa2317d9c6d0
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/audio_classification.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/audio_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/audio_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4ecc0f8845fbe145dfeb09070c6be150cc99b895
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/audio_utils.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/automatic_speech_recognition.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/automatic_speech_recognition.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..be7914fb35489ccfd95dee43f879e2436d43095c
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/automatic_speech_recognition.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/base.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/base.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1c254c1421872f69e5ea59af1fb57b4f90999947
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/base.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/depth_estimation.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/depth_estimation.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a952dd38e0ea7d65e8d39e69191a2ca651bd605c
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/depth_estimation.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/document_question_answering.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/document_question_answering.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4e8da27a53928f48b178da6848f65430ebf562cc
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/document_question_answering.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/feature_extraction.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/feature_extraction.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3c8cbfc27f17c71f2992710d1d01fa38a4103e4b
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/feature_extraction.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/fill_mask.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/fill_mask.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d48fdb57cc820d93b00c44aa67d069f79eaef3ce
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/fill_mask.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/image_classification.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/image_classification.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7ca87019d3bea003988f0d743a88fba6c2eb9432
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/image_classification.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/image_feature_extraction.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/image_feature_extraction.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1f26ad155baf5fad6d5ec5b16ce5cf31a37b4f97
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/image_feature_extraction.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/image_segmentation.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/image_segmentation.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a1597f2d4807c8ea0c7e2890c0a807bcf6394d0c
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/image_segmentation.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/image_text_to_text.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/image_text_to_text.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..55138504bdf575226a181dc4401e207ed1af9d83
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/image_text_to_text.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/image_to_image.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/image_to_image.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7117056ab6f1eed8e2ac85b608865bf1d8d1f381
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/image_to_image.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/image_to_text.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/image_to_text.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..282a5eab4a5afa2f511bf832c0f5c9f977f9a19e
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/image_to_text.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/mask_generation.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/mask_generation.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d81847e9085c6a700c2ed99df5a5ec3b485ec0fc
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/mask_generation.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/object_detection.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/object_detection.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..20d7177e8f35bc38403669156b96ff28c56cea8e
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/object_detection.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/pt_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/pt_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..eaa157f04981120aa846f4bff0a97ef0271359b0
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/pt_utils.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/question_answering.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/question_answering.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dadc7dbcbc6d2a1b1143894da48036f00ab71f41
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/question_answering.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/table_question_answering.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/table_question_answering.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6bbb4de92436af95784a53a396dfe1df5c89ed9c
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/table_question_answering.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/text2text_generation.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/text2text_generation.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8b0677e2f2c048d094bf1e3b1888005074e3b9b4
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/text2text_generation.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/text_classification.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/text_classification.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9226af6305246af4ac726bf575396c4ae5d0cbb3
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/text_classification.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/text_generation.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/text_generation.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c55bc926b9fc6b43b404c05ced61caea13ceb21e
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/text_generation.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/text_to_audio.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/text_to_audio.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6d005050223b6a943128b3a79ff044246a046f50
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/text_to_audio.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/token_classification.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/token_classification.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b35ba5e8975996d511e41dc35e813edd4d212992
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/token_classification.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/video_classification.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/video_classification.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f30a7f3b4ab8da22ab900cc036524ac69ddc7689
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/video_classification.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/visual_question_answering.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/visual_question_answering.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e2c83eff32e78865d7f9430829b8a7ae10416e3a
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/visual_question_answering.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/zero_shot_audio_classification.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/zero_shot_audio_classification.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9909e35082363c85c942b355399deb0f0e89e4f1
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/zero_shot_audio_classification.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/zero_shot_classification.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/zero_shot_classification.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8b2661528b79429f9ffd1421f3ba8135ab2c29bc
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/zero_shot_classification.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/zero_shot_image_classification.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/zero_shot_image_classification.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4db3ff00805b9daa790f19fdd193510846fdf908
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/zero_shot_image_classification.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/zero_shot_object_detection.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/zero_shot_object_detection.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bbe4539f856a5eb2d287d9a5556e54e280324f62
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/zero_shot_object_detection.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/audio_classification.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/audio_classification.py
new file mode 100644
index 0000000000000000000000000000000000000000..f6acbb3096e07d5b467ef4f2dccbda3ce1cd8e51
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/audio_classification.py
@@ -0,0 +1,234 @@
+# Copyright 2021 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import subprocess
+from typing import Union
+
+import numpy as np
+import requests
+
+from ..utils import add_end_docstrings, is_torch_available, is_torchaudio_available, logging
+from .base import Pipeline, build_pipeline_init_args
+
+
+if is_torch_available():
+ from ..models.auto.modeling_auto import MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES
+
+logger = logging.get_logger(__name__)
+
+
+def ffmpeg_read(bpayload: bytes, sampling_rate: int) -> np.ndarray:
+ """
+ Helper function to read an audio file through ffmpeg.
+ """
+ ar = f"{sampling_rate}"
+ ac = "1"
+ format_for_conversion = "f32le"
+ ffmpeg_command = [
+ "ffmpeg",
+ "-i",
+ "pipe:0",
+ "-ac",
+ ac,
+ "-ar",
+ ar,
+ "-f",
+ format_for_conversion,
+ "-hide_banner",
+ "-loglevel",
+ "quiet",
+ "pipe:1",
+ ]
+
+ try:
+ ffmpeg_process = subprocess.Popen(ffmpeg_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE)
+ except FileNotFoundError:
+ raise ValueError("ffmpeg was not found but is required to load audio files from filename")
+ output_stream = ffmpeg_process.communicate(bpayload)
+ out_bytes = output_stream[0]
+
+ audio = np.frombuffer(out_bytes, np.float32)
+ if audio.shape[0] == 0:
+ raise ValueError("Malformed soundfile")
+ return audio
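+
+# Usage sketch for `ffmpeg_read`, assuming ffmpeg is installed and a local
+# audio file named "sample.flac" exists (the filename is only an example):
+#
+#     with open("sample.flac", "rb") as f:
+#         waveform = ffmpeg_read(f.read(), sampling_rate=16_000)
+#     # `waveform` is a mono float32 numpy array resampled to 16 kHz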
+
+
+@add_end_docstrings(build_pipeline_init_args(has_feature_extractor=True))
+class AudioClassificationPipeline(Pipeline):
+ """
+ Audio classification pipeline using any `AutoModelForAudioClassification`. This pipeline predicts the class of a
+ raw waveform or an audio file. In case of an audio file, ffmpeg should be installed to support multiple audio
+ formats.
+
+ Example:
+
+ ```python
+ >>> from transformers import pipeline
+
+ >>> classifier = pipeline(model="superb/wav2vec2-base-superb-ks")
+ >>> classifier("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/1.flac")
+ [{'score': 0.997, 'label': '_unknown_'}, {'score': 0.002, 'label': 'left'}, {'score': 0.0, 'label': 'yes'}, {'score': 0.0, 'label': 'down'}, {'score': 0.0, 'label': 'stop'}]
+ ```
+
+ Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)
+
+
+ This pipeline can currently be loaded from [`pipeline`] using the following task identifier:
+ `"audio-classification"`.
+
+ See the list of available models on
+ [huggingface.co/models](https://huggingface.co/models?filter=audio-classification).
+ """
+
+ def __init__(self, *args, **kwargs):
+        # Default, might be overridden by the model.config.
+ kwargs["top_k"] = kwargs.get("top_k", 5)
+ super().__init__(*args, **kwargs)
+
+ if self.framework != "pt":
+ raise ValueError(f"The {self.__class__} is only available in PyTorch.")
+
+ self.check_model_type(MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES)
+
+ def __call__(
+ self,
+ inputs: Union[np.ndarray, bytes, str],
+ **kwargs,
+ ):
+ """
+        Classify the audio sequence(s) given as inputs. See the [`AudioClassificationPipeline`] documentation for more
+        information.
+
+ Args:
+ inputs (`np.ndarray` or `bytes` or `str` or `dict`):
+                The inputs are either:
+ - `str` that is the filename of the audio file, the file will be read at the correct sampling rate
+ to get the waveform using *ffmpeg*. This requires *ffmpeg* to be installed on the system.
+ - `bytes` it is supposed to be the content of an audio file and is interpreted by *ffmpeg* in the
+ same way.
+ - (`np.ndarray` of shape (n, ) of type `np.float32` or `np.float64`)
+ Raw audio at the correct sampling rate (no further check will be done)
+ - `dict` form can be used to pass raw audio sampled at arbitrary `sampling_rate` and let this
+                  pipeline do the resampling. The dict must either be in the format `{"sampling_rate": int,
+ "raw": np.array}`, or `{"sampling_rate": int, "array": np.array}`, where the key `"raw"` or
+ `"array"` is used to denote the raw audio waveform.
+ top_k (`int`, *optional*, defaults to None):
+ The number of top labels that will be returned by the pipeline. If the provided number is `None` or
+ higher than the number of labels available in the model configuration, it will default to the number of
+ labels.
+            function_to_apply (`str`, *optional*, defaults to "softmax"):
+ The function to apply to the model output. By default, the pipeline will apply the softmax function to
+ the output of the model. Valid options: ["softmax", "sigmoid", "none"]. Note that passing Python's
+ built-in `None` will default to "softmax", so you need to pass the string "none" to disable any
+ post-processing.
+
+ Return:
+ A list of `dict` with the following keys:
+
+ - **label** (`str`) -- The label predicted.
+ - **score** (`float`) -- The corresponding probability.
+ """
+ return super().__call__(inputs, **kwargs)
+
+ def _sanitize_parameters(self, top_k=None, function_to_apply=None, **kwargs):
+        # Only postprocessing parameters can be configured on this pipeline
+ postprocess_params = {}
+ if top_k is not None:
+ if top_k > self.model.config.num_labels:
+ top_k = self.model.config.num_labels
+ postprocess_params["top_k"] = top_k
+ if function_to_apply is not None:
+ if function_to_apply not in ["softmax", "sigmoid", "none"]:
+ raise ValueError(
+ f"Invalid value for `function_to_apply`: {function_to_apply}. "
+ "Valid options are ['softmax', 'sigmoid', 'none']"
+ )
+ postprocess_params["function_to_apply"] = function_to_apply
+ else:
+ postprocess_params["function_to_apply"] = "softmax"
+ return {}, {}, postprocess_params
+
+ def preprocess(self, inputs):
+ if isinstance(inputs, str):
+ if inputs.startswith("http://") or inputs.startswith("https://"):
+ # We need to actually check for a real protocol, otherwise it's impossible to use a local file
+ # like http_huggingface_co.png
+ inputs = requests.get(inputs).content
+ else:
+ with open(inputs, "rb") as f:
+ inputs = f.read()
+
+ if isinstance(inputs, bytes):
+ inputs = ffmpeg_read(inputs, self.feature_extractor.sampling_rate)
+
+ if isinstance(inputs, dict):
+ # Accepting `"array"` which is the key defined in `datasets` for
+ # better integration
+ if not ("sampling_rate" in inputs and ("raw" in inputs or "array" in inputs)):
+ raise ValueError(
+ "When passing a dictionary to AudioClassificationPipeline, the dict needs to contain a "
+ '"raw" key containing the numpy array representing the audio and a "sampling_rate" key, '
+ "containing the sampling_rate associated with that array"
+ )
+
+ _inputs = inputs.pop("raw", None)
+ if _inputs is None:
+ # Remove path which will not be used from `datasets`.
+ inputs.pop("path", None)
+ _inputs = inputs.pop("array", None)
+ in_sampling_rate = inputs.pop("sampling_rate")
+ inputs = _inputs
+ if in_sampling_rate != self.feature_extractor.sampling_rate:
+ import torch
+
+ if is_torchaudio_available():
+ from torchaudio import functional as F
+ else:
+ raise ImportError(
+ "torchaudio is required to resample audio samples in AudioClassificationPipeline. "
+ "The torchaudio package can be installed through: `pip install torchaudio`."
+ )
+
+ inputs = F.resample(
+ torch.from_numpy(inputs), in_sampling_rate, self.feature_extractor.sampling_rate
+ ).numpy()
+
+ if not isinstance(inputs, np.ndarray):
+ raise TypeError("We expect a numpy ndarray as input")
+ if len(inputs.shape) != 1:
+ raise ValueError("We expect a single channel audio input for AudioClassificationPipeline")
+
+ processed = self.feature_extractor(
+ inputs, sampling_rate=self.feature_extractor.sampling_rate, return_tensors="pt"
+ )
+ return processed
+
+ def _forward(self, model_inputs):
+ model_outputs = self.model(**model_inputs)
+ return model_outputs
+
+ def postprocess(self, model_outputs, top_k=5, function_to_apply="softmax"):
+ if function_to_apply == "softmax":
+ probs = model_outputs.logits[0].softmax(-1)
+ elif function_to_apply == "sigmoid":
+ probs = model_outputs.logits[0].sigmoid()
+ else:
+ probs = model_outputs.logits[0]
+ scores, ids = probs.topk(top_k)
+
+ scores = scores.tolist()
+ ids = ids.tolist()
+
+ labels = [{"score": score, "label": self.model.config.id2label[_id]} for score, _id in zip(scores, ids)]
+
+ return labels
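+
+# Usage sketch for the whole pipeline, reusing the "superb/wav2vec2-base-superb-ks"
+# checkpoint from the docstring example above and assuming "sample.flac" exists:
+#
+#     from transformers import pipeline
+#
+#     classifier = pipeline("audio-classification", model="superb/wav2vec2-base-superb-ks")
+#     # `top_k` and `function_to_apply` are routed through `_sanitize_parameters`
+#     preds = classifier("sample.flac", top_k=3, function_to_apply="sigmoid")
+#     # -> a list of at most 3 dicts, each with "score" and "label" keys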
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/audio_utils.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/audio_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..72a5f51db6129ae46939a5f2d640d286f479749f
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/audio_utils.py
@@ -0,0 +1,297 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+import datetime
+import platform
+import subprocess
+from typing import Optional, Tuple, Union
+
+import numpy as np
+
+
+def ffmpeg_read(bpayload: bytes, sampling_rate: int) -> np.ndarray:
+ """
+ Helper function to read an audio file through ffmpeg.
+ """
+ ar = f"{sampling_rate}"
+ ac = "1"
+ format_for_conversion = "f32le"
+ ffmpeg_command = [
+ "ffmpeg",
+ "-i",
+ "pipe:0",
+ "-ac",
+ ac,
+ "-ar",
+ ar,
+ "-f",
+ format_for_conversion,
+ "-hide_banner",
+ "-loglevel",
+ "quiet",
+ "pipe:1",
+ ]
+
+ try:
+ with subprocess.Popen(ffmpeg_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE) as ffmpeg_process:
+ output_stream = ffmpeg_process.communicate(bpayload)
+ except FileNotFoundError as error:
+ raise ValueError("ffmpeg was not found but is required to load audio files from filename") from error
+ out_bytes = output_stream[0]
+ audio = np.frombuffer(out_bytes, np.float32)
+ if audio.shape[0] == 0:
+ raise ValueError(
+ "Soundfile is either not in the correct format or is malformed. Ensure that the soundfile has "
+ "a valid audio file extension (e.g. wav, flac or mp3) and is not corrupted. If reading from a remote "
+ "URL, ensure that the URL is the full address to **download** the audio file."
+ )
+ return audio
+
+
+def ffmpeg_microphone(
+ sampling_rate: int,
+ chunk_length_s: float,
+ format_for_conversion: str = "f32le",
+ ffmpeg_input_device: Optional[str] = None,
+ ffmpeg_additional_args: Optional[list[str]] = None,
+):
+ """
+ Helper function to read audio from a microphone using ffmpeg. The default input device will be used unless another
+ input device is specified using the `ffmpeg_input_device` argument. Uses 'alsa' on Linux, 'avfoundation' on MacOS and
+ 'dshow' on Windows.
+
+ Arguments:
+ sampling_rate (`int`):
+ The sampling_rate to use when reading the data from the microphone. Try using the model's sampling_rate to
+ avoid resampling later.
+ chunk_length_s (`float` or `int`):
+            The length of the maximum chunk of audio to be returned.
+ format_for_conversion (`str`, defaults to `f32le`):
+ The name of the format of the audio samples to be returned by ffmpeg. The standard is `f32le`, `s16le`
+ could also be used.
+ ffmpeg_input_device (`str`, *optional*):
+ The identifier of the input device to be used by ffmpeg (i.e. ffmpeg's '-i' argument). If unset,
+ the default input device will be used. See `https://www.ffmpeg.org/ffmpeg-devices.html#Input-Devices`
+ for how to specify and list input devices.
+ ffmpeg_additional_args (`list[str]`, *optional*):
+ Additional arguments to pass to ffmpeg, can include arguments like -nostdin for running as a background
+ process. For example, to pass -nostdin to the ffmpeg process, pass in ["-nostdin"]. If passing in flags
+            with multiple arguments, use the following convention (e.g. ["flag", "arg1", "arg2"]).
+
+ Returns:
+ A generator yielding audio chunks of `chunk_length_s` seconds as `bytes` objects of length
+ `int(round(sampling_rate * chunk_length_s)) * size_of_sample`.
+ """
+ ar = f"{sampling_rate}"
+ ac = "1"
+ if format_for_conversion == "s16le":
+ size_of_sample = 2
+ elif format_for_conversion == "f32le":
+ size_of_sample = 4
+ else:
+ raise ValueError(f"Unhandled format `{format_for_conversion}`. Please use `s16le` or `f32le`")
+
+ system = platform.system()
+
+ if system == "Linux":
+ format_ = "alsa"
+ input_ = ffmpeg_input_device or "default"
+ elif system == "Darwin":
+ format_ = "avfoundation"
+ input_ = ffmpeg_input_device or ":default"
+ elif system == "Windows":
+ format_ = "dshow"
+ input_ = ffmpeg_input_device or _get_microphone_name()
+
+ ffmpeg_additional_args = [] if ffmpeg_additional_args is None else ffmpeg_additional_args
+
+ ffmpeg_command = [
+ "ffmpeg",
+ "-f",
+ format_,
+ "-i",
+ input_,
+ "-ac",
+ ac,
+ "-ar",
+ ar,
+ "-f",
+ format_for_conversion,
+ "-fflags",
+ "nobuffer",
+ "-hide_banner",
+ "-loglevel",
+ "quiet",
+ "pipe:1",
+ ]
+
+ ffmpeg_command.extend(ffmpeg_additional_args)
+
+ chunk_len = int(round(sampling_rate * chunk_length_s)) * size_of_sample
+ iterator = _ffmpeg_stream(ffmpeg_command, chunk_len)
+ for item in iterator:
+ yield item
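+
+# Usage sketch: pulling one-second raw chunks from the default microphone,
+# assuming ffmpeg and a working capture device are available on this machine:
+#
+#     for chunk in ffmpeg_microphone(sampling_rate=16_000, chunk_length_s=1.0):
+#         # each `chunk` is up to 16_000 * 4 bytes of f32le mono samples
+#         handle_chunk(chunk)  # `handle_chunk` is a placeholder for user code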
+
+
+def ffmpeg_microphone_live(
+ sampling_rate: int,
+ chunk_length_s: float,
+ stream_chunk_s: Optional[int] = None,
+ stride_length_s: Optional[Union[Tuple[float, float], float]] = None,
+ format_for_conversion: str = "f32le",
+ ffmpeg_input_device: Optional[str] = None,
+ ffmpeg_additional_args: Optional[list[str]] = None,
+):
+ """
+ Helper function to read audio from a microphone using ffmpeg. This will output `partial` overlapping chunks starting
+ from `stream_chunk_s` (if it is defined) until `chunk_length_s` is reached. It will make use of striding to avoid
+ errors on the "sides" of the various chunks. The default input device will be used unless another input device is
+ specified using the `ffmpeg_input_device` argument. Uses 'alsa' on Linux, 'avfoundation' on MacOS and 'dshow' on Windows.
+
+ Arguments:
+ sampling_rate (`int`):
+ The sampling_rate to use when reading the data from the microphone. Try using the model's sampling_rate to
+ avoid resampling later.
+ chunk_length_s (`float` or `int`):
+            The length of the maximum chunk of audio to be returned. This includes any striding.
+ stream_chunk_s (`float` or `int`):
+ The length of the minimal temporary audio to be returned.
+ stride_length_s (`float` or `int` or `(float, float)`, *optional*):
+ The length of the striding to be used. Stride is used to provide context to a model on the (left, right) of
+ an audio sample but without using that part to actually make the prediction. Setting this does not change
+ the length of the chunk.
+ format_for_conversion (`str`, *optional*, defaults to `f32le`):
+ The name of the format of the audio samples to be returned by ffmpeg. The standard is `f32le`, `s16le`
+ could also be used.
+ ffmpeg_input_device (`str`, *optional*):
+ The identifier of the input device to be used by ffmpeg (i.e. ffmpeg's '-i' argument). If unset,
+ the default input device will be used. See `https://www.ffmpeg.org/ffmpeg-devices.html#Input-Devices`
+ for how to specify and list input devices.
+ ffmpeg_additional_args (`list[str]`, *optional*):
+ Additional arguments to pass to ffmpeg, can include arguments like -nostdin for running as a background
+ process. For example, to pass -nostdin to the ffmpeg process, pass in ["-nostdin"]. If passing in flags
+            with multiple arguments, use the following convention (e.g. ["flag", "arg1", "arg2"]).
+
+ Return:
+ A generator yielding dictionaries of the following form
+
+    `{"sampling_rate": int, "raw": np.array(), "partial": bool}` with, optionally, a `"stride": (int, int)` key if
+ `stride_length_s` is defined.
+
+    `stride` and `raw` are both expressed in `samples`, and `partial` is a boolean indicating whether the current
+    yield item is a whole chunk or a partial temporary result to be later replaced by another, larger chunk.
+ """
+ if stream_chunk_s is not None:
+ chunk_s = stream_chunk_s
+ else:
+ chunk_s = chunk_length_s
+
+ microphone = ffmpeg_microphone(
+ sampling_rate,
+ chunk_s,
+ format_for_conversion=format_for_conversion,
+ ffmpeg_input_device=ffmpeg_input_device,
+ ffmpeg_additional_args=[] if ffmpeg_additional_args is None else ffmpeg_additional_args,
+ )
+
+ if format_for_conversion == "s16le":
+ dtype = np.int16
+ size_of_sample = 2
+ elif format_for_conversion == "f32le":
+ dtype = np.float32
+ size_of_sample = 4
+ else:
+ raise ValueError(f"Unhandled format `{format_for_conversion}`. Please use `s16le` or `f32le`")
+
+ if stride_length_s is None:
+ stride_length_s = chunk_length_s / 6
+ chunk_len = int(round(sampling_rate * chunk_length_s)) * size_of_sample
+ if isinstance(stride_length_s, (int, float)):
+ stride_length_s = [stride_length_s, stride_length_s]
+
+ stride_left = int(round(sampling_rate * stride_length_s[0])) * size_of_sample
+ stride_right = int(round(sampling_rate * stride_length_s[1])) * size_of_sample
+ audio_time = datetime.datetime.now()
+ delta = datetime.timedelta(seconds=chunk_s)
+ for item in chunk_bytes_iter(microphone, chunk_len, stride=(stride_left, stride_right), stream=True):
+ # Put everything back in numpy scale
+ item["raw"] = np.frombuffer(item["raw"], dtype=dtype)
+ item["stride"] = (
+ item["stride"][0] // size_of_sample,
+ item["stride"][1] // size_of_sample,
+ )
+ item["sampling_rate"] = sampling_rate
+ audio_time += delta
+ if datetime.datetime.now() > audio_time + 10 * delta:
+ # We're late !! SKIP
+ continue
+ yield item
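+
+# Usage sketch: feeding live microphone audio into an ASR pipeline, assuming
+# ffmpeg, a working microphone and the example checkpoint are available:
+#
+#     from transformers import pipeline
+#
+#     transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base")
+#     mic = ffmpeg_microphone_live(
+#         sampling_rate=transcriber.feature_extractor.sampling_rate,
+#         chunk_length_s=5.0,
+#         stream_chunk_s=1.0,
+#     )
+#     for result in transcriber(mic):
+#         print(result["text"])  # partial results are refined as chunks grow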
+
+
+def chunk_bytes_iter(iterator, chunk_len: int, stride: Tuple[int, int], stream: bool = False):
+ """
+ Reads raw bytes from an iterator and does chunks of length `chunk_len`. Optionally adds `stride` to each chunks to
+ get overlaps. `stream` is used to return partial results even if a full `chunk_len` is not yet available.
+ """
+ acc = b""
+ stride_left, stride_right = stride
+ if stride_left + stride_right >= chunk_len:
+ raise ValueError(
+ f"Stride needs to be strictly smaller than chunk_len: ({stride_left}, {stride_right}) vs {chunk_len}"
+ )
+ _stride_left = 0
+ for raw in iterator:
+ acc += raw
+ if stream and len(acc) < chunk_len:
+ stride = (_stride_left, 0)
+ yield {"raw": acc[:chunk_len], "stride": stride, "partial": True}
+ else:
+ while len(acc) >= chunk_len:
+ # We are flushing the accumulator
+ stride = (_stride_left, stride_right)
+ item = {"raw": acc[:chunk_len], "stride": stride}
+ if stream:
+ item["partial"] = False
+ yield item
+ _stride_left = stride_left
+ acc = acc[chunk_len - stride_left - stride_right :]
+ # Last chunk
+ if len(acc) > stride_left:
+ item = {"raw": acc, "stride": (_stride_left, 0)}
+ if stream:
+ item["partial"] = False
+ yield item
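+
+# Illustrative sketch of the chunking logic above, on tiny byte strings chosen
+# only to make the stride bookkeeping easy to follow:
+#
+#     list(chunk_bytes_iter(iter([b"abcdefgh", b"ijkl"]), chunk_len=6, stride=(1, 1)))
+#     # -> [{"raw": b"abcdef", "stride": (0, 1)},
+#     #     {"raw": b"efghij", "stride": (1, 1)},
+#     #     {"raw": b"ijkl", "stride": (1, 0)}]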
+
+
+def _ffmpeg_stream(ffmpeg_command, buflen: int):
+ """
+ Internal function to create the generator of data through ffmpeg
+ """
+    bufsize = 2**24  # 16MB
+ try:
+ with subprocess.Popen(ffmpeg_command, stdout=subprocess.PIPE, bufsize=bufsize) as ffmpeg_process:
+ while True:
+ raw = ffmpeg_process.stdout.read(buflen)
+ if raw == b"":
+ break
+ yield raw
+ except FileNotFoundError as error:
+ raise ValueError("ffmpeg was not found but is required to stream audio files from filename") from error
+
+
+def _get_microphone_name():
+ """
+    Retrieve the microphone name on Windows.
+ """
+ command = ["ffmpeg", "-list_devices", "true", "-f", "dshow", "-i", ""]
+
+ try:
+ ffmpeg_devices = subprocess.run(command, text=True, stderr=subprocess.PIPE, encoding="utf-8")
+ microphone_lines = [line for line in ffmpeg_devices.stderr.splitlines() if "(audio)" in line]
+
+ if microphone_lines:
+ microphone_name = microphone_lines[0].split('"')[1]
+ print(f"Using microphone: {microphone_name}")
+ return f"audio={microphone_name}"
+ except FileNotFoundError:
+ print("ffmpeg was not found. Please install it or make sure it is in your system PATH.")
+
+ return "default"
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/automatic_speech_recognition.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/automatic_speech_recognition.py
new file mode 100644
index 0000000000000000000000000000000000000000..66a9c49ea5f3516053fa9d5835109dcd53e3ff1a
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/automatic_speech_recognition.py
@@ -0,0 +1,766 @@
+# Copyright 2021 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import warnings
+from collections import defaultdict
+from typing import TYPE_CHECKING, Dict, Optional, Union
+
+import numpy as np
+import requests
+
+from ..tokenization_utils import PreTrainedTokenizer
+from ..utils import is_torch_available, is_torchaudio_available, logging
+from .audio_utils import ffmpeg_read
+from .base import ChunkPipeline
+
+
+if TYPE_CHECKING:
+ from pyctcdecode import BeamSearchDecoderCTC
+
+ from ..feature_extraction_sequence_utils import SequenceFeatureExtractor
+ from ..modeling_utils import PreTrainedModel
+
+logger = logging.get_logger(__name__)
+
+if is_torch_available():
+ import torch
+
+ from ..models.auto.modeling_auto import MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES
+
+
+def rescale_stride(stride, ratio):
+ """
+ Rescales the stride values from audio space to tokens/logits space.
+
+ (160_000, 16_000, 16_000) -> (2000, 200, 200) for instance.
+ """
+ # Shape is [B, SEQ] for tokens
+ # [B, SEQ, V] for logits
+
+ new_strides = []
+ for input_n, left, right in stride:
+ token_n = int(round(input_n * ratio))
+ left = int(round(left / input_n * token_n))
+ right = int(round(right / input_n * token_n))
+ new_stride = (token_n, left, right)
+ new_strides.append(new_stride)
+
+ return new_strides
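+
+# Illustrative sketch of the docstring example above: 160_000 audio samples
+# mapping onto 2_000 tokens/logits gives ratio = 2_000 / 160_000:
+#
+#     rescale_stride([(160_000, 16_000, 16_000)], ratio=2_000 / 160_000)
+#     # -> [(2000, 200, 200)]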
+
+
+def chunk_iter(inputs, feature_extractor, chunk_len, stride_left, stride_right, dtype=None):
+ inputs_len = inputs.shape[0]
+ step = chunk_len - stride_left - stride_right
+ for chunk_start_idx in range(0, inputs_len, step):
+ chunk_end_idx = chunk_start_idx + chunk_len
+ chunk = inputs[chunk_start_idx:chunk_end_idx]
+ processed = feature_extractor(chunk, sampling_rate=feature_extractor.sampling_rate, return_tensors="pt")
+ if dtype is not None:
+ processed = processed.to(dtype=dtype)
+ _stride_left = 0 if chunk_start_idx == 0 else stride_left
+ is_last = chunk_end_idx >= inputs_len
+ _stride_right = 0 if is_last else stride_right
+
+ chunk_len = chunk.shape[0]
+ stride = (chunk_len, _stride_left, _stride_right)
+ if chunk.shape[0] > _stride_left:
+ yield {"is_last": is_last, "stride": stride, **processed}
+ if is_last:
+ break
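+
+# Illustrative sketch of how `chunk_iter` walks a waveform: with inputs_len=10,
+# chunk_len=6, stride_left=2 and stride_right=2 the step is 6 - 2 - 2 = 2, so
+# three chunks are yielded with strides
+#
+#     (6, 0, 2)  # samples 0..6, first chunk has no left stride
+#     (6, 2, 2)  # samples 2..8
+#     (6, 2, 0)  # samples 4..10, is_last=True, right stride dropped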
+
+
+def _fast_find_longest_common_sequence(sequence_left, sequence_right):
+ seq_len_left = len(sequence_left)
+ seq_len_right = len(sequence_right)
+ counter = [[0] * (seq_len_right + 1) for _ in range(seq_len_left + 1)]
+ longest = 0
+ for i in range(seq_len_left):
+ for j in range(seq_len_right):
+ if sequence_left[i] == sequence_right[j]:
+ previous_counter = counter[i][j] + 1
+ counter[i + 1][j + 1] = previous_counter
+ if previous_counter > longest:
+ longest = previous_counter
+
+ counter = np.array(counter)
+ # we return the idx of the first element of the longest common sequence in the left sequence
+ index_left = np.argwhere(counter == longest)[-1][0] - longest if longest != 0 else -1
+ index_right = np.argwhere(counter == longest)[-1][1] - longest if longest != 0 else -1
+ return index_left, index_right, longest
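+
+# Illustrative sketch: the longest common run between [1, 2, 3, 4] and
+# [3, 4, 5] is [3, 4], starting at index 2 on the left and index 0 on the right:
+#
+#     _fast_find_longest_common_sequence([1, 2, 3, 4], [3, 4, 5])
+#     # -> (2, 0, 2)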
+
+
+def _find_longest_common_sequence(sequences, tokenizer):
+ # TODO Use a faster algorithm this can probably be done in O(n)
+ # using suffix array.
+ # It might be tedious to do because of fault tolerance.
+ # We actually have a really good property which is that the total sequence
+ # MUST be those subsequences in order.
+ # Also the algorithm should be more tolerant to errors.
+ sequence = [tok_id for tok_id in sequences[0][0].tolist() if tok_id not in tokenizer.all_special_ids]
+ for new_seq in sequences[1:]:
+ new_sequence = [tok_id for tok_id in new_seq[0].tolist() if tok_id not in tokenizer.all_special_ids]
+
+ index = 0
+ max_ = 0.0
+ for i in range(1, len(new_sequence) + 1):
+ # epsilon to favor long perfect matches
+ eps = i / 10000.0
+ matches = np.sum(np.array(sequence[-i:]) == np.array(new_sequence[:i]))
+ matching = matches / i + eps
+ if matches > 1 and matching > max_:
+ index = i
+ max_ = matching
+ sequence.extend(new_sequence[index:])
+ return np.array(sequence)
+
+
+class AutomaticSpeechRecognitionPipeline(ChunkPipeline):
+ """
+ Pipeline that aims at extracting spoken text contained within some audio.
+
+    The input can be either a raw waveform or an audio file. In the case of an audio file, ffmpeg should be installed
+    to support multiple audio formats.
+
+ Example:
+
+ ```python
+ >>> from transformers import pipeline
+
+ >>> transcriber = pipeline(model="openai/whisper-base")
+ >>> transcriber("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/1.flac")
+ {'text': ' He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered flour-fatten sauce.'}
+ ```
+
+ Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)
+
+ Arguments:
+ model ([`PreTrainedModel`] or [`TFPreTrainedModel`]):
+ The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from
+ [`PreTrainedModel`] for PyTorch and [`TFPreTrainedModel`] for TensorFlow.
+ feature_extractor ([`SequenceFeatureExtractor`]):
+ The feature extractor that will be used by the pipeline to encode waveform for the model.
+ tokenizer ([`PreTrainedTokenizer`]):
+ The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from
+ [`PreTrainedTokenizer`].
+ decoder (`pyctcdecode.BeamSearchDecoderCTC`, *optional*):
+ [PyCTCDecode's
+ BeamSearchDecoderCTC](https://github.com/kensho-technologies/pyctcdecode/blob/2fd33dc37c4111417e08d89ccd23d28e9b308d19/pyctcdecode/decoder.py#L180)
+ can be passed for language model boosted decoding. See [`Wav2Vec2ProcessorWithLM`] for more information.
+ chunk_length_s (`float`, *optional*, defaults to 0):
+            The input length for each chunk. If `chunk_length_s = 0` then chunking is disabled (default).
+
+
+
+ For more information on how to effectively use `chunk_length_s`, please have a look at the [ASR chunking
+ blog post](https://huggingface.co/blog/asr-chunking).
+
+
+
+ stride_length_s (`float`, *optional*, defaults to `chunk_length_s / 6`):
+ The length of stride on the left and right of each chunk. Used only with `chunk_length_s > 0`. This enables
+ the model to *see* more context and infer letters better than without this context but the pipeline
+ discards the stride bits at the end to make the final reconstitution as perfect as possible.
+
+
+
+ For more information on how to effectively use `stride_length_s`, please have a look at the [ASR chunking
+ blog post](https://huggingface.co/blog/asr-chunking).
+
+
+
+ framework (`str`, *optional*):
+ The framework to use, either `"pt"` for PyTorch or `"tf"` for TensorFlow. The specified framework must be
+ installed. If no framework is specified, will default to the one currently installed. If no framework is
+ specified and both frameworks are installed, will default to the framework of the `model`, or to PyTorch if
+ no model is provided.
+ device (Union[`int`, `torch.device`], *optional*):
+            Device ordinal for CPU/GPU support. Setting this to `None` will leverage CPU; a positive integer will run
+            the model on the associated CUDA device id.
+ torch_dtype (Union[`int`, `torch.dtype`], *optional*):
+ The data-type (dtype) of the computation. Setting this to `None` will use float32 precision. Set to
+ `torch.float16` or `torch.bfloat16` to use half-precision in the respective dtypes.
+
+ """
+
+ def __init__(
+ self,
+ model: "PreTrainedModel",
+ feature_extractor: Union["SequenceFeatureExtractor", str] = None,
+ tokenizer: Optional[PreTrainedTokenizer] = None,
+ decoder: Optional[Union["BeamSearchDecoderCTC", str]] = None,
+ device: Union[int, "torch.device"] = None,
+ torch_dtype: Optional[Union[str, "torch.dtype"]] = None,
+ **kwargs,
+ ):
+ # set the model type so we can check we have the right pre- and post-processing parameters
+ if model.config.model_type == "whisper":
+ self.type = "seq2seq_whisper"
+ elif model.__class__.__name__ in MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES.values():
+ self.type = "seq2seq"
+ elif (
+ feature_extractor._processor_class
+ and feature_extractor._processor_class.endswith("WithLM")
+ and decoder is not None
+ ):
+ self.decoder = decoder
+ self.type = "ctc_with_lm"
+ else:
+ self.type = "ctc"
+
+ super().__init__(model, tokenizer, feature_extractor, device=device, torch_dtype=torch_dtype, **kwargs)
+
+ def __call__(
+ self,
+ inputs: Union[np.ndarray, bytes, str],
+ **kwargs,
+ ):
+ """
+ Transcribe the audio sequence(s) given as inputs to text. See the [`AutomaticSpeechRecognitionPipeline`]
+ documentation for more information.
+
+ Args:
+ inputs (`np.ndarray` or `bytes` or `str` or `dict`):
+                The inputs are either:
+ - `str` that is either the filename of a local audio file, or a public URL address to download the
+ audio file. The file will be read at the correct sampling rate to get the waveform using
+ *ffmpeg*. This requires *ffmpeg* to be installed on the system.
+ - `bytes` it is supposed to be the content of an audio file and is interpreted by *ffmpeg* in the
+ same way.
+ - (`np.ndarray` of shape (n, ) of type `np.float32` or `np.float64`)
+ Raw audio at the correct sampling rate (no further check will be done)
+ - `dict` form can be used to pass raw audio sampled at arbitrary `sampling_rate` and let this
+ pipeline do the resampling. The dict must be in the format `{"sampling_rate": int, "raw":
+                  np.array}` with optionally a `"stride": (left: int, right: int)` that asks the pipeline to ignore
+                  the first `left` samples and last `right` samples in decoding (but use them at inference to provide
+                  more context to the model). Only use `stride` with CTC models.
+ return_timestamps (*optional*, `str` or `bool`):
+ Only available for pure CTC models (Wav2Vec2, HuBERT, etc) and the Whisper model. Not available for
+ other sequence-to-sequence models.
+
+ For CTC models, timestamps can take one of two formats:
+ - `"char"`: the pipeline will return timestamps along the text for every character in the text. For
+ instance, if you get `[{"text": "h", "timestamp": (0.5, 0.6)}, {"text": "i", "timestamp": (0.7,
+ 0.9)}]`, then it means the model predicts that the letter "h" was spoken after `0.5` and before
+ `0.6` seconds.
+ - `"word"`: the pipeline will return timestamps along the text for every word in the text. For
+ instance, if you get `[{"text": "hi ", "timestamp": (0.5, 0.9)}, {"text": "there", "timestamp":
+ (1.0, 1.5)}]`, then it means the model predicts that the word "hi" was spoken after `0.5` and
+ before `0.9` seconds.
+
+ For the Whisper model, timestamps can take one of two formats:
+ - `"word"`: same as above for word-level CTC timestamps. Word-level timestamps are predicted
+ through the *dynamic-time warping (DTW)* algorithm, an approximation to word-level timestamps
+ by inspecting the cross-attention weights.
+ - `True`: the pipeline will return timestamps along the text for *segments* of words in the text.
+ For instance, if you get `[{"text": " Hi there!", "timestamp": (0.5, 1.5)}]`, then it means the
+ model predicts that the segment "Hi there!" was spoken after `0.5` and before `1.5` seconds.
+ Note that a segment of text refers to a sequence of one or more words, rather than individual
+ words as with word-level timestamps.
+ generate_kwargs (`dict`, *optional*):
+ The dictionary of ad-hoc parametrization of `generate_config` to be used for the generation call. For a
+ complete overview of generate, check the [following
+ guide](https://huggingface.co/docs/transformers/en/main_classes/text_generation).
+
+ Return:
+ `Dict`: A dictionary with the following keys:
+ - **text** (`str`): The recognized text.
+            - **chunks** (*optional*, `List[Dict]`)
+ When using `return_timestamps`, the `chunks` will become a list containing all the various text
+ chunks identified by the model, *e.g.* `[{"text": "hi ", "timestamp": (0.5, 0.9)}, {"text":
+ "there", "timestamp": (1.0, 1.5)}]`. The original full text can roughly be recovered by doing
+ `"".join(chunk["text"] for chunk in output["chunks"])`.
+ """
+ return super().__call__(inputs, **kwargs)
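+
+    # Usage sketch for the call options documented above, assuming a Whisper
+    # checkpoint and a local "sample.flac" file (both names are only examples):
+    #
+    #     from transformers import pipeline
+    #
+    #     transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-base")
+    #     out = transcriber("sample.flac", return_timestamps="word", chunk_length_s=30)
+    #     # out["text"] is the transcript; out["chunks"] holds per-word timestamps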
+
+ def _sanitize_parameters(
+ self,
+ chunk_length_s=None,
+ stride_length_s=None,
+ ignore_warning=None,
+ decoder_kwargs=None,
+ return_timestamps=None,
+ return_language=None,
+ generate_kwargs=None,
+ max_new_tokens=None,
+ ):
+        # Split the call kwargs into preprocess, forward and postprocess parameters
+ preprocess_params = {}
+ if chunk_length_s is not None:
+ if self.type == "seq2seq" and not ignore_warning:
+ logger.warning(
+ "Using `chunk_length_s` is very experimental with seq2seq models. The results will not necessarily"
+ " be entirely accurate and will have caveats. More information:"
+ " https://github.com/huggingface/transformers/pull/20104. Ignore this warning with pipeline(...,"
+ " ignore_warning=True)"
+ )
+ preprocess_params["chunk_length_s"] = chunk_length_s
+ if stride_length_s is not None:
+ preprocess_params["stride_length_s"] = stride_length_s
+
+ forward_params = defaultdict(dict)
+ if max_new_tokens is not None:
+ warnings.warn(
+ "`max_new_tokens` is deprecated and will be removed in version 4.49 of Transformers. To remove this warning, pass `max_new_tokens` as a key inside `generate_kwargs` instead.",
+ FutureWarning,
+ )
+ forward_params["max_new_tokens"] = max_new_tokens
+ if generate_kwargs is not None:
+ if max_new_tokens is not None and "max_new_tokens" in generate_kwargs:
+ raise ValueError(
+ "`max_new_tokens` is defined both as an argument and inside `generate_kwargs` argument, please use"
+ " only 1 version"
+ )
+ forward_params.update(generate_kwargs)
+
+ postprocess_params = {}
+ if decoder_kwargs is not None:
+ postprocess_params["decoder_kwargs"] = decoder_kwargs
+ if return_timestamps is not None:
+ # Check whether we have a valid setting for return_timestamps and throw an error before we perform a forward pass
+ if self.type == "seq2seq" and return_timestamps:
+ raise ValueError("We cannot return_timestamps yet on non-CTC models apart from Whisper!")
+ if self.type == "ctc_with_lm" and return_timestamps != "word":
+ raise ValueError("CTC with LM can only predict word level timestamps, set `return_timestamps='word'`")
+ if self.type == "ctc" and return_timestamps not in ["char", "word"]:
+ raise ValueError(
+ "CTC can either predict character level timestamps, or word level timestamps. "
+ "Set `return_timestamps='char'` or `return_timestamps='word'` as required."
+ )
+ if self.type == "seq2seq_whisper" and return_timestamps == "char":
+ raise ValueError(
+ "Whisper cannot return `char` timestamps, only word level or segment level timestamps. "
+ "Use `return_timestamps='word'` or `return_timestamps=True` respectively."
+ )
+ forward_params["return_timestamps"] = return_timestamps
+ postprocess_params["return_timestamps"] = return_timestamps
+ if return_language is not None:
+ if self.type != "seq2seq_whisper":
+ raise ValueError("Only Whisper can return language for now.")
+ postprocess_params["return_language"] = return_language
+
+ if self.assistant_model is not None:
+ forward_params["assistant_model"] = self.assistant_model
+ if self.assistant_tokenizer is not None:
+ forward_params["tokenizer"] = self.tokenizer
+ forward_params["assistant_tokenizer"] = self.assistant_tokenizer
+
+ return preprocess_params, forward_params, postprocess_params
+
+ def preprocess(self, inputs, chunk_length_s=0, stride_length_s=None):
+ if isinstance(inputs, str):
+ if inputs.startswith("http://") or inputs.startswith("https://"):
+ # We need to actually check for a real protocol, otherwise it's impossible to use a local file
+ # like http_huggingface_co.png
+ inputs = requests.get(inputs).content
+ else:
+ with open(inputs, "rb") as f:
+ inputs = f.read()
+
+ if isinstance(inputs, bytes):
+ inputs = ffmpeg_read(inputs, self.feature_extractor.sampling_rate)
+
+ stride = None
+ extra = {}
+ if isinstance(inputs, dict):
+ stride = inputs.pop("stride", None)
+ # Accepting `"array"` which is the key defined in `datasets` for
+ # better integration
+ if not ("sampling_rate" in inputs and ("raw" in inputs or "array" in inputs)):
+ raise ValueError(
+ "When passing a dictionary to AutomaticSpeechRecognitionPipeline, the dict needs to contain a "
+ '"raw" key containing the numpy array representing the audio and a "sampling_rate" key, '
+ "containing the sampling_rate associated with that array"
+ )
+
+ _inputs = inputs.pop("raw", None)
+ if _inputs is None:
+ # Remove path which will not be used from `datasets`.
+ inputs.pop("path", None)
+ _inputs = inputs.pop("array", None)
+ in_sampling_rate = inputs.pop("sampling_rate")
+ extra = inputs
+ inputs = _inputs
+ if in_sampling_rate != self.feature_extractor.sampling_rate:
+ if is_torchaudio_available():
+ from torchaudio import functional as F
+ else:
+ raise ImportError(
+ "torchaudio is required to resample audio samples in AutomaticSpeechRecognitionPipeline. "
+ "The torchaudio package can be installed through: `pip install torchaudio`."
+ )
+
+ inputs = F.resample(
+ torch.from_numpy(inputs), in_sampling_rate, self.feature_extractor.sampling_rate
+ ).numpy()
+ ratio = self.feature_extractor.sampling_rate / in_sampling_rate
+ else:
+ ratio = 1
+ if stride is not None:
+ if stride[0] + stride[1] > inputs.shape[0]:
+ raise ValueError("Stride is too large for input")
+
+ # Stride needs to get the chunk length here, it's going to get
+ # swallowed by the `feature_extractor` later, and then batching
+ # can add extra data in the inputs, so we need to keep track
+ # of the original length in the stride so we can cut properly.
+ stride = (inputs.shape[0], int(round(stride[0] * ratio)), int(round(stride[1] * ratio)))
+ if not isinstance(inputs, np.ndarray):
+ raise TypeError(f"We expect a numpy ndarray as input, got `{type(inputs)}`")
+ if len(inputs.shape) != 1:
+ raise ValueError("We expect a single channel audio input for AutomaticSpeechRecognitionPipeline")
+
+ if chunk_length_s:
+ if stride_length_s is None:
+ stride_length_s = chunk_length_s / 6
+
+ if isinstance(stride_length_s, (int, float)):
+ stride_length_s = [stride_length_s, stride_length_s]
+
+            # XXX: Careful, this variable will not exist in the `seq2seq` setting.
+ # Currently chunking is not possible at this level for `seq2seq` so
+ # it's ok.
+ align_to = getattr(self.model.config, "inputs_to_logits_ratio", 1)
+ chunk_len = int(round(chunk_length_s * self.feature_extractor.sampling_rate / align_to) * align_to)
+ stride_left = int(round(stride_length_s[0] * self.feature_extractor.sampling_rate / align_to) * align_to)
+ stride_right = int(round(stride_length_s[1] * self.feature_extractor.sampling_rate / align_to) * align_to)
+
+ if chunk_len < stride_left + stride_right:
+                raise ValueError("Chunk length must be greater than stride length")
+
+ for item in chunk_iter(
+ inputs, self.feature_extractor, chunk_len, stride_left, stride_right, self.torch_dtype
+ ):
+ yield {**item, **extra}
+ else:
+ if self.type == "seq2seq_whisper" and inputs.shape[0] > self.feature_extractor.n_samples:
+ processed = self.feature_extractor(
+ inputs,
+ sampling_rate=self.feature_extractor.sampling_rate,
+ truncation=False,
+ padding="longest",
+ return_tensors="pt",
+ return_attention_mask=True,
+ )
+ else:
+ if self.type == "seq2seq_whisper" and stride is None:
+ processed = self.feature_extractor(
+ inputs,
+ sampling_rate=self.feature_extractor.sampling_rate,
+ return_tensors="pt",
+ return_token_timestamps=True,
+ return_attention_mask=True,
+ )
+ extra["num_frames"] = processed.pop("num_frames")
+ else:
+ processed = self.feature_extractor(
+ inputs,
+ sampling_rate=self.feature_extractor.sampling_rate,
+ return_tensors="pt",
+ return_attention_mask=True,
+ )
+ if self.torch_dtype is not None:
+ processed = processed.to(dtype=self.torch_dtype)
+ if stride is not None:
+ if self.type == "seq2seq":
+ raise ValueError("Stride is only usable with CTC models, try removing it !")
+
+ processed["stride"] = stride
+ yield {"is_last": True, **processed, **extra}
+
+ def _forward(self, model_inputs, return_timestamps=False, **generate_kwargs):
+ attention_mask = model_inputs.pop("attention_mask", None)
+ stride = model_inputs.pop("stride", None)
+ num_frames = model_inputs.pop("num_frames", None)
+ is_last = model_inputs.pop("is_last")
+
+ if stride is not None and num_frames is not None:
+ raise ValueError("num_frames must be used only when stride is None")
+
+ if self.type in {"seq2seq", "seq2seq_whisper"}:
+ # Consume values so we can let extra information flow freely through
+ # the pipeline (important for `partial` in microphone)
+ if "input_features" in model_inputs:
+ inputs = model_inputs.pop("input_features")
+ elif "input_values" in model_inputs:
+ inputs = model_inputs.pop("input_values")
+ else:
+ raise ValueError(
+ "Seq2Seq speech recognition model requires either a "
+ f"`input_features` or `input_values` key, but only has {model_inputs.keys()}"
+ )
+
+ # custom processing for Whisper timestamps and word-level timestamps
+ if return_timestamps and self.type == "seq2seq_whisper":
+ generate_kwargs["return_timestamps"] = return_timestamps
+ if return_timestamps == "word":
+ generate_kwargs["return_token_timestamps"] = True
+ generate_kwargs["return_segments"] = True
+
+ if stride is not None:
+ if isinstance(stride, tuple):
+ generate_kwargs["num_frames"] = stride[0] // self.feature_extractor.hop_length
+ else:
+ generate_kwargs["num_frames"] = [s[0] // self.feature_extractor.hop_length for s in stride]
+ else:
+ generate_kwargs["num_frames"] = num_frames
+
+ # User-defined `generation_config` passed to the pipeline call take precedence
+ if "generation_config" not in generate_kwargs:
+ generate_kwargs["generation_config"] = self.generation_config
+
+ tokens = self.model.generate(
+ inputs=inputs,
+ attention_mask=attention_mask,
+ **generate_kwargs,
+ )
+ # whisper longform generation stores timestamps in "segments"
+ if return_timestamps == "word" and self.type == "seq2seq_whisper":
+ if "segments" not in tokens:
+ out = {"tokens": tokens["sequences"], "token_timestamps": tokens["token_timestamps"]}
+ else:
+ token_timestamps = [
+ torch.cat([segment["token_timestamps"] for segment in segment_list])
+ for segment_list in tokens["segments"]
+ ]
+ out = {"tokens": tokens["sequences"], "token_timestamps": token_timestamps}
+ else:
+ out = {"tokens": tokens}
+ if self.type == "seq2seq_whisper":
+ if stride is not None:
+ out["stride"] = stride
+
+ else:
+ inputs = {
+ self.model.main_input_name: model_inputs.pop(self.model.main_input_name),
+ "attention_mask": attention_mask,
+ }
+ outputs = self.model(**inputs)
+ logits = outputs.logits
+
+ if self.type == "ctc_with_lm":
+ out = {"logits": logits}
+ else:
+ out = {"tokens": logits.argmax(dim=-1)}
+ if stride is not None:
+ # Send stride to `postprocess`.
+ # it needs to be handled there where
+ # the pieces are to be concatenated.
+ ratio = 1 / self.model.config.inputs_to_logits_ratio
+ if isinstance(stride, tuple):
+ out["stride"] = rescale_stride([stride], ratio)[0]
+ else:
+ out["stride"] = rescale_stride(stride, ratio)
+ # Leftover
+ extra = model_inputs
+ return {"is_last": is_last, **out, **extra}
+
+ def postprocess(
+ self, model_outputs, decoder_kwargs: Optional[Dict] = None, return_timestamps=None, return_language=None
+ ):
+ # Optional return types
+ optional = {}
+
+ final_items = []
+ key = "logits" if self.type == "ctc_with_lm" else "tokens"
+ stride = None
+ for outputs in model_outputs:
+ if self.framework == "pt" and outputs[key].dtype in (torch.bfloat16, torch.float16):
+ items = outputs[key].to(torch.float32).numpy()
+ else:
+ items = outputs[key].numpy()
+ stride = outputs.get("stride", None)
+ if stride is not None and self.type in {"ctc", "ctc_with_lm"}:
+ total_n, left, right = stride
+ # Total_n might be < logits.shape[1]
+ # because of padding, that's why
+ # we need to reconstruct this information
+ # This won't work with left padding (which doesn't exist right now)
+ right_n = total_n - right
+ items = items[:, left:right_n]
+ final_items.append(items)
+
+ if stride and self.type == "seq2seq":
+ items = _find_longest_common_sequence(final_items, self.tokenizer)
+ elif self.type == "seq2seq_whisper":
+ time_precision = self.feature_extractor.chunk_length / self.model.config.max_source_positions
+ # Send the chunking back to seconds, it's easier to handle in whisper
+ sampling_rate = self.feature_extractor.sampling_rate
+ for output in model_outputs:
+ if "stride" in output:
+ chunk_len, stride_left, stride_right = output["stride"]
+ # Go back in seconds
+ chunk_len /= sampling_rate
+ stride_left /= sampling_rate
+ stride_right /= sampling_rate
+ output["stride"] = chunk_len, stride_left, stride_right
+
+ text, optional = self.tokenizer._decode_asr(
+ model_outputs,
+ return_timestamps=return_timestamps,
+ return_language=return_language,
+ time_precision=time_precision,
+ )
+ else:
+ items = np.concatenate(final_items, axis=1)
+ items = items.squeeze(0)
+
+ if self.type == "ctc_with_lm":
+ if decoder_kwargs is None:
+ decoder_kwargs = {}
+ beams = self.decoder.decode_beams(items, **decoder_kwargs)
+ text = beams[0][0]
+ if return_timestamps:
+ # Simply cast from pyctcdecode format to wav2vec2 format to leverage
+ # pre-existing code later
+ chunk_offset = beams[0][2]
+ offsets = []
+ for word, (start_offset, end_offset) in chunk_offset:
+ offsets.append({"word": word, "start_offset": start_offset, "end_offset": end_offset})
+ elif self.type != "seq2seq_whisper":
+ skip_special_tokens = self.type != "ctc"
+ text = self.tokenizer.decode(items, skip_special_tokens=skip_special_tokens)
+ if return_timestamps:
+ offsets = self.tokenizer.decode(
+ items, skip_special_tokens=skip_special_tokens, output_char_offsets=True
+ )["char_offsets"]
+ if return_timestamps == "word":
+ offsets = self.tokenizer._get_word_offsets(offsets, self.tokenizer.replace_word_delimiter_char)
+
+ if return_timestamps and self.type not in {"seq2seq", "seq2seq_whisper"}:
+ chunks = []
+ for item in offsets:
+ start = item["start_offset"] * self.model.config.inputs_to_logits_ratio
+ start /= self.feature_extractor.sampling_rate
+
+ stop = item["end_offset"] * self.model.config.inputs_to_logits_ratio
+ stop /= self.feature_extractor.sampling_rate
+
+ chunks.append({"text": item[return_timestamps], "timestamp": (start, stop)})
+ optional["chunks"] = chunks
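+ # Editor's sketch (illustrative, the numbers are made-up assumptions): a CTC char
+ # offset is converted to seconds as offset * inputs_to_logits_ratio / sampling_rate.
+ # E.g. a start_offset of 50 with a hypothetical ratio of 320 and a 16 kHz sampling
+ # rate gives 50 * 320 / 16000 = 1.0 second.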
+
+ extra = defaultdict(list)
+ for output in model_outputs:
+ output.pop("tokens", None)
+ output.pop("logits", None)
+ output.pop("is_last", None)
+ output.pop("stride", None)
+ output.pop("token_timestamps", None)
+ for k, v in output.items():
+ extra[k].append(v)
+ return {"text": text, **optional, **extra}
+
+
+def _find_timestamp_sequence(sequences, tokenizer, feature_extractor, max_source_positions):
+ """
+ Computes the final sequences by merging the end of the nth sequence with the beginning of the n+1th sequence. Since
+ `WhisperForConditionalGeneration` produces the timestamps pairwise, we filter the consecutive timestamps and only
+ iterate over them. We keep track of the `time` which indicates the actual starting time of the chunk that is
+ processed. We need to make sure to offset the timestamp tokens by the `time` in order for the tokenizer to
+ properly compute the final `offset`.
+ """
+ # index of the first timestamp token
+ timestamp_begin = tokenizer.convert_tokens_to_ids("<|notimestamps|>") + 1
+ items = []
+ # approximation of the token-to-time ratio: ~0.02 seconds (chunk_length / max_source_positions)
+ time_precision = feature_extractor.chunk_length / max_source_positions
+ time = 0
+ for seq_idx, item in enumerate(sequences):
+ sequence, stride = item
+ if isinstance(sequence, list):
+ sequence = np.array(sequence)
+ chunk_len, stride_left, stride_right = stride
+ sequence = sequence.squeeze(0)
+ # get rid of the `forced_decoder_idx` that are used to parametrize the generation
+ begin_idx = np.where(sequence == timestamp_begin)[0][0] if timestamp_begin in sequence else 0
+ sequence = sequence[begin_idx:]
+
+ timestamp_tokens = sequence >= timestamp_begin
+ if seq_idx != 0 and sum(timestamp_tokens) > 0:
+ consecutive = np.where(timestamp_tokens[:-1] & timestamp_tokens[1:])[0] + 1
+ last_timestamp = np.where(timestamp_tokens)[0][-1]
+ consecutive = np.append(consecutive, last_timestamp) if last_timestamp not in consecutive else consecutive
+ time -= stride_left + stride_right
+ offset = int((time / feature_extractor.sampling_rate) / time_precision)
+ overlap_time = int((stride_left / feature_extractor.sampling_rate) / time_precision)
+ # relevant timestamps are in the overlapping part
+ relevant_timestamp = np.where(sequence[consecutive] >= timestamp_begin + overlap_time)[0]
+ if relevant_timestamp.shape[0] > 0:
+ relevant_timestamp = (
+ consecutive[relevant_timestamp[0] - 1] if relevant_timestamp[0] > 0 else consecutive[0]
+ )
+ # if a big stride is used, we need to check some of the previous items for the best overlap
+ best_match = 0
+ sliced_sequence = []
+ for idx, previous_sequence in enumerate(reversed(items)):
+ previous_tokens = previous_sequence[1:-1]
+ if previous_sequence[0] < (timestamp_begin + offset - overlap_time) and idx != 0:
+ break # the previous sequence is too far in the past
+ if len(previous_tokens) > 0:
+ # find the longest common sequence between the overlapping parts
+ index_left, index_right, match_length = _fast_find_longest_common_sequence(
+ sequence[1:relevant_timestamp], previous_tokens
+ )
+ # don't do anything if only 1 token was matched
+ if match_length > 1 and match_length > best_match:
+ best_match = match_length
+ best_idx = idx
+ end_of_curr_sequence_idx = (
+ np.where(sequence[index_left + 1 :] >= timestamp_begin)[0][0] + 1
+ )
+ end_of_curr_sequence_idx = end_of_curr_sequence_idx + 1 + index_left
+ # if all the tokens are matched, suffix
+ if index_left == 0 and match_length == len(previous_tokens):
+ sliced_sequence = np.insert(
+ sequence[index_left + 1 : end_of_curr_sequence_idx], 0, previous_sequence[0]
+ )
+ sliced_sequence[-1] = previous_sequence[-1]
+ # if part of the previous sequence is not taken
+ elif index_left >= 0:
+ sliced_sequence = sequence[index_left + 1 : end_of_curr_sequence_idx]
+ # let's insert the missing part of the previous sequence
+ previous_slice = (
+ previous_sequence[: index_right + 1] if index_right > 0 else [previous_sequence[0]]
+ )
+ sliced_sequence = np.insert(sliced_sequence, 0, previous_slice)
+ sliced_sequence[-1] += offset
+
+ if len(sliced_sequence) > 0:
+ items[len(items) - best_idx - 1] = sliced_sequence
+ items = items[: len(items) - best_idx]
+ sequence = sequence[end_of_curr_sequence_idx:]
+
+ # sequence might have changed
+ timestamp_tokens = sequence >= timestamp_begin
+ consecutive = np.where(timestamp_tokens[:-1] & timestamp_tokens[1:])[0] + 1
+ if sum(timestamp_tokens) > 0:
+ last_timestamp = np.where(timestamp_tokens)[0][-1]
+ consecutive = (
+ np.append(consecutive, last_timestamp + 1) if last_timestamp not in consecutive else consecutive
+ )
+
+ if len(consecutive) > 0:
+ last_slice = 0
+ for current_slice in consecutive:
+ actual_offset = items[-1][-1] if seq_idx != 0 or last_slice != 0 else sequence[0]
+ sliced_tokens = sequence[last_slice:current_slice]
+ duration = sliced_tokens[-1] - sliced_tokens[0]
+ sliced_tokens[0] = actual_offset
+ sliced_tokens[-1] = actual_offset + duration
+ items.append(sliced_tokens)
+ last_slice = current_slice
+
+ time += chunk_len
+ result = []
+ for i in range(len(items)):
+ result += items[i].tolist()
+ return result
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/base.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..a24e9c3f69787849de363dc501666d511e84ee13
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/base.py
@@ -0,0 +1,1484 @@
+# coding=utf-8
+# Copyright 2018 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import collections
+import copy
+import csv
+import importlib
+import json
+import os
+import pickle
+import sys
+import traceback
+import types
+import warnings
+from abc import ABC, abstractmethod
+from collections import UserDict
+from contextlib import contextmanager
+from os.path import abspath, exists
+from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union
+
+from ..dynamic_module_utils import custom_object_save
+from ..feature_extraction_utils import PreTrainedFeatureExtractor
+from ..image_processing_utils import BaseImageProcessor
+from ..modelcard import ModelCard
+from ..models.auto import AutoConfig, AutoTokenizer
+from ..processing_utils import ProcessorMixin
+from ..tokenization_utils import PreTrainedTokenizer
+from ..utils import (
+ ModelOutput,
+ PushToHubMixin,
+ add_end_docstrings,
+ copy_func,
+ infer_framework,
+ is_tf_available,
+ is_torch_available,
+ is_torch_cuda_available,
+ is_torch_mlu_available,
+ is_torch_mps_available,
+ is_torch_musa_available,
+ is_torch_npu_available,
+ is_torch_xpu_available,
+ logging,
+)
+
+
+GenericTensor = Union[List["GenericTensor"], "torch.Tensor", "tf.Tensor"]
+
+if is_tf_available():
+ import tensorflow as tf
+
+ from ..models.auto.modeling_tf_auto import TFAutoModel
+
+if is_torch_available():
+ import torch
+ from torch.utils.data import DataLoader, Dataset
+
+ from ..models.auto.modeling_auto import AutoModel
+
+ # Re-export for backward compatibility
+ from .pt_utils import KeyDataset
+else:
+ Dataset = None
+ KeyDataset = None
+
+if TYPE_CHECKING:
+ from ..modeling_tf_utils import TFPreTrainedModel
+ from ..modeling_utils import PreTrainedModel
+
+
+logger = logging.get_logger(__name__)
+
+
+def no_collate_fn(items):
+ if len(items) != 1:
+ raise ValueError("This collate_fn is meant to be used with batch_size=1")
+ return items[0]
+
+
+def _pad(items, key, padding_value, padding_side):
+ batch_size = len(items)
+ if isinstance(items[0][key], torch.Tensor):
+ # Others include `attention_mask` etc...
+ shape = items[0][key].shape
+ dim = len(shape)
+ if dim == 1:
+ # We have a list of 1-dim torch tensors, which can be stacked without padding
+ return torch.cat([item[key] for item in items], dim=0)
+ if key in ["pixel_values", "image"]:
+ # This is probably an image, so padding shouldn't be necessary
+ # B, C, H, W
+ return torch.cat([item[key] for item in items], dim=0)
+ elif dim == 4 and key == "input_features":
+ # this is probably a batched mel spectrogram
+ return torch.cat([item[key] for item in items], dim=0)
+ max_length = max(item[key].shape[1] for item in items)
+ min_length = min(item[key].shape[1] for item in items)
+ dtype = items[0][key].dtype
+
+ if dim == 2:
+ if max_length == min_length:
+ # Bypass for `ImageGPT` which doesn't provide a padding value, yet
+ # we can consistently pad since the size should be matching
+ return torch.cat([item[key] for item in items], dim=0)
+ tensor = torch.zeros((batch_size, max_length), dtype=dtype) + padding_value
+ elif dim == 3:
+ tensor = torch.zeros((batch_size, max_length, shape[-1]), dtype=dtype) + padding_value
+ elif dim == 4:
+ tensor = torch.zeros((batch_size, max_length, shape[-2], shape[-1]), dtype=dtype) + padding_value
+
+ for i, item in enumerate(items):
+ if dim == 2:
+ if padding_side == "left":
+ tensor[i, -len(item[key][0]) :] = item[key][0].clone()
+ else:
+ tensor[i, : len(item[key][0])] = item[key][0].clone()
+ elif dim == 3:
+ if padding_side == "left":
+ tensor[i, -len(item[key][0]) :, :] = item[key][0].clone()
+ else:
+ tensor[i, : len(item[key][0]), :] = item[key][0].clone()
+ elif dim == 4:
+ if padding_side == "left":
+ tensor[i, -len(item[key][0]) :, :, :] = item[key][0].clone()
+ else:
+ tensor[i, : len(item[key][0]), :, :] = item[key][0].clone()
+
+ return tensor
+ else:
+ return [item[key] for item in items]
+
+
+def pad_collate_fn(tokenizer, feature_extractor):
+ # Tokenizer
+ t_padding_side = None
+ # Feature extractor
+ f_padding_side = None
+ if tokenizer is None and feature_extractor is None:
+ raise ValueError("Pipeline without tokenizer or feature_extractor cannot do batching")
+ if tokenizer is not None:
+ if tokenizer.pad_token_id is None:
+ raise ValueError(
+ "Pipeline with tokenizer without pad_token cannot do batching. You can try to set it with "
+ "`pipe.tokenizer.pad_token_id = model.config.eos_token_id`."
+ )
+ else:
+ t_padding_value = tokenizer.pad_token_id
+ t_padding_side = tokenizer.padding_side
+ if feature_extractor is not None:
+ # The feature extractor can be for images, where no padding is expected
+ f_padding_value = getattr(feature_extractor, "padding_value", None)
+ f_padding_side = getattr(feature_extractor, "padding_side", None)
+
+ if t_padding_side is not None and f_padding_side is not None and t_padding_side != f_padding_side:
+ raise ValueError(
+ f"The feature extractor and tokenizer don't agree on padding side {t_padding_side} != {f_padding_side}"
+ )
+ padding_side = "right"
+ if t_padding_side is not None:
+ padding_side = t_padding_side
+ if f_padding_side is not None:
+ padding_side = f_padding_side
+
+ def inner(items):
+ keys = set(items[0].keys())
+ for item in items:
+ if set(item.keys()) != keys:
+ raise ValueError(
+ f"The elements of the batch contain different keys. Cannot batch them ({set(item.keys())} !="
+ f" {keys})"
+ )
+ # input_values, input_pixels, input_ids, ...
+ padded = {}
+ for key in keys:
+ if key in {"input_ids"}:
+ # ImageGPT uses a feature extractor
+ if tokenizer is None and feature_extractor is not None:
+ _padding_value = f_padding_value
+ else:
+ _padding_value = t_padding_value
+ elif key in {"input_values", "pixel_values", "input_features"}:
+ _padding_value = f_padding_value
+ elif key in {"p_mask", "special_tokens_mask"}:
+ _padding_value = 1
+ elif key in {"attention_mask", "token_type_ids"}:
+ _padding_value = 0
+ else:
+ # This is likely another random key, maybe even user-provided
+ _padding_value = 0
+ padded[key] = _pad(items, key, _padding_value, padding_side)
+ return padded
+
+ return inner
+
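+ # Editor's sketch (illustrative usage, not part of the library; the tensor values
+ # are made up): `pad_collate_fn` returns a collate function that pads each key with
+ # the appropriate padding value on the resolved padding side, e.g.
+ #
+ #     collate = pad_collate_fn(tokenizer, None)
+ #     batch = collate([
+ #         {"input_ids": torch.tensor([[1, 2, 3]]), "attention_mask": torch.tensor([[1, 1, 1]])},
+ #         {"input_ids": torch.tensor([[4, 5]]), "attention_mask": torch.tensor([[1, 1]])},
+ #     ])
+ #     # batch["input_ids"] -> shape (2, 3), second row padded with tokenizer.pad_token_id
+ #     # batch["attention_mask"] -> shape (2, 3), second row padded with 0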
+
+def infer_framework_load_model(
+ model,
+ config: AutoConfig,
+ model_classes: Optional[Dict[str, Tuple[type]]] = None,
+ task: Optional[str] = None,
+ framework: Optional[str] = None,
+ **model_kwargs,
+):
+ """
+ Select framework (TensorFlow or PyTorch) to use from the `model` passed. Returns a tuple (framework, model).
+
+ If `model` is instantiated, this function will just infer the framework from the model class. Otherwise `model` is
+ actually a checkpoint name and this method will try to instantiate it using `model_classes`. Since we don't want to
+ instantiate the model twice, this model is returned for use by the pipeline.
+
+ If both frameworks are installed and available for `model`, PyTorch is selected.
+
+ Args:
+ model (`str`, [`PreTrainedModel`] or [`TFPreTrainedModel]`):
+ The model to infer the framework from. If `str`, a checkpoint name.
+ config ([`AutoConfig`]):
+ The config associated with the model, used to help select the correct model class.
+ model_classes (dictionary `str` to `type`, *optional*):
+ A mapping from framework name ("pt" or "tf") to model classes.
+ task (`str`):
+ The task defining which pipeline will be returned.
+ model_kwargs:
+ Additional dictionary of keyword arguments passed along to the model's `from_pretrained(...,
+ **model_kwargs)` function.
+
+ Returns:
+ `Tuple`: A tuple (framework, model).
+ """
+ if not is_tf_available() and not is_torch_available():
+ raise RuntimeError(
+ "At least one of TensorFlow 2.0 or PyTorch should be installed. "
+ "To install TensorFlow 2.0, read the instructions at https://www.tensorflow.org/install/ "
+ "To install PyTorch, read the instructions at https://pytorch.org/."
+ )
+ if isinstance(model, str):
+ model_kwargs["_from_pipeline"] = task
+ class_tuple = ()
+ look_pt = is_torch_available() and framework in {"pt", None}
+ look_tf = is_tf_available() and framework in {"tf", None}
+ if model_classes:
+ if look_pt:
+ class_tuple = class_tuple + model_classes.get("pt", (AutoModel,))
+ if look_tf:
+ class_tuple = class_tuple + model_classes.get("tf", (TFAutoModel,))
+ if config.architectures:
+ classes = []
+ for architecture in config.architectures:
+ transformers_module = importlib.import_module("transformers")
+ if look_pt:
+ _class = getattr(transformers_module, architecture, None)
+ if _class is not None:
+ classes.append(_class)
+ if look_tf:
+ _class = getattr(transformers_module, f"TF{architecture}", None)
+ if _class is not None:
+ classes.append(_class)
+ class_tuple = class_tuple + tuple(classes)
+
+ if len(class_tuple) == 0:
+ raise ValueError(f"Pipeline cannot infer suitable model classes from {model}")
+
+ all_traceback = {}
+ for model_class in class_tuple:
+ kwargs = model_kwargs.copy()
+ if framework == "pt" and model.endswith(".h5"):
+ kwargs["from_tf"] = True
+ logger.warning(
+ "Model might be a TensorFlow model (ending with `.h5`) but TensorFlow is not available. "
+ "Trying to load the model with PyTorch."
+ )
+ elif framework == "tf" and model.endswith(".bin"):
+ kwargs["from_pt"] = True
+ logger.warning(
+ "Model might be a PyTorch model (ending with `.bin`) but PyTorch is not available. "
+ "Trying to load the model with Tensorflow."
+ )
+
+ try:
+ model = model_class.from_pretrained(model, **kwargs)
+ if hasattr(model, "eval"):
+ model = model.eval()
+ # Stop loading on the first successful load.
+ break
+ except (OSError, ValueError):
+ all_traceback[model_class.__name__] = traceback.format_exc()
+ continue
+
+ if isinstance(model, str):
+ error = ""
+ for class_name, trace in all_traceback.items():
+ error += f"while loading with {class_name}, an error is thrown:\n{trace}\n"
+ raise ValueError(
+ f"Could not load model {model} with any of the following classes: {class_tuple}. See the original errors:\n\n{error}\n"
+ )
+
+ if framework is None:
+ framework = infer_framework(model.__class__)
+ return framework, model
+
+
+def infer_framework_from_model(
+ model,
+ model_classes: Optional[Dict[str, Tuple[type]]] = None,
+ task: Optional[str] = None,
+ framework: Optional[str] = None,
+ **model_kwargs,
+):
+ """
+ Select framework (TensorFlow or PyTorch) to use from the `model` passed. Returns a tuple (framework, model).
+
+ If `model` is instantiated, this function will just infer the framework from the model class. Otherwise `model` is
+ actually a checkpoint name and this method will try to instantiate it using `model_classes`. Since we don't want to
+ instantiate the model twice, this model is returned for use by the pipeline.
+
+ If both frameworks are installed and available for `model`, PyTorch is selected.
+
+ Args:
+ model (`str`, [`PreTrainedModel`] or [`TFPreTrainedModel]`):
+ The model to infer the framework from. If `str`, a checkpoint name.
+ model_classes (dictionary `str` to `type`, *optional*):
+ A mapping from framework name ("pt" or "tf") to model classes.
+ task (`str`):
+ The task defining which pipeline will be returned.
+ model_kwargs:
+ Additional dictionary of keyword arguments passed along to the model's `from_pretrained(...,
+ **model_kwargs)` function.
+
+ Returns:
+ `Tuple`: A tuple (framework, model).
+ """
+ if isinstance(model, str):
+ config = AutoConfig.from_pretrained(model, _from_pipeline=task, **model_kwargs)
+ else:
+ config = model.config
+ return infer_framework_load_model(
+ model, config, model_classes=model_classes, _from_pipeline=task, task=task, framework=framework, **model_kwargs
+ )
+
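+ # Editor's sketch (illustrative usage, not part of the library; the checkpoint name
+ # is an arbitrary example): `infer_framework_from_model` accepts either a checkpoint
+ # name or an already-instantiated model, e.g.
+ #
+ #     framework, model = infer_framework_from_model("hf-internal-testing/tiny-random-bert")
+ #     # framework -> "pt" when only PyTorch is installed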
+
+def get_framework(model, revision: Optional[str] = None):
+ """
+ Select framework (TensorFlow or PyTorch) to use.
+
+ Args:
+ model (`str`, [`PreTrainedModel`] or [`TFPreTrainedModel]`):
+ If both frameworks are installed, picks the one corresponding to the model passed (either a model class or
+ the model name). If no specific model is provided, defaults to using PyTorch.
+ """
+ warnings.warn(
+ "`get_framework` is deprecated and will be removed in v5, use `infer_framework_from_model` instead.",
+ FutureWarning,
+ )
+ if not is_tf_available() and not is_torch_available():
+ raise RuntimeError(
+ "At least one of TensorFlow 2.0 or PyTorch should be installed. "
+ "To install TensorFlow 2.0, read the instructions at https://www.tensorflow.org/install/ "
+ "To install PyTorch, read the instructions at https://pytorch.org/."
+ )
+ if isinstance(model, str):
+ if is_torch_available() and not is_tf_available():
+ model = AutoModel.from_pretrained(model, revision=revision)
+ elif is_tf_available() and not is_torch_available():
+ model = TFAutoModel.from_pretrained(model, revision=revision)
+ else:
+ try:
+ model = AutoModel.from_pretrained(model, revision=revision)
+ except OSError:
+ model = TFAutoModel.from_pretrained(model, revision=revision)
+
+ framework = infer_framework(model.__class__)
+ return framework
+
+
+def get_default_model_and_revision(
+ targeted_task: Dict, framework: Optional[str], task_options: Optional[Any]
+) -> Union[str, Tuple[str, str]]:
+ """
+ Select a default model to use for a given task. Defaults to pytorch if ambiguous.
+
+ Args:
+ targeted_task (`Dict`):
+ Dictionary representing the given task, which should contain the default models
+
+ framework (`str`, None)
+ "pt", "tf" or None, representing a specific framework if it was specified, or None if we don't know yet.
+
+ task_options (`Any`, None)
+ Any further value required by the task to get fully specified, for instance (SRC, TGT) languages for
+ translation task.
+
+ Returns:
+
+ `str` or `Tuple[str, str]`: The model identifier (and optionally the revision) representing the default model
+ for this pipeline.
+ """
+ if is_torch_available() and not is_tf_available():
+ framework = "pt"
+ elif is_tf_available() and not is_torch_available():
+ framework = "tf"
+
+ defaults = targeted_task["default"]
+ if task_options:
+ if task_options not in defaults:
+ raise ValueError(f"The task does not provide any default models for options {task_options}")
+ default_models = defaults[task_options]["model"]
+ elif "model" in defaults:
+ default_models = targeted_task["default"]["model"]
+ else:
+ # XXX This error message needs to be updated to be more generic if more tasks are going to become
+ # parametrized
+ raise ValueError('The task defaults can\'t be correctly selected. You probably meant "translation_XX_to_YY"')
+
+ if framework is None:
+ framework = "pt"
+
+ return default_models[framework]
+
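+ # Editor's sketch (illustrative, the dictionary content is a made-up example): a
+ # `targeted_task` entry as consumed above typically looks like
+ #
+ #     targeted_task = {"default": {"model": {"pt": ("some-org/some-model", "main")}}}
+ #     get_default_model_and_revision(targeted_task, framework=None, task_options=None)
+ #     # -> ("some-org/some-model", "main")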
+
+def load_assistant_model(
+ model: "PreTrainedModel",
+ assistant_model: Optional[Union[str, "PreTrainedModel"]],
+ assistant_tokenizer: Optional[PreTrainedTokenizer],
+) -> Tuple[Optional["PreTrainedModel"], Optional[PreTrainedTokenizer]]:
+ """
+ Prepares the assistant model and the assistant tokenizer for a pipeline whose model can call `generate`.
+
+ Args:
+ model ([`PreTrainedModel`]):
+ The main model that will be used by the pipeline to make predictions.
+ assistant_model (`str` or [`PreTrainedModel`], *optional*):
+ The assistant model that will be used by the pipeline to make predictions.
+ assistant_tokenizer ([`PreTrainedTokenizer`], *optional*):
+ The assistant tokenizer that will be used by the pipeline to encode data for the model.
+
+ Returns:
+ Tuple: The loaded assistant model and (optionally) the loaded tokenizer.
+ """
+ if not model.can_generate() or assistant_model is None:
+ return None, None
+
+ if not isinstance(model, PreTrainedModel):
+ raise ValueError(
+ "Assisted generation, triggered by the `assistant_model` argument, is only available for "
+ "`PreTrainedModel` model instances. For instance, TF or JAX models are not supported."
+ )
+
+ # If the model is passed as a string, load the model and the corresponding tokenizer
+ if isinstance(assistant_model, str):
+ assistant_config = AutoConfig.from_pretrained(assistant_model)
+ _, loaded_assistant_model = infer_framework_load_model(assistant_model, config=assistant_config)
+ loaded_assistant_model = loaded_assistant_model.to(device=model.device, dtype=model.dtype)
+ loaded_assistant_tokenizer = AutoTokenizer.from_pretrained(assistant_model)
+ else:
+ loaded_assistant_model = assistant_model
+ loaded_assistant_tokenizer = assistant_tokenizer
+
+ # Finally, let's check the tokenizers: if the two models have different tokenizers, we need to keep the assistant
+ # tokenizer
+ same_vocab_size = model.config.vocab_size == loaded_assistant_model.config.vocab_size
+ same_special_tokens = all(
+ getattr(model.config, token) == getattr(loaded_assistant_model.config, token)
+ for token in ("eos_token_id", "pad_token_id", "bos_token_id")
+ )
+ if same_vocab_size and same_special_tokens:
+ loaded_assistant_tokenizer = None
+ elif loaded_assistant_tokenizer is None:
+ raise ValueError(
+ "The assistant model has a different tokenizer than the main model. You should pass the assistant "
+ "tokenizer."
+ )
+
+ return loaded_assistant_model, loaded_assistant_tokenizer
+
+
+class PipelineException(Exception):
+ """
+ Raised by a [`Pipeline`] when handling __call__.
+
+ Args:
+ task (`str`): The task of the pipeline.
+ model (`str`): The model used by the pipeline.
+ reason (`str`): The error message to display.
+ """
+
+ def __init__(self, task: str, model: str, reason: str):
+ super().__init__(reason)
+
+ self.task = task
+ self.model = model
+
+
+class ArgumentHandler(ABC):
+ """
+ Base interface for handling arguments for each [`~pipelines.Pipeline`].
+ """
+
+ @abstractmethod
+ def __call__(self, *args, **kwargs):
+ raise NotImplementedError()
+
+
+class PipelineDataFormat:
+ """
+ Base class for all the pipeline-supported data formats, both for reading and writing. Supported data formats
+ currently include:
+
+ - JSON
+ - CSV
+ - stdin/stdout (pipe)
+
+ `PipelineDataFormat` also includes some utilities to work with multi-column data, like mapping from dataset columns
+ to pipeline keyword arguments through the `dataset_kwarg_1=dataset_column_1` format.
+
+ Args:
+ output_path (`str`): Where to save the outgoing data.
+ input_path (`str`): Where to look for the input data.
+ column (`str`): The column to read.
+ overwrite (`bool`, *optional*, defaults to `False`):
+ Whether or not to overwrite the `output_path`.
+ """
+
+ SUPPORTED_FORMATS = ["json", "csv", "pipe"]
+
+ def __init__(
+ self,
+ output_path: Optional[str],
+ input_path: Optional[str],
+ column: Optional[str],
+ overwrite: bool = False,
+ ):
+ self.output_path = output_path
+ self.input_path = input_path
+ self.column = column.split(",") if column is not None else [""]
+ self.is_multi_columns = len(self.column) > 1
+
+ if self.is_multi_columns:
+ self.column = [tuple(c.split("=")) if "=" in c else (c, c) for c in self.column]
+
+ if output_path is not None and not overwrite:
+ if exists(abspath(self.output_path)):
+ raise OSError(f"{self.output_path} already exists on disk")
+
+ if input_path is not None:
+ if not exists(abspath(self.input_path)):
+ raise OSError(f"{self.input_path} doesn't exist on disk")
+
+ @abstractmethod
+ def __iter__(self):
+ raise NotImplementedError()
+
+ @abstractmethod
+ def save(self, data: Union[dict, List[dict]]):
+ """
+ Save the provided data object with the representation for the current [`~pipelines.PipelineDataFormat`].
+
+ Args:
+ data (`dict` or list of `dict`): The data to store.
+ """
+ raise NotImplementedError()
+
+ def save_binary(self, data: Union[dict, List[dict]]) -> str:
+ """
+ Save the provided data object as a pickle-formatted binary data on the disk.
+
+ Args:
+ data (`dict` or list of `dict`): The data to store.
+
+ Returns:
+ `str`: Path where the data has been saved.
+ """
+ path, _ = os.path.splitext(self.output_path)
+ binary_path = os.path.extsep.join((path, "pickle"))
+
+ with open(binary_path, "wb+") as f_output:
+ pickle.dump(data, f_output)
+
+ return binary_path
+
+ @staticmethod
+ def from_str(
+ format: str,
+ output_path: Optional[str],
+ input_path: Optional[str],
+ column: Optional[str],
+ overwrite=False,
+ ) -> "PipelineDataFormat":
+ """
+ Creates an instance of the right subclass of [`~pipelines.PipelineDataFormat`] depending on `format`.
+
+ Args:
+ format (`str`):
+ The format of the desired pipeline. Acceptable values are `"json"`, `"csv"` or `"pipe"`.
+ output_path (`str`, *optional*):
+ Where to save the outgoing data.
+ input_path (`str`, *optional*):
+ Where to look for the input data.
+ column (`str`, *optional*):
+ The column to read.
+ overwrite (`bool`, *optional*, defaults to `False`):
+ Whether or not to overwrite the `output_path`.
+
+ Returns:
+ [`~pipelines.PipelineDataFormat`]: The proper data format.
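+
+ Example (illustrative sketch; the file paths are hypothetical):
+
+ ```python
+ reader = PipelineDataFormat.from_str("csv", output_path="out.csv", input_path="in.csv", column="text")
+ ```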
+ """
+ if format == "json":
+ return JsonPipelineDataFormat(output_path, input_path, column, overwrite=overwrite)
+ elif format == "csv":
+ return CsvPipelineDataFormat(output_path, input_path, column, overwrite=overwrite)
+ elif format == "pipe":
+ return PipedPipelineDataFormat(output_path, input_path, column, overwrite=overwrite)
+ else:
+ raise KeyError(f"Unknown reader {format} (Available reader are json/csv/pipe)")
+
+
+class CsvPipelineDataFormat(PipelineDataFormat):
+ """
+ Support for pipelines using CSV data format.
+
+ Args:
+ output_path (`str`): Where to save the outgoing data.
+ input_path (`str`): Where to look for the input data.
+ column (`str`): The column to read.
+ overwrite (`bool`, *optional*, defaults to `False`):
+ Whether or not to overwrite the `output_path`.
+ """
+
+ def __init__(
+ self,
+ output_path: Optional[str],
+ input_path: Optional[str],
+ column: Optional[str],
+ overwrite=False,
+ ):
+ super().__init__(output_path, input_path, column, overwrite=overwrite)
+
+ def __iter__(self):
+ with open(self.input_path, "r") as f:
+ reader = csv.DictReader(f)
+ for row in reader:
+ if self.is_multi_columns:
+ yield {k: row[c] for k, c in self.column}
+ else:
+ yield row[self.column[0]]
+
+ def save(self, data: List[dict]):
+ """
+ Save the provided data object with the representation for the current [`~pipelines.PipelineDataFormat`].
+
+ Args:
+ data (`List[dict]`): The data to store.
+ """
+ with open(self.output_path, "w") as f:
+ if len(data) > 0:
+ writer = csv.DictWriter(f, list(data[0].keys()))
+ writer.writeheader()
+ writer.writerows(data)
+
+
+class JsonPipelineDataFormat(PipelineDataFormat):
+ """
+ Support for pipelines using JSON file format.
+
+ Args:
+ output_path (`str`): Where to save the outgoing data.
+ input_path (`str`): Where to look for the input data.
+ column (`str`): The column to read.
+ overwrite (`bool`, *optional*, defaults to `False`):
+ Whether or not to overwrite the `output_path`.
+ """
+
+ def __init__(
+ self,
+ output_path: Optional[str],
+ input_path: Optional[str],
+ column: Optional[str],
+ overwrite=False,
+ ):
+ super().__init__(output_path, input_path, column, overwrite=overwrite)
+
+ with open(input_path, "r") as f:
+ self._entries = json.load(f)
+
+ def __iter__(self):
+ for entry in self._entries:
+ if self.is_multi_columns:
+ yield {k: entry[c] for k, c in self.column}
+ else:
+ yield entry[self.column[0]]
+
+ def save(self, data: dict):
+ """
+ Save the provided data object in a json file.
+
+ Args:
+ data (`dict`): The data to store.
+ """
+ with open(self.output_path, "w") as f:
+ json.dump(data, f)
+
+
+class PipedPipelineDataFormat(PipelineDataFormat):
+ """
+ Read data from piped input to the python process. For multi-column data, columns should be separated by \t
+
+ If columns are provided, then the output will be a dictionary with {column_x: value_x}
+
+ Args:
+ output_path (`str`): Where to save the outgoing data.
+ input_path (`str`): Where to look for the input data.
+ column (`str`): The column to read.
+ overwrite (`bool`, *optional*, defaults to `False`):
+ Whether or not to overwrite the `output_path`.
+ """
+
+ def __iter__(self):
+ for line in sys.stdin:
+ # Split for multi-columns
+ if "\t" in line:
+ line = line.split("\t")
+ if self.column:
+ # Dictionary to map arguments
+ yield {kwargs: l for (kwargs, _), l in zip(self.column, line)}
+ else:
+ yield tuple(line)
+
+ # No dictionary to map arguments
+ else:
+ yield line
+
+ def save(self, data: dict):
+ """
+ Print the data.
+
+ Args:
+ data (`dict`): The data to store.
+ """
+ print(data)
+
+ def save_binary(self, data: Union[dict, List[dict]]) -> str:
+ if self.output_path is None:
+ raise KeyError(
+ "Using piped input with a pipeline that outputs large objects requires an output file path. "
+ "Please provide such an output path through the --output argument."
+ )
+
+ return super().save_binary(data)
+
+
+class _ScikitCompat(ABC):
+ """
+ Interface layer for Scikit-learn and Keras compatibility.
+ """
+
+ @abstractmethod
+ def transform(self, X):
+ raise NotImplementedError()
+
+ @abstractmethod
+ def predict(self, X):
+ raise NotImplementedError()
+
+
+def build_pipeline_init_args(
+ has_tokenizer: bool = False,
+ has_feature_extractor: bool = False,
+ has_image_processor: bool = False,
+ has_processor: bool = False,
+ supports_binary_output: bool = True,
+) -> str:
+ docstring = r"""
+ Arguments:
+ model ([`PreTrainedModel`] or [`TFPreTrainedModel`]):
+ The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from
+ [`PreTrainedModel`] for PyTorch and [`TFPreTrainedModel`] for TensorFlow."""
+ if has_tokenizer:
+ docstring += r"""
+ tokenizer ([`PreTrainedTokenizer`]):
+ The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from
+ [`PreTrainedTokenizer`]."""
+ if has_feature_extractor:
+ docstring += r"""
+ feature_extractor ([`SequenceFeatureExtractor`]):
+ The feature extractor that will be used by the pipeline to encode data for the model. This object inherits from
+ [`SequenceFeatureExtractor`]."""
+ if has_image_processor:
+ docstring += r"""
+ image_processor ([`BaseImageProcessor`]):
+ The image processor that will be used by the pipeline to encode data for the model. This object inherits from
+ [`BaseImageProcessor`]."""
+ if has_processor:
+ docstring += r"""
+ processor ([`ProcessorMixin`]):
+ The processor that will be used by the pipeline to encode data for the model. This object inherits from
+ [`ProcessorMixin`]. Processor is a composite object that might contain `tokenizer`, `feature_extractor`, and
+ `image_processor`."""
+ docstring += r"""
+ modelcard (`str` or [`ModelCard`], *optional*):
+ Model card attributed to the model for this pipeline.
+ framework (`str`, *optional*):
+ The framework to use, either `"pt"` for PyTorch or `"tf"` for TensorFlow. The specified framework must be
+ installed.
+
+ If no framework is specified, will default to the one currently installed. If no framework is specified and
+ both frameworks are installed, will default to the framework of the `model`, or to PyTorch if no model is
+ provided.
+ task (`str`, defaults to `""`):
+ A task-identifier for the pipeline.
+ num_workers (`int`, *optional*, defaults to 8):
+ When the pipeline will use *DataLoader* (when passing a dataset, on GPU for a Pytorch model), the number of
+ workers to be used.
+ batch_size (`int`, *optional*, defaults to 1):
+ When the pipeline will use *DataLoader* (when passing a dataset, on GPU for a Pytorch model), the size of
+ the batch to use; for inference this is not always beneficial, please read [Batching with
+ pipelines](https://huggingface.co/transformers/main_classes/pipelines.html#pipeline-batching).
+ args_parser ([`~pipelines.ArgumentHandler`], *optional*):
+ Reference to the object in charge of parsing supplied pipeline parameters.
+ device (`int`, *optional*, defaults to -1):
+ Device ordinal for CPU/GPU support. Setting this to -1 will leverage CPU, a positive value will run the model
+ on the associated CUDA device id. You can pass a native `torch.device` or a `str` too.
+ torch_dtype (`str` or `torch.dtype`, *optional*):
+ Sent directly as `model_kwargs` (just a simpler shortcut) to use the available precision for this model
+ (`torch.float16`, `torch.bfloat16`, ... or `"auto"`)"""
+ if supports_binary_output:
+ docstring += r"""
+ binary_output (`bool`, *optional*, defaults to `False`):
+ Flag indicating whether the output of the pipeline should happen in a serialized format (i.e., pickle) or as
+ the raw output data (e.g. text).
+ return docstring
+
+
+PIPELINE_INIT_ARGS = build_pipeline_init_args(
+ has_tokenizer=True,
+ has_feature_extractor=True,
+ has_image_processor=True,
+ has_processor=True,
+ supports_binary_output=True,
+)
+
+
+if is_torch_available():
+ from transformers.pipelines.pt_utils import (
+ PipelineChunkIterator,
+ PipelineDataset,
+ PipelineIterator,
+ PipelinePackIterator,
+ )
+
+
+@add_end_docstrings(
+ build_pipeline_init_args(
+ has_tokenizer=True, has_feature_extractor=True, has_image_processor=True, has_processor=True
+ )
+)
+class Pipeline(_ScikitCompat, PushToHubMixin):
+ """
+ The Pipeline class is the class from which all pipelines inherit. Refer to this class for methods shared across
+ different pipelines.
+
+ Base class implementing pipelined operations. Pipeline workflow is defined as a sequence of the following
+ operations:
+
+ Input -> Tokenization -> Model Inference -> Post-Processing (task dependent) -> Output
+
+ Pipeline supports running on CPU or GPU through the device argument (see below).
+
+ Some pipelines, like [`FeatureExtractionPipeline`] (`'feature-extraction'`), output large tensor objects
+ as nested lists. In order to avoid dumping such large structures as textual data we provide the `binary_output`
+ constructor argument. If set to `True`, the output will be stored in the pickle format.
+ """
+
+ # Historically we have pipelines working with `tokenizer`, `feature_extractor`, and `image_processor`
+ # as separate processing components. While we have `processor` class that combines them, some pipelines
+ # might still operate with these components separately.
+ # With the addition of `processor` to `pipeline`, we want to avoid:
+ # - loading `processor` for pipelines that still work with `image_processor` and `tokenizer` separately;
+ # - loading `image_processor`/`tokenizer` as a separate component while we operate only with `processor`,
+ # because `processor` will load required sub-components by itself.
+ # The flags below allow granular control over loading components and are set to be backward compatible with the
+ # current pipelines logic. You may override these flags when creating your pipeline. For example, for
+ # `zero-shot-object-detection` pipeline which operates with `processor` you should set `_load_processor=True`
+ # and all the rest flags to `False` to avoid unnecessary loading of the components.
+ _load_processor = False
+ _load_image_processor = True
+ _load_feature_extractor = True
+ _load_tokenizer = True
+
+ default_input_names = None
+
+ def __init__(
+ self,
+ model: Union["PreTrainedModel", "TFPreTrainedModel"],
+ tokenizer: Optional[PreTrainedTokenizer] = None,
+ feature_extractor: Optional[PreTrainedFeatureExtractor] = None,
+ image_processor: Optional[BaseImageProcessor] = None,
+ processor: Optional[ProcessorMixin] = None,
+ modelcard: Optional[ModelCard] = None,
+ framework: Optional[str] = None,
+ task: str = "",
+ args_parser: ArgumentHandler = None,
+ device: Union[int, "torch.device"] = None,
+ torch_dtype: Optional[Union[str, "torch.dtype"]] = None,
+ binary_output: bool = False,
+ **kwargs,
+ ):
+ if framework is None:
+ framework, model = infer_framework_load_model(model, config=model.config)
+
+ self.task = task
+ self.model = model
+ self.tokenizer = tokenizer
+ self.feature_extractor = feature_extractor
+ self.image_processor = image_processor
+ self.processor = processor
+ self.modelcard = modelcard
+ self.framework = framework
+
+ # `accelerate` device map
+ hf_device_map = getattr(self.model, "hf_device_map", None)
+
+ if hf_device_map is not None and device is not None:
+ raise ValueError(
+ "The model has been loaded with `accelerate` and therefore cannot be moved to a specific device. Please "
+ "discard the `device` argument when creating your pipeline object."
+ )
+
+ if device is None:
+ if hf_device_map is not None:
+ # Take the first device used by `accelerate`.
+ device = next(iter(hf_device_map.values()))
+ else:
+ device = 0
+
+ if is_torch_available() and self.framework == "pt":
+ if device == -1 and self.model.device is not None:
+ device = self.model.device
+ if isinstance(device, torch.device):
+ if device.type == "xpu" and not is_torch_xpu_available(check_device=True):
+ raise ValueError(f'{device} is not available, you should use device="cpu" instead')
+ self.device = device
+ elif isinstance(device, str):
+ if "xpu" in device and not is_torch_xpu_available(check_device=True):
+ raise ValueError(f'{device} is not available, you should use device="cpu" instead')
+ self.device = torch.device(device)
+ elif device < 0:
+ self.device = torch.device("cpu")
+ elif is_torch_mlu_available():
+ self.device = torch.device(f"mlu:{device}")
+ elif is_torch_musa_available():
+ self.device = torch.device(f"musa:{device}")
+ elif is_torch_cuda_available():
+ self.device = torch.device(f"cuda:{device}")
+ elif is_torch_npu_available():
+ self.device = torch.device(f"npu:{device}")
+ elif is_torch_xpu_available(check_device=True):
+ self.device = torch.device(f"xpu:{device}")
+ elif is_torch_mps_available():
+ self.device = torch.device(f"mps:{device}")
+ else:
+ self.device = torch.device("cpu")
+ else:
+ self.device = device if device is not None else -1
+
+ logger.warning(f"Device set to use {self.device}")
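+ # Editor's note (illustrative, summarizing the branches above): `device` may be an
+ # `int` (-1 for CPU, >= 0 for an accelerator ordinal), a `str` such as "cpu" or
+ # "cuda:0", or a `torch.device`; e.g. pipeline(..., device="cuda:0") and
+ # pipeline(..., device=0) end up on the same CUDA device when CUDA is available.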
+
+ self.binary_output = binary_output
+ # We shouldn't call `model.to()` for models loaded with accelerate as well as the case that model is already on device
+ if (
+ self.framework == "pt"
+ and self.model.device != self.device
+ and not (isinstance(self.device, int) and self.device < 0)
+ and hf_device_map is None
+ ):
+ self.model.to(self.device)
+
+ # If the model can generate:
+ # 1 - create a local generation config. This is done to avoid side-effects on the model as we apply local
+ # tweaks to the generation config.
+ # 2 - load the assistant model if it is passed.
+ self.assistant_model, self.assistant_tokenizer = load_assistant_model(
+ self.model, kwargs.pop("assistant_model", None), kwargs.pop("assistant_tokenizer", None)
+ )
+ if self.model.can_generate():
+ self.prefix = self.model.config.prefix if hasattr(self.model.config, "prefix") else None
+ self.generation_config = copy.deepcopy(self.model.generation_config)
+ # Update the generation config with task specific params if they exist
+ # NOTE: `prefix` is pipeline-specific and doesn't exist in the generation config.
+ task_specific_params = self.model.config.task_specific_params
+ if task_specific_params is not None and task in task_specific_params:
+ this_task_params = task_specific_params.get(task)
+ if "prefix" in this_task_params:
+ self.prefix = this_task_params.pop("prefix")
+ self.generation_config.update(**this_task_params)
+ # If the tokenizer has a pad token but the model doesn't, set it so that `generate` is aware of it.
+ if (
+ self.tokenizer is not None
+ and self.tokenizer.pad_token_id is not None
+ and self.generation_config.pad_token_id is None
+ ):
+ self.generation_config.pad_token_id = self.tokenizer.pad_token_id
+
+ self.call_count = 0
+ self._batch_size = kwargs.pop("batch_size", None)
+ self._num_workers = kwargs.pop("num_workers", None)
+ self._preprocess_params, self._forward_params, self._postprocess_params = self._sanitize_parameters(**kwargs)
+
+ # In processor only mode, we can get the modality processors from the processor
+ if self.processor is not None and all(
+ [self.tokenizer is None, self.feature_extractor is None, self.image_processor is None]
+ ):
+ self.tokenizer = getattr(self.processor, "tokenizer", None)
+ self.feature_extractor = getattr(self.processor, "feature_extractor", None)
+ self.image_processor = getattr(self.processor, "image_processor", None)
+
+ if self.image_processor is None and self.feature_extractor is not None:
+ if isinstance(self.feature_extractor, BaseImageProcessor):
+ # Backward compatible change, if users called
+ # ImageSegmentationPipeline(.., feature_extractor=MyFeatureExtractor())
+ # then we should keep working
+ self.image_processor = self.feature_extractor
+
+ def save_pretrained(
+ self,
+ save_directory: Union[str, os.PathLike],
+ safe_serialization: bool = True,
+ **kwargs,
+ ):
+ """
+ Save the pipeline's model and tokenizer.
+
+ Args:
+ save_directory (`str` or `os.PathLike`):
+ A path to the directory where the pipeline will be saved. It will be created if it doesn't exist.
+ safe_serialization (`bool`):
+ Whether to save the model using `safetensors` or the traditional way for PyTorch or TensorFlow.
+ kwargs (`Dict[str, Any]`, *optional*):
+ Additional keyword arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
+ """
+ use_auth_token = kwargs.pop("use_auth_token", None)
+
+ if use_auth_token is not None:
+ warnings.warn(
+ "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
+ FutureWarning,
+ )
+ if kwargs.get("token", None) is not None:
+ raise ValueError(
+ "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
+ )
+ kwargs["token"] = use_auth_token
+
+ if os.path.isfile(save_directory):
+ logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
+ return
+ os.makedirs(save_directory, exist_ok=True)
+
+ if hasattr(self, "_registered_impl"):
+ # Add info to the config
+ pipeline_info = self._registered_impl.copy()
+ custom_pipelines = {}
+ for task, info in pipeline_info.items():
+ if info["impl"] != self.__class__:
+ continue
+
+ info = info.copy()
+ module_name = info["impl"].__module__
+ last_module = module_name.split(".")[-1]
+ # Change classes into their names/full names
+ info["impl"] = f"{last_module}.{info['impl'].__name__}"
+ info["pt"] = tuple(c.__name__ for c in info["pt"])
+ info["tf"] = tuple(c.__name__ for c in info["tf"])
+
+ custom_pipelines[task] = info
+ self.model.config.custom_pipelines = custom_pipelines
+ # Save the pipeline custom code
+ custom_object_save(self, save_directory)
+
+ kwargs["safe_serialization"] = safe_serialization
+ self.model.save_pretrained(save_directory, **kwargs)
+
+ if self.tokenizer is not None:
+ self.tokenizer.save_pretrained(save_directory, **kwargs)
+
+ if self.feature_extractor is not None:
+ self.feature_extractor.save_pretrained(save_directory, **kwargs)
+
+ if self.image_processor is not None:
+ self.image_processor.save_pretrained(save_directory, **kwargs)
+
+ if self.modelcard is not None:
+ self.modelcard.save_pretrained(save_directory)
+
+ def transform(self, X):
+ """
+ Scikit / Keras interface to transformers' pipelines. This method will forward to __call__().
+ """
+ return self(X)
+
+ def predict(self, X):
+ """
+ Scikit / Keras interface to transformers' pipelines. This method will forward to __call__().
+ """
+ return self(X)
+
+ @property
+ def torch_dtype(self) -> Optional["torch.dtype"]:
+ """
+ Torch dtype of the model (if it's Pytorch model), `None` otherwise.
+ """
+ return getattr(self.model, "dtype", None)
+
+ @contextmanager
+ def device_placement(self):
+ """
+ Context Manager allowing tensor allocation on the user-specified device in a framework-agnostic way.
+
+ Returns:
+ Context manager
+
+ Examples:
+
+ ```python
+ # Explicitly ask for tensor allocation on CUDA device :0
+ pipe = pipeline(..., device=0)
+ with pipe.device_placement():
+ # Every framework specific tensor allocation will be done on the request device
+ output = pipe(...)
+ ```"""
+ if self.framework == "tf":
+ with tf.device("/CPU:0" if self.device == -1 else f"/device:GPU:{self.device}"):
+ yield
+ else:
+ if self.device.type == "cuda":
+ with torch.cuda.device(self.device):
+ yield
+ elif self.device.type == "mlu":
+ with torch.mlu.device(self.device):
+ yield
+ elif self.device.type == "musa":
+ with torch.musa.device(self.device):
+ yield
+ else:
+ yield
+
+ def ensure_tensor_on_device(self, **inputs):
+ """
+ Ensure PyTorch tensors are on the specified device.
+
+ Args:
+ inputs (keyword arguments that should be `torch.Tensor`, the rest is ignored):
+ The tensors to place on `self.device`.
+ Recursive on lists **only**.
+
+ Return:
+ `Dict[str, torch.Tensor]`: The same as `inputs` but on the proper device.
+ """
+ return self._ensure_tensor_on_device(inputs, self.device)
+
+ def _ensure_tensor_on_device(self, inputs, device):
+ if isinstance(inputs, ModelOutput):
+ return ModelOutput(
+ {name: self._ensure_tensor_on_device(tensor, device) for name, tensor in inputs.items()}
+ )
+ elif isinstance(inputs, dict):
+ return {name: self._ensure_tensor_on_device(tensor, device) for name, tensor in inputs.items()}
+ elif isinstance(inputs, UserDict):
+ return UserDict({name: self._ensure_tensor_on_device(tensor, device) for name, tensor in inputs.items()})
+ elif isinstance(inputs, list):
+ return [self._ensure_tensor_on_device(item, device) for item in inputs]
+ elif isinstance(inputs, tuple):
+ return tuple([self._ensure_tensor_on_device(item, device) for item in inputs])
+ elif isinstance(inputs, torch.Tensor):
+ return inputs.to(device)
+ else:
+ return inputs
+
+ def check_model_type(self, supported_models: Union[List[str], dict]):
+ """
+ Check if the model class is supported by the pipeline.
+
+ Args:
+ supported_models (`List[str]` or `dict`):
+ The list of models supported by the pipeline, or a dictionary with model class values.
+ """
+ if not isinstance(supported_models, list): # Create from a model mapping
+ supported_models_names = []
+ for _, model_name in supported_models.items():
+ # Mapping can now contain tuples of models for the same configuration.
+ if isinstance(model_name, tuple):
+ supported_models_names.extend(list(model_name))
+ else:
+ supported_models_names.append(model_name)
+ if hasattr(supported_models, "_model_mapping"):
+ for _, model in supported_models._model_mapping._extra_content.items():
+ if isinstance(model_name, tuple):
+ supported_models_names.extend([m.__name__ for m in model])
+ else:
+ supported_models_names.append(model.__name__)
+ supported_models = supported_models_names
+ if self.model.__class__.__name__ not in supported_models:
+ logger.error(
+ f"The model '{self.model.__class__.__name__}' is not supported for {self.task}. Supported models are"
+ f" {supported_models}."
+ )
+
+ @abstractmethod
+ def _sanitize_parameters(self, **pipeline_parameters):
+ """
+ _sanitize_parameters will be called with any excess named arguments from either the `__init__` or `__call__`
+ methods. It should return 3 dictionaries of the resolved parameters used by the various `preprocess`,
+ `forward` and `postprocess` methods. Do not fill the dictionaries if the caller didn't specify the kwargs. This
+ lets you keep defaults in function signatures, which is more "natural".
+
+ It is not meant to be called directly, it will be automatically called and the final parameters resolved by
+ `__init__` and `__call__`
+ """
+ raise NotImplementedError("_sanitize_parameters not implemented")
+
+ @abstractmethod
+ def preprocess(self, input_: Any, **preprocess_parameters: Dict) -> Dict[str, GenericTensor]:
+ """
+ Preprocess will take the `input_` of a specific pipeline and return a dictionary of everything necessary for
+ `_forward` to run properly. It should contain at least one tensor, but might have arbitrary other items.
+ """
+ raise NotImplementedError("preprocess not implemented")
+
+ @abstractmethod
+ def _forward(self, input_tensors: Dict[str, GenericTensor], **forward_parameters: Dict) -> ModelOutput:
+ """
+ _forward will receive the prepared dictionary from `preprocess` and run it on the model. This method might
+ involve the GPU or the CPU and should be agnostic to it. Isolating this function is the reason for `preprocess`
+ and `postprocess` to exist, so that the hot path (this method) can generally run as fast as possible.
+
+ It is not meant to be called directly, `forward` is preferred. It is basically the same but contains additional
+ code surrounding `_forward` making sure tensors and models are on the same device, disabling the training part
+ of the code (leading to faster inference).
+ """
+ raise NotImplementedError("_forward not implemented")
+
+ @abstractmethod
+ def postprocess(self, model_outputs: ModelOutput, **postprocess_parameters: Dict) -> Any:
+ """
+ Postprocess will receive the raw outputs of the `_forward` method, generally tensors, and reformat them into
+ something more friendly. Generally it will output a list or a dict or results (containing just strings and
+ numbers).
+ """
+ raise NotImplementedError("postprocess not implemented")
+
+ def get_inference_context(self):
+ return torch.no_grad
+
+ def forward(self, model_inputs, **forward_params):
+ with self.device_placement():
+ if self.framework == "tf":
+ model_inputs["training"] = False
+ model_outputs = self._forward(model_inputs, **forward_params)
+ elif self.framework == "pt":
+ inference_context = self.get_inference_context()
+ with inference_context():
+ model_inputs = self._ensure_tensor_on_device(model_inputs, device=self.device)
+ model_outputs = self._forward(model_inputs, **forward_params)
+ model_outputs = self._ensure_tensor_on_device(model_outputs, device=torch.device("cpu"))
+ else:
+ raise ValueError(f"Framework {self.framework} is not supported")
+ return model_outputs
+
+ def get_iterator(
+ self, inputs, num_workers: int, batch_size: int, preprocess_params, forward_params, postprocess_params
+ ):
+ if isinstance(inputs, collections.abc.Sized):
+ dataset = PipelineDataset(inputs, self.preprocess, preprocess_params)
+ else:
+ if num_workers > 1:
+ logger.warning(
+ "For iterable dataset using num_workers>1 is likely to result"
+ " in errors since everything is iterable, setting `num_workers=1`"
+ " to guarantee correctness."
+ )
+ num_workers = 1
+ dataset = PipelineIterator(inputs, self.preprocess, preprocess_params)
+ if "TOKENIZERS_PARALLELISM" not in os.environ:
+ logger.info("Disabling tokenizer parallelism, we're using DataLoader multithreading already")
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
+ # TODO hack by collating feature_extractor and image_processor
+ feature_extractor = self.feature_extractor if self.feature_extractor is not None else self.image_processor
+ collate_fn = no_collate_fn if batch_size == 1 else pad_collate_fn(self.tokenizer, feature_extractor)
+ dataloader = DataLoader(dataset, num_workers=num_workers, batch_size=batch_size, collate_fn=collate_fn)
+ model_iterator = PipelineIterator(dataloader, self.forward, forward_params, loader_batch_size=batch_size)
+ final_iterator = PipelineIterator(model_iterator, self.postprocess, postprocess_params)
+ return final_iterator
+
+ def __call__(self, inputs, *args, num_workers=None, batch_size=None, **kwargs):
+ if args:
+ logger.warning(f"Ignoring args : {args}")
+
+ if num_workers is None:
+ if self._num_workers is None:
+ num_workers = 0
+ else:
+ num_workers = self._num_workers
+ if batch_size is None:
+ if self._batch_size is None:
+ batch_size = 1
+ else:
+ batch_size = self._batch_size
+
+ preprocess_params, forward_params, postprocess_params = self._sanitize_parameters(**kwargs)
+
+ # Fuse __init__ params and __call__ params without modifying the __init__ ones.
+ preprocess_params = {**self._preprocess_params, **preprocess_params}
+ forward_params = {**self._forward_params, **forward_params}
+ postprocess_params = {**self._postprocess_params, **postprocess_params}
+
+ self.call_count += 1
+ if self.call_count > 10 and self.framework == "pt" and self.device.type == "cuda":
+ logger.warning_once(
+ "You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a"
+ " dataset",
+ )
+
+ is_dataset = Dataset is not None and isinstance(inputs, Dataset)
+ is_generator = isinstance(inputs, types.GeneratorType)
+ is_list = isinstance(inputs, list)
+
+ is_iterable = is_dataset or is_generator or is_list
+
+ # TODO make the get_iterator work also for `tf` (and `flax`).
+ can_use_iterator = self.framework == "pt" and (is_dataset or is_generator or is_list)
+
+ if is_list:
+ if can_use_iterator:
+ final_iterator = self.get_iterator(
+ inputs, num_workers, batch_size, preprocess_params, forward_params, postprocess_params
+ )
+ outputs = list(final_iterator)
+ return outputs
+ else:
+ return self.run_multi(inputs, preprocess_params, forward_params, postprocess_params)
+ elif can_use_iterator:
+ return self.get_iterator(
+ inputs, num_workers, batch_size, preprocess_params, forward_params, postprocess_params
+ )
+ elif is_iterable:
+ return self.iterate(inputs, preprocess_params, forward_params, postprocess_params)
+ elif self.framework == "pt" and isinstance(self, ChunkPipeline):
+ return next(
+ iter(
+ self.get_iterator(
+ [inputs], num_workers, batch_size, preprocess_params, forward_params, postprocess_params
+ )
+ )
+ )
+ else:
+ return self.run_single(inputs, preprocess_params, forward_params, postprocess_params)
+
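+ # Editor's sketch (illustrative usage, not part of the library; `pipe` and
+ # `list_of_texts` are hypothetical): when a list or a dataset is passed with the
+ # PyTorch framework, `__call__` builds a DataLoader-backed iterator, e.g.
+ #
+ #     outputs = pipe(list_of_texts, batch_size=8, num_workers=2)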
+ def run_multi(self, inputs, preprocess_params, forward_params, postprocess_params):
+ return [self.run_single(item, preprocess_params, forward_params, postprocess_params) for item in inputs]
+
+ def run_single(self, inputs, preprocess_params, forward_params, postprocess_params):
+ model_inputs = self.preprocess(inputs, **preprocess_params)
+ model_outputs = self.forward(model_inputs, **forward_params)
+ outputs = self.postprocess(model_outputs, **postprocess_params)
+ return outputs
+
+ def iterate(self, inputs, preprocess_params, forward_params, postprocess_params):
+ # This function should become `get_iterator` again, this is a temporary
+ # easy solution.
+ for input_ in inputs:
+ yield self.run_single(input_, preprocess_params, forward_params, postprocess_params)
+
+
+Pipeline.push_to_hub = copy_func(Pipeline.push_to_hub)
+if Pipeline.push_to_hub.__doc__ is not None:
+ Pipeline.push_to_hub.__doc__ = Pipeline.push_to_hub.__doc__.format(
+ object="pipe", object_class="pipeline", object_files="pipeline file"
+ ).replace(".from_pretrained", "")
+
+
+class ChunkPipeline(Pipeline):
+ def run_single(self, inputs, preprocess_params, forward_params, postprocess_params):
+ all_outputs = []
+ for model_inputs in self.preprocess(inputs, **preprocess_params):
+ model_outputs = self.forward(model_inputs, **forward_params)
+ all_outputs.append(model_outputs)
+ outputs = self.postprocess(all_outputs, **postprocess_params)
+ return outputs
+
+ def get_iterator(
+ self, inputs, num_workers: int, batch_size: int, preprocess_params, forward_params, postprocess_params
+ ):
+ if "TOKENIZERS_PARALLELISM" not in os.environ:
+ logger.info("Disabling tokenizer parallelism, we're using DataLoader multithreading already")
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
+ if num_workers > 1:
+ logger.warning(
+ "For ChunkPipeline using num_workers>0 is likely to result in errors since everything is iterable,"
+ " setting `num_workers=1` to guarantee correctness."
+ )
+ num_workers = 1
+ dataset = PipelineChunkIterator(inputs, self.preprocess, preprocess_params)
+
+ # TODO hack by collating feature_extractor and image_processor
+ feature_extractor = self.feature_extractor if self.feature_extractor is not None else self.image_processor
+ collate_fn = no_collate_fn if batch_size == 1 else pad_collate_fn(self.tokenizer, feature_extractor)
+ dataloader = DataLoader(dataset, num_workers=num_workers, batch_size=batch_size, collate_fn=collate_fn)
+ model_iterator = PipelinePackIterator(dataloader, self.forward, forward_params, loader_batch_size=batch_size)
+ final_iterator = PipelineIterator(model_iterator, self.postprocess, postprocess_params)
+ return final_iterator
+
+
+class PipelineRegistry:
+ def __init__(self, supported_tasks: Dict[str, Any], task_aliases: Dict[str, str]) -> None:
+ self.supported_tasks = supported_tasks
+ self.task_aliases = task_aliases
+
+ def get_supported_tasks(self) -> List[str]:
+ supported_task = list(self.supported_tasks.keys()) + list(self.task_aliases.keys())
+ supported_task.sort()
+ return supported_task
+
+ def check_task(self, task: str) -> Tuple[str, Dict, Any]:
+ if task in self.task_aliases:
+ task = self.task_aliases[task]
+ if task in self.supported_tasks:
+ targeted_task = self.supported_tasks[task]
+ return task, targeted_task, None
+
+ if task.startswith("translation"):
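+            # e.g. "translation_en_to_fr" resolves to the "translation" task with ("en", "fr") as options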
+ tokens = task.split("_")
+ if len(tokens) == 4 and tokens[0] == "translation" and tokens[2] == "to":
+ targeted_task = self.supported_tasks["translation"]
+ task = "translation"
+ return task, targeted_task, (tokens[1], tokens[3])
+ raise KeyError(f"Invalid translation task {task}, use 'translation_XX_to_YY' format")
+
+ raise KeyError(
+ f"Unknown task {task}, available tasks are {self.get_supported_tasks() + ['translation_XX_to_YY']}"
+ )
+
+ def register_pipeline(
+ self,
+ task: str,
+ pipeline_class: type,
+ pt_model: Optional[Union[type, Tuple[type]]] = None,
+ tf_model: Optional[Union[type, Tuple[type]]] = None,
+ default: Optional[Dict] = None,
+ type: Optional[str] = None,
+ ) -> None:
+ if task in self.supported_tasks:
+ logger.warning(f"{task} is already registered. Overwriting pipeline for task {task}...")
+
+ if pt_model is None:
+ pt_model = ()
+ elif not isinstance(pt_model, tuple):
+ pt_model = (pt_model,)
+
+ if tf_model is None:
+ tf_model = ()
+ elif not isinstance(tf_model, tuple):
+ tf_model = (tf_model,)
+
+ task_impl = {"impl": pipeline_class, "pt": pt_model, "tf": tf_model}
+
+ if default is not None:
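+            # Allow shorthand defaults such as {"pt": (...), "tf": (...)} by nesting them under "model".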
+ if "model" not in default and ("pt" in default or "tf" in default):
+ default = {"model": default}
+ task_impl["default"] = default
+
+ if type is not None:
+ task_impl["type"] = type
+
+ self.supported_tasks[task] = task_impl
+ pipeline_class._registered_impl = {task: task_impl}
+
+ def to_dict(self):
+ return self.supported_tasks
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/depth_estimation.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/depth_estimation.py
new file mode 100644
index 0000000000000000000000000000000000000000..2203ac09c9cf9b6e9a51055c60678f5266ddd439
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/depth_estimation.py
@@ -0,0 +1,133 @@
+from typing import List, Union
+
+from ..utils import (
+ add_end_docstrings,
+ is_torch_available,
+ is_vision_available,
+ logging,
+ requires_backends,
+)
+from .base import Pipeline, build_pipeline_init_args
+
+
+if is_vision_available():
+ from PIL import Image
+
+ from ..image_utils import load_image
+
+if is_torch_available():
+ from ..models.auto.modeling_auto import MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES
+
+logger = logging.get_logger(__name__)
+
+
+@add_end_docstrings(build_pipeline_init_args(has_image_processor=True))
+class DepthEstimationPipeline(Pipeline):
+ """
+ Depth estimation pipeline using any `AutoModelForDepthEstimation`. This pipeline predicts the depth of an image.
+
+ Example:
+
+ ```python
+ >>> from transformers import pipeline
+
+ >>> depth_estimator = pipeline(task="depth-estimation", model="LiheYoung/depth-anything-base-hf")
+ >>> output = depth_estimator("http://images.cocodataset.org/val2017/000000039769.jpg")
+ >>> # This is a tensor with the values being the depth expressed in meters for each pixel
+ >>> output["predicted_depth"].shape
+ torch.Size([1, 384, 384])
+ ```
+
+ Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)
+
+
+ This depth estimation pipeline can currently be loaded from [`pipeline`] using the following task identifier:
+ `"depth-estimation"`.
+
+ See the list of available models on [huggingface.co/models](https://huggingface.co/models?filter=depth-estimation).
+ """
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ requires_backends(self, "vision")
+ self.check_model_type(MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES)
+
+ def __call__(self, inputs: Union[str, List[str], "Image.Image", List["Image.Image"]] = None, **kwargs):
+ """
+ Predict the depth(s) of the image(s) passed as inputs.
+
+ Args:
+ inputs (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`):
+ The pipeline handles three types of images:
+
+ - A string containing a http link pointing to an image
+ - A string containing a local path to an image
+ - An image loaded in PIL directly
+
+                The pipeline accepts either a single image or a batch of images, which must then be passed as a list.
+ Images in a batch must all be in the same format: all as http links, all as local paths, or all as PIL
+ images.
+ parameters (`Dict`, *optional*):
+ A dictionary of argument names to parameter values, to control pipeline behaviour.
+ The only parameter available right now is `timeout`, which is the length of time, in seconds,
+ that the pipeline should wait before giving up on trying to download an image.
+ timeout (`float`, *optional*, defaults to None):
+ The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
+ the call may block forever.
+
+ Return:
+ A dictionary or a list of dictionaries containing result. If the input is a single image, will return a
+ dictionary, if the input is a list of several images, will return a list of dictionaries corresponding to
+ the images.
+
+ The dictionaries contain the following keys:
+
+ - **predicted_depth** (`torch.Tensor`) -- The predicted depth by the model as a `torch.Tensor`.
+ - **depth** (`PIL.Image`) -- The predicted depth by the model as a `PIL.Image`.
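+
+        Example of setting a download timeout (a minimal sketch; the two calls below are equivalent
+        ways of passing the same value):
+
+        ```python
+        >>> from transformers import pipeline
+
+        >>> depth_estimator = pipeline(task="depth-estimation", model="LiheYoung/depth-anything-base-hf")
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> _ = depth_estimator(url, timeout=10.0)
+        >>> _ = depth_estimator(url, parameters={"timeout": 10.0})
+        ```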
+ """
+ # After deprecation of this is completed, remove the default `None` value for `images`
+ if "images" in kwargs:
+ inputs = kwargs.pop("images")
+ if inputs is None:
+ raise ValueError("Cannot call the depth-estimation pipeline without an inputs argument!")
+ return super().__call__(inputs, **kwargs)
+
+ def _sanitize_parameters(self, timeout=None, parameters=None, **kwargs):
+ preprocess_params = {}
+ if timeout is not None:
+ preprocess_params["timeout"] = timeout
+ if isinstance(parameters, dict) and "timeout" in parameters:
+ preprocess_params["timeout"] = parameters["timeout"]
+ return preprocess_params, {}, {}
+
+ def preprocess(self, image, timeout=None):
+ image = load_image(image, timeout)
+ model_inputs = self.image_processor(images=image, return_tensors=self.framework)
+ if self.framework == "pt":
+ model_inputs = model_inputs.to(self.torch_dtype)
+ model_inputs["target_size"] = image.size[::-1]
+ return model_inputs
+
+ def _forward(self, model_inputs):
+ target_size = model_inputs.pop("target_size")
+ model_outputs = self.model(**model_inputs)
+ model_outputs["target_size"] = target_size
+ return model_outputs
+
+ def postprocess(self, model_outputs):
+ outputs = self.image_processor.post_process_depth_estimation(
+ model_outputs,
+ # this acts as `source_sizes` for ZoeDepth and as `target_sizes` for the rest of the models so do *not*
+ # replace with `target_sizes = [model_outputs["target_size"]]`
+ [model_outputs["target_size"]],
+ )
+
+ formatted_outputs = []
+ for output in outputs:
+ depth = output["predicted_depth"].detach().cpu().numpy()
+ depth = (depth - depth.min()) / (depth.max() - depth.min())
+ depth = Image.fromarray((depth * 255).astype("uint8"))
+
+ formatted_outputs.append({"predicted_depth": output["predicted_depth"], "depth": depth})
+
+ return formatted_outputs[0] if len(outputs) == 1 else formatted_outputs
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/document_question_answering.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/document_question_answering.py
new file mode 100644
index 0000000000000000000000000000000000000000..c176d841e29fa6c6bb8c6867562f985d181c7138
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/document_question_answering.py
@@ -0,0 +1,516 @@
+# Copyright 2022 The Impira Team and the HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import re
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+
+from ..utils import (
+ ExplicitEnum,
+ add_end_docstrings,
+ is_pytesseract_available,
+ is_torch_available,
+ is_vision_available,
+ logging,
+)
+from .base import ChunkPipeline, build_pipeline_init_args
+from .question_answering import select_starts_ends
+
+
+if is_vision_available():
+ from PIL import Image
+
+ from ..image_utils import load_image
+
+if is_torch_available():
+ import torch
+
+ from ..models.auto.modeling_auto import MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES
+
+TESSERACT_LOADED = False
+if is_pytesseract_available():
+ TESSERACT_LOADED = True
+ import pytesseract
+
+logger = logging.get_logger(__name__)
+
+
+# normalize_bbox() and apply_tesseract() are derived from apply_tesseract in models/layoutlmv3/feature_extraction_layoutlmv3.py.
+# However, because the pipeline may evolve from what layoutlmv3 currently does, it's copied (vs. imported) to avoid creating an
+# unnecessary dependency.
+def normalize_box(box, width, height):
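+    # Rescale absolute pixel coordinates to the 0-1000 integer grid used by LayoutLM-style models,
+    # making the boxes independent of the original image resolution.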
+ return [
+ int(1000 * (box[0] / width)),
+ int(1000 * (box[1] / height)),
+ int(1000 * (box[2] / width)),
+ int(1000 * (box[3] / height)),
+ ]
+
+
+def apply_tesseract(image: "Image.Image", lang: Optional[str], tesseract_config: Optional[str]):
+ """Applies Tesseract OCR on a document image, and returns recognized words + normalized bounding boxes."""
+ # apply OCR
+ data = pytesseract.image_to_data(image, lang=lang, output_type="dict", config=tesseract_config)
+ words, left, top, width, height = data["text"], data["left"], data["top"], data["width"], data["height"]
+
+ # filter empty words and corresponding coordinates
+ irrelevant_indices = [idx for idx, word in enumerate(words) if not word.strip()]
+ words = [word for idx, word in enumerate(words) if idx not in irrelevant_indices]
+ left = [coord for idx, coord in enumerate(left) if idx not in irrelevant_indices]
+ top = [coord for idx, coord in enumerate(top) if idx not in irrelevant_indices]
+ width = [coord for idx, coord in enumerate(width) if idx not in irrelevant_indices]
+ height = [coord for idx, coord in enumerate(height) if idx not in irrelevant_indices]
+
+ # turn coordinates into (left, top, left+width, top+height) format
+ actual_boxes = []
+ for x, y, w, h in zip(left, top, width, height):
+ actual_box = [x, y, x + w, y + h]
+ actual_boxes.append(actual_box)
+
+ image_width, image_height = image.size
+
+ # finally, normalize the bounding boxes
+ normalized_boxes = []
+ for box in actual_boxes:
+ normalized_boxes.append(normalize_box(box, image_width, image_height))
+
+ if len(words) != len(normalized_boxes):
+ raise ValueError("Not as many words as there are bounding boxes")
+
+ return words, normalized_boxes
+
+
+class ModelType(ExplicitEnum):
+ LayoutLM = "layoutlm"
+ LayoutLMv2andv3 = "layoutlmv2andv3"
+ VisionEncoderDecoder = "vision_encoder_decoder"
+
+
+@add_end_docstrings(build_pipeline_init_args(has_image_processor=True, has_tokenizer=True))
+class DocumentQuestionAnsweringPipeline(ChunkPipeline):
+ # TODO: Update task_summary docs to include an example with document QA and then update the first sentence
+ """
+ Document Question Answering pipeline using any `AutoModelForDocumentQuestionAnswering`. The inputs/outputs are
+ similar to the (extractive) question answering pipeline; however, the pipeline takes an image (and optional OCR'd
+ words/boxes) as input instead of text context.
+
+ Example:
+
+ ```python
+ >>> from transformers import pipeline
+
+ >>> document_qa = pipeline(model="impira/layoutlm-document-qa")
+ >>> document_qa(
+ ... image="https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/invoice.png",
+ ... question="What is the invoice number?",
+ ... )
+ [{'score': 0.425, 'answer': 'us-001', 'start': 16, 'end': 16}]
+ ```
+
+ Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)
+
+ This document question answering pipeline can currently be loaded from [`pipeline`] using the following task
+ identifier: `"document-question-answering"`.
+
+ The models that this pipeline can use are models that have been fine-tuned on a document question answering task.
+ See the up-to-date list of available models on
+ [huggingface.co/models](https://huggingface.co/models?filter=document-question-answering).
+ """
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ if self.tokenizer is not None and not self.tokenizer.__class__.__name__.endswith("Fast"):
+ raise ValueError(
+ "`DocumentQuestionAnsweringPipeline` requires a fast tokenizer, but a slow tokenizer "
+ f"(`{self.tokenizer.__class__.__name__}`) is provided."
+ )
+
+ if self.model.config.__class__.__name__ == "VisionEncoderDecoderConfig":
+ self.model_type = ModelType.VisionEncoderDecoder
+ if self.model.config.encoder.model_type != "donut-swin":
+ raise ValueError("Currently, the only supported VisionEncoderDecoder model is Donut")
+ else:
+ self.check_model_type(MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES)
+ if self.model.config.__class__.__name__ == "LayoutLMConfig":
+ self.model_type = ModelType.LayoutLM
+ else:
+ self.model_type = ModelType.LayoutLMv2andv3
+
+ def _sanitize_parameters(
+ self,
+ padding=None,
+ doc_stride=None,
+ max_question_len=None,
+ lang: Optional[str] = None,
+ tesseract_config: Optional[str] = None,
+ max_answer_len=None,
+ max_seq_len=None,
+ top_k=None,
+ handle_impossible_answer=None,
+ timeout=None,
+ **kwargs,
+ ):
+ preprocess_params, postprocess_params = {}, {}
+ if padding is not None:
+ preprocess_params["padding"] = padding
+ if doc_stride is not None:
+ preprocess_params["doc_stride"] = doc_stride
+ if max_question_len is not None:
+ preprocess_params["max_question_len"] = max_question_len
+ if max_seq_len is not None:
+ preprocess_params["max_seq_len"] = max_seq_len
+ if lang is not None:
+ preprocess_params["lang"] = lang
+ if tesseract_config is not None:
+ preprocess_params["tesseract_config"] = tesseract_config
+ if timeout is not None:
+ preprocess_params["timeout"] = timeout
+
+ if top_k is not None:
+ if top_k < 1:
+ raise ValueError(f"top_k parameter should be >= 1 (got {top_k})")
+ postprocess_params["top_k"] = top_k
+ if max_answer_len is not None:
+ if max_answer_len < 1:
+ raise ValueError(f"max_answer_len parameter should be >= 1 (got {max_answer_len}")
+ postprocess_params["max_answer_len"] = max_answer_len
+ if handle_impossible_answer is not None:
+ postprocess_params["handle_impossible_answer"] = handle_impossible_answer
+
+ forward_params = {}
+ if self.assistant_model is not None:
+ forward_params["assistant_model"] = self.assistant_model
+ if self.assistant_tokenizer is not None:
+ forward_params["tokenizer"] = self.tokenizer
+ forward_params["assistant_tokenizer"] = self.assistant_tokenizer
+
+ return preprocess_params, forward_params, postprocess_params
+
+ def __call__(
+ self,
+ image: Union["Image.Image", str],
+ question: Optional[str] = None,
+ word_boxes: Tuple[str, List[float]] = None,
+ **kwargs,
+ ):
+ """
+ Answer the question(s) given as inputs by using the document(s). A document is defined as an image and an
+ optional list of (word, box) tuples which represent the text in the document. If the `word_boxes` are not
+ provided, it will use the Tesseract OCR engine (if available) to extract the words and boxes automatically for
+ LayoutLM-like models which require them as input. For Donut, no OCR is run.
+
+ You can invoke the pipeline several ways:
+
+ - `pipeline(image=image, question=question)`
+ - `pipeline(image=image, question=question, word_boxes=word_boxes)`
+ - `pipeline([{"image": image, "question": question}])`
+ - `pipeline([{"image": image, "question": question, "word_boxes": word_boxes}])`
+
+ Args:
+ image (`str` or `PIL.Image`):
+ The pipeline handles three types of images:
+
+ - A string containing a http link pointing to an image
+ - A string containing a local path to an image
+ - An image loaded in PIL directly
+
+ The pipeline accepts either a single image or a batch of images. If given a single image, it can be
+ broadcasted to multiple questions.
+ question (`str`):
+ A question to ask of the document.
+ word_boxes (`List[str, Tuple[float, float, float, float]]`, *optional*):
+ A list of words and bounding boxes (normalized 0->1000). If you provide this optional input, then the
+ pipeline will use these words and boxes instead of running OCR on the image to derive them for models
+ that need them (e.g. LayoutLM). This allows you to reuse OCR'd results across many invocations of the
+ pipeline without having to re-run it each time.
+ top_k (`int`, *optional*, defaults to 1):
+ The number of answers to return (will be chosen by order of likelihood). Note that we return less than
+ top_k answers if there are not enough options available within the context.
+ doc_stride (`int`, *optional*, defaults to 128):
+ If the words in the document are too long to fit with the question for the model, it will be split in
+ several chunks with some overlap. This argument controls the size of that overlap.
+ max_answer_len (`int`, *optional*, defaults to 15):
+ The maximum length of predicted answers (e.g., only answers with a shorter length are considered).
+ max_seq_len (`int`, *optional*, defaults to 384):
+ The maximum length of the total sentence (context + question) in tokens of each chunk passed to the
+ model. The context will be split in several chunks (using `doc_stride` as overlap) if needed.
+ max_question_len (`int`, *optional*, defaults to 64):
+ The maximum length of the question after tokenization. It will be truncated if needed.
+ handle_impossible_answer (`bool`, *optional*, defaults to `False`):
+ Whether or not we accept impossible as an answer.
+ lang (`str`, *optional*):
+                Language to use while running OCR. Defaults to English.
+ tesseract_config (`str`, *optional*):
+ Additional flags to pass to tesseract while running OCR.
+ timeout (`float`, *optional*, defaults to None):
+ The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
+ the call may block forever.
+
+ Return:
+ A `dict` or a list of `dict`: Each result comes as a dictionary with the following keys:
+
+ - **score** (`float`) -- The probability associated to the answer.
+ - **start** (`int`) -- The start word index of the answer (in the OCR'd version of the input or provided
+ `word_boxes`).
+ - **end** (`int`) -- The end word index of the answer (in the OCR'd version of the input or provided
+ `word_boxes`).
+ - **answer** (`str`) -- The answer to the question.
+ - **words** (`list[int]`) -- The index of each word/box pair that is in the answer
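+
+        Example of reusing OCR results across calls (a minimal sketch; the boxes below are made-up
+        values in the normalized 0->1000 coordinate space):
+
+        ```python
+        >>> from transformers import pipeline
+
+        >>> document_qa = pipeline(model="impira/layoutlm-document-qa")
+        >>> image = "https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/invoice.png"
+        >>> word_boxes = [("Invoice", [65, 30, 230, 60]), ("us-001", [240, 30, 330, 60])]
+        >>> document_qa(image=image, question="What is the invoice number?", word_boxes=word_boxes)
+        ```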
+ """
+ if isinstance(question, str):
+ inputs = {"question": question, "image": image}
+ if word_boxes is not None:
+ inputs["word_boxes"] = word_boxes
+ else:
+ inputs = image
+ return super().__call__(inputs, **kwargs)
+
+ def preprocess(
+ self,
+ input,
+ padding="do_not_pad",
+ doc_stride=None,
+ max_seq_len=None,
+ word_boxes: Tuple[str, List[float]] = None,
+ lang=None,
+ tesseract_config="",
+ timeout=None,
+ ):
+ # NOTE: This code mirrors the code in question answering and will be implemented in a follow up PR
+ # to support documents with enough tokens that overflow the model's window
+ if max_seq_len is None:
+ max_seq_len = self.tokenizer.model_max_length
+
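+        # Stride (overlap) between chunks defaults to half the window, capped at 256 tokens, so an
+        # answer that straddles a chunk boundary still appears fully inside at least one chunk.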
+ if doc_stride is None:
+ doc_stride = min(max_seq_len // 2, 256)
+
+ image = None
+ image_features = {}
+ if input.get("image", None) is not None:
+ image = load_image(input["image"], timeout=timeout)
+ if self.image_processor is not None:
+ image_inputs = self.image_processor(images=image, return_tensors=self.framework)
+ if self.framework == "pt":
+ image_inputs = image_inputs.to(self.torch_dtype)
+ image_features.update(image_inputs)
+ elif self.feature_extractor is not None:
+ image_features.update(self.feature_extractor(images=image, return_tensors=self.framework))
+ elif self.model_type == ModelType.VisionEncoderDecoder:
+ raise ValueError("If you are using a VisionEncoderDecoderModel, you must provide a feature extractor")
+
+ words, boxes = None, None
+ if not self.model_type == ModelType.VisionEncoderDecoder:
+ if "word_boxes" in input:
+ words = [x[0] for x in input["word_boxes"]]
+ boxes = [x[1] for x in input["word_boxes"]]
+ elif "words" in image_features and "boxes" in image_features:
+ words = image_features.pop("words")[0]
+ boxes = image_features.pop("boxes")[0]
+ elif image is not None:
+ if not TESSERACT_LOADED:
+ raise ValueError(
+ "If you provide an image without word_boxes, then the pipeline will run OCR using Tesseract,"
+ " but pytesseract is not available"
+ )
+ if TESSERACT_LOADED:
+ words, boxes = apply_tesseract(image, lang=lang, tesseract_config=tesseract_config)
+ else:
+ raise ValueError(
+ "You must provide an image or word_boxes. If you provide an image, the pipeline will automatically"
+ " run OCR to derive words and boxes"
+ )
+
+ if self.tokenizer.padding_side != "right":
+ raise ValueError(
+ "Document question answering only supports tokenizers whose padding side is 'right', not"
+ f" {self.tokenizer.padding_side}"
+ )
+
+ if self.model_type == ModelType.VisionEncoderDecoder:
+            task_prompt = f'<s_docvqa><s_question>{input["question"]}</s_question><s_answer>'
+ # Adapted from https://huggingface.co/spaces/nielsr/donut-docvqa/blob/main/app.py
+ encoding = {
+ "inputs": image_features["pixel_values"],
+ "decoder_input_ids": self.tokenizer(
+ task_prompt, add_special_tokens=False, return_tensors=self.framework
+ ).input_ids,
+ "return_dict_in_generate": True,
+ }
+ yield {
+ **encoding,
+ "p_mask": None,
+ "word_ids": None,
+ "words": None,
+ "output_attentions": True,
+ "is_last": True,
+ }
+ else:
+ tokenizer_kwargs = {}
+ if self.model_type == ModelType.LayoutLM:
+ tokenizer_kwargs["text"] = input["question"].split()
+ tokenizer_kwargs["text_pair"] = words
+ tokenizer_kwargs["is_split_into_words"] = True
+ else:
+ tokenizer_kwargs["text"] = [input["question"]]
+ tokenizer_kwargs["text_pair"] = [words]
+ tokenizer_kwargs["boxes"] = [boxes]
+
+ encoding = self.tokenizer(
+ padding=padding,
+ max_length=max_seq_len,
+ stride=doc_stride,
+ return_token_type_ids=True,
+ truncation="only_second",
+ return_overflowing_tokens=True,
+ **tokenizer_kwargs,
+ )
+ # TODO: check why slower `LayoutLMTokenizer` and `LayoutLMv2Tokenizer` don't have this key in outputs
+ # FIXME: ydshieh and/or Narsil
+ encoding.pop("overflow_to_sample_mapping", None) # We do not use this
+
+ num_spans = len(encoding["input_ids"])
+
+            # p_mask: mask with 1 for tokens that cannot be in the answer (0 for tokens which can be in an answer)
+ # We put 0 on the tokens from the context and 1 everywhere else (question and special tokens)
+ # This logic mirrors the logic in the question_answering pipeline
+ p_mask = np.array([[tok != 1 for tok in encoding.sequence_ids(span_id)] for span_id in range(num_spans)])
+ for span_idx in range(num_spans):
+ if self.framework == "pt":
+ span_encoding = {k: torch.tensor(v[span_idx : span_idx + 1]) for (k, v) in encoding.items()}
+ if "pixel_values" in image_features:
+ span_encoding["image"] = image_features["pixel_values"]
+ else:
+ raise ValueError("Unsupported: Tensorflow preprocessing for DocumentQuestionAnsweringPipeline")
+
+ input_ids_span_idx = encoding["input_ids"][span_idx]
+ # keep the cls_token unmasked (some models use it to indicate unanswerable questions)
+ if self.tokenizer.cls_token_id is not None:
+ cls_indices = np.nonzero(np.array(input_ids_span_idx) == self.tokenizer.cls_token_id)[0]
+ for cls_index in cls_indices:
+ p_mask[span_idx][cls_index] = 0
+
+ # For each span, place a bounding box [0,0,0,0] for question and CLS tokens, [1000,1000,1000,1000]
+ # for SEP tokens, and the word's bounding box for words in the original document.
+ if "boxes" not in tokenizer_kwargs:
+ bbox = []
+ for input_id, sequence_id, word_id in zip(
+ encoding.input_ids[span_idx],
+ encoding.sequence_ids(span_idx),
+ encoding.word_ids(span_idx),
+ ):
+ if sequence_id == 1:
+ bbox.append(boxes[word_id])
+ elif input_id == self.tokenizer.sep_token_id:
+ bbox.append([1000] * 4)
+ else:
+ bbox.append([0] * 4)
+
+ if self.framework == "pt":
+ span_encoding["bbox"] = torch.tensor(bbox).unsqueeze(0)
+ elif self.framework == "tf":
+ raise ValueError("Unsupported: Tensorflow preprocessing for DocumentQuestionAnsweringPipeline")
+ yield {
+ **span_encoding,
+ "p_mask": p_mask[span_idx],
+ "word_ids": encoding.word_ids(span_idx),
+ "words": words,
+ "is_last": span_idx == num_spans - 1,
+ }
+
+ def _forward(self, model_inputs, **generate_kwargs):
+ p_mask = model_inputs.pop("p_mask", None)
+ word_ids = model_inputs.pop("word_ids", None)
+ words = model_inputs.pop("words", None)
+ is_last = model_inputs.pop("is_last", False)
+
+ if self.model_type == ModelType.VisionEncoderDecoder:
+ # User-defined `generation_config` passed to the pipeline call take precedence
+ if "generation_config" not in generate_kwargs:
+ generate_kwargs["generation_config"] = self.generation_config
+
+ model_outputs = self.model.generate(**model_inputs, **generate_kwargs)
+ else:
+ model_outputs = self.model(**model_inputs)
+
+ model_outputs = dict(model_outputs.items())
+ model_outputs["p_mask"] = p_mask
+ model_outputs["word_ids"] = word_ids
+ model_outputs["words"] = words
+ model_outputs["attention_mask"] = model_inputs.get("attention_mask", None)
+ model_outputs["is_last"] = is_last
+ return model_outputs
+
+ def postprocess(self, model_outputs, top_k=1, **kwargs):
+ if self.model_type == ModelType.VisionEncoderDecoder:
+ answers = [self.postprocess_encoder_decoder_single(o) for o in model_outputs]
+ else:
+ answers = self.postprocess_extractive_qa(model_outputs, top_k=top_k, **kwargs)
+
+ answers = sorted(answers, key=lambda x: x.get("score", 0), reverse=True)[:top_k]
+ return answers
+
+ def postprocess_encoder_decoder_single(self, model_outputs, **kwargs):
+ sequence = self.tokenizer.batch_decode(model_outputs["sequences"])[0]
+
+ # TODO: A lot of this logic is specific to Donut and should probably be handled in the tokenizer
+ # (see https://github.com/huggingface/transformers/pull/18414/files#r961747408 for more context).
+ sequence = sequence.replace(self.tokenizer.eos_token, "").replace(self.tokenizer.pad_token, "")
+ sequence = re.sub(r"<.*?>", "", sequence, count=1).strip() # remove first task start token
+ ret = {
+ "answer": None,
+ }
+
+        answer = re.search(r"<s_answer>(.*)</s_answer>", sequence)
+ if answer is not None:
+ ret["answer"] = answer.group(1).strip()
+ return ret
+
+ def postprocess_extractive_qa(
+ self, model_outputs, top_k=1, handle_impossible_answer=False, max_answer_len=15, **kwargs
+ ):
+ min_null_score = 1000000 # large and positive
+ answers = []
+ for output in model_outputs:
+ words = output["words"]
+
+ starts, ends, scores, min_null_score = select_starts_ends(
+ start=output["start_logits"],
+ end=output["end_logits"],
+ p_mask=output["p_mask"],
+ attention_mask=output["attention_mask"].numpy()
+ if output.get("attention_mask", None) is not None
+ else None,
+ min_null_score=min_null_score,
+ top_k=top_k,
+ handle_impossible_answer=handle_impossible_answer,
+ max_answer_len=max_answer_len,
+ )
+ word_ids = output["word_ids"]
+ for start, end, score in zip(starts, ends, scores):
+ word_start, word_end = word_ids[start], word_ids[end]
+ if word_start is not None and word_end is not None:
+ answers.append(
+ {
+ "score": float(score),
+ "answer": " ".join(words[word_start : word_end + 1]),
+ "start": word_start,
+ "end": word_end,
+ }
+ )
+
+ if handle_impossible_answer:
+ answers.append({"score": min_null_score, "answer": "", "start": 0, "end": 0})
+
+ return answers
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/feature_extraction.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/feature_extraction.py
new file mode 100644
index 0000000000000000000000000000000000000000..7d67a615ac02d29625f51242e1f747b39e6118bd
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/feature_extraction.py
@@ -0,0 +1,86 @@
+from typing import Dict
+
+from ..utils import add_end_docstrings
+from .base import GenericTensor, Pipeline, build_pipeline_init_args
+
+
+@add_end_docstrings(
+ build_pipeline_init_args(has_tokenizer=True, supports_binary_output=False),
+ r"""
+ tokenize_kwargs (`dict`, *optional*):
+ Additional dictionary of keyword arguments passed along to the tokenizer.
+ return_tensors (`bool`, *optional*):
+ If `True`, returns a tensor according to the specified framework, otherwise returns a list.""",
+)
+class FeatureExtractionPipeline(Pipeline):
+ """
+ Feature extraction pipeline uses no model head. This pipeline extracts the hidden states from the base
+ transformer, which can be used as features in downstream tasks.
+
+ Example:
+
+ ```python
+ >>> from transformers import pipeline
+
+ >>> extractor = pipeline(model="google-bert/bert-base-uncased", task="feature-extraction")
+ >>> result = extractor("This is a simple test.", return_tensors=True)
+ >>> result.shape # This is a tensor of shape [1, sequence_length, hidden_dimension] representing the input string.
+ torch.Size([1, 8, 768])
+ ```
+
+ Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)
+
+ This feature extraction pipeline can currently be loaded from [`pipeline`] using the task identifier:
+ `"feature-extraction"`.
+
+ All models may be used for this pipeline. See a list of all models, including community-contributed models on
+ [huggingface.co/models](https://huggingface.co/models).
+ """
+
+ def _sanitize_parameters(self, truncation=None, tokenize_kwargs=None, return_tensors=None, **kwargs):
+ if tokenize_kwargs is None:
+ tokenize_kwargs = {}
+
+ if truncation is not None:
+ if "truncation" in tokenize_kwargs:
+ raise ValueError(
+ "truncation parameter defined twice (given as keyword argument as well as in tokenize_kwargs)"
+ )
+ tokenize_kwargs["truncation"] = truncation
+
+ preprocess_params = tokenize_kwargs
+
+ postprocess_params = {}
+ if return_tensors is not None:
+ postprocess_params["return_tensors"] = return_tensors
+
+ return preprocess_params, {}, postprocess_params
+
+ def preprocess(self, inputs, **tokenize_kwargs) -> Dict[str, GenericTensor]:
+ model_inputs = self.tokenizer(inputs, return_tensors=self.framework, **tokenize_kwargs)
+ return model_inputs
+
+ def _forward(self, model_inputs):
+ model_outputs = self.model(**model_inputs)
+ return model_outputs
+
+ def postprocess(self, model_outputs, return_tensors=False):
+ # [0] is the first available tensor, logits or last_hidden_state.
+ if return_tensors:
+ return model_outputs[0]
+ if self.framework == "pt":
+ return model_outputs[0].tolist()
+ elif self.framework == "tf":
+ return model_outputs[0].numpy().tolist()
+
+ def __call__(self, *args, **kwargs):
+ """
+ Extract the features of the input(s).
+
+ Args:
+ args (`str` or `List[str]`): One or several texts (or one list of texts) to get the features of.
+
+ Return:
+ A nested list of `float`: The features computed by the model.
+ """
+ return super().__call__(*args, **kwargs)
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/fill_mask.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/fill_mask.py
new file mode 100644
index 0000000000000000000000000000000000000000..c14f54118486b971f64b0985fe2dc688de52f863
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/fill_mask.py
@@ -0,0 +1,273 @@
+from typing import Dict
+
+import numpy as np
+
+from ..utils import add_end_docstrings, is_tf_available, is_torch_available, logging
+from .base import GenericTensor, Pipeline, PipelineException, build_pipeline_init_args
+
+
+if is_tf_available():
+ import tensorflow as tf
+
+ from ..tf_utils import stable_softmax
+
+
+if is_torch_available():
+ import torch
+
+
+logger = logging.get_logger(__name__)
+
+
+@add_end_docstrings(
+ build_pipeline_init_args(has_tokenizer=True),
+ r"""
+ top_k (`int`, *optional*, defaults to 5):
+ The number of predictions to return.
+ targets (`str` or `List[str]`, *optional*):
+ When passed, the model will limit the scores to the passed targets instead of looking up in the whole
+ vocab. If the provided targets are not in the model vocab, they will be tokenized and the first resulting
+ token will be used (with a warning, and that might be slower).
+ tokenizer_kwargs (`dict`, *optional*):
+ Additional dictionary of keyword arguments passed along to the tokenizer.""",
+)
+class FillMaskPipeline(Pipeline):
+ """
+ Masked language modeling prediction pipeline using any `ModelWithLMHead`. See the [masked language modeling
+ examples](../task_summary#masked-language-modeling) for more information.
+
+ Example:
+
+ ```python
+ >>> from transformers import pipeline
+
+ >>> fill_masker = pipeline(model="google-bert/bert-base-uncased")
+ >>> fill_masker("This is a simple [MASK].")
+ [{'score': 0.042, 'token': 3291, 'token_str': 'problem', 'sequence': 'this is a simple problem.'}, {'score': 0.031, 'token': 3160, 'token_str': 'question', 'sequence': 'this is a simple question.'}, {'score': 0.03, 'token': 8522, 'token_str': 'equation', 'sequence': 'this is a simple equation.'}, {'score': 0.027, 'token': 2028, 'token_str': 'one', 'sequence': 'this is a simple one.'}, {'score': 0.024, 'token': 3627, 'token_str': 'rule', 'sequence': 'this is a simple rule.'}]
+ ```
+
+ Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)
+
+ This mask filling pipeline can currently be loaded from [`pipeline`] using the following task identifier:
+ `"fill-mask"`.
+
+ The models that this pipeline can use are models that have been trained with a masked language modeling objective,
+ which includes the bi-directional models in the library. See the up-to-date list of available models on
+ [huggingface.co/models](https://huggingface.co/models?filter=fill-mask).
+
+    <Tip>
+
+ This pipeline only works for inputs with exactly one token masked. Experimental: We added support for multiple
+ masks. The returned values are raw model output, and correspond to disjoint probabilities where one might expect
+ joint probabilities (See [discussion](https://github.com/huggingface/transformers/pull/10222)).
+
+    </Tip>
+
+    <Tip>
+
+ This pipeline now supports tokenizer_kwargs. For example try:
+
+ ```python
+ >>> from transformers import pipeline
+
+ >>> fill_masker = pipeline(model="google-bert/bert-base-uncased")
+ >>> tokenizer_kwargs = {"truncation": True}
+ >>> fill_masker(
+ ... "This is a simple [MASK]. " + "...with a large amount of repeated text appended. " * 100,
+ ... tokenizer_kwargs=tokenizer_kwargs,
+ ... )
+ ```
+
+    </Tip>
+
+
+ """
+
+ def get_masked_index(self, input_ids: GenericTensor) -> np.ndarray:
+ if self.framework == "tf":
+ masked_index = tf.where(input_ids == self.tokenizer.mask_token_id).numpy()
+ elif self.framework == "pt":
+ masked_index = torch.nonzero(input_ids == self.tokenizer.mask_token_id, as_tuple=False)
+ else:
+ raise ValueError("Unsupported framework")
+ return masked_index
+
+ def _ensure_exactly_one_mask_token(self, input_ids: GenericTensor) -> np.ndarray:
+ masked_index = self.get_masked_index(input_ids)
+ numel = np.prod(masked_index.shape)
+ if numel < 1:
+ raise PipelineException(
+ "fill-mask",
+ self.model.base_model_prefix,
+ f"No mask_token ({self.tokenizer.mask_token}) found on the input",
+ )
+
+ def ensure_exactly_one_mask_token(self, model_inputs: GenericTensor):
+ if isinstance(model_inputs, list):
+ for model_input in model_inputs:
+ self._ensure_exactly_one_mask_token(model_input["input_ids"][0])
+ else:
+ for input_ids in model_inputs["input_ids"]:
+ self._ensure_exactly_one_mask_token(input_ids)
+
+ def preprocess(
+ self, inputs, return_tensors=None, tokenizer_kwargs=None, **preprocess_parameters
+ ) -> Dict[str, GenericTensor]:
+ if return_tensors is None:
+ return_tensors = self.framework
+ if tokenizer_kwargs is None:
+ tokenizer_kwargs = {}
+
+ model_inputs = self.tokenizer(inputs, return_tensors=return_tensors, **tokenizer_kwargs)
+ self.ensure_exactly_one_mask_token(model_inputs)
+ return model_inputs
+
+ def _forward(self, model_inputs):
+ model_outputs = self.model(**model_inputs)
+ model_outputs["input_ids"] = model_inputs["input_ids"]
+ return model_outputs
+
+ def postprocess(self, model_outputs, top_k=5, target_ids=None):
+ # Cap top_k if there are targets
+ if target_ids is not None and target_ids.shape[0] < top_k:
+ top_k = target_ids.shape[0]
+ input_ids = model_outputs["input_ids"][0]
+ outputs = model_outputs["logits"]
+
+ if self.framework == "tf":
+ masked_index = tf.where(input_ids == self.tokenizer.mask_token_id).numpy()[:, 0]
+
+ outputs = outputs.numpy()
+
+ logits = outputs[0, masked_index, :]
+ probs = stable_softmax(logits, axis=-1)
+ if target_ids is not None:
+ probs = tf.gather_nd(tf.squeeze(probs, 0), target_ids.reshape(-1, 1))
+ probs = tf.expand_dims(probs, 0)
+
+ topk = tf.math.top_k(probs, k=top_k)
+ values, predictions = topk.values.numpy(), topk.indices.numpy()
+ else:
+ masked_index = torch.nonzero(input_ids == self.tokenizer.mask_token_id, as_tuple=False).squeeze(-1)
+ # Fill mask pipeline supports only one ${mask_token} per sample
+
+ logits = outputs[0, masked_index, :]
+ probs = logits.softmax(dim=-1)
+ if target_ids is not None:
+ probs = probs[..., target_ids]
+
+ values, predictions = probs.topk(top_k)
+
+ result = []
+ single_mask = values.shape[0] == 1
+ for i, (_values, _predictions) in enumerate(zip(values.tolist(), predictions.tolist())):
+ row = []
+ for v, p in zip(_values, _predictions):
+ # Copy is important since we're going to modify this array in place
+ tokens = input_ids.numpy().copy()
+ if target_ids is not None:
+ p = target_ids[p].tolist()
+
+ tokens[masked_index[i]] = p
+ # Filter padding out:
+ tokens = tokens[np.where(tokens != self.tokenizer.pad_token_id)]
+ # Originally we skip special tokens to give readable output.
+ # For multi masks though, the other [MASK] would be removed otherwise
+ # making the output look odd, so we add them back
+ sequence = self.tokenizer.decode(tokens, skip_special_tokens=single_mask)
+ proposition = {"score": v, "token": p, "token_str": self.tokenizer.decode([p]), "sequence": sequence}
+ row.append(proposition)
+ result.append(row)
+ if single_mask:
+ return result[0]
+ return result
+
+ def get_target_ids(self, targets, top_k=None):
+ if isinstance(targets, str):
+ targets = [targets]
+ try:
+ vocab = self.tokenizer.get_vocab()
+ except Exception:
+ vocab = {}
+ target_ids = []
+ for target in targets:
+ id_ = vocab.get(target, None)
+ if id_ is None:
+ input_ids = self.tokenizer(
+ target,
+ add_special_tokens=False,
+ return_attention_mask=False,
+ return_token_type_ids=False,
+ max_length=1,
+ truncation=True,
+ )["input_ids"]
+ if len(input_ids) == 0:
+ logger.warning(
+ f"The specified target token `{target}` does not exist in the model vocabulary. "
+ "We cannot replace it with anything meaningful, ignoring it"
+ )
+ continue
+ id_ = input_ids[0]
+ # XXX: If users encounter this pass
+ # it becomes pretty slow, so let's make sure
+ # The warning enables them to fix the input to
+ # get faster performance.
+ logger.warning(
+ f"The specified target token `{target}` does not exist in the model vocabulary. "
+ f"Replacing with `{self.tokenizer.convert_ids_to_tokens(id_)}`."
+ )
+ target_ids.append(id_)
+ target_ids = list(set(target_ids))
+ if len(target_ids) == 0:
+ raise ValueError("At least one target must be provided when passed.")
+ target_ids = np.array(target_ids)
+ return target_ids
+
+ def _sanitize_parameters(self, top_k=None, targets=None, tokenizer_kwargs=None):
+ preprocess_params = {}
+
+ if tokenizer_kwargs is not None:
+ preprocess_params["tokenizer_kwargs"] = tokenizer_kwargs
+
+ postprocess_params = {}
+
+ if targets is not None:
+ target_ids = self.get_target_ids(targets, top_k)
+ postprocess_params["target_ids"] = target_ids
+
+ if top_k is not None:
+ postprocess_params["top_k"] = top_k
+
+ if self.tokenizer.mask_token_id is None:
+ raise PipelineException(
+ "fill-mask", self.model.base_model_prefix, "The tokenizer does not define a `mask_token`."
+ )
+ return preprocess_params, {}, postprocess_params
+
+ def __call__(self, inputs, **kwargs):
+ """
+ Fill the masked token in the text(s) given as inputs.
+
+ Args:
+ inputs (`str` or `List[str]`):
+ One or several texts (or one list of prompts) with masked tokens.
+ targets (`str` or `List[str]`, *optional*):
+ When passed, the model will limit the scores to the passed targets instead of looking up in the whole
+ vocab. If the provided targets are not in the model vocab, they will be tokenized and the first
+ resulting token will be used (with a warning, and that might be slower).
+ top_k (`int`, *optional*):
+ When passed, overrides the number of predictions to return.
+
+ Return:
+ A list or a list of list of `dict`: Each result comes as list of dictionaries with the following keys:
+
+ - **sequence** (`str`) -- The corresponding input with the mask token prediction.
+ - **score** (`float`) -- The corresponding probability.
+ - **token** (`int`) -- The predicted token id (to replace the masked one).
+ - **token_str** (`str`) -- The predicted token (to replace the masked one).
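+
+        Example of restricting predictions with `targets` (a minimal sketch; actual scores depend on
+        the model and are not shown):
+
+        ```python
+        >>> from transformers import pipeline
+
+        >>> fill_masker = pipeline(model="google-bert/bert-base-uncased")
+        >>> fill_masker("The capital of France is [MASK].", targets=["paris", "london"], top_k=2)
+        ```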
+ """
+ outputs = super().__call__(inputs, **kwargs)
+ if isinstance(inputs, list) and len(inputs) == 1:
+ return outputs[0]
+ return outputs
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/image_classification.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/image_classification.py
new file mode 100644
index 0000000000000000000000000000000000000000..0085e5eb73f826598dae8461a15431e3e5ef8f80
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/image_classification.py
@@ -0,0 +1,226 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import List, Union
+
+import numpy as np
+
+from ..utils import (
+ ExplicitEnum,
+ add_end_docstrings,
+ is_tf_available,
+ is_torch_available,
+ is_vision_available,
+ logging,
+ requires_backends,
+)
+from .base import Pipeline, build_pipeline_init_args
+
+
+if is_vision_available():
+ from PIL import Image
+
+ from ..image_utils import load_image
+
+if is_tf_available():
+ from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES
+
+if is_torch_available():
+ import torch
+
+ from ..models.auto.modeling_auto import MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES
+
+logger = logging.get_logger(__name__)
+
+
+# Copied from transformers.pipelines.text_classification.sigmoid
+def sigmoid(_outputs):
+ return 1.0 / (1.0 + np.exp(-_outputs))
+
+
+# Copied from transformers.pipelines.text_classification.softmax
+def softmax(_outputs):
+ maxes = np.max(_outputs, axis=-1, keepdims=True)
+ shifted_exp = np.exp(_outputs - maxes)
+ return shifted_exp / shifted_exp.sum(axis=-1, keepdims=True)
+
+
+# Copied from transformers.pipelines.text_classification.ClassificationFunction
+class ClassificationFunction(ExplicitEnum):
+ SIGMOID = "sigmoid"
+ SOFTMAX = "softmax"
+ NONE = "none"
+
+
+@add_end_docstrings(
+ build_pipeline_init_args(has_image_processor=True),
+ r"""
+ function_to_apply (`str`, *optional*, defaults to `"default"`):
+ The function to apply to the model outputs in order to retrieve the scores. Accepts four different values:
+
+ - `"default"`: if the model has a single label, will apply the sigmoid function on the output. If the model
+ has several labels, will apply the softmax function on the output.
+ - `"sigmoid"`: Applies the sigmoid function on the output.
+ - `"softmax"`: Applies the softmax function on the output.
+ - `"none"`: Does not apply any function on the output.""",
+)
+class ImageClassificationPipeline(Pipeline):
+ """
+ Image classification pipeline using any `AutoModelForImageClassification`. This pipeline predicts the class of an
+ image.
+
+ Example:
+
+ ```python
+ >>> from transformers import pipeline
+
+ >>> classifier = pipeline(model="microsoft/beit-base-patch16-224-pt22k-ft22k")
+ >>> classifier("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png")
+ [{'score': 0.442, 'label': 'macaw'}, {'score': 0.088, 'label': 'popinjay'}, {'score': 0.075, 'label': 'parrot'}, {'score': 0.073, 'label': 'parodist, lampooner'}, {'score': 0.046, 'label': 'poll, poll_parrot'}]
+ ```
+
+ Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)
+
+ This image classification pipeline can currently be loaded from [`pipeline`] using the following task identifier:
+ `"image-classification"`.
+
+ See the list of available models on
+ [huggingface.co/models](https://huggingface.co/models?filter=image-classification).
+ """
+
+ function_to_apply: ClassificationFunction = ClassificationFunction.NONE
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ requires_backends(self, "vision")
+ self.check_model_type(
+ TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES
+ if self.framework == "tf"
+ else MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES
+ )
+
+ def _sanitize_parameters(self, top_k=None, function_to_apply=None, timeout=None):
+ preprocess_params = {}
+ if timeout is not None:
+ preprocess_params["timeout"] = timeout
+ postprocess_params = {}
+ if top_k is not None:
+ postprocess_params["top_k"] = top_k
+ if isinstance(function_to_apply, str):
+ function_to_apply = ClassificationFunction(function_to_apply.lower())
+ if function_to_apply is not None:
+ postprocess_params["function_to_apply"] = function_to_apply
+ return preprocess_params, {}, postprocess_params
+
+ def __call__(self, inputs: Union[str, List[str], "Image.Image", List["Image.Image"]] = None, **kwargs):
+ """
+ Assign labels to the image(s) passed as inputs.
+
+ Args:
+ inputs (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`):
+ The pipeline handles three types of images:
+
+ - A string containing a http link pointing to an image
+ - A string containing a local path to an image
+ - An image loaded in PIL directly
+
+                The pipeline accepts either a single image or a batch of images, which must then be passed as a list.
+ Images in a batch must all be in the same format: all as http links, all as local paths, or all as PIL
+ images.
+ function_to_apply (`str`, *optional*, defaults to `"default"`):
+ The function to apply to the model outputs in order to retrieve the scores. Accepts four different
+ values:
+
+ If this argument is not specified, then it will apply the following functions according to the number
+ of labels:
+
+ - If the model has a single label, will apply the sigmoid function on the output.
+ - If the model has several labels, will apply the softmax function on the output.
+
+ Possible values are:
+
+ - `"sigmoid"`: Applies the sigmoid function on the output.
+ - `"softmax"`: Applies the softmax function on the output.
+ - `"none"`: Does not apply any function on the output.
+ top_k (`int`, *optional*, defaults to 5):
+ The number of top labels that will be returned by the pipeline. If the provided number is higher than
+ the number of labels available in the model configuration, it will default to the number of labels.
+ timeout (`float`, *optional*, defaults to None):
+ The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
+ the call may block forever.
+
+ Return:
+ A dictionary or a list of dictionaries containing result. If the input is a single image, will return a
+ dictionary, if the input is a list of several images, will return a list of dictionaries corresponding to
+ the images.
+
+ The dictionaries contain the following keys:
+
+ - **label** (`str`) -- The label identified by the model.
+ - **score** (`int`) -- The score attributed by the model for that label.
+ """
+ # After deprecation of this is completed, remove the default `None` value for `images`
+ if "images" in kwargs:
+ inputs = kwargs.pop("images")
+ if inputs is None:
+ raise ValueError("Cannot call the image-classification pipeline without an inputs argument!")
+ return super().__call__(inputs, **kwargs)
+
+ def preprocess(self, image, timeout=None):
+ image = load_image(image, timeout=timeout)
+ model_inputs = self.image_processor(images=image, return_tensors=self.framework)
+ if self.framework == "pt":
+ model_inputs = model_inputs.to(self.torch_dtype)
+ return model_inputs
+
+ def _forward(self, model_inputs):
+ model_outputs = self.model(**model_inputs)
+ return model_outputs
+
+ def postprocess(self, model_outputs, function_to_apply=None, top_k=5):
+ if function_to_apply is None:
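+            # Multi-label (or single-output) heads need independent per-label probabilities, hence
+            # sigmoid; single-label multi-class heads get a softmax distribution instead.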
+            if self.model.config.problem_type == "multi_label_classification" or self.model.config.num_labels == 1:
+                function_to_apply = ClassificationFunction.SIGMOID
+            elif self.model.config.problem_type == "single_label_classification" or self.model.config.num_labels > 1:
+                function_to_apply = ClassificationFunction.SOFTMAX
+ elif hasattr(self.model.config, "function_to_apply") and function_to_apply is None:
+ function_to_apply = self.model.config.function_to_apply
+ else:
+ function_to_apply = ClassificationFunction.NONE
+
+ if top_k > self.model.config.num_labels:
+ top_k = self.model.config.num_labels
+
+ outputs = model_outputs["logits"][0]
+ if self.framework == "pt" and outputs.dtype in (torch.bfloat16, torch.float16):
+ outputs = outputs.to(torch.float32).numpy()
+ else:
+ outputs = outputs.numpy()
+
+ if function_to_apply == ClassificationFunction.SIGMOID:
+ scores = sigmoid(outputs)
+ elif function_to_apply == ClassificationFunction.SOFTMAX:
+ scores = softmax(outputs)
+ elif function_to_apply == ClassificationFunction.NONE:
+ scores = outputs
+ else:
+ raise ValueError(f"Unrecognized `function_to_apply` argument: {function_to_apply}")
+
+ dict_scores = [
+ {"label": self.model.config.id2label[i], "score": score.item()} for i, score in enumerate(scores)
+ ]
+ dict_scores.sort(key=lambda x: x["score"], reverse=True)
+ if top_k is not None:
+ dict_scores = dict_scores[:top_k]
+
+ return dict_scores
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/image_feature_extraction.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/image_feature_extraction.py
new file mode 100644
index 0000000000000000000000000000000000000000..391eb2b3aec714dbac61fe46bddc7ee74f10cd2f
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/image_feature_extraction.py
@@ -0,0 +1,112 @@
+from typing import Dict
+
+from ..utils import add_end_docstrings, is_vision_available
+from .base import GenericTensor, Pipeline, build_pipeline_init_args
+
+
+if is_vision_available():
+ from ..image_utils import load_image
+
+
+@add_end_docstrings(
+ build_pipeline_init_args(has_image_processor=True),
+ """
+ image_processor_kwargs (`dict`, *optional*):
+ Additional dictionary of keyword arguments passed along to the image processor e.g.
+ {"size": {"height": 100, "width": 100}}
+ pool (`bool`, *optional*, defaults to `False`):
+ Whether or not to return the pooled output. If `False`, the model will return the raw hidden states.
+ """,
+)
+class ImageFeatureExtractionPipeline(Pipeline):
+ """
+ Image feature extraction pipeline uses no model head. This pipeline extracts the hidden states from the base
+ transformer, which can be used as features in downstream tasks.
+
+ Example:
+
+ ```python
+ >>> from transformers import pipeline
+
+ >>> extractor = pipeline(model="google/vit-base-patch16-224", task="image-feature-extraction")
+ >>> result = extractor("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png", return_tensors=True)
+    >>> result.shape # This is a tensor of shape [1, sequence_length, hidden_dimension] representing the input image.
+ torch.Size([1, 197, 768])
+ ```
+
+ Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)
+
+ This image feature extraction pipeline can currently be loaded from [`pipeline`] using the task identifier:
+ `"image-feature-extraction"`.
+
+ All vision models may be used for this pipeline. See a list of all models, including community-contributed models on
+ [huggingface.co/models](https://huggingface.co/models).
+ """
+
+ def _sanitize_parameters(self, image_processor_kwargs=None, return_tensors=None, pool=None, **kwargs):
+ preprocess_params = {} if image_processor_kwargs is None else image_processor_kwargs
+
+ postprocess_params = {}
+ if pool is not None:
+ postprocess_params["pool"] = pool
+ if return_tensors is not None:
+ postprocess_params["return_tensors"] = return_tensors
+
+ if "timeout" in kwargs:
+ preprocess_params["timeout"] = kwargs["timeout"]
+
+ return preprocess_params, {}, postprocess_params
+
+ def preprocess(self, image, timeout=None, **image_processor_kwargs) -> Dict[str, GenericTensor]:
+ image = load_image(image, timeout=timeout)
+ model_inputs = self.image_processor(image, return_tensors=self.framework, **image_processor_kwargs)
+ if self.framework == "pt":
+ model_inputs = model_inputs.to(self.torch_dtype)
+ return model_inputs
+
+ def _forward(self, model_inputs):
+ model_outputs = self.model(**model_inputs)
+ return model_outputs
+
+ def postprocess(self, model_outputs, pool=None, return_tensors=False):
+ pool = pool if pool is not None else False
+
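+        # `pooler_output` is one pooled vector per image (e.g. a CLS-token pooling head), whereas
+        # the default branch below returns the first output tensor (typically the full hidden states).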
+ if pool:
+ if "pooler_output" not in model_outputs:
+ raise ValueError(
+ "No pooled output was returned. Make sure the model has a `pooler` layer when using the `pool` option."
+ )
+ outputs = model_outputs["pooler_output"]
+ else:
+ # [0] is the first available tensor, logits or last_hidden_state.
+ outputs = model_outputs[0]
+
+ if return_tensors:
+ return outputs
+ if self.framework == "pt":
+ return outputs.tolist()
+ elif self.framework == "tf":
+ return outputs.numpy().tolist()
+
+ def __call__(self, *args, **kwargs):
+ """
+ Extract the features of the input(s).
+
+ Args:
+ images (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`):
+ The pipeline handles three types of images:
+
+ - A string containing a http link pointing to an image
+ - A string containing a local path to an image
+ - An image loaded in PIL directly
+
+                The pipeline accepts either a single image or a batch of images, which must then be passed as a list.
+ Images in a batch must all be in the same format: all as http links, all as local paths, or all as PIL
+ images.
+ timeout (`float`, *optional*, defaults to None):
+ The maximum time in seconds to wait for fetching images from the web. If None, no timeout is used and
+ the call may block forever.
+ Return:
+ A nested list of `float`: The features computed by the model.
+ """
+ return super().__call__(*args, **kwargs)
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/image_segmentation.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/image_segmentation.py
new file mode 100644
index 0000000000000000000000000000000000000000..d388e591bf9df45c4905a6c8ff86fdce1e123906
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/image_segmentation.py
@@ -0,0 +1,220 @@
+from typing import Any, Dict, List, Union
+
+import numpy as np
+
+from ..utils import add_end_docstrings, is_torch_available, is_vision_available, logging, requires_backends
+from .base import Pipeline, build_pipeline_init_args
+
+
+if is_vision_available():
+ from PIL import Image
+
+ from ..image_utils import load_image
+
+if is_torch_available():
+ from ..models.auto.modeling_auto import (
+ MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES,
+ MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING_NAMES,
+ MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES,
+ MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES,
+ )
+
+
+logger = logging.get_logger(__name__)
+
+
+Prediction = Dict[str, Any]
+Predictions = List[Prediction]
+
+
+@add_end_docstrings(build_pipeline_init_args(has_image_processor=True))
+class ImageSegmentationPipeline(Pipeline):
+ """
+ Image segmentation pipeline using any `AutoModelForXXXSegmentation`. This pipeline predicts masks of objects and
+ their classes.
+
+ Example:
+
+ ```python
+ >>> from transformers import pipeline
+
+ >>> segmenter = pipeline(model="facebook/detr-resnet-50-panoptic")
+ >>> segments = segmenter("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png")
+ >>> len(segments)
+ 2
+
+ >>> segments[0]["label"]
+ 'bird'
+
+ >>> segments[1]["label"]
+ 'bird'
+
+ >>> type(segments[0]["mask"]) # This is a black and white mask showing where is the bird on the original image.
+
+
+ >>> segments[0]["mask"].size
+ (768, 512)
+ ```
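+
+ A hedged sketch (not from the original docstring) showing explicit control of the subtask and the score threshold, reusing the `segmenter` above:
+
+ ```python
+ >>> segments = segmenter(
+ ...     "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png",
+ ...     subtask="panoptic",
+ ...     threshold=0.95,
+ ... )
+ >>> # Each entry carries "label", "score" and a PIL "mask"; a higher threshold keeps fewer, more confident segments.
+ ```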
+
+
+ This image segmentation pipeline can currently be loaded from [`pipeline`] using the following task identifier:
+ `"image-segmentation"`.
+
+ See the list of available models on
+ [huggingface.co/models](https://huggingface.co/models?filter=image-segmentation).
+ """
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+
+ if self.framework == "tf":
+ raise ValueError(f"The {self.__class__} is only available in PyTorch.")
+
+ requires_backends(self, "vision")
+ mapping = MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES.copy()
+ mapping.update(MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES)
+ mapping.update(MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING_NAMES)
+ mapping.update(MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES)
+ self.check_model_type(mapping)
+
+ def _sanitize_parameters(self, **kwargs):
+ preprocess_kwargs = {}
+ postprocess_kwargs = {}
+ if "subtask" in kwargs:
+ postprocess_kwargs["subtask"] = kwargs["subtask"]
+ preprocess_kwargs["subtask"] = kwargs["subtask"]
+ if "threshold" in kwargs:
+ postprocess_kwargs["threshold"] = kwargs["threshold"]
+ if "mask_threshold" in kwargs:
+ postprocess_kwargs["mask_threshold"] = kwargs["mask_threshold"]
+ if "overlap_mask_area_threshold" in kwargs:
+ postprocess_kwargs["overlap_mask_area_threshold"] = kwargs["overlap_mask_area_threshold"]
+ if "timeout" in kwargs:
+ preprocess_kwargs["timeout"] = kwargs["timeout"]
+
+ return preprocess_kwargs, {}, postprocess_kwargs
+
+ def __call__(self, inputs=None, **kwargs) -> Union[Predictions, List[Prediction]]:
+ """
+ Perform segmentation (detect masks & classes) in the image(s) passed as inputs.
+
+ Args:
+ inputs (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`):
+ The pipeline handles three types of images:
+
+ - A string containing an HTTP(S) link pointing to an image
+ - A string containing a local path to an image
+ - An image loaded in PIL directly
+
+ The pipeline accepts either a single image or a batch of images. Images in a batch must all be in the
+ same format: all as HTTP(S) links, all as local paths, or all as PIL images.
+ subtask (`str`, *optional*):
+ Segmentation task to be performed, chosen from `"semantic"`, `"instance"` and `"panoptic"` depending on
+ model capabilities. If not set, the pipeline will attempt to resolve it in the following order:
+ `panoptic`, `instance`, `semantic`.
+ threshold (`float`, *optional*, defaults to 0.9):
+ Probability threshold to filter out predicted masks.
+ mask_threshold (`float`, *optional*, defaults to 0.5):
+ Threshold to use when turning the predicted masks into binary values.
+ overlap_mask_area_threshold (`float`, *optional*, defaults to 0.5):
+ Mask overlap threshold to eliminate small, disconnected segments.
+ timeout (`float`, *optional*, defaults to None):
+ The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
+ the call may block forever.
+
+ Return:
+ A list of dictionaries containing the result. If the input is a single image, a list of dictionaries is
+ returned; if the input is a list of several images, a list of lists of dictionaries is returned, one list
+ per image.
+
+ The dictionaries contain the mask, label and score (where applicable) of each detected object and have
+ the following keys:
+
+ - **label** (`str`) -- The class label identified by the model.
+ - **mask** (`PIL.Image`) -- A binary mask of the detected object as a PIL Image of shape (width, height) of
+ the original image. Returns a mask filled with zeros if no object is found.
+ - **score** (*optional* `float`) -- The score attributed by the model to the detected object, when the model
+ is capable of estimating a confidence for the "object" described by the label and the mask.
+ """
+ # After this deprecation is completed, remove the default `None` value for `inputs`
+ if "images" in kwargs:
+ inputs = kwargs.pop("images")
+ if inputs is None:
+ raise ValueError("Cannot call the image-classification pipeline without an inputs argument!")
+ return super().__call__(inputs, **kwargs)
+
+ def preprocess(self, image, subtask=None, timeout=None):
+ image = load_image(image, timeout=timeout)
+ target_size = [(image.height, image.width)]
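+ # OneFormer expects an additional text prompt ("task_inputs") naming the requested subtask; it is tokenized into ids below.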
+ if self.model.config.__class__.__name__ == "OneFormerConfig":
+ if subtask is None:
+ kwargs = {}
+ else:
+ kwargs = {"task_inputs": [subtask]}
+ inputs = self.image_processor(images=[image], return_tensors="pt", **kwargs)
+ if self.framework == "pt":
+ inputs = inputs.to(self.torch_dtype)
+ inputs["task_inputs"] = self.tokenizer(
+ inputs["task_inputs"],
+ padding="max_length",
+ max_length=self.model.config.task_seq_len,
+ return_tensors=self.framework,
+ )["input_ids"]
+ else:
+ inputs = self.image_processor(images=[image], return_tensors="pt")
+ if self.framework == "pt":
+ inputs = inputs.to(self.torch_dtype)
+ inputs["target_size"] = target_size
+ return inputs
+
+ def _forward(self, model_inputs):
+ target_size = model_inputs.pop("target_size")
+ model_outputs = self.model(**model_inputs)
+ model_outputs["target_size"] = target_size
+ return model_outputs
+
+ def postprocess(
+ self, model_outputs, subtask=None, threshold=0.9, mask_threshold=0.5, overlap_mask_area_threshold=0.5
+ ):
+ fn = None
+ if subtask in {"panoptic", None} and hasattr(self.image_processor, "post_process_panoptic_segmentation"):
+ fn = self.image_processor.post_process_panoptic_segmentation
+ elif subtask in {"instance", None} and hasattr(self.image_processor, "post_process_instance_segmentation"):
+ fn = self.image_processor.post_process_instance_segmentation
+
+ if fn is not None:
+ outputs = fn(
+ model_outputs,
+ threshold=threshold,
+ mask_threshold=mask_threshold,
+ overlap_mask_area_threshold=overlap_mask_area_threshold,
+ target_sizes=model_outputs["target_size"],
+ )[0]
+
+ annotation = []
+ segmentation = outputs["segmentation"]
+
+ for segment in outputs["segments_info"]:
+ mask = (segmentation == segment["id"]) * 255
+ mask = Image.fromarray(mask.numpy().astype(np.uint8), mode="L")
+ label = self.model.config.id2label[segment["label_id"]]
+ score = segment["score"]
+ annotation.append({"score": score, "label": label, "mask": mask})
+
+ elif subtask in {"semantic", None} and hasattr(self.image_processor, "post_process_semantic_segmentation"):
+ outputs = self.image_processor.post_process_semantic_segmentation(
+ model_outputs, target_sizes=model_outputs["target_size"]
+ )[0]
+
+ annotation = []
+ segmentation = outputs.numpy()
+ labels = np.unique(segmentation)
+
+ for label in labels:
+ mask = (segmentation == label) * 255
+ mask = Image.fromarray(mask.astype(np.uint8), mode="L")
+ label = self.model.config.id2label[label]
+ annotation.append({"score": None, "label": label, "mask": mask})
+ else:
+ raise ValueError(f"Subtask {subtask} is not supported for model {type(self.model)}")
+ return annotation
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/image_text_to_text.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/image_text_to_text.py
new file mode 100644
index 0000000000000000000000000000000000000000..5afba0d7c0410ed5ee7a0f4d53d0f791b43c6f8c
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/image_text_to_text.py
@@ -0,0 +1,432 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import enum
+from typing import Dict, List, Optional, Union
+
+from ..processing_utils import ProcessingKwargs, Unpack
+from ..utils import (
+ add_end_docstrings,
+ is_torch_available,
+ is_vision_available,
+ logging,
+ requires_backends,
+)
+from .base import Pipeline, build_pipeline_init_args
+
+
+if is_vision_available():
+ from PIL import Image
+
+ from ..image_utils import load_images, valid_images
+
+
+if is_torch_available():
+ from ..models.auto.modeling_auto import MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES
+ from .pt_utils import KeyDataset
+
+logger = logging.get_logger(__name__)
+
+IMAGE_TOKEN = ""
+
+
+class ReturnType(enum.Enum):
+ TENSORS = 0
+ NEW_TEXT = 1
+ FULL_TEXT = 2
+
+
+class Chat:
+ """This class is intended to just be used internally in this pipeline and not exposed to users. We convert chats
+ to this format because the rest of the pipeline code tends to assume that lists of messages are
+ actually a batch of samples rather than messages in the same conversation."""
+
+ def __init__(self, messages: Dict, images: Union[str, List[str], "Image.Image", List["Image.Image"]]):
+ for message in messages:
+ if not ("role" in message and "content" in message):
+ raise ValueError("When passing chat dicts as input, each dict must have a 'role' and 'content' key.")
+ images = retrieve_images_in_messages(messages, images)
+
+ self.messages = messages
+ self.images = images
+
+
+def retrieve_images_in_messages(
+ messages: dict, images: Optional[Union[str, List[str], "Image.Image", List["Image.Image"]]]
+):
+ """
+ Retrieve and combine images from the chat and the images passed as input.
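+
+ As a rough illustration (not part of the original docstring): an image entry in a message may either embed the
+ image itself (e.g. `{"type": "image", "url": "..."}`), in which case it is collected directly, or be a bare
+ `{"type": "image"}`, in which case the next image from the `images` argument is used instead.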
+ """
+ if images is None:
+ images = []
+ idx_images = 0
+ retrieved_images = []
+ for message in messages:
+ for content in message["content"]:
+ if isinstance(content, dict):
+ if content.get("type") == "image":
+ for key in ["image", "url", "path", "base64"]:
+ if key in content:
+ retrieved_images.append(content[key])
+ break
+ else:
+ if idx_images < len(images):
+ retrieved_images.append(images[idx_images])
+ idx_images += 1
+ else:
+ raise ValueError(
+ "The number of images in the chat messages should be the same as the number of images passed to the pipeline."
+ )
+ # Add support for OpenAI/TGI chat format
+ elif content.get("type") == "image_url":
+ if isinstance(content.get("image_url"), dict) and "url" in content["image_url"]:
+ retrieved_images.append(content["image_url"]["url"])
+ # Rewrite content to be in the Transformers chat format
+ content["type"] = "image"
+ content["image"] = content["image_url"]["url"]
+ del content["image_url"]
+ else:
+ raise ValueError(
+ "Wrong format for 'image_url' content type. The content should have an 'image_url' dict with a 'url' key."
+ )
+
+ # The number of images passed should be consistent with the number of images in the chat without an image key
+ if idx_images != len(images):
+ raise ValueError(
+ "The number of images in the chat messages should be the same as the number of images passed to the pipeline."
+ )
+
+ return retrieved_images
+
+
+@add_end_docstrings(build_pipeline_init_args(has_processor=True))
+class ImageTextToTextPipeline(Pipeline):
+ """
+ Image-text-to-text pipeline using an `AutoModelForImageTextToText`. This pipeline generates text given an image and text.
+ When the underlying model is a conversational model, it can also accept one or more chats,
+ in which case the pipeline will operate in chat mode and will continue the chat(s) by adding its response(s).
+ Each chat takes the form of a list of dicts, where each dict contains "role" and "content" keys.
+
+ Example:
+
+ ```python
+ >>> from transformers import pipeline
+
+ >>> pipe = pipeline(task="image-text-to-text", model="Salesforce/blip-image-captioning-base")
+ >>> pipe("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png", text="A photo of")
+ [{'generated_text': 'a photo of two birds'}]
+ ```
+
+ ```python
+ >>> from transformers import pipeline
+
+ >>> pipe = pipeline("image-text-to-text", model="llava-hf/llava-interleave-qwen-0.5b-hf")
+ >>> messages = [
+ >>> {
+ >>> "role": "user",
+ >>> "content": [
+ >>> {
+ >>> "type": "image",
+ >>> "url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
+ >>> },
+ >>> {"type": "text", "text": "Describe this image."},
+ >>> ],
+ >>> },
+ >>> {
+ >>> "role": "assistant",
+ >>> "content": [
+ >>> {"type": "text", "text": "There is a dog and"},
+ >>> ],
+ >>> },
+ >>> ]
+ >>> pipe(text=messages, max_new_tokens=20, return_full_text=False)
+ [{'input_text': [{'role': 'user',
+ 'content': [{'type': 'image',
+ 'url': 'https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg'},
+ {'type': 'text', 'text': 'Describe this image.'}]},
+ {'role': 'assistant',
+ 'content': [{'type': 'text', 'text': 'There is a dog and'}]}],
+ 'generated_text': ' a person in the image. The dog is sitting on the sand, and the person is sitting on'}]
+ ```
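+
+ A hedged sketch (not part of the original docs): the image can also be supplied through the `images` argument, as a list, when the chat's image entry does not carry an "image", "url", "path" or "base64" key:
+
+ ```python
+ >>> messages = [
+ >>>     {
+ >>>         "role": "user",
+ >>>         "content": [
+ >>>             {"type": "image"},
+ >>>             {"type": "text", "text": "Describe this image."},
+ >>>         ],
+ >>>     },
+ >>> ]
+ >>> pipe(images=["https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"], text=messages, max_new_tokens=20)
+ ```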
+
+ Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)
+
+ This image-text-to-text pipeline can currently be loaded from pipeline() using the following task identifier:
+ "image-text-to-text".
+
+ See the list of available models on
+ [huggingface.co/models](https://huggingface.co/models?pipeline_tag=image-text-to-text).
+ """
+
+ _load_processor = True
+ _load_image_processor = False
+ _load_feature_extractor = False
+ _load_tokenizer = False
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ requires_backends(self, "vision")
+ self.check_model_type(MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES)
+
+ def _sanitize_parameters(
+ self,
+ max_new_tokens=None,
+ generate_kwargs=None,
+ timeout=None,
+ return_full_text=None,
+ return_tensors=None,
+ return_type=None,
+ continue_final_message=None,
+ **kwargs: Unpack[ProcessingKwargs],
+ ):
+ forward_kwargs = {}
+ preprocess_params = {}
+ postprocess_params = {}
+
+ preprocess_params["processing_kwargs"] = kwargs
+
+ if timeout is not None:
+ preprocess_params["timeout"] = timeout
+
+ if continue_final_message is not None:
+ preprocess_params["continue_final_message"] = continue_final_message
+
+ if generate_kwargs is not None:
+ forward_kwargs["generate_kwargs"] = generate_kwargs
+
+ if max_new_tokens is not None:
+ if "generate_kwargs" not in forward_kwargs:
+ forward_kwargs["generate_kwargs"] = {}
+ if "max_new_tokens" in forward_kwargs["generate_kwargs"]:
+ raise ValueError(
+ "'max_new_tokens' is defined twice, once in 'generate_kwargs' and once as a direct parameter,"
+ " please use only one"
+ )
+ forward_kwargs["generate_kwargs"]["max_new_tokens"] = max_new_tokens
+
+ if return_full_text is not None and return_type is None:
+ if return_tensors is not None:
+ raise ValueError("`return_full_text` is mutually exclusive with `return_tensors`")
+ return_type = ReturnType.FULL_TEXT if return_full_text else ReturnType.NEW_TEXT
+ if return_tensors is not None and return_type is None:
+ return_type = ReturnType.TENSORS
+ if return_type is not None:
+ postprocess_params["return_type"] = return_type
+ if continue_final_message is not None:
+ postprocess_params["continue_final_message"] = continue_final_message
+
+ return preprocess_params, forward_kwargs, postprocess_params
+
+ def __call__(
+ self,
+ images: Optional[
+ Union[str, List[str], List[List[str]], "Image.Image", List["Image.Image"], List[List["Image.Image"]]]
+ ] = None,
+ text: Optional[Union[str, List[str], List[dict]]] = None,
+ **kwargs,
+ ):
+ """
+ Generate a text given text and the image(s) passed as inputs.
+
+ Args:
+ images (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`):
+ The pipeline handles three types of images:
+
+ - A string containing an HTTP(S) link pointing to an image
+ - A string containing a local path to an image
+ - An image loaded in PIL directly
+
+ The pipeline accepts either a single image or a batch of images.
+ text (`str`, `List[str]`, `List[Dict[str, Union[str, PIL.Image]]]`):
+ The text to be used for generation. If a list of strings is passed, the length of the list should be the
+ same as the number of images. Text can also follow the chat format: a list of dictionaries where each
+ dictionary represents a message in a conversation. Each dictionary should have two keys: 'role' and
+ 'content'. 'role' should be one of 'user', 'system' or 'assistant'. 'content' should be a list of
+ dictionaries containing the text of the message and the type of the message. The type of the message can
+ be either 'text' or 'image'. If the type is 'image', no text is needed.
+ return_tensors (`bool`, *optional*, defaults to `False`):
+ Returns the tensors of predictions (as token indices) in the outputs. If set to
+ `True`, the decoded text is not returned.
+ return_text (`bool`, *optional*):
+ Returns the decoded texts in the outputs.
+ return_full_text (`bool`, *optional*, defaults to `True`):
+ If set to `False` only added text is returned, otherwise the full text is returned. Cannot be
+ specified at the same time as `return_text`.
+ continue_final_message (`bool`, *optional*):
+ This indicates that you want the model to continue the last message in the input chat rather than starting
+ a new one, allowing you to "prefill" its response. By default this is `True` when the final message in the
+ input chat has the `assistant` role and `False` otherwise, but you can manually override that behaviour by
+ setting this flag.
+
+ Return:
+ A list or a list of lists of `dict`: Each result comes as a dictionary with the following keys (it cannot
+ return a combination of both `generated_text` and `generated_token_ids`):
+
+ - **generated_text** (`str`, present when `return_text=True`) -- The generated text.
+ - **generated_token_ids** (`torch.Tensor`, present when `return_tensors=True`) -- The token
+ ids of the generated text.
+ - **input_text** (`str`) -- The input text.
+ """
+ if images is None and text is None:
+ raise ValueError("You must at least provide either text or images.")
+ if images is not None and text is None and not valid_images(images):
+ """
+ Supports the following formats:
+ - {"image": image, "text": text}
+ - [{"image": image, "text": text}]
+ - Generator and datasets
+ This is a common pattern in other multimodal pipelines, so we support it here as well.
+ """
+ return super().__call__(images, **kwargs)
+
+ if isinstance(text, (list, tuple, KeyDataset)) and isinstance(text[0], (list, tuple, dict)):
+ # We have one or more prompts in list-of-dicts format, so this is chat mode
+ if isinstance(text[0], dict):
+ return super().__call__(Chat(text, images), **kwargs)
+ else:
+ if images is None:
+ images = [None] * len(text)
+ chats = [Chat(chat, image) for chat, image in zip(text, images)] # 🐈 🐈 🐈
+ return super().__call__(chats, **kwargs)
+
+ # encourage the user to use the chat format if supported
+ if getattr(self.processor, "chat_template", None) is not None:
+ logger.warning_once(
+ "The input data was not formatted as a chat with dicts containing 'role' and 'content' keys, even though this model supports chat. "
+ "Consider using the chat format for better results. For more information, see https://huggingface.co/docs/transformers/en/chat_templating"
+ )
+
+ # support text only generation
+ if images is None:
+ return super().__call__(text, **kwargs)
+ if text is None:
+ raise ValueError("You must provide text for this pipeline.")
+
+ return super().__call__({"images": images, "text": text}, **kwargs)
+
+ def preprocess(self, inputs=None, timeout=None, continue_final_message=None, processing_kwargs=None):
+ # In case we only have text inputs
+ if isinstance(inputs, (list, tuple, str)):
+ images = None
+ text = inputs
+ inputs_text = inputs
+ else:
+ if isinstance(inputs, Chat):
+ # If the user passes a chat that ends in an assistant message, we treat it as a prefill by default
+ # because very few models support multiple separate, consecutive assistant messages
+ if continue_final_message is None:
+ continue_final_message = inputs.messages[-1]["role"] == "assistant"
+ text = self.processor.apply_chat_template(
+ inputs.messages,
+ add_generation_prompt=not continue_final_message,
+ continue_final_message=continue_final_message,
+ return_tensors=self.framework,
+ )
+ inputs_text = inputs
+ images = inputs.images
+ else:
+ text = inputs["text"]
+ inputs_text = inputs["text"]
+ images = inputs["images"]
+
+ images = load_images(images)
+
+ # if batched text inputs, we set padding to True unless specified otherwise
+ if isinstance(text, (list, tuple)) and len(text) > 1:
+ processing_kwargs.setdefault("padding", True)
+ model_inputs = self.processor(
+ images=images, text=text, return_tensors=self.framework, legacy=False, **processing_kwargs
+ ).to(dtype=self.torch_dtype)
+
+ model_inputs["text"] = inputs_text
+
+ return model_inputs
+
+ def _forward(self, model_inputs, generate_kwargs=None):
+ generate_kwargs = {} if generate_kwargs is None else generate_kwargs
+ prompt_text = model_inputs.pop("text")
+ input_ids = (
+ model_inputs["input_ids"] if "input_ids" in model_inputs else model_inputs["decoder_input_ids"]
+ ) # for decoder-only models
+ generated_sequence = self.model.generate(**model_inputs, **generate_kwargs)
+
+ return {"generated_sequence": generated_sequence, "prompt_text": prompt_text, "input_ids": input_ids}
+
+ def postprocess(self, model_outputs, return_type=ReturnType.FULL_TEXT, continue_final_message=None):
+ input_texts = model_outputs["prompt_text"]
+ input_texts = [input_texts] if isinstance(input_texts, (str, Chat)) else input_texts
+ generated_sequence = model_outputs["generated_sequence"]
+ input_ids = model_outputs["input_ids"]
+ if return_type == ReturnType.TENSORS:
+ return [
+ {"input_text": input_texts[i], "generated_token_ids": generated_sequence[i]}
+ for i in range(len(input_texts))
+ ]
+
+ # Decode inputs and outputs the same way to remove input text from generated text if present
+ generated_texts = self.processor.post_process_image_text_to_text(generated_sequence)
+ decoded_inputs = self.processor.post_process_image_text_to_text(input_ids)
+
+ # Force consistent behavior for including the input text in the output
+ if return_type in {ReturnType.NEW_TEXT, ReturnType.FULL_TEXT}:
+ # Remove the input text from the generated text if the generated text starts with the input text
+ # (accounting for the possibility of a space between the input and generated text)
+ new_generated_texts = []
+ for text_generated, decoded_input in zip(generated_texts, decoded_inputs):
+ # There can be added characters before the input text, so we need to find the beginning of the input text in the generated text
+ index_input_text = text_generated.find(decoded_input)
+ # Limit the search to 2 residual characters, like spaces or new lines, to avoid removing a large part of the answer
+ if 0 <= index_input_text <= 2:
+ # If the input text is found, we remove it
+ new_generated_texts.append(text_generated[index_input_text + len(decoded_input) :])
+ else:
+ new_generated_texts.append(text_generated)
+ generated_texts = new_generated_texts
+ if return_type == ReturnType.FULL_TEXT:
+ full_texts = []
+ for prompt_text, generated_text in zip(input_texts, generated_texts):
+ if isinstance(prompt_text, str):
+ generated_text = prompt_text + generated_text
+ elif isinstance(prompt_text, Chat):
+ if continue_final_message is None:
+ # If the user passes a chat ending in an assistant message, we treat it as a prefill by
+ # default because very few models support multiple separate, consecutive assistant messages
+ continue_final_message = prompt_text.messages[-1]["role"] == "assistant"
+ if continue_final_message:
+ # With assistant prefill, concat onto the end of the last message
+ new_text = dict(prompt_text.messages[-1]["content"][-1].items())
+ new_text["text"] += generated_text
+ generated_text = list(prompt_text.messages)[:-1] + [
+ {
+ "role": prompt_text.messages[-1]["role"],
+ "content": prompt_text.messages[-1]["content"][:-1] + [new_text],
+ }
+ ]
+ else:
+ # When we're not starting from a prefill, the output is a new assistant message
+ generated_text = list(prompt_text.messages) + [
+ {"role": "assistant", "content": generated_text}
+ ]
+ full_texts.append(generated_text)
+ generated_texts = full_texts
+
+ records = [
+ {
+ "input_text": input_text.messages if isinstance(input_text, Chat) else input_text,
+ "generated_text": generated_text,
+ }
+ for input_text, generated_text in zip(input_texts, generated_texts)
+ ]
+
+ return records
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/image_to_image.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/image_to_image.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb66359a4dddea48519f2de2dc69e86cd4ac5645
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/image_to_image.py
@@ -0,0 +1,136 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import List, Union
+
+import numpy as np
+
+from ..utils import (
+ add_end_docstrings,
+ is_torch_available,
+ is_vision_available,
+ logging,
+ requires_backends,
+)
+from .base import Pipeline, build_pipeline_init_args
+
+
+if is_vision_available():
+ from PIL import Image
+
+ from ..image_utils import load_image
+
+if is_torch_available():
+ from ..models.auto.modeling_auto import MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES
+
+logger = logging.get_logger(__name__)
+
+
+@add_end_docstrings(build_pipeline_init_args(has_image_processor=True))
+class ImageToImagePipeline(Pipeline):
+ """
+ Image to Image pipeline using any `AutoModelForImageToImage`. This pipeline generates an image based on a previous
+ image input.
+
+ Example:
+
+ ```python
+ >>> from PIL import Image
+ >>> import requests
+
+ >>> from transformers import pipeline
+
+ >>> upscaler = pipeline("image-to-image", model="caidas/swin2SR-classical-sr-x2-64")
+ >>> img = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
+ >>> img = img.resize((64, 64))
+ >>> upscaled_img = upscaler(img)
+ >>> img.size
+ (64, 64)
+
+ >>> upscaled_img.size
+ (144, 144)
+ ```
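+
+ A small sketch (not from the original docstring): a list of images is processed one by one and a list of `PIL.Image` results is returned.
+
+ ```python
+ >>> upscaled = upscaler([img, img])  # upscaled is expected to be a list of two PIL.Image results
+ ```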
+
+ This image to image pipeline can currently be loaded from [`pipeline`] using the following task identifier:
+ `"image-to-image"`.
+
+ See the list of available models on [huggingface.co/models](https://huggingface.co/models?filter=image-to-image).
+ """
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ requires_backends(self, "vision")
+ self.check_model_type(MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES)
+
+ def _sanitize_parameters(self, **kwargs):
+ preprocess_params = {}
+ postprocess_params = {}
+ forward_params = {}
+
+ if "timeout" in kwargs:
+ preprocess_params["timeout"] = kwargs["timeout"]
+ if "head_mask" in kwargs:
+ forward_params["head_mask"] = kwargs["head_mask"]
+
+ return preprocess_params, forward_params, postprocess_params
+
+ def __call__(
+ self, images: Union[str, List[str], "Image.Image", List["Image.Image"]], **kwargs
+ ) -> Union["Image.Image", List["Image.Image"]]:
+ """
+ Transform the image(s) passed as inputs.
+
+ Args:
+ images (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`):
+ The pipeline handles three types of images:
+
+ - A string containing an HTTP(S) link pointing to an image
+ - A string containing a local path to an image
+ - An image loaded in PIL directly
+
+ The pipeline accepts either a single image or a batch of images. Images in a batch must all be in the same
+ format: all as HTTP(S) links, all as local paths, or all as PIL images.
+ timeout (`float`, *optional*, defaults to None):
+ The maximum time in seconds to wait for fetching images from the web. If None, no timeout is used and
+ the call may block forever.
+
+ Return:
+ An image (`Image.Image`) or a list of images (`List[Image.Image]`) containing the result(s). If the input
+ is a single image, the return value is a single image; if the input is a list of several images, a list of
+ transformed images is returned.
+ """
+ return super().__call__(images, **kwargs)
+
+ def _forward(self, model_inputs):
+ model_outputs = self.model(**model_inputs)
+ return model_outputs
+
+ def preprocess(self, image, timeout=None):
+ image = load_image(image, timeout=timeout)
+ inputs = self.image_processor(images=[image], return_tensors="pt")
+ if self.framework == "pt":
+ inputs = inputs.to(self.torch_dtype)
+ return inputs
+
+ def postprocess(self, model_outputs):
+ images = []
+ if "reconstruction" in model_outputs.keys():
+ outputs = model_outputs.reconstruction
+ for output in outputs:
+ output = output.data.squeeze().float().cpu().clamp_(0, 1).numpy()
+ output = np.moveaxis(output, source=0, destination=-1)
+ output = (output * 255.0).round().astype(np.uint8) # float32 to uint8
+ images.append(Image.fromarray(output))
+
+ return images if len(images) > 1 else images[0]
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/image_to_text.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/image_to_text.py
new file mode 100644
index 0000000000000000000000000000000000000000..32a3ec218dac305f93d8e41959200a78c590c8df
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/image_to_text.py
@@ -0,0 +1,222 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import List, Union
+
+from ..utils import (
+ add_end_docstrings,
+ is_tf_available,
+ is_torch_available,
+ is_vision_available,
+ logging,
+ requires_backends,
+)
+from .base import Pipeline, build_pipeline_init_args
+
+
+if is_vision_available():
+ from PIL import Image
+
+ from ..image_utils import load_image
+
+if is_tf_available():
+ from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES
+
+if is_torch_available():
+ import torch
+
+ from ..models.auto.modeling_auto import MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES
+
+logger = logging.get_logger(__name__)
+
+
+@add_end_docstrings(build_pipeline_init_args(has_tokenizer=True, has_image_processor=True))
+class ImageToTextPipeline(Pipeline):
+ """
+ Image To Text pipeline using an `AutoModelForVision2Seq`. This pipeline predicts a caption for a given image.
+
+ Example:
+
+ ```python
+ >>> from transformers import pipeline
+
+ >>> captioner = pipeline(model="ydshieh/vit-gpt2-coco-en")
+ >>> captioner("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png")
+ [{'generated_text': 'two birds are standing next to each other '}]
+ ```
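+
+ A hedged sketch (not part of the original docstring), reusing the same `captioner`, showing how generation can be controlled:
+
+ ```python
+ >>> captioner(
+ ...     "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png",
+ ...     max_new_tokens=10,
+ ...     generate_kwargs={"do_sample": False},
+ ... )
+ ```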
+
+ Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)
+
+ This image to text pipeline can currently be loaded from pipeline() using the following task identifier:
+ "image-to-text".
+
+ See the list of available models on
+ [huggingface.co/models](https://huggingface.co/models?pipeline_tag=image-to-text).
+ """
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ requires_backends(self, "vision")
+ self.check_model_type(
+ TF_MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES if self.framework == "tf" else MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES
+ )
+
+ def _sanitize_parameters(self, max_new_tokens=None, generate_kwargs=None, prompt=None, timeout=None):
+ forward_params = {}
+ preprocess_params = {}
+
+ if prompt is not None:
+ preprocess_params["prompt"] = prompt
+ if timeout is not None:
+ preprocess_params["timeout"] = timeout
+
+ if max_new_tokens is not None:
+ forward_params["max_new_tokens"] = max_new_tokens
+ if generate_kwargs is not None:
+ if max_new_tokens is not None and "max_new_tokens" in generate_kwargs:
+ raise ValueError(
+ "`max_new_tokens` is defined both as an argument and inside `generate_kwargs` argument, please use"
+ " only 1 version"
+ )
+ forward_params.update(generate_kwargs)
+
+ if self.assistant_model is not None:
+ forward_params["assistant_model"] = self.assistant_model
+ if self.assistant_tokenizer is not None:
+ forward_params["tokenizer"] = self.tokenizer
+ forward_params["assistant_tokenizer"] = self.assistant_tokenizer
+
+ return preprocess_params, forward_params, {}
+
+ def __call__(self, inputs: Union[str, List[str], "Image.Image", List["Image.Image"]] = None, **kwargs):
+ """
+ Generate a caption for the image(s) passed as inputs.
+
+ Args:
+ inputs (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`):
+ The pipeline handles three types of images:
+
+ - A string containing an HTTP(S) link pointing to an image
+ - A string containing a local path to an image
+ - An image loaded in PIL directly
+
+ The pipeline accepts either a single image or a batch of images.
+
+ max_new_tokens (`int`, *optional*):
+ The maximum number of tokens to generate. By default, the `generate` default is used.
+
+ generate_kwargs (`Dict`, *optional*):
+ A dictionary of keyword arguments passed directly to `generate`, allowing full control of that function.
+
+ timeout (`float`, *optional*, defaults to None):
+ The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
+ the call may block forever.
+
+ Return:
+ A list or a list of list of `dict`: Each result comes as a dictionary with the following key:
+
+ - **generated_text** (`str`) -- The generated text.
+ """
+ # After this deprecation is completed, remove the default `None` value for `inputs`
+ if "images" in kwargs:
+ inputs = kwargs.pop("images")
+ if inputs is None:
+ raise ValueError("Cannot call the image-to-text pipeline without an inputs argument!")
+ return super().__call__(inputs, **kwargs)
+
+ def preprocess(self, image, prompt=None, timeout=None):
+ image = load_image(image, timeout=timeout)
+
+ if prompt is not None:
+ logger.warning_once(
+ "Passing `prompt` to the `image-to-text` pipeline is deprecated and will be removed in version 4.48"
+ " of 🤗 Transformers. Use the `image-text-to-text` pipeline instead",
+ )
+ if not isinstance(prompt, str):
+ raise ValueError(
+ f"Received an invalid text input, got - {type(prompt)} - but expected a single string. "
+ "Note also that one single text can be provided for conditional image to text generation."
+ )
+
+ model_type = self.model.config.model_type
+
+ if model_type == "git":
+ model_inputs = self.image_processor(images=image, return_tensors=self.framework)
+ if self.framework == "pt":
+ model_inputs = model_inputs.to(self.torch_dtype)
+ input_ids = self.tokenizer(text=prompt, add_special_tokens=False).input_ids
+ input_ids = [self.tokenizer.cls_token_id] + input_ids
+ input_ids = torch.tensor(input_ids).unsqueeze(0)
+ model_inputs.update({"input_ids": input_ids})
+
+ elif model_type == "pix2struct":
+ model_inputs = self.image_processor(images=image, header_text=prompt, return_tensors=self.framework)
+ if self.framework == "pt":
+ model_inputs = model_inputs.to(self.torch_dtype)
+
+ elif model_type != "vision-encoder-decoder":
+ # vision-encoder-decoder does not support conditional generation
+ model_inputs = self.image_processor(images=image, return_tensors=self.framework)
+ if self.framework == "pt":
+ model_inputs = model_inputs.to(self.torch_dtype)
+ text_inputs = self.tokenizer(prompt, return_tensors=self.framework)
+ model_inputs.update(text_inputs)
+
+ else:
+ raise ValueError(f"Model type {model_type} does not support conditional text generation")
+
+ else:
+ model_inputs = self.image_processor(images=image, return_tensors=self.framework)
+ if self.framework == "pt":
+ model_inputs = model_inputs.to(self.torch_dtype)
+
+ if self.model.config.model_type == "git" and prompt is None:
+ model_inputs["input_ids"] = None
+
+ return model_inputs
+
+ def _forward(self, model_inputs, **generate_kwargs):
+ # Git model sets `model_inputs["input_ids"] = None` in `preprocess` (when `prompt=None`). In batch mode, the
+ # pipeline will group them into a list of `None`, which fails `_forward`. Avoid this by checking it first.
+ if (
+ "input_ids" in model_inputs
+ and isinstance(model_inputs["input_ids"], list)
+ and all(x is None for x in model_inputs["input_ids"])
+ ):
+ model_inputs["input_ids"] = None
+
+ # User-defined `generation_config` passed to the pipeline call take precedence
+ if "generation_config" not in generate_kwargs:
+ generate_kwargs["generation_config"] = self.generation_config
+
+ # FIXME: We need to pop here due to a difference in how `generation.py` and `generation.tf_utils.py`
+ # parse inputs. In the Tensorflow version, `generate` raises an error if we don't use `input_ids` whereas
+ # the PyTorch version matches it with `self.model.main_input_name` or `self.model.encoder.main_input_name`
+ # in the `_prepare_model_inputs` method.
+ inputs = model_inputs.pop(self.model.main_input_name)
+ model_outputs = self.model.generate(inputs, **model_inputs, **generate_kwargs)
+ return model_outputs
+
+ def postprocess(self, model_outputs):
+ records = []
+ for output_ids in model_outputs:
+ record = {
+ "generated_text": self.tokenizer.decode(
+ output_ids,
+ skip_special_tokens=True,
+ )
+ }
+ records.append(record)
+ return records
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/mask_generation.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/mask_generation.py
new file mode 100644
index 0000000000000000000000000000000000000000..f87e45b7f8ecb410ba5d0a088188256d59290f0f
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/mask_generation.py
@@ -0,0 +1,287 @@
+from collections import defaultdict
+from typing import Optional
+
+from ..image_utils import load_image
+from ..utils import (
+ add_end_docstrings,
+ is_torch_available,
+ logging,
+ requires_backends,
+)
+from .base import ChunkPipeline, build_pipeline_init_args
+
+
+if is_torch_available():
+ import torch
+
+ from ..models.auto.modeling_auto import MODEL_FOR_MASK_GENERATION_MAPPING_NAMES
+
+logger = logging.get_logger(__name__)
+
+
+@add_end_docstrings(
+ build_pipeline_init_args(has_image_processor=True),
+ r"""
+ points_per_batch (`int`, *optional*, defaults to 64):
+ Sets the number of points run simultaneously by the model. Higher numbers may be faster but use more GPU
+ memory.
+ output_bboxes_mask (`bool`, *optional*, defaults to `False`):
+ Whether or not to output the bounding box predictions.
+ output_rle_masks (`bool`, *optional*, defaults to `False`):
+ Whether or not to output the masks in `RLE` format""",
+)
+class MaskGenerationPipeline(ChunkPipeline):
+ """
+ Automatic mask generation for images using `SamForMaskGeneration`. This pipeline predicts binary masks for a
+ given image. It is a `ChunkPipeline` because the points can be separated into mini-batches in order to
+ avoid OOM issues. Use the `points_per_batch` argument to control the number of points that will be processed
+ at the same time. Default is `64`.
+
+ The pipeline works in 3 steps:
+ 1. `preprocess`: A grid of 1024 evenly separated points is generated along with bounding boxes and point
+ labels. For more details on how the points and bounding boxes are created, check the `_generate_crop_boxes`
+ function. The image is also preprocessed using the `image_processor`. This function `yields` a minibatch of
+ `points_per_batch` points.
+
+ 2. `forward`: feeds the outputs of `preprocess` to the model. The image embedding is computed only once.
+ It calls `self.model.get_image_embeddings` and makes sure that the gradients are not computed, and that the
+ tensors and models are on the same device.
+
+ 3. `postprocess`: The most important part of the automatic mask generation happens here. Three steps
+ are performed:
+ - image_processor.postprocess_masks (run on each minibatch loop): takes in the raw output masks,
+ resizes them according to the image size, and transforms them to binary masks.
+ - image_processor.filter_masks (on each minibatch loop): uses both `pred_iou_thresh` and
+ `stability_scores`, and also applies a variety of filters based on non-maximum suppression to remove
+ bad masks.
+ - image_processor.postprocess_masks_for_amg: applies NMS on the masks to only keep relevant ones.
+
+ Example:
+
+ ```python
+ >>> from transformers import pipeline
+
+ >>> generator = pipeline(model="facebook/sam-vit-base", task="mask-generation")
+ >>> outputs = generator(
+ ... "http://images.cocodataset.org/val2017/000000039769.jpg",
+ ... )
+
+ >>> outputs = generator(
+ ... "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png", points_per_batch=128
+ ... )
+ ```
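+
+ A hedged sketch (not part of the original docstring) tightening the quality filters and requesting RLE masks:
+
+ ```python
+ >>> outputs = generator(
+ ...     "http://images.cocodataset.org/val2017/000000039769.jpg",
+ ...     points_per_batch=64,
+ ...     pred_iou_thresh=0.9,
+ ...     stability_score_thresh=0.97,
+ ...     output_rle_mask=True,
+ ... )
+ >>> # `outputs["masks"]`, `outputs["scores"]` and, here, `outputs["rle_mask"]` hold the filtered predictions.
+ ```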
+
+ Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)
+
+ This segmentation pipeline can currently be loaded from [`pipeline`] using the following task identifier:
+ `"mask-generation"`.
+
+ See the list of available models on [huggingface.co/models](https://huggingface.co/models?filter=mask-generation).
+ """
+
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+ requires_backends(self, "vision")
+ requires_backends(self, "torch")
+
+ if self.framework != "pt":
+ raise ValueError(f"The {self.__class__} is only available in PyTorch.")
+
+ self.check_model_type(MODEL_FOR_MASK_GENERATION_MAPPING_NAMES)
+
+ def _sanitize_parameters(self, **kwargs):
+ preprocess_kwargs = {}
+ postprocess_kwargs = {}
+ forward_params = {}
+ # preprocess args
+ if "points_per_batch" in kwargs:
+ preprocess_kwargs["points_per_batch"] = kwargs["points_per_batch"]
+ if "points_per_crop" in kwargs:
+ preprocess_kwargs["points_per_crop"] = kwargs["points_per_crop"]
+ if "crops_n_layers" in kwargs:
+ preprocess_kwargs["crops_n_layers"] = kwargs["crops_n_layers"]
+ if "crop_overlap_ratio" in kwargs:
+ preprocess_kwargs["crop_overlap_ratio"] = kwargs["crop_overlap_ratio"]
+ if "crop_n_points_downscale_factor" in kwargs:
+ preprocess_kwargs["crop_n_points_downscale_factor"] = kwargs["crop_n_points_downscale_factor"]
+ if "timeout" in kwargs:
+ preprocess_kwargs["timeout"] = kwargs["timeout"]
+ # postprocess args
+ if "pred_iou_thresh" in kwargs:
+ forward_params["pred_iou_thresh"] = kwargs["pred_iou_thresh"]
+ if "stability_score_offset" in kwargs:
+ forward_params["stability_score_offset"] = kwargs["stability_score_offset"]
+ if "mask_threshold" in kwargs:
+ forward_params["mask_threshold"] = kwargs["mask_threshold"]
+ if "stability_score_thresh" in kwargs:
+ forward_params["stability_score_thresh"] = kwargs["stability_score_thresh"]
+ if "crops_nms_thresh" in kwargs:
+ postprocess_kwargs["crops_nms_thresh"] = kwargs["crops_nms_thresh"]
+ if "output_rle_mask" in kwargs:
+ postprocess_kwargs["output_rle_mask"] = kwargs["output_rle_mask"]
+ if "output_bboxes_mask" in kwargs:
+ postprocess_kwargs["output_bboxes_mask"] = kwargs["output_bboxes_mask"]
+ return preprocess_kwargs, forward_params, postprocess_kwargs
+
+ def __call__(self, image, *args, num_workers=None, batch_size=None, **kwargs):
+ """
+ Generates binary segmentation masks
+
+ Args:
+ inputs (`np.ndarray` or `bytes` or `str` or `dict`):
+ Image or list of images.
+ mask_threshold (`float`, *optional*, defaults to 0.0):
+ Threshold to use when turning the predicted masks into binary values.
+ pred_iou_thresh (`float`, *optional*, defaults to 0.88):
+ A filtering threshold in `[0,1]` applied on the model's predicted mask quality.
+ stability_score_thresh (`float`, *optional*, defaults to 0.95):
+ A filtering threshold in `[0,1]`, using the stability of the mask under changes to the cutoff used to
+ binarize the model's mask predictions.
+ stability_score_offset (`int`, *optional*, defaults to 1):
+ The amount to shift the cutoff when calculating the stability score.
+ crops_nms_thresh (`float`, *optional*, defaults to 0.7):
+ The box IoU cutoff used by non-maximal suppression to filter duplicate masks.
+ crops_n_layers (`int`, *optional*, defaults to 0):
+ If `crops_n_layers>0`, mask prediction will be run again on crops of the image. Sets the number of
+ layers to run, where each layer has 2**i_layer number of image crops.
+ crop_overlap_ratio (`float`, *optional*, defaults to `512 / 1500`):
+ Sets the degree to which crops overlap. In the first crop layer, crops will overlap by this fraction of
+ the image length. Later layers with more crops scale down this overlap.
+ crop_n_points_downscale_factor (`int`, *optional*, defaults to `1`):
+ The number of points-per-side sampled in layer n is scaled down by crop_n_points_downscale_factor**n.
+ timeout (`float`, *optional*, defaults to None):
+ The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
+ the call may block forever.
+
+ Return:
+ `Dict`: A dictionary with the following keys:
+ - **mask** (`PIL.Image`) -- A binary mask of the detected object as a PIL Image of shape `(width,
+ height)` of the original image. Returns a mask filled with zeros if no object is found.
+ - **score** (*optional* `float`) -- The score attributed by the model to the mask, when the model is
+ capable of estimating a confidence for the "object" described by the mask.
+
+ """
+ return super().__call__(image, *args, num_workers=num_workers, batch_size=batch_size, **kwargs)
+
+ def preprocess(
+ self,
+ image,
+ points_per_batch=64,
+ crops_n_layers: int = 0,
+ crop_overlap_ratio: float = 512 / 1500,
+ points_per_crop: Optional[int] = 32,
+ crop_n_points_downscale_factor: Optional[int] = 1,
+ timeout: Optional[float] = None,
+ ):
+ image = load_image(image, timeout=timeout)
+ target_size = self.image_processor.size["longest_edge"]
+ crop_boxes, grid_points, cropped_images, input_labels = self.image_processor.generate_crop_boxes(
+ image, target_size, crops_n_layers, crop_overlap_ratio, points_per_crop, crop_n_points_downscale_factor
+ )
+ model_inputs = self.image_processor(images=cropped_images, return_tensors="pt")
+ if self.framework == "pt":
+ model_inputs = model_inputs.to(self.torch_dtype)
+
+ with self.device_placement():
+ if self.framework == "pt":
+ inference_context = self.get_inference_context()
+ with inference_context():
+ model_inputs = self._ensure_tensor_on_device(model_inputs, device=self.device)
+ image_embeddings = self.model.get_image_embeddings(model_inputs.pop("pixel_values"))
+ model_inputs["image_embeddings"] = image_embeddings
+
+ n_points = grid_points.shape[1]
+ points_per_batch = points_per_batch if points_per_batch is not None else n_points
+
+ if points_per_batch <= 0:
+ raise ValueError(
+ "Cannot have points_per_batch<=0. Must be >=1 to returned batched outputs. "
+ "To return all points at once, set points_per_batch to None"
+ )
+
+ for i in range(0, n_points, points_per_batch):
+ batched_points = grid_points[:, i : i + points_per_batch, :, :]
+ labels = input_labels[:, i : i + points_per_batch]
+ is_last = i == n_points - points_per_batch
+ yield {
+ "input_points": batched_points,
+ "input_labels": labels,
+ "input_boxes": crop_boxes,
+ "is_last": is_last,
+ **model_inputs,
+ }
+
+ def _forward(
+ self,
+ model_inputs,
+ pred_iou_thresh=0.88,
+ stability_score_thresh=0.95,
+ mask_threshold=0,
+ stability_score_offset=1,
+ ):
+ input_boxes = model_inputs.pop("input_boxes")
+ is_last = model_inputs.pop("is_last")
+ original_sizes = model_inputs.pop("original_sizes").tolist()
+ reshaped_input_sizes = model_inputs.pop("reshaped_input_sizes").tolist()
+
+ model_outputs = self.model(**model_inputs)
+
+ # post processing happens here in order to avoid CPU GPU copies of ALL the masks
+ low_resolution_masks = model_outputs["pred_masks"]
+ masks = self.image_processor.post_process_masks(
+ low_resolution_masks, original_sizes, reshaped_input_sizes, mask_threshold, binarize=False
+ )
+ iou_scores = model_outputs["iou_scores"]
+ masks, iou_scores, boxes = self.image_processor.filter_masks(
+ masks[0],
+ iou_scores[0],
+ original_sizes[0],
+ input_boxes[0],
+ pred_iou_thresh,
+ stability_score_thresh,
+ mask_threshold,
+ stability_score_offset,
+ )
+ return {
+ "masks": masks,
+ "is_last": is_last,
+ "boxes": boxes,
+ "iou_scores": iou_scores,
+ }
+
+ def postprocess(
+ self,
+ model_outputs,
+ output_rle_mask=False,
+ output_bboxes_mask=False,
+ crops_nms_thresh=0.7,
+ ):
+ all_scores = []
+ all_masks = []
+ all_boxes = []
+ for model_output in model_outputs:
+ all_scores.append(model_output.pop("iou_scores"))
+ all_masks.extend(model_output.pop("masks"))
+ all_boxes.append(model_output.pop("boxes"))
+
+ all_scores = torch.cat(all_scores)
+ all_boxes = torch.cat(all_boxes)
+ output_masks, iou_scores, rle_mask, bounding_boxes = self.image_processor.post_process_for_mask_generation(
+ all_masks, all_scores, all_boxes, crops_nms_thresh
+ )
+
+ extra = defaultdict(list)
+ for output in model_outputs:
+ for k, v in output.items():
+ extra[k].append(v)
+
+ optional = {}
+ if output_rle_mask:
+ optional["rle_mask"] = rle_mask
+
+ if output_bboxes_mask:
+ optional["bounding_boxes"] = bounding_boxes
+
+ return {"masks": output_masks, "scores": iou_scores, **optional, **extra}
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/object_detection.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/object_detection.py
new file mode 100644
index 0000000000000000000000000000000000000000..c84f17b2bd6ad0ac2bbbe95a3421e7197a5744c6
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/object_detection.py
@@ -0,0 +1,191 @@
+from typing import Any, Dict, List, Union
+
+from ..utils import add_end_docstrings, is_torch_available, is_vision_available, logging, requires_backends
+from .base import Pipeline, build_pipeline_init_args
+
+
+if is_vision_available():
+ from ..image_utils import load_image
+
+
+if is_torch_available():
+ import torch
+
+ from ..models.auto.modeling_auto import (
+ MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES,
+ MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES,
+ )
+
+logger = logging.get_logger(__name__)
+
+
+Prediction = Dict[str, Any]
+Predictions = List[Prediction]
+
+
+@add_end_docstrings(build_pipeline_init_args(has_image_processor=True))
+class ObjectDetectionPipeline(Pipeline):
+ """
+ Object detection pipeline using any `AutoModelForObjectDetection`. This pipeline predicts bounding boxes of objects
+ and their classes.
+
+ Example:
+
+ ```python
+ >>> from transformers import pipeline
+
+ >>> detector = pipeline(model="facebook/detr-resnet-50")
+ >>> detector("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png")
+ [{'score': 0.997, 'label': 'bird', 'box': {'xmin': 69, 'ymin': 171, 'xmax': 396, 'ymax': 507}}, {'score': 0.999, 'label': 'bird', 'box': {'xmin': 398, 'ymin': 105, 'xmax': 767, 'ymax': 507}}]
+
+ >>> # x, y are expressed relative to the top left hand corner.
+ ```
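+
+ A hedged sketch (not from the original docstring): raise the score threshold to keep only very confident detections.
+
+ ```python
+ >>> detector(
+ ...     "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png",
+ ...     threshold=0.99,
+ ... )
+ ```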
+
+ Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)
+
+ This object detection pipeline can currently be loaded from [`pipeline`] using the following task identifier:
+ `"object-detection"`.
+
+ See the list of available models on [huggingface.co/models](https://huggingface.co/models?filter=object-detection).
+ """
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+
+ if self.framework == "tf":
+ raise ValueError(f"The {self.__class__} is only available in PyTorch.")
+
+ requires_backends(self, "vision")
+ mapping = MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES.copy()
+ mapping.update(MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES)
+ self.check_model_type(mapping)
+
+ def _sanitize_parameters(self, **kwargs):
+ preprocess_params = {}
+ if "timeout" in kwargs:
+ preprocess_params["timeout"] = kwargs["timeout"]
+ postprocess_kwargs = {}
+ if "threshold" in kwargs:
+ postprocess_kwargs["threshold"] = kwargs["threshold"]
+ return preprocess_params, {}, postprocess_kwargs
+
+ def __call__(self, *args, **kwargs) -> Union[Predictions, List[Prediction]]:
+ """
+ Detect objects (bounding boxes & classes) in the image(s) passed as inputs.
+
+ Args:
+ inputs (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`):
+ The pipeline handles three types of images:
+
+ - A string containing an HTTP(S) link pointing to an image
+ - A string containing a local path to an image
+ - An image loaded in PIL directly
+
+ The pipeline accepts either a single image or a batch of images. Images in a batch must all be in the
+ same format: all as HTTP(S) links, all as local paths, or all as PIL images.
+ threshold (`float`, *optional*, defaults to 0.5):
+ The minimum probability required for a detection to be kept.
+ timeout (`float`, *optional*, defaults to None):
+ The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
+ the call may block forever.
+
+ Return:
+ A list of dictionaries or a list of lists of dictionaries containing the result. If the input is a single
+ image, a list of dictionaries is returned; if the input is a list of several images, a list of lists of
+ dictionaries is returned, one list per image.
+
+ The dictionaries contain the following keys:
+
+ - **label** (`str`) -- The class label identified by the model.
+ - **score** (`float`) -- The score attributed by the model for that label.
+ - **box** (`Dict[str, int]`) -- The bounding box of the detected object, in the image's original size.
+ """
+ # After this deprecation is completed, remove support for the legacy `images` kwarg
+ if "images" in kwargs and "inputs" not in kwargs:
+ kwargs["inputs"] = kwargs.pop("images")
+ return super().__call__(*args, **kwargs)
+
+ def preprocess(self, image, timeout=None):
+ image = load_image(image, timeout=timeout)
+ target_size = torch.IntTensor([[image.height, image.width]])
+ inputs = self.image_processor(images=[image], return_tensors="pt")
+ if self.framework == "pt":
+ inputs = inputs.to(self.torch_dtype)
+ if self.tokenizer is not None:
+ inputs = self.tokenizer(text=inputs["words"], boxes=inputs["boxes"], return_tensors="pt")
+ inputs["target_size"] = target_size
+ return inputs
+
+ def _forward(self, model_inputs):
+ target_size = model_inputs.pop("target_size")
+ outputs = self.model(**model_inputs)
+ model_outputs = outputs.__class__({"target_size": target_size, **outputs})
+ if self.tokenizer is not None:
+ model_outputs["bbox"] = model_inputs["bbox"]
+ return model_outputs
+
+ def postprocess(self, model_outputs, threshold=0.5):
+ target_size = model_outputs["target_size"]
+ if self.tokenizer is not None:
+ # This is a LayoutLMForTokenClassification variant.
+ # The OCR got the boxes and the model classified the words.
+ height, width = target_size[0].tolist()
+
+ def unnormalize(bbox):
+ return self._get_bounding_box(
+ torch.Tensor(
+ [
+ (width * bbox[0] / 1000),
+ (height * bbox[1] / 1000),
+ (width * bbox[2] / 1000),
+ (height * bbox[3] / 1000),
+ ]
+ )
+ )
+
+ scores, classes = model_outputs["logits"].squeeze(0).softmax(dim=-1).max(dim=-1)
+ labels = [self.model.config.id2label[prediction] for prediction in classes.tolist()]
+ boxes = [unnormalize(bbox) for bbox in model_outputs["bbox"].squeeze(0)]
+ keys = ["score", "label", "box"]
+ annotation = [dict(zip(keys, vals)) for vals in zip(scores.tolist(), labels, boxes) if vals[0] > threshold]
+ else:
+ # This is a regular ForObjectDetectionModel
+ raw_annotations = self.image_processor.post_process_object_detection(model_outputs, threshold, target_size)
+ raw_annotation = raw_annotations[0]
+ scores = raw_annotation["scores"]
+ labels = raw_annotation["labels"]
+ boxes = raw_annotation["boxes"]
+
+ raw_annotation["scores"] = scores.tolist()
+ raw_annotation["labels"] = [self.model.config.id2label[label.item()] for label in labels]
+ raw_annotation["boxes"] = [self._get_bounding_box(box) for box in boxes]
+
+ # {"scores": [...], ...} --> [{"score":x, ...}, ...]
+ keys = ["score", "label", "box"]
+ annotation = [
+ dict(zip(keys, vals))
+ for vals in zip(raw_annotation["scores"], raw_annotation["labels"], raw_annotation["boxes"])
+ ]
+
+ return annotation
+
+ def _get_bounding_box(self, box: "torch.Tensor") -> Dict[str, int]:
+ """
+        Turns list [xmin, ymin, xmax, ymax] into dict { "xmin": xmin, ... }
+
+ Args:
+ box (`torch.Tensor`): Tensor containing the coordinates in corners format.
+
+ Returns:
+ bbox (`Dict[str, int]`): Dict containing the coordinates in corners format.
+ """
+ if self.framework != "pt":
+ raise ValueError("The ObjectDetectionPipeline is only available in PyTorch.")
+ xmin, ymin, xmax, ymax = box.int().tolist()
+ bbox = {
+ "xmin": xmin,
+ "ymin": ymin,
+ "xmax": xmax,
+ "ymax": ymax,
+ }
+ return bbox
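+
+
+def _object_detection_usage_sketch():
+    # Minimal usage sketch (illustration only, not part of the upstream module). The
+    # checkpoint and image URL below are assumptions: any object-detection checkpoint
+    # and any reachable image can be substituted.
+    from transformers import pipeline
+
+    detector = pipeline("object-detection", model="facebook/detr-resnet-50")
+    image_url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+    # `threshold` drops low-confidence boxes, `timeout` bounds the image download.
+    return detector(image_url, threshold=0.9, timeout=30.0)
+    # -> [{'score': ..., 'label': ..., 'box': {'xmin': ..., 'ymin': ..., 'xmax': ..., 'ymax': ...}}, ...]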
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/pt_utils.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/pt_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..19663437cd691efb265770ae007871cafe1275ed
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/pt_utils.py
@@ -0,0 +1,321 @@
+import numpy as np
+import torch
+from torch.utils.data import Dataset, IterableDataset
+
+from ..utils.generic import ModelOutput
+
+
+class PipelineDataset(Dataset):
+ def __init__(self, dataset, process, params):
+ self.dataset = dataset
+ self.process = process
+ self.params = params
+
+ def __len__(self):
+ return len(self.dataset)
+
+ def __getitem__(self, i):
+ item = self.dataset[i]
+ processed = self.process(item, **self.params)
+ return processed
+
+
+class PipelineIterator(IterableDataset):
+ def __init__(self, loader, infer, params, loader_batch_size=None):
+ """
+ Roughly equivalent to
+
+ ```
+ for item in loader:
+ yield infer(item, **params)
+ ```
+
+ Arguments:
+ loader (`torch.utils.data.DataLoader` or `Iterable`):
+ The iterator that will be used to apply `infer` on.
+ infer (any function):
+                The function to apply to each element of `loader`.
+            params (`dict`):
+                The parameters passed to `infer` along with every item.
+ loader_batch_size (`int`, *optional*):
+                If specified, the items of `loader` are expected to come batched, and are unrolled into individual
+                items here, making it roughly behave as
+
+
+ ```
+ for items in loader:
+                for i in range(loader_batch_size):
+ item = items[i]
+ yield infer(item, **params)
+ ```"""
+ self.loader = loader
+ self.infer = infer
+ self.params = params
+ if loader_batch_size == 1:
+ # Let's spare some time by deactivating altogether
+ loader_batch_size = None
+ self.loader_batch_size = loader_batch_size
+
+ # Internal bookkeeping
+ self._loader_batch_index = None
+ self._loader_batch_data = None
+
+ def __len__(self):
+ return len(self.loader)
+
+ def __iter__(self):
+ self.iterator = iter(self.loader)
+ return self
+
+ def loader_batch_item(self):
+ """
+ Return item located at `loader_batch_index` within the current `loader_batch_data`.
+ """
+ if isinstance(self._loader_batch_data, torch.Tensor):
+ # Batch data is simple tensor, just fetch the slice
+ result = self._loader_batch_data[self._loader_batch_index].unsqueeze(0)
+ else:
+ # Batch data is assumed to be BaseModelOutput (or dict)
+ loader_batched = {}
+ for k, element in self._loader_batch_data.items():
+ if isinstance(element, ModelOutput):
+ # Convert ModelOutput to tuple first
+ element = element.to_tuple()
+ if isinstance(element[0], torch.Tensor):
+ loader_batched[k] = tuple(el[self._loader_batch_index].unsqueeze(0) for el in element)
+ elif isinstance(element[0], np.ndarray):
+ loader_batched[k] = tuple(np.expand_dims(el[self._loader_batch_index], 0) for el in element)
+ continue
+ if k in {"hidden_states", "past_key_values", "attentions"} and isinstance(element, tuple):
+ # Those are stored as lists of tensors so need specific unbatching.
+ if isinstance(element[0], torch.Tensor):
+ loader_batched[k] = tuple(el[self._loader_batch_index].unsqueeze(0) for el in element)
+ elif isinstance(element[0], np.ndarray):
+ loader_batched[k] = tuple(np.expand_dims(el[self._loader_batch_index], 0) for el in element)
+ continue
+ if element is None:
+ # This can happen for optional data that get passed around
+ loader_batched[k] = None
+ elif isinstance(element[self._loader_batch_index], torch.Tensor):
+                    # Take the correct batch data, but make it look like batch_size=1
+                    # For compatibility with other methods within transformers
+
+ loader_batched[k] = element[self._loader_batch_index].unsqueeze(0)
+ elif isinstance(element[self._loader_batch_index], np.ndarray):
+                    # Take the correct batch data, but make it look like batch_size=1
+                    # For compatibility with other methods within transformers
+ loader_batched[k] = np.expand_dims(element[self._loader_batch_index], 0)
+ else:
+ # This is typically a list, so no need to `unsqueeze`.
+ loader_batched[k] = element[self._loader_batch_index]
+ # Recreate the element by reusing the original class to make it look
+ # batch_size=1
+ result = self._loader_batch_data.__class__(loader_batched)
+ self._loader_batch_index += 1
+ return result
+
+ def __next__(self):
+ if self._loader_batch_index is not None and self._loader_batch_index < self.loader_batch_size:
+ # We are currently unrolling a batch so we just need to return
+ # the current item within a batch
+ return self.loader_batch_item()
+
+ # We're out of items within a batch
+ item = next(self.iterator)
+ processed = self.infer(item, **self.params)
+ # We now have a batch of "inferred things".
+ if self.loader_batch_size is not None:
+ # Try to infer the size of the batch
+ if isinstance(processed, torch.Tensor):
+ first_tensor = processed
+ elif isinstance(processed, tuple):
+ first_tensor = processed[0]
+ else:
+ key = list(processed.keys())[0]
+ first_tensor = processed[key]
+
+ if isinstance(first_tensor, list):
+ observed_batch_size = len(first_tensor)
+ else:
+ observed_batch_size = first_tensor.shape[0]
+ if 0 < observed_batch_size < self.loader_batch_size:
+ # could be last batch so we can't unroll as many
+ # elements.
+ self.loader_batch_size = observed_batch_size
+ # Setting internal index to unwrap the batch
+ self._loader_batch_data = processed[0] if isinstance(processed, tuple) else processed
+ self._loader_batch_index = 0
+ return self.loader_batch_item()
+ else:
+ # We're not unrolling batches
+ return processed
+
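+
+def _pipeline_iterator_sketch():
+    # Minimal sketch (illustration only, not part of the upstream module): shows how
+    # `loader_batch_size` unrolls batched outputs back into per-item outputs. All
+    # names below are made up for the example.
+    loader = [{"input": torch.tensor([[1.0], [2.0]])}, {"input": torch.tensor([[3.0], [4.0]])}]
+
+    def fake_infer(item, scale=10.0):
+        # Stand-in "model" returning a batched dict of tensors.
+        return {"output": item["input"] * scale}
+
+    iterator = PipelineIterator(loader, fake_infer, {"scale": 10.0}, loader_batch_size=2)
+    # Two batches of size 2 are unrolled into four tensors of shape (1, 1).
+    return [out["output"] for out in iterator]
+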
+
+class PipelineChunkIterator(PipelineIterator):
+ def __init__(self, loader, infer, params, loader_batch_size=None):
+ """
+ Roughly equivalent to
+
+ ```
+ for iterator in loader:
+ for item in iterator:
+ yield infer(item, **params)
+ ```
+
+ Arguments:
+ loader (`torch.utils.data.DataLoader` or `Iterable`):
+ The iterator that will be used to apply `infer` on.
+ infer (any function):
+                The function to apply to each element of `loader`.
+            params (`dict`):
+                The parameters passed to `infer` along with every item.
+ """
+ super().__init__(loader, infer, params)
+
+ def __iter__(self):
+ self.iterator = iter(self.loader)
+ self.subiterator = None
+ return self
+
+ def __next__(self):
+ if self.subiterator is None:
+            # Subiterator being None means we haven't started a `preprocess` iterator yet, so start it
+ self.subiterator = self.infer(next(self.iterator), **self.params)
+ try:
+ # Try to return next item
+ processed = next(self.subiterator)
+ except StopIteration:
+            # When a preprocess iterator ends, we can start looking at the next item.
+            # ChunkIterator will keep feeding until ALL elements of the iterator
+            # have created their subiterator and have been iterated over.
+ #
+ # Another way to look at it, is we're basically flattening lists of lists
+ # into a single list, but with generators
+ self.subiterator = self.infer(next(self.iterator), **self.params)
+ processed = next(self.subiterator)
+ return processed
+
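+
+def _pipeline_chunk_iterator_sketch():
+    # Minimal sketch (illustration only, not part of the upstream module): the chunk
+    # iterator flattens a generator-of-generators, which is how `ChunkPipeline`
+    # spreads one input over several forward passes.
+    loader = ["ab", "cde"]
+
+    def fake_infer(item, prefix=""):
+        for char in item:
+            yield prefix + char
+
+    iterator = PipelineChunkIterator(loader, fake_infer, {"prefix": ">"})
+    return list(iterator)  # -> ['>a', '>b', '>c', '>d', '>e']
+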
+
+class PipelinePackIterator(PipelineIterator):
+ """
+ Roughly equivalent to
+
+ ```
+ packed = []
+ for item in loader:
+ packed.append(item)
+ if item["is_last"]:
+ yield packed
+ packed = []
+ ```
+
+    but it also handles cases where `item` is batched (meaning it's a dict of Tensors with first dimension > 1). In
+    that case it does
+
+ ```
+ packed = []
+ for batch in loader:
+ # item is batched
+ for item in batch:
+ packed.append(item)
+ if item["is_last"]:
+ yield packed
+ packed = []
+ ```
+
+ Arguments:
+ loader (`torch.utils.data.DataLoader` or `Iterable`):
+ The iterator that will be used to apply `infer` on.
+ infer (any function):
+            The function to apply to each element of `loader`.
+        params (`dict`):
+            The parameters passed to `infer` along with every item.
+ loader_batch_size (`int`, *optional*):
+            If specified, the items of `loader` are expected to come batched, and are unrolled into individual items
+            here, making it roughly behave as
+
+
+ ```
+ for items in loader:
+        for i in range(loader_batch_size):
+ item = items[i]
+ yield infer(item, **params)
+ ```"""
+
+ def __iter__(self):
+ self.iterator = iter(self.loader)
+ return self
+
+ def __next__(self):
+ # Extremely similar to PipelineIterator in its unpacking mechanism
+ # BUT, we have an extra required item which is the presence of `is_last`
+        # That is because everything is flattened by `PipelineChunkIterator`, and we
+        # need to keep track of how to regroup here in the original `process`
+        # boundaries so that `process` and `postprocess` see the same data.
+
+        # This iterator accumulates items (possibly while unbatching) until it
+        # hits an `is_last` and then just passes it on to the caller.
+ is_last = False
+ accumulator = []
+ if self._loader_batch_index is not None and self._loader_batch_index < self.loader_batch_size:
+ while self._loader_batch_index < self.loader_batch_size:
+ item = self.loader_batch_item()
+ is_last = item.pop("is_last")
+ accumulator.append(item)
+ if is_last:
+ return accumulator
+
+ while not is_last:
+ processed = self.infer(next(self.iterator), **self.params)
+ if self.loader_batch_size is not None:
+ if isinstance(processed, torch.Tensor):
+ first_tensor = processed
+ else:
+ key = list(processed.keys())[0]
+ first_tensor = processed[key]
+ if isinstance(first_tensor, list):
+ observed_batch_size = len(first_tensor)
+ else:
+ observed_batch_size = first_tensor.shape[0]
+ if 0 < observed_batch_size < self.loader_batch_size:
+ # could be last batch so we can't unroll as many
+ # elements.
+ self.loader_batch_size = observed_batch_size
+ self._loader_batch_data = processed
+ self._loader_batch_index = 0
+ while self._loader_batch_index < self.loader_batch_size:
+ item = self.loader_batch_item()
+ is_last = item.pop("is_last")
+ accumulator.append(item)
+ if is_last:
+ return accumulator
+ else:
+ item = processed
+ is_last = item.pop("is_last")
+ accumulator.append(item)
+ return accumulator
+
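+
+def _pipeline_pack_iterator_sketch():
+    # Minimal sketch (illustration only, not part of the upstream module): items are
+    # regrouped on the `is_last` marker so that one logical input yields one packed list.
+    loader = [
+        {"value": 1, "is_last": False},
+        {"value": 2, "is_last": True},
+        {"value": 3, "is_last": True},
+    ]
+
+    def fake_infer(item):
+        return dict(item)
+
+    iterator = PipelinePackIterator(loader, fake_infer, {})
+    return list(iterator)  # -> [[{'value': 1}, {'value': 2}], [{'value': 3}]]
+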
+
+class KeyDataset(Dataset):
+ def __init__(self, dataset: Dataset, key: str):
+ self.dataset = dataset
+ self.key = key
+
+ def __len__(self):
+ return len(self.dataset)
+
+ def __getitem__(self, i):
+ return self.dataset[i][self.key]
+
+
+class KeyPairDataset(Dataset):
+ def __init__(self, dataset: Dataset, key1: str, key2: str):
+ self.dataset = dataset
+ self.key1 = key1
+ self.key2 = key2
+
+ def __len__(self):
+ return len(self.dataset)
+
+ def __getitem__(self, i):
+ return {"text": self.dataset[i][self.key1], "text_pair": self.dataset[i][self.key2]}
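+
+
+def _key_dataset_sketch():
+    # Minimal sketch (illustration only, not part of the upstream module): a plain list of
+    # dicts stands in for a `datasets.Dataset`. KeyDataset exposes one column, while
+    # KeyPairDataset pairs two columns as `text`/`text_pair` for sentence-pair pipelines.
+    rows = [
+        {"premise": "It rains", "hypothesis": "The ground is wet"},
+        {"premise": "It rains", "hypothesis": "The sun is shining"},
+    ]
+    premises = KeyDataset(rows, "premise")
+    pairs = KeyPairDataset(rows, "premise", "hypothesis")
+    return premises[0], pairs[0]
+    # -> ('It rains', {'text': 'It rains', 'text_pair': 'The ground is wet'})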
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/question_answering.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/question_answering.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b876eefc492793087871602f51fcd6fb55f5244
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/question_answering.py
@@ -0,0 +1,682 @@
+import inspect
+import types
+import warnings
+from collections.abc import Iterable
+from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
+
+import numpy as np
+
+from ..data import SquadExample, SquadFeatures, squad_convert_examples_to_features
+from ..modelcard import ModelCard
+from ..tokenization_utils import PreTrainedTokenizer
+from ..utils import (
+ PaddingStrategy,
+ add_end_docstrings,
+ is_tf_available,
+ is_tokenizers_available,
+ is_torch_available,
+ logging,
+)
+from .base import ArgumentHandler, ChunkPipeline, build_pipeline_init_args
+
+
+logger = logging.get_logger(__name__)
+
+if TYPE_CHECKING:
+ from ..modeling_tf_utils import TFPreTrainedModel
+ from ..modeling_utils import PreTrainedModel
+
+ if is_tokenizers_available():
+ import tokenizers
+
+if is_tf_available():
+ import tensorflow as tf
+
+ from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES
+
+ Dataset = None
+
+if is_torch_available():
+ import torch
+ from torch.utils.data import Dataset
+
+ from ..models.auto.modeling_auto import MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES
+
+
+def decode_spans(
+ start: np.ndarray, end: np.ndarray, topk: int, max_answer_len: int, undesired_tokens: np.ndarray
+) -> Tuple:
+ """
+    Take the output of any `ModelForQuestionAnswering` and generate probabilities for each span to be the actual
+    answer.
+
+    In addition, it filters out some unwanted/impossible cases like the answer length being greater than
+    max_answer_len or the answer end position being before the starting position. The method supports outputting the
+    k-best answers through the topk argument.
+
+ Args:
+ start (`np.ndarray`): Individual start probabilities for each token.
+ end (`np.ndarray`): Individual end probabilities for each token.
+ topk (`int`): Indicates how many possible answer span(s) to extract from the model output.
+ max_answer_len (`int`): Maximum size of the answer to extract from the model's output.
+ undesired_tokens (`np.ndarray`): Mask determining tokens that can be part of the answer
+ """
+ # Ensure we have batch axis
+ if start.ndim == 1:
+ start = start[None]
+
+ if end.ndim == 1:
+ end = end[None]
+
+ # Compute the score of each tuple(start, end) to be the real answer
+ outer = np.matmul(np.expand_dims(start, -1), np.expand_dims(end, 1))
+
+ # Remove candidate with end < start and end - start > max_answer_len
+ candidates = np.tril(np.triu(outer), max_answer_len - 1)
+
+ # Inspired by Chen & al. (https://github.com/facebookresearch/DrQA)
+ scores_flat = candidates.flatten()
+ if topk == 1:
+ idx_sort = [np.argmax(scores_flat)]
+ elif len(scores_flat) < topk:
+ idx_sort = np.argsort(-scores_flat)
+ else:
+ idx = np.argpartition(-scores_flat, topk)[0:topk]
+ idx_sort = idx[np.argsort(-scores_flat[idx])]
+
+ starts, ends = np.unravel_index(idx_sort, candidates.shape)[1:]
+ desired_spans = np.isin(starts, undesired_tokens.nonzero()) & np.isin(ends, undesired_tokens.nonzero())
+ starts = starts[desired_spans]
+ ends = ends[desired_spans]
+ scores = candidates[0, starts, ends]
+
+ return starts, ends, scores
+
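+
+def _decode_spans_sketch():
+    # Minimal sketch (illustration only, not part of the upstream module): toy start/end
+    # probabilities over 4 tokens, where token 0 (e.g. a special token) is excluded from
+    # the answer through `undesired_tokens`.
+    start = np.array([0.0, 0.1, 0.8, 0.1])
+    end = np.array([0.0, 0.1, 0.2, 0.7])
+    allowed = np.array([0, 1, 1, 1])  # 1 = token may be part of the answer
+    starts, ends, scores = decode_spans(start, end, topk=1, max_answer_len=3, undesired_tokens=allowed)
+    return starts, ends, scores  # best span covers tokens 2..3 with score ~0.56
+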
+
+def select_starts_ends(
+ start,
+ end,
+ p_mask,
+ attention_mask,
+ min_null_score=1000000,
+ top_k=1,
+ handle_impossible_answer=False,
+ max_answer_len=15,
+):
+ """
+ Takes the raw output of any `ModelForQuestionAnswering` and first normalizes its outputs and then uses
+ `decode_spans()` to generate probabilities for each span to be the actual answer.
+
+ Args:
+ start (`np.ndarray`): Individual start logits for each token.
+ end (`np.ndarray`): Individual end logits for each token.
+ p_mask (`np.ndarray`): A mask with 1 for values that cannot be in the answer
+ attention_mask (`np.ndarray`): The attention mask generated by the tokenizer
+        min_null_score (`float`): The minimum null (empty) answer score seen so far.
+        top_k (`int`): Indicates how many possible answer span(s) to extract from the model output.
+        handle_impossible_answer (`bool`): Whether to allow null (empty) answers.
+ max_answer_len (`int`): Maximum size of the answer to extract from the model's output.
+ """
+ # Ensure padded tokens & question tokens cannot belong to the set of candidate answers.
+ undesired_tokens = np.abs(np.array(p_mask) - 1)
+
+ if attention_mask is not None:
+ undesired_tokens = undesired_tokens & attention_mask
+
+ # Generate mask
+ undesired_tokens_mask = undesired_tokens == 0.0
+
+ # Make sure non-context indexes in the tensor cannot contribute to the softmax
+ start = np.where(undesired_tokens_mask, -10000.0, start)
+ end = np.where(undesired_tokens_mask, -10000.0, end)
+
+ # Normalize logits and spans to retrieve the answer
+ start = np.exp(start - start.max(axis=-1, keepdims=True))
+ start = start / start.sum()
+
+ end = np.exp(end - end.max(axis=-1, keepdims=True))
+ end = end / end.sum()
+
+ if handle_impossible_answer:
+ min_null_score = min(min_null_score, (start[0, 0] * end[0, 0]).item())
+
+ # Mask CLS
+ start[0, 0] = end[0, 0] = 0.0
+
+ starts, ends, scores = decode_spans(start, end, top_k, max_answer_len, undesired_tokens)
+ return starts, ends, scores, min_null_score
+
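+
+def _select_starts_ends_sketch():
+    # Minimal sketch (illustration only, not part of the upstream module): raw logits for a
+    # 5-token input where the first two tokens are question/special tokens (p_mask == 1) and
+    # therefore cannot be part of the answer.
+    start_logits = np.array([[0.1, 0.2, 2.0, 0.3, 0.1]])
+    end_logits = np.array([[0.1, 0.1, 0.2, 2.5, 0.3]])
+    p_mask = np.array([[1, 1, 0, 0, 0]])
+    attention_mask = np.array([[1, 1, 1, 1, 1]])
+    starts, ends, scores, min_null_score = select_starts_ends(
+        start_logits, end_logits, p_mask, attention_mask, top_k=1, max_answer_len=3
+    )
+    return starts, ends, scores, min_null_score  # best span covers tokens 2..3
+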
+
+class QuestionAnsweringArgumentHandler(ArgumentHandler):
+ """
+ QuestionAnsweringPipeline requires the user to provide multiple arguments (i.e. question & context) to be mapped to
+ internal [`SquadExample`].
+
+    QuestionAnsweringArgumentHandler manages all the possible ways to create a [`SquadExample`] from the
+    command-line supplied arguments.
+ """
+
+ def normalize(self, item):
+ if isinstance(item, SquadExample):
+ return item
+ elif isinstance(item, dict):
+ for k in ["question", "context"]:
+ if k not in item:
+ raise KeyError("You need to provide a dictionary with keys {question:..., context:...}")
+ elif item[k] is None:
+ raise ValueError(f"`{k}` cannot be None")
+ elif isinstance(item[k], str) and len(item[k]) == 0:
+ raise ValueError(f"`{k}` cannot be empty")
+
+ return QuestionAnsweringPipeline.create_sample(**item)
+ raise ValueError(f"{item} argument needs to be of type (SquadExample, dict)")
+
+ def __call__(self, *args, **kwargs):
+ # Detect where the actual inputs are
+ if args is not None and len(args) > 0:
+ if len(args) == 1:
+ inputs = args[0]
+ elif len(args) == 2 and {type(el) for el in args} == {str}:
+ inputs = [{"question": args[0], "context": args[1]}]
+ else:
+ inputs = list(args)
+ # Generic compatibility with sklearn and Keras
+ # Batched data
+ elif "X" in kwargs:
+ warnings.warn(
+ "Passing the `X` argument to the pipeline is deprecated and will be removed in v5. Inputs should be passed using the `question` and `context` keyword arguments instead.",
+ FutureWarning,
+ )
+ inputs = kwargs["X"]
+ elif "data" in kwargs:
+ warnings.warn(
+ "Passing the `data` argument to the pipeline is deprecated and will be removed in v5. Inputs should be passed using the `question` and `context` keyword arguments instead.",
+ FutureWarning,
+ )
+ inputs = kwargs["data"]
+ elif "question" in kwargs and "context" in kwargs:
+ if isinstance(kwargs["question"], list) and isinstance(kwargs["context"], str):
+ inputs = [{"question": Q, "context": kwargs["context"]} for Q in kwargs["question"]]
+ elif isinstance(kwargs["question"], list) and isinstance(kwargs["context"], list):
+ if len(kwargs["question"]) != len(kwargs["context"]):
+ raise ValueError("Questions and contexts don't have the same lengths")
+
+ inputs = [{"question": Q, "context": C} for Q, C in zip(kwargs["question"], kwargs["context"])]
+ elif isinstance(kwargs["question"], str) and isinstance(kwargs["context"], str):
+ inputs = [{"question": kwargs["question"], "context": kwargs["context"]}]
+ else:
+ raise ValueError("Arguments can't be understood")
+ else:
+ raise ValueError(f"Unknown arguments {kwargs}")
+
+        # When the user is sending a generator, we need to trust it's a valid example
+ generator_types = (types.GeneratorType, Dataset) if Dataset is not None else (types.GeneratorType,)
+ if isinstance(inputs, generator_types):
+ return inputs
+
+ # Normalize inputs
+ if isinstance(inputs, dict):
+ inputs = [inputs]
+ elif isinstance(inputs, Iterable):
+ # Copy to avoid overriding arguments
+ inputs = list(inputs)
+ else:
+ raise ValueError(f"Invalid arguments {kwargs}")
+
+ for i, item in enumerate(inputs):
+ inputs[i] = self.normalize(item)
+
+ return inputs
+
+
+@add_end_docstrings(build_pipeline_init_args(has_tokenizer=True))
+class QuestionAnsweringPipeline(ChunkPipeline):
+ """
+ Question Answering pipeline using any `ModelForQuestionAnswering`. See the [question answering
+ examples](../task_summary#question-answering) for more information.
+
+ Example:
+
+ ```python
+ >>> from transformers import pipeline
+
+ >>> oracle = pipeline(model="deepset/roberta-base-squad2")
+ >>> oracle(question="Where do I live?", context="My name is Wolfgang and I live in Berlin")
+ {'score': 0.9191, 'start': 34, 'end': 40, 'answer': 'Berlin'}
+ ```
+
+ Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)
+
+ This question answering pipeline can currently be loaded from [`pipeline`] using the following task identifier:
+ `"question-answering"`.
+
+ The models that this pipeline can use are models that have been fine-tuned on a question answering task. See the
+ up-to-date list of available models on
+ [huggingface.co/models](https://huggingface.co/models?filter=question-answering).
+ """
+
+ default_input_names = "question,context"
+ handle_impossible_answer = False
+
+ def __init__(
+ self,
+ model: Union["PreTrainedModel", "TFPreTrainedModel"],
+ tokenizer: PreTrainedTokenizer,
+ modelcard: Optional[ModelCard] = None,
+ framework: Optional[str] = None,
+ task: str = "",
+ **kwargs,
+ ):
+ super().__init__(
+ model=model,
+ tokenizer=tokenizer,
+ modelcard=modelcard,
+ framework=framework,
+ task=task,
+ **kwargs,
+ )
+
+ self._args_parser = QuestionAnsweringArgumentHandler()
+ self.check_model_type(
+ TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES
+ if self.framework == "tf"
+ else MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES
+ )
+
+ @staticmethod
+ def create_sample(
+ question: Union[str, List[str]], context: Union[str, List[str]]
+ ) -> Union[SquadExample, List[SquadExample]]:
+ """
+        QuestionAnsweringPipeline leverages the [`SquadExample`] internally. This helper method encapsulates all the
+        logic for converting question(s) and context(s) to [`SquadExample`].
+
+ We currently support extractive question answering.
+
+ Arguments:
+ question (`str` or `List[str]`): The question(s) asked.
+ context (`str` or `List[str]`): The context(s) in which we will look for the answer.
+
+ Returns:
+ One or a list of [`SquadExample`]: The corresponding [`SquadExample`] grouping question and context.
+ """
+ if isinstance(question, list):
+ return [SquadExample(None, q, c, None, None, None) for q, c in zip(question, context)]
+ else:
+ return SquadExample(None, question, context, None, None, None)
+
+ def _sanitize_parameters(
+ self,
+ padding=None,
+ topk=None,
+ top_k=None,
+ doc_stride=None,
+ max_answer_len=None,
+ max_seq_len=None,
+ max_question_len=None,
+ handle_impossible_answer=None,
+ align_to_words=None,
+ **kwargs,
+ ):
+ # Set defaults values
+ preprocess_params = {}
+ if padding is not None:
+ preprocess_params["padding"] = padding
+ if doc_stride is not None:
+ preprocess_params["doc_stride"] = doc_stride
+ if max_question_len is not None:
+ preprocess_params["max_question_len"] = max_question_len
+ if max_seq_len is not None:
+ preprocess_params["max_seq_len"] = max_seq_len
+
+ postprocess_params = {}
+ if topk is not None and top_k is None:
+ warnings.warn("topk parameter is deprecated, use top_k instead", UserWarning)
+ top_k = topk
+ if top_k is not None:
+ if top_k < 1:
+ raise ValueError(f"top_k parameter should be >= 1 (got {top_k})")
+ postprocess_params["top_k"] = top_k
+        if max_answer_len is not None:
+            if max_answer_len < 1:
+                raise ValueError(f"max_answer_len parameter should be >= 1 (got {max_answer_len})")
+            postprocess_params["max_answer_len"] = max_answer_len
+ if handle_impossible_answer is not None:
+ postprocess_params["handle_impossible_answer"] = handle_impossible_answer
+ if align_to_words is not None:
+ postprocess_params["align_to_words"] = align_to_words
+ return preprocess_params, {}, postprocess_params
+
+ def __call__(self, *args, **kwargs):
+ """
+ Answer the question(s) given as inputs by using the context(s).
+
+ Args:
+ question (`str` or `List[str]`):
+ One or several question(s) (must be used in conjunction with the `context` argument).
+ context (`str` or `List[str]`):
+ One or several context(s) associated with the question(s) (must be used in conjunction with the
+ `question` argument).
+ top_k (`int`, *optional*, defaults to 1):
+                The number of answers to return (will be chosen by order of likelihood). Note that we return fewer
+                than top_k answers if there are not enough options available within the context.
+ doc_stride (`int`, *optional*, defaults to 128):
+ If the context is too long to fit with the question for the model, it will be split in several chunks
+ with some overlap. This argument controls the size of that overlap.
+ max_answer_len (`int`, *optional*, defaults to 15):
+ The maximum length of predicted answers (e.g., only answers with a shorter length are considered).
+ max_seq_len (`int`, *optional*, defaults to 384):
+ The maximum length of the total sentence (context + question) in tokens of each chunk passed to the
+ model. The context will be split in several chunks (using `doc_stride` as overlap) if needed.
+ max_question_len (`int`, *optional*, defaults to 64):
+ The maximum length of the question after tokenization. It will be truncated if needed.
+ handle_impossible_answer (`bool`, *optional*, defaults to `False`):
+ Whether or not we accept impossible as an answer.
+ align_to_words (`bool`, *optional*, defaults to `True`):
+ Attempts to align the answer to real words. Improves quality on space separated languages. Might hurt on
+ non-space-separated languages (like Japanese or Chinese)
+
+ Return:
+ A `dict` or a list of `dict`: Each result comes as a dictionary with the following keys:
+
+ - **score** (`float`) -- The probability associated to the answer.
+ - **start** (`int`) -- The character start index of the answer (in the tokenized version of the input).
+ - **end** (`int`) -- The character end index of the answer (in the tokenized version of the input).
+ - **answer** (`str`) -- The answer to the question.
+ """
+
+ # Convert inputs to features
+ if args:
+ warnings.warn(
+ "Passing a list of SQuAD examples to the pipeline is deprecated and will be removed in v5. Inputs should be passed using the `question` and `context` keyword arguments instead.",
+ FutureWarning,
+ )
+
+ examples = self._args_parser(*args, **kwargs)
+ if isinstance(examples, (list, tuple)) and len(examples) == 1:
+ return super().__call__(examples[0], **kwargs)
+ return super().__call__(examples, **kwargs)
+
+ def preprocess(self, example, padding="do_not_pad", doc_stride=None, max_question_len=64, max_seq_len=None):
+        # XXX: This is special, args_parser will not handle anything generator or dataset like
+ # For those we expect user to send a simple valid example either directly as a SquadExample or simple dict.
+ # So we still need a little sanitation here.
+ if isinstance(example, dict):
+ example = SquadExample(None, example["question"], example["context"], None, None, None)
+
+ if max_seq_len is None:
+ max_seq_len = min(self.tokenizer.model_max_length, 384)
+ if doc_stride is None:
+ doc_stride = min(max_seq_len // 2, 128)
+
+ if doc_stride > max_seq_len:
+ raise ValueError(f"`doc_stride` ({doc_stride}) is larger than `max_seq_len` ({max_seq_len})")
+
+ if not self.tokenizer.is_fast:
+ features = squad_convert_examples_to_features(
+ examples=[example],
+ tokenizer=self.tokenizer,
+ max_seq_length=max_seq_len,
+ doc_stride=doc_stride,
+ max_query_length=max_question_len,
+ padding_strategy=PaddingStrategy.MAX_LENGTH,
+ is_training=False,
+ tqdm_enabled=False,
+ )
+ else:
+ # Define the side we want to truncate / pad and the text/pair sorting
+ question_first = self.tokenizer.padding_side == "right"
+
+ encoded_inputs = self.tokenizer(
+ text=example.question_text if question_first else example.context_text,
+ text_pair=example.context_text if question_first else example.question_text,
+ padding=padding,
+ truncation="only_second" if question_first else "only_first",
+ max_length=max_seq_len,
+ stride=doc_stride,
+ return_token_type_ids=True,
+ return_overflowing_tokens=True,
+ return_offsets_mapping=True,
+ return_special_tokens_mask=True,
+ )
+            # When the input is too long, it's converted into a batch of inputs with overflowing tokens
+            # and a stride of overlap between the inputs. If a batch of inputs is given, a special output
+            # "overflow_to_sample_mapping" indicates which member of the encoded batch belongs to which original batch sample.
+ # Here we tokenize examples one-by-one so we don't need to use "overflow_to_sample_mapping".
+ # "num_span" is the number of output samples generated from the overflowing tokens.
+ num_spans = len(encoded_inputs["input_ids"])
+
+            # p_mask: mask with 1 for tokens that cannot be in the answer (0 for tokens which can be in an answer)
+ # We put 0 on the tokens from the context and 1 everywhere else (question and special tokens)
+ p_mask = [
+ [tok != 1 if question_first else 0 for tok in encoded_inputs.sequence_ids(span_id)]
+ for span_id in range(num_spans)
+ ]
+
+ features = []
+ for span_idx in range(num_spans):
+ input_ids_span_idx = encoded_inputs["input_ids"][span_idx]
+ attention_mask_span_idx = (
+ encoded_inputs["attention_mask"][span_idx] if "attention_mask" in encoded_inputs else None
+ )
+ token_type_ids_span_idx = (
+ encoded_inputs["token_type_ids"][span_idx] if "token_type_ids" in encoded_inputs else None
+ )
+ # keep the cls_token unmasked (some models use it to indicate unanswerable questions)
+ if self.tokenizer.cls_token_id is not None:
+ cls_indices = np.nonzero(np.array(input_ids_span_idx) == self.tokenizer.cls_token_id)[0]
+ for cls_index in cls_indices:
+ p_mask[span_idx][cls_index] = 0
+ submask = p_mask[span_idx]
+ features.append(
+ SquadFeatures(
+ input_ids=input_ids_span_idx,
+ attention_mask=attention_mask_span_idx,
+ token_type_ids=token_type_ids_span_idx,
+ p_mask=submask,
+ encoding=encoded_inputs[span_idx],
+ # We don't use the rest of the values - and actually
+ # for Fast tokenizer we could totally avoid using SquadFeatures and SquadExample
+ cls_index=None,
+ token_to_orig_map={},
+ example_index=0,
+ unique_id=0,
+ paragraph_len=0,
+ token_is_max_context=0,
+ tokens=[],
+ start_position=0,
+ end_position=0,
+ is_impossible=False,
+ qas_id=None,
+ )
+ )
+
+ for i, feature in enumerate(features):
+ fw_args = {}
+ others = {}
+ model_input_names = self.tokenizer.model_input_names + ["p_mask", "token_type_ids"]
+
+ for k, v in feature.__dict__.items():
+ if k in model_input_names:
+ if self.framework == "tf":
+ tensor = tf.constant(v)
+ if tensor.dtype == tf.int64:
+ tensor = tf.cast(tensor, tf.int32)
+ fw_args[k] = tf.expand_dims(tensor, 0)
+ elif self.framework == "pt":
+ tensor = torch.tensor(v)
+ if tensor.dtype == torch.int32:
+ tensor = tensor.long()
+ fw_args[k] = tensor.unsqueeze(0)
+ else:
+ others[k] = v
+
+ is_last = i == len(features) - 1
+ yield {"example": example, "is_last": is_last, **fw_args, **others}
+
+ def _forward(self, inputs):
+ example = inputs["example"]
+ model_inputs = {k: inputs[k] for k in self.tokenizer.model_input_names}
+        # `XXXForQuestionAnswering` models should not use `use_cache=True` even if it's supported
+ model_forward = self.model.forward if self.framework == "pt" else self.model.call
+ if "use_cache" in inspect.signature(model_forward).parameters.keys():
+ model_inputs["use_cache"] = False
+ output = self.model(**model_inputs)
+ if isinstance(output, dict):
+ return {"start": output["start_logits"], "end": output["end_logits"], "example": example, **inputs}
+ else:
+ start, end = output[:2]
+ return {"start": start, "end": end, "example": example, **inputs}
+
+ def postprocess(
+ self,
+ model_outputs,
+ top_k=1,
+ handle_impossible_answer=False,
+ max_answer_len=15,
+ align_to_words=True,
+ ):
+ min_null_score = 1000000 # large and positive
+ answers = []
+ for output in model_outputs:
+ if self.framework == "pt" and output["start"].dtype == torch.bfloat16:
+ start_ = output["start"].to(torch.float32)
+ else:
+ start_ = output["start"]
+            if self.framework == "pt" and output["end"].dtype == torch.bfloat16:
+ end_ = output["end"].to(torch.float32)
+ else:
+ end_ = output["end"]
+ example = output["example"]
+ p_mask = output["p_mask"]
+ attention_mask = (
+ output["attention_mask"].numpy() if output.get("attention_mask", None) is not None else None
+ )
+
+ starts, ends, scores, min_null_score = select_starts_ends(
+ start_, end_, p_mask, attention_mask, min_null_score, top_k, handle_impossible_answer, max_answer_len
+ )
+
+ if not self.tokenizer.is_fast:
+ char_to_word = np.array(example.char_to_word_offset)
+
+ # Convert the answer (tokens) back to the original text
+ # Score: score from the model
+ # Start: Index of the first character of the answer in the context string
+ # End: Index of the character following the last character of the answer in the context string
+ # Answer: Plain text of the answer
+ for s, e, score in zip(starts, ends, scores):
+ token_to_orig_map = output["token_to_orig_map"]
+ answers.append(
+ {
+ "score": score.item(),
+ "start": np.where(char_to_word == token_to_orig_map[s])[0][0].item(),
+ "end": np.where(char_to_word == token_to_orig_map[e])[0][-1].item(),
+ "answer": " ".join(example.doc_tokens[token_to_orig_map[s] : token_to_orig_map[e] + 1]),
+ }
+ )
+ else:
+ # Convert the answer (tokens) back to the original text
+ # Score: score from the model
+ # Start: Index of the first character of the answer in the context string
+ # End: Index of the character following the last character of the answer in the context string
+ # Answer: Plain text of the answer
+ question_first = bool(self.tokenizer.padding_side == "right")
+ enc = output["encoding"]
+
+                # Encoding was *not* padded, input_ids *might* be.
+ # It doesn't make a difference unless we're padding on
+ # the left hand side, since now we have different offsets
+ # everywhere.
+ if self.tokenizer.padding_side == "left":
+ offset = (output["input_ids"] == self.tokenizer.pad_token_id).numpy().sum()
+ else:
+ offset = 0
+
+ # Sometimes the max probability token is in the middle of a word so:
+ # - we start by finding the right word containing the token with `token_to_word`
+ # - then we convert this word in a character span with `word_to_chars`
+ sequence_index = 1 if question_first else 0
+ for s, e, score in zip(starts, ends, scores):
+ s = s - offset
+ e = e - offset
+
+ start_index, end_index = self.get_indices(enc, s, e, sequence_index, align_to_words)
+
+ answers.append(
+ {
+ "score": score.item(),
+ "start": start_index,
+ "end": end_index,
+ "answer": example.context_text[start_index:end_index],
+ }
+ )
+
+ if handle_impossible_answer:
+ answers.append({"score": min_null_score, "start": 0, "end": 0, "answer": ""})
+ answers = sorted(answers, key=lambda x: x["score"], reverse=True)[:top_k]
+ if len(answers) == 1:
+ return answers[0]
+ return answers
+
+ def get_indices(
+ self, enc: "tokenizers.Encoding", s: int, e: int, sequence_index: int, align_to_words: bool
+ ) -> Tuple[int, int]:
+ if align_to_words:
+ try:
+ start_word = enc.token_to_word(s)
+ end_word = enc.token_to_word(e)
+ start_index = enc.word_to_chars(start_word, sequence_index=sequence_index)[0]
+ end_index = enc.word_to_chars(end_word, sequence_index=sequence_index)[1]
+ except Exception:
+ # Some tokenizers don't really handle words. Keep to offsets then.
+ start_index = enc.offsets[s][0]
+ end_index = enc.offsets[e][1]
+ else:
+ start_index = enc.offsets[s][0]
+ end_index = enc.offsets[e][1]
+ return start_index, end_index
+
+ def span_to_answer(self, text: str, start: int, end: int) -> Dict[str, Union[str, int]]:
+ """
+ When decoding from token probabilities, this method maps token indexes to actual word in the initial context.
+
+ Args:
+ text (`str`): The actual context to extract the answer from.
+ start (`int`): The answer starting token index.
+ end (`int`): The answer end token index.
+
+ Returns:
+ Dictionary like `{'answer': str, 'start': int, 'end': int}`
+ """
+ words = []
+ token_idx = char_start_idx = char_end_idx = chars_idx = 0
+
+ for i, word in enumerate(text.split(" ")):
+ token = self.tokenizer.tokenize(word)
+
+ # Append words if they are in the span
+ if start <= token_idx <= end:
+ if token_idx == start:
+ char_start_idx = chars_idx
+
+ if token_idx == end:
+ char_end_idx = chars_idx + len(word)
+
+ words += [word]
+
+ # Stop if we went over the end of the answer
+ if token_idx > end:
+ break
+
+ # Append the subtokenization length to the running index
+ token_idx += len(token)
+ chars_idx += len(word) + 1
+
+ # Join text with spaces
+ return {
+ "answer": " ".join(words),
+ "start": max(0, char_start_idx),
+ "end": min(len(text), char_end_idx),
+ }
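+
+
+def _question_answering_usage_sketch():
+    # Minimal usage sketch (illustration only, not part of the upstream module). The
+    # checkpoint is the one used in the class docstring; exact scores vary slightly
+    # between model and library versions.
+    from transformers import pipeline
+
+    qa = pipeline("question-answering", model="deepset/roberta-base-squad2")
+    context = "My name is Wolfgang and I live in Berlin"
+    best = qa(question="Where do I live?", context=context)
+    # `top_k` returns several candidate spans, `max_answer_len` caps the span length.
+    candidates = qa(question="Where do I live?", context=context, top_k=2, max_answer_len=10)
+    return best, candidates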
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/table_question_answering.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/table_question_answering.py
new file mode 100644
index 0000000000000000000000000000000000000000..10ea7170fed40cbc6f14c8b712741ce570fbf3f7
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/table_question_answering.py
@@ -0,0 +1,443 @@
+import collections
+import types
+
+import numpy as np
+
+from ..utils import (
+ add_end_docstrings,
+ is_tf_available,
+ is_torch_available,
+ requires_backends,
+)
+from .base import ArgumentHandler, Dataset, Pipeline, PipelineException, build_pipeline_init_args
+
+
+if is_torch_available():
+ import torch
+
+ from ..models.auto.modeling_auto import (
+ MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES,
+ MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING_NAMES,
+ )
+
+if is_tf_available():
+ import tensorflow as tf
+
+ from ..models.auto.modeling_tf_auto import (
+ TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES,
+ TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING_NAMES,
+ )
+
+
+class TableQuestionAnsweringArgumentHandler(ArgumentHandler):
+ """
+ Handles arguments for the TableQuestionAnsweringPipeline
+ """
+
+ def __call__(self, table=None, query=None, **kwargs):
+ # Returns tqa_pipeline_inputs of shape:
+ # [
+ # {"table": pd.DataFrame, "query": List[str]},
+ # ...,
+ # {"table": pd.DataFrame, "query" : List[str]}
+ # ]
+ requires_backends(self, "pandas")
+ import pandas as pd
+
+ if table is None:
+ raise ValueError("Keyword argument `table` cannot be None.")
+ elif query is None:
+ if isinstance(table, dict) and table.get("query") is not None and table.get("table") is not None:
+ tqa_pipeline_inputs = [table]
+ elif isinstance(table, list) and len(table) > 0:
+ if not all(isinstance(d, dict) for d in table):
+ raise ValueError(
+                        f"Keyword argument `table` should be a list of dict, but is {[type(d) for d in table]}"
+ )
+
+ if table[0].get("query") is not None and table[0].get("table") is not None:
+ tqa_pipeline_inputs = table
+ else:
+ raise ValueError(
+ "If keyword argument `table` is a list of dictionaries, each dictionary should have a `table`"
+                        f" and `query` key, but the first dictionary has keys {table[0].keys()}."
+ )
+            elif (Dataset is not None and isinstance(table, Dataset)) or isinstance(table, types.GeneratorType):
+ return table
+ else:
+ raise ValueError(
+ "Invalid input. Keyword argument `table` should be either of type `dict` or `list`, but "
+ f"is {type(table)})"
+ )
+ else:
+ tqa_pipeline_inputs = [{"table": table, "query": query}]
+
+ for tqa_pipeline_input in tqa_pipeline_inputs:
+ if not isinstance(tqa_pipeline_input["table"], pd.DataFrame):
+ if tqa_pipeline_input["table"] is None:
+ raise ValueError("Table cannot be None.")
+
+ tqa_pipeline_input["table"] = pd.DataFrame(tqa_pipeline_input["table"])
+
+ return tqa_pipeline_inputs
+
+
+@add_end_docstrings(build_pipeline_init_args(has_tokenizer=True))
+class TableQuestionAnsweringPipeline(Pipeline):
+ """
+ Table Question Answering pipeline using a `ModelForTableQuestionAnswering`. This pipeline is only available in
+ PyTorch.
+
+ Example:
+
+ ```python
+ >>> from transformers import pipeline
+
+ >>> oracle = pipeline(model="google/tapas-base-finetuned-wtq")
+ >>> table = {
+ ... "Repository": ["Transformers", "Datasets", "Tokenizers"],
+ ... "Stars": ["36542", "4512", "3934"],
+ ... "Contributors": ["651", "77", "34"],
+ ... "Programming language": ["Python", "Python", "Rust, Python and NodeJS"],
+ ... }
+ >>> oracle(query="How many stars does the transformers repository have?", table=table)
+ {'answer': 'AVERAGE > 36542', 'coordinates': [(0, 1)], 'cells': ['36542'], 'aggregator': 'AVERAGE'}
+ ```
+
+ Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)
+
+ This tabular question answering pipeline can currently be loaded from [`pipeline`] using the following task
+ identifier: `"table-question-answering"`.
+
+ The models that this pipeline can use are models that have been fine-tuned on a tabular question answering task.
+ See the up-to-date list of available models on
+ [huggingface.co/models](https://huggingface.co/models?filter=table-question-answering).
+ """
+
+ default_input_names = "table,query"
+
+ def __init__(self, args_parser=TableQuestionAnsweringArgumentHandler(), *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self._args_parser = args_parser
+
+ if self.framework == "tf":
+ mapping = TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING_NAMES.copy()
+ mapping.update(TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES)
+ else:
+ mapping = MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING_NAMES.copy()
+ mapping.update(MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES)
+ self.check_model_type(mapping)
+
+ self.aggregate = bool(getattr(self.model.config, "aggregation_labels", None)) and bool(
+ getattr(self.model.config, "num_aggregation_labels", None)
+ )
+ self.type = "tapas" if hasattr(self.model.config, "aggregation_labels") else None
+
+ def batch_inference(self, **inputs):
+ return self.model(**inputs)
+
+ def sequential_inference(self, **inputs):
+ """
+ Inference used for models that need to process sequences in a sequential fashion, like the SQA models which
+ handle conversational query related to a table.
+        handle conversational queries related to a table.
+ if self.framework == "pt":
+ all_logits = []
+ all_aggregations = []
+ prev_answers = None
+ batch_size = inputs["input_ids"].shape[0]
+
+ input_ids = inputs["input_ids"].to(self.device)
+ attention_mask = inputs["attention_mask"].to(self.device)
+ token_type_ids = inputs["token_type_ids"].to(self.device)
+ token_type_ids_example = None
+
+ for index in range(batch_size):
+ # If sequences have already been processed, the token type IDs will be created according to the previous
+ # answer.
+ if prev_answers is not None:
+ prev_labels_example = token_type_ids_example[:, 3] # shape (seq_len,)
+ model_labels = np.zeros_like(prev_labels_example.cpu().numpy()) # shape (seq_len,)
+
+ token_type_ids_example = token_type_ids[index] # shape (seq_len, 7)
+ for i in range(model_labels.shape[0]):
+ segment_id = token_type_ids_example[:, 0].tolist()[i]
+ col_id = token_type_ids_example[:, 1].tolist()[i] - 1
+ row_id = token_type_ids_example[:, 2].tolist()[i] - 1
+
+ if row_id >= 0 and col_id >= 0 and segment_id == 1:
+ model_labels[i] = int(prev_answers[(col_id, row_id)])
+
+ token_type_ids_example[:, 3] = torch.from_numpy(model_labels).type(torch.long).to(self.device)
+
+ input_ids_example = input_ids[index]
+ attention_mask_example = attention_mask[index] # shape (seq_len,)
+ token_type_ids_example = token_type_ids[index] # shape (seq_len, 7)
+ outputs = self.model(
+ input_ids=input_ids_example.unsqueeze(0),
+ attention_mask=attention_mask_example.unsqueeze(0),
+ token_type_ids=token_type_ids_example.unsqueeze(0),
+ )
+ logits = outputs.logits
+
+ if self.aggregate:
+ all_aggregations.append(outputs.logits_aggregation)
+
+ all_logits.append(logits)
+
+ dist_per_token = torch.distributions.Bernoulli(logits=logits)
+ probabilities = dist_per_token.probs * attention_mask_example.type(torch.float32).to(
+ dist_per_token.probs.device
+ )
+
+ coords_to_probs = collections.defaultdict(list)
+ for i, p in enumerate(probabilities.squeeze().tolist()):
+ segment_id = token_type_ids_example[:, 0].tolist()[i]
+ col = token_type_ids_example[:, 1].tolist()[i] - 1
+ row = token_type_ids_example[:, 2].tolist()[i] - 1
+ if col >= 0 and row >= 0 and segment_id == 1:
+ coords_to_probs[(col, row)].append(p)
+
+ prev_answers = {key: np.array(coords_to_probs[key]).mean() > 0.5 for key in coords_to_probs}
+
+ logits_batch = torch.cat(tuple(all_logits), 0)
+
+ return (logits_batch,) if not self.aggregate else (logits_batch, torch.cat(tuple(all_aggregations), 0))
+ else:
+ all_logits = []
+ all_aggregations = []
+ prev_answers = None
+ batch_size = inputs["input_ids"].shape[0]
+
+ input_ids = inputs["input_ids"]
+ attention_mask = inputs["attention_mask"]
+ token_type_ids = inputs["token_type_ids"].numpy()
+ token_type_ids_example = None
+
+ for index in range(batch_size):
+ # If sequences have already been processed, the token type IDs will be created according to the previous
+ # answer.
+ if prev_answers is not None:
+ prev_labels_example = token_type_ids_example[:, 3] # shape (seq_len,)
+ model_labels = np.zeros_like(prev_labels_example, dtype=np.int32) # shape (seq_len,)
+
+ token_type_ids_example = token_type_ids[index] # shape (seq_len, 7)
+ for i in range(model_labels.shape[0]):
+ segment_id = token_type_ids_example[:, 0].tolist()[i]
+ col_id = token_type_ids_example[:, 1].tolist()[i] - 1
+ row_id = token_type_ids_example[:, 2].tolist()[i] - 1
+
+ if row_id >= 0 and col_id >= 0 and segment_id == 1:
+ model_labels[i] = int(prev_answers[(col_id, row_id)])
+
+ token_type_ids_example[:, 3] = model_labels
+
+ input_ids_example = input_ids[index]
+ attention_mask_example = attention_mask[index] # shape (seq_len,)
+ token_type_ids_example = token_type_ids[index] # shape (seq_len, 7)
+ outputs = self.model(
+ input_ids=np.expand_dims(input_ids_example, axis=0),
+ attention_mask=np.expand_dims(attention_mask_example, axis=0),
+ token_type_ids=np.expand_dims(token_type_ids_example, axis=0),
+ )
+ logits = outputs.logits
+
+ if self.aggregate:
+ all_aggregations.append(outputs.logits_aggregation)
+
+ all_logits.append(logits)
+
+ probabilities = tf.math.sigmoid(tf.cast(logits, tf.float32)) * tf.cast(
+ attention_mask_example, tf.float32
+ )
+
+ coords_to_probs = collections.defaultdict(list)
+ for i, p in enumerate(tf.squeeze(probabilities).numpy().tolist()):
+ segment_id = token_type_ids_example[:, 0].tolist()[i]
+ col = token_type_ids_example[:, 1].tolist()[i] - 1
+ row = token_type_ids_example[:, 2].tolist()[i] - 1
+ if col >= 0 and row >= 0 and segment_id == 1:
+ coords_to_probs[(col, row)].append(p)
+
+ prev_answers = {key: np.array(coords_to_probs[key]).mean() > 0.5 for key in coords_to_probs}
+
+ logits_batch = tf.concat(tuple(all_logits), 0)
+
+ return (logits_batch,) if not self.aggregate else (logits_batch, tf.concat(tuple(all_aggregations), 0))
+
+ def __call__(self, *args, **kwargs):
+ r"""
+ Answers queries according to a table. The pipeline accepts several types of inputs which are detailed below:
+
+ - `pipeline(table, query)`
+ - `pipeline(table, [query])`
+ - `pipeline(table=table, query=query)`
+ - `pipeline(table=table, query=[query])`
+ - `pipeline({"table": table, "query": query})`
+ - `pipeline({"table": table, "query": [query]})`
+ - `pipeline([{"table": table, "query": query}, {"table": table, "query": query}])`
+
+ The `table` argument should be a dict or a DataFrame built from that dict, containing the whole table:
+
+ Example:
+
+ ```python
+ data = {
+ "actors": ["brad pitt", "leonardo di caprio", "george clooney"],
+ "age": ["56", "45", "59"],
+ "number of movies": ["87", "53", "69"],
+ "date of birth": ["7 february 1967", "10 june 1996", "28 november 1967"],
+ }
+ ```
+
+ This dictionary can be passed in as such, or can be converted to a pandas DataFrame:
+
+ Example:
+
+ ```python
+ import pandas as pd
+
+ table = pd.DataFrame.from_dict(data)
+ ```
+
+ Args:
+ table (`pd.DataFrame` or `Dict`):
+ Pandas DataFrame or dictionary that will be converted to a DataFrame containing all the table values.
+ See above for an example of dictionary.
+ query (`str` or `List[str]`):
+ Query or list of queries that will be sent to the model alongside the table.
+ sequential (`bool`, *optional*, defaults to `False`):
+ Whether to do inference sequentially or as a batch. Batching is faster, but models like SQA require the
+ inference to be done sequentially to extract relations within sequences, given their conversational
+ nature.
+ padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
+ Activates and controls padding. Accepts the following values:
+
+                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+                  sequence is provided).
+ - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
+ acceptable input length for the model if that argument is not provided.
+ - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
+ lengths).
+
+ truncation (`bool`, `str` or [`TapasTruncationStrategy`], *optional*, defaults to `False`):
+ Activates and controls truncation. Accepts the following values:
+
+ - `True` or `'drop_rows_to_fit'`: Truncate to a maximum length specified with the argument `max_length`
+ or to the maximum acceptable input length for the model if that argument is not provided. This will
+ truncate row by row, removing rows from the table.
+ - `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths
+ greater than the model maximum admissible input size).
+
+
+ Return:
+ A dictionary or a list of dictionaries containing results: Each result is a dictionary with the following
+ keys:
+
+ - **answer** (`str`) -- The answer of the query given the table. If there is an aggregator, the answer will
+ be preceded by `AGGREGATOR >`.
+ - **coordinates** (`List[Tuple[int, int]]`) -- Coordinates of the cells of the answers.
+ - **cells** (`List[str]`) -- List of strings made up of the answer cell values.
+ - **aggregator** (`str`) -- If the model has an aggregator, this returns the aggregator.
+ """
+ pipeline_inputs = self._args_parser(*args, **kwargs)
+
+ results = super().__call__(pipeline_inputs, **kwargs)
+ if len(results) == 1:
+ return results[0]
+ return results
+
+ def _sanitize_parameters(self, sequential=None, padding=None, truncation=None, **kwargs):
+ preprocess_params = {}
+ if padding is not None:
+ preprocess_params["padding"] = padding
+ if truncation is not None:
+ preprocess_params["truncation"] = truncation
+
+ forward_params = {}
+ if sequential is not None:
+ forward_params["sequential"] = sequential
+
+ if self.assistant_model is not None:
+ forward_params["assistant_model"] = self.assistant_model
+ if self.assistant_tokenizer is not None:
+ forward_params["tokenizer"] = self.tokenizer
+ forward_params["assistant_tokenizer"] = self.assistant_tokenizer
+
+ return preprocess_params, forward_params, {}
+
+ def preprocess(self, pipeline_input, sequential=None, padding=True, truncation=None):
+ if truncation is None:
+ if self.type == "tapas":
+ truncation = "drop_rows_to_fit"
+ else:
+ truncation = "do_not_truncate"
+
+ table, query = pipeline_input["table"], pipeline_input["query"]
+ if table.empty:
+ raise ValueError("table is empty")
+ if query is None or query == "":
+ raise ValueError("query is empty")
+ inputs = self.tokenizer(table, query, return_tensors=self.framework, truncation=truncation, padding=padding)
+ inputs["table"] = table
+ return inputs
+
+ def _forward(self, model_inputs, sequential=False, **generate_kwargs):
+ table = model_inputs.pop("table")
+
+ if self.type == "tapas":
+ if sequential:
+ outputs = self.sequential_inference(**model_inputs)
+ else:
+ outputs = self.batch_inference(**model_inputs)
+ else:
+            # A user-defined `generation_config` passed to the pipeline call takes precedence
+ if "generation_config" not in generate_kwargs:
+ generate_kwargs["generation_config"] = self.generation_config
+
+ outputs = self.model.generate(**model_inputs, **generate_kwargs)
+ model_outputs = {"model_inputs": model_inputs, "table": table, "outputs": outputs}
+ return model_outputs
+
+ def postprocess(self, model_outputs):
+ inputs = model_outputs["model_inputs"]
+ table = model_outputs["table"]
+ outputs = model_outputs["outputs"]
+ if self.type == "tapas":
+ if self.aggregate:
+ logits, logits_agg = outputs[:2]
+ predictions = self.tokenizer.convert_logits_to_predictions(inputs, logits, logits_agg)
+ answer_coordinates_batch, agg_predictions = predictions
+ aggregators = {i: self.model.config.aggregation_labels[pred] for i, pred in enumerate(agg_predictions)}
+
+ no_agg_label_index = self.model.config.no_aggregation_label_index
+ aggregators_prefix = {
+ i: aggregators[i] + " > " for i, pred in enumerate(agg_predictions) if pred != no_agg_label_index
+ }
+ else:
+ logits = outputs[0]
+ predictions = self.tokenizer.convert_logits_to_predictions(inputs, logits)
+ answer_coordinates_batch = predictions[0]
+ aggregators = {}
+ aggregators_prefix = {}
+ answers = []
+ for index, coordinates in enumerate(answer_coordinates_batch):
+ cells = [table.iat[coordinate] for coordinate in coordinates]
+ aggregator = aggregators.get(index, "")
+ aggregator_prefix = aggregators_prefix.get(index, "")
+ answer = {
+ "answer": aggregator_prefix + ", ".join(cells),
+ "coordinates": coordinates,
+                    "cells": cells,
+ }
+ if aggregator:
+ answer["aggregator"] = aggregator
+
+ answers.append(answer)
+            if len(answers) == 0:
+ raise PipelineException("Empty answer")
+ else:
+ answers = [{"answer": answer} for answer in self.tokenizer.batch_decode(outputs, skip_special_tokens=True)]
+
+ return answers if len(answers) > 1 else answers[0]
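+
+
+def _table_question_answering_usage_sketch():
+    # Minimal usage sketch (illustration only, not part of the upstream module). It reuses
+    # the checkpoint and table from the class docstring and requires the `pandas` backend.
+    from transformers import pipeline
+
+    tqa = pipeline("table-question-answering", model="google/tapas-base-finetuned-wtq")
+    table = {
+        "Repository": ["Transformers", "Datasets", "Tokenizers"],
+        "Stars": ["36542", "4512", "3934"],
+    }
+    queries = [
+        "How many stars does the transformers repository have?",
+        "Which repository has the most stars?",
+    ]
+    # One result dict per query, each with `answer`, `coordinates`, `cells` and, when the
+    # model predicts one, `aggregator`.
+    return tqa(table=table, query=queries)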
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/text2text_generation.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/text2text_generation.py
new file mode 100644
index 0000000000000000000000000000000000000000..9bc7544550286ecb2ad2108d7dffb142cc123877
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/text2text_generation.py
@@ -0,0 +1,382 @@
+import enum
+import warnings
+
+from ..tokenization_utils import TruncationStrategy
+from ..utils import add_end_docstrings, is_tf_available, is_torch_available, logging
+from .base import Pipeline, build_pipeline_init_args
+
+
+if is_tf_available():
+ import tensorflow as tf
+
+ from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES
+
+if is_torch_available():
+ from ..models.auto.modeling_auto import MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES
+
+logger = logging.get_logger(__name__)
+
+
+class ReturnType(enum.Enum):
+ TENSORS = 0
+ TEXT = 1
+
+
+@add_end_docstrings(build_pipeline_init_args(has_tokenizer=True))
+class Text2TextGenerationPipeline(Pipeline):
+ """
+ Pipeline for text to text generation using seq2seq models.
+
+ Example:
+
+ ```python
+ >>> from transformers import pipeline
+
+ >>> generator = pipeline(model="mrm8488/t5-base-finetuned-question-generation-ap")
+ >>> generator(
+ ... "answer: Manuel context: Manuel has created RuPERTa-base with the support of HF-Transformers and Google"
+ ... )
+ [{'generated_text': 'question: Who created the RuPERTa-base?'}]
+ ```
+
+ Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial). You can pass text
+ generation parameters to this pipeline to control stopping criteria, decoding strategy, and more. Learn more about
+ text generation parameters in [Text generation strategies](../generation_strategies) and [Text
+ generation](text_generation).
+
+ This Text2TextGenerationPipeline pipeline can currently be loaded from [`pipeline`] using the following task
+ identifier: `"text2text-generation"`.
+
+ The models that this pipeline can use are models that have been fine-tuned on a translation task. See the
+ up-to-date list of available models on
+ [huggingface.co/models](https://huggingface.co/models?filter=text2text-generation). For a list of available
+ parameters, see the [following
+ documentation](https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.generation.GenerationMixin.generate)
+
+ Usage:
+
+ ```python
+ text2text_generator = pipeline("text2text-generation")
+ text2text_generator("question: What is 42 ? context: 42 is the answer to life, the universe and everything")
+ ```"""
+
+ # Used in the return key of the pipeline.
+ return_name = "generated"
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+
+ self.check_model_type(
+ TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES
+ if self.framework == "tf"
+ else MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES
+ )
+
+ def _sanitize_parameters(
+ self,
+ return_tensors=None,
+ return_text=None,
+ return_type=None,
+ clean_up_tokenization_spaces=None,
+ truncation=None,
+ stop_sequence=None,
+ **generate_kwargs,
+ ):
+ preprocess_params = {}
+ if truncation is not None:
+ preprocess_params["truncation"] = truncation
+
+ forward_params = generate_kwargs
+
+ postprocess_params = {}
+ if return_tensors is not None and return_type is None:
+ return_type = ReturnType.TENSORS if return_tensors else ReturnType.TEXT
+ if return_type is not None:
+ postprocess_params["return_type"] = return_type
+
+ if clean_up_tokenization_spaces is not None:
+ postprocess_params["clean_up_tokenization_spaces"] = clean_up_tokenization_spaces
+
+ if stop_sequence is not None:
+ stop_sequence_ids = self.tokenizer.encode(stop_sequence, add_special_tokens=False)
+ if len(stop_sequence_ids) > 1:
+ warnings.warn(
+ "Stopping on a multiple-token sequence is not yet supported in transformers. The first token of"
+ " the stop sequence will be used as the stop sequence string in the interim."
+ )
+ generate_kwargs["eos_token_id"] = stop_sequence_ids[0]
+
+ if self.assistant_model is not None:
+ forward_params["assistant_model"] = self.assistant_model
+ if self.assistant_tokenizer is not None:
+ forward_params["tokenizer"] = self.tokenizer
+ forward_params["assistant_tokenizer"] = self.assistant_tokenizer
+
+ return preprocess_params, forward_params, postprocess_params
+
+ def check_inputs(self, input_length: int, min_length: int, max_length: int):
+ """
+ Checks whether there might be something wrong with given input with regard to the model.
+ """
+ return True
+
+ def _parse_and_tokenize(self, *args, truncation):
+ prefix = self.prefix if self.prefix is not None else ""
+ if isinstance(args[0], list):
+ if self.tokenizer.pad_token_id is None:
+ raise ValueError("Please make sure that the tokenizer has a pad_token_id when using a batch input")
+ args = ([prefix + arg for arg in args[0]],)
+ padding = True
+
+ elif isinstance(args[0], str):
+ args = (prefix + args[0],)
+ padding = False
+ else:
+ raise ValueError(
+ f" `args[0]`: {args[0]} has the wrong format. It should be either of type `str` or type `list`"
+ )
+ inputs = self.tokenizer(*args, padding=padding, truncation=truncation, return_tensors=self.framework)
+ # This is produced by tokenizers but is not a valid generate kwarg
+ if "token_type_ids" in inputs:
+ del inputs["token_type_ids"]
+ return inputs
+
+ def __call__(self, *args, **kwargs):
+ r"""
+ Generate the output text(s) using text(s) given as inputs.
+
+ Args:
+ args (`str` or `List[str]`):
+ Input text for the encoder.
+ return_tensors (`bool`, *optional*, defaults to `False`):
+ Whether or not to include the tensors of predictions (as token indices) in the outputs.
+ return_text (`bool`, *optional*, defaults to `True`):
+ Whether or not to include the decoded texts in the outputs.
+ clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
+ Whether or not to clean up the potential extra spaces in the text output.
+ truncation (`TruncationStrategy`, *optional*, defaults to `TruncationStrategy.DO_NOT_TRUNCATE`):
+ The truncation strategy for the tokenization within the pipeline. `TruncationStrategy.DO_NOT_TRUNCATE`
+ (default) will never truncate, but it is sometimes desirable to truncate the input to fit the model's
+ max_length instead of throwing an error down the line.
+ generate_kwargs:
+ Additional keyword arguments to pass along to the generate method of the model (see the generate method
+ corresponding to your framework [here](./text_generation)).
+
+ Return:
+ A list or a list of list of `dict`: Each result comes as a dictionary with the following keys:
+
+ - **generated_text** (`str`, present when `return_text=True`) -- The generated text.
+ - **generated_token_ids** (`torch.Tensor` or `tf.Tensor`, present when `return_tensors=True`) -- The token
+ ids of the generated text.
+ """
+
+ result = super().__call__(*args, **kwargs)
+ if (
+ isinstance(args[0], list)
+ and all(isinstance(el, str) for el in args[0])
+ and all(len(res) == 1 for res in result)
+ ):
+ return [res[0] for res in result]
+ return result
+
+ def preprocess(self, inputs, truncation=TruncationStrategy.DO_NOT_TRUNCATE, **kwargs):
+ inputs = self._parse_and_tokenize(inputs, truncation=truncation, **kwargs)
+ return inputs
+
+ def _forward(self, model_inputs, **generate_kwargs):
+ if self.framework == "pt":
+ in_b, input_length = model_inputs["input_ids"].shape
+ elif self.framework == "tf":
+ in_b, input_length = tf.shape(model_inputs["input_ids"]).numpy()
+
+ self.check_inputs(
+ input_length,
+ generate_kwargs.get("min_length", self.generation_config.min_length),
+ generate_kwargs.get("max_length", self.generation_config.max_length),
+ )
+
+ # A user-defined `generation_config` passed to the pipeline call takes precedence
+ if "generation_config" not in generate_kwargs:
+ generate_kwargs["generation_config"] = self.generation_config
+
+ output_ids = self.model.generate(**model_inputs, **generate_kwargs)
+ out_b = output_ids.shape[0]
+ if self.framework == "pt":
+ output_ids = output_ids.reshape(in_b, out_b // in_b, *output_ids.shape[1:])
+ elif self.framework == "tf":
+ output_ids = tf.reshape(output_ids, (in_b, out_b // in_b, *output_ids.shape[1:]))
+ return {"output_ids": output_ids}
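+
+ # Illustrative note (not part of the upstream file): the reshape above groups the
+ # generated sequences by input so that `postprocess` can iterate over one prompt at
+ # a time. For example, with a batch of 2 prompts and `num_return_sequences=3`,
+ # `generate` returns 6 sequences and `output_ids` ends up with shape (2, 3, seq_len).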
+
+ def postprocess(self, model_outputs, return_type=ReturnType.TEXT, clean_up_tokenization_spaces=False):
+ records = []
+ for output_ids in model_outputs["output_ids"][0]:
+ if return_type == ReturnType.TENSORS:
+ record = {f"{self.return_name}_token_ids": output_ids}
+ elif return_type == ReturnType.TEXT:
+ record = {
+ f"{self.return_name}_text": self.tokenizer.decode(
+ output_ids,
+ skip_special_tokens=True,
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+ )
+ }
+ records.append(record)
+ return records
+
+
+@add_end_docstrings(build_pipeline_init_args(has_tokenizer=True))
+class SummarizationPipeline(Text2TextGenerationPipeline):
+ """
+ Summarize news articles and other documents.
+
+ This summarizing pipeline can currently be loaded from [`pipeline`] using the following task identifier:
+ `"summarization"`.
+
+ The models that this pipeline can use are models that have been fine-tuned on a summarization task, which
+ currently includes '*bart-large-cnn*', '*google-t5/t5-small*', '*google-t5/t5-base*', '*google-t5/t5-large*', '*google-t5/t5-3b*', '*google-t5/t5-11b*'. See the up-to-date
+ list of available models on [huggingface.co/models](https://huggingface.co/models?filter=summarization). For a list
+ of available parameters, see the [following
+ documentation](https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.generation.GenerationMixin.generate)
+
+ Usage:
+
+ ```python
+ # use bart in pytorch
+ summarizer = pipeline("summarization")
+ summarizer("An apple a day, keeps the doctor away", min_length=5, max_length=20)
+
+ # use t5 in tf
+ summarizer = pipeline("summarization", model="google-t5/t5-base", tokenizer="google-t5/t5-base", framework="tf")
+ summarizer("An apple a day, keeps the doctor away", min_length=5, max_length=20)
+ ```"""
+
+ # Used in the return key of the pipeline.
+ return_name = "summary"
+
+ def __call__(self, *args, **kwargs):
+ r"""
+ Summarize the text(s) given as inputs.
+
+ Args:
+ documents (*str* or `List[str]`):
+ One or several articles (or one list of articles) to summarize.
+ return_text (`bool`, *optional*, defaults to `True`):
+ Whether or not to include the decoded texts in the outputs
+ return_tensors (`bool`, *optional*, defaults to `False`):
+ Whether or not to include the tensors of predictions (as token indices) in the outputs.
+ clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
+ Whether or not to clean up the potential extra spaces in the text output.
+ generate_kwargs:
+ Additional keyword arguments to pass along to the generate method of the model (see the generate method
+ corresponding to your framework [here](./text_generation)).
+
+ Return:
+ A list or a list of list of `dict`: Each result comes as a dictionary with the following keys:
+
+ - **summary_text** (`str`, present when `return_text=True`) -- The summary of the corresponding input.
+ - **summary_token_ids** (`torch.Tensor` or `tf.Tensor`, present when `return_tensors=True`) -- The token
+ ids of the summary.
+ """
+ return super().__call__(*args, **kwargs)
+
+ def check_inputs(self, input_length: int, min_length: int, max_length: int) -> bool:
+ """
+ Checks whether there might be something wrong with given input with regard to the model.
+ """
+ if max_length < min_length:
+ logger.warning(f"Your min_length={min_length} must be smaller than your max_length={max_length}.")
+
+ if input_length < max_length:
+ logger.warning(
+ f"Your max_length is set to {max_length}, but your input_length is only {input_length}. Since this is "
+ "a summarization task, where outputs shorter than the input are typically wanted, you might "
+ f"consider decreasing max_length manually, e.g. summarizer('...', max_length={input_length//2})"
+ )
+
+
+@add_end_docstrings(build_pipeline_init_args(has_tokenizer=True))
+class TranslationPipeline(Text2TextGenerationPipeline):
+ """
+ Translates from one language to another.
+
+ This translation pipeline can currently be loaded from [`pipeline`] using the following task identifier:
+ `"translation_xx_to_yy"`.
+
+ The models that this pipeline can use are models that have been fine-tuned on a translation task. See the
+ up-to-date list of available models on [huggingface.co/models](https://huggingface.co/models?filter=translation).
+ For a list of available parameters, see the [following
+ documentation](https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.generation.GenerationMixin.generate)
+
+ Usage:
+
+ ```python
+ en_fr_translator = pipeline("translation_en_to_fr")
+ en_fr_translator("How old are you?")
+ ```"""
+
+ # Used in the return key of the pipeline.
+ return_name = "translation"
+
+ def check_inputs(self, input_length: int, min_length: int, max_length: int):
+ if input_length > 0.9 * max_length:
+ logger.warning(
+ f"Your input_length: {input_length} is bigger than 0.9 * max_length: {max_length}. You might consider "
+ "increasing your max_length manually, e.g. translator('...', max_length=400)"
+ )
+ return True
+
+ def preprocess(self, *args, truncation=TruncationStrategy.DO_NOT_TRUNCATE, src_lang=None, tgt_lang=None):
+ if getattr(self.tokenizer, "_build_translation_inputs", None):
+ return self.tokenizer._build_translation_inputs(
+ *args, return_tensors=self.framework, truncation=truncation, src_lang=src_lang, tgt_lang=tgt_lang
+ )
+ else:
+ return super()._parse_and_tokenize(*args, truncation=truncation)
+
+ def _sanitize_parameters(self, src_lang=None, tgt_lang=None, **kwargs):
+ preprocess_params, forward_params, postprocess_params = super()._sanitize_parameters(**kwargs)
+ if src_lang is not None:
+ preprocess_params["src_lang"] = src_lang
+ if tgt_lang is not None:
+ preprocess_params["tgt_lang"] = tgt_lang
+ if src_lang is None and tgt_lang is None:
+ # Backward compatibility, direct arguments use is preferred.
+ task = kwargs.get("task", self.task)
+ items = task.split("_")
+ if task and len(items) == 4:
+ # translation, XX, to YY
+ preprocess_params["src_lang"] = items[1]
+ preprocess_params["tgt_lang"] = items[3]
+ return preprocess_params, forward_params, postprocess_params
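+
+ # Illustrative note (not part of the upstream file): the backward-compatibility branch
+ # above recovers the language pair from the task name, e.g.
+ # "translation_en_to_fr".split("_") == ["translation", "en", "to", "fr"],
+ # so `src_lang="en"` and `tgt_lang="fr"` are inferred when neither argument is passed.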
+
+ def __call__(self, *args, **kwargs):
+ r"""
+ Translate the text(s) given as inputs.
+
+ Args:
+ args (`str` or `List[str]`):
+ Texts to be translated.
+ return_tensors (`bool`, *optional*, defaults to `False`):
+ Whether or not to include the tensors of predictions (as token indices) in the outputs.
+ return_text (`bool`, *optional*, defaults to `True`):
+ Whether or not to include the decoded texts in the outputs.
+ clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
+ Whether or not to clean up the potential extra spaces in the text output.
+ src_lang (`str`, *optional*):
+ The language of the input. Might be required for multilingual models. Will not have any effect for
+ single pair translation models
+ tgt_lang (`str`, *optional*):
+ The language of the desired output. Might be required for multilingual models. Will not have any effect
+ for single pair translation models
+ generate_kwargs:
+ Additional keyword arguments to pass along to the generate method of the model (see the generate method
+ corresponding to your framework [here](./text_generation)).
+
+ Return:
+ A list or a list of list of `dict`: Each result comes as a dictionary with the following keys:
+
+ - **translation_text** (`str`, present when `return_text=True`) -- The translation.
+ - **translation_token_ids** (`torch.Tensor` or `tf.Tensor`, present when `return_tensors=True`) -- The
+ token ids of the translation.
+ """
+ return super().__call__(*args, **kwargs)
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/text_classification.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/text_classification.py
new file mode 100644
index 0000000000000000000000000000000000000000..dadb29c386b41e4ca3bd1a49ee103308c3f02174
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/text_classification.py
@@ -0,0 +1,236 @@
+import inspect
+import warnings
+from typing import Dict
+
+import numpy as np
+
+from ..utils import ExplicitEnum, add_end_docstrings, is_tf_available, is_torch_available
+from .base import GenericTensor, Pipeline, build_pipeline_init_args
+
+
+if is_tf_available():
+ from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES
+
+if is_torch_available():
+ from ..models.auto.modeling_auto import MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES
+
+
+def sigmoid(_outputs):
+ return 1.0 / (1.0 + np.exp(-_outputs))
+
+
+def softmax(_outputs):
+ maxes = np.max(_outputs, axis=-1, keepdims=True)
+ shifted_exp = np.exp(_outputs - maxes)
+ return shifted_exp / shifted_exp.sum(axis=-1, keepdims=True)
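+
+ # Illustrative note (not part of the upstream file): subtracting the row-wise maximum
+ # before exponentiating keeps the softmax numerically stable, e.g.
+ # softmax(np.array([[1000.0, 1001.0]])) ~= array([[0.269, 0.731]])
+ # instead of overflowing np.exp and returning nan.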
+
+
+class ClassificationFunction(ExplicitEnum):
+ SIGMOID = "sigmoid"
+ SOFTMAX = "softmax"
+ NONE = "none"
+
+
+@add_end_docstrings(
+ build_pipeline_init_args(has_tokenizer=True),
+ r"""
+ return_all_scores (`bool`, *optional*, defaults to `False`):
+ Whether to return all prediction scores or just the one of the predicted class.
+ function_to_apply (`str`, *optional*, defaults to `"default"`):
+ The function to apply to the model outputs in order to retrieve the scores. Accepts four different values:
+
+ - `"default"`: if the model has a single label, will apply the sigmoid function on the output. If the model
+ has several labels, will apply the softmax function on the output. In case of regression tasks, will not
+ apply any function on the output.
+ - `"sigmoid"`: Applies the sigmoid function on the output.
+ - `"softmax"`: Applies the softmax function on the output.
+ - `"none"`: Does not apply any function on the output.""",
+)
+class TextClassificationPipeline(Pipeline):
+ """
+ Text classification pipeline using any `ModelForSequenceClassification`. See the [sequence classification
+ examples](../task_summary#sequence-classification) for more information.
+
+ Example:
+
+ ```python
+ >>> from transformers import pipeline
+
+ >>> classifier = pipeline(model="distilbert/distilbert-base-uncased-finetuned-sst-2-english")
+ >>> classifier("This movie is disgustingly good !")
+ [{'label': 'POSITIVE', 'score': 1.0}]
+
+ >>> classifier("Director tried too much.")
+ [{'label': 'NEGATIVE', 'score': 0.996}]
+ ```
+
+ Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)
+
+ This text classification pipeline can currently be loaded from [`pipeline`] using the following task identifier:
+ `"sentiment-analysis"` (for classifying sequences according to positive or negative sentiments).
+
+ If multiple classification labels are available (`model.config.num_labels >= 2`), the pipeline will run a softmax
+ over the results. If there is a single label, the pipeline will run a sigmoid over the result. In the case of regression
+ tasks (`model.config.problem_type == "regression"`), no function will be applied to the output.
+
+ The models that this pipeline can use are models that have been fine-tuned on a sequence classification task. See
+ the up-to-date list of available models on
+ [huggingface.co/models](https://huggingface.co/models?filter=text-classification).
+ """
+
+ return_all_scores = False
+ function_to_apply = ClassificationFunction.NONE
+
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+
+ self.check_model_type(
+ TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES
+ if self.framework == "tf"
+ else MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES
+ )
+
+ def _sanitize_parameters(self, return_all_scores=None, function_to_apply=None, top_k="", **tokenizer_kwargs):
+ # Using "" as default argument because we're going to use `top_k=None` in user code to declare
+ # "No top_k"
+ preprocess_params = tokenizer_kwargs
+
+ postprocess_params = {}
+ if hasattr(self.model.config, "return_all_scores") and return_all_scores is None:
+ return_all_scores = self.model.config.return_all_scores
+
+ if isinstance(top_k, int) or top_k is None:
+ postprocess_params["top_k"] = top_k
+ postprocess_params["_legacy"] = False
+ elif return_all_scores is not None:
+ warnings.warn(
+ "`return_all_scores` is now deprecated; if you want similar functionality, use `top_k=None` instead of"
+ " `return_all_scores=True`, or `top_k=1` instead of `return_all_scores=False`.",
+ UserWarning,
+ )
+ if return_all_scores:
+ postprocess_params["top_k"] = None
+ else:
+ postprocess_params["top_k"] = 1
+
+ if isinstance(function_to_apply, str):
+ function_to_apply = ClassificationFunction[function_to_apply.upper()]
+
+ if function_to_apply is not None:
+ postprocess_params["function_to_apply"] = function_to_apply
+ return preprocess_params, {}, postprocess_params
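+
+ # Illustrative note (not part of the upstream file): the "" sentinel above lets the
+ # pipeline distinguish "top_k was never passed" (legacy output) from an explicit
+ # `top_k=None` (all labels) or `top_k=<int>`. For a hypothetical two-label sentiment
+ # model, the difference looks like:
+ # pipe("great movie")             -> [{"label": "POSITIVE", "score": 0.99}]
+ # pipe("great movie", top_k=None) -> [{"label": "POSITIVE", "score": 0.99}, {"label": "NEGATIVE", "score": 0.01}]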
+
+ def __call__(self, inputs, **kwargs):
+ """
+ Classify the text(s) given as inputs.
+
+ Args:
+ inputs (`str` or `List[str]` or `Dict[str]`, or `List[Dict[str]]`):
+ One or several texts to classify. In order to use text pairs for your classification, you can send a
+ dictionary containing `{"text", "text_pair"}` keys, or a list of those.
+ top_k (`int`, *optional*, defaults to `1`):
+ How many results to return.
+ function_to_apply (`str`, *optional*, defaults to `"default"`):
+ The function to apply to the model outputs in order to retrieve the scores. Accepts four different
+ values:
+
+ If this argument is not specified, then it will apply the following functions according to the number
+ of labels:
+
+ - If problem type is regression, will not apply any function on the output.
+ - If the model has a single label, will apply the sigmoid function on the output.
+ - If the model has several labels, will apply the softmax function on the output.
+
+ Possible values are:
+
+ - `"sigmoid"`: Applies the sigmoid function on the output.
+ - `"softmax"`: Applies the softmax function on the output.
+ - `"none"`: Does not apply any function on the output.
+
+ Return:
+ A list or a list of list of `dict`: Each result comes as list of dictionaries with the following keys:
+
+ - **label** (`str`) -- The label predicted.
+ - **score** (`float`) -- The corresponding probability.
+
+ If `top_k` is used, one such dictionary is returned per label.
+ """
+ inputs = (inputs,)
+ result = super().__call__(*inputs, **kwargs)
+ # TODO try and retrieve it in a nicer way from _sanitize_parameters.
+ _legacy = "top_k" not in kwargs
+ if isinstance(inputs[0], str) and _legacy:
+ # This pipeline is odd, and returns a list when a single item is run
+ return [result]
+ else:
+ return result
+
+ def preprocess(self, inputs, **tokenizer_kwargs) -> Dict[str, GenericTensor]:
+ return_tensors = self.framework
+ if isinstance(inputs, dict):
+ return self.tokenizer(**inputs, return_tensors=return_tensors, **tokenizer_kwargs)
+ elif isinstance(inputs, list) and len(inputs) == 1 and isinstance(inputs[0], list) and len(inputs[0]) == 2:
+ # It used to be valid to use a list of list of list for text pairs, keeping this path for BC
+ return self.tokenizer(
+ text=inputs[0][0], text_pair=inputs[0][1], return_tensors=return_tensors, **tokenizer_kwargs
+ )
+ elif isinstance(inputs, list):
+ # This is likely an invalid usage of the pipeline attempting to pass text pairs.
+ raise ValueError(
+ "The pipeline received invalid inputs, if you are trying to send text pairs, you can try to send a"
+ ' dictionary `{"text": "My text", "text_pair": "My pair"}` in order to send a text pair.'
+ )
+ return self.tokenizer(inputs, return_tensors=return_tensors, **tokenizer_kwargs)
+
+ def _forward(self, model_inputs):
+ # `XXXForSequenceClassification` models should not use `use_cache=True` even if it's supported
+ model_forward = self.model.forward if self.framework == "pt" else self.model.call
+ if "use_cache" in inspect.signature(model_forward).parameters.keys():
+ model_inputs["use_cache"] = False
+ return self.model(**model_inputs)
+
+ def postprocess(self, model_outputs, function_to_apply=None, top_k=1, _legacy=True):
+ # `_legacy` distinguishes the bare, backward-compatible pipeline (no `top_k` passed) from a
+ # call with an explicit `top_k`, in which case we return the more natural result: a list of
+ # score dicts.
+ # Default value before `set_parameters`
+ if function_to_apply is None:
+ if self.model.config.problem_type == "regression":
+ function_to_apply = ClassificationFunction.NONE
+ elif self.model.config.problem_type == "multi_label_classification" or self.model.config.num_labels == 1:
+ function_to_apply = ClassificationFunction.SIGMOID
+ elif self.model.config.problem_type == "single_label_classification" or self.model.config.num_labels > 1:
+ function_to_apply = ClassificationFunction.SOFTMAX
+ elif hasattr(self.model.config, "function_to_apply") and function_to_apply is None:
+ function_to_apply = self.model.config.function_to_apply
+ else:
+ function_to_apply = ClassificationFunction.NONE
+
+ outputs = model_outputs["logits"][0]
+
+ if self.framework == "pt":
+ # To enable using fp16 and bf16
+ outputs = outputs.float().numpy()
+ else:
+ outputs = outputs.numpy()
+
+ if function_to_apply == ClassificationFunction.SIGMOID:
+ scores = sigmoid(outputs)
+ elif function_to_apply == ClassificationFunction.SOFTMAX:
+ scores = softmax(outputs)
+ elif function_to_apply == ClassificationFunction.NONE:
+ scores = outputs
+ else:
+ raise ValueError(f"Unrecognized `function_to_apply` argument: {function_to_apply}")
+
+ if top_k == 1 and _legacy:
+ return {"label": self.model.config.id2label[scores.argmax().item()], "score": scores.max().item()}
+
+ dict_scores = [
+ {"label": self.model.config.id2label[i], "score": score.item()} for i, score in enumerate(scores)
+ ]
+ if not _legacy:
+ dict_scores.sort(key=lambda x: x["score"], reverse=True)
+ if top_k is not None:
+ dict_scores = dict_scores[:top_k]
+ return dict_scores
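+
+ # Illustrative note (not part of the upstream file): when `function_to_apply` is not
+ # given, a two-label classifier falls into the softmax branch above. With made-up
+ # logits np.array([-1.2, 3.4]), softmax gives roughly [0.01, 0.99], and the legacy
+ # `top_k=1` path returns {"label": id2label[1], "score": 0.99}.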
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/text_generation.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/text_generation.py
new file mode 100644
index 0000000000000000000000000000000000000000..c0f14663ffdf5876d1aa4612cf54432974049606
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/text_generation.py
@@ -0,0 +1,449 @@
+import enum
+import itertools
+import types
+from typing import Dict
+
+from ..utils import add_end_docstrings, is_tf_available, is_torch_available
+from .base import Pipeline, build_pipeline_init_args
+
+
+if is_torch_available():
+ from ..models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
+ from .pt_utils import KeyDataset
+
+if is_tf_available():
+ import tensorflow as tf
+
+ from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
+
+
+class ReturnType(enum.Enum):
+ TENSORS = 0
+ NEW_TEXT = 1
+ FULL_TEXT = 2
+
+
+class Chat:
+ """This class is intended to just be used internally in this pipeline and not exposed to users. We convert chats
+ to this format because the rest of the pipeline code tends to assume that lists of messages are
+ actually a batch of samples rather than messages in the same conversation."""
+
+ def __init__(self, messages: Dict):
+ for message in messages:
+ if not ("role" in message and "content" in message):
+ raise ValueError("When passing chat dicts as input, each dict must have a 'role' and 'content' key.")
+ self.messages = messages
+
+
+@add_end_docstrings(build_pipeline_init_args(has_tokenizer=True))
+class TextGenerationPipeline(Pipeline):
+ """
+ Language generation pipeline using any `ModelWithLMHead`. This pipeline predicts the words that will follow a
+ specified text prompt. When the underlying model is a conversational model, it can also accept one or more chats,
+ in which case the pipeline will operate in chat mode and will continue the chat(s) by adding its response(s).
+ Each chat takes the form of a list of dicts, where each dict contains "role" and "content" keys.
+
+ Examples:
+
+ ```python
+ >>> from transformers import pipeline
+
+ >>> generator = pipeline(model="openai-community/gpt2")
+ >>> generator("I can't believe you did such a ", do_sample=False)
+ [{'generated_text': "I can't believe you did such a icky thing to me. I'm so sorry. I'm so sorry. I'm so sorry. I'm so sorry. I'm so sorry. I'm so sorry. I'm so sorry. I"}]
+
+ >>> # These parameters return several suggestions, and only the newly created text, making the output easier to use for prompting.
+ >>> outputs = generator("My tart needs some", num_return_sequences=4, return_full_text=False)
+ ```
+
+ ```python
+ >>> from transformers import pipeline
+
+ >>> generator = pipeline(model="HuggingFaceH4/zephyr-7b-beta")
+ >>> # Zephyr-beta is a conversational model, so let's pass it a chat instead of a single string
+ >>> generator([{"role": "user", "content": "What is the capital of France? Answer in one word."}], do_sample=False, max_new_tokens=2)
+ [{'generated_text': [{'role': 'user', 'content': 'What is the capital of France? Answer in one word.'}, {'role': 'assistant', 'content': 'Paris'}]}]
+ ```
+
+ Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial). You can pass text
+ generation parameters to this pipeline to control stopping criteria, decoding strategy, and more. Learn more about
+ text generation parameters in [Text generation strategies](../generation_strategies) and [Text
+ generation](text_generation).
+
+ This language generation pipeline can currently be loaded from [`pipeline`] using the following task identifier:
+ `"text-generation"`.
+
+ The models that this pipeline can use are models that have been trained with an autoregressive language modeling
+ objective. See the list of available [text completion models](https://huggingface.co/models?filter=text-generation)
+ and the list of [conversational models](https://huggingface.co/models?other=conversational)
+ on [huggingface.co/models](https://huggingface.co/models).
+ """
+
+ # Prefix text to help Transformer-XL and XLNet with short prompts as proposed by Aman Rusia
+ # in https://github.com/rusiaaman/XLNet-gen#methodology
+ # and https://medium.com/@amanrusia/xlnet-speaks-comparison-to-gpt-2-ea1a4e9ba39e
+
+ XL_PREFIX = """
+ In 1991, the remains of Russian Tsar Nicholas II and his family (except for Alexei and Maria) are discovered. The
+ voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the remainder of the story. 1883 Western
+ Siberia, a young Grigori Rasputin is asked by his father and a group of men to perform magic. Rasputin has a vision
+ and denounces one of the men as a horse thief. Although his father initially slaps him for making such an
+ accusation, Rasputin watches as the man is chased outside and beaten. Twenty years later, Rasputin sees a vision of
+ the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous, with people, even a bishop,
+ begging for his blessing.
+ """
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.check_model_type(
+ TF_MODEL_FOR_CAUSAL_LM_MAPPING_NAMES if self.framework == "tf" else MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
+ )
+ if "prefix" not in self._preprocess_params:
+ # This is very specific. The logic is quite complex and needs to be done
+ # as a "default".
+ # It also defines both some preprocess_kwargs and generate_kwargs
+ # which is why we cannot put them in their respective methods.
+ prefix = None
+ if self.prefix is not None:
+ prefix = self.prefix
+ if prefix is None and self.model.__class__.__name__ in [
+ "XLNetLMHeadModel",
+ "TransfoXLLMHeadModel",
+ "TFXLNetLMHeadModel",
+ "TFTransfoXLLMHeadModel",
+ ]:
+ # For XLNet and TransformerXL we add an article to the prompt to give more state to the model.
+ prefix = self.XL_PREFIX
+ if prefix is not None:
+ # Recalculate some generate_kwargs linked to prefix.
+ preprocess_params, forward_params, _ = self._sanitize_parameters(prefix=prefix, **self._forward_params)
+ self._preprocess_params = {**self._preprocess_params, **preprocess_params}
+ self._forward_params = {**self._forward_params, **forward_params}
+
+ def _sanitize_parameters(
+ self,
+ return_full_text=None,
+ return_tensors=None,
+ return_text=None,
+ return_type=None,
+ clean_up_tokenization_spaces=None,
+ prefix=None,
+ handle_long_generation=None,
+ stop_sequence=None,
+ truncation=None,
+ max_length=None,
+ continue_final_message=None,
+ **generate_kwargs,
+ ):
+ preprocess_params = {}
+
+ add_special_tokens = False
+ if "add_special_tokens" in generate_kwargs:
+ add_special_tokens = preprocess_params["add_special_tokens"] = generate_kwargs.pop("add_special_tokens")
+
+ if "padding" in generate_kwargs:
+ preprocess_params["padding"] = generate_kwargs.pop("padding")
+
+ if truncation is not None:
+ preprocess_params["truncation"] = truncation
+
+ if max_length is not None:
+ preprocess_params["max_length"] = max_length
+ generate_kwargs["max_length"] = max_length
+
+ if prefix is not None:
+ preprocess_params["prefix"] = prefix
+ if prefix:
+ prefix_inputs = self.tokenizer(
+ prefix, padding=False, add_special_tokens=add_special_tokens, return_tensors=self.framework
+ )
+ generate_kwargs["prefix_length"] = prefix_inputs["input_ids"].shape[-1]
+
+ if handle_long_generation is not None:
+ if handle_long_generation not in {"hole"}:
+ raise ValueError(
+ f"{handle_long_generation} is not a valid value for `handle_long_generation` parameter expected"
+ " [None, 'hole']"
+ )
+ preprocess_params["handle_long_generation"] = handle_long_generation
+
+ if continue_final_message is not None:
+ preprocess_params["continue_final_message"] = continue_final_message
+
+ preprocess_params.update(generate_kwargs)
+ forward_params = generate_kwargs
+
+ postprocess_params = {}
+ if return_full_text is not None and return_type is None:
+ if return_text is not None:
+ raise ValueError("`return_text` is mutually exclusive with `return_full_text`")
+ if return_tensors is not None:
+ raise ValueError("`return_full_text` is mutually exclusive with `return_tensors`")
+ return_type = ReturnType.FULL_TEXT if return_full_text else ReturnType.NEW_TEXT
+ if return_tensors is not None and return_type is None:
+ if return_text is not None:
+ raise ValueError("`return_text` is mutually exclusive with `return_tensors`")
+ return_type = ReturnType.TENSORS
+ if return_type is not None:
+ postprocess_params["return_type"] = return_type
+ if clean_up_tokenization_spaces is not None:
+ postprocess_params["clean_up_tokenization_spaces"] = clean_up_tokenization_spaces
+ if continue_final_message is not None:
+ postprocess_params["continue_final_message"] = continue_final_message
+
+ if stop_sequence is not None:
+ stop_sequence_ids = self.tokenizer.encode(stop_sequence, add_special_tokens=False)
+ generate_kwargs["eos_token_id"] = stop_sequence_ids
+
+ if self.assistant_model is not None:
+ forward_params["assistant_model"] = self.assistant_model
+ if self.assistant_tokenizer is not None:
+ forward_params["tokenizer"] = self.tokenizer
+ forward_params["assistant_tokenizer"] = self.assistant_tokenizer
+
+ return preprocess_params, forward_params, postprocess_params
+
+ # overriding _parse_and_tokenize to allow for unusual language-modeling tokenizer arguments
+ def _parse_and_tokenize(self, *args, **kwargs):
+ """
+ Parse arguments and tokenize
+ """
+ # Parse arguments
+ if self.model.__class__.__name__ in ["TransfoXLLMHeadModel"]:
+ kwargs.update({"add_space_before_punct_symbol": True})
+
+ return super()._parse_and_tokenize(*args, **kwargs)
+
+ def __call__(self, text_inputs, **kwargs):
+ """
+ Complete the prompt(s) given as inputs.
+
+ Args:
+ text_inputs (`str`, `List[str]`, List[Dict[str, str]], or `List[List[Dict[str, str]]]`):
+ One or several prompts (or one list of prompts) to complete. If strings or a list of string are
+ passed, this pipeline will continue each prompt. Alternatively, a "chat", in the form of a list
+ of dicts with "role" and "content" keys, can be passed, or a list of such chats. When chats are passed,
+ the model's chat template will be used to format them before passing them to the model.
+ return_tensors (`bool`, *optional*, defaults to `False`):
+ Returns the tensors of predictions (as token indices) in the outputs. If set to
+ `True`, the decoded text is not returned.
+ return_text (`bool`, *optional*):
+ Returns the decoded texts in the outputs.
+ return_full_text (`bool`, *optional*, defaults to `True`):
+ If set to `False` only added text is returned, otherwise the full text is returned. Cannot be
+ specified at the same time as `return_text`.
+ clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
+ Whether or not to clean up the potential extra spaces in the text output.
+ continue_final_message (`bool`, *optional*): This indicates that you want the model to continue the
+ last message in the input chat rather than starting a new one, allowing you to "prefill" its response.
+ By default this is `True` when the final message in the input chat has the `assistant` role and
+ `False` otherwise, but you can manually override that behaviour by setting this flag.
+ prefix (`str`, *optional*):
+ Prefix added to prompt.
+ handle_long_generation (`str`, *optional*):
+ By default, this pipeline does not handle long generation (inputs that, in one form or another, exceed
+ the model's maximum length). There is no perfect way to address this (more info:
+ https://github.com/huggingface/transformers/issues/14033#issuecomment-948385227). This provides common
+ strategies to work around the problem depending on your use case.
+
+ - `None` : default strategy where nothing in particular happens
+ - `"hole"`: Truncates the left of the input and leaves a gap wide enough to let generation happen (this
+ might truncate a lot of the prompt, and is not suitable when generation exceeds the model capacity)
+ generate_kwargs (`dict`, *optional*):
+ Additional keyword arguments to pass along to the generate method of the model (see the generate method
+ corresponding to your framework [here](./text_generation)).
+
+ Return:
+ A list or a list of lists of `dict`: Returns one of the following dictionaries (cannot return a combination
+ of both `generated_text` and `generated_token_ids`):
+
+ - **generated_text** (`str`, present when `return_text=True`) -- The generated text.
+ - **generated_token_ids** (`torch.Tensor` or `tf.Tensor`, present when `return_tensors=True`) -- The token
+ ids of the generated text.
+ """
+ if isinstance(
+ text_inputs,
+ (list, tuple, types.GeneratorType, KeyDataset)
+ if is_torch_available()
+ else (list, tuple, types.GeneratorType),
+ ):
+ if isinstance(text_inputs, types.GeneratorType):
+ text_inputs, _ = itertools.tee(text_inputs)
+ text_inputs, first_item = (x for x in text_inputs), next(_)
+ else:
+ first_item = text_inputs[0]
+ if isinstance(first_item, (list, tuple, dict)):
+ # We have one or more prompts in list-of-dicts format, so this is chat mode
+ if isinstance(first_item, dict):
+ return super().__call__(Chat(text_inputs), **kwargs)
+ else:
+ chats = (Chat(chat) for chat in text_inputs) # 🐈 🐈 🐈
+ if isinstance(text_inputs, types.GeneratorType):
+ return super().__call__(chats, **kwargs)
+ else:
+ return super().__call__(list(chats), **kwargs)
+ return super().__call__(text_inputs, **kwargs)
+
+ def preprocess(
+ self,
+ prompt_text,
+ prefix="",
+ handle_long_generation=None,
+ add_special_tokens=None,
+ truncation=None,
+ padding=None,
+ max_length=None,
+ continue_final_message=None,
+ **generate_kwargs,
+ ):
+ # Only set non-None tokenizer kwargs, so as to rely on the tokenizer's defaults
+ tokenizer_kwargs = {
+ "add_special_tokens": add_special_tokens,
+ "truncation": truncation,
+ "padding": padding,
+ "max_length": max_length,
+ }
+ tokenizer_kwargs = {key: value for key, value in tokenizer_kwargs.items() if value is not None}
+
+ if isinstance(prompt_text, Chat):
+ tokenizer_kwargs.pop("add_special_tokens", None) # ignore add_special_tokens on chats
+ # If the user passes a chat that ends in an assistant message, we treat it as a prefill by default
+ # because very few models support multiple separate, consecutive assistant messages
+ if continue_final_message is None:
+ continue_final_message = prompt_text.messages[-1]["role"] == "assistant"
+ inputs = self.tokenizer.apply_chat_template(
+ prompt_text.messages,
+ add_generation_prompt=not continue_final_message,
+ continue_final_message=continue_final_message,
+ return_dict=True,
+ return_tensors=self.framework,
+ **tokenizer_kwargs,
+ )
+ else:
+ inputs = self.tokenizer(prefix + prompt_text, return_tensors=self.framework, **tokenizer_kwargs)
+
+ inputs["prompt_text"] = prompt_text
+
+ if handle_long_generation == "hole":
+ cur_len = inputs["input_ids"].shape[-1]
+ if "max_new_tokens" in generate_kwargs:
+ new_tokens = generate_kwargs["max_new_tokens"]
+ else:
+ new_tokens = generate_kwargs.get("max_length", self.generation_config.max_length) - cur_len
+ if new_tokens < 0:
+ raise ValueError("We cannot infer how many new tokens are expected")
+ if cur_len + new_tokens > self.tokenizer.model_max_length:
+ keep_length = self.tokenizer.model_max_length - new_tokens
+ if keep_length <= 0:
+ raise ValueError(
+ "We cannot use `hole` to handle this generation: the number of desired tokens exceeds the"
+ " model's max length"
+ )
+
+ inputs["input_ids"] = inputs["input_ids"][:, -keep_length:]
+ if "attention_mask" in inputs:
+ inputs["attention_mask"] = inputs["attention_mask"][:, -keep_length:]
+
+ return inputs
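+
+ # Illustrative note (not part of the upstream file): a worked example of the "hole"
+ # strategy above, assuming `tokenizer.model_max_length == 1024`, a 900-token prompt
+ # and `max_new_tokens=200`: cur_len + new_tokens = 1100 > 1024, so
+ # keep_length = 1024 - 200 = 824 and only the last 824 prompt tokens are kept,
+ # leaving room for the 200 new tokens.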
+
+ def _forward(self, model_inputs, **generate_kwargs):
+ input_ids = model_inputs["input_ids"]
+ attention_mask = model_inputs.get("attention_mask", None)
+ # Allow empty prompts
+ if input_ids.shape[1] == 0:
+ input_ids = None
+ attention_mask = None
+ in_b = 1
+ else:
+ in_b = input_ids.shape[0]
+ prompt_text = model_inputs.pop("prompt_text")
+
+ # If there is a prefix, we may need to adjust the generation length. Do so without permanently modifying
+ # generate_kwargs, as some of the parameterization may come from the initialization of the pipeline.
+ prefix_length = generate_kwargs.pop("prefix_length", 0)
+ if prefix_length > 0:
+ has_max_new_tokens = "max_new_tokens" in generate_kwargs or (
+ "generation_config" in generate_kwargs
+ and generate_kwargs["generation_config"].max_new_tokens is not None
+ )
+ if not has_max_new_tokens:
+ generate_kwargs["max_length"] = generate_kwargs.get("max_length") or self.generation_config.max_length
+ generate_kwargs["max_length"] += prefix_length
+ has_min_new_tokens = "min_new_tokens" in generate_kwargs or (
+ "generation_config" in generate_kwargs
+ and generate_kwargs["generation_config"].min_new_tokens is not None
+ )
+ if not has_min_new_tokens and "min_length" in generate_kwargs:
+ generate_kwargs["min_length"] += prefix_length
+
+ # A user-defined `generation_config` passed to the pipeline call takes precedence
+ if "generation_config" not in generate_kwargs:
+ generate_kwargs["generation_config"] = self.generation_config
+
+ generated_sequence = self.model.generate(input_ids=input_ids, attention_mask=attention_mask, **generate_kwargs)
+ out_b = generated_sequence.shape[0]
+ if self.framework == "pt":
+ generated_sequence = generated_sequence.reshape(in_b, out_b // in_b, *generated_sequence.shape[1:])
+ elif self.framework == "tf":
+ generated_sequence = tf.reshape(generated_sequence, (in_b, out_b // in_b, *generated_sequence.shape[1:]))
+ return {"generated_sequence": generated_sequence, "input_ids": input_ids, "prompt_text": prompt_text}
+
+ def postprocess(
+ self,
+ model_outputs,
+ return_type=ReturnType.FULL_TEXT,
+ clean_up_tokenization_spaces=True,
+ continue_final_message=None,
+ ):
+ generated_sequence = model_outputs["generated_sequence"][0]
+ input_ids = model_outputs["input_ids"]
+ prompt_text = model_outputs["prompt_text"]
+ generated_sequence = generated_sequence.numpy().tolist()
+ records = []
+ for sequence in generated_sequence:
+ if return_type == ReturnType.TENSORS:
+ record = {"generated_token_ids": sequence}
+ elif return_type in {ReturnType.NEW_TEXT, ReturnType.FULL_TEXT}:
+ # Decode text
+ text = self.tokenizer.decode(
+ sequence,
+ skip_special_tokens=True,
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+ )
+
+ # Remove PADDING prompt of the sequence if XLNet or Transfo-XL model is used
+ if input_ids is None:
+ prompt_length = 0
+ else:
+ prompt_length = len(
+ self.tokenizer.decode(
+ input_ids[0],
+ skip_special_tokens=True,
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+ )
+ )
+
+ all_text = text[prompt_length:]
+ if return_type == ReturnType.FULL_TEXT:
+ if isinstance(prompt_text, str):
+ all_text = prompt_text + all_text
+ elif isinstance(prompt_text, Chat):
+ if continue_final_message is None:
+ # If the user passes a chat ending in an assistant message, we treat it as a prefill by
+ # default because very few models support multiple separate, consecutive assistant messages
+ continue_final_message = prompt_text.messages[-1]["role"] == "assistant"
+ if continue_final_message:
+ # With assistant prefill, concat onto the end of the last message
+ all_text = list(prompt_text.messages)[:-1] + [
+ {
+ "role": prompt_text.messages[-1]["role"],
+ "content": prompt_text.messages[-1]["content"] + all_text,
+ }
+ ]
+ else:
+ # When we're not starting from a prefill, the output is a new assistant message
+ all_text = list(prompt_text.messages) + [{"role": "assistant", "content": all_text}]
+ record = {"generated_text": all_text}
+ records.append(record)
+
+ return records
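+
+ # Illustrative note (not part of the upstream file): when the input chat ends with an
+ # assistant message, e.g.
+ # [{"role": "user", "content": "Knock knock."}, {"role": "assistant", "content": "Who's"}],
+ # `continue_final_message` defaults to True and the generated text is appended to that
+ # last message instead of a new {"role": "assistant", ...} entry being added.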
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/text_to_audio.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/text_to_audio.py
new file mode 100644
index 0000000000000000000000000000000000000000..b7beca586d21957b2eb3ec2dbb7daa2c49453970
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/text_to_audio.py
@@ -0,0 +1,219 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import List, Union
+
+from ..utils import is_torch_available
+from .base import Pipeline
+
+
+if is_torch_available():
+ from ..models.auto.modeling_auto import MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING
+ from ..models.speecht5.modeling_speecht5 import SpeechT5HifiGan
+
+DEFAULT_VOCODER_ID = "microsoft/speecht5_hifigan"
+
+
+class TextToAudioPipeline(Pipeline):
+ """
+ Text-to-audio generation pipeline using any `AutoModelForTextToWaveform` or `AutoModelForTextToSpectrogram`. This
+ pipeline generates an audio file from an input text and optional other conditional inputs.
+
+ Example:
+
+ ```python
+ >>> from transformers import pipeline
+
+ >>> pipe = pipeline(model="suno/bark-small")
+ >>> output = pipe("Hey it's HuggingFace on the phone!")
+
+ >>> audio = output["audio"]
+ >>> sampling_rate = output["sampling_rate"]
+ ```
+
+ Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)
+
+
+
+ You can specify parameters passed to the model by using [`TextToAudioPipeline.__call__.forward_params`] or
+ [`TextToAudioPipeline.__call__.generate_kwargs`].
+
+ Example:
+
+ ```python
+ >>> from transformers import pipeline
+
+ >>> music_generator = pipeline(task="text-to-audio", model="facebook/musicgen-small", framework="pt")
+
+ >>> # diversify the music generation by adding randomness with a high temperature and set a maximum music length
+ >>> generate_kwargs = {
+ ... "do_sample": True,
+ ... "temperature": 0.7,
+ ... "max_new_tokens": 35,
+ ... }
+
+ >>> outputs = music_generator("Techno music with high melodic riffs", generate_kwargs=generate_kwargs)
+ ```
+
+
+
+ This pipeline can currently be loaded from [`pipeline`] using the following task identifiers: `"text-to-speech"` or
+ `"text-to-audio"`.
+
+ See the list of available models on [huggingface.co/models](https://huggingface.co/models?filter=text-to-speech).
+ """
+
+ def __init__(self, *args, vocoder=None, sampling_rate=None, **kwargs):
+ super().__init__(*args, **kwargs)
+
+ if self.framework == "tf":
+ raise ValueError("The TextToAudioPipeline is only available in PyTorch.")
+
+ self.vocoder = None
+ if self.model.__class__ in MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING.values():
+ self.vocoder = (
+ SpeechT5HifiGan.from_pretrained(DEFAULT_VOCODER_ID).to(self.model.device)
+ if vocoder is None
+ else vocoder
+ )
+
+ self.sampling_rate = sampling_rate
+ if self.vocoder is not None:
+ self.sampling_rate = self.vocoder.config.sampling_rate
+
+ if self.sampling_rate is None:
+ # get sampling_rate from config and generation config
+
+ config = self.model.config
+ gen_config = self.model.__dict__.get("generation_config", None)
+ if gen_config is not None:
+ config.update(gen_config.to_dict())
+
+ for sampling_rate_name in ["sample_rate", "sampling_rate"]:
+ sampling_rate = getattr(config, sampling_rate_name, None)
+ if sampling_rate is not None:
+ self.sampling_rate = sampling_rate
+
+ def preprocess(self, text, **kwargs):
+ if isinstance(text, str):
+ text = [text]
+
+ if self.model.config.model_type == "bark":
+ # bark Tokenizer is called with BarkProcessor which uses those kwargs
+ new_kwargs = {
+ "max_length": self.generation_config.semantic_config.get("max_input_semantic_length", 256),
+ "add_special_tokens": False,
+ "return_attention_mask": True,
+ "return_token_type_ids": False,
+ "padding": "max_length",
+ }
+
+ # priority is given to kwargs
+ new_kwargs.update(kwargs)
+
+ kwargs = new_kwargs
+
+ output = self.tokenizer(text, **kwargs, return_tensors="pt")
+
+ return output
+
+ def _forward(self, model_inputs, **kwargs):
+ # we expect some kwargs to be additional tensors which need to be on the right device
+ kwargs = self._ensure_tensor_on_device(kwargs, device=self.device)
+ forward_params = kwargs["forward_params"]
+ generate_kwargs = kwargs["generate_kwargs"]
+
+ if self.model.can_generate():
+ # we expect some kwargs to be additional tensors which need to be on the right device
+ generate_kwargs = self._ensure_tensor_on_device(generate_kwargs, device=self.device)
+
+ # A user-defined `generation_config` passed to the pipeline call takes precedence
+ if "generation_config" not in generate_kwargs:
+ generate_kwargs["generation_config"] = self.generation_config
+
+ # generate_kwargs get priority over forward_params
+ forward_params.update(generate_kwargs)
+
+ output = self.model.generate(**model_inputs, **forward_params)
+ else:
+ if len(generate_kwargs):
+ raise ValueError(
+ "You're using the `TextToAudioPipeline` with a forward-only model, but `generate_kwargs` is "
+ "non-empty. For forward-only TTA models, please use `forward_params` instead of `generate_kwargs`. "
+ f"For reference, the `generate_kwargs` used here are: {generate_kwargs.keys()}"
+ )
+ output = self.model(**model_inputs, **forward_params)[0]
+
+ if self.vocoder is not None:
+ # in that case, the output is a spectrogram that needs to be converted into a waveform
+ output = self.vocoder(output)
+
+ return output
+
+ def __call__(self, text_inputs: Union[str, List[str]], **forward_params):
+ """
+ Generates speech/audio from the inputs. See the [`TextToAudioPipeline`] documentation for more information.
+
+ Args:
+ text_inputs (`str` or `List[str]`):
+ The text(s) to generate.
+ forward_params (`dict`, *optional*):
+ Parameters passed to the model generation/forward method. `forward_params` are always passed to the
+ underlying model.
+ generate_kwargs (`dict`, *optional*):
+ The dictionary of ad-hoc parametrization of `generate_config` to be used for the generation call. For a
+ complete overview of generate, check the [following
+ guide](https://huggingface.co/docs/transformers/en/main_classes/text_generation). `generate_kwargs` are
+ only passed to the underlying model if the latter is a generative model.
+
+ Return:
+ A `dict` or a list of `dict`: The dictionaries have two keys:
+
+ - **audio** (`np.ndarray` of shape `(nb_channels, audio_length)`) -- The generated audio waveform.
+ - **sampling_rate** (`int`) -- The sampling rate of the generated audio waveform.
+ """
+ return super().__call__(text_inputs, **forward_params)
+
+ def _sanitize_parameters(
+ self,
+ preprocess_params=None,
+ forward_params=None,
+ generate_kwargs=None,
+ ):
+ if self.assistant_model is not None:
+ generate_kwargs["assistant_model"] = self.assistant_model
+ if self.assistant_tokenizer is not None:
+ generate_kwargs["tokenizer"] = self.tokenizer
+ generate_kwargs["assistant_tokenizer"] = self.assistant_tokenizer
+
+ params = {
+ "forward_params": forward_params if forward_params else {},
+ "generate_kwargs": generate_kwargs if generate_kwargs else {},
+ }
+
+ if preprocess_params is None:
+ preprocess_params = {}
+ postprocess_params = {}
+
+ return preprocess_params, params, postprocess_params
+
+ def postprocess(self, waveform):
+ output_dict = {}
+ if isinstance(waveform, dict):
+ waveform = waveform["waveform"]
+ elif isinstance(waveform, tuple):
+ waveform = waveform[0]
+ output_dict["audio"] = waveform.cpu().float().numpy()
+ output_dict["sampling_rate"] = self.sampling_rate
+
+ return output_dict
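+
+ # Illustrative note (not part of the upstream file): the returned dict can be written to
+ # disk with any audio library, for instance (assuming scipy is installed and the waveform
+ # is mono; multi-channel output may need transposing):
+ # >>> import scipy.io.wavfile
+ # >>> out = pipe("Hello, world!")
+ # >>> scipy.io.wavfile.write("speech.wav", rate=out["sampling_rate"], data=out["audio"].squeeze())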
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/token_classification.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/token_classification.py
new file mode 100644
index 0000000000000000000000000000000000000000..9256f238148476b4d923c84f884156b4564c93a7
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/token_classification.py
@@ -0,0 +1,576 @@
+import types
+import warnings
+from typing import List, Optional, Tuple, Union
+
+import numpy as np
+
+from ..models.bert.tokenization_bert import BasicTokenizer
+from ..utils import (
+ ExplicitEnum,
+ add_end_docstrings,
+ is_tf_available,
+ is_torch_available,
+)
+from .base import ArgumentHandler, ChunkPipeline, Dataset, build_pipeline_init_args
+
+
+if is_tf_available():
+ import tensorflow as tf
+
+ from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES
+if is_torch_available():
+ import torch
+
+ from ..models.auto.modeling_auto import MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES
+
+
+class TokenClassificationArgumentHandler(ArgumentHandler):
+ """
+ Handles arguments for token classification.
+ """
+
+ def __call__(self, inputs: Union[str, List[str]], **kwargs):
+ if inputs is not None and isinstance(inputs, (list, tuple)) and len(inputs) > 0:
+ inputs = list(inputs)
+ batch_size = len(inputs)
+ elif isinstance(inputs, str):
+ inputs = [inputs]
+ batch_size = 1
+ elif Dataset is not None and isinstance(inputs, Dataset) or isinstance(inputs, types.GeneratorType):
+ return inputs, None
+ else:
+ raise ValueError("At least one input is required.")
+
+ offset_mapping = kwargs.get("offset_mapping")
+ if offset_mapping:
+ if isinstance(offset_mapping, list) and isinstance(offset_mapping[0], tuple):
+ offset_mapping = [offset_mapping]
+ if len(offset_mapping) != batch_size:
+ raise ValueError("offset_mapping should have the same batch size as the input")
+ return inputs, offset_mapping
+
+
+class AggregationStrategy(ExplicitEnum):
+ """All the valid aggregation strategies for TokenClassificationPipeline"""
+
+ NONE = "none"
+ SIMPLE = "simple"
+ FIRST = "first"
+ AVERAGE = "average"
+ MAX = "max"
+
+
+@add_end_docstrings(
+ build_pipeline_init_args(has_tokenizer=True),
+ r"""
+ ignore_labels (`List[str]`, defaults to `["O"]`):
+ A list of labels to ignore.
+ grouped_entities (`bool`, *optional*, defaults to `False`):
+ DEPRECATED, use `aggregation_strategy` instead. Whether or not to group the tokens corresponding to the
+ same entity together in the predictions or not.
+ stride (`int`, *optional*):
+ If stride is provided, the pipeline is applied on all the text. The text is split into chunks of size
+ model_max_length. Works only with fast tokenizers and `aggregation_strategy` different from `NONE`. The
+ value of this argument defines the number of overlapping tokens between chunks. In other words, the model
+ will shift forward by `tokenizer.model_max_length - stride` tokens each step.
+ aggregation_strategy (`str`, *optional*, defaults to `"none"`):
+ The strategy to fuse (or not) tokens based on the model prediction.
+
+ - "none" : Will not do any aggregation and will simply return the raw results from the model
+ - "simple" : Will attempt to group entities following the default schema. (A, B-TAG), (B, I-TAG), (C,
+ I-TAG), (D, B-TAG2) (E, B-TAG2) will end up being [{"word": ABC, "entity": "TAG"}, {"word": "D",
+ "entity": "TAG2"}, {"word": "E", "entity": "TAG2"}] Notice that two consecutive B tags will end up as
+ different entities. On word based languages, we might end up splitting words undesirably: imagine
+ Microsoft being tagged as [{"word": "Micro", "entity": "ENTERPRISE"}, {"word": "soft", "entity":
+ "NAME"}]. Look for FIRST, MAX, AVERAGE for ways to mitigate that and disambiguate words (on languages
+ that support that meaning, which is basically tokens separated by a space). These mitigations will
+ only work on real words, "New york" might still be tagged with two different entities.
+ - "first" : (works only on word based models) Will use the `SIMPLE` strategy except that words cannot
+ end up with different tags. Words will simply use the tag of the first token of the word when there
+ is ambiguity.
+ - "average" : (works only on word based models) Will use the `SIMPLE` strategy except that words,
+ cannot end up with different tags. scores will be averaged first across tokens, and then the maximum
+ label is applied.
+ - "max" : (works only on word based models) Will use the `SIMPLE` strategy except that words, cannot
+ end up with different tags. Word entity will simply be the token with the maximum score.""",
+)
+class TokenClassificationPipeline(ChunkPipeline):
+ """
+ Named Entity Recognition pipeline using any `ModelForTokenClassification`. See the [named entity recognition
+ examples](../task_summary#named-entity-recognition) for more information.
+
+ Example:
+
+ ```python
+ >>> from transformers import pipeline
+
+ >>> token_classifier = pipeline(model="Jean-Baptiste/camembert-ner", aggregation_strategy="simple")
+ >>> sentence = "Je m'appelle jean-baptiste et je vis à montréal"
+ >>> tokens = token_classifier(sentence)
+ >>> tokens
+ [{'entity_group': 'PER', 'score': 0.9931, 'word': 'jean-baptiste', 'start': 12, 'end': 26}, {'entity_group': 'LOC', 'score': 0.998, 'word': 'montréal', 'start': 38, 'end': 47}]
+
+ >>> token = tokens[0]
+ >>> # Start and end provide an easy way to highlight words in the original text.
+ >>> sentence[token["start"] : token["end"]]
+ ' jean-baptiste'
+
+ >>> # Some models use the same idea to do part of speech.
+ >>> syntaxer = pipeline(model="vblagoje/bert-english-uncased-finetuned-pos", aggregation_strategy="simple")
+ >>> syntaxer("My name is Sarah and I live in London")
+ [{'entity_group': 'PRON', 'score': 0.999, 'word': 'my', 'start': 0, 'end': 2}, {'entity_group': 'NOUN', 'score': 0.997, 'word': 'name', 'start': 3, 'end': 7}, {'entity_group': 'AUX', 'score': 0.994, 'word': 'is', 'start': 8, 'end': 10}, {'entity_group': 'PROPN', 'score': 0.999, 'word': 'sarah', 'start': 11, 'end': 16}, {'entity_group': 'CCONJ', 'score': 0.999, 'word': 'and', 'start': 17, 'end': 20}, {'entity_group': 'PRON', 'score': 0.999, 'word': 'i', 'start': 21, 'end': 22}, {'entity_group': 'VERB', 'score': 0.998, 'word': 'live', 'start': 23, 'end': 27}, {'entity_group': 'ADP', 'score': 0.999, 'word': 'in', 'start': 28, 'end': 30}, {'entity_group': 'PROPN', 'score': 0.999, 'word': 'london', 'start': 31, 'end': 37}]
+ ```
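+
+ For long texts that exceed the model's maximum length, the `stride` argument processes the text in
+ overlapping chunks (a minimal sketch; the checkpoint is reused from above, `stride=128` is an arbitrary
+ choice and the outputs are omitted because they depend on the model):
+
+ ```python
+ >>> # Requires a fast tokenizer and an aggregation_strategy other than "none".
+ >>> long_classifier = pipeline(
+ ...     model="Jean-Baptiste/camembert-ner", aggregation_strategy="first", stride=128
+ ... )
+ >>> long_classifier("Je m'appelle jean-baptiste et je vis à montréal. " * 50)  # doctest: +SKIP
+ ```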
+
+ Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)
+
+ This token recognition pipeline can currently be loaded from [`pipeline`] using the following task identifier:
+ `"ner"` (for predicting the classes of tokens in a sequence: person, organisation, location or miscellaneous).
+
+ The models that this pipeline can use are models that have been fine-tuned on a token classification task. See the
+ up-to-date list of available models on
+ [huggingface.co/models](https://huggingface.co/models?filter=token-classification).
+ """
+
+ default_input_names = "sequences"
+
+ def __init__(self, args_parser=TokenClassificationArgumentHandler(), *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.check_model_type(
+ TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES
+ if self.framework == "tf"
+ else MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES
+ )
+
+ self._basic_tokenizer = BasicTokenizer(do_lower_case=False)
+ self._args_parser = args_parser
+
+ def _sanitize_parameters(
+ self,
+ ignore_labels=None,
+ grouped_entities: Optional[bool] = None,
+ ignore_subwords: Optional[bool] = None,
+ aggregation_strategy: Optional[AggregationStrategy] = None,
+ offset_mapping: Optional[List[Tuple[int, int]]] = None,
+ stride: Optional[int] = None,
+ ):
+ preprocess_params = {}
+ if offset_mapping is not None:
+ preprocess_params["offset_mapping"] = offset_mapping
+
+ postprocess_params = {}
+ if grouped_entities is not None or ignore_subwords is not None:
+ if grouped_entities and ignore_subwords:
+ aggregation_strategy = AggregationStrategy.FIRST
+ elif grouped_entities and not ignore_subwords:
+ aggregation_strategy = AggregationStrategy.SIMPLE
+ else:
+ aggregation_strategy = AggregationStrategy.NONE
+
+ if grouped_entities is not None:
+ warnings.warn(
+ "`grouped_entities` is deprecated and will be removed in version v5.0.0, defaulted to"
+ f' `aggregation_strategy="{aggregation_strategy}"` instead.'
+ )
+ if ignore_subwords is not None:
+ warnings.warn(
+ "`ignore_subwords` is deprecated and will be removed in version v5.0.0, defaulted to"
+ f' `aggregation_strategy="{aggregation_strategy}"` instead.'
+ )
+
+ if aggregation_strategy is not None:
+ if isinstance(aggregation_strategy, str):
+ aggregation_strategy = AggregationStrategy[aggregation_strategy.upper()]
+ if (
+ aggregation_strategy
+ in {AggregationStrategy.FIRST, AggregationStrategy.MAX, AggregationStrategy.AVERAGE}
+ and not self.tokenizer.is_fast
+ ):
+ raise ValueError(
+ "Slow tokenizers cannot handle subwords. Please set the `aggregation_strategy` option"
+ ' to `"simple"` or use a fast tokenizer.'
+ )
+ postprocess_params["aggregation_strategy"] = aggregation_strategy
+ if ignore_labels is not None:
+ postprocess_params["ignore_labels"] = ignore_labels
+ if stride is not None:
+ if stride >= self.tokenizer.model_max_length:
+ raise ValueError(
+ "`stride` must be less than `tokenizer.model_max_length` (or even lower if the tokenizer adds special tokens)"
+ )
+ if aggregation_strategy == AggregationStrategy.NONE:
+ raise ValueError(
+ "`stride` was provided to process all the text but `aggregation_strategy="
+ f'"{aggregation_strategy}"`, please select another one instead.'
+ )
+ else:
+ if self.tokenizer.is_fast:
+ tokenizer_params = {
+ "return_overflowing_tokens": True,
+ "padding": True,
+ "stride": stride,
+ }
+ preprocess_params["tokenizer_params"] = tokenizer_params
+ else:
+ raise ValueError(
+ "`stride` was provided to process all the text but you're using a slow tokenizer."
+ " Please use a fast tokenizer."
+ )
+ return preprocess_params, {}, postprocess_params
+
+ def __call__(self, inputs: Union[str, List[str]], **kwargs):
+ """
+ Classify each token of the text(s) given as inputs.
+
+ Args:
+ inputs (`str` or `List[str]`):
+ One or several texts (or one list of texts) for token classification.
+
+ Return:
+ A list or a list of lists of `dict`: Each result comes as a list of dictionaries (one for each token in the
+ corresponding input, or one for each entity if this pipeline was instantiated with an `aggregation_strategy`) with
+ the following keys:
+
+ - **word** (`str`) -- The token/word classified. This is obtained by decoding the selected tokens. If you
+ want to have the exact string in the original sentence, use `start` and `end`.
+ - **score** (`float`) -- The corresponding probability for `entity`.
+ - **entity** (`str`) -- The entity predicted for that token/word (it is named *entity_group* when
+ *aggregation_strategy* is not `"none"`).
+ - **index** (`int`, only present when `aggregation_strategy="none"`) -- The index of the corresponding
+ token in the sentence.
+ - **start** (`int`, *optional*) -- The index of the start of the corresponding entity in the sentence. Only
+ exists if the offsets are available within the tokenizer.
+ - **end** (`int`, *optional*) -- The index of the end of the corresponding entity in the sentence. Only
+ exists if the offsets are available within the tokenizer.
+ """
+
+ _inputs, offset_mapping = self._args_parser(inputs, **kwargs)
+ if offset_mapping:
+ kwargs["offset_mapping"] = offset_mapping
+
+ return super().__call__(inputs, **kwargs)
+
+ def preprocess(self, sentence, offset_mapping=None, **preprocess_params):
+ tokenizer_params = preprocess_params.pop("tokenizer_params", {})
+ truncation = True if self.tokenizer.model_max_length and self.tokenizer.model_max_length > 0 else False
+ inputs = self.tokenizer(
+ sentence,
+ return_tensors=self.framework,
+ truncation=truncation,
+ return_special_tokens_mask=True,
+ return_offsets_mapping=self.tokenizer.is_fast,
+ **tokenizer_params,
+ )
+ inputs.pop("overflow_to_sample_mapping", None)
+ num_chunks = len(inputs["input_ids"])
+
+ for i in range(num_chunks):
+ if self.framework == "tf":
+ model_inputs = {k: tf.expand_dims(v[i], 0) for k, v in inputs.items()}
+ else:
+ model_inputs = {k: v[i].unsqueeze(0) for k, v in inputs.items()}
+ if offset_mapping is not None:
+ model_inputs["offset_mapping"] = offset_mapping
+ model_inputs["sentence"] = sentence if i == 0 else None
+ model_inputs["is_last"] = i == num_chunks - 1
+
+ yield model_inputs
+
+ def _forward(self, model_inputs):
+ # Forward
+ special_tokens_mask = model_inputs.pop("special_tokens_mask")
+ offset_mapping = model_inputs.pop("offset_mapping", None)
+ sentence = model_inputs.pop("sentence")
+ is_last = model_inputs.pop("is_last")
+ if self.framework == "tf":
+ logits = self.model(**model_inputs)[0]
+ else:
+ output = self.model(**model_inputs)
+ logits = output["logits"] if isinstance(output, dict) else output[0]
+
+ return {
+ "logits": logits,
+ "special_tokens_mask": special_tokens_mask,
+ "offset_mapping": offset_mapping,
+ "sentence": sentence,
+ "is_last": is_last,
+ **model_inputs,
+ }
+
+ def postprocess(self, all_outputs, aggregation_strategy=AggregationStrategy.NONE, ignore_labels=None):
+ if ignore_labels is None:
+ ignore_labels = ["O"]
+ all_entities = []
+ for model_outputs in all_outputs:
+ if self.framework == "pt" and model_outputs["logits"][0].dtype in (torch.bfloat16, torch.float16):
+ logits = model_outputs["logits"][0].to(torch.float32).numpy()
+ else:
+ logits = model_outputs["logits"][0].numpy()
+
+ sentence = all_outputs[0]["sentence"]
+ input_ids = model_outputs["input_ids"][0]
+ offset_mapping = (
+ model_outputs["offset_mapping"][0] if model_outputs["offset_mapping"] is not None else None
+ )
+ special_tokens_mask = model_outputs["special_tokens_mask"][0].numpy()
+
+ maxes = np.max(logits, axis=-1, keepdims=True)
+ shifted_exp = np.exp(logits - maxes)
+ scores = shifted_exp / shifted_exp.sum(axis=-1, keepdims=True)
+
+ if self.framework == "tf":
+ input_ids = input_ids.numpy()
+ offset_mapping = offset_mapping.numpy() if offset_mapping is not None else None
+
+ pre_entities = self.gather_pre_entities(
+ sentence, input_ids, scores, offset_mapping, special_tokens_mask, aggregation_strategy
+ )
+ grouped_entities = self.aggregate(pre_entities, aggregation_strategy)
+ # Filter anything that is in self.ignore_labels
+ entities = [
+ entity
+ for entity in grouped_entities
+ if entity.get("entity", None) not in ignore_labels
+ and entity.get("entity_group", None) not in ignore_labels
+ ]
+ all_entities.extend(entities)
+ num_chunks = len(all_outputs)
+ if num_chunks > 1:
+ all_entities = self.aggregate_overlapping_entities(all_entities)
+ return all_entities
+
+ def aggregate_overlapping_entities(self, entities):
+ if len(entities) == 0:
+ return entities
+ entities = sorted(entities, key=lambda x: x["start"])
+ aggregated_entities = []
+ previous_entity = entities[0]
+ for entity in entities:
+ if previous_entity["start"] <= entity["start"] < previous_entity["end"]:
+ current_length = entity["end"] - entity["start"]
+ previous_length = previous_entity["end"] - previous_entity["start"]
+ if current_length > previous_length:
+ previous_entity = entity
+ elif current_length == previous_length and entity["score"] > previous_entity["score"]:
+ previous_entity = entity
+ else:
+ aggregated_entities.append(previous_entity)
+ previous_entity = entity
+ aggregated_entities.append(previous_entity)
+ return aggregated_entities
+
+ def gather_pre_entities(
+ self,
+ sentence: str,
+ input_ids: np.ndarray,
+ scores: np.ndarray,
+ offset_mapping: Optional[List[Tuple[int, int]]],
+ special_tokens_mask: np.ndarray,
+ aggregation_strategy: AggregationStrategy,
+ ) -> List[dict]:
+ """Fuse various numpy arrays into dicts with all the information needed for aggregation"""
+ pre_entities = []
+ for idx, token_scores in enumerate(scores):
+ # Filter special_tokens
+ if special_tokens_mask[idx]:
+ continue
+
+ word = self.tokenizer.convert_ids_to_tokens(int(input_ids[idx]))
+ if offset_mapping is not None:
+ start_ind, end_ind = offset_mapping[idx]
+ if not isinstance(start_ind, int):
+ if self.framework == "pt":
+ start_ind = start_ind.item()
+ end_ind = end_ind.item()
+ word_ref = sentence[start_ind:end_ind]
+ if getattr(self.tokenizer, "_tokenizer", None) and getattr(
+ self.tokenizer._tokenizer.model, "continuing_subword_prefix", None
+ ):
+ # This is a BPE, word-aware tokenizer; there is a correct way
+ # to fuse tokens
+ is_subword = len(word) != len(word_ref)
+ else:
+ # This is a fallback heuristic. It will most likely fail on any kind of text + punctuation
+ # mixture that gets considered a "word". Non word-aware models unfortunately cannot do better
+ # than this.
+ if aggregation_strategy in {
+ AggregationStrategy.FIRST,
+ AggregationStrategy.AVERAGE,
+ AggregationStrategy.MAX,
+ }:
+ warnings.warn(
+ "Tokenizer does not support real words, using fallback heuristic",
+ UserWarning,
+ )
+ is_subword = start_ind > 0 and " " not in sentence[start_ind - 1 : start_ind + 1]
+
+ if int(input_ids[idx]) == self.tokenizer.unk_token_id:
+ word = word_ref
+ is_subword = False
+ else:
+ start_ind = None
+ end_ind = None
+ is_subword = False
+
+ pre_entity = {
+ "word": word,
+ "scores": token_scores,
+ "start": start_ind,
+ "end": end_ind,
+ "index": idx,
+ "is_subword": is_subword,
+ }
+ pre_entities.append(pre_entity)
+ return pre_entities
+
+ def aggregate(self, pre_entities: List[dict], aggregation_strategy: AggregationStrategy) -> List[dict]:
+ if aggregation_strategy in {AggregationStrategy.NONE, AggregationStrategy.SIMPLE}:
+ entities = []
+ for pre_entity in pre_entities:
+ entity_idx = pre_entity["scores"].argmax()
+ score = pre_entity["scores"][entity_idx]
+ entity = {
+ "entity": self.model.config.id2label[entity_idx],
+ "score": score,
+ "index": pre_entity["index"],
+ "word": pre_entity["word"],
+ "start": pre_entity["start"],
+ "end": pre_entity["end"],
+ }
+ entities.append(entity)
+ else:
+ entities = self.aggregate_words(pre_entities, aggregation_strategy)
+
+ if aggregation_strategy == AggregationStrategy.NONE:
+ return entities
+ return self.group_entities(entities)
+
+ def aggregate_word(self, entities: List[dict], aggregation_strategy: AggregationStrategy) -> dict:
+ word = self.tokenizer.convert_tokens_to_string([entity["word"] for entity in entities])
+ if aggregation_strategy == AggregationStrategy.FIRST:
+ scores = entities[0]["scores"]
+ idx = scores.argmax()
+ score = scores[idx]
+ entity = self.model.config.id2label[idx]
+ elif aggregation_strategy == AggregationStrategy.MAX:
+ max_entity = max(entities, key=lambda entity: entity["scores"].max())
+ scores = max_entity["scores"]
+ idx = scores.argmax()
+ score = scores[idx]
+ entity = self.model.config.id2label[idx]
+ elif aggregation_strategy == AggregationStrategy.AVERAGE:
+ scores = np.stack([entity["scores"] for entity in entities])
+ average_scores = np.nanmean(scores, axis=0)
+ entity_idx = average_scores.argmax()
+ entity = self.model.config.id2label[entity_idx]
+ score = average_scores[entity_idx]
+ else:
+ raise ValueError("Invalid aggregation_strategy")
+ new_entity = {
+ "entity": entity,
+ "score": score,
+ "word": word,
+ "start": entities[0]["start"],
+ "end": entities[-1]["end"],
+ }
+ return new_entity
+
+ def aggregate_words(self, entities: List[dict], aggregation_strategy: AggregationStrategy) -> List[dict]:
+ """
+ Override tokens from a given word that disagree to force agreement on word boundaries.
+
+ Example: micro|soft| com|pany| B-ENT I-NAME I-ENT I-ENT will be rewritten with first strategy as microsoft|
+ company| B-ENT I-ENT
+ """
+ if aggregation_strategy in {
+ AggregationStrategy.NONE,
+ AggregationStrategy.SIMPLE,
+ }:
+ raise ValueError("NONE and SIMPLE strategies are invalid for word aggregation")
+
+ word_entities = []
+ word_group = None
+ for entity in entities:
+ if word_group is None:
+ word_group = [entity]
+ elif entity["is_subword"]:
+ word_group.append(entity)
+ else:
+ word_entities.append(self.aggregate_word(word_group, aggregation_strategy))
+ word_group = [entity]
+ # Last item
+ if word_group is not None:
+ word_entities.append(self.aggregate_word(word_group, aggregation_strategy))
+ return word_entities
+
+ def group_sub_entities(self, entities: List[dict]) -> dict:
+ """
+ Group together the adjacent tokens with the same entity predicted.
+
+ Args:
+ entities (`List[dict]`): The entities predicted by the pipeline.
+ """
+ # Get the first entity in the entity group
+ entity = entities[0]["entity"].split("-", 1)[-1]
+ scores = np.nanmean([entity["score"] for entity in entities])
+ tokens = [entity["word"] for entity in entities]
+
+ entity_group = {
+ "entity_group": entity,
+ "score": np.mean(scores),
+ "word": self.tokenizer.convert_tokens_to_string(tokens),
+ "start": entities[0]["start"],
+ "end": entities[-1]["end"],
+ }
+ return entity_group
+
+ def get_tag(self, entity_name: str) -> Tuple[str, str]:
+ if entity_name.startswith("B-"):
+ bi = "B"
+ tag = entity_name[2:]
+ elif entity_name.startswith("I-"):
+ bi = "I"
+ tag = entity_name[2:]
+ else:
+ # It's not in B-, I- format
+ # Default to I- for continuation.
+ bi = "I"
+ tag = entity_name
+ return bi, tag
+
+ def group_entities(self, entities: List[dict]) -> List[dict]:
+ """
+ Find and group together the adjacent tokens with the same entity predicted.
+
+ Args:
+ entities (`List[dict]`): The entities predicted by the pipeline.
+ """
+
+ entity_groups = []
+ entity_group_disagg = []
+
+ for entity in entities:
+ if not entity_group_disagg:
+ entity_group_disagg.append(entity)
+ continue
+
+ # If the current entity is similar and adjacent to the previous entity,
+ # append it to the disaggregated entity group
+ # The split is meant to account for the "B" and "I" prefixes
+ # Shouldn't merge if both entities are B-type
+ bi, tag = self.get_tag(entity["entity"])
+ last_bi, last_tag = self.get_tag(entity_group_disagg[-1]["entity"])
+
+ if tag == last_tag and bi != "B":
+ # Modify subword type to be previous_type
+ entity_group_disagg.append(entity)
+ else:
+ # If the current entity is different from the previous entity
+ # aggregate the disaggregated entity group
+ entity_groups.append(self.group_sub_entities(entity_group_disagg))
+ entity_group_disagg = [entity]
+ if entity_group_disagg:
+ # it's the last entity, add it to the entity groups
+ entity_groups.append(self.group_sub_entities(entity_group_disagg))
+
+ return entity_groups
+
+
+NerPipeline = TokenClassificationPipeline
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/video_classification.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/video_classification.py
new file mode 100644
index 0000000000000000000000000000000000000000..057910098da20a1dfc02bf0d8b041e2d7af8cd09
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/video_classification.py
@@ -0,0 +1,184 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import warnings
+from io import BytesIO
+from typing import List, Union
+
+import requests
+
+from ..utils import (
+ add_end_docstrings,
+ is_av_available,
+ is_torch_available,
+ logging,
+ requires_backends,
+)
+from .base import Pipeline, build_pipeline_init_args
+
+
+if is_av_available():
+ import av
+ import numpy as np
+
+
+if is_torch_available():
+ from ..models.auto.modeling_auto import MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING_NAMES
+
+logger = logging.get_logger(__name__)
+
+
+@add_end_docstrings(build_pipeline_init_args(has_image_processor=True))
+class VideoClassificationPipeline(Pipeline):
+ """
+ Video classification pipeline using any `AutoModelForVideoClassification`. This pipeline predicts the class of a
+ video.
+
+ This video classification pipeline can currently be loaded from [`pipeline`] using the following task identifier:
+ `"video-classification"`.
+
+ See the list of available models on
+ [huggingface.co/models](https://huggingface.co/models?filter=video-classification).
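+
+ Example (a minimal sketch; the checkpoint name is an assumption, the video path is a placeholder, and
+ the outputs are omitted because they depend on the model):
+
+ ```python
+ >>> from transformers import pipeline
+
+ >>> video_classifier = pipeline(
+ ...     task="video-classification", model="MCG-NJU/videomae-base-finetuned-kinetics"
+ ... )
+ >>> video_classifier("path/to/video.mp4", top_k=2)  # doctest: +SKIP
+ ```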
+ """
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ requires_backends(self, "av")
+ self.check_model_type(MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING_NAMES)
+
+ def _sanitize_parameters(self, top_k=None, num_frames=None, frame_sampling_rate=None, function_to_apply=None):
+ preprocess_params = {}
+ if frame_sampling_rate is not None:
+ preprocess_params["frame_sampling_rate"] = frame_sampling_rate
+ if num_frames is not None:
+ preprocess_params["num_frames"] = num_frames
+
+ postprocess_params = {}
+ if top_k is not None:
+ postprocess_params["top_k"] = top_k
+ if function_to_apply is not None:
+ if function_to_apply not in ["softmax", "sigmoid", "none"]:
+ raise ValueError(
+ f"Invalid value for `function_to_apply`: {function_to_apply}. "
+ "Valid options are ['softmax', 'sigmoid', 'none']"
+ )
+ postprocess_params["function_to_apply"] = function_to_apply
+ else:
+ postprocess_params["function_to_apply"] = "softmax"
+ return preprocess_params, {}, postprocess_params
+
+ def __call__(self, inputs: Union[str, List[str]] = None, **kwargs):
+ """
+ Assign labels to the video(s) passed as inputs.
+
+ Args:
+ inputs (`str`, `List[str]`):
+ The pipeline handles two types of videos:
+
+ - A string containing a http link pointing to a video
+ - A string containing a local path to a video
+
+ The pipeline accepts either a single video or a batch of videos, which must then be passed as strings.
+ Videos in a batch must all be in the same format: all as http links or all as local paths.
+ top_k (`int`, *optional*, defaults to 5):
+ The number of top labels that will be returned by the pipeline. If the provided number is higher than
+ the number of labels available in the model configuration, it will default to the number of labels.
+ num_frames (`int`, *optional*, defaults to `self.model.config.num_frames`):
+ The number of frames sampled from the video to run the classification on. If not provided, will default
+ to the number of frames specified in the model configuration.
+ frame_sampling_rate (`int`, *optional*, defaults to 1):
+ The sampling rate used to select frames from the video. If not provided, will default to 1, i.e. every
+ frame will be used.
+ function_to_apply(`str`, *optional*, defaults to "softmax"):
+ The function to apply to the model output. By default, the pipeline will apply the softmax function to
+ the output of the model. Valid options: ["softmax", "sigmoid", "none"]. Note that passing Python's
+ built-in `None` will default to "softmax", so you need to pass the string "none" to disable any
+ post-processing.
+
+ Return:
+ A dictionary or a list of dictionaries containing the result. If the input is a single video, will return a
+ dictionary, if the input is a list of several videos, will return a list of dictionaries corresponding to
+ the videos.
+
+ The dictionaries contain the following keys:
+
+ - **label** (`str`) -- The label identified by the model.
+ - **score** (`float`) -- The score attributed by the model for that label.
+ """
+ # Once the `videos` deprecation is complete, remove the default `None` value for `inputs`
+ if "videos" in kwargs:
+ warnings.warn(
+ "The `videos` argument has been renamed to `inputs`. In version 5 of Transformers, `videos` will no longer be accepted",
+ FutureWarning,
+ )
+ inputs = kwargs.pop("videos")
+ if inputs is None:
+ raise ValueError("Cannot call the video-classification pipeline without an inputs argument!")
+ return super().__call__(inputs, **kwargs)
+
+ def preprocess(self, video, num_frames=None, frame_sampling_rate=1):
+ if num_frames is None:
+ num_frames = self.model.config.num_frames
+
+ if video.startswith("http://") or video.startswith("https://"):
+ video = BytesIO(requests.get(video).content)
+
+ container = av.open(video)
+
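+ # Sample `num_frames` frame indices evenly from the first `num_frames * frame_sampling_rate` frames,
+ # e.g. num_frames=16 and frame_sampling_rate=4 selects 16 indices spread over frames 0..63.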
+ start_idx = 0
+ end_idx = num_frames * frame_sampling_rate - 1
+ indices = np.linspace(start_idx, end_idx, num=num_frames, dtype=np.int64)
+
+ video = read_video_pyav(container, indices)
+ video = list(video)
+
+ model_inputs = self.image_processor(video, return_tensors=self.framework)
+ if self.framework == "pt":
+ model_inputs = model_inputs.to(self.torch_dtype)
+ return model_inputs
+
+ def _forward(self, model_inputs):
+ model_outputs = self.model(**model_inputs)
+ return model_outputs
+
+ def postprocess(self, model_outputs, top_k=5, function_to_apply="softmax"):
+ if top_k > self.model.config.num_labels:
+ top_k = self.model.config.num_labels
+
+ if self.framework == "pt":
+ if function_to_apply == "softmax":
+ probs = model_outputs.logits[0].softmax(-1)
+ elif function_to_apply == "sigmoid":
+ probs = model_outputs.logits[0].sigmoid()
+ else:
+ probs = model_outputs.logits[0]
+ scores, ids = probs.topk(top_k)
+ else:
+ raise ValueError(f"Unsupported framework: {self.framework}")
+
+ scores = scores.tolist()
+ ids = ids.tolist()
+ return [{"score": score, "label": self.model.config.id2label[_id]} for score, _id in zip(scores, ids)]
+
+
+def read_video_pyav(container, indices):
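+ """
+ Decode the video stream of a PyAV container at the given (ascending) frame indices and return the
+ selected frames as a (num_frames, height, width, 3) RGB numpy array.
+ """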
+ frames = []
+ container.seek(0)
+ start_index = indices[0]
+ end_index = indices[-1]
+ for i, frame in enumerate(container.decode(video=0)):
+ if i > end_index:
+ break
+ if i >= start_index and i in indices:
+ frames.append(frame)
+ return np.stack([x.to_ndarray(format="rgb24") for x in frames])
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/visual_question_answering.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/visual_question_answering.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d600c9eaf50bc99f6810b0c2836b154cd62ed51
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/visual_question_answering.py
@@ -0,0 +1,200 @@
+from typing import List, Union
+
+from ..utils import add_end_docstrings, is_torch_available, is_vision_available, logging
+from .base import Pipeline, build_pipeline_init_args
+
+
+if is_vision_available():
+ from PIL import Image
+
+ from ..image_utils import load_image
+
+if is_torch_available():
+ from ..models.auto.modeling_auto import MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING_NAMES
+ from .pt_utils import KeyDataset
+
+logger = logging.get_logger(__name__)
+
+
+@add_end_docstrings(build_pipeline_init_args(has_tokenizer=True, has_image_processor=True))
+class VisualQuestionAnsweringPipeline(Pipeline):
+ """
+ Visual Question Answering pipeline using a `AutoModelForVisualQuestionAnswering`. This pipeline is currently only
+ available in PyTorch.
+
+ Example:
+
+ ```python
+ >>> from transformers import pipeline
+
+ >>> oracle = pipeline(model="dandelin/vilt-b32-finetuned-vqa")
+ >>> image_url = "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/lena.png"
+ >>> oracle(question="What is she wearing ?", image=image_url)
+ [{'score': 0.948, 'answer': 'hat'}, {'score': 0.009, 'answer': 'fedora'}, {'score': 0.003, 'answer': 'clothes'}, {'score': 0.003, 'answer': 'sun hat'}, {'score': 0.002, 'answer': 'nothing'}]
+
+ >>> oracle(question="What is she wearing ?", image=image_url, top_k=1)
+ [{'score': 0.948, 'answer': 'hat'}]
+
+ >>> oracle(question="Is this a person ?", image=image_url, top_k=1)
+ [{'score': 0.993, 'answer': 'yes'}]
+
+ >>> oracle(question="Is this a man ?", image=image_url, top_k=1)
+ [{'score': 0.996, 'answer': 'no'}]
+ ```
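+
+ The same pipeline also accepts dict-style and batched inputs (a minimal sketch reusing the image URL
+ above; outputs are omitted because they depend on the model):
+
+ ```python
+ >>> oracle({"image": image_url, "question": "What is she wearing ?"})  # doctest: +SKIP
+ >>> oracle(
+ ...     [
+ ...         {"image": image_url, "question": "What is she wearing ?"},
+ ...         {"image": image_url, "question": "Is this a person ?"},
+ ...     ],
+ ...     top_k=1,
+ ... )  # doctest: +SKIP
+ ```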
+
+ Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)
+
+ This visual question answering pipeline can currently be loaded from [`pipeline`] using the following task
+ identifiers: `"visual-question-answering", "vqa"`.
+
+ The models that this pipeline can use are models that have been fine-tuned on a visual question answering task. See
+ the up-to-date list of available models on
+ [huggingface.co/models](https://huggingface.co/models?filter=visual-question-answering).
+ """
+
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ self.check_model_type(MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING_NAMES)
+
+ def _sanitize_parameters(self, top_k=None, padding=None, truncation=None, timeout=None, **kwargs):
+ preprocess_params, postprocess_params = {}, {}
+ if padding is not None:
+ preprocess_params["padding"] = padding
+ if truncation is not None:
+ preprocess_params["truncation"] = truncation
+ if timeout is not None:
+ preprocess_params["timeout"] = timeout
+ if top_k is not None:
+ postprocess_params["top_k"] = top_k
+
+ forward_params = {}
+ if self.assistant_model is not None:
+ forward_params["assistant_model"] = self.assistant_model
+ if self.assistant_tokenizer is not None:
+ forward_params["tokenizer"] = self.tokenizer
+ forward_params["assistant_tokenizer"] = self.assistant_tokenizer
+
+ return preprocess_params, forward_params, postprocess_params
+
+ def __call__(
+ self,
+ image: Union["Image.Image", str, List["Image.Image"], List[str], "KeyDataset"],
+ question: Union[str, List[str]] = None,
+ **kwargs,
+ ):
+ r"""
+ Answers open-ended questions about images. The pipeline accepts several types of inputs which are detailed
+ below:
+
+ - `pipeline(image=image, question=question)`
+ - `pipeline({"image": image, "question": question})`
+ - `pipeline([{"image": image, "question": question}])`
+ - `pipeline([{"image": image, "question": question}, {"image": image, "question": question}])`
+
+ Args:
+ image (`str`, `List[str]`, `PIL.Image`, `List[PIL.Image]` or `KeyDataset`):
+ The pipeline handles three types of images:
+
+ - A string containing a http link pointing to an image
+ - A string containing a local path to an image
+ - An image loaded in PIL directly
+
+ The pipeline accepts either a single image or a batch of images. If given a single image, it can be
+ broadcasted to multiple questions.
+ For dataset: the passed in dataset must be of type `transformers.pipelines.pt_utils.KeyDataset`
+ Example:
+ ```python
+ >>> from transformers.pipelines.pt_utils import KeyDataset
+ >>> from datasets import load_dataset
+
+ >>> dataset = load_dataset("detection-datasets/coco")
+ >>> oracle(image=KeyDataset(dataset, "image"), question="What's in this image?")
+
+ ```
+ question (`str`, `List[str]`):
+ The question(s) asked. If given a single question, it can be broadcasted to multiple images.
+ If multiple images and questions are given, each and every question will be broadcasted to all images
+ (same effect as a Cartesian product)
+ top_k (`int`, *optional*, defaults to 5):
+ The number of top labels that will be returned by the pipeline. If the provided number is higher than
+ the number of labels available in the model configuration, it will default to the number of labels.
+ timeout (`float`, *optional*, defaults to None):
+ The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
+ the call may block forever.
+ Return:
+ A dictionary or a list of dictionaries containing the result. The dictionaries contain the following keys:
+
+ - **answer** (`str`) -- The answer predicted by the model.
+ - **score** (`float`) -- The score attributed by the model to that answer (only returned when the
+ underlying model is a classification model rather than a generative one).
+ """
+ is_dataset = isinstance(image, KeyDataset)
+ is_image_batch = isinstance(image, list) and all(isinstance(item, (Image.Image, str)) for item in image)
+ is_question_batch = isinstance(question, list) and all(isinstance(item, str) for item in question)
+
+ if isinstance(image, (Image.Image, str)) and isinstance(question, str):
+ inputs = {"image": image, "question": question}
+ elif (is_image_batch or is_dataset) and isinstance(question, str):
+ inputs = [{"image": im, "question": question} for im in image]
+ elif isinstance(image, (Image.Image, str)) and is_question_batch:
+ inputs = [{"image": image, "question": q} for q in question]
+ elif (is_image_batch or is_dataset) and is_question_batch:
+ question_image_pairs = []
+ for q in question:
+ for im in image:
+ question_image_pairs.append({"image": im, "question": q})
+ inputs = question_image_pairs
+ else:
+ """
+ Supports the following format
+ - {"image": image, "question": question}
+ - [{"image": image, "question": question}]
+ - Generator and datasets
+ """
+ inputs = image
+ results = super().__call__(inputs, **kwargs)
+ return results
+
+ def preprocess(self, inputs, padding=False, truncation=False, timeout=None):
+ image = load_image(inputs["image"], timeout=timeout)
+ model_inputs = self.tokenizer(
+ inputs["question"],
+ return_tensors=self.framework,
+ padding=padding,
+ truncation=truncation,
+ )
+ image_features = self.image_processor(images=image, return_tensors=self.framework)
+ if self.framework == "pt":
+ image_features = image_features.to(self.torch_dtype)
+ model_inputs.update(image_features)
+ return model_inputs
+
+ def _forward(self, model_inputs, **generate_kwargs):
+ if self.model.can_generate():
+ # User-defined `generation_config` passed to the pipeline call take precedence
+ if "generation_config" not in generate_kwargs:
+ generate_kwargs["generation_config"] = self.generation_config
+
+ model_outputs = self.model.generate(**model_inputs, **generate_kwargs)
+ else:
+ model_outputs = self.model(**model_inputs)
+ return model_outputs
+
+ def postprocess(self, model_outputs, top_k=5):
+ if self.model.can_generate():
+ return [
+ {"answer": self.tokenizer.decode(output_ids, skip_special_tokens=True).strip()}
+ for output_ids in model_outputs
+ ]
+ else:
+ if top_k > self.model.config.num_labels:
+ top_k = self.model.config.num_labels
+
+ if self.framework == "pt":
+ probs = model_outputs.logits.sigmoid()[0]
+ scores, ids = probs.topk(top_k)
+ else:
+ raise ValueError(f"Unsupported framework: {self.framework}")
+
+ scores = scores.tolist()
+ ids = ids.tolist()
+ return [{"score": score, "answer": self.model.config.id2label[_id]} for score, _id in zip(scores, ids)]
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/zero_shot_audio_classification.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/zero_shot_audio_classification.py
new file mode 100644
index 0000000000000000000000000000000000000000..8ed339a5b7f889c21991eaec6901887ce97d90cd
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/zero_shot_audio_classification.py
@@ -0,0 +1,164 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from collections import UserDict
+from typing import Union
+
+import numpy as np
+import requests
+
+from ..utils import (
+ add_end_docstrings,
+ logging,
+)
+from .audio_classification import ffmpeg_read
+from .base import Pipeline, build_pipeline_init_args
+
+
+logger = logging.get_logger(__name__)
+
+
+@add_end_docstrings(build_pipeline_init_args(has_feature_extractor=True, has_tokenizer=True))
+class ZeroShotAudioClassificationPipeline(Pipeline):
+ """
+ Zero shot audio classification pipeline using `ClapModel`. This pipeline predicts the class of an audio when you
+ provide an audio and a set of `candidate_labels`.
+
+
+
+ The default `hypothesis_template` is `"This is a sound of {}."`. Make sure to update it for your use case.
+
+
+
+ Example:
+ ```python
+ >>> from transformers import pipeline
+ >>> from datasets import load_dataset
+
+ >>> dataset = load_dataset("ashraq/esc50")
+ >>> audio = next(iter(dataset["train"]["audio"]))["array"]
+ >>> classifier = pipeline(task="zero-shot-audio-classification", model="laion/clap-htsat-unfused")
+ >>> classifier(audio, candidate_labels=["Sound of a dog", "Sound of vaccum cleaner"])
+ [{'score': 0.9996, 'label': 'Sound of a dog'}, {'score': 0.0004, 'label': 'Sound of vaccum cleaner'}]
+ ```
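+
+ A custom `hypothesis_template` can be passed as well (a minimal sketch reusing the classifier and audio
+ above; scores are omitted because they depend on the checkpoint):
+
+ ```python
+ >>> classifier(
+ ...     audio,
+ ...     candidate_labels=["dog", "vacuum cleaner"],
+ ...     hypothesis_template="This is the sound of a {}.",
+ ... )  # doctest: +SKIP
+ ```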
+
+
+ Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial) This audio
+ classification pipeline can currently be loaded from [`pipeline`] using the following task identifier:
+ `"zero-shot-audio-classification"`. See the list of available models on
+ [huggingface.co/models](https://huggingface.co/models?filter=zero-shot-audio-classification).
+ """
+
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+
+ if self.framework != "pt":
+ raise ValueError(f"The {self.__class__} is only available in PyTorch.")
+ # No specific FOR_XXX available yet
+
+ def __call__(self, audios: Union[np.ndarray, bytes, str], **kwargs):
+ """
+ Assign labels to the audio(s) passed as inputs.
+
+ Args:
+ audios (`str`, `List[str]`, `np.array` or `List[np.array]`):
+ The pipeline handles three types of inputs:
+ - A string containing a http link pointing to an audio
+ - A string containing a local path to an audio
+ - An audio loaded in numpy
+ candidate_labels (`List[str]`):
+ The candidate labels for this audio. They will be formatted using *hypothesis_template*.
+ hypothesis_template (`str`, *optional*, defaults to `"This is a sound of {}."`):
+ The format used in conjunction with *candidate_labels* to attempt the audio classification by
+ replacing the placeholder with the candidate_labels. Pass "{}" if *candidate_labels* are
+ already formatted.
+ Return:
+ A list of dictionaries containing one entry per proposed label. Each dictionary contains the
+ following keys:
+ - **label** (`str`) -- One of the suggested *candidate_labels*.
+ - **score** (`float`) -- The score attributed by the model to that label. It is a value between
+ 0 and 1, computed as the `softmax` of `logits_per_audio`.
+ """
+ return super().__call__(audios, **kwargs)
+
+ def _sanitize_parameters(self, **kwargs):
+ preprocess_params = {}
+ if "candidate_labels" in kwargs:
+ preprocess_params["candidate_labels"] = kwargs["candidate_labels"]
+ if "hypothesis_template" in kwargs:
+ preprocess_params["hypothesis_template"] = kwargs["hypothesis_template"]
+
+ return preprocess_params, {}, {}
+
+ def preprocess(self, audio, candidate_labels=None, hypothesis_template="This is a sound of {}."):
+ if isinstance(audio, str):
+ if audio.startswith("http://") or audio.startswith("https://"):
+ # We need to actually check for a real protocol, otherwise it's impossible to use a local file
+ # like http_huggingface_co.png
+ audio = requests.get(audio).content
+ else:
+ with open(audio, "rb") as f:
+ audio = f.read()
+
+ if isinstance(audio, bytes):
+ audio = ffmpeg_read(audio, self.feature_extractor.sampling_rate)
+
+ if not isinstance(audio, np.ndarray):
+ raise TypeError("We expect a numpy ndarray as input")
+ if len(audio.shape) != 1:
+ raise ValueError("We expect a single channel audio input for ZeroShotAudioClassificationPipeline")
+
+ inputs = self.feature_extractor(
+ [audio], sampling_rate=self.feature_extractor.sampling_rate, return_tensors="pt"
+ )
+ if self.framework == "pt":
+ inputs = inputs.to(self.torch_dtype)
+ inputs["candidate_labels"] = candidate_labels
+ sequences = [hypothesis_template.format(x) for x in candidate_labels]
+ text_inputs = self.tokenizer(sequences, return_tensors=self.framework, padding=True)
+ inputs["text_inputs"] = [text_inputs]
+ return inputs
+
+ def _forward(self, model_inputs):
+ candidate_labels = model_inputs.pop("candidate_labels")
+ text_inputs = model_inputs.pop("text_inputs")
+ if isinstance(text_inputs[0], UserDict):
+ text_inputs = text_inputs[0]
+ else:
+ # Batching case.
+ text_inputs = text_inputs[0][0]
+
+ outputs = self.model(**text_inputs, **model_inputs)
+
+ model_outputs = {
+ "candidate_labels": candidate_labels,
+ "logits": outputs.logits_per_audio,
+ }
+ return model_outputs
+
+ def postprocess(self, model_outputs):
+ candidate_labels = model_outputs.pop("candidate_labels")
+ logits = model_outputs["logits"][0]
+
+ if self.framework == "pt":
+ probs = logits.softmax(dim=0)
+ scores = probs.tolist()
+ else:
+ raise ValueError("`tf` framework not supported.")
+
+ result = [
+ {"score": score, "label": candidate_label}
+ for score, candidate_label in sorted(zip(scores, candidate_labels), key=lambda x: -x[0])
+ ]
+ return result
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/zero_shot_classification.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/zero_shot_classification.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4aee3341e30d55691ea74d0e90dd00ba4567c8b
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/zero_shot_classification.py
@@ -0,0 +1,268 @@
+import inspect
+from typing import List, Union
+
+import numpy as np
+
+from ..tokenization_utils import TruncationStrategy
+from ..utils import add_end_docstrings, logging
+from .base import ArgumentHandler, ChunkPipeline, build_pipeline_init_args
+
+
+logger = logging.get_logger(__name__)
+
+
+class ZeroShotClassificationArgumentHandler(ArgumentHandler):
+ """
+ Handles arguments for zero-shot for text classification by turning each possible label into an NLI
+ premise/hypothesis pair.
+ """
+
+ def _parse_labels(self, labels):
+ if isinstance(labels, str):
+ labels = [label.strip() for label in labels.split(",") if label.strip()]
+ return labels
+
+ def __call__(self, sequences, labels, hypothesis_template):
+ if len(labels) == 0 or len(sequences) == 0:
+ raise ValueError("You must include at least one label and at least one sequence.")
+ if hypothesis_template.format(labels[0]) == hypothesis_template:
+ raise ValueError(
+ (
+ 'The provided hypothesis_template "{}" was not able to be formatted with the target labels. '
+ "Make sure the passed template includes formatting syntax such as {{}} where the label should go."
+ ).format(hypothesis_template)
+ )
+
+ if isinstance(sequences, str):
+ sequences = [sequences]
+
+ sequence_pairs = []
+ for sequence in sequences:
+ sequence_pairs.extend([[sequence, hypothesis_template.format(label)] for label in labels])
+
+ return sequence_pairs, sequences
+
+
+@add_end_docstrings(build_pipeline_init_args(has_tokenizer=True))
+class ZeroShotClassificationPipeline(ChunkPipeline):
+ """
+ NLI-based zero-shot classification pipeline using a `ModelForSequenceClassification` trained on NLI (natural
+ language inference) tasks. Equivalent of `text-classification` pipelines, but these models don't require a
+ hardcoded number of potential classes, they can be chosen at runtime. It usually means it's slower but it is
+ **much** more flexible.
+
+ Any combination of sequences and labels can be passed and each combination will be posed as a premise/hypothesis
+ pair and passed to the pretrained model. Then, the logit for *entailment* is taken as the logit for the candidate
+ label being valid. Any NLI model can be used, but the id of the *entailment* label must be included in the model
+ config's [`~transformers.PretrainedConfig.label2id`].
+
+ Example:
+
+ ```python
+ >>> from transformers import pipeline
+
+ >>> oracle = pipeline(model="facebook/bart-large-mnli")
+ >>> oracle(
+ ... "I have a problem with my iphone that needs to be resolved asap!!",
+ ... candidate_labels=["urgent", "not urgent", "phone", "tablet", "computer"],
+ ... )
+ {'sequence': 'I have a problem with my iphone that needs to be resolved asap!!', 'labels': ['urgent', 'phone', 'computer', 'not urgent', 'tablet'], 'scores': [0.504, 0.479, 0.013, 0.003, 0.002]}
+
+ >>> oracle(
+ ... "I have a problem with my iphone that needs to be resolved asap!!",
+ ... candidate_labels=["english", "german"],
+ ... )
+ {'sequence': 'I have a problem with my iphone that needs to be resolved asap!!', 'labels': ['english', 'german'], 'scores': [0.814, 0.186]}
+ ```
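+
+ When several labels can be true at once, `multi_label=True` scores each candidate independently (a
+ minimal sketch reusing the oracle above; scores are omitted because they depend on the checkpoint):
+
+ ```python
+ >>> oracle(
+ ...     "I have a problem with my iphone that needs to be resolved asap!!",
+ ...     candidate_labels=["urgent", "phone", "tablet"],
+ ...     multi_label=True,
+ ... )  # doctest: +SKIP
+ ```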
+
+ Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)
+
+ This NLI pipeline can currently be loaded from [`pipeline`] using the following task identifier:
+ `"zero-shot-classification"`.
+
+ The models that this pipeline can use are models that have been fine-tuned on an NLI task. See the up-to-date list
+ of available models on [huggingface.co/models](https://huggingface.co/models?search=nli).
+ """
+
+ def __init__(self, args_parser=ZeroShotClassificationArgumentHandler(), *args, **kwargs):
+ self._args_parser = args_parser
+ super().__init__(*args, **kwargs)
+ if self.entailment_id == -1:
+ logger.warning(
+ "Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to "
+ "-1. Define a descriptive label2id mapping in the model config to ensure correct outputs."
+ )
+
+ @property
+ def entailment_id(self):
+ for label, ind in self.model.config.label2id.items():
+ if label.lower().startswith("entail"):
+ return ind
+ return -1
+
+ def _parse_and_tokenize(
+ self, sequence_pairs, padding=True, add_special_tokens=True, truncation=TruncationStrategy.ONLY_FIRST, **kwargs
+ ):
+ """
+ Parse arguments and tokenize only_first so that hypothesis (label) is not truncated
+ """
+ return_tensors = self.framework
+ if self.tokenizer.pad_token is None:
+ # Override for tokenizers not supporting padding
+ logger.error(
+ "Tokenizer was not supporting padding necessary for zero-shot, attempting to use "
+ " `pad_token=eos_token`"
+ )
+ self.tokenizer.pad_token = self.tokenizer.eos_token
+ try:
+ inputs = self.tokenizer(
+ sequence_pairs,
+ add_special_tokens=add_special_tokens,
+ return_tensors=return_tensors,
+ padding=padding,
+ truncation=truncation,
+ )
+ except Exception as e:
+ if "too short" in str(e):
+ # tokenizers might yell that we want to truncate
+ # to a value that is not even reached by the input.
+ # In that case we don't want to truncate.
+ # It seems there's not a really better way to catch that
+ # exception.
+
+ inputs = self.tokenizer(
+ sequence_pairs,
+ add_special_tokens=add_special_tokens,
+ return_tensors=return_tensors,
+ padding=padding,
+ truncation=TruncationStrategy.DO_NOT_TRUNCATE,
+ )
+ else:
+ raise e
+
+ return inputs
+
+ def _sanitize_parameters(self, **kwargs):
+ if kwargs.get("multi_class", None) is not None:
+ kwargs["multi_label"] = kwargs["multi_class"]
+ logger.warning(
+ "The `multi_class` argument has been deprecated and renamed to `multi_label`. "
+ "`multi_class` will be removed in a future version of Transformers."
+ )
+ preprocess_params = {}
+ if "candidate_labels" in kwargs:
+ preprocess_params["candidate_labels"] = self._args_parser._parse_labels(kwargs["candidate_labels"])
+ if "hypothesis_template" in kwargs:
+ preprocess_params["hypothesis_template"] = kwargs["hypothesis_template"]
+
+ postprocess_params = {}
+ if "multi_label" in kwargs:
+ postprocess_params["multi_label"] = kwargs["multi_label"]
+ return preprocess_params, {}, postprocess_params
+
+ def __call__(
+ self,
+ sequences: Union[str, List[str]],
+ *args,
+ **kwargs,
+ ):
+ """
+ Classify the sequence(s) given as inputs. See the [`ZeroShotClassificationPipeline`] documentation for more
+ information.
+
+ Args:
+ sequences (`str` or `List[str]`):
+ The sequence(s) to classify, will be truncated if the model input is too large.
+ candidate_labels (`str` or `List[str]`):
+ The set of possible class labels to classify each sequence into. Can be a single label, a string of
+ comma-separated labels, or a list of labels.
+ hypothesis_template (`str`, *optional*, defaults to `"This example is {}."`):
+ The template used to turn each label into an NLI-style hypothesis. This template must include a {} or
+ similar syntax for the candidate label to be inserted into the template. For example, the default
+ template is `"This example is {}."` With the candidate label `"sports"`, this would be fed into the
+ model like `"<cls> sequence to classify <sep> This example is sports . <sep>"`. The default template
+ works well in many cases, but it may be worthwhile to experiment with different templates depending on
+ the task setting.
+ multi_label (`bool`, *optional*, defaults to `False`):
+ Whether or not multiple candidate labels can be true. If `False`, the scores are normalized such that
+ the sum of the label likelihoods for each sequence is 1. If `True`, the labels are considered
+ independent and probabilities are normalized for each candidate by doing a softmax of the entailment
+ score vs. the contradiction score.
+
+ Return:
+ A `dict` or a list of `dict`: Each result comes as a dictionary with the following keys:
+
+ - **sequence** (`str`) -- The sequence for which this is the output.
+ - **labels** (`List[str]`) -- The labels sorted by order of likelihood.
+ - **scores** (`List[float]`) -- The probabilities for each of the labels.
+ """
+ if len(args) == 0:
+ pass
+ elif len(args) == 1 and "candidate_labels" not in kwargs:
+ kwargs["candidate_labels"] = args[0]
+ else:
+ raise ValueError(f"Unable to understand extra arguments {args}")
+
+ return super().__call__(sequences, **kwargs)
+
+ def preprocess(self, inputs, candidate_labels=None, hypothesis_template="This example is {}."):
+ sequence_pairs, sequences = self._args_parser(inputs, candidate_labels, hypothesis_template)
+
+ for i, (candidate_label, sequence_pair) in enumerate(zip(candidate_labels, sequence_pairs)):
+ model_input = self._parse_and_tokenize([sequence_pair])
+
+ yield {
+ "candidate_label": candidate_label,
+ "sequence": sequences[0],
+ "is_last": i == len(candidate_labels) - 1,
+ **model_input,
+ }
+
+ def _forward(self, inputs):
+ candidate_label = inputs["candidate_label"]
+ sequence = inputs["sequence"]
+ model_inputs = {k: inputs[k] for k in self.tokenizer.model_input_names}
+ # `XXXForSequenceClassification` models should not use `use_cache=True` even if it's supported
+ model_forward = self.model.forward if self.framework == "pt" else self.model.call
+ if "use_cache" in inspect.signature(model_forward).parameters.keys():
+ model_inputs["use_cache"] = False
+ outputs = self.model(**model_inputs)
+
+ model_outputs = {
+ "candidate_label": candidate_label,
+ "sequence": sequence,
+ "is_last": inputs["is_last"],
+ **outputs,
+ }
+ return model_outputs
+
+ def postprocess(self, model_outputs, multi_label=False):
+ candidate_labels = [outputs["candidate_label"] for outputs in model_outputs]
+ sequences = [outputs["sequence"] for outputs in model_outputs]
+ if self.framework == "pt":
+ logits = np.concatenate([output["logits"].float().numpy() for output in model_outputs])
+ else:
+ logits = np.concatenate([output["logits"].numpy() for output in model_outputs])
+ N = logits.shape[0]
+ n = len(candidate_labels)
+ num_sequences = N // n
+ reshaped_outputs = logits.reshape((num_sequences, n, -1))
+
+ if multi_label or len(candidate_labels) == 1:
+ # softmax over the entailment vs. contradiction dim for each label independently
+ entailment_id = self.entailment_id
+ contradiction_id = -1 if entailment_id == 0 else 0
+ entail_contr_logits = reshaped_outputs[..., [contradiction_id, entailment_id]]
+ scores = np.exp(entail_contr_logits) / np.exp(entail_contr_logits).sum(-1, keepdims=True)
+ scores = scores[..., 1]
+ else:
+ # softmax the "entailment" logits over all candidate labels
+ entail_logits = reshaped_outputs[..., self.entailment_id]
+ scores = np.exp(entail_logits) / np.exp(entail_logits).sum(-1, keepdims=True)
+
+ top_inds = list(reversed(scores[0].argsort()))
+ return {
+ "sequence": sequences[0],
+ "labels": [candidate_labels[i] for i in top_inds],
+ "scores": scores[0, top_inds].tolist(),
+ }
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/zero_shot_image_classification.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/zero_shot_image_classification.py
new file mode 100644
index 0000000000000000000000000000000000000000..c53b515dcccd9c1f277a3f8a8871be08661e7a1c
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/zero_shot_image_classification.py
@@ -0,0 +1,193 @@
+import warnings
+from collections import UserDict
+from typing import List, Union
+
+from ..utils import (
+ add_end_docstrings,
+ is_tf_available,
+ is_torch_available,
+ is_vision_available,
+ logging,
+ requires_backends,
+)
+from .base import Pipeline, build_pipeline_init_args
+
+
+if is_vision_available():
+ from PIL import Image
+
+ from ..image_utils import load_image
+
+if is_torch_available():
+ import torch
+
+ from ..models.auto.modeling_auto import MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING_NAMES
+
+if is_tf_available():
+ from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING_NAMES
+ from ..tf_utils import stable_softmax
+
+logger = logging.get_logger(__name__)
+
+
+@add_end_docstrings(build_pipeline_init_args(has_image_processor=True))
+class ZeroShotImageClassificationPipeline(Pipeline):
+ """
+ Zero shot image classification pipeline using `CLIPModel`. This pipeline predicts the class of an image when you
+ provide an image and a set of `candidate_labels`.
+
+ Example:
+
+ ```python
+ >>> from transformers import pipeline
+
+ >>> classifier = pipeline(model="google/siglip-so400m-patch14-384")
+ >>> classifier(
+ ... "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png",
+ ... candidate_labels=["animals", "humans", "landscape"],
+ ... )
+ [{'score': 0.965, 'label': 'animals'}, {'score': 0.03, 'label': 'humans'}, {'score': 0.005, 'label': 'landscape'}]
+
+ >>> classifier(
+ ... "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png",
+ ... candidate_labels=["black and white", "photorealist", "painting"],
+ ... )
+ [{'score': 0.996, 'label': 'black and white'}, {'score': 0.003, 'label': 'photorealist'}, {'score': 0.0, 'label': 'painting'}]
+ ```
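+
+ A custom `hypothesis_template` can also be provided (a minimal sketch reusing the classifier above;
+ scores are omitted because they depend on the checkpoint):
+
+ ```python
+ >>> classifier(
+ ...     "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png",
+ ...     candidate_labels=["animals", "humans"],
+ ...     hypothesis_template="A photo containing {}.",
+ ... )  # doctest: +SKIP
+ ```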
+
+ Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)
+
+ This image classification pipeline can currently be loaded from [`pipeline`] using the following task identifier:
+ `"zero-shot-image-classification"`.
+
+ See the list of available models on
+ [huggingface.co/models](https://huggingface.co/models?filter=zero-shot-image-classification).
+ """
+
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+
+ requires_backends(self, "vision")
+ self.check_model_type(
+ TF_MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING_NAMES
+ if self.framework == "tf"
+ else MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING_NAMES
+ )
+
+ def __call__(self, image: Union[str, List[str], "Image", List["Image"]] = None, **kwargs):
+ """
+ Assign labels to the image(s) passed as inputs.
+
+ Args:
+ image (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`):
+ The pipeline handles three types of images:
+
+ - A string containing a http link pointing to an image
+ - A string containing a local path to an image
+ - An image loaded in PIL directly
+
+ candidate_labels (`List[str]`):
+ The candidate labels for this image. They will be formatted using *hypothesis_template*.
+
+ hypothesis_template (`str`, *optional*, defaults to `"This is a photo of {}."`):
+ The format used in conjunction with *candidate_labels* to attempt the image classification by
+ replacing the placeholder with the candidate_labels. Pass "{}" if *candidate_labels* are
+ already formatted.
+
+ timeout (`float`, *optional*, defaults to None):
+ The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
+ the call may block forever.
+
+ Return:
+ A list of dictionaries containing one entry per proposed label. Each dictionary contains the
+ following keys:
+ - **label** (`str`) -- One of the suggested *candidate_labels*.
+ - **score** (`float`) -- The score attributed by the model to that label. It is a value between
+ 0 and 1, computed as the `softmax` of `logits_per_image`.
+ """
+ # Once the `images` deprecation is complete, remove the default `None` value for `image`
+ if "images" in kwargs:
+ image = kwargs.pop("images")
+ if image is None:
+ raise ValueError("Cannot call the zero-shot-image-classification pipeline without an images argument!")
+ return super().__call__(image, **kwargs)
+
+ def _sanitize_parameters(self, tokenizer_kwargs=None, **kwargs):
+ preprocess_params = {}
+ if "candidate_labels" in kwargs:
+ preprocess_params["candidate_labels"] = kwargs["candidate_labels"]
+ if "timeout" in kwargs:
+ preprocess_params["timeout"] = kwargs["timeout"]
+ if "hypothesis_template" in kwargs:
+ preprocess_params["hypothesis_template"] = kwargs["hypothesis_template"]
+ if tokenizer_kwargs is not None:
+ warnings.warn(
+ "The `tokenizer_kwargs` argument is deprecated and will be removed in version 5 of Transformers",
+ FutureWarning,
+ )
+ preprocess_params["tokenizer_kwargs"] = tokenizer_kwargs
+
+ return preprocess_params, {}, {}
+
+ def preprocess(
+ self,
+ image,
+ candidate_labels=None,
+ hypothesis_template="This is a photo of {}.",
+ timeout=None,
+ tokenizer_kwargs=None,
+ ):
+ if tokenizer_kwargs is None:
+ tokenizer_kwargs = {}
+ image = load_image(image, timeout=timeout)
+ inputs = self.image_processor(images=[image], return_tensors=self.framework)
+ if self.framework == "pt":
+ inputs = inputs.to(self.torch_dtype)
+ inputs["candidate_labels"] = candidate_labels
+ sequences = [hypothesis_template.format(x) for x in candidate_labels]
+ padding = "max_length" if self.model.config.model_type == "siglip" else True
+ text_inputs = self.tokenizer(sequences, return_tensors=self.framework, padding=padding, **tokenizer_kwargs)
+ inputs["text_inputs"] = [text_inputs]
+ return inputs
+
+ def _forward(self, model_inputs):
+ candidate_labels = model_inputs.pop("candidate_labels")
+ text_inputs = model_inputs.pop("text_inputs")
+ if isinstance(text_inputs[0], UserDict):
+ text_inputs = text_inputs[0]
+ else:
+ # Batching case.
+ text_inputs = text_inputs[0][0]
+
+ outputs = self.model(**text_inputs, **model_inputs)
+
+ model_outputs = {
+ "candidate_labels": candidate_labels,
+ "logits": outputs.logits_per_image,
+ }
+ return model_outputs
+
+ def postprocess(self, model_outputs):
+ candidate_labels = model_outputs.pop("candidate_labels")
+ logits = model_outputs["logits"][0]
+ if self.framework == "pt" and self.model.config.model_type == "siglip":
+ probs = torch.sigmoid(logits).squeeze(-1)
+ scores = probs.tolist()
+ if not isinstance(scores, list):
+ scores = [scores]
+ elif self.framework == "pt":
+ probs = logits.softmax(dim=-1).squeeze(-1)
+ scores = probs.tolist()
+ if not isinstance(scores, list):
+ scores = [scores]
+ elif self.framework == "tf":
+ probs = stable_softmax(logits, axis=-1)
+ scores = probs.numpy().tolist()
+ else:
+ raise ValueError(f"Unsupported framework: {self.framework}")
+
+ result = [
+ {"score": score, "label": candidate_label}
+ for score, candidate_label in sorted(zip(scores, candidate_labels), key=lambda x: -x[0])
+ ]
+ return result
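The `preprocess`/`_forward`/`postprocess` steps above are roughly equivalent to scoring the image-text pairs with CLIP directly. Below is a minimal sketch of that manual path, assuming the publicly available `openai/clip-vit-base-patch32` checkpoint (any CLIP-style checkpoint should behave similarly); it reuses the parrots image from the docstring example.

```python
import requests
import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

url = "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png"
image = Image.open(requests.get(url, stream=True).raw)
candidate_labels = ["animals", "humans", "landscape"]
# Apply the default hypothesis_template by hand.
texts = [f"This is a photo of {label}." for label in candidate_labels]

inputs = processor(text=texts, images=image, return_tensors="pt", padding=True)
with torch.no_grad():
    logits_per_image = model(**inputs).logits_per_image  # shape: (1, num_labels)

# CLIP-style checkpoints use a softmax over the labels; SigLIP checkpoints use a sigmoid instead.
scores = logits_per_image.softmax(dim=-1)[0].tolist()
for score, label in sorted(zip(scores, candidate_labels), reverse=True):
    print(f"{label}: {score:.3f}")
```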
diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/zero_shot_object_detection.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/zero_shot_object_detection.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce8da7340bcce527f6ef8c013f1f609c341f9857
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/zero_shot_object_detection.py
@@ -0,0 +1,235 @@
+from typing import Any, Dict, List, Union
+
+from ..utils import add_end_docstrings, is_torch_available, is_vision_available, logging, requires_backends
+from .base import ChunkPipeline, build_pipeline_init_args
+
+
+if is_vision_available():
+ from PIL import Image
+
+ from ..image_utils import load_image, valid_images
+
+if is_torch_available():
+ import torch
+
+ from transformers.modeling_outputs import BaseModelOutput
+
+ from ..models.auto.modeling_auto import MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES
+
+logger = logging.get_logger(__name__)
+
+
+@add_end_docstrings(build_pipeline_init_args(has_image_processor=True))
+class ZeroShotObjectDetectionPipeline(ChunkPipeline):
+ """
+ Zero-shot object detection pipeline using `OwlViTForObjectDetection`. This pipeline predicts bounding boxes of
+ objects when you provide an image and a set of `candidate_labels`.
+
+ Example:
+
+ ```python
+ >>> from transformers import pipeline
+
+ >>> detector = pipeline(model="google/owlvit-base-patch32", task="zero-shot-object-detection")
+ >>> detector(
+ ... "http://images.cocodataset.org/val2017/000000039769.jpg",
+ ... candidate_labels=["cat", "couch"],
+ ... )
+ [{'score': 0.287, 'label': 'cat', 'box': {'xmin': 324, 'ymin': 20, 'xmax': 640, 'ymax': 373}}, {'score': 0.254, 'label': 'cat', 'box': {'xmin': 1, 'ymin': 55, 'xmax': 315, 'ymax': 472}}, {'score': 0.121, 'label': 'couch', 'box': {'xmin': 4, 'ymin': 0, 'xmax': 642, 'ymax': 476}}]
+
+ >>> detector(
+ ... "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png",
+ ... candidate_labels=["head", "bird"],
+ ... )
+ [{'score': 0.119, 'label': 'bird', 'box': {'xmin': 71, 'ymin': 170, 'xmax': 410, 'ymax': 508}}]
+ ```
+
+ Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)
+
+ This object detection pipeline can currently be loaded from [`pipeline`] using the following task identifier:
+ `"zero-shot-object-detection"`.
+
+ See the list of available models on
+ [huggingface.co/models](https://huggingface.co/models?filter=zero-shot-object-detection).
+ """
+
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+
+ if self.framework == "tf":
+ raise ValueError(f"The {self.__class__} is only available in PyTorch.")
+
+ requires_backends(self, "vision")
+ self.check_model_type(MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES)
+
+ def __call__(
+ self,
+ image: Union[str, "Image.Image", List[Dict[str, Any]]],
+ candidate_labels: Union[str, List[str]] = None,
+ **kwargs,
+ ):
+ """
+ Detect objects (bounding boxes & classes) in the image(s) passed as inputs.
+
+ Args:
+ image (`str`, `PIL.Image` or `List[Dict[str, Any]]`):
+ The pipeline handles three types of images:
+
+ - A string containing an HTTP URL pointing to an image
+ - A string containing a local path to an image
+ - An image loaded in PIL directly
+
+ You can use this parameter to send a list of images, a dataset, or a generator directly, like so:
+
+ ```python
+ >>> from transformers import pipeline
+
+ >>> detector = pipeline(model="google/owlvit-base-patch32", task="zero-shot-object-detection")
+ >>> detector(
+ ... [
+ ... {
+ ... "image": "http://images.cocodataset.org/val2017/000000039769.jpg",
+ ... "candidate_labels": ["cat", "couch"],
+ ... },
+ ... {
+ ... "image": "http://images.cocodataset.org/val2017/000000039769.jpg",
+ ... "candidate_labels": ["cat", "couch"],
+ ... },
+ ... ]
+ ... )
+ [[{'score': 0.287, 'label': 'cat', 'box': {'xmin': 324, 'ymin': 20, 'xmax': 640, 'ymax': 373}}, {'score': 0.25, 'label': 'cat', 'box': {'xmin': 1, 'ymin': 55, 'xmax': 315, 'ymax': 472}}, {'score': 0.121, 'label': 'couch', 'box': {'xmin': 4, 'ymin': 0, 'xmax': 642, 'ymax': 476}}], [{'score': 0.287, 'label': 'cat', 'box': {'xmin': 324, 'ymin': 20, 'xmax': 640, 'ymax': 373}}, {'score': 0.254, 'label': 'cat', 'box': {'xmin': 1, 'ymin': 55, 'xmax': 315, 'ymax': 472}}, {'score': 0.121, 'label': 'couch', 'box': {'xmin': 4, 'ymin': 0, 'xmax': 642, 'ymax': 476}}]]
+ ```
+
+
+ candidate_labels (`str` or `List[str]` or `List[List[str]]`):
+ What the model should recognize in the image.
+
+ threshold (`float`, *optional*, defaults to 0.1):
+ The minimum score required for a predicted box to be returned.
+
+ top_k (`int`, *optional*, defaults to None):
+ The number of top predictions that will be returned by the pipeline. If the provided number is `None`
+ or higher than the number of predictions available, it will default to the number of predictions.
+
+ timeout (`float`, *optional*, defaults to None):
+ The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
+ the call may block forever.
+
+
+ Return:
+ A list of dictionaries containing prediction results for the image (or a list of such lists, one per
+ input image, when several images are passed). Each dictionary contains the following keys:
+
+ - **label** (`str`) -- Text query corresponding to the found object.
+ - **score** (`float`) -- Score corresponding to the object (between 0 and 1).
+ - **box** (`Dict[str, int]`) -- Bounding box of the detected object in the image's original size. It is a
+ dictionary with `xmin`, `ymin`, `xmax`, `ymax` keys.
+ """
+ if "text_queries" in kwargs:
+ candidate_labels = kwargs.pop("text_queries")
+
+ if isinstance(image, (str, Image.Image)):
+ inputs = {"image": image, "candidate_labels": candidate_labels}
+ elif isinstance(image, (list, tuple)) and valid_images(image):
+ return list(
+ super().__call__(
+ ({"image": img, "candidate_labels": labels} for img, labels in zip(image, candidate_labels)),
+ **kwargs,
+ )
+ )
+ else:
+ """
+ Supports the following formats
+ - {"image": image, "candidate_labels": candidate_labels}
+ - [{"image": image, "candidate_labels": candidate_labels}]
+ - Generator and datasets
+ This is a common pattern in other multimodal pipelines, so we support it here as well.
+ """
+ inputs = image
+
+ results = super().__call__(inputs, **kwargs)
+ return results
+
+ def _sanitize_parameters(self, **kwargs):
+ preprocess_params = {}
+ if "timeout" in kwargs:
+ preprocess_params["timeout"] = kwargs["timeout"]
+ postprocess_params = {}
+ if "threshold" in kwargs:
+ postprocess_params["threshold"] = kwargs["threshold"]
+ if "top_k" in kwargs:
+ postprocess_params["top_k"] = kwargs["top_k"]
+ return preprocess_params, {}, postprocess_params
+
+ def preprocess(self, inputs, timeout=None):
+ image = load_image(inputs["image"], timeout=timeout)
+ candidate_labels = inputs["candidate_labels"]
+ if isinstance(candidate_labels, str):
+ candidate_labels = candidate_labels.split(",")
+
+ target_size = torch.tensor([[image.height, image.width]], dtype=torch.int32)
+ for i, candidate_label in enumerate(candidate_labels):
+ text_inputs = self.tokenizer(candidate_label, return_tensors=self.framework)
+ image_features = self.image_processor(image, return_tensors=self.framework)
+ if self.framework == "pt":
+ image_features = image_features.to(self.torch_dtype)
+ yield {
+ "is_last": i == len(candidate_labels) - 1,
+ "target_size": target_size,
+ "candidate_label": candidate_label,
+ **text_inputs,
+ **image_features,
+ }
+
+ def _forward(self, model_inputs):
+ target_size = model_inputs.pop("target_size")
+ candidate_label = model_inputs.pop("candidate_label")
+ is_last = model_inputs.pop("is_last")
+
+ outputs = self.model(**model_inputs)
+
+ model_outputs = {"target_size": target_size, "candidate_label": candidate_label, "is_last": is_last, **outputs}
+ return model_outputs
+
+ def postprocess(self, model_outputs, threshold=0.1, top_k=None):
+ results = []
+ for model_output in model_outputs:
+ label = model_output["candidate_label"]
+ model_output = BaseModelOutput(model_output)
+ outputs = self.image_processor.post_process_object_detection(
+ outputs=model_output, threshold=threshold, target_sizes=model_output["target_size"]
+ )[0]
+
+ for index in outputs["scores"].nonzero():
+ score = outputs["scores"][index].item()
+ box = self._get_bounding_box(outputs["boxes"][index][0])
+
+ result = {"score": score, "label": label, "box": box}
+ results.append(result)
+
+ results = sorted(results, key=lambda x: x["score"], reverse=True)
+ if top_k:
+ results = results[:top_k]
+
+ return results
+
+ def _get_bounding_box(self, box: "torch.Tensor") -> Dict[str, int]:
+ """
+ Turns list [xmin, ymin, xmax, ymax] into dict { "xmin": xmin, ... }
+
+ Args:
+ box (`torch.Tensor`): Tensor containing the coordinates in corners format.
+
+ Returns:
+ bbox (`Dict[str, int]`): Dict containing the coordinates in corners format.
+ """
+ if self.framework != "pt":
+ raise ValueError("The ZeroShotObjectDetectionPipeline is only available in PyTorch.")
+ xmin, ymin, xmax, ymax = box.int().tolist()
+ bbox = {
+ "xmin": xmin,
+ "ymin": ymin,
+ "xmax": xmax,
+ "ymax": ymax,
+ }
+ return bbox
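For reference, the `threshold` and `top_k` knobs documented above can be combined in a single call. A minimal usage sketch reusing the COCO image from the docstring; the label set and the concrete threshold/top_k values are illustrative, not prescribed.

```python
from transformers import pipeline

detector = pipeline(model="google/owlvit-base-patch32", task="zero-shot-object-detection")
predictions = detector(
    "http://images.cocodataset.org/val2017/000000039769.jpg",
    candidate_labels=["cat", "remote control", "couch"],
    threshold=0.2,  # drop boxes scored below 0.2
    top_k=5,        # keep at most the five highest-scoring boxes
)
for prediction in predictions:
    box = prediction["box"]
    print(
        f'{prediction["label"]} ({prediction["score"]:.2f}): '
        f'[{box["xmin"]}, {box["ymin"]}, {box["xmax"]}, {box["ymax"]}]'
    )
```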
diff --git a/.venv/lib/python3.11/site-packages/transformers/sagemaker/__init__.py b/.venv/lib/python3.11/site-packages/transformers/sagemaker/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..98fe38de89cd025911d03669f9e22b03ab0768bd
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/sagemaker/__init__.py
@@ -0,0 +1,16 @@
+# Copyright 2021 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from .trainer_sm import SageMakerTrainer
+from .training_args_sm import SageMakerTrainingArguments, is_sagemaker_dp_enabled
diff --git a/.venv/lib/python3.11/site-packages/transformers/sagemaker/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/sagemaker/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..099e01a2157fc76f0966eba131749abed573c936
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/sagemaker/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/sagemaker/__pycache__/trainer_sm.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/sagemaker/__pycache__/trainer_sm.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a88bfd9fa5aaca8b9e4ab2a9039c24821a8f6931
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/sagemaker/__pycache__/trainer_sm.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/sagemaker/__pycache__/training_args_sm.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/sagemaker/__pycache__/training_args_sm.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..364e39f0f6299f4340252c5fea617553c18a8087
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/sagemaker/__pycache__/training_args_sm.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/transformers/sagemaker/trainer_sm.py b/.venv/lib/python3.11/site-packages/transformers/sagemaker/trainer_sm.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ab4e01acdbcd3ade1afc2339a75850bc538bd7a
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/sagemaker/trainer_sm.py
@@ -0,0 +1,30 @@
+# Copyright 2021 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import warnings
+
+from ..trainer import Trainer
+from ..utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class SageMakerTrainer(Trainer):
+ def __init__(self, args=None, **kwargs):
+ warnings.warn(
+ "`SageMakerTrainer` is deprecated and will be removed in v5 of Transformers. You can use `Trainer` "
+ "instead.",
+ FutureWarning,
+ )
+ super().__init__(args=args, **kwargs)
diff --git a/.venv/lib/python3.11/site-packages/transformers/sagemaker/training_args_sm.py b/.venv/lib/python3.11/site-packages/transformers/sagemaker/training_args_sm.py
new file mode 100644
index 0000000000000000000000000000000000000000..3daac7859b550de31f211a5e7c9938d8d557fc4c
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/transformers/sagemaker/training_args_sm.py
@@ -0,0 +1,136 @@
+# Copyright 2021 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import importlib.util
+import json
+import os
+import warnings
+from dataclasses import dataclass, field
+
+import torch
+
+from ..training_args import TrainingArguments
+from ..utils import cached_property, is_sagemaker_dp_enabled, logging
+
+
+logger = logging.get_logger(__name__)
+
+# TODO: should be moved to `utils` after refactoring of SageMakerTrainer
+
+
+def is_sagemaker_model_parallel_available():
+ # Get the sagemaker specific mp parameters from smp_options variable.
+ smp_options = os.getenv("SM_HP_MP_PARAMETERS", "{}")
+ try:
+ # Parse it and check the field "partitions" is included, it is required for model parallel.
+ smp_options = json.loads(smp_options)
+ if "partitions" not in smp_options:
+ return False
+ except json.JSONDecodeError:
+ return False
+
+ # Get the sagemaker specific framework parameters from mpi_options variable.
+ mpi_options = os.getenv("SM_FRAMEWORK_PARAMS", "{}")
+ try:
+ # Parse it and check the field "sagemaker_mpi_enabled".
+ mpi_options = json.loads(mpi_options)
+ if not mpi_options.get("sagemaker_mpi_enabled", False):
+ return False
+ except json.JSONDecodeError:
+ return False
+ # Lastly, check if the `smdistributed` module is present.
+ return importlib.util.find_spec("smdistributed") is not None
+
+
+if is_sagemaker_model_parallel_available():
+ import smdistributed.modelparallel.torch as smp
+
+ smp.init()
+
+
+@dataclass
+class SageMakerTrainingArguments(TrainingArguments):
+ mp_parameters: str = field(
+ default="",
+ metadata={"help": "Used by the SageMaker launcher to send mp-specific args. Ignored in SageMakerTrainer"},
+ )
+
+ def __post_init__(self):
+ super().__post_init__()
+ warnings.warn(
+ "`SageMakerTrainingArguments` is deprecated and will be removed in v5 of Transformers. You can use "
+ "`TrainingArguments` instead.",
+ FutureWarning,
+ )
+
+ @cached_property
+ def _setup_devices(self) -> "torch.device":
+ logger.info("PyTorch: setting up devices")
+ if torch.distributed.is_available() and torch.distributed.is_initialized() and self.local_rank == -1:
+ logger.warning(
+ "torch.distributed process group is initialized, but local_rank == -1. "
+ "In order to use Torch DDP, launch your script with `python -m torch.distributed.launch"
+ )
+ if self.no_cuda:
+ device = torch.device("cpu")
+ self._n_gpu = 0
+ elif is_sagemaker_model_parallel_available():
+ local_rank = smp.local_rank()
+ device = torch.device("cuda", local_rank)
+ self._n_gpu = 1
+ elif is_sagemaker_dp_enabled():
+ import smdistributed.dataparallel.torch.torch_smddp # noqa: F401
+
+ torch.distributed.init_process_group(backend="smddp", timeout=self.ddp_timeout_delta)
+ self.local_rank = int(os.getenv("SMDATAPARALLEL_LOCAL_RANK"))
+ device = torch.device("cuda", self.local_rank)
+ self._n_gpu = 1
+ elif self.local_rank == -1:
+ # if n_gpu is > 1 we'll use nn.DataParallel.
+ # If you only want to use a specific subset of GPUs use `CUDA_VISIBLE_DEVICES=0`
+ # Explicitly set CUDA to the first (index 0) CUDA device, otherwise `set_device` will
+ # trigger an error that a device index is missing. Index 0 takes into account the
+ # GPUs available in the environment, so `CUDA_VISIBLE_DEVICES=1,2` with `cuda:0`
+ # will use the first GPU in that env, i.e. GPU#1
+ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+ # Sometimes the line in the postinit has not been run before we end up here, so just checking we're not at
+ # the default value.
+ self._n_gpu = torch.cuda.device_count()
+ else:
+ # Here, we'll use torch.distributed.
+ # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
+ if not torch.distributed.is_initialized():
+ torch.distributed.init_process_group(backend="nccl", timeout=self.ddp_timeout_delta)
+ device = torch.device("cuda", self.local_rank)
+ self._n_gpu = 1
+
+ if device.type == "cuda":
+ torch.cuda.set_device(device)
+
+ return device
+
+ @property
+ def world_size(self):
+ if is_sagemaker_model_parallel_available():
+ return smp.dp_size()
+
+ return super().world_size
+
+ @property
+ def place_model_on_device(self):
+ return not is_sagemaker_model_parallel_available()
+
+ @property
+ def _no_sync_in_gradient_accumulation(self):
+ return False
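The two environment variables parsed by `is_sagemaker_model_parallel_available` carry small JSON blobs. Below is a minimal sketch of the shape the helper looks for; the concrete values ("partitions": 2, etc.) are illustrative only, and the real helper additionally requires the `smdistributed` package to be importable.

```python
import json
import os

# Illustrative environment resembling what the SageMaker launcher would set.
os.environ["SM_HP_MP_PARAMETERS"] = json.dumps({"partitions": 2, "microbatches": 4})
os.environ["SM_FRAMEWORK_PARAMS"] = json.dumps({"sagemaker_mpi_enabled": True})

# Re-implementing the two JSON checks from above for illustration:
mp_ok = "partitions" in json.loads(os.environ["SM_HP_MP_PARAMETERS"])
mpi_ok = json.loads(os.environ["SM_FRAMEWORK_PARAMS"]).get("sagemaker_mpi_enabled", False)
print(mp_ok and mpi_ok)  # True here; the real helper still returns False without `smdistributed`
```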