diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/image_processing_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/image_processing_utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a5725153d17701ba387c6681c3bdd6291bae9d9f Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/image_processing_utils.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/image_transforms.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/image_transforms.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3f8b31a738e3eae16ffb94bc75d499450fcebfb0 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/image_transforms.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/modelcard.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/modelcard.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c589a78197c19eb463bdec2f981bb82c58e6e431 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/modelcard.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/transformers/__pycache__/training_args_tf.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/__pycache__/training_args_tf.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f88c77252185ad0e4892fb712861eaad491e1c09 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/__pycache__/training_args_tf.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..dadeab816cc49a8b6a3dcbfba77183f935428be1 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/beam_constraints.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/beam_constraints.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cac9a2ef5255abae0588c8e41bd69d4eef83e86b Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/beam_constraints.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/beam_search.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/beam_search.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9890b9a6e459e22a1bc8611dcb8031cf9f60cfa2 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/generation/__pycache__/beam_search.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__init__.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..257f5689b0ed71afd8560aeb183f4e47beb03d47 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/__init__.py @@ -0,0 +1,1178 @@ +# coding=utf-8 +# Copyright 2018 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import json +import os +import warnings +from pathlib import Path +from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union + +from huggingface_hub import model_info + +from ..configuration_utils import PretrainedConfig +from ..dynamic_module_utils import get_class_from_dynamic_module +from ..feature_extraction_utils import PreTrainedFeatureExtractor +from ..image_processing_utils import BaseImageProcessor +from ..models.auto.configuration_auto import AutoConfig +from ..models.auto.feature_extraction_auto import FEATURE_EXTRACTOR_MAPPING, AutoFeatureExtractor +from ..models.auto.image_processing_auto import IMAGE_PROCESSOR_MAPPING, AutoImageProcessor +from ..models.auto.modeling_auto import AutoModelForDepthEstimation, AutoModelForImageToImage +from ..models.auto.processing_auto import PROCESSOR_MAPPING, AutoProcessor +from ..models.auto.tokenization_auto import TOKENIZER_MAPPING, AutoTokenizer +from ..processing_utils import ProcessorMixin +from ..tokenization_utils import PreTrainedTokenizer +from ..utils import ( + CONFIG_NAME, + HUGGINGFACE_CO_RESOLVE_ENDPOINT, + cached_file, + extract_commit_hash, + find_adapter_config_file, + is_kenlm_available, + is_offline_mode, + is_peft_available, + is_pyctcdecode_available, + is_tf_available, + is_torch_available, + logging, +) +from .audio_classification import AudioClassificationPipeline +from .automatic_speech_recognition import AutomaticSpeechRecognitionPipeline +from .base import ( + ArgumentHandler, + CsvPipelineDataFormat, + JsonPipelineDataFormat, + PipedPipelineDataFormat, + Pipeline, + 
PipelineDataFormat, + PipelineException, + PipelineRegistry, + get_default_model_and_revision, + infer_framework_load_model, +) +from .depth_estimation import DepthEstimationPipeline +from .document_question_answering import DocumentQuestionAnsweringPipeline +from .feature_extraction import FeatureExtractionPipeline +from .fill_mask import FillMaskPipeline +from .image_classification import ImageClassificationPipeline +from .image_feature_extraction import ImageFeatureExtractionPipeline +from .image_segmentation import ImageSegmentationPipeline +from .image_text_to_text import ImageTextToTextPipeline +from .image_to_image import ImageToImagePipeline +from .image_to_text import ImageToTextPipeline +from .mask_generation import MaskGenerationPipeline +from .object_detection import ObjectDetectionPipeline +from .question_answering import QuestionAnsweringArgumentHandler, QuestionAnsweringPipeline +from .table_question_answering import TableQuestionAnsweringArgumentHandler, TableQuestionAnsweringPipeline +from .text2text_generation import SummarizationPipeline, Text2TextGenerationPipeline, TranslationPipeline +from .text_classification import TextClassificationPipeline +from .text_generation import TextGenerationPipeline +from .text_to_audio import TextToAudioPipeline +from .token_classification import ( + AggregationStrategy, + NerPipeline, + TokenClassificationArgumentHandler, + TokenClassificationPipeline, +) +from .video_classification import VideoClassificationPipeline +from .visual_question_answering import VisualQuestionAnsweringPipeline +from .zero_shot_audio_classification import ZeroShotAudioClassificationPipeline +from .zero_shot_classification import ZeroShotClassificationArgumentHandler, ZeroShotClassificationPipeline +from .zero_shot_image_classification import ZeroShotImageClassificationPipeline +from .zero_shot_object_detection import ZeroShotObjectDetectionPipeline + + +if is_tf_available(): + import tensorflow as tf + + from 
..models.auto.modeling_tf_auto import ( + TFAutoModel, + TFAutoModelForCausalLM, + TFAutoModelForImageClassification, + TFAutoModelForMaskedLM, + TFAutoModelForQuestionAnswering, + TFAutoModelForSeq2SeqLM, + TFAutoModelForSequenceClassification, + TFAutoModelForTableQuestionAnswering, + TFAutoModelForTokenClassification, + TFAutoModelForVision2Seq, + TFAutoModelForZeroShotImageClassification, + ) + +if is_torch_available(): + import torch + + from ..models.auto.modeling_auto import ( + AutoModel, + AutoModelForAudioClassification, + AutoModelForCausalLM, + AutoModelForCTC, + AutoModelForDocumentQuestionAnswering, + AutoModelForImageClassification, + AutoModelForImageSegmentation, + AutoModelForImageTextToText, + AutoModelForMaskedLM, + AutoModelForMaskGeneration, + AutoModelForObjectDetection, + AutoModelForQuestionAnswering, + AutoModelForSemanticSegmentation, + AutoModelForSeq2SeqLM, + AutoModelForSequenceClassification, + AutoModelForSpeechSeq2Seq, + AutoModelForTableQuestionAnswering, + AutoModelForTextToSpectrogram, + AutoModelForTextToWaveform, + AutoModelForTokenClassification, + AutoModelForVideoClassification, + AutoModelForVision2Seq, + AutoModelForVisualQuestionAnswering, + AutoModelForZeroShotImageClassification, + AutoModelForZeroShotObjectDetection, + ) + + +if TYPE_CHECKING: + from ..modeling_tf_utils import TFPreTrainedModel + from ..modeling_utils import PreTrainedModel + from ..tokenization_utils_fast import PreTrainedTokenizerFast + + +logger = logging.get_logger(__name__) + + +# Register all the supported tasks here +TASK_ALIASES = { + "sentiment-analysis": "text-classification", + "ner": "token-classification", + "vqa": "visual-question-answering", + "text-to-speech": "text-to-audio", +} +SUPPORTED_TASKS = { + "audio-classification": { + "impl": AudioClassificationPipeline, + "tf": (), + "pt": (AutoModelForAudioClassification,) if is_torch_available() else (), + "default": {"model": {"pt": ("superb/wav2vec2-base-superb-ks", "372e048")}}, + 
"type": "audio", + }, + "automatic-speech-recognition": { + "impl": AutomaticSpeechRecognitionPipeline, + "tf": (), + "pt": (AutoModelForCTC, AutoModelForSpeechSeq2Seq) if is_torch_available() else (), + "default": {"model": {"pt": ("facebook/wav2vec2-base-960h", "22aad52")}}, + "type": "multimodal", + }, + "text-to-audio": { + "impl": TextToAudioPipeline, + "tf": (), + "pt": (AutoModelForTextToWaveform, AutoModelForTextToSpectrogram) if is_torch_available() else (), + "default": {"model": {"pt": ("suno/bark-small", "1dbd7a1")}}, + "type": "text", + }, + "feature-extraction": { + "impl": FeatureExtractionPipeline, + "tf": (TFAutoModel,) if is_tf_available() else (), + "pt": (AutoModel,) if is_torch_available() else (), + "default": { + "model": { + "pt": ("distilbert/distilbert-base-cased", "6ea8117"), + "tf": ("distilbert/distilbert-base-cased", "6ea8117"), + } + }, + "type": "multimodal", + }, + "text-classification": { + "impl": TextClassificationPipeline, + "tf": (TFAutoModelForSequenceClassification,) if is_tf_available() else (), + "pt": (AutoModelForSequenceClassification,) if is_torch_available() else (), + "default": { + "model": { + "pt": ("distilbert/distilbert-base-uncased-finetuned-sst-2-english", "714eb0f"), + "tf": ("distilbert/distilbert-base-uncased-finetuned-sst-2-english", "714eb0f"), + }, + }, + "type": "text", + }, + "token-classification": { + "impl": TokenClassificationPipeline, + "tf": (TFAutoModelForTokenClassification,) if is_tf_available() else (), + "pt": (AutoModelForTokenClassification,) if is_torch_available() else (), + "default": { + "model": { + "pt": ("dbmdz/bert-large-cased-finetuned-conll03-english", "4c53496"), + "tf": ("dbmdz/bert-large-cased-finetuned-conll03-english", "4c53496"), + }, + }, + "type": "text", + }, + "question-answering": { + "impl": QuestionAnsweringPipeline, + "tf": (TFAutoModelForQuestionAnswering,) if is_tf_available() else (), + "pt": (AutoModelForQuestionAnswering,) if is_torch_available() else (), + 
"default": { + "model": { + "pt": ("distilbert/distilbert-base-cased-distilled-squad", "564e9b5"), + "tf": ("distilbert/distilbert-base-cased-distilled-squad", "564e9b5"), + }, + }, + "type": "text", + }, + "table-question-answering": { + "impl": TableQuestionAnsweringPipeline, + "pt": (AutoModelForTableQuestionAnswering,) if is_torch_available() else (), + "tf": (TFAutoModelForTableQuestionAnswering,) if is_tf_available() else (), + "default": { + "model": { + "pt": ("google/tapas-base-finetuned-wtq", "e3dde19"), + "tf": ("google/tapas-base-finetuned-wtq", "e3dde19"), + }, + }, + "type": "text", + }, + "visual-question-answering": { + "impl": VisualQuestionAnsweringPipeline, + "pt": (AutoModelForVisualQuestionAnswering,) if is_torch_available() else (), + "tf": (), + "default": { + "model": {"pt": ("dandelin/vilt-b32-finetuned-vqa", "d0a1f6a")}, + }, + "type": "multimodal", + }, + "document-question-answering": { + "impl": DocumentQuestionAnsweringPipeline, + "pt": (AutoModelForDocumentQuestionAnswering,) if is_torch_available() else (), + "tf": (), + "default": { + "model": {"pt": ("impira/layoutlm-document-qa", "beed3c4")}, + }, + "type": "multimodal", + }, + "fill-mask": { + "impl": FillMaskPipeline, + "tf": (TFAutoModelForMaskedLM,) if is_tf_available() else (), + "pt": (AutoModelForMaskedLM,) if is_torch_available() else (), + "default": { + "model": { + "pt": ("distilbert/distilroberta-base", "fb53ab8"), + "tf": ("distilbert/distilroberta-base", "fb53ab8"), + } + }, + "type": "text", + }, + "summarization": { + "impl": SummarizationPipeline, + "tf": (TFAutoModelForSeq2SeqLM,) if is_tf_available() else (), + "pt": (AutoModelForSeq2SeqLM,) if is_torch_available() else (), + "default": { + "model": {"pt": ("sshleifer/distilbart-cnn-12-6", "a4f8f3e"), "tf": ("google-t5/t5-small", "df1b051")} + }, + "type": "text", + }, + # This task is a special case as it's parametrized by SRC, TGT languages. 
+ "translation": { + "impl": TranslationPipeline, + "tf": (TFAutoModelForSeq2SeqLM,) if is_tf_available() else (), + "pt": (AutoModelForSeq2SeqLM,) if is_torch_available() else (), + "default": { + ("en", "fr"): {"model": {"pt": ("google-t5/t5-base", "a9723ea"), "tf": ("google-t5/t5-base", "a9723ea")}}, + ("en", "de"): {"model": {"pt": ("google-t5/t5-base", "a9723ea"), "tf": ("google-t5/t5-base", "a9723ea")}}, + ("en", "ro"): {"model": {"pt": ("google-t5/t5-base", "a9723ea"), "tf": ("google-t5/t5-base", "a9723ea")}}, + }, + "type": "text", + }, + "text2text-generation": { + "impl": Text2TextGenerationPipeline, + "tf": (TFAutoModelForSeq2SeqLM,) if is_tf_available() else (), + "pt": (AutoModelForSeq2SeqLM,) if is_torch_available() else (), + "default": {"model": {"pt": ("google-t5/t5-base", "a9723ea"), "tf": ("google-t5/t5-base", "a9723ea")}}, + "type": "text", + }, + "text-generation": { + "impl": TextGenerationPipeline, + "tf": (TFAutoModelForCausalLM,) if is_tf_available() else (), + "pt": (AutoModelForCausalLM,) if is_torch_available() else (), + "default": {"model": {"pt": ("openai-community/gpt2", "607a30d"), "tf": ("openai-community/gpt2", "607a30d")}}, + "type": "text", + }, + "zero-shot-classification": { + "impl": ZeroShotClassificationPipeline, + "tf": (TFAutoModelForSequenceClassification,) if is_tf_available() else (), + "pt": (AutoModelForSequenceClassification,) if is_torch_available() else (), + "default": { + "model": { + "pt": ("facebook/bart-large-mnli", "d7645e1"), + "tf": ("FacebookAI/roberta-large-mnli", "2a8f12d"), + }, + "config": { + "pt": ("facebook/bart-large-mnli", "d7645e1"), + "tf": ("FacebookAI/roberta-large-mnli", "2a8f12d"), + }, + }, + "type": "text", + }, + "zero-shot-image-classification": { + "impl": ZeroShotImageClassificationPipeline, + "tf": (TFAutoModelForZeroShotImageClassification,) if is_tf_available() else (), + "pt": (AutoModelForZeroShotImageClassification,) if is_torch_available() else (), + "default": { + "model": { + 
"pt": ("openai/clip-vit-base-patch32", "3d74acf"), + "tf": ("openai/clip-vit-base-patch32", "3d74acf"), + } + }, + "type": "multimodal", + }, + "zero-shot-audio-classification": { + "impl": ZeroShotAudioClassificationPipeline, + "tf": (), + "pt": (AutoModel,) if is_torch_available() else (), + "default": { + "model": { + "pt": ("laion/clap-htsat-fused", "cca9e28"), + } + }, + "type": "multimodal", + }, + "image-classification": { + "impl": ImageClassificationPipeline, + "tf": (TFAutoModelForImageClassification,) if is_tf_available() else (), + "pt": (AutoModelForImageClassification,) if is_torch_available() else (), + "default": { + "model": { + "pt": ("google/vit-base-patch16-224", "3f49326"), + "tf": ("google/vit-base-patch16-224", "3f49326"), + } + }, + "type": "image", + }, + "image-feature-extraction": { + "impl": ImageFeatureExtractionPipeline, + "tf": (TFAutoModel,) if is_tf_available() else (), + "pt": (AutoModel,) if is_torch_available() else (), + "default": { + "model": { + "pt": ("google/vit-base-patch16-224", "3f49326"), + "tf": ("google/vit-base-patch16-224", "3f49326"), + } + }, + "type": "image", + }, + "image-segmentation": { + "impl": ImageSegmentationPipeline, + "tf": (), + "pt": (AutoModelForImageSegmentation, AutoModelForSemanticSegmentation) if is_torch_available() else (), + "default": {"model": {"pt": ("facebook/detr-resnet-50-panoptic", "d53b52a")}}, + "type": "multimodal", + }, + "image-to-text": { + "impl": ImageToTextPipeline, + "tf": (TFAutoModelForVision2Seq,) if is_tf_available() else (), + "pt": (AutoModelForVision2Seq,) if is_torch_available() else (), + "default": { + "model": { + "pt": ("ydshieh/vit-gpt2-coco-en", "5bebf1e"), + "tf": ("ydshieh/vit-gpt2-coco-en", "5bebf1e"), + } + }, + "type": "multimodal", + }, + "image-text-to-text": { + "impl": ImageTextToTextPipeline, + "tf": (), + "pt": (AutoModelForImageTextToText,) if is_torch_available() else (), + "default": { + "model": { + "pt": 
("llava-hf/llava-onevision-qwen2-0.5b-ov-hf", "2c9ba3b"), + } + }, + "type": "multimodal", + }, + "object-detection": { + "impl": ObjectDetectionPipeline, + "tf": (), + "pt": (AutoModelForObjectDetection,) if is_torch_available() else (), + "default": {"model": {"pt": ("facebook/detr-resnet-50", "1d5f47b")}}, + "type": "multimodal", + }, + "zero-shot-object-detection": { + "impl": ZeroShotObjectDetectionPipeline, + "tf": (), + "pt": (AutoModelForZeroShotObjectDetection,) if is_torch_available() else (), + "default": {"model": {"pt": ("google/owlvit-base-patch32", "cbc355f")}}, + "type": "multimodal", + }, + "depth-estimation": { + "impl": DepthEstimationPipeline, + "tf": (), + "pt": (AutoModelForDepthEstimation,) if is_torch_available() else (), + "default": {"model": {"pt": ("Intel/dpt-large", "bc15f29")}}, + "type": "image", + }, + "video-classification": { + "impl": VideoClassificationPipeline, + "tf": (), + "pt": (AutoModelForVideoClassification,) if is_torch_available() else (), + "default": {"model": {"pt": ("MCG-NJU/videomae-base-finetuned-kinetics", "488eb9a")}}, + "type": "video", + }, + "mask-generation": { + "impl": MaskGenerationPipeline, + "tf": (), + "pt": (AutoModelForMaskGeneration,) if is_torch_available() else (), + "default": {"model": {"pt": ("facebook/sam-vit-huge", "87aecf0")}}, + "type": "multimodal", + }, + "image-to-image": { + "impl": ImageToImagePipeline, + "tf": (), + "pt": (AutoModelForImageToImage,) if is_torch_available() else (), + "default": {"model": {"pt": ("caidas/swin2SR-classical-sr-x2-64", "cee1c92")}}, + "type": "image", + }, +} + +NO_FEATURE_EXTRACTOR_TASKS = set() +NO_IMAGE_PROCESSOR_TASKS = set() +NO_TOKENIZER_TASKS = set() + +# Those model configs are special, they are generic over their task, meaning +# any tokenizer/feature_extractor might be use for a given model so we cannot +# use the statically defined TOKENIZER_MAPPING and FEATURE_EXTRACTOR_MAPPING to +# see if the model defines such objects or not. 
+MULTI_MODEL_AUDIO_CONFIGS = {"SpeechEncoderDecoderConfig"} +MULTI_MODEL_VISION_CONFIGS = {"VisionEncoderDecoderConfig", "VisionTextDualEncoderConfig"} +for task, values in SUPPORTED_TASKS.items(): + if values["type"] == "text": + NO_FEATURE_EXTRACTOR_TASKS.add(task) + NO_IMAGE_PROCESSOR_TASKS.add(task) + elif values["type"] in {"image", "video"}: + NO_TOKENIZER_TASKS.add(task) + elif values["type"] in {"audio"}: + NO_TOKENIZER_TASKS.add(task) + NO_IMAGE_PROCESSOR_TASKS.add(task) + elif values["type"] != "multimodal": + raise ValueError(f"SUPPORTED_TASK {task} contains invalid type {values['type']}") + +PIPELINE_REGISTRY = PipelineRegistry(supported_tasks=SUPPORTED_TASKS, task_aliases=TASK_ALIASES) + + +def get_supported_tasks() -> List[str]: + """ + Returns a list of supported task strings. + """ + return PIPELINE_REGISTRY.get_supported_tasks() + + +def get_task(model: str, token: Optional[str] = None, **deprecated_kwargs) -> str: + use_auth_token = deprecated_kwargs.pop("use_auth_token", None) + if use_auth_token is not None: + warnings.warn( + "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.", + FutureWarning, + ) + if token is not None: + raise ValueError("`token` and `use_auth_token` are both specified. 
Please set only the argument `token`.") + token = use_auth_token + + if is_offline_mode(): + raise RuntimeError("You cannot infer task automatically within `pipeline` when using offline mode") + try: + info = model_info(model, token=token) + except Exception as e: + raise RuntimeError(f"Instantiating a pipeline without a task set raised an error: {e}") + if not info.pipeline_tag: + raise RuntimeError( + f"The model {model} does not seem to have a correct `pipeline_tag` set to infer the task automatically" + ) + if getattr(info, "library_name", "transformers") not in {"transformers", "timm"}: + raise RuntimeError(f"This model is meant to be used with {info.library_name} not with transformers") + task = info.pipeline_tag + return task + + +def check_task(task: str) -> Tuple[str, Dict, Any]: + """ + Checks an incoming task string, to validate it's correct and return the default Pipeline and Model classes, and + default models if they exist. + + Args: + task (`str`): + The task defining which pipeline will be returned. 
Currently accepted tasks are: + + - `"audio-classification"` + - `"automatic-speech-recognition"` + - `"conversational"` + - `"depth-estimation"` + - `"document-question-answering"` + - `"feature-extraction"` + - `"fill-mask"` + - `"image-classification"` + - `"image-feature-extraction"` + - `"image-segmentation"` + - `"image-to-text"` + - `"image-to-image"` + - `"object-detection"` + - `"question-answering"` + - `"summarization"` + - `"table-question-answering"` + - `"text2text-generation"` + - `"text-classification"` (alias `"sentiment-analysis"` available) + - `"text-generation"` + - `"text-to-audio"` (alias `"text-to-speech"` available) + - `"token-classification"` (alias `"ner"` available) + - `"translation"` + - `"translation_xx_to_yy"` + - `"video-classification"` + - `"visual-question-answering"` (alias `"vqa"` available) + - `"zero-shot-classification"` + - `"zero-shot-image-classification"` + - `"zero-shot-object-detection"` + + Returns: + (normalized_task: `str`, task_defaults: `dict`, task_options: (`tuple`, None)) The normalized task name + (removed alias and options). 
The actual dictionary required to initialize the pipeline and some extra task + options for parametrized tasks like "translation_XX_to_YY" + + + """ + return PIPELINE_REGISTRY.check_task(task) + + +def clean_custom_task(task_info): + import transformers + + if "impl" not in task_info: + raise RuntimeError("This model introduces a custom pipeline without specifying its implementation.") + pt_class_names = task_info.get("pt", ()) + if isinstance(pt_class_names, str): + pt_class_names = [pt_class_names] + task_info["pt"] = tuple(getattr(transformers, c) for c in pt_class_names) + tf_class_names = task_info.get("tf", ()) + if isinstance(tf_class_names, str): + tf_class_names = [tf_class_names] + task_info["tf"] = tuple(getattr(transformers, c) for c in tf_class_names) + return task_info, None + + +def pipeline( + task: str = None, + model: Optional[Union[str, "PreTrainedModel", "TFPreTrainedModel"]] = None, + config: Optional[Union[str, PretrainedConfig]] = None, + tokenizer: Optional[Union[str, PreTrainedTokenizer, "PreTrainedTokenizerFast"]] = None, + feature_extractor: Optional[Union[str, PreTrainedFeatureExtractor]] = None, + image_processor: Optional[Union[str, BaseImageProcessor]] = None, + processor: Optional[Union[str, ProcessorMixin]] = None, + framework: Optional[str] = None, + revision: Optional[str] = None, + use_fast: bool = True, + token: Optional[Union[str, bool]] = None, + device: Optional[Union[int, str, "torch.device"]] = None, + device_map=None, + torch_dtype=None, + trust_remote_code: Optional[bool] = None, + model_kwargs: Dict[str, Any] = None, + pipeline_class: Optional[Any] = None, + **kwargs, +) -> Pipeline: + """ + Utility factory method to build a [`Pipeline`]. + + A pipeline consists of: + + - One or more components for pre-processing model inputs, such as a [tokenizer](tokenizer), + [image_processor](image_processor), [feature_extractor](feature_extractor), or [processor](processors). 
+ - A [model](model) that generates predictions from the inputs. + - Optional post-processing steps to refine the model's output, which can also be handled by processors. + + + While there are such optional arguments as `tokenizer`, `feature_extractor`, `image_processor`, and `processor`, + they shouldn't be specified all at once. If these components are not provided, `pipeline` will try to load + required ones automatically. In case you want to provide these components explicitly, please refer to a + specific pipeline in order to get more details regarding what components are required. + + + Args: + task (`str`): + The task defining which pipeline will be returned. Currently accepted tasks are: + + - `"audio-classification"`: will return a [`AudioClassificationPipeline`]. + - `"automatic-speech-recognition"`: will return a [`AutomaticSpeechRecognitionPipeline`]. + - `"depth-estimation"`: will return a [`DepthEstimationPipeline`]. + - `"document-question-answering"`: will return a [`DocumentQuestionAnsweringPipeline`]. + - `"feature-extraction"`: will return a [`FeatureExtractionPipeline`]. + - `"fill-mask"`: will return a [`FillMaskPipeline`]:. + - `"image-classification"`: will return a [`ImageClassificationPipeline`]. + - `"image-feature-extraction"`: will return an [`ImageFeatureExtractionPipeline`]. + - `"image-segmentation"`: will return a [`ImageSegmentationPipeline`]. + - `"image-text-to-text"`: will return a [`ImageTextToTextPipeline`]. + - `"image-to-image"`: will return a [`ImageToImagePipeline`]. + - `"image-to-text"`: will return a [`ImageToTextPipeline`]. + - `"mask-generation"`: will return a [`MaskGenerationPipeline`]. + - `"object-detection"`: will return a [`ObjectDetectionPipeline`]. + - `"question-answering"`: will return a [`QuestionAnsweringPipeline`]. + - `"summarization"`: will return a [`SummarizationPipeline`]. + - `"table-question-answering"`: will return a [`TableQuestionAnsweringPipeline`]. 
+ - `"text2text-generation"`: will return a [`Text2TextGenerationPipeline`]. + - `"text-classification"` (alias `"sentiment-analysis"` available): will return a + [`TextClassificationPipeline`]. + - `"text-generation"`: will return a [`TextGenerationPipeline`]:. + - `"text-to-audio"` (alias `"text-to-speech"` available): will return a [`TextToAudioPipeline`]:. + - `"token-classification"` (alias `"ner"` available): will return a [`TokenClassificationPipeline`]. + - `"translation"`: will return a [`TranslationPipeline`]. + - `"translation_xx_to_yy"`: will return a [`TranslationPipeline`]. + - `"video-classification"`: will return a [`VideoClassificationPipeline`]. + - `"visual-question-answering"`: will return a [`VisualQuestionAnsweringPipeline`]. + - `"zero-shot-classification"`: will return a [`ZeroShotClassificationPipeline`]. + - `"zero-shot-image-classification"`: will return a [`ZeroShotImageClassificationPipeline`]. + - `"zero-shot-audio-classification"`: will return a [`ZeroShotAudioClassificationPipeline`]. + - `"zero-shot-object-detection"`: will return a [`ZeroShotObjectDetectionPipeline`]. + + model (`str` or [`PreTrainedModel`] or [`TFPreTrainedModel`], *optional*): + The model that will be used by the pipeline to make predictions. This can be a model identifier or an + actual instance of a pretrained model inheriting from [`PreTrainedModel`] (for PyTorch) or + [`TFPreTrainedModel`] (for TensorFlow). + + If not provided, the default for the `task` will be loaded. + config (`str` or [`PretrainedConfig`], *optional*): + The configuration that will be used by the pipeline to instantiate the model. This can be a model + identifier or an actual pretrained model configuration inheriting from [`PretrainedConfig`]. + + If not provided, the default configuration file for the requested model will be used. That means that if + `model` is given, its default configuration will be used. 
However, if `model` is not supplied, this + `task`'s default model's config is used instead. + tokenizer (`str` or [`PreTrainedTokenizer`], *optional*): + The tokenizer that will be used by the pipeline to encode data for the model. This can be a model + identifier or an actual pretrained tokenizer inheriting from [`PreTrainedTokenizer`]. + + If not provided, the default tokenizer for the given `model` will be loaded (if it is a string). If `model` + is not specified or not a string, then the default tokenizer for `config` is loaded (if it is a string). + However, if `config` is also not given or not a string, then the default tokenizer for the given `task` + will be loaded. + feature_extractor (`str` or [`PreTrainedFeatureExtractor`], *optional*): + The feature extractor that will be used by the pipeline to encode data for the model. This can be a model + identifier or an actual pretrained feature extractor inheriting from [`PreTrainedFeatureExtractor`]. + + Feature extractors are used for non-NLP models, such as Speech or Vision models as well as multi-modal + models. Multi-modal models will also require a tokenizer to be passed. + + If not provided, the default feature extractor for the given `model` will be loaded (if it is a string). If + `model` is not specified or not a string, then the default feature extractor for `config` is loaded (if it + is a string). However, if `config` is also not given or not a string, then the default feature extractor + for the given `task` will be loaded. + image_processor (`str` or [`BaseImageProcessor`], *optional*): + The image processor that will be used by the pipeline to preprocess images for the model. This can be a + model identifier or an actual image processor inheriting from [`BaseImageProcessor`]. + + Image processors are used for Vision models and multi-modal models that require image inputs. Multi-modal + models will also require a tokenizer to be passed. 
+ + If not provided, the default image processor for the given `model` will be loaded (if it is a string). If + `model` is not specified or not a string, then the default image processor for `config` is loaded (if it is + a string). + processor (`str` or [`ProcessorMixin`], *optional*): + The processor that will be used by the pipeline to preprocess data for the model. This can be a model + identifier or an actual processor inheriting from [`ProcessorMixin`]. + + Processors are used for multi-modal models that require multi-modal inputs, for example, a model that + requires both text and image inputs. + + If not provided, the default processor for the given `model` will be loaded (if it is a string). If `model` + is not specified or not a string, then the default processor for `config` is loaded (if it is a string). + framework (`str`, *optional*): + The framework to use, either `"pt"` for PyTorch or `"tf"` for TensorFlow. The specified framework must be + installed. + + If no framework is specified, will default to the one currently installed. If no framework is specified and + both frameworks are installed, will default to the framework of the `model`, or to PyTorch if no model is + provided. + revision (`str`, *optional*, defaults to `"main"`): + When passing a task name or a string model identifier: The specific model version to use. It can be a + branch name, a tag name, or a commit id, since we use a git-based system for storing models and other + artifacts on huggingface.co, so `revision` can be any identifier allowed by git. + use_fast (`bool`, *optional*, defaults to `True`): + Whether or not to use a Fast tokenizer if possible (a [`PreTrainedTokenizerFast`]). + use_auth_token (`str` or *bool*, *optional*): + The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated + when running `huggingface-cli login` (stored in `~/.huggingface`). 
+ device (`int` or `str` or `torch.device`): + Defines the device (*e.g.*, `"cpu"`, `"cuda:1"`, `"mps"`, or a GPU ordinal rank like `1`) on which this + pipeline will be allocated. + device_map (`str` or `Dict[str, Union[int, str, torch.device]`, *optional*): + Sent directly as `model_kwargs` (just a simpler shortcut). When `accelerate` library is present, set + `device_map="auto"` to compute the most optimized `device_map` automatically (see + [here](https://huggingface.co/docs/accelerate/main/en/package_reference/big_modeling#accelerate.cpu_offload) + for more information). + + + + Do not use `device_map` AND `device` at the same time as they will conflict + + + + torch_dtype (`str` or `torch.dtype`, *optional*): + Sent directly as `model_kwargs` (just a simpler shortcut) to use the available precision for this model + (`torch.float16`, `torch.bfloat16`, ... or `"auto"`). + trust_remote_code (`bool`, *optional*, defaults to `False`): + Whether or not to allow for custom code defined on the Hub in their own modeling, configuration, + tokenization or even pipeline files. This option should only be set to `True` for repositories you trust + and in which you have read the code, as it will execute code present on the Hub on your local machine. + model_kwargs (`Dict[str, Any]`, *optional*): + Additional dictionary of keyword arguments passed along to the model's `from_pretrained(..., + **model_kwargs)` function. + kwargs (`Dict[str, Any]`, *optional*): + Additional keyword arguments passed along to the specific pipeline init (see the documentation for the + corresponding pipeline class for possible values). + + Returns: + [`Pipeline`]: A suitable pipeline for the task. 
+ + Examples: + + ```python + >>> from transformers import pipeline, AutoModelForTokenClassification, AutoTokenizer + + >>> # Sentiment analysis pipeline + >>> analyzer = pipeline("sentiment-analysis") + + >>> # Question answering pipeline, specifying the checkpoint identifier + >>> oracle = pipeline( + ... "question-answering", model="distilbert/distilbert-base-cased-distilled-squad", tokenizer="google-bert/bert-base-cased" + ... ) + + >>> # Named entity recognition pipeline, passing in a specific model and tokenizer + >>> model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english") + >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased") + >>> recognizer = pipeline("ner", model=model, tokenizer=tokenizer) + ```""" + if model_kwargs is None: + model_kwargs = {} + # Make sure we only pass use_auth_token once as a kwarg (it used to be possible to pass it in model_kwargs, + # this is to keep BC). + use_auth_token = model_kwargs.pop("use_auth_token", None) + if use_auth_token is not None: + warnings.warn( + "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.", + FutureWarning, + ) + if token is not None: + raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.") + token = use_auth_token + + code_revision = kwargs.pop("code_revision", None) + commit_hash = kwargs.pop("_commit_hash", None) + + hub_kwargs = { + "revision": revision, + "token": token, + "trust_remote_code": trust_remote_code, + "_commit_hash": commit_hash, + } + + if task is None and model is None: + raise RuntimeError( + "Impossible to instantiate a pipeline without either a task or a model " + "being specified. 
" + "Please provide a task class or a model" + ) + + if model is None and tokenizer is not None: + raise RuntimeError( + "Impossible to instantiate a pipeline with tokenizer specified but not the model as the provided tokenizer" + " may not be compatible with the default model. Please provide a PreTrainedModel class or a" + " path/identifier to a pretrained model when providing tokenizer." + ) + if model is None and feature_extractor is not None: + raise RuntimeError( + "Impossible to instantiate a pipeline with feature_extractor specified but not the model as the provided" + " feature_extractor may not be compatible with the default model. Please provide a PreTrainedModel class" + " or a path/identifier to a pretrained model when providing feature_extractor." + ) + if isinstance(model, Path): + model = str(model) + + if commit_hash is None: + pretrained_model_name_or_path = None + if isinstance(config, str): + pretrained_model_name_or_path = config + elif config is None and isinstance(model, str): + pretrained_model_name_or_path = model + + if not isinstance(config, PretrainedConfig) and pretrained_model_name_or_path is not None: + # We make a call to the config file first (which may be absent) to get the commit hash as soon as possible + resolved_config_file = cached_file( + pretrained_model_name_or_path, + CONFIG_NAME, + _raise_exceptions_for_gated_repo=False, + _raise_exceptions_for_missing_entries=False, + _raise_exceptions_for_connection_errors=False, + cache_dir=model_kwargs.get("cache_dir"), + **hub_kwargs, + ) + hub_kwargs["_commit_hash"] = extract_commit_hash(resolved_config_file, commit_hash) + else: + hub_kwargs["_commit_hash"] = getattr(config, "_commit_hash", None) + + # Config is the primordial information item. 
+ # Instantiate config if needed + if isinstance(config, str): + config = AutoConfig.from_pretrained( + config, _from_pipeline=task, code_revision=code_revision, **hub_kwargs, **model_kwargs + ) + hub_kwargs["_commit_hash"] = config._commit_hash + elif config is None and isinstance(model, str): + # Check for an adapter file in the model path if PEFT is available + if is_peft_available(): + # `find_adapter_config_file` doesn't accept `trust_remote_code` + _hub_kwargs = {k: v for k, v in hub_kwargs.items() if k != "trust_remote_code"} + maybe_adapter_path = find_adapter_config_file( + model, + token=hub_kwargs["token"], + revision=hub_kwargs["revision"], + _commit_hash=hub_kwargs["_commit_hash"], + ) + + if maybe_adapter_path is not None: + with open(maybe_adapter_path, "r", encoding="utf-8") as f: + adapter_config = json.load(f) + model = adapter_config["base_model_name_or_path"] + + config = AutoConfig.from_pretrained( + model, _from_pipeline=task, code_revision=code_revision, **hub_kwargs, **model_kwargs + ) + hub_kwargs["_commit_hash"] = config._commit_hash + + custom_tasks = {} + if config is not None and len(getattr(config, "custom_pipelines", {})) > 0: + custom_tasks = config.custom_pipelines + if task is None and trust_remote_code is not False: + if len(custom_tasks) == 1: + task = list(custom_tasks.keys())[0] + else: + raise RuntimeError( + "We can't infer the task automatically for this model as there are multiple tasks available. Pick " + f"one in {', '.join(custom_tasks.keys())}" + ) + + if task is None and model is not None: + if not isinstance(model, str): + raise RuntimeError( + "Inferring the task automatically requires to check the hub with a model_id defined as a `str`. " + f"{model} is not a valid model_id." 
+ ) + task = get_task(model, token) + + # Retrieve the task + if task in custom_tasks: + normalized_task = task + targeted_task, task_options = clean_custom_task(custom_tasks[task]) + if pipeline_class is None: + if not trust_remote_code: + raise ValueError( + "Loading this pipeline requires you to execute the code in the pipeline file in that" + " repo on your local machine. Make sure you have read the code there to avoid malicious use, then" + " set the option `trust_remote_code=True` to remove this error." + ) + class_ref = targeted_task["impl"] + pipeline_class = get_class_from_dynamic_module( + class_ref, + model, + code_revision=code_revision, + **hub_kwargs, + ) + else: + normalized_task, targeted_task, task_options = check_task(task) + if pipeline_class is None: + pipeline_class = targeted_task["impl"] + + # Use default model/config/tokenizer for the task if no model is provided + if model is None: + # At that point framework might still be undetermined + model, default_revision = get_default_model_and_revision(targeted_task, framework, task_options) + revision = revision if revision is not None else default_revision + logger.warning( + f"No model was supplied, defaulted to {model} and revision" + f" {revision} ({HUGGINGFACE_CO_RESOLVE_ENDPOINT}/{model}).\n" + "Using a pipeline without specifying a model name and revision in production is not recommended." + ) + hub_kwargs["revision"] = revision + if config is None and isinstance(model, str): + config = AutoConfig.from_pretrained(model, _from_pipeline=task, **hub_kwargs, **model_kwargs) + hub_kwargs["_commit_hash"] = config._commit_hash + + if device_map is not None: + if "device_map" in model_kwargs: + raise ValueError( + 'You cannot use both `pipeline(... device_map=..., model_kwargs={"device_map":...})` as those' + " arguments might conflict, use only one.)" + ) + if device is not None: + logger.warning( + "Both `device` and `device_map` are specified. `device` will override `device_map`. 
You" + " will most likely encounter unexpected behavior. Please remove `device` and keep `device_map`." + ) + model_kwargs["device_map"] = device_map + if torch_dtype is not None: + if "torch_dtype" in model_kwargs: + raise ValueError( + 'You cannot use both `pipeline(... torch_dtype=..., model_kwargs={"torch_dtype":...})` as those' + " arguments might conflict, use only one.)" + ) + if isinstance(torch_dtype, str) and hasattr(torch, torch_dtype): + torch_dtype = getattr(torch, torch_dtype) + model_kwargs["torch_dtype"] = torch_dtype + + model_name = model if isinstance(model, str) else None + + # Load the correct model if possible + # Infer the framework from the model if not already defined + if isinstance(model, str) or framework is None: + model_classes = {"tf": targeted_task["tf"], "pt": targeted_task["pt"]} + framework, model = infer_framework_load_model( + model, + model_classes=model_classes, + config=config, + framework=framework, + task=task, + **hub_kwargs, + **model_kwargs, + ) + + model_config = model.config + hub_kwargs["_commit_hash"] = model.config._commit_hash + + load_tokenizer = type(model_config) in TOKENIZER_MAPPING or model_config.tokenizer_class is not None + load_feature_extractor = type(model_config) in FEATURE_EXTRACTOR_MAPPING or feature_extractor is not None + load_image_processor = type(model_config) in IMAGE_PROCESSOR_MAPPING or image_processor is not None + load_processor = type(model_config) in PROCESSOR_MAPPING or processor is not None + + # Check that pipeline class required loading + load_tokenizer = load_tokenizer and pipeline_class._load_tokenizer + load_feature_extractor = load_feature_extractor and pipeline_class._load_feature_extractor + load_image_processor = load_image_processor and pipeline_class._load_image_processor + load_processor = load_processor and pipeline_class._load_processor + + # If `model` (instance of `PretrainedModel` instead of `str`) is passed (and/or same for config), while + # `image_processor` or 
`feature_extractor` is `None`, the loading will fail. This happens particularly for some + # vision tasks when calling `pipeline()` with `model` and only one of the `image_processor` and `feature_extractor`. + # TODO: we need to make `NO_IMAGE_PROCESSOR_TASKS` and `NO_FEATURE_EXTRACTOR_TASKS` more robust to avoid such issue. + # This block is only temporarily to make CI green. + if load_image_processor and load_feature_extractor: + load_feature_extractor = False + + if ( + tokenizer is None + and not load_tokenizer + and normalized_task not in NO_TOKENIZER_TASKS + # Using class name to avoid importing the real class. + and ( + model_config.__class__.__name__ in MULTI_MODEL_AUDIO_CONFIGS + or model_config.__class__.__name__ in MULTI_MODEL_VISION_CONFIGS + ) + ): + # This is a special category of models, that are fusions of multiple models + # so the model_config might not define a tokenizer, but it seems to be + # necessary for the task, so we're force-trying to load it. + load_tokenizer = True + if ( + image_processor is None + and not load_image_processor + and normalized_task not in NO_IMAGE_PROCESSOR_TASKS + # Using class name to avoid importing the real class. + and model_config.__class__.__name__ in MULTI_MODEL_VISION_CONFIGS + ): + # This is a special category of models, that are fusions of multiple models + # so the model_config might not define a tokenizer, but it seems to be + # necessary for the task, so we're force-trying to load it. + load_image_processor = True + if ( + feature_extractor is None + and not load_feature_extractor + and normalized_task not in NO_FEATURE_EXTRACTOR_TASKS + # Using class name to avoid importing the real class. + and model_config.__class__.__name__ in MULTI_MODEL_AUDIO_CONFIGS + ): + # This is a special category of models, that are fusions of multiple models + # so the model_config might not define a tokenizer, but it seems to be + # necessary for the task, so we're force-trying to load it. 
+ load_feature_extractor = True + + if task in NO_TOKENIZER_TASKS: + # These will never require a tokenizer. + # the model on the other hand might have a tokenizer, but + # the files could be missing from the hub, instead of failing + # on such repos, we just force to not load it. + load_tokenizer = False + + if task in NO_FEATURE_EXTRACTOR_TASKS: + load_feature_extractor = False + if task in NO_IMAGE_PROCESSOR_TASKS: + load_image_processor = False + + if load_tokenizer: + # Try to infer tokenizer from model or config name (if provided as str) + if tokenizer is None: + if isinstance(model_name, str): + tokenizer = model_name + elif isinstance(config, str): + tokenizer = config + else: + # Impossible to guess what is the right tokenizer here + raise Exception( + "Impossible to guess which tokenizer to use. " + "Please provide a PreTrainedTokenizer class or a path/identifier to a pretrained tokenizer." + ) + + # Instantiate tokenizer if needed + if isinstance(tokenizer, (str, tuple)): + if isinstance(tokenizer, tuple): + # For tuple we have (tokenizer name, {kwargs}) + use_fast = tokenizer[1].pop("use_fast", use_fast) + tokenizer_identifier = tokenizer[0] + tokenizer_kwargs = tokenizer[1] + else: + tokenizer_identifier = tokenizer + tokenizer_kwargs = model_kwargs.copy() + tokenizer_kwargs.pop("torch_dtype", None) + + tokenizer = AutoTokenizer.from_pretrained( + tokenizer_identifier, use_fast=use_fast, _from_pipeline=task, **hub_kwargs, **tokenizer_kwargs + ) + + if load_image_processor: + # Try to infer image processor from model or config name (if provided as str) + if image_processor is None: + if isinstance(model_name, str): + image_processor = model_name + elif isinstance(config, str): + image_processor = config + # Backward compatibility, as `feature_extractor` used to be the name + # for `ImageProcessor`. 
+ elif feature_extractor is not None and isinstance(feature_extractor, BaseImageProcessor): + image_processor = feature_extractor + else: + # Impossible to guess what is the right image_processor here + raise Exception( + "Impossible to guess which image processor to use. " + "Please provide a PreTrainedImageProcessor class or a path/identifier " + "to a pretrained image processor." + ) + + # Instantiate image_processor if needed + if isinstance(image_processor, (str, tuple)): + image_processor = AutoImageProcessor.from_pretrained( + image_processor, _from_pipeline=task, **hub_kwargs, **model_kwargs + ) + + if load_feature_extractor: + # Try to infer feature extractor from model or config name (if provided as str) + if feature_extractor is None: + if isinstance(model_name, str): + feature_extractor = model_name + elif isinstance(config, str): + feature_extractor = config + else: + # Impossible to guess what is the right feature_extractor here + raise Exception( + "Impossible to guess which feature extractor to use. " + "Please provide a PreTrainedFeatureExtractor class or a path/identifier " + "to a pretrained feature extractor." 
+ ) + + # Instantiate feature_extractor if needed + if isinstance(feature_extractor, (str, tuple)): + feature_extractor = AutoFeatureExtractor.from_pretrained( + feature_extractor, _from_pipeline=task, **hub_kwargs, **model_kwargs + ) + + if ( + feature_extractor._processor_class + and feature_extractor._processor_class.endswith("WithLM") + and isinstance(model_name, str) + ): + try: + import kenlm # to trigger `ImportError` if not installed + from pyctcdecode import BeamSearchDecoderCTC + + if os.path.isdir(model_name) or os.path.isfile(model_name): + decoder = BeamSearchDecoderCTC.load_from_dir(model_name) + else: + language_model_glob = os.path.join( + BeamSearchDecoderCTC._LANGUAGE_MODEL_SERIALIZED_DIRECTORY, "*" + ) + alphabet_filename = BeamSearchDecoderCTC._ALPHABET_SERIALIZED_FILENAME + allow_patterns = [language_model_glob, alphabet_filename] + decoder = BeamSearchDecoderCTC.load_from_hf_hub(model_name, allow_patterns=allow_patterns) + + kwargs["decoder"] = decoder + except ImportError as e: + logger.warning(f"Could not load the `decoder` for {model_name}. Defaulting to raw CTC. Error: {e}") + if not is_kenlm_available(): + logger.warning("Try to install `kenlm`: `pip install kenlm") + + if not is_pyctcdecode_available(): + logger.warning("Try to install `pyctcdecode`: `pip install pyctcdecode") + + if load_processor: + # Try to infer processor from model or config name (if provided as str) + if processor is None: + if isinstance(model_name, str): + processor = model_name + elif isinstance(config, str): + processor = config + else: + # Impossible to guess what is the right processor here + raise Exception( + "Impossible to guess which processor to use. " + "Please provide a processor instance or a path/identifier " + "to a processor." 
+ ) + + # Instantiate processor if needed + if isinstance(processor, (str, tuple)): + processor = AutoProcessor.from_pretrained(processor, _from_pipeline=task, **hub_kwargs, **model_kwargs) + if not isinstance(processor, ProcessorMixin): + raise TypeError( + "Processor was loaded, but it is not an instance of `ProcessorMixin`. " + f"Got type `{type(processor)}` instead. Please check that you specified " + "correct pipeline task for the model and model has processor implemented and saved." + ) + + if task == "translation" and model.config.task_specific_params: + for key in model.config.task_specific_params: + if key.startswith("translation"): + task = key + warnings.warn( + f'"translation" task was used, instead of "translation_XX_to_YY", defaulting to "{task}"', + UserWarning, + ) + break + + if tokenizer is not None: + kwargs["tokenizer"] = tokenizer + + if feature_extractor is not None: + kwargs["feature_extractor"] = feature_extractor + + if torch_dtype is not None: + kwargs["torch_dtype"] = torch_dtype + + if image_processor is not None: + kwargs["image_processor"] = image_processor + + if device is not None: + kwargs["device"] = device + + if processor is not None: + kwargs["processor"] = processor + + return pipeline_class(model=model, framework=framework, task=task, **kwargs) diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3c13de8d2f8416c40cc05909e6b4dc3a15c66706 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/audio_classification.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/audio_classification.cpython-311.pyc new file mode 100644 
index 0000000000000000000000000000000000000000..78a6def32f2b9c4e5c4f398d1c4caa2317d9c6d0 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/audio_classification.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/audio_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/audio_utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4ecc0f8845fbe145dfeb09070c6be150cc99b895 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/audio_utils.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/automatic_speech_recognition.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/automatic_speech_recognition.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..be7914fb35489ccfd95dee43f879e2436d43095c Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/automatic_speech_recognition.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/base.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/base.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1c254c1421872f69e5ea59af1fb57b4f90999947 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/base.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/depth_estimation.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/depth_estimation.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a952dd38e0ea7d65e8d39e69191a2ca651bd605c Binary files /dev/null and 
b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/depth_estimation.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/document_question_answering.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/document_question_answering.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4e8da27a53928f48b178da6848f65430ebf562cc Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/document_question_answering.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/feature_extraction.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/feature_extraction.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3c8cbfc27f17c71f2992710d1d01fa38a4103e4b Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/feature_extraction.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/fill_mask.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/fill_mask.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d48fdb57cc820d93b00c44aa67d069f79eaef3ce Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/fill_mask.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/image_classification.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/image_classification.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7ca87019d3bea003988f0d743a88fba6c2eb9432 Binary files /dev/null and 
b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/image_classification.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/image_feature_extraction.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/image_feature_extraction.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1f26ad155baf5fad6d5ec5b16ce5cf31a37b4f97 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/image_feature_extraction.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/image_segmentation.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/image_segmentation.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a1597f2d4807c8ea0c7e2890c0a807bcf6394d0c Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/image_segmentation.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/image_text_to_text.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/image_text_to_text.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..55138504bdf575226a181dc4401e207ed1af9d83 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/image_text_to_text.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/image_to_image.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/image_to_image.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7117056ab6f1eed8e2ac85b608865bf1d8d1f381 Binary files /dev/null and 
b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/image_to_image.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/image_to_text.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/image_to_text.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..282a5eab4a5afa2f511bf832c0f5c9f977f9a19e Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/image_to_text.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/mask_generation.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/mask_generation.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d81847e9085c6a700c2ed99df5a5ec3b485ec0fc Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/mask_generation.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/object_detection.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/object_detection.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..20d7177e8f35bc38403669156b96ff28c56cea8e Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/object_detection.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/pt_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/pt_utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..eaa157f04981120aa846f4bff0a97ef0271359b0 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/pt_utils.cpython-311.pyc differ diff --git 
a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/question_answering.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/question_answering.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dadc7dbcbc6d2a1b1143894da48036f00ab71f41 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/question_answering.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/table_question_answering.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/table_question_answering.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6bbb4de92436af95784a53a396dfe1df5c89ed9c Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/table_question_answering.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/text2text_generation.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/text2text_generation.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8b0677e2f2c048d094bf1e3b1888005074e3b9b4 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/text2text_generation.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/text_classification.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/text_classification.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9226af6305246af4ac726bf575396c4ae5d0cbb3 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/text_classification.cpython-311.pyc differ diff --git 
a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/text_generation.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/text_generation.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c55bc926b9fc6b43b404c05ced61caea13ceb21e Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/text_generation.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/text_to_audio.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/text_to_audio.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6d005050223b6a943128b3a79ff044246a046f50 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/text_to_audio.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/token_classification.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/token_classification.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b35ba5e8975996d511e41dc35e813edd4d212992 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/token_classification.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/video_classification.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/video_classification.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f30a7f3b4ab8da22ab900cc036524ac69ddc7689 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/video_classification.cpython-311.pyc differ diff --git 
a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/visual_question_answering.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/visual_question_answering.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e2c83eff32e78865d7f9430829b8a7ae10416e3a Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/visual_question_answering.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/zero_shot_audio_classification.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/zero_shot_audio_classification.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9909e35082363c85c942b355399deb0f0e89e4f1 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/zero_shot_audio_classification.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/zero_shot_classification.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/zero_shot_classification.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8b2661528b79429f9ffd1421f3ba8135ab2c29bc Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/zero_shot_classification.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/zero_shot_image_classification.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/zero_shot_image_classification.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4db3ff00805b9daa790f19fdd193510846fdf908 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/zero_shot_image_classification.cpython-311.pyc 
differ diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/zero_shot_object_detection.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/zero_shot_object_detection.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..bbe4539f856a5eb2d287d9a5556e54e280324f62 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/pipelines/__pycache__/zero_shot_object_detection.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/audio_classification.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/audio_classification.py new file mode 100644 index 0000000000000000000000000000000000000000..f6acbb3096e07d5b467ef4f2dccbda3ce1cd8e51 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/audio_classification.py @@ -0,0 +1,234 @@ +# Copyright 2021 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import subprocess +from typing import Union + +import numpy as np +import requests + +from ..utils import add_end_docstrings, is_torch_available, is_torchaudio_available, logging +from .base import Pipeline, build_pipeline_init_args + + +if is_torch_available(): + from ..models.auto.modeling_auto import MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES + +logger = logging.get_logger(__name__) + + +def ffmpeg_read(bpayload: bytes, sampling_rate: int) -> np.array: + """ + Helper function to read an audio file through ffmpeg. + """ + ar = f"{sampling_rate}" + ac = "1" + format_for_conversion = "f32le" + ffmpeg_command = [ + "ffmpeg", + "-i", + "pipe:0", + "-ac", + ac, + "-ar", + ar, + "-f", + format_for_conversion, + "-hide_banner", + "-loglevel", + "quiet", + "pipe:1", + ] + + try: + ffmpeg_process = subprocess.Popen(ffmpeg_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE) + except FileNotFoundError: + raise ValueError("ffmpeg was not found but is required to load audio files from filename") + output_stream = ffmpeg_process.communicate(bpayload) + out_bytes = output_stream[0] + + audio = np.frombuffer(out_bytes, np.float32) + if audio.shape[0] == 0: + raise ValueError("Malformed soundfile") + return audio + + +@add_end_docstrings(build_pipeline_init_args(has_feature_extractor=True)) +class AudioClassificationPipeline(Pipeline): + """ + Audio classification pipeline using any `AutoModelForAudioClassification`. This pipeline predicts the class of a + raw waveform or an audio file. In case of an audio file, ffmpeg should be installed to support multiple audio + formats. 
+ + Example: + + ```python + >>> from transformers import pipeline + + >>> classifier = pipeline(model="superb/wav2vec2-base-superb-ks") + >>> classifier("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/1.flac") + [{'score': 0.997, 'label': '_unknown_'}, {'score': 0.002, 'label': 'left'}, {'score': 0.0, 'label': 'yes'}, {'score': 0.0, 'label': 'down'}, {'score': 0.0, 'label': 'stop'}] + ``` + + Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial) + + + This pipeline can currently be loaded from [`pipeline`] using the following task identifier: + `"audio-classification"`. + + See the list of available models on + [huggingface.co/models](https://huggingface.co/models?filter=audio-classification). + """ + + def __init__(self, *args, **kwargs): + # Default, might be overriden by the model.config. + kwargs["top_k"] = kwargs.get("top_k", 5) + super().__init__(*args, **kwargs) + + if self.framework != "pt": + raise ValueError(f"The {self.__class__} is only available in PyTorch.") + + self.check_model_type(MODEL_FOR_AUDIO_CLASSIFICATION_MAPPING_NAMES) + + def __call__( + self, + inputs: Union[np.ndarray, bytes, str], + **kwargs, + ): + """ + Classify the sequence(s) given as inputs. See the [`AutomaticSpeechRecognitionPipeline`] documentation for more + information. + + Args: + inputs (`np.ndarray` or `bytes` or `str` or `dict`): + The inputs is either : + - `str` that is the filename of the audio file, the file will be read at the correct sampling rate + to get the waveform using *ffmpeg*. This requires *ffmpeg* to be installed on the system. + - `bytes` it is supposed to be the content of an audio file and is interpreted by *ffmpeg* in the + same way. 
+ - (`np.ndarray` of shape (n, ) of type `np.float32` or `np.float64`) + Raw audio at the correct sampling rate (no further check will be done) + - `dict` form can be used to pass raw audio sampled at arbitrary `sampling_rate` and let this + pipeline do the resampling. The dict must be either be in the format `{"sampling_rate": int, + "raw": np.array}`, or `{"sampling_rate": int, "array": np.array}`, where the key `"raw"` or + `"array"` is used to denote the raw audio waveform. + top_k (`int`, *optional*, defaults to None): + The number of top labels that will be returned by the pipeline. If the provided number is `None` or + higher than the number of labels available in the model configuration, it will default to the number of + labels. + function_to_apply(`str`, *optional*, defaults to "softmax"): + The function to apply to the model output. By default, the pipeline will apply the softmax function to + the output of the model. Valid options: ["softmax", "sigmoid", "none"]. Note that passing Python's + built-in `None` will default to "softmax", so you need to pass the string "none" to disable any + post-processing. + + Return: + A list of `dict` with the following keys: + + - **label** (`str`) -- The label predicted. + - **score** (`float`) -- The corresponding probability. + """ + return super().__call__(inputs, **kwargs) + + def _sanitize_parameters(self, top_k=None, function_to_apply=None, **kwargs): + # No parameters on this pipeline right now + postprocess_params = {} + if top_k is not None: + if top_k > self.model.config.num_labels: + top_k = self.model.config.num_labels + postprocess_params["top_k"] = top_k + if function_to_apply is not None: + if function_to_apply not in ["softmax", "sigmoid", "none"]: + raise ValueError( + f"Invalid value for `function_to_apply`: {function_to_apply}. 
" + "Valid options are ['softmax', 'sigmoid', 'none']" + ) + postprocess_params["function_to_apply"] = function_to_apply + else: + postprocess_params["function_to_apply"] = "softmax" + return {}, {}, postprocess_params + + def preprocess(self, inputs): + if isinstance(inputs, str): + if inputs.startswith("http://") or inputs.startswith("https://"): + # We need to actually check for a real protocol, otherwise it's impossible to use a local file + # like http_huggingface_co.png + inputs = requests.get(inputs).content + else: + with open(inputs, "rb") as f: + inputs = f.read() + + if isinstance(inputs, bytes): + inputs = ffmpeg_read(inputs, self.feature_extractor.sampling_rate) + + if isinstance(inputs, dict): + # Accepting `"array"` which is the key defined in `datasets` for + # better integration + if not ("sampling_rate" in inputs and ("raw" in inputs or "array" in inputs)): + raise ValueError( + "When passing a dictionary to AudioClassificationPipeline, the dict needs to contain a " + '"raw" key containing the numpy array representing the audio and a "sampling_rate" key, ' + "containing the sampling_rate associated with that array" + ) + + _inputs = inputs.pop("raw", None) + if _inputs is None: + # Remove path which will not be used from `datasets`. + inputs.pop("path", None) + _inputs = inputs.pop("array", None) + in_sampling_rate = inputs.pop("sampling_rate") + inputs = _inputs + if in_sampling_rate != self.feature_extractor.sampling_rate: + import torch + + if is_torchaudio_available(): + from torchaudio import functional as F + else: + raise ImportError( + "torchaudio is required to resample audio samples in AudioClassificationPipeline. " + "The torchaudio package can be installed through: `pip install torchaudio`." 
+ ) + + inputs = F.resample( + torch.from_numpy(inputs), in_sampling_rate, self.feature_extractor.sampling_rate + ).numpy() + + if not isinstance(inputs, np.ndarray): + raise TypeError("We expect a numpy ndarray as input") + if len(inputs.shape) != 1: + raise ValueError("We expect a single channel audio input for AudioClassificationPipeline") + + processed = self.feature_extractor( + inputs, sampling_rate=self.feature_extractor.sampling_rate, return_tensors="pt" + ) + return processed + + def _forward(self, model_inputs): + model_outputs = self.model(**model_inputs) + return model_outputs + + def postprocess(self, model_outputs, top_k=5, function_to_apply="softmax"): + if function_to_apply == "softmax": + probs = model_outputs.logits[0].softmax(-1) + elif function_to_apply == "sigmoid": + probs = model_outputs.logits[0].sigmoid() + else: + probs = model_outputs.logits[0] + scores, ids = probs.topk(top_k) + + scores = scores.tolist() + ids = ids.tolist() + + labels = [{"score": score, "label": self.model.config.id2label[_id]} for score, _id in zip(scores, ids)] + + return labels diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/audio_utils.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/audio_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..72a5f51db6129ae46939a5f2d640d286f479749f --- /dev/null +++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/audio_utils.py @@ -0,0 +1,297 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +import datetime +import platform +import subprocess +from typing import Optional, Tuple, Union + +import numpy as np + + +def ffmpeg_read(bpayload: bytes, sampling_rate: int) -> np.array: + """ + Helper function to read an audio file through ffmpeg. 
+ """ + ar = f"{sampling_rate}" + ac = "1" + format_for_conversion = "f32le" + ffmpeg_command = [ + "ffmpeg", + "-i", + "pipe:0", + "-ac", + ac, + "-ar", + ar, + "-f", + format_for_conversion, + "-hide_banner", + "-loglevel", + "quiet", + "pipe:1", + ] + + try: + with subprocess.Popen(ffmpeg_command, stdin=subprocess.PIPE, stdout=subprocess.PIPE) as ffmpeg_process: + output_stream = ffmpeg_process.communicate(bpayload) + except FileNotFoundError as error: + raise ValueError("ffmpeg was not found but is required to load audio files from filename") from error + out_bytes = output_stream[0] + audio = np.frombuffer(out_bytes, np.float32) + if audio.shape[0] == 0: + raise ValueError( + "Soundfile is either not in the correct format or is malformed. Ensure that the soundfile has " + "a valid audio file extension (e.g. wav, flac or mp3) and is not corrupted. If reading from a remote " + "URL, ensure that the URL is the full address to **download** the audio file." + ) + return audio + + +def ffmpeg_microphone( + sampling_rate: int, + chunk_length_s: float, + format_for_conversion: str = "f32le", + ffmpeg_input_device: Optional[str] = None, + ffmpeg_additional_args: Optional[list[str]] = None, +): + """ + Helper function to read audio from a microphone using ffmpeg. The default input device will be used unless another + input device is specified using the `ffmpeg_input_device` argument. Uses 'alsa' on Linux, 'avfoundation' on MacOS and + 'dshow' on Windows. + + Arguments: + sampling_rate (`int`): + The sampling_rate to use when reading the data from the microphone. Try using the model's sampling_rate to + avoid resampling later. + chunk_length_s (`float` or `int`): + The length of the maximum chunk of audio to be sent returned. + format_for_conversion (`str`, defaults to `f32le`): + The name of the format of the audio samples to be returned by ffmpeg. The standard is `f32le`, `s16le` + could also be used. 
+ ffmpeg_input_device (`str`, *optional*): + The identifier of the input device to be used by ffmpeg (i.e. ffmpeg's '-i' argument). If unset, + the default input device will be used. See `https://www.ffmpeg.org/ffmpeg-devices.html#Input-Devices` + for how to specify and list input devices. + ffmpeg_additional_args (`list[str]`, *optional*): + Additional arguments to pass to ffmpeg, can include arguments like -nostdin for running as a background + process. For example, to pass -nostdin to the ffmpeg process, pass in ["-nostdin"]. If passing in flags + with multiple arguments, use the following convention (eg ["flag", "arg1", "arg2]). + + Returns: + A generator yielding audio chunks of `chunk_length_s` seconds as `bytes` objects of length + `int(round(sampling_rate * chunk_length_s)) * size_of_sample`. + """ + ar = f"{sampling_rate}" + ac = "1" + if format_for_conversion == "s16le": + size_of_sample = 2 + elif format_for_conversion == "f32le": + size_of_sample = 4 + else: + raise ValueError(f"Unhandled format `{format_for_conversion}`. 
Please use `s16le` or `f32le`") + + system = platform.system() + + if system == "Linux": + format_ = "alsa" + input_ = ffmpeg_input_device or "default" + elif system == "Darwin": + format_ = "avfoundation" + input_ = ffmpeg_input_device or ":default" + elif system == "Windows": + format_ = "dshow" + input_ = ffmpeg_input_device or _get_microphone_name() + + ffmpeg_additional_args = [] if ffmpeg_additional_args is None else ffmpeg_additional_args + + ffmpeg_command = [ + "ffmpeg", + "-f", + format_, + "-i", + input_, + "-ac", + ac, + "-ar", + ar, + "-f", + format_for_conversion, + "-fflags", + "nobuffer", + "-hide_banner", + "-loglevel", + "quiet", + "pipe:1", + ] + + ffmpeg_command.extend(ffmpeg_additional_args) + + chunk_len = int(round(sampling_rate * chunk_length_s)) * size_of_sample + iterator = _ffmpeg_stream(ffmpeg_command, chunk_len) + for item in iterator: + yield item + + +def ffmpeg_microphone_live( + sampling_rate: int, + chunk_length_s: float, + stream_chunk_s: Optional[int] = None, + stride_length_s: Optional[Union[Tuple[float, float], float]] = None, + format_for_conversion: str = "f32le", + ffmpeg_input_device: Optional[str] = None, + ffmpeg_additional_args: Optional[list[str]] = None, +): + """ + Helper function to read audio from a microphone using ffmpeg. This will output `partial` overlapping chunks starting + from `stream_chunk_s` (if it is defined) until `chunk_length_s` is reached. It will make use of striding to avoid + errors on the "sides" of the various chunks. The default input device will be used unless another input device is + specified using the `ffmpeg_input_device` argument. Uses 'alsa' on Linux, 'avfoundation' on MacOS and 'dshow' on Windows. + + Arguments: + sampling_rate (`int`): + The sampling_rate to use when reading the data from the microphone. Try using the model's sampling_rate to + avoid resampling later. + chunk_length_s (`float` or `int`): + The length of the maximum chunk of audio to be sent returned. 
This includes the eventual striding. + stream_chunk_s (`float` or `int`): + The length of the minimal temporary audio to be returned. + stride_length_s (`float` or `int` or `(float, float)`, *optional*): + The length of the striding to be used. Stride is used to provide context to a model on the (left, right) of + an audio sample but without using that part to actually make the prediction. Setting this does not change + the length of the chunk. + format_for_conversion (`str`, *optional*, defaults to `f32le`): + The name of the format of the audio samples to be returned by ffmpeg. The standard is `f32le`, `s16le` + could also be used. + ffmpeg_input_device (`str`, *optional*): + The identifier of the input device to be used by ffmpeg (i.e. ffmpeg's '-i' argument). If unset, + the default input device will be used. See `https://www.ffmpeg.org/ffmpeg-devices.html#Input-Devices` + for how to specify and list input devices. + ffmpeg_additional_args (`list[str]`, *optional*): + Additional arguments to pass to ffmpeg, can include arguments like -nostdin for running as a background + process. For example, to pass -nostdin to the ffmpeg process, pass in ["-nostdin"]. If passing in flags + with multiple arguments, use the following convention (eg ["flag", "arg1", "arg2]). + + Return: + A generator yielding dictionaries of the following form + + `{"sampling_rate": int, "raw": np.array(), "partial" bool}` With optionally a `"stride" (int, int)` key if + `stride_length_s` is defined. + + `stride` and `raw` are all expressed in `samples`, and `partial` is a boolean saying if the current yield item + is a whole chunk, or a partial temporary result to be later replaced by another larger chunk. 
+ """ + if stream_chunk_s is not None: + chunk_s = stream_chunk_s + else: + chunk_s = chunk_length_s + + microphone = ffmpeg_microphone( + sampling_rate, + chunk_s, + format_for_conversion=format_for_conversion, + ffmpeg_input_device=ffmpeg_input_device, + ffmpeg_additional_args=[] if ffmpeg_additional_args is None else ffmpeg_additional_args, + ) + + if format_for_conversion == "s16le": + dtype = np.int16 + size_of_sample = 2 + elif format_for_conversion == "f32le": + dtype = np.float32 + size_of_sample = 4 + else: + raise ValueError(f"Unhandled format `{format_for_conversion}`. Please use `s16le` or `f32le`") + + if stride_length_s is None: + stride_length_s = chunk_length_s / 6 + chunk_len = int(round(sampling_rate * chunk_length_s)) * size_of_sample + if isinstance(stride_length_s, (int, float)): + stride_length_s = [stride_length_s, stride_length_s] + + stride_left = int(round(sampling_rate * stride_length_s[0])) * size_of_sample + stride_right = int(round(sampling_rate * stride_length_s[1])) * size_of_sample + audio_time = datetime.datetime.now() + delta = datetime.timedelta(seconds=chunk_s) + for item in chunk_bytes_iter(microphone, chunk_len, stride=(stride_left, stride_right), stream=True): + # Put everything back in numpy scale + item["raw"] = np.frombuffer(item["raw"], dtype=dtype) + item["stride"] = ( + item["stride"][0] // size_of_sample, + item["stride"][1] // size_of_sample, + ) + item["sampling_rate"] = sampling_rate + audio_time += delta + if datetime.datetime.now() > audio_time + 10 * delta: + # We're late !! SKIP + continue + yield item + + +def chunk_bytes_iter(iterator, chunk_len: int, stride: Tuple[int, int], stream: bool = False): + """ + Reads raw bytes from an iterator and does chunks of length `chunk_len`. Optionally adds `stride` to each chunks to + get overlaps. `stream` is used to return partial results even if a full `chunk_len` is not yet available. 
+ """ + acc = b"" + stride_left, stride_right = stride + if stride_left + stride_right >= chunk_len: + raise ValueError( + f"Stride needs to be strictly smaller than chunk_len: ({stride_left}, {stride_right}) vs {chunk_len}" + ) + _stride_left = 0 + for raw in iterator: + acc += raw + if stream and len(acc) < chunk_len: + stride = (_stride_left, 0) + yield {"raw": acc[:chunk_len], "stride": stride, "partial": True} + else: + while len(acc) >= chunk_len: + # We are flushing the accumulator + stride = (_stride_left, stride_right) + item = {"raw": acc[:chunk_len], "stride": stride} + if stream: + item["partial"] = False + yield item + _stride_left = stride_left + acc = acc[chunk_len - stride_left - stride_right :] + # Last chunk + if len(acc) > stride_left: + item = {"raw": acc, "stride": (_stride_left, 0)} + if stream: + item["partial"] = False + yield item + + +def _ffmpeg_stream(ffmpeg_command, buflen: int): + """ + Internal function to create the generator of data through ffmpeg + """ + bufsize = 2**24 # 16Mo + try: + with subprocess.Popen(ffmpeg_command, stdout=subprocess.PIPE, bufsize=bufsize) as ffmpeg_process: + while True: + raw = ffmpeg_process.stdout.read(buflen) + if raw == b"": + break + yield raw + except FileNotFoundError as error: + raise ValueError("ffmpeg was not found but is required to stream audio files from filename") from error + + +def _get_microphone_name(): + """ + Retrieve the microphone name in Windows . + """ + command = ["ffmpeg", "-list_devices", "true", "-f", "dshow", "-i", ""] + + try: + ffmpeg_devices = subprocess.run(command, text=True, stderr=subprocess.PIPE, encoding="utf-8") + microphone_lines = [line for line in ffmpeg_devices.stderr.splitlines() if "(audio)" in line] + + if microphone_lines: + microphone_name = microphone_lines[0].split('"')[1] + print(f"Using microphone: {microphone_name}") + return f"audio={microphone_name}" + except FileNotFoundError: + print("ffmpeg was not found. 
Please install it or make sure it is in your system PATH.") + + return "default" diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/automatic_speech_recognition.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/automatic_speech_recognition.py new file mode 100644 index 0000000000000000000000000000000000000000..66a9c49ea5f3516053fa9d5835109dcd53e3ff1a --- /dev/null +++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/automatic_speech_recognition.py @@ -0,0 +1,766 @@ +# Copyright 2021 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import warnings +from collections import defaultdict +from typing import TYPE_CHECKING, Dict, Optional, Union + +import numpy as np +import requests + +from ..tokenization_utils import PreTrainedTokenizer +from ..utils import is_torch_available, is_torchaudio_available, logging +from .audio_utils import ffmpeg_read +from .base import ChunkPipeline + + +if TYPE_CHECKING: + from pyctcdecode import BeamSearchDecoderCTC + + from ..feature_extraction_sequence_utils import SequenceFeatureExtractor + from ..modeling_utils import PreTrainedModel + +logger = logging.get_logger(__name__) + +if is_torch_available(): + import torch + + from ..models.auto.modeling_auto import MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES + + +def rescale_stride(stride, ratio): + """ + Rescales the stride values from audio space to tokens/logits space. 
+ + (160_000, 16_000, 16_000) -> (2000, 200, 200) for instance. + """ + # Shape is [B, SEQ] for tokens + # [B, SEQ, V] for logits + + new_strides = [] + for input_n, left, right in stride: + token_n = int(round(input_n * ratio)) + left = int(round(left / input_n * token_n)) + right = int(round(right / input_n * token_n)) + new_stride = (token_n, left, right) + new_strides.append(new_stride) + + return new_strides + + +def chunk_iter(inputs, feature_extractor, chunk_len, stride_left, stride_right, dtype=None): + inputs_len = inputs.shape[0] + step = chunk_len - stride_left - stride_right + for chunk_start_idx in range(0, inputs_len, step): + chunk_end_idx = chunk_start_idx + chunk_len + chunk = inputs[chunk_start_idx:chunk_end_idx] + processed = feature_extractor(chunk, sampling_rate=feature_extractor.sampling_rate, return_tensors="pt") + if dtype is not None: + processed = processed.to(dtype=dtype) + _stride_left = 0 if chunk_start_idx == 0 else stride_left + is_last = chunk_end_idx >= inputs_len + _stride_right = 0 if is_last else stride_right + + chunk_len = chunk.shape[0] + stride = (chunk_len, _stride_left, _stride_right) + if chunk.shape[0] > _stride_left: + yield {"is_last": is_last, "stride": stride, **processed} + if is_last: + break + + +def _fast_find_longest_common_sequence(sequence_left, sequence_right): + seq_len_left = len(sequence_left) + seq_len_right = len(sequence_right) + counter = [[0] * (seq_len_right + 1) for _ in range(seq_len_left + 1)] + longest = 0 + for i in range(seq_len_left): + for j in range(seq_len_right): + if sequence_left[i] == sequence_right[j]: + previous_counter = counter[i][j] + 1 + counter[i + 1][j + 1] = previous_counter + if previous_counter > longest: + longest = previous_counter + + counter = np.array(counter) + # we return the idx of the first element of the longest common sequence in the left sequence + index_left = np.argwhere(counter == longest)[-1][0] - longest if longest != 0 else -1 + index_right = 
np.argwhere(counter == longest)[-1][1] - longest if longest != 0 else -1 + return index_left, index_right, longest + + +def _find_longest_common_sequence(sequences, tokenizer): + # TODO Use a faster algorithm this can probably be done in O(n) + # using suffix array. + # It might be tedious to do because of fault tolerance. + # We actually have a really good property which is that the total sequence + # MUST be those subsequences in order. + # Also the algorithm should be more tolerant to errors. + sequence = [tok_id for tok_id in sequences[0][0].tolist() if tok_id not in tokenizer.all_special_ids] + for new_seq in sequences[1:]: + new_sequence = [tok_id for tok_id in new_seq[0].tolist() if tok_id not in tokenizer.all_special_ids] + + index = 0 + max_ = 0.0 + for i in range(1, len(new_sequence) + 1): + # epsilon to favor long perfect matches + eps = i / 10000.0 + matches = np.sum(np.array(sequence[-i:]) == np.array(new_sequence[:i])) + matching = matches / i + eps + if matches > 1 and matching > max_: + index = i + max_ = matching + sequence.extend(new_sequence[index:]) + return np.array(sequence) + + +class AutomaticSpeechRecognitionPipeline(ChunkPipeline): + """ + Pipeline that aims at extracting spoken text contained within some audio. + + The input can be either a raw waveform or a audio file. 
In case of the audio file, ffmpeg should be installed for + to support multiple audio formats + + Example: + + ```python + >>> from transformers import pipeline + + >>> transcriber = pipeline(model="openai/whisper-base") + >>> transcriber("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/1.flac") + {'text': ' He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered flour-fatten sauce.'} + ``` + + Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial) + + Arguments: + model ([`PreTrainedModel`] or [`TFPreTrainedModel`]): + The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from + [`PreTrainedModel`] for PyTorch and [`TFPreTrainedModel`] for TensorFlow. + feature_extractor ([`SequenceFeatureExtractor`]): + The feature extractor that will be used by the pipeline to encode waveform for the model. + tokenizer ([`PreTrainedTokenizer`]): + The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from + [`PreTrainedTokenizer`]. + decoder (`pyctcdecode.BeamSearchDecoderCTC`, *optional*): + [PyCTCDecode's + BeamSearchDecoderCTC](https://github.com/kensho-technologies/pyctcdecode/blob/2fd33dc37c4111417e08d89ccd23d28e9b308d19/pyctcdecode/decoder.py#L180) + can be passed for language model boosted decoding. See [`Wav2Vec2ProcessorWithLM`] for more information. + chunk_length_s (`float`, *optional*, defaults to 0): + The input length for in each chunk. If `chunk_length_s = 0` then chunking is disabled (default). + + + + For more information on how to effectively use `chunk_length_s`, please have a look at the [ASR chunking + blog post](https://huggingface.co/blog/asr-chunking). + + + + stride_length_s (`float`, *optional*, defaults to `chunk_length_s / 6`): + The length of stride on the left and right of each chunk. Used only with `chunk_length_s > 0`. 
This enables + the model to *see* more context and infer letters better than without this context but the pipeline + discards the stride bits at the end to make the final reconstitution as perfect as possible. + + + + For more information on how to effectively use `stride_length_s`, please have a look at the [ASR chunking + blog post](https://huggingface.co/blog/asr-chunking). + + + + framework (`str`, *optional*): + The framework to use, either `"pt"` for PyTorch or `"tf"` for TensorFlow. The specified framework must be + installed. If no framework is specified, will default to the one currently installed. If no framework is + specified and both frameworks are installed, will default to the framework of the `model`, or to PyTorch if + no model is provided. + device (Union[`int`, `torch.device`], *optional*): + Device ordinal for CPU/GPU supports. Setting this to `None` will leverage CPU, a positive will run the + model on the associated CUDA device id. + torch_dtype (Union[`int`, `torch.dtype`], *optional*): + The data-type (dtype) of the computation. Setting this to `None` will use float32 precision. Set to + `torch.float16` or `torch.bfloat16` to use half-precision in the respective dtypes. 
+ + """ + + def __init__( + self, + model: "PreTrainedModel", + feature_extractor: Union["SequenceFeatureExtractor", str] = None, + tokenizer: Optional[PreTrainedTokenizer] = None, + decoder: Optional[Union["BeamSearchDecoderCTC", str]] = None, + device: Union[int, "torch.device"] = None, + torch_dtype: Optional[Union[str, "torch.dtype"]] = None, + **kwargs, + ): + # set the model type so we can check we have the right pre- and post-processing parameters + if model.config.model_type == "whisper": + self.type = "seq2seq_whisper" + elif model.__class__.__name__ in MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES.values(): + self.type = "seq2seq" + elif ( + feature_extractor._processor_class + and feature_extractor._processor_class.endswith("WithLM") + and decoder is not None + ): + self.decoder = decoder + self.type = "ctc_with_lm" + else: + self.type = "ctc" + + super().__init__(model, tokenizer, feature_extractor, device=device, torch_dtype=torch_dtype, **kwargs) + + def __call__( + self, + inputs: Union[np.ndarray, bytes, str], + **kwargs, + ): + """ + Transcribe the audio sequence(s) given as inputs to text. See the [`AutomaticSpeechRecognitionPipeline`] + documentation for more information. + + Args: + inputs (`np.ndarray` or `bytes` or `str` or `dict`): + The inputs is either : + - `str` that is either the filename of a local audio file, or a public URL address to download the + audio file. The file will be read at the correct sampling rate to get the waveform using + *ffmpeg*. This requires *ffmpeg* to be installed on the system. + - `bytes` it is supposed to be the content of an audio file and is interpreted by *ffmpeg* in the + same way. + - (`np.ndarray` of shape (n, ) of type `np.float32` or `np.float64`) + Raw audio at the correct sampling rate (no further check will be done) + - `dict` form can be used to pass raw audio sampled at arbitrary `sampling_rate` and let this + pipeline do the resampling. 
The dict must be in the format `{"sampling_rate": int, "raw": + np.array}` with optionally a `"stride": (left: int, right: int)` than can ask the pipeline to + treat the first `left` samples and last `right` samples to be ignored in decoding (but used at + inference to provide more context to the model). Only use `stride` with CTC models. + return_timestamps (*optional*, `str` or `bool`): + Only available for pure CTC models (Wav2Vec2, HuBERT, etc) and the Whisper model. Not available for + other sequence-to-sequence models. + + For CTC models, timestamps can take one of two formats: + - `"char"`: the pipeline will return timestamps along the text for every character in the text. For + instance, if you get `[{"text": "h", "timestamp": (0.5, 0.6)}, {"text": "i", "timestamp": (0.7, + 0.9)}]`, then it means the model predicts that the letter "h" was spoken after `0.5` and before + `0.6` seconds. + - `"word"`: the pipeline will return timestamps along the text for every word in the text. For + instance, if you get `[{"text": "hi ", "timestamp": (0.5, 0.9)}, {"text": "there", "timestamp": + (1.0, 1.5)}]`, then it means the model predicts that the word "hi" was spoken after `0.5` and + before `0.9` seconds. + + For the Whisper model, timestamps can take one of two formats: + - `"word"`: same as above for word-level CTC timestamps. Word-level timestamps are predicted + through the *dynamic-time warping (DTW)* algorithm, an approximation to word-level timestamps + by inspecting the cross-attention weights. + - `True`: the pipeline will return timestamps along the text for *segments* of words in the text. + For instance, if you get `[{"text": " Hi there!", "timestamp": (0.5, 1.5)}]`, then it means the + model predicts that the segment "Hi there!" was spoken after `0.5` and before `1.5` seconds. + Note that a segment of text refers to a sequence of one or more words, rather than individual + words as with word-level timestamps. 
    def _sanitize_parameters(
        self,
        chunk_length_s=None,
        stride_length_s=None,
        ignore_warning=None,
        decoder_kwargs=None,
        return_timestamps=None,
        return_language=None,
        generate_kwargs=None,
        max_new_tokens=None,
    ):
        """
        Split the pipeline call kwargs into (preprocess, forward, postprocess) parameter dicts.

        Also validates `return_timestamps` against the pipeline flavour (`self.type`) so
        misconfiguration fails before any forward pass, and handles the deprecated
        `max_new_tokens` argument.
        """
        # No parameters on this pipeline right now
        preprocess_params = {}
        if chunk_length_s is not None:
            if self.type == "seq2seq" and not ignore_warning:
                logger.warning(
                    "Using `chunk_length_s` is very experimental with seq2seq models. The results will not necessarily"
                    " be entirely accurate and will have caveats. More information:"
                    " https://github.com/huggingface/transformers/pull/20104. Ignore this warning with pipeline(...,"
                    " ignore_warning=True)"
                )
            preprocess_params["chunk_length_s"] = chunk_length_s
        if stride_length_s is not None:
            preprocess_params["stride_length_s"] = stride_length_s

        # defaultdict(dict) so nested keys can be filled without pre-initialising them.
        forward_params = defaultdict(dict)
        if max_new_tokens is not None:
            warnings.warn(
                "`max_new_tokens` is deprecated and will be removed in version 4.49 of Transformers. To remove this warning, pass `max_new_tokens` as a key inside `generate_kwargs` instead.",
                FutureWarning,
            )
            forward_params["max_new_tokens"] = max_new_tokens
        if generate_kwargs is not None:
            if max_new_tokens is not None and "max_new_tokens" in generate_kwargs:
                # Reject the ambiguous double specification instead of silently picking one.
                raise ValueError(
                    "`max_new_tokens` is defined both as an argument and inside `generate_kwargs` argument, please use"
                    " only 1 version"
                )
            forward_params.update(generate_kwargs)

        postprocess_params = {}
        if decoder_kwargs is not None:
            postprocess_params["decoder_kwargs"] = decoder_kwargs
        if return_timestamps is not None:
            # Check whether we have a valid setting for return_timestamps and throw an error before we perform a forward pass
            if self.type == "seq2seq" and return_timestamps:
                raise ValueError("We cannot return_timestamps yet on non-CTC models apart from Whisper!")
            if self.type == "ctc_with_lm" and return_timestamps != "word":
                raise ValueError("CTC with LM can only predict word level timestamps, set `return_timestamps='word'`")
            if self.type == "ctc" and return_timestamps not in ["char", "word"]:
                raise ValueError(
                    "CTC can either predict character level timestamps, or word level timestamps. "
                    "Set `return_timestamps='char'` or `return_timestamps='word'` as required."
                )
            if self.type == "seq2seq_whisper" and return_timestamps == "char":
                raise ValueError(
                    "Whisper cannot return `char` timestamps, only word level or segment level timestamps. "
                    "Use `return_timestamps='word'` or `return_timestamps=True` respectively."
                )
            # Timestamps influence both generation and decoding, so both stages get the flag.
            forward_params["return_timestamps"] = return_timestamps
            postprocess_params["return_timestamps"] = return_timestamps
        if return_language is not None:
            if self.type != "seq2seq_whisper":
                raise ValueError("Only Whisper can return language for now.")
            postprocess_params["return_language"] = return_language

        # NOTE(review): `assistant_model`/`assistant_tokenizer` are presumably set by the
        # base Pipeline for assisted generation — confirm against the base class.
        if self.assistant_model is not None:
            forward_params["assistant_model"] = self.assistant_model
        if self.assistant_tokenizer is not None:
            forward_params["tokenizer"] = self.tokenizer
            forward_params["assistant_tokenizer"] = self.assistant_tokenizer

        return preprocess_params, forward_params, postprocess_params
    def preprocess(self, inputs, chunk_length_s=0, stride_length_s=None):
        """
        Normalize the various accepted input forms (URL, path, bytes, ndarray, dict) into
        feature-extractor inputs, yielding one or more dicts of model inputs.

        With `chunk_length_s` set, the audio is split into overlapping chunks via
        `chunk_iter` (streaming); otherwise a single dict with `"is_last": True` is yielded.
        """
        if isinstance(inputs, str):
            if inputs.startswith("http://") or inputs.startswith("https://"):
                # We need to actually check for a real protocol, otherwise it's impossible to use a local file
                # like http_huggingface_co.png
                inputs = requests.get(inputs).content
            else:
                with open(inputs, "rb") as f:
                    inputs = f.read()

        if isinstance(inputs, bytes):
            # Delegate decoding of arbitrary audio containers to ffmpeg.
            inputs = ffmpeg_read(inputs, self.feature_extractor.sampling_rate)

        stride = None
        extra = {}
        if isinstance(inputs, dict):
            stride = inputs.pop("stride", None)
            # Accepting `"array"` which is the key defined in `datasets` for
            # better integration
            if not ("sampling_rate" in inputs and ("raw" in inputs or "array" in inputs)):
                raise ValueError(
                    "When passing a dictionary to AutomaticSpeechRecognitionPipeline, the dict needs to contain a "
                    '"raw" key containing the numpy array representing the audio and a "sampling_rate" key, '
                    "containing the sampling_rate associated with that array"
                )

            _inputs = inputs.pop("raw", None)
            if _inputs is None:
                # Remove path which will not be used from `datasets`.
                inputs.pop("path", None)
                _inputs = inputs.pop("array", None)
            in_sampling_rate = inputs.pop("sampling_rate")
            # Whatever remains in the dict is forwarded untouched through the pipeline.
            extra = inputs
            inputs = _inputs
            if in_sampling_rate != self.feature_extractor.sampling_rate:
                if is_torchaudio_available():
                    from torchaudio import functional as F
                else:
                    raise ImportError(
                        "torchaudio is required to resample audio samples in AutomaticSpeechRecognitionPipeline. "
                        "The torchaudio package can be installed through: `pip install torchaudio`."
                    )

                inputs = F.resample(
                    torch.from_numpy(inputs), in_sampling_rate, self.feature_extractor.sampling_rate
                ).numpy()
                # Remember the resampling factor so user-provided strides can be rescaled below.
                ratio = self.feature_extractor.sampling_rate / in_sampling_rate
            else:
                ratio = 1
            if stride is not None:
                if stride[0] + stride[1] > inputs.shape[0]:
                    raise ValueError("Stride is too large for input")

                # Stride needs to get the chunk length here, it's going to get
                # swallowed by the `feature_extractor` later, and then batching
                # can add extra data in the inputs, so we need to keep track
                # of the original length in the stride so we can cut properly.
                stride = (inputs.shape[0], int(round(stride[0] * ratio)), int(round(stride[1] * ratio)))
        if not isinstance(inputs, np.ndarray):
            raise TypeError(f"We expect a numpy ndarray as input, got `{type(inputs)}`")
        if len(inputs.shape) != 1:
            raise ValueError("We expect a single channel audio input for AutomaticSpeechRecognitionPipeline")

        if chunk_length_s:
            if stride_length_s is None:
                stride_length_s = chunk_length_s / 6

            if isinstance(stride_length_s, (int, float)):
                stride_length_s = [stride_length_s, stride_length_s]

            # XXX: Carefully, this variable will not exist in `seq2seq` setting.
            # Currently chunking is not possible at this level for `seq2seq` so
            # it's ok.
            align_to = getattr(self.model.config, "inputs_to_logits_ratio", 1)
            # Round chunk/stride sizes to multiples of `align_to` so logits align with samples.
            chunk_len = int(round(chunk_length_s * self.feature_extractor.sampling_rate / align_to) * align_to)
            stride_left = int(round(stride_length_s[0] * self.feature_extractor.sampling_rate / align_to) * align_to)
            stride_right = int(round(stride_length_s[1] * self.feature_extractor.sampling_rate / align_to) * align_to)

            if chunk_len < stride_left + stride_right:
                raise ValueError("Chunk length must be superior to stride length")

            for item in chunk_iter(
                inputs, self.feature_extractor, chunk_len, stride_left, stride_right, self.torch_dtype
            ):
                yield {**item, **extra}
        else:
            if self.type == "seq2seq_whisper" and inputs.shape[0] > self.feature_extractor.n_samples:
                # Audio longer than Whisper's window: keep everything and let long-form
                # generation handle it (no truncation, longest padding).
                processed = self.feature_extractor(
                    inputs,
                    sampling_rate=self.feature_extractor.sampling_rate,
                    truncation=False,
                    padding="longest",
                    return_tensors="pt",
                    return_attention_mask=True,
                )
            else:
                if self.type == "seq2seq_whisper" and stride is None:
                    processed = self.feature_extractor(
                        inputs,
                        sampling_rate=self.feature_extractor.sampling_rate,
                        return_tensors="pt",
                        return_token_timestamps=True,
                        return_attention_mask=True,
                    )
                    # num_frames is needed later by generate() for word-level timestamps.
                    extra["num_frames"] = processed.pop("num_frames")
                else:
                    processed = self.feature_extractor(
                        inputs,
                        sampling_rate=self.feature_extractor.sampling_rate,
                        return_tensors="pt",
                        return_attention_mask=True,
                    )
            if self.torch_dtype is not None:
                processed = processed.to(dtype=self.torch_dtype)
            if stride is not None:
                if self.type == "seq2seq":
                    raise ValueError("Stride is only usable with CTC models, try removing it !")

                processed["stride"] = stride
            yield {"is_last": True, **processed, **extra}
    def _forward(self, model_inputs, return_timestamps=False, **generate_kwargs):
        """
        Run one forward pass: `model.generate` for seq2seq/Whisper flavours, a plain model
        call (logits) for CTC flavours. Returns a dict carrying tokens or logits plus any
        stride/extra information `postprocess` needs.
        """
        attention_mask = model_inputs.pop("attention_mask", None)
        stride = model_inputs.pop("stride", None)
        num_frames = model_inputs.pop("num_frames", None)
        is_last = model_inputs.pop("is_last")

        if stride is not None and num_frames is not None:
            raise ValueError("num_frames must be used only when stride is None")

        if self.type in {"seq2seq", "seq2seq_whisper"}:
            # Consume values so we can let extra information flow freely through
            # the pipeline (important for `partial` in microphone)
            if "input_features" in model_inputs:
                inputs = model_inputs.pop("input_features")
            elif "input_values" in model_inputs:
                inputs = model_inputs.pop("input_values")
            else:
                raise ValueError(
                    "Seq2Seq speech recognition model requires either a "
                    f"`input_features` or `input_values` key, but only has {model_inputs.keys()}"
                )

            # custom processing for Whisper timestamps and word-level timestamps
            if return_timestamps and self.type == "seq2seq_whisper":
                generate_kwargs["return_timestamps"] = return_timestamps
                if return_timestamps == "word":
                    generate_kwargs["return_token_timestamps"] = True
                    generate_kwargs["return_segments"] = True

                    if stride is not None:
                        # Convert sample counts to frame counts for generate's DTW alignment.
                        if isinstance(stride, tuple):
                            generate_kwargs["num_frames"] = stride[0] // self.feature_extractor.hop_length
                        else:
                            generate_kwargs["num_frames"] = [s[0] // self.feature_extractor.hop_length for s in stride]
                    else:
                        generate_kwargs["num_frames"] = num_frames

            # User-defined `generation_config` passed to the pipeline call take precedence
            if "generation_config" not in generate_kwargs:
                generate_kwargs["generation_config"] = self.generation_config

            tokens = self.model.generate(
                inputs=inputs,
                attention_mask=attention_mask,
                **generate_kwargs,
            )
            # whisper longform generation stores timestamps in "segments"
            if return_timestamps == "word" and self.type == "seq2seq_whisper":
                if "segments" not in tokens:
                    out = {"tokens": tokens["sequences"], "token_timestamps": tokens["token_timestamps"]}
                else:
                    # Flatten each batch element's per-segment timestamps into one tensor.
                    token_timestamps = [
                        torch.cat([segment["token_timestamps"] for segment in segment_list])
                        for segment_list in tokens["segments"]
                    ]
                    out = {"tokens": tokens["sequences"], "token_timestamps": token_timestamps}
            else:
                out = {"tokens": tokens}
            if self.type == "seq2seq_whisper":
                if stride is not None:
                    out["stride"] = stride

        else:
            # CTC path: a single forward pass produces per-frame logits.
            inputs = {
                self.model.main_input_name: model_inputs.pop(self.model.main_input_name),
                "attention_mask": attention_mask,
            }
            outputs = self.model(**inputs)
            logits = outputs.logits

            if self.type == "ctc_with_lm":
                # The external LM decoder needs raw logits, not argmax tokens.
                out = {"logits": logits}
            else:
                out = {"tokens": logits.argmax(dim=-1)}
            if stride is not None:
                # Send stride to `postprocess`.
                # it needs to be handled there where
                # the pieces are to be concatenated.
                ratio = 1 / self.model.config.inputs_to_logits_ratio
                if isinstance(stride, tuple):
                    out["stride"] = rescale_stride([stride], ratio)[0]
                else:
                    out["stride"] = rescale_stride(stride, ratio)
        # Leftover
        extra = model_inputs
        return {"is_last": is_last, **out, **extra}
{"tokens": tokens["sequences"], "token_timestamps": token_timestamps} + else: + out = {"tokens": tokens} + if self.type == "seq2seq_whisper": + if stride is not None: + out["stride"] = stride + + else: + inputs = { + self.model.main_input_name: model_inputs.pop(self.model.main_input_name), + "attention_mask": attention_mask, + } + outputs = self.model(**inputs) + logits = outputs.logits + + if self.type == "ctc_with_lm": + out = {"logits": logits} + else: + out = {"tokens": logits.argmax(dim=-1)} + if stride is not None: + # Send stride to `postprocess`. + # it needs to be handled there where + # the pieces are to be concatenated. + ratio = 1 / self.model.config.inputs_to_logits_ratio + if isinstance(stride, tuple): + out["stride"] = rescale_stride([stride], ratio)[0] + else: + out["stride"] = rescale_stride(stride, ratio) + # Leftover + extra = model_inputs + return {"is_last": is_last, **out, **extra} + + def postprocess( + self, model_outputs, decoder_kwargs: Optional[Dict] = None, return_timestamps=None, return_language=None + ): + # Optional return types + optional = {} + + final_items = [] + key = "logits" if self.type == "ctc_with_lm" else "tokens" + stride = None + for outputs in model_outputs: + if self.framework == "pt" and outputs[key].dtype in (torch.bfloat16, torch.float16): + items = outputs[key].to(torch.float32).numpy() + else: + items = outputs[key].numpy() + stride = outputs.get("stride", None) + if stride is not None and self.type in {"ctc", "ctc_with_lm"}: + total_n, left, right = stride + # Total_n might be < logits.shape[1] + # because of padding, that's why + # we need to reconstruct this information + # This won't work with left padding (which doesn't exist right now) + right_n = total_n - right + items = items[:, left:right_n] + final_items.append(items) + + if stride and self.type == "seq2seq": + items = _find_longest_common_sequence(final_items, self.tokenizer) + elif self.type == "seq2seq_whisper": + time_precision = 
self.feature_extractor.chunk_length / self.model.config.max_source_positions + # Send the chunking back to seconds, it's easier to handle in whisper + sampling_rate = self.feature_extractor.sampling_rate + for output in model_outputs: + if "stride" in output: + chunk_len, stride_left, stride_right = output["stride"] + # Go back in seconds + chunk_len /= sampling_rate + stride_left /= sampling_rate + stride_right /= sampling_rate + output["stride"] = chunk_len, stride_left, stride_right + + text, optional = self.tokenizer._decode_asr( + model_outputs, + return_timestamps=return_timestamps, + return_language=return_language, + time_precision=time_precision, + ) + else: + items = np.concatenate(final_items, axis=1) + items = items.squeeze(0) + + if self.type == "ctc_with_lm": + if decoder_kwargs is None: + decoder_kwargs = {} + beams = self.decoder.decode_beams(items, **decoder_kwargs) + text = beams[0][0] + if return_timestamps: + # Simply cast from pyctcdecode format to wav2vec2 format to leverage + # pre-existing code later + chunk_offset = beams[0][2] + offsets = [] + for word, (start_offset, end_offset) in chunk_offset: + offsets.append({"word": word, "start_offset": start_offset, "end_offset": end_offset}) + elif self.type != "seq2seq_whisper": + skip_special_tokens = self.type != "ctc" + text = self.tokenizer.decode(items, skip_special_tokens=skip_special_tokens) + if return_timestamps: + offsets = self.tokenizer.decode( + items, skip_special_tokens=skip_special_tokens, output_char_offsets=True + )["char_offsets"] + if return_timestamps == "word": + offsets = self.tokenizer._get_word_offsets(offsets, self.tokenizer.replace_word_delimiter_char) + + if return_timestamps and self.type not in {"seq2seq", "seq2seq_whisper"}: + chunks = [] + for item in offsets: + start = item["start_offset"] * self.model.config.inputs_to_logits_ratio + start /= self.feature_extractor.sampling_rate + + stop = item["end_offset"] * self.model.config.inputs_to_logits_ratio + stop /= 
def _find_timestamp_sequence(sequences, tokenizer, feature_extractor, max_source_positions):
    """
    Computes the final sequences by merging the end of the nth sequence with the beginning of the n+1th sequence. Since
    `WhisperForConditionalGeneration` produces the timestamps pairwise, we filter the consecutive timestamps and only
    iterate over them. We keep track of the `time` which indicates the actual starting time of the chunk that is
    processed. We need to make sure to offset the timestamps tokens by the `time` in order for the tokenizer to
    properly compute the final `offset`.
    """
    # index of the first timestamp token
    timestamp_begin = tokenizer.convert_tokens_to_ids("<|notimestamps|>") + 1
    items = []
    # approximation of the token to time ratio : ~0.2seconds
    time_precision = feature_extractor.chunk_length / max_source_positions
    time = 0
    for seq_idx, item in enumerate(sequences):
        # each item is (token sequence, (chunk_len, stride_left, stride_right)) in samples
        sequence, stride = item
        if isinstance(sequence, list):
            sequence = np.array(sequence)
        chunk_len, stride_left, stride_right = stride
        sequence = sequence.squeeze(0)
        # get rid of the `forced_decoder_idx` that are use to parametrize the generation
        begin_idx = np.where(sequence == timestamp_begin)[0][0] if timestamp_begin in sequence else 0
        sequence = sequence[begin_idx:]

        # token ids >= timestamp_begin encode timestamps, everything below is text
        timestamp_tokens = sequence >= timestamp_begin
        if seq_idx != 0 and sum(timestamp_tokens) > 0:
            consecutive = np.where(timestamp_tokens[:-1] & timestamp_tokens[1:])[0] + 1
            last_timestamp = np.where(timestamp_tokens)[0][-1]
            consecutive = np.append(consecutive, last_timestamp) if last_timestamp not in consecutive else consecutive
            time -= stride_left + stride_right
            offset = int((time / feature_extractor.sampling_rate) / time_precision)
            overlap_time = int((stride_left / feature_extractor.sampling_rate) / time_precision)
            # relevant timestamps are in the overlapping part
            relevant_timestamp = np.where(sequence[consecutive] >= timestamp_begin + overlap_time)[0]
            if relevant_timestamp.shape[0] > 0:
                relevant_timestamp = (
                    consecutive[relevant_timestamp[0] - 1] if relevant_timestamp[0] > 0 else consecutive[0]
                )
                # if a big stride is used, we need to check some of the previous items for the best overlap
                best_match = 0
                sliced_sequence = []
                for idx, previous_sequence in enumerate(reversed(items)):
                    previous_tokens = previous_sequence[1:-1]
                    if previous_sequence[0] < (timestamp_begin + offset - overlap_time) and idx != 0:
                        break  # the previous sequence is too far in the past
                    if len(previous_tokens) > 0:
                        # find the longest common sequence between the overlapping parts
                        index_left, index_right, match_length = _fast_find_longest_common_sequence(
                            sequence[1:relevant_timestamp], previous_tokens
                        )
                        # don't do anything if only 1 token was matched
                        if match_length > 1 and match_length > best_match:
                            best_match = match_length
                            best_idx = idx
                            end_of_curr_sequence_idx = (
                                np.where(sequence[index_left + 1 :] >= timestamp_begin)[0][0] + 1
                            )
                            end_of_curr_sequence_idx = end_of_curr_sequence_idx + 1 + index_left
                            # if all the tokens are matched, suffix
                            if index_left == 0 and match_length == len(previous_tokens):
                                sliced_sequence = np.insert(
                                    sequence[index_left + 1 : end_of_curr_sequence_idx], 0, previous_sequence[0]
                                )
                                sliced_sequence[-1] = previous_sequence[-1]
                            # if part of the previous sequence is not taken
                            elif index_left >= 0:
                                sliced_sequence = sequence[index_left + 1 : end_of_curr_sequence_idx]
                                # let's insert the missing part of the previous sequence
                                previous_slice = (
                                    previous_sequence[: index_right + 1] if index_right > 0 else [previous_sequence[0]]
                                )
                                sliced_sequence = np.insert(sliced_sequence, 0, previous_slice)
                                sliced_sequence[-1] += offset

                if len(sliced_sequence) > 0:
                    # replace the overlapped previous item with the merged slice and
                    # drop everything after it, then continue from the unmerged remainder
                    items[len(items) - best_idx - 1] = sliced_sequence
                    items = items[: len(items) - best_idx]
                    sequence = sequence[end_of_curr_sequence_idx:]

        # sequence might have changed
        timestamp_tokens = sequence >= timestamp_begin
        consecutive = np.where(timestamp_tokens[:-1] & timestamp_tokens[1:])[0] + 1
        if sum(timestamp_tokens) > 0:
            last_timestamp = np.where(timestamp_tokens)[0][-1]
            consecutive = (
                np.append(consecutive, last_timestamp + 1) if last_timestamp not in consecutive else consecutive
            )

        if len(consecutive) > 0:
            last_slice = 0
            for current_slice in consecutive:
                # rebase each slice's begin/end timestamps onto the running offset
                actual_offset = items[-1][-1] if seq_idx != 0 or last_slice != 0 else sequence[0]
                sliced_tokens = sequence[last_slice:current_slice]
                duration = sliced_tokens[-1] - sliced_tokens[0]
                sliced_tokens[0] = actual_offset
                sliced_tokens[-1] = actual_offset + duration
                items.append(sliced_tokens)
                last_slice = current_slice

        time += chunk_len
    result = []
    for i in range(len(items)):
        result += items[i].tolist()
    return result
def no_collate_fn(items):
    """Collate function for batch_size=1: simply unwrap the singleton batch."""
    if len(items) == 1:
        return items[0]
    raise ValueError("This collate_fn is meant to be used with batch_size=1")
`attention_mask` etc... + shape = items[0][key].shape + dim = len(shape) + if dim == 1: + # We have a list of 1-dim torch tensors, which can be stacked without padding + return torch.cat([item[key] for item in items], dim=0) + if key in ["pixel_values", "image"]: + # This is probable image so padding shouldn't be necessary + # B, C, H, W + return torch.cat([item[key] for item in items], dim=0) + elif dim == 4 and key == "input_features": + # this is probably a mel spectrogram batched + return torch.cat([item[key] for item in items], dim=0) + max_length = max(item[key].shape[1] for item in items) + min_length = min(item[key].shape[1] for item in items) + dtype = items[0][key].dtype + + if dim == 2: + if max_length == min_length: + # Bypass for `ImageGPT` which doesn't provide a padding value, yet + # we can consistently pad since the size should be matching + return torch.cat([item[key] for item in items], dim=0) + tensor = torch.zeros((batch_size, max_length), dtype=dtype) + padding_value + elif dim == 3: + tensor = torch.zeros((batch_size, max_length, shape[-1]), dtype=dtype) + padding_value + elif dim == 4: + tensor = torch.zeros((batch_size, max_length, shape[-2], shape[-1]), dtype=dtype) + padding_value + + for i, item in enumerate(items): + if dim == 2: + if padding_side == "left": + tensor[i, -len(item[key][0]) :] = item[key][0].clone() + else: + tensor[i, : len(item[key][0])] = item[key][0].clone() + elif dim == 3: + if padding_side == "left": + tensor[i, -len(item[key][0]) :, :] = item[key][0].clone() + else: + tensor[i, : len(item[key][0]), :] = item[key][0].clone() + elif dim == 4: + if padding_side == "left": + tensor[i, -len(item[key][0]) :, :, :] = item[key][0].clone() + else: + tensor[i, : len(item[key][0]), :, :] = item[key][0].clone() + + return tensor + else: + return [item[key] for item in items] + + +def pad_collate_fn(tokenizer, feature_extractor): + # Tokenizer + t_padding_side = None + # Feature extractor + f_padding_side = None + if tokenizer 
def pad_collate_fn(tokenizer, feature_extractor):
    """
    Build a collate function that pads and batches pipeline feature dicts.

    Padding values and padding side are derived from the tokenizer and/or feature
    extractor; the feature extractor's side wins when both define one, and a
    conflict between the two raises. Returns the inner collate callable.
    """
    # Padding sides, resolved below; None means "not specified by that component".
    t_padding_side = None
    f_padding_side = None
    if tokenizer is None and feature_extractor is None:
        raise ValueError("Pipeline without tokenizer or feature_extractor cannot do batching")
    if tokenizer is not None:
        if tokenizer.pad_token_id is None:
            raise ValueError(
                "Pipeline with tokenizer without pad_token cannot do batching. You can try to set it with "
                "`pipe.tokenizer.pad_token_id = model.config.eos_token_id`."
            )
        t_padding_value = tokenizer.pad_token_id
        t_padding_side = tokenizer.padding_side
    if feature_extractor is not None:
        # Feature extractor can be images, where no padding is expected
        f_padding_value = getattr(feature_extractor, "padding_value", None)
        f_padding_side = getattr(feature_extractor, "padding_side", None)

    if t_padding_side is not None and f_padding_side is not None and t_padding_side != f_padding_side:
        raise ValueError(
            f"The feature extractor, and tokenizer don't agree on padding side {t_padding_side} != {f_padding_side}"
        )
    padding_side = "right" if t_padding_side is None else t_padding_side
    if f_padding_side is not None:
        padding_side = f_padding_side

    def inner(items):
        """Pad every shared key across `items` and return one batched dict."""
        expected_keys = set(items[0].keys())
        for item in items:
            if set(item.keys()) != expected_keys:
                raise ValueError(
                    f"The elements of the batch contain different keys. Cannot batch them ({set(item.keys())} !="
                    f" {expected_keys})"
                )
        # input_values, input_pixels, input_ids, ...
        batch = {}
        for key in expected_keys:
            if key in {"input_ids"}:
                # ImageGPT uses a feature extractor
                if tokenizer is None and feature_extractor is not None:
                    pad_value = f_padding_value
                else:
                    pad_value = t_padding_value
            elif key in {"input_values", "pixel_values", "input_features"}:
                pad_value = f_padding_value
            elif key in {"p_mask", "special_tokens_mask"}:
                pad_value = 1
            elif key in {"attention_mask", "token_type_ids"}:
                pad_value = 0
            else:
                # This is likely another random key maybe even user provided
                pad_value = 0
            batch[key] = _pad(items, key, pad_value, padding_side)
        return batch

    return inner
def infer_framework_load_model(
    model,
    config: AutoConfig,
    model_classes: Optional[Dict[str, Tuple[type]]] = None,
    task: Optional[str] = None,
    framework: Optional[str] = None,
    **model_kwargs,
):
    """
    Select framework (TensorFlow or PyTorch) to use from the `model` passed. Returns a tuple (framework, model).

    If `model` is instantiated, this function will just infer the framework from the model class. Otherwise `model` is
    actually a checkpoint name and this method will try to instantiate it using `model_classes`. Since we don't want to
    instantiate the model twice, this model is returned for use by the pipeline.

    If both frameworks are installed and available for `model`, PyTorch is selected.

    Args:
        model (`str`, [`PreTrainedModel`] or [`TFPreTrainedModel]`):
            The model to infer the framework from. If `str`, a checkpoint name.
        config ([`AutoConfig`]):
            The config associated with the model to help using the correct class
        model_classes (dictionary `str` to `type`, *optional*):
            A mapping framework to class.
        task (`str`):
            The task defining which pipeline will be returned.
        model_kwargs:
            Additional dictionary of keyword arguments passed along to the model's `from_pretrained(...,
            **model_kwargs)` function.

    Returns:
        `Tuple`: A tuple framework, model.

    Raises:
        RuntimeError: if neither TensorFlow nor PyTorch is installed.
        ValueError: if no candidate model class can be found, or none of them loads the checkpoint.
    """
    if not is_tf_available() and not is_torch_available():
        raise RuntimeError(
            "At least one of TensorFlow 2.0 or PyTorch should be installed. "
            "To install TensorFlow 2.0, read the instructions at https://www.tensorflow.org/install/ "
            "To install PyTorch, read the instructions at https://pytorch.org/."
        )
    if isinstance(model, str):
        model_kwargs["_from_pipeline"] = task
        # Collect every candidate class: explicit task classes first, then the
        # architectures declared in the config.
        class_tuple = ()
        look_pt = is_torch_available() and framework in {"pt", None}
        look_tf = is_tf_available() and framework in {"tf", None}
        if model_classes:
            if look_pt:
                class_tuple = class_tuple + model_classes.get("pt", (AutoModel,))
            if look_tf:
                class_tuple = class_tuple + model_classes.get("tf", (TFAutoModel,))
        if config.architectures:
            classes = []
            for architecture in config.architectures:
                transformers_module = importlib.import_module("transformers")
                if look_pt:
                    _class = getattr(transformers_module, architecture, None)
                    if _class is not None:
                        classes.append(_class)
                if look_tf:
                    # TF architectures are named with a "TF" prefix by convention.
                    _class = getattr(transformers_module, f"TF{architecture}", None)
                    if _class is not None:
                        classes.append(_class)
            class_tuple = class_tuple + tuple(classes)

        if len(class_tuple) == 0:
            raise ValueError(f"Pipeline cannot infer suitable model classes from {model}")

        # Keep per-class tracebacks so the final error message explains every failure.
        all_traceback = {}
        for model_class in class_tuple:
            kwargs = model_kwargs.copy()
            if framework == "pt" and model.endswith(".h5"):
                kwargs["from_tf"] = True
                logger.warning(
                    "Model might be a TensorFlow model (ending with `.h5`) but TensorFlow is not available. "
                    "Trying to load the model with PyTorch."
                )
            elif framework == "tf" and model.endswith(".bin"):
                kwargs["from_pt"] = True
                logger.warning(
                    "Model might be a PyTorch model (ending with `.bin`) but PyTorch is not available. "
                    "Trying to load the model with Tensorflow."
                )

            try:
                model = model_class.from_pretrained(model, **kwargs)
                if hasattr(model, "eval"):
                    model = model.eval()
                # Stop loading on the first successful load.
                break
            except (OSError, ValueError):
                all_traceback[model_class.__name__] = traceback.format_exc()
                continue

        # `model` is still the checkpoint string only if every class failed to load it.
        if isinstance(model, str):
            error = ""
            for class_name, trace in all_traceback.items():
                error += f"while loading with {class_name}, an error is thrown:\n{trace}\n"
            raise ValueError(
                f"Could not load model {model} with any of the following classes: {class_tuple}. See the original errors:\n\n{error}\n"
            )

    if framework is None:
        framework = infer_framework(model.__class__)
    return framework, model
def infer_framework_from_model(
    model,
    model_classes: Optional[Dict[str, Tuple[type]]] = None,
    task: Optional[str] = None,
    framework: Optional[str] = None,
    **model_kwargs,
):
    """
    Select framework (TensorFlow or PyTorch) to use from the `model` passed. Returns a tuple (framework, model).

    If `model` is instantiated, this function will just infer the framework from the model class. Otherwise `model` is
    actually a checkpoint name and this method will try to instantiate it using `model_classes`. Since we don't want to
    instantiate the model twice, this model is returned for use by the pipeline.

    If both frameworks are installed and available for `model`, PyTorch is selected.

    Args:
        model (`str`, [`PreTrainedModel`] or [`TFPreTrainedModel]`):
            The model to infer the framework from. If `str`, a checkpoint name.
        model_classes (dictionary `str` to `type`, *optional*):
            A mapping framework to class.
        task (`str`):
            The task defining which pipeline will be returned.
        model_kwargs:
            Additional dictionary of keyword arguments passed along to the model's `from_pretrained(...,
            **model_kwargs)` function.

    Returns:
        `Tuple`: A tuple framework, model.
    """
    # Resolve the config: download it for checkpoint names, reuse it for live models.
    config = (
        AutoConfig.from_pretrained(model, _from_pipeline=task, **model_kwargs)
        if isinstance(model, str)
        else model.config
    )
    return infer_framework_load_model(
        model, config, model_classes=model_classes, _from_pipeline=task, task=task, framework=framework, **model_kwargs
    )
+ """ + if isinstance(model, str): + config = AutoConfig.from_pretrained(model, _from_pipeline=task, **model_kwargs) + else: + config = model.config + return infer_framework_load_model( + model, config, model_classes=model_classes, _from_pipeline=task, task=task, framework=framework, **model_kwargs + ) + + +def get_framework(model, revision: Optional[str] = None): + """ + Select framework (TensorFlow or PyTorch) to use. + + Args: + model (`str`, [`PreTrainedModel`] or [`TFPreTrainedModel]`): + If both frameworks are installed, picks the one corresponding to the model passed (either a model class or + the model name). If no specific model is provided, defaults to using PyTorch. + """ + warnings.warn( + "`get_framework` is deprecated and will be removed in v5, use `infer_framework_from_model` instead.", + FutureWarning, + ) + if not is_tf_available() and not is_torch_available(): + raise RuntimeError( + "At least one of TensorFlow 2.0 or PyTorch should be installed. " + "To install TensorFlow 2.0, read the instructions at https://www.tensorflow.org/install/ " + "To install PyTorch, read the instructions at https://pytorch.org/." + ) + if isinstance(model, str): + if is_torch_available() and not is_tf_available(): + model = AutoModel.from_pretrained(model, revision=revision) + elif is_tf_available() and not is_torch_available(): + model = TFAutoModel.from_pretrained(model, revision=revision) + else: + try: + model = AutoModel.from_pretrained(model, revision=revision) + except OSError: + model = TFAutoModel.from_pretrained(model, revision=revision) + + framework = infer_framework(model.__class__) + return framework + + +def get_default_model_and_revision( + targeted_task: Dict, framework: Optional[str], task_options: Optional[Any] +) -> Union[str, Tuple[str, str]]: + """ + Select a default model to use for a given task. Defaults to pytorch if ambiguous. 

    Args:
        targeted_task (`Dict`):
            Dictionary representing the given task, that should contain default models

        framework (`str`, None)
            "pt", "tf" or None, representing a specific framework if it was specified, or None if we don't know yet.

        task_options (`Any`, None)
            Any further value required by the task to get fully specified, for instance (SRC, TGT) languages for
            translation task.

    Returns

        `str` The model string representing the default model for this pipeline
    """
    # When only one framework is installed, it wins regardless of the caller's hint.
    if is_torch_available() and not is_tf_available():
        framework = "pt"
    elif is_tf_available() and not is_torch_available():
        framework = "tf"

    defaults = targeted_task["default"]
    if task_options:
        if task_options not in defaults:
            raise ValueError(f"The task does not provide any default models for options {task_options}")
        default_models = defaults[task_options]["model"]
    elif "model" in defaults:
        default_models = targeted_task["default"]["model"]
    else:
        # XXX This error message needs to be updated to be more generic if more tasks are going to become
        # parametrized
        raise ValueError('The task defaults can\'t be correctly selected. You probably meant "translation_XX_to_YY"')

    # Still ambiguous (both frameworks installed, no explicit choice): default to PyTorch.
    if framework is None:
        framework = "pt"

    return default_models[framework]


def load_assistant_model(
    model: "PreTrainedModel",
    assistant_model: Optional[Union[str, "PreTrainedModel"]],
    assistant_tokenizer: Optional[PreTrainedTokenizer],
) -> Tuple[Optional["PreTrainedModel"], Optional[PreTrainedTokenizer]]:
    """
    Prepares the assistant model and the assistant tokenizer for a pipeline whose model that can call `generate`.

    Args:
        model ([`PreTrainedModel`]):
            The main model that will be used by the pipeline to make predictions.
        assistant_model (`str` or [`PreTrainedModel`], *optional*):
            The assistant model that will be used by the pipeline to make predictions.
        assistant_tokenizer ([`PreTrainedTokenizer`], *optional*):
            The assistant tokenizer that will be used by the pipeline to encode data for the model.

    Returns:
        Tuple: The loaded assistant model and (optionally) the loaded tokenizer.
    """
    # Assisted generation only makes sense for generative models that were given an assistant.
    if not model.can_generate() or assistant_model is None:
        return None, None

    if not isinstance(model, PreTrainedModel):
        raise ValueError(
            "Assisted generation, triggered by the `assistant_model` argument, is only available for "
            "`PreTrainedModel` model instances. For instance, TF or JAX models are not supported."
        )

    # If the model is passed as a string, load the model and the corresponding tokenizer
    if isinstance(assistant_model, str):
        assistant_config = AutoConfig.from_pretrained(assistant_model)
        _, loaded_assistant_model = infer_framework_load_model(assistant_model, config=assistant_config)
        # The assistant must live on the same device and dtype as the main model.
        loaded_assistant_model = loaded_assistant_model.to(device=model.device, dtype=model.dtype)
        loaded_assistant_tokenizer = AutoTokenizer.from_pretrained(assistant_model)
    else:
        loaded_assistant_model = assistant_model
        loaded_assistant_tokenizer = assistant_tokenizer

    # Finally, let's check the tokenizers: if the two models have different tokenizers, we need to keep the assistant
    # tokenizer
    same_vocab_size = model.config.vocab_size == loaded_assistant_model.config.vocab_size
    same_special_tokens = all(
        getattr(model.config, token) == getattr(loaded_assistant_model.config, token)
        for token in ("eos_token_id", "pad_token_id", "bos_token_id")
    )
    if same_vocab_size and same_special_tokens:
        # Same tokenizer on both sides: the assistant one is redundant.
        loaded_assistant_tokenizer = None
    elif loaded_assistant_tokenizer is None:
        raise ValueError(
            "The assistant model has a different tokenizer than the main model. You should pass the assistant "
            "tokenizer."
        )

    return loaded_assistant_model, loaded_assistant_tokenizer


class PipelineException(Exception):
    """
    Raised by a [`Pipeline`] when handling __call__.

    Args:
        task (`str`): The task of the pipeline.
        model (`str`): The model used by the pipeline.
        reason (`str`): The error message to display.
    """

    def __init__(self, task: str, model: str, reason: str):
        super().__init__(reason)

        self.task = task
        self.model = model


class ArgumentHandler(ABC):
    """
    Base interface for handling arguments for each [`~pipelines.Pipeline`].
    """

    @abstractmethod
    def __call__(self, *args, **kwargs):
        raise NotImplementedError()


class PipelineDataFormat:
    """
    Base class for all the pipeline supported data format both for reading and writing. Supported data formats
    currently includes:

    - JSON
    - CSV
    - stdin/stdout (pipe)

    `PipelineDataFormat` also includes some utilities to work with multi-columns like mapping from datasets columns to
    pipelines keyword arguments through the `dataset_kwarg_1=dataset_column_1` format.

    Args:
        output_path (`str`): Where to save the outgoing data.
        input_path (`str`): Where to look for the input data.
        column (`str`): The column to read.
        overwrite (`bool`, *optional*, defaults to `False`):
            Whether or not to overwrite the `output_path`.
    """

    SUPPORTED_FORMATS = ["json", "csv", "pipe"]

    def __init__(
        self,
        output_path: Optional[str],
        input_path: Optional[str],
        column: Optional[str],
        overwrite: bool = False,
    ):
        self.output_path = output_path
        self.input_path = input_path
        # `column` is a comma-separated list; a `None` column becomes a single empty-string column.
        self.column = column.split(",") if column is not None else [""]
        self.is_multi_columns = len(self.column) > 1

        if self.is_multi_columns:
            # Each entry is either "kwarg=dataset_column" or a plain name mapped to itself.
            self.column = [tuple(c.split("=")) if "=" in c else (c, c) for c in self.column]

        if output_path is not None and not overwrite:
            if exists(abspath(self.output_path)):
                raise OSError(f"{self.output_path} already exists on disk")

        if input_path is not None:
            if not exists(abspath(self.input_path)):
                raise OSError(f"{self.input_path} doesnt exist on disk")

    @abstractmethod
    def __iter__(self):
        raise NotImplementedError()

    @abstractmethod
    def save(self, data: Union[dict, List[dict]]):
        """
        Save the provided data object with the representation for the current [`~pipelines.PipelineDataFormat`].

        Args:
            data (`dict` or list of `dict`): The data to store.
        """
        raise NotImplementedError()

    def save_binary(self, data: Union[dict, List[dict]]) -> str:
        """
        Save the provided data object as a pickle-formatted binary data on the disk.

        Args:
            data (`dict` or list of `dict`): The data to store.

        Returns:
            `str`: Path where the data has been saved.
        """
        # Swap the configured output extension for ".pickle".
        path, _ = os.path.splitext(self.output_path)
        binary_path = os.path.extsep.join((path, "pickle"))

        with open(binary_path, "wb+") as f_output:
            pickle.dump(data, f_output)

        return binary_path

    @staticmethod
    def from_str(
        format: str,
        output_path: Optional[str],
        input_path: Optional[str],
        column: Optional[str],
        overwrite=False,
    ) -> "PipelineDataFormat":
        """
        Creates an instance of the right subclass of [`~pipelines.PipelineDataFormat`] depending on `format`.

        Args:
            format (`str`):
                The format of the desired pipeline.
Acceptable values are `"json"`, `"csv"` or `"pipe"`.
            output_path (`str`, *optional*):
                Where to save the outgoing data.
            input_path (`str`, *optional*):
                Where to look for the input data.
            column (`str`, *optional*):
                The column to read.
            overwrite (`bool`, *optional*, defaults to `False`):
                Whether or not to overwrite the `output_path`.

        Returns:
            [`~pipelines.PipelineDataFormat`]: The proper data format.
        """
        if format == "json":
            return JsonPipelineDataFormat(output_path, input_path, column, overwrite=overwrite)
        elif format == "csv":
            return CsvPipelineDataFormat(output_path, input_path, column, overwrite=overwrite)
        elif format == "pipe":
            return PipedPipelineDataFormat(output_path, input_path, column, overwrite=overwrite)
        else:
            raise KeyError(f"Unknown reader {format} (Available reader are json/csv/pipe)")


class CsvPipelineDataFormat(PipelineDataFormat):
    """
    Support for pipelines using CSV data format.

    Args:
        output_path (`str`): Where to save the outgoing data.
        input_path (`str`): Where to look for the input data.
        column (`str`): The column to read.
        overwrite (`bool`, *optional*, defaults to `False`):
            Whether or not to overwrite the `output_path`.
    """

    def __init__(
        self,
        output_path: Optional[str],
        input_path: Optional[str],
        column: Optional[str],
        overwrite=False,
    ):
        super().__init__(output_path, input_path, column, overwrite=overwrite)

    def __iter__(self):
        # Yields one pipeline input per CSV row: a kwargs dict in multi-column mode,
        # otherwise the single configured column's value.
        with open(self.input_path, "r") as f:
            reader = csv.DictReader(f)
            for row in reader:
                if self.is_multi_columns:
                    yield {k: row[c] for k, c in self.column}
                else:
                    yield row[self.column[0]]

    def save(self, data: List[dict]):
        """
        Save the provided data object with the representation for the current [`~pipelines.PipelineDataFormat`].

        Args:
            data (`List[dict]`): The data to store.
        """
        with open(self.output_path, "w") as f:
            if len(data) > 0:
                # Use the first record's keys as the CSV header.
                writer = csv.DictWriter(f, list(data[0].keys()))
                writer.writeheader()
                writer.writerows(data)


class JsonPipelineDataFormat(PipelineDataFormat):
    """
    Support for pipelines using JSON file format.

    Args:
        output_path (`str`): Where to save the outgoing data.
        input_path (`str`): Where to look for the input data.
        column (`str`): The column to read.
        overwrite (`bool`, *optional*, defaults to `False`):
            Whether or not to overwrite the `output_path`.
    """

    def __init__(
        self,
        output_path: Optional[str],
        input_path: Optional[str],
        column: Optional[str],
        overwrite=False,
    ):
        super().__init__(output_path, input_path, column, overwrite=overwrite)

        # The whole input file is read eagerly; entries are served from memory by `__iter__`.
        with open(input_path, "r") as f:
            self._entries = json.load(f)

    def __iter__(self):
        for entry in self._entries:
            if self.is_multi_columns:
                yield {k: entry[c] for k, c in self.column}
            else:
                yield entry[self.column[0]]

    def save(self, data: dict):
        """
        Save the provided data object in a json file.

        Args:
            data (`dict`): The data to store.
        """
        with open(self.output_path, "w") as f:
            json.dump(data, f)


class PipedPipelineDataFormat(PipelineDataFormat):
    """
    Read data from piped input to the python process. For multi columns data, columns should separated by \t

    If columns are provided, then the output will be a dictionary with {column_x: value_x}

    Args:
        output_path (`str`): Where to save the outgoing data.
        input_path (`str`): Where to look for the input data.
        column (`str`): The column to read.
        overwrite (`bool`, *optional*, defaults to `False`):
            Whether or not to overwrite the `output_path`.
+ """ + + def __iter__(self): + for line in sys.stdin: + # Split for multi-columns + if "\t" in line: + line = line.split("\t") + if self.column: + # Dictionary to map arguments + yield {kwargs: l for (kwargs, _), l in zip(self.column, line)} + else: + yield tuple(line) + + # No dictionary to map arguments + else: + yield line + + def save(self, data: dict): + """ + Print the data. + + Args: + data (`dict`): The data to store. + """ + print(data) + + def save_binary(self, data: Union[dict, List[dict]]) -> str: + if self.output_path is None: + raise KeyError( + "When using piped input on pipeline outputting large object requires an output file path. " + "Please provide such output path through --output argument." + ) + + return super().save_binary(data) + + +class _ScikitCompat(ABC): + """ + Interface layer for the Scikit and Keras compatibility. + """ + + @abstractmethod + def transform(self, X): + raise NotImplementedError() + + @abstractmethod + def predict(self, X): + raise NotImplementedError() + + +def build_pipeline_init_args( + has_tokenizer: bool = False, + has_feature_extractor: bool = False, + has_image_processor: bool = False, + has_processor: bool = False, + supports_binary_output: bool = True, +) -> str: + docstring = r""" + Arguments: + model ([`PreTrainedModel`] or [`TFPreTrainedModel`]): + The model that will be used by the pipeline to make predictions. This needs to be a model inheriting from + [`PreTrainedModel`] for PyTorch and [`TFPreTrainedModel`] for TensorFlow.""" + if has_tokenizer: + docstring += r""" + tokenizer ([`PreTrainedTokenizer`]): + The tokenizer that will be used by the pipeline to encode data for the model. This object inherits from + [`PreTrainedTokenizer`].""" + if has_feature_extractor: + docstring += r""" + feature_extractor ([`SequenceFeatureExtractor`]): + The feature extractor that will be used by the pipeline to encode data for the model. 
This object inherits from + [`SequenceFeatureExtractor`].""" + if has_image_processor: + docstring += r""" + image_processor ([`BaseImageProcessor`]): + The image processor that will be used by the pipeline to encode data for the model. This object inherits from + [`BaseImageProcessor`].""" + if has_processor: + docstring += r""" + processor ([`ProcessorMixin`]): + The processor that will be used by the pipeline to encode data for the model. This object inherits from + [`ProcessorMixin`]. Processor is a composite object that might contain `tokenizer`, `feature_extractor`, and + `image_processor`.""" + docstring += r""" + modelcard (`str` or [`ModelCard`], *optional*): + Model card attributed to the model for this pipeline. + framework (`str`, *optional*): + The framework to use, either `"pt"` for PyTorch or `"tf"` for TensorFlow. The specified framework must be + installed. + + If no framework is specified, will default to the one currently installed. If no framework is specified and + both frameworks are installed, will default to the framework of the `model`, or to PyTorch if no model is + provided. + task (`str`, defaults to `""`): + A task-identifier for the pipeline. + num_workers (`int`, *optional*, defaults to 8): + When the pipeline will use *DataLoader* (when passing a dataset, on GPU for a Pytorch model), the number of + workers to be used. + batch_size (`int`, *optional*, defaults to 1): + When the pipeline will use *DataLoader* (when passing a dataset, on GPU for a Pytorch model), the size of + the batch to use, for inference this is not always beneficial, please read [Batching with + pipelines](https://huggingface.co/transformers/main_classes/pipelines.html#pipeline-batching) . + args_parser ([`~pipelines.ArgumentHandler`], *optional*): + Reference to the object in charge of parsing supplied pipeline parameters. + device (`int`, *optional*, defaults to -1): + Device ordinal for CPU/GPU supports. 
Setting this to -1 will leverage CPU, a positive will run the model on
            the associated CUDA device id. You can pass native `torch.device` or a `str` too
        torch_dtype (`str` or `torch.dtype`, *optional*):
            Sent directly as `model_kwargs` (just a simpler shortcut) to use the available precision for this model
            (`torch.float16`, `torch.bfloat16`, ... or `"auto"`)"""
    if supports_binary_output:
        docstring += r"""
        binary_output (`bool`, *optional*, defaults to `False`):
            Flag indicating if the output the pipeline should happen in a serialized format (i.e., pickle) or as
            the raw output data e.g. text."""
    return docstring


# Docstring fragment shared by all pipelines that accept every processing component.
PIPELINE_INIT_ARGS = build_pipeline_init_args(
    has_tokenizer=True,
    has_feature_extractor=True,
    has_image_processor=True,
    has_processor=True,
    supports_binary_output=True,
)


if is_torch_available():
    from transformers.pipelines.pt_utils import (
        PipelineChunkIterator,
        PipelineDataset,
        PipelineIterator,
        PipelinePackIterator,
    )


@add_end_docstrings(
    build_pipeline_init_args(
        has_tokenizer=True, has_feature_extractor=True, has_image_processor=True, has_processor=True
    )
)
class Pipeline(_ScikitCompat, PushToHubMixin):
    """
    The Pipeline class is the class from which all pipelines inherit. Refer to this class for methods shared across
    different pipelines.

    Base class implementing pipelined operations. Pipeline workflow is defined as a sequence of the following
    operations:

        Input -> Tokenization -> Model Inference -> Post-Processing (task dependent) -> Output

    Pipeline supports running on CPU or GPU through the device argument (see below).

    Some pipeline, like for instance [`FeatureExtractionPipeline`] (`'feature-extraction'`) output large tensor object
    as nested-lists. In order to avoid dumping such large structure as textual data we provide the `binary_output`
    constructor argument. If set to `True`, the output will be stored in the pickle format.
    """

    # Historically we have pipelines working with `tokenizer`, `feature_extractor`, and `image_processor`
    # as separate processing components. While we have `processor` class that combines them, some pipelines
    # might still operate with these components separately.
    # With the addition of `processor` to `pipeline`, we want to avoid:
    #     - loading `processor` for pipelines that still work with `image_processor` and `tokenizer` separately;
    #     - loading `image_processor`/`tokenizer` as a separate component while we operate only with `processor`,
    #       because `processor` will load required sub-components by itself.
    # Below flags allow granular control over loading components and set to be backward compatible with current
    # pipelines logic. You may override these flags when creating your pipeline. For example, for
    # `zero-shot-object-detection` pipeline which operates with `processor` you should set `_load_processor=True`
    # and all the rest flags to `False` to avoid unnecessary loading of the components.
    # Component-loading flags — see the comment block above for their semantics.
    _load_processor = False
    _load_image_processor = True
    _load_feature_extractor = True
    _load_tokenizer = True

    default_input_names = None

    def __init__(
        self,
        model: Union["PreTrainedModel", "TFPreTrainedModel"],
        tokenizer: Optional[PreTrainedTokenizer] = None,
        feature_extractor: Optional[PreTrainedFeatureExtractor] = None,
        image_processor: Optional[BaseImageProcessor] = None,
        processor: Optional[ProcessorMixin] = None,
        modelcard: Optional[ModelCard] = None,
        framework: Optional[str] = None,
        task: str = "",
        args_parser: ArgumentHandler = None,
        device: Union[int, "torch.device"] = None,
        torch_dtype: Optional[Union[str, "torch.dtype"]] = None,
        binary_output: bool = False,
        **kwargs,
    ):
        if framework is None:
            framework, model = infer_framework_load_model(model, config=model.config)

        self.task = task
        self.model = model
        self.tokenizer = tokenizer
        self.feature_extractor = feature_extractor
        self.image_processor = image_processor
        self.processor = processor
        self.modelcard = modelcard
        self.framework = framework

        # `accelerate` device map
        hf_device_map = getattr(self.model, "hf_device_map", None)

        if hf_device_map is not None and device is not None:
            raise ValueError(
                "The model has been loaded with `accelerate` and therefore cannot be moved to a specific device. Please "
                "discard the `device` argument when creating your pipeline object."
            )

        if device is None:
            if hf_device_map is not None:
                # Take the first device used by `accelerate`.
                device = next(iter(hf_device_map.values()))
            else:
                device = 0

        if is_torch_available() and self.framework == "pt":
            # Resolve the many accepted `device` spellings (torch.device, str, int ordinal)
            # into a concrete `torch.device`, probing backends in priority order for ints >= 0.
            if device == -1 and self.model.device is not None:
                device = self.model.device
            if isinstance(device, torch.device):
                if device.type == "xpu" and not is_torch_xpu_available(check_device=True):
                    raise ValueError(f'{device} is not available, you should use device="cpu" instead')
                self.device = device
            elif isinstance(device, str):
                if "xpu" in device and not is_torch_xpu_available(check_device=True):
                    raise ValueError(f'{device} is not available, you should use device="cpu" instead')
                self.device = torch.device(device)
            elif device < 0:
                self.device = torch.device("cpu")
            elif is_torch_mlu_available():
                self.device = torch.device(f"mlu:{device}")
            elif is_torch_musa_available():
                self.device = torch.device(f"musa:{device}")
            elif is_torch_cuda_available():
                self.device = torch.device(f"cuda:{device}")
            elif is_torch_npu_available():
                self.device = torch.device(f"npu:{device}")
            elif is_torch_xpu_available(check_device=True):
                self.device = torch.device(f"xpu:{device}")
            elif is_torch_mps_available():
                self.device = torch.device(f"mps:{device}")
            else:
                self.device = torch.device("cpu")
        else:
            # Non-PyTorch frameworks keep the raw value; -1 conventionally means CPU.
            self.device = device if device is not None else -1

        logger.warning(f"Device set to use {self.device}")

        self.binary_output = binary_output
        # We shouldn't call `model.to()` for models loaded with accelerate as well as the case that model is already on device
        if (
            self.framework == "pt"
            and self.model.device != self.device
            and not (isinstance(self.device, int) and self.device < 0)
            and hf_device_map is None
        ):
            self.model.to(self.device)

        # If the model can generate:
        # 1 - create a local generation config. This is done to avoid side-effects on the model as we apply local
        #   tweaks to the generation config.
        # 2 - load the assistant model if it is passed.
        self.assistant_model, self.assistant_tokenizer = load_assistant_model(
            self.model, kwargs.pop("assistant_model", None), kwargs.pop("assistant_tokenizer", None)
        )
        if self.model.can_generate():
            self.prefix = self.model.config.prefix if hasattr(self.model.config, "prefix") else None
            self.generation_config = copy.deepcopy(self.model.generation_config)
            # Update the generation config with task specific params if they exist
            # NOTE: `prefix` is pipeline-specific and doesn't exist in the generation config.
            task_specific_params = self.model.config.task_specific_params
            if task_specific_params is not None and task in task_specific_params:
                this_task_params = task_specific_params.get(task)
                if "prefix" in this_task_params:
                    self.prefix = this_task_params.pop("prefix")
                self.generation_config.update(**this_task_params)
            # If the tokenizer has a pad token but the model doesn't, set it so that `generate` is aware of it.
            if (
                self.tokenizer is not None
                and self.tokenizer.pad_token_id is not None
                and self.generation_config.pad_token_id is None
            ):
                self.generation_config.pad_token_id = self.tokenizer.pad_token_id

        self.call_count = 0
        self._batch_size = kwargs.pop("batch_size", None)
        self._num_workers = kwargs.pop("num_workers", None)
        # Remaining kwargs are pipeline-specific and are split by `_sanitize_parameters` into the
        # defaults applied at every `__call__`.
        self._preprocess_params, self._forward_params, self._postprocess_params = self._sanitize_parameters(**kwargs)

        # In processor only mode, we can get the modality processors from the processor
        if self.processor is not None and all(
            [self.tokenizer is None, self.feature_extractor is None, self.image_processor is None]
        ):
            self.tokenizer = getattr(self.processor, "tokenizer", None)
            self.feature_extractor = getattr(self.processor, "feature_extractor", None)
            self.image_processor = getattr(self.processor, "image_processor", None)

        if self.image_processor is None and self.feature_extractor is not None:
            if isinstance(self.feature_extractor, BaseImageProcessor):
                # Backward compatible change, if users called
                # ImageSegmentationPipeline(.., feature_extractor=MyFeatureExtractor())
                # then we should keep working
                self.image_processor = self.feature_extractor

    def save_pretrained(
        self,
        save_directory: Union[str, os.PathLike],
        safe_serialization: bool = True,
        **kwargs,
    ):
        """
        Save the pipeline's model and tokenizer.

        Args:
            save_directory (`str` or `os.PathLike`):
                A path to the directory where to saved. It will be created if it doesn't exist.
            safe_serialization (`str`):
                Whether to save the model using `safetensors` or the traditional way for PyTorch or Tensorflow.
            kwargs (`Dict[str, Any]`, *optional*):
                Additional key word arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
        """
        use_auth_token = kwargs.pop("use_auth_token", None)

        if use_auth_token is not None:
            warnings.warn(
                "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
                FutureWarning,
            )
            if kwargs.get("token", None) is not None:
                raise ValueError(
                    "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
                )
            kwargs["token"] = use_auth_token

        if os.path.isfile(save_directory):
            logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
            return
        os.makedirs(save_directory, exist_ok=True)

        if hasattr(self, "_registered_impl"):
            # Add info to the config
            pipeline_info = self._registered_impl.copy()
            custom_pipelines = {}
            for task, info in pipeline_info.items():
                # Only record entries registered for this exact pipeline class.
                if info["impl"] != self.__class__:
                    continue

                info = info.copy()
                module_name = info["impl"].__module__
                last_module = module_name.split(".")[-1]
                # Change classes into their names/full names
                info["impl"] = f"{last_module}.{info['impl'].__name__}"
                info["pt"] = tuple(c.__name__ for c in info["pt"])
                info["tf"] = tuple(c.__name__ for c in info["tf"])

                custom_pipelines[task] = info
            self.model.config.custom_pipelines = custom_pipelines
            # Save the pipeline custom code
            custom_object_save(self, save_directory)

        kwargs["safe_serialization"] = safe_serialization
        self.model.save_pretrained(save_directory, **kwargs)

        if self.tokenizer is not None:
            self.tokenizer.save_pretrained(save_directory, **kwargs)

        if self.feature_extractor is not None:
            self.feature_extractor.save_pretrained(save_directory, **kwargs)

        if self.image_processor is not None:
            self.image_processor.save_pretrained(save_directory, **kwargs)

        if self.modelcard is not None:
            self.modelcard.save_pretrained(save_directory)

    def transform(self, X):
        """
        Scikit / Keras interface to transformers' pipelines. This method will forward to __call__().
        """
        return self(X)

    def predict(self, X):
        """
        Scikit / Keras interface to transformers' pipelines. This method will forward to __call__().
        """
        return self(X)

    @property
    def torch_dtype(self) -> Optional["torch.dtype"]:
        """
        Torch dtype of the model (if it's Pytorch model), `None` otherwise.
        """
        return getattr(self.model, "dtype", None)

    @contextmanager
    def device_placement(self):
        """
        Context Manager allowing tensor allocation on the user-specified device in framework agnostic way.

        Returns:
            Context manager

        Examples:

        ```python
        # Explicitly ask for tensor allocation on CUDA device :0
        pipe = pipeline(..., device=0)
        with pipe.device_placement():
            # Every framework specific tensor allocation will be done on the request device
            output = pipe(...)
        ```"""
        if self.framework == "tf":
            with tf.device("/CPU:0" if self.device == -1 else f"/device:GPU:{self.device}"):
                yield
        else:
            # PyTorch: scope allocations to the accelerator backend when one is in use;
            # CPU (and other device types) need no placement context.
            if self.device.type == "cuda":
                with torch.cuda.device(self.device):
                    yield
            elif self.device.type == "mlu":
                with torch.mlu.device(self.device):
                    yield
            elif self.device.type == "musa":
                with torch.musa.device(self.device):
                    yield
            else:
                yield

    def ensure_tensor_on_device(self, **inputs):
        """
        Ensure PyTorch tensors are on the specified device.

        Args:
            inputs (keyword arguments that should be `torch.Tensor`, the rest is ignored):
                The tensors to place on `self.device`.
                Recursive on lists **only**.

        Return:
            `Dict[str, torch.Tensor]`: The same as `inputs` but on the proper device.
+ """ + return self._ensure_tensor_on_device(inputs, self.device) + + def _ensure_tensor_on_device(self, inputs, device): + if isinstance(inputs, ModelOutput): + return ModelOutput( + {name: self._ensure_tensor_on_device(tensor, device) for name, tensor in inputs.items()} + ) + elif isinstance(inputs, dict): + return {name: self._ensure_tensor_on_device(tensor, device) for name, tensor in inputs.items()} + elif isinstance(inputs, UserDict): + return UserDict({name: self._ensure_tensor_on_device(tensor, device) for name, tensor in inputs.items()}) + elif isinstance(inputs, list): + return [self._ensure_tensor_on_device(item, device) for item in inputs] + elif isinstance(inputs, tuple): + return tuple([self._ensure_tensor_on_device(item, device) for item in inputs]) + elif isinstance(inputs, torch.Tensor): + return inputs.to(device) + else: + return inputs + + def check_model_type(self, supported_models: Union[List[str], dict]): + """ + Check if the model class is in supported by the pipeline. + + Args: + supported_models (`List[str]` or `dict`): + The list of models supported by the pipeline, or a dictionary with model class values. + """ + if not isinstance(supported_models, list): # Create from a model mapping + supported_models_names = [] + for _, model_name in supported_models.items(): + # Mapping can now contain tuples of models for the same configuration. + if isinstance(model_name, tuple): + supported_models_names.extend(list(model_name)) + else: + supported_models_names.append(model_name) + if hasattr(supported_models, "_model_mapping"): + for _, model in supported_models._model_mapping._extra_content.items(): + if isinstance(model_name, tuple): + supported_models_names.extend([m.__name__ for m in model]) + else: + supported_models_names.append(model.__name__) + supported_models = supported_models_names + if self.model.__class__.__name__ not in supported_models: + logger.error( + f"The model '{self.model.__class__.__name__}' is not supported for {self.task}. 
Supported models are" + f" {supported_models}." + ) + + @abstractmethod + def _sanitize_parameters(self, **pipeline_parameters): + """ + _sanitize_parameters will be called with any excessive named arguments from either `__init__` or `__call__` + methods. It should return 3 dictionaries of the resolved parameters used by the various `preprocess`, + `forward` and `postprocess` methods. Do not fill dictionaries if the caller didn't specify a kwargs. This + lets you keep defaults in function signatures, which is more "natural". + + It is not meant to be called directly, it will be automatically called and the final parameters resolved by + `__init__` and `__call__` + """ + raise NotImplementedError("_sanitize_parameters not implemented") + + @abstractmethod + def preprocess(self, input_: Any, **preprocess_parameters: Dict) -> Dict[str, GenericTensor]: + """ + Preprocess will take the `input_` of a specific pipeline and return a dictionary of everything necessary for + `_forward` to run properly. It should contain at least one tensor, but might have arbitrary other items. + """ + raise NotImplementedError("preprocess not implemented") + + @abstractmethod + def _forward(self, input_tensors: Dict[str, GenericTensor], **forward_parameters: Dict) -> ModelOutput: + """ + _forward will receive the prepared dictionary from `preprocess` and run it on the model. This method might + involve the GPU or the CPU and should be agnostic to it. Isolating this function is the reason for `preprocess` + and `postprocess` to exist, so that the hot path, this method generally can run as fast as possible. + + It is not meant to be called directly, `forward` is preferred. It is basically the same but contains additional + code surrounding `_forward` making sure tensors and models are on the same device, disabling the training part + of the code (leading to faster inference). 
        """
        raise NotImplementedError("_forward not implemented")

    @abstractmethod
    def postprocess(self, model_outputs: ModelOutput, **postprocess_parameters: Dict) -> Any:
        """
        Postprocess will receive the raw outputs of the `_forward` method, generally tensors, and reformat them into
        something more friendly. Generally it will output a list or a dict or results (containing just strings and
        numbers).
        """
        raise NotImplementedError("postprocess not implemented")

    def get_inference_context(self):
        # Inference runs without autograd bookkeeping; subclasses may override
        # to supply a different context manager.
        return torch.no_grad

    def forward(self, model_inputs, **forward_params):
        # Framework-dispatching wrapper around `_forward`: for PyTorch it moves
        # the inputs to the pipeline device, runs the model inside the inference
        # context, then moves outputs back to CPU so postprocessing is
        # device-agnostic. For TensorFlow it only forces `training=False`.
        with self.device_placement():
            if self.framework == "tf":
                model_inputs["training"] = False
                model_outputs = self._forward(model_inputs, **forward_params)
            elif self.framework == "pt":
                inference_context = self.get_inference_context()
                with inference_context():
                    model_inputs = self._ensure_tensor_on_device(model_inputs, device=self.device)
                    model_outputs = self._forward(model_inputs, **forward_params)
                    # Bring results off the accelerator before returning.
                    model_outputs = self._ensure_tensor_on_device(model_outputs, device=torch.device("cpu"))
            else:
                raise ValueError(f"Framework {self.framework} is not supported")
        return model_outputs

    def get_iterator(
        self, inputs, num_workers: int, batch_size: int, preprocess_params, forward_params, postprocess_params
    ):
        # Build the lazy preprocess -> forward -> postprocess chain backed by a
        # torch DataLoader: sized inputs get a map-style dataset, anything else
        # an iterable one (which cannot safely use multiple workers).
        if isinstance(inputs, collections.abc.Sized):
            dataset = PipelineDataset(inputs, self.preprocess, preprocess_params)
        else:
            if num_workers > 1:
                logger.warning(
                    "For iterable dataset using num_workers>1 is likely to result"
                    " in errors since everything is iterable, setting `num_workers=1`"
                    " to guarantee correctness."
                )
                num_workers = 1
            dataset = PipelineIterator(inputs, self.preprocess, preprocess_params)
        if "TOKENIZERS_PARALLELISM" not in os.environ:
            logger.info("Disabling tokenizer parallelism, we're using DataLoader multithreading already")
            os.environ["TOKENIZERS_PARALLELISM"] = "false"
        # TODO hack by collating feature_extractor and image_processor
        feature_extractor = self.feature_extractor if self.feature_extractor is not None else self.image_processor
        collate_fn = no_collate_fn if batch_size == 1 else pad_collate_fn(self.tokenizer, feature_extractor)
        dataloader = DataLoader(dataset, num_workers=num_workers, batch_size=batch_size, collate_fn=collate_fn)
        model_iterator = PipelineIterator(dataloader, self.forward, forward_params, loader_batch_size=batch_size)
        final_iterator = PipelineIterator(model_iterator, self.postprocess, postprocess_params)
        return final_iterator

    def __call__(self, inputs, *args, num_workers=None, batch_size=None, **kwargs):
        if args:
            logger.warning(f"Ignoring args : {args}")

        # Per-call overrides fall back to the values chosen at pipeline
        # construction time, then to the defaults (0 workers, batch size 1).
        if num_workers is None:
            if self._num_workers is None:
                num_workers = 0
            else:
                num_workers = self._num_workers
        if batch_size is None:
            if self._batch_size is None:
                batch_size = 1
            else:
                batch_size = self._batch_size

        preprocess_params, forward_params, postprocess_params = self._sanitize_parameters(**kwargs)

        # Fuse __init__ params and __call__ params without modifying the __init__ ones.
        preprocess_params = {**self._preprocess_params, **preprocess_params}
        forward_params = {**self._forward_params, **forward_params}
        postprocess_params = {**self._postprocess_params, **postprocess_params}

        self.call_count += 1
        if self.call_count > 10 and self.framework == "pt" and self.device.type == "cuda":
            logger.warning_once(
                "You seem to be using the pipelines sequentially on GPU.
 In order to maximize efficiency please use a"
                " dataset",
            )

        # Classify the input container to pick the execution strategy below.
        is_dataset = Dataset is not None and isinstance(inputs, Dataset)
        is_generator = isinstance(inputs, types.GeneratorType)
        is_list = isinstance(inputs, list)

        is_iterable = is_dataset or is_generator or is_list

        # TODO make the get_iterator work also for `tf` (and `flax`).
        can_use_iterator = self.framework == "pt" and (is_dataset or is_generator or is_list)

        # Lists are materialized eagerly; other iterables stay lazy; a single
        # input on a ChunkPipeline still goes through the iterator machinery so
        # that batching across chunks works.
        if is_list:
            if can_use_iterator:
                final_iterator = self.get_iterator(
                    inputs, num_workers, batch_size, preprocess_params, forward_params, postprocess_params
                )
                outputs = list(final_iterator)
                return outputs
            else:
                return self.run_multi(inputs, preprocess_params, forward_params, postprocess_params)
        elif can_use_iterator:
            return self.get_iterator(
                inputs, num_workers, batch_size, preprocess_params, forward_params, postprocess_params
            )
        elif is_iterable:
            return self.iterate(inputs, preprocess_params, forward_params, postprocess_params)
        elif self.framework == "pt" and isinstance(self, ChunkPipeline):
            return next(
                iter(
                    self.get_iterator(
                        [inputs], num_workers, batch_size, preprocess_params, forward_params, postprocess_params
                    )
                )
            )
        else:
            return self.run_single(inputs, preprocess_params, forward_params, postprocess_params)

    def run_multi(self, inputs, preprocess_params, forward_params, postprocess_params):
        # Sequential fallback when the DataLoader-based iterator is unavailable.
        return [self.run_single(item, preprocess_params, forward_params, postprocess_params) for item in inputs]

    def run_single(self, inputs, preprocess_params, forward_params, postprocess_params):
        # The canonical three-stage execution for one input.
        model_inputs = self.preprocess(inputs, **preprocess_params)
        model_outputs = self.forward(model_inputs, **forward_params)
        outputs = self.postprocess(model_outputs, **postprocess_params)
        return outputs

    def iterate(self, inputs, preprocess_params, forward_params, postprocess_params):
        # This function should become `get_iterator` again, this is a temporary
        # easy solution.
        for input_ in inputs:
            yield self.run_single(input_, preprocess_params, forward_params, postprocess_params)


# Rebind push_to_hub with pipeline-specific wording in its docstring.
Pipeline.push_to_hub = copy_func(Pipeline.push_to_hub)
if Pipeline.push_to_hub.__doc__ is not None:
    Pipeline.push_to_hub.__doc__ = Pipeline.push_to_hub.__doc__.format(
        object="pipe", object_class="pipeline", object_files="pipeline file"
    ).replace(".from_pretrained", "")


class ChunkPipeline(Pipeline):
    # Pipeline variant whose `preprocess` yields several chunks per input; all
    # chunk outputs are accumulated before a single `postprocess` call.
    def run_single(self, inputs, preprocess_params, forward_params, postprocess_params):
        all_outputs = []
        for model_inputs in self.preprocess(inputs, **preprocess_params):
            model_outputs = self.forward(model_inputs, **forward_params)
            all_outputs.append(model_outputs)
        outputs = self.postprocess(all_outputs, **postprocess_params)
        return outputs

    def get_iterator(
        self, inputs, num_workers: int, batch_size: int, preprocess_params, forward_params, postprocess_params
    ):
        if "TOKENIZERS_PARALLELISM" not in os.environ:
            logger.info("Disabling tokenizer parallelism, we're using DataLoader multithreading already")
            os.environ["TOKENIZERS_PARALLELISM"] = "false"
        if num_workers > 1:
            logger.warning(
                "For ChunkPipeline using num_workers>0 is likely to result in errors since everything is iterable,"
                " setting `num_workers=1` to guarantee correctness."
+ ) + num_workers = 1 + dataset = PipelineChunkIterator(inputs, self.preprocess, preprocess_params) + + # TODO hack by collating feature_extractor and image_processor + feature_extractor = self.feature_extractor if self.feature_extractor is not None else self.image_processor + collate_fn = no_collate_fn if batch_size == 1 else pad_collate_fn(self.tokenizer, feature_extractor) + dataloader = DataLoader(dataset, num_workers=num_workers, batch_size=batch_size, collate_fn=collate_fn) + model_iterator = PipelinePackIterator(dataloader, self.forward, forward_params, loader_batch_size=batch_size) + final_iterator = PipelineIterator(model_iterator, self.postprocess, postprocess_params) + return final_iterator + + +class PipelineRegistry: + def __init__(self, supported_tasks: Dict[str, Any], task_aliases: Dict[str, str]) -> None: + self.supported_tasks = supported_tasks + self.task_aliases = task_aliases + + def get_supported_tasks(self) -> List[str]: + supported_task = list(self.supported_tasks.keys()) + list(self.task_aliases.keys()) + supported_task.sort() + return supported_task + + def check_task(self, task: str) -> Tuple[str, Dict, Any]: + if task in self.task_aliases: + task = self.task_aliases[task] + if task in self.supported_tasks: + targeted_task = self.supported_tasks[task] + return task, targeted_task, None + + if task.startswith("translation"): + tokens = task.split("_") + if len(tokens) == 4 and tokens[0] == "translation" and tokens[2] == "to": + targeted_task = self.supported_tasks["translation"] + task = "translation" + return task, targeted_task, (tokens[1], tokens[3]) + raise KeyError(f"Invalid translation task {task}, use 'translation_XX_to_YY' format") + + raise KeyError( + f"Unknown task {task}, available tasks are {self.get_supported_tasks() + ['translation_XX_to_YY']}" + ) + + def register_pipeline( + self, + task: str, + pipeline_class: type, + pt_model: Optional[Union[type, Tuple[type]]] = None, + tf_model: Optional[Union[type, Tuple[type]]] = 
None, + default: Optional[Dict] = None, + type: Optional[str] = None, + ) -> None: + if task in self.supported_tasks: + logger.warning(f"{task} is already registered. Overwriting pipeline for task {task}...") + + if pt_model is None: + pt_model = () + elif not isinstance(pt_model, tuple): + pt_model = (pt_model,) + + if tf_model is None: + tf_model = () + elif not isinstance(tf_model, tuple): + tf_model = (tf_model,) + + task_impl = {"impl": pipeline_class, "pt": pt_model, "tf": tf_model} + + if default is not None: + if "model" not in default and ("pt" in default or "tf" in default): + default = {"model": default} + task_impl["default"] = default + + if type is not None: + task_impl["type"] = type + + self.supported_tasks[task] = task_impl + pipeline_class._registered_impl = {task: task_impl} + + def to_dict(self): + return self.supported_tasks diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/depth_estimation.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/depth_estimation.py new file mode 100644 index 0000000000000000000000000000000000000000..2203ac09c9cf9b6e9a51055c60678f5266ddd439 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/depth_estimation.py @@ -0,0 +1,133 @@ +from typing import List, Union + +from ..utils import ( + add_end_docstrings, + is_torch_available, + is_vision_available, + logging, + requires_backends, +) +from .base import Pipeline, build_pipeline_init_args + + +if is_vision_available(): + from PIL import Image + + from ..image_utils import load_image + +if is_torch_available(): + from ..models.auto.modeling_auto import MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES + +logger = logging.get_logger(__name__) + + +@add_end_docstrings(build_pipeline_init_args(has_image_processor=True)) +class DepthEstimationPipeline(Pipeline): + """ + Depth estimation pipeline using any `AutoModelForDepthEstimation`. This pipeline predicts the depth of an image. 
+ + Example: + + ```python + >>> from transformers import pipeline + + >>> depth_estimator = pipeline(task="depth-estimation", model="LiheYoung/depth-anything-base-hf") + >>> output = depth_estimator("http://images.cocodataset.org/val2017/000000039769.jpg") + >>> # This is a tensor with the values being the depth expressed in meters for each pixel + >>> output["predicted_depth"].shape + torch.Size([1, 384, 384]) + ``` + + Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial) + + + This depth estimation pipeline can currently be loaded from [`pipeline`] using the following task identifier: + `"depth-estimation"`. + + See the list of available models on [huggingface.co/models](https://huggingface.co/models?filter=depth-estimation). + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + requires_backends(self, "vision") + self.check_model_type(MODEL_FOR_DEPTH_ESTIMATION_MAPPING_NAMES) + + def __call__(self, inputs: Union[str, List[str], "Image.Image", List["Image.Image"]] = None, **kwargs): + """ + Predict the depth(s) of the image(s) passed as inputs. + + Args: + inputs (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`): + The pipeline handles three types of images: + + - A string containing a http link pointing to an image + - A string containing a local path to an image + - An image loaded in PIL directly + + The pipeline accepts either a single image or a batch of images, which must then be passed as a string. + Images in a batch must all be in the same format: all as http links, all as local paths, or all as PIL + images. + parameters (`Dict`, *optional*): + A dictionary of argument names to parameter values, to control pipeline behaviour. + The only parameter available right now is `timeout`, which is the length of time, in seconds, + that the pipeline should wait before giving up on trying to download an image. 
+ timeout (`float`, *optional*, defaults to None): + The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and + the call may block forever. + + Return: + A dictionary or a list of dictionaries containing result. If the input is a single image, will return a + dictionary, if the input is a list of several images, will return a list of dictionaries corresponding to + the images. + + The dictionaries contain the following keys: + + - **predicted_depth** (`torch.Tensor`) -- The predicted depth by the model as a `torch.Tensor`. + - **depth** (`PIL.Image`) -- The predicted depth by the model as a `PIL.Image`. + """ + # After deprecation of this is completed, remove the default `None` value for `images` + if "images" in kwargs: + inputs = kwargs.pop("images") + if inputs is None: + raise ValueError("Cannot call the depth-estimation pipeline without an inputs argument!") + return super().__call__(inputs, **kwargs) + + def _sanitize_parameters(self, timeout=None, parameters=None, **kwargs): + preprocess_params = {} + if timeout is not None: + preprocess_params["timeout"] = timeout + if isinstance(parameters, dict) and "timeout" in parameters: + preprocess_params["timeout"] = parameters["timeout"] + return preprocess_params, {}, {} + + def preprocess(self, image, timeout=None): + image = load_image(image, timeout) + model_inputs = self.image_processor(images=image, return_tensors=self.framework) + if self.framework == "pt": + model_inputs = model_inputs.to(self.torch_dtype) + model_inputs["target_size"] = image.size[::-1] + return model_inputs + + def _forward(self, model_inputs): + target_size = model_inputs.pop("target_size") + model_outputs = self.model(**model_inputs) + model_outputs["target_size"] = target_size + return model_outputs + + def postprocess(self, model_outputs): + outputs = self.image_processor.post_process_depth_estimation( + model_outputs, + # this acts as `source_sizes` for ZoeDepth and as `target_sizes` for 
the rest of the models so do *not* + # replace with `target_sizes = [model_outputs["target_size"]]` + [model_outputs["target_size"]], + ) + + formatted_outputs = [] + for output in outputs: + depth = output["predicted_depth"].detach().cpu().numpy() + depth = (depth - depth.min()) / (depth.max() - depth.min()) + depth = Image.fromarray((depth * 255).astype("uint8")) + + formatted_outputs.append({"predicted_depth": output["predicted_depth"], "depth": depth}) + + return formatted_outputs[0] if len(outputs) == 1 else formatted_outputs diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/document_question_answering.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/document_question_answering.py new file mode 100644 index 0000000000000000000000000000000000000000..c176d841e29fa6c6bb8c6867562f985d181c7138 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/document_question_answering.py @@ -0,0 +1,516 @@ +# Copyright 2022 The Impira Team and the HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import re +from typing import List, Optional, Tuple, Union + +import numpy as np + +from ..utils import ( + ExplicitEnum, + add_end_docstrings, + is_pytesseract_available, + is_torch_available, + is_vision_available, + logging, +) +from .base import ChunkPipeline, build_pipeline_init_args +from .question_answering import select_starts_ends + + +if is_vision_available(): + from PIL import Image + + from ..image_utils import load_image + +if is_torch_available(): + import torch + + from ..models.auto.modeling_auto import MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES + +TESSERACT_LOADED = False +if is_pytesseract_available(): + TESSERACT_LOADED = True + import pytesseract + +logger = logging.get_logger(__name__) + + +# normalize_bbox() and apply_tesseract() are derived from apply_tesseract in models/layoutlmv3/feature_extraction_layoutlmv3.py. +# However, because the pipeline may evolve from what layoutlmv3 currently does, it's copied (vs. imported) to avoid creating an +# unnecessary dependency. 
+def normalize_box(box, width, height): + return [ + int(1000 * (box[0] / width)), + int(1000 * (box[1] / height)), + int(1000 * (box[2] / width)), + int(1000 * (box[3] / height)), + ] + + +def apply_tesseract(image: "Image.Image", lang: Optional[str], tesseract_config: Optional[str]): + """Applies Tesseract OCR on a document image, and returns recognized words + normalized bounding boxes.""" + # apply OCR + data = pytesseract.image_to_data(image, lang=lang, output_type="dict", config=tesseract_config) + words, left, top, width, height = data["text"], data["left"], data["top"], data["width"], data["height"] + + # filter empty words and corresponding coordinates + irrelevant_indices = [idx for idx, word in enumerate(words) if not word.strip()] + words = [word for idx, word in enumerate(words) if idx not in irrelevant_indices] + left = [coord for idx, coord in enumerate(left) if idx not in irrelevant_indices] + top = [coord for idx, coord in enumerate(top) if idx not in irrelevant_indices] + width = [coord for idx, coord in enumerate(width) if idx not in irrelevant_indices] + height = [coord for idx, coord in enumerate(height) if idx not in irrelevant_indices] + + # turn coordinates into (left, top, left+width, top+height) format + actual_boxes = [] + for x, y, w, h in zip(left, top, width, height): + actual_box = [x, y, x + w, y + h] + actual_boxes.append(actual_box) + + image_width, image_height = image.size + + # finally, normalize the bounding boxes + normalized_boxes = [] + for box in actual_boxes: + normalized_boxes.append(normalize_box(box, image_width, image_height)) + + if len(words) != len(normalized_boxes): + raise ValueError("Not as many words as there are bounding boxes") + + return words, normalized_boxes + + +class ModelType(ExplicitEnum): + LayoutLM = "layoutlm" + LayoutLMv2andv3 = "layoutlmv2andv3" + VisionEncoderDecoder = "vision_encoder_decoder" + + +@add_end_docstrings(build_pipeline_init_args(has_image_processor=True, has_tokenizer=True)) +class 
DocumentQuestionAnsweringPipeline(ChunkPipeline): + # TODO: Update task_summary docs to include an example with document QA and then update the first sentence + """ + Document Question Answering pipeline using any `AutoModelForDocumentQuestionAnswering`. The inputs/outputs are + similar to the (extractive) question answering pipeline; however, the pipeline takes an image (and optional OCR'd + words/boxes) as input instead of text context. + + Example: + + ```python + >>> from transformers import pipeline + + >>> document_qa = pipeline(model="impira/layoutlm-document-qa") + >>> document_qa( + ... image="https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/invoice.png", + ... question="What is the invoice number?", + ... ) + [{'score': 0.425, 'answer': 'us-001', 'start': 16, 'end': 16}] + ``` + + Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial) + + This document question answering pipeline can currently be loaded from [`pipeline`] using the following task + identifier: `"document-question-answering"`. + + The models that this pipeline can use are models that have been fine-tuned on a document question answering task. + See the up-to-date list of available models on + [huggingface.co/models](https://huggingface.co/models?filter=document-question-answering). + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + if self.tokenizer is not None and not self.tokenizer.__class__.__name__.endswith("Fast"): + raise ValueError( + "`DocumentQuestionAnsweringPipeline` requires a fast tokenizer, but a slow tokenizer " + f"(`{self.tokenizer.__class__.__name__}`) is provided." 
+ ) + + if self.model.config.__class__.__name__ == "VisionEncoderDecoderConfig": + self.model_type = ModelType.VisionEncoderDecoder + if self.model.config.encoder.model_type != "donut-swin": + raise ValueError("Currently, the only supported VisionEncoderDecoder model is Donut") + else: + self.check_model_type(MODEL_FOR_DOCUMENT_QUESTION_ANSWERING_MAPPING_NAMES) + if self.model.config.__class__.__name__ == "LayoutLMConfig": + self.model_type = ModelType.LayoutLM + else: + self.model_type = ModelType.LayoutLMv2andv3 + + def _sanitize_parameters( + self, + padding=None, + doc_stride=None, + max_question_len=None, + lang: Optional[str] = None, + tesseract_config: Optional[str] = None, + max_answer_len=None, + max_seq_len=None, + top_k=None, + handle_impossible_answer=None, + timeout=None, + **kwargs, + ): + preprocess_params, postprocess_params = {}, {} + if padding is not None: + preprocess_params["padding"] = padding + if doc_stride is not None: + preprocess_params["doc_stride"] = doc_stride + if max_question_len is not None: + preprocess_params["max_question_len"] = max_question_len + if max_seq_len is not None: + preprocess_params["max_seq_len"] = max_seq_len + if lang is not None: + preprocess_params["lang"] = lang + if tesseract_config is not None: + preprocess_params["tesseract_config"] = tesseract_config + if timeout is not None: + preprocess_params["timeout"] = timeout + + if top_k is not None: + if top_k < 1: + raise ValueError(f"top_k parameter should be >= 1 (got {top_k})") + postprocess_params["top_k"] = top_k + if max_answer_len is not None: + if max_answer_len < 1: + raise ValueError(f"max_answer_len parameter should be >= 1 (got {max_answer_len}") + postprocess_params["max_answer_len"] = max_answer_len + if handle_impossible_answer is not None: + postprocess_params["handle_impossible_answer"] = handle_impossible_answer + + forward_params = {} + if self.assistant_model is not None: + forward_params["assistant_model"] = self.assistant_model + if 
self.assistant_tokenizer is not None: + forward_params["tokenizer"] = self.tokenizer + forward_params["assistant_tokenizer"] = self.assistant_tokenizer + + return preprocess_params, forward_params, postprocess_params + + def __call__( + self, + image: Union["Image.Image", str], + question: Optional[str] = None, + word_boxes: Tuple[str, List[float]] = None, + **kwargs, + ): + """ + Answer the question(s) given as inputs by using the document(s). A document is defined as an image and an + optional list of (word, box) tuples which represent the text in the document. If the `word_boxes` are not + provided, it will use the Tesseract OCR engine (if available) to extract the words and boxes automatically for + LayoutLM-like models which require them as input. For Donut, no OCR is run. + + You can invoke the pipeline several ways: + + - `pipeline(image=image, question=question)` + - `pipeline(image=image, question=question, word_boxes=word_boxes)` + - `pipeline([{"image": image, "question": question}])` + - `pipeline([{"image": image, "question": question, "word_boxes": word_boxes}])` + + Args: + image (`str` or `PIL.Image`): + The pipeline handles three types of images: + + - A string containing a http link pointing to an image + - A string containing a local path to an image + - An image loaded in PIL directly + + The pipeline accepts either a single image or a batch of images. If given a single image, it can be + broadcasted to multiple questions. + question (`str`): + A question to ask of the document. + word_boxes (`List[str, Tuple[float, float, float, float]]`, *optional*): + A list of words and bounding boxes (normalized 0->1000). If you provide this optional input, then the + pipeline will use these words and boxes instead of running OCR on the image to derive them for models + that need them (e.g. LayoutLM). This allows you to reuse OCR'd results across many invocations of the + pipeline without having to re-run it each time. 
+ top_k (`int`, *optional*, defaults to 1): + The number of answers to return (will be chosen by order of likelihood). Note that we return less than + top_k answers if there are not enough options available within the context. + doc_stride (`int`, *optional*, defaults to 128): + If the words in the document are too long to fit with the question for the model, it will be split in + several chunks with some overlap. This argument controls the size of that overlap. + max_answer_len (`int`, *optional*, defaults to 15): + The maximum length of predicted answers (e.g., only answers with a shorter length are considered). + max_seq_len (`int`, *optional*, defaults to 384): + The maximum length of the total sentence (context + question) in tokens of each chunk passed to the + model. The context will be split in several chunks (using `doc_stride` as overlap) if needed. + max_question_len (`int`, *optional*, defaults to 64): + The maximum length of the question after tokenization. It will be truncated if needed. + handle_impossible_answer (`bool`, *optional*, defaults to `False`): + Whether or not we accept impossible as an answer. + lang (`str`, *optional*): + Language to use while running OCR. Defaults to english. + tesseract_config (`str`, *optional*): + Additional flags to pass to tesseract while running OCR. + timeout (`float`, *optional*, defaults to None): + The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and + the call may block forever. + + Return: + A `dict` or a list of `dict`: Each result comes as a dictionary with the following keys: + + - **score** (`float`) -- The probability associated to the answer. + - **start** (`int`) -- The start word index of the answer (in the OCR'd version of the input or provided + `word_boxes`). + - **end** (`int`) -- The end word index of the answer (in the OCR'd version of the input or provided + `word_boxes`). + - **answer** (`str`) -- The answer to the question. 
+ - **words** (`list[int]`) -- The index of each word/box pair that is in the answer + """ + if isinstance(question, str): + inputs = {"question": question, "image": image} + if word_boxes is not None: + inputs["word_boxes"] = word_boxes + else: + inputs = image + return super().__call__(inputs, **kwargs) + + def preprocess( + self, + input, + padding="do_not_pad", + doc_stride=None, + max_seq_len=None, + word_boxes: Tuple[str, List[float]] = None, + lang=None, + tesseract_config="", + timeout=None, + ): + # NOTE: This code mirrors the code in question answering and will be implemented in a follow up PR + # to support documents with enough tokens that overflow the model's window + if max_seq_len is None: + max_seq_len = self.tokenizer.model_max_length + + if doc_stride is None: + doc_stride = min(max_seq_len // 2, 256) + + image = None + image_features = {} + if input.get("image", None) is not None: + image = load_image(input["image"], timeout=timeout) + if self.image_processor is not None: + image_inputs = self.image_processor(images=image, return_tensors=self.framework) + if self.framework == "pt": + image_inputs = image_inputs.to(self.torch_dtype) + image_features.update(image_inputs) + elif self.feature_extractor is not None: + image_features.update(self.feature_extractor(images=image, return_tensors=self.framework)) + elif self.model_type == ModelType.VisionEncoderDecoder: + raise ValueError("If you are using a VisionEncoderDecoderModel, you must provide a feature extractor") + + words, boxes = None, None + if not self.model_type == ModelType.VisionEncoderDecoder: + if "word_boxes" in input: + words = [x[0] for x in input["word_boxes"]] + boxes = [x[1] for x in input["word_boxes"]] + elif "words" in image_features and "boxes" in image_features: + words = image_features.pop("words")[0] + boxes = image_features.pop("boxes")[0] + elif image is not None: + if not TESSERACT_LOADED: + raise ValueError( + "If you provide an image without word_boxes, then the pipeline 
will run OCR using Tesseract," + " but pytesseract is not available" + ) + if TESSERACT_LOADED: + words, boxes = apply_tesseract(image, lang=lang, tesseract_config=tesseract_config) + else: + raise ValueError( + "You must provide an image or word_boxes. If you provide an image, the pipeline will automatically" + " run OCR to derive words and boxes" + ) + + if self.tokenizer.padding_side != "right": + raise ValueError( + "Document question answering only supports tokenizers whose padding side is 'right', not" + f" {self.tokenizer.padding_side}" + ) + + if self.model_type == ModelType.VisionEncoderDecoder: + task_prompt = f'{input["question"]}' + # Adapted from https://huggingface.co/spaces/nielsr/donut-docvqa/blob/main/app.py + encoding = { + "inputs": image_features["pixel_values"], + "decoder_input_ids": self.tokenizer( + task_prompt, add_special_tokens=False, return_tensors=self.framework + ).input_ids, + "return_dict_in_generate": True, + } + yield { + **encoding, + "p_mask": None, + "word_ids": None, + "words": None, + "output_attentions": True, + "is_last": True, + } + else: + tokenizer_kwargs = {} + if self.model_type == ModelType.LayoutLM: + tokenizer_kwargs["text"] = input["question"].split() + tokenizer_kwargs["text_pair"] = words + tokenizer_kwargs["is_split_into_words"] = True + else: + tokenizer_kwargs["text"] = [input["question"]] + tokenizer_kwargs["text_pair"] = [words] + tokenizer_kwargs["boxes"] = [boxes] + + encoding = self.tokenizer( + padding=padding, + max_length=max_seq_len, + stride=doc_stride, + return_token_type_ids=True, + truncation="only_second", + return_overflowing_tokens=True, + **tokenizer_kwargs, + ) + # TODO: check why slower `LayoutLMTokenizer` and `LayoutLMv2Tokenizer` don't have this key in outputs + # FIXME: ydshieh and/or Narsil + encoding.pop("overflow_to_sample_mapping", None) # We do not use this + + num_spans = len(encoding["input_ids"]) + + # p_mask: mask with 1 for token than cannot be in the answer (0 for token which 
can be in an answer) + # We put 0 on the tokens from the context and 1 everywhere else (question and special tokens) + # This logic mirrors the logic in the question_answering pipeline + p_mask = np.array([[tok != 1 for tok in encoding.sequence_ids(span_id)] for span_id in range(num_spans)]) + for span_idx in range(num_spans): + if self.framework == "pt": + span_encoding = {k: torch.tensor(v[span_idx : span_idx + 1]) for (k, v) in encoding.items()} + if "pixel_values" in image_features: + span_encoding["image"] = image_features["pixel_values"] + else: + raise ValueError("Unsupported: Tensorflow preprocessing for DocumentQuestionAnsweringPipeline") + + input_ids_span_idx = encoding["input_ids"][span_idx] + # keep the cls_token unmasked (some models use it to indicate unanswerable questions) + if self.tokenizer.cls_token_id is not None: + cls_indices = np.nonzero(np.array(input_ids_span_idx) == self.tokenizer.cls_token_id)[0] + for cls_index in cls_indices: + p_mask[span_idx][cls_index] = 0 + + # For each span, place a bounding box [0,0,0,0] for question and CLS tokens, [1000,1000,1000,1000] + # for SEP tokens, and the word's bounding box for words in the original document. 
+ if "boxes" not in tokenizer_kwargs: + bbox = [] + for input_id, sequence_id, word_id in zip( + encoding.input_ids[span_idx], + encoding.sequence_ids(span_idx), + encoding.word_ids(span_idx), + ): + if sequence_id == 1: + bbox.append(boxes[word_id]) + elif input_id == self.tokenizer.sep_token_id: + bbox.append([1000] * 4) + else: + bbox.append([0] * 4) + + if self.framework == "pt": + span_encoding["bbox"] = torch.tensor(bbox).unsqueeze(0) + elif self.framework == "tf": + raise ValueError("Unsupported: Tensorflow preprocessing for DocumentQuestionAnsweringPipeline") + yield { + **span_encoding, + "p_mask": p_mask[span_idx], + "word_ids": encoding.word_ids(span_idx), + "words": words, + "is_last": span_idx == num_spans - 1, + } + + def _forward(self, model_inputs, **generate_kwargs): + p_mask = model_inputs.pop("p_mask", None) + word_ids = model_inputs.pop("word_ids", None) + words = model_inputs.pop("words", None) + is_last = model_inputs.pop("is_last", False) + + if self.model_type == ModelType.VisionEncoderDecoder: + # User-defined `generation_config` passed to the pipeline call take precedence + if "generation_config" not in generate_kwargs: + generate_kwargs["generation_config"] = self.generation_config + + model_outputs = self.model.generate(**model_inputs, **generate_kwargs) + else: + model_outputs = self.model(**model_inputs) + + model_outputs = dict(model_outputs.items()) + model_outputs["p_mask"] = p_mask + model_outputs["word_ids"] = word_ids + model_outputs["words"] = words + model_outputs["attention_mask"] = model_inputs.get("attention_mask", None) + model_outputs["is_last"] = is_last + return model_outputs + + def postprocess(self, model_outputs, top_k=1, **kwargs): + if self.model_type == ModelType.VisionEncoderDecoder: + answers = [self.postprocess_encoder_decoder_single(o) for o in model_outputs] + else: + answers = self.postprocess_extractive_qa(model_outputs, top_k=top_k, **kwargs) + + answers = sorted(answers, key=lambda x: x.get("score", 0), 
reverse=True)[:top_k] + return answers + + def postprocess_encoder_decoder_single(self, model_outputs, **kwargs): + sequence = self.tokenizer.batch_decode(model_outputs["sequences"])[0] + + # TODO: A lot of this logic is specific to Donut and should probably be handled in the tokenizer + # (see https://github.com/huggingface/transformers/pull/18414/files#r961747408 for more context). + sequence = sequence.replace(self.tokenizer.eos_token, "").replace(self.tokenizer.pad_token, "") + sequence = re.sub(r"<.*?>", "", sequence, count=1).strip() # remove first task start token + ret = { + "answer": None, + } + + answer = re.search(r"(.*)", sequence) + if answer is not None: + ret["answer"] = answer.group(1).strip() + return ret + + def postprocess_extractive_qa( + self, model_outputs, top_k=1, handle_impossible_answer=False, max_answer_len=15, **kwargs + ): + min_null_score = 1000000 # large and positive + answers = [] + for output in model_outputs: + words = output["words"] + + starts, ends, scores, min_null_score = select_starts_ends( + start=output["start_logits"], + end=output["end_logits"], + p_mask=output["p_mask"], + attention_mask=output["attention_mask"].numpy() + if output.get("attention_mask", None) is not None + else None, + min_null_score=min_null_score, + top_k=top_k, + handle_impossible_answer=handle_impossible_answer, + max_answer_len=max_answer_len, + ) + word_ids = output["word_ids"] + for start, end, score in zip(starts, ends, scores): + word_start, word_end = word_ids[start], word_ids[end] + if word_start is not None and word_end is not None: + answers.append( + { + "score": float(score), + "answer": " ".join(words[word_start : word_end + 1]), + "start": word_start, + "end": word_end, + } + ) + + if handle_impossible_answer: + answers.append({"score": min_null_score, "answer": "", "start": 0, "end": 0}) + + return answers diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/feature_extraction.py 
b/.venv/lib/python3.11/site-packages/transformers/pipelines/feature_extraction.py new file mode 100644 index 0000000000000000000000000000000000000000..7d67a615ac02d29625f51242e1f747b39e6118bd --- /dev/null +++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/feature_extraction.py @@ -0,0 +1,86 @@ +from typing import Dict + +from ..utils import add_end_docstrings +from .base import GenericTensor, Pipeline, build_pipeline_init_args + + +@add_end_docstrings( + build_pipeline_init_args(has_tokenizer=True, supports_binary_output=False), + r""" + tokenize_kwargs (`dict`, *optional*): + Additional dictionary of keyword arguments passed along to the tokenizer. + return_tensors (`bool`, *optional*): + If `True`, returns a tensor according to the specified framework, otherwise returns a list.""", +) +class FeatureExtractionPipeline(Pipeline): + """ + Feature extraction pipeline uses no model head. This pipeline extracts the hidden states from the base + transformer, which can be used as features in downstream tasks. + + Example: + + ```python + >>> from transformers import pipeline + + >>> extractor = pipeline(model="google-bert/bert-base-uncased", task="feature-extraction") + >>> result = extractor("This is a simple test.", return_tensors=True) + >>> result.shape # This is a tensor of shape [1, sequence_length, hidden_dimension] representing the input string. + torch.Size([1, 8, 768]) + ``` + + Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial) + + This feature extraction pipeline can currently be loaded from [`pipeline`] using the task identifier: + `"feature-extraction"`. + + All models may be used for this pipeline. See a list of all models, including community-contributed models on + [huggingface.co/models](https://huggingface.co/models). 
+ """ + + def _sanitize_parameters(self, truncation=None, tokenize_kwargs=None, return_tensors=None, **kwargs): + if tokenize_kwargs is None: + tokenize_kwargs = {} + + if truncation is not None: + if "truncation" in tokenize_kwargs: + raise ValueError( + "truncation parameter defined twice (given as keyword argument as well as in tokenize_kwargs)" + ) + tokenize_kwargs["truncation"] = truncation + + preprocess_params = tokenize_kwargs + + postprocess_params = {} + if return_tensors is not None: + postprocess_params["return_tensors"] = return_tensors + + return preprocess_params, {}, postprocess_params + + def preprocess(self, inputs, **tokenize_kwargs) -> Dict[str, GenericTensor]: + model_inputs = self.tokenizer(inputs, return_tensors=self.framework, **tokenize_kwargs) + return model_inputs + + def _forward(self, model_inputs): + model_outputs = self.model(**model_inputs) + return model_outputs + + def postprocess(self, model_outputs, return_tensors=False): + # [0] is the first available tensor, logits or last_hidden_state. + if return_tensors: + return model_outputs[0] + if self.framework == "pt": + return model_outputs[0].tolist() + elif self.framework == "tf": + return model_outputs[0].numpy().tolist() + + def __call__(self, *args, **kwargs): + """ + Extract the features of the input(s). + + Args: + args (`str` or `List[str]`): One or several texts (or one list of texts) to get the features of. + + Return: + A nested list of `float`: The features computed by the model. 
+ """ + return super().__call__(*args, **kwargs) diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/fill_mask.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/fill_mask.py new file mode 100644 index 0000000000000000000000000000000000000000..c14f54118486b971f64b0985fe2dc688de52f863 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/fill_mask.py @@ -0,0 +1,273 @@ +from typing import Dict + +import numpy as np + +from ..utils import add_end_docstrings, is_tf_available, is_torch_available, logging +from .base import GenericTensor, Pipeline, PipelineException, build_pipeline_init_args + + +if is_tf_available(): + import tensorflow as tf + + from ..tf_utils import stable_softmax + + +if is_torch_available(): + import torch + + +logger = logging.get_logger(__name__) + + +@add_end_docstrings( + build_pipeline_init_args(has_tokenizer=True), + r""" + top_k (`int`, *optional*, defaults to 5): + The number of predictions to return. + targets (`str` or `List[str]`, *optional*): + When passed, the model will limit the scores to the passed targets instead of looking up in the whole + vocab. If the provided targets are not in the model vocab, they will be tokenized and the first resulting + token will be used (with a warning, and that might be slower). + tokenizer_kwargs (`dict`, *optional*): + Additional dictionary of keyword arguments passed along to the tokenizer.""", +) +class FillMaskPipeline(Pipeline): + """ + Masked language modeling prediction pipeline using any `ModelWithLMHead`. See the [masked language modeling + examples](../task_summary#masked-language-modeling) for more information. 
+ + Example: + + ```python + >>> from transformers import pipeline + + >>> fill_masker = pipeline(model="google-bert/bert-base-uncased") + >>> fill_masker("This is a simple [MASK].") + [{'score': 0.042, 'token': 3291, 'token_str': 'problem', 'sequence': 'this is a simple problem.'}, {'score': 0.031, 'token': 3160, 'token_str': 'question', 'sequence': 'this is a simple question.'}, {'score': 0.03, 'token': 8522, 'token_str': 'equation', 'sequence': 'this is a simple equation.'}, {'score': 0.027, 'token': 2028, 'token_str': 'one', 'sequence': 'this is a simple one.'}, {'score': 0.024, 'token': 3627, 'token_str': 'rule', 'sequence': 'this is a simple rule.'}] + ``` + + Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial) + + This mask filling pipeline can currently be loaded from [`pipeline`] using the following task identifier: + `"fill-mask"`. + + The models that this pipeline can use are models that have been trained with a masked language modeling objective, + which includes the bi-directional models in the library. See the up-to-date list of available models on + [huggingface.co/models](https://huggingface.co/models?filter=fill-mask). + + + + This pipeline only works for inputs with exactly one token masked. Experimental: We added support for multiple + masks. The returned values are raw model output, and correspond to disjoint probabilities where one might expect + joint probabilities (See [discussion](https://github.com/huggingface/transformers/pull/10222)). + + + + + + This pipeline now supports tokenizer_kwargs. For example try: + + ```python + >>> from transformers import pipeline + + >>> fill_masker = pipeline(model="google-bert/bert-base-uncased") + >>> tokenizer_kwargs = {"truncation": True} + >>> fill_masker( + ... "This is a simple [MASK]. " + "...with a large amount of repeated text appended. " * 100, + ... tokenizer_kwargs=tokenizer_kwargs, + ... 
) + ``` + + + + + + """ + + def get_masked_index(self, input_ids: GenericTensor) -> np.ndarray: + if self.framework == "tf": + masked_index = tf.where(input_ids == self.tokenizer.mask_token_id).numpy() + elif self.framework == "pt": + masked_index = torch.nonzero(input_ids == self.tokenizer.mask_token_id, as_tuple=False) + else: + raise ValueError("Unsupported framework") + return masked_index + + def _ensure_exactly_one_mask_token(self, input_ids: GenericTensor) -> np.ndarray: + masked_index = self.get_masked_index(input_ids) + numel = np.prod(masked_index.shape) + if numel < 1: + raise PipelineException( + "fill-mask", + self.model.base_model_prefix, + f"No mask_token ({self.tokenizer.mask_token}) found on the input", + ) + + def ensure_exactly_one_mask_token(self, model_inputs: GenericTensor): + if isinstance(model_inputs, list): + for model_input in model_inputs: + self._ensure_exactly_one_mask_token(model_input["input_ids"][0]) + else: + for input_ids in model_inputs["input_ids"]: + self._ensure_exactly_one_mask_token(input_ids) + + def preprocess( + self, inputs, return_tensors=None, tokenizer_kwargs=None, **preprocess_parameters + ) -> Dict[str, GenericTensor]: + if return_tensors is None: + return_tensors = self.framework + if tokenizer_kwargs is None: + tokenizer_kwargs = {} + + model_inputs = self.tokenizer(inputs, return_tensors=return_tensors, **tokenizer_kwargs) + self.ensure_exactly_one_mask_token(model_inputs) + return model_inputs + + def _forward(self, model_inputs): + model_outputs = self.model(**model_inputs) + model_outputs["input_ids"] = model_inputs["input_ids"] + return model_outputs + + def postprocess(self, model_outputs, top_k=5, target_ids=None): + # Cap top_k if there are targets + if target_ids is not None and target_ids.shape[0] < top_k: + top_k = target_ids.shape[0] + input_ids = model_outputs["input_ids"][0] + outputs = model_outputs["logits"] + + if self.framework == "tf": + masked_index = tf.where(input_ids == 
self.tokenizer.mask_token_id).numpy()[:, 0] + + outputs = outputs.numpy() + + logits = outputs[0, masked_index, :] + probs = stable_softmax(logits, axis=-1) + if target_ids is not None: + probs = tf.gather_nd(tf.squeeze(probs, 0), target_ids.reshape(-1, 1)) + probs = tf.expand_dims(probs, 0) + + topk = tf.math.top_k(probs, k=top_k) + values, predictions = topk.values.numpy(), topk.indices.numpy() + else: + masked_index = torch.nonzero(input_ids == self.tokenizer.mask_token_id, as_tuple=False).squeeze(-1) + # Fill mask pipeline supports only one ${mask_token} per sample + + logits = outputs[0, masked_index, :] + probs = logits.softmax(dim=-1) + if target_ids is not None: + probs = probs[..., target_ids] + + values, predictions = probs.topk(top_k) + + result = [] + single_mask = values.shape[0] == 1 + for i, (_values, _predictions) in enumerate(zip(values.tolist(), predictions.tolist())): + row = [] + for v, p in zip(_values, _predictions): + # Copy is important since we're going to modify this array in place + tokens = input_ids.numpy().copy() + if target_ids is not None: + p = target_ids[p].tolist() + + tokens[masked_index[i]] = p + # Filter padding out: + tokens = tokens[np.where(tokens != self.tokenizer.pad_token_id)] + # Originally we skip special tokens to give readable output. 
+ # For multi masks though, the other [MASK] would be removed otherwise + # making the output look odd, so we add them back + sequence = self.tokenizer.decode(tokens, skip_special_tokens=single_mask) + proposition = {"score": v, "token": p, "token_str": self.tokenizer.decode([p]), "sequence": sequence} + row.append(proposition) + result.append(row) + if single_mask: + return result[0] + return result + + def get_target_ids(self, targets, top_k=None): + if isinstance(targets, str): + targets = [targets] + try: + vocab = self.tokenizer.get_vocab() + except Exception: + vocab = {} + target_ids = [] + for target in targets: + id_ = vocab.get(target, None) + if id_ is None: + input_ids = self.tokenizer( + target, + add_special_tokens=False, + return_attention_mask=False, + return_token_type_ids=False, + max_length=1, + truncation=True, + )["input_ids"] + if len(input_ids) == 0: + logger.warning( + f"The specified target token `{target}` does not exist in the model vocabulary. " + "We cannot replace it with anything meaningful, ignoring it" + ) + continue + id_ = input_ids[0] + # XXX: If users encounter this pass + # it becomes pretty slow, so let's make sure + # The warning enables them to fix the input to + # get faster performance. + logger.warning( + f"The specified target token `{target}` does not exist in the model vocabulary. " + f"Replacing with `{self.tokenizer.convert_ids_to_tokens(id_)}`." 
+ ) + target_ids.append(id_) + target_ids = list(set(target_ids)) + if len(target_ids) == 0: + raise ValueError("At least one target must be provided when passed.") + target_ids = np.array(target_ids) + return target_ids + + def _sanitize_parameters(self, top_k=None, targets=None, tokenizer_kwargs=None): + preprocess_params = {} + + if tokenizer_kwargs is not None: + preprocess_params["tokenizer_kwargs"] = tokenizer_kwargs + + postprocess_params = {} + + if targets is not None: + target_ids = self.get_target_ids(targets, top_k) + postprocess_params["target_ids"] = target_ids + + if top_k is not None: + postprocess_params["top_k"] = top_k + + if self.tokenizer.mask_token_id is None: + raise PipelineException( + "fill-mask", self.model.base_model_prefix, "The tokenizer does not define a `mask_token`." + ) + return preprocess_params, {}, postprocess_params + + def __call__(self, inputs, **kwargs): + """ + Fill the masked token in the text(s) given as inputs. + + Args: + inputs (`str` or `List[str]`): + One or several texts (or one list of prompts) with masked tokens. + targets (`str` or `List[str]`, *optional*): + When passed, the model will limit the scores to the passed targets instead of looking up in the whole + vocab. If the provided targets are not in the model vocab, they will be tokenized and the first + resulting token will be used (with a warning, and that might be slower). + top_k (`int`, *optional*): + When passed, overrides the number of predictions to return. + + Return: + A list or a list of list of `dict`: Each result comes as list of dictionaries with the following keys: + + - **sequence** (`str`) -- The corresponding input with the mask token prediction. + - **score** (`float`) -- The corresponding probability. + - **token** (`int`) -- The predicted token id (to replace the masked one). + - **token_str** (`str`) -- The predicted token (to replace the masked one). 
+ """ + outputs = super().__call__(inputs, **kwargs) + if isinstance(inputs, list) and len(inputs) == 1: + return outputs[0] + return outputs diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/image_classification.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/image_classification.py new file mode 100644 index 0000000000000000000000000000000000000000..0085e5eb73f826598dae8461a15431e3e5ef8f80 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/image_classification.py @@ -0,0 +1,226 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from typing import List, Union + +import numpy as np + +from ..utils import ( + ExplicitEnum, + add_end_docstrings, + is_tf_available, + is_torch_available, + is_vision_available, + logging, + requires_backends, +) +from .base import Pipeline, build_pipeline_init_args + + +if is_vision_available(): + from PIL import Image + + from ..image_utils import load_image + +if is_tf_available(): + from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES + +if is_torch_available(): + import torch + + from ..models.auto.modeling_auto import MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES + +logger = logging.get_logger(__name__) + + +# Copied from transformers.pipelines.text_classification.sigmoid +def sigmoid(_outputs): + return 1.0 / (1.0 + np.exp(-_outputs)) + + +# Copied from transformers.pipelines.text_classification.softmax +def softmax(_outputs): + maxes = np.max(_outputs, axis=-1, keepdims=True) + shifted_exp = np.exp(_outputs - maxes) + return shifted_exp / shifted_exp.sum(axis=-1, keepdims=True) + + +# Copied from transformers.pipelines.text_classification.ClassificationFunction +class ClassificationFunction(ExplicitEnum): + SIGMOID = "sigmoid" + SOFTMAX = "softmax" + NONE = "none" + + +@add_end_docstrings( + build_pipeline_init_args(has_image_processor=True), + r""" + function_to_apply (`str`, *optional*, defaults to `"default"`): + The function to apply to the model outputs in order to retrieve the scores. Accepts four different values: + + - `"default"`: if the model has a single label, will apply the sigmoid function on the output. If the model + has several labels, will apply the softmax function on the output. + - `"sigmoid"`: Applies the sigmoid function on the output. + - `"softmax"`: Applies the softmax function on the output. + - `"none"`: Does not apply any function on the output.""", +) +class ImageClassificationPipeline(Pipeline): + """ + Image classification pipeline using any `AutoModelForImageClassification`. 
This pipeline predicts the class of an + image. + + Example: + + ```python + >>> from transformers import pipeline + + >>> classifier = pipeline(model="microsoft/beit-base-patch16-224-pt22k-ft22k") + >>> classifier("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png") + [{'score': 0.442, 'label': 'macaw'}, {'score': 0.088, 'label': 'popinjay'}, {'score': 0.075, 'label': 'parrot'}, {'score': 0.073, 'label': 'parodist, lampooner'}, {'score': 0.046, 'label': 'poll, poll_parrot'}] + ``` + + Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial) + + This image classification pipeline can currently be loaded from [`pipeline`] using the following task identifier: + `"image-classification"`. + + See the list of available models on + [huggingface.co/models](https://huggingface.co/models?filter=image-classification). + """ + + function_to_apply: ClassificationFunction = ClassificationFunction.NONE + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + requires_backends(self, "vision") + self.check_model_type( + TF_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES + if self.framework == "tf" + else MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING_NAMES + ) + + def _sanitize_parameters(self, top_k=None, function_to_apply=None, timeout=None): + preprocess_params = {} + if timeout is not None: + preprocess_params["timeout"] = timeout + postprocess_params = {} + if top_k is not None: + postprocess_params["top_k"] = top_k + if isinstance(function_to_apply, str): + function_to_apply = ClassificationFunction(function_to_apply.lower()) + if function_to_apply is not None: + postprocess_params["function_to_apply"] = function_to_apply + return preprocess_params, {}, postprocess_params + + def __call__(self, inputs: Union[str, List[str], "Image.Image", List["Image.Image"]] = None, **kwargs): + """ + Assign labels to the image(s) passed as inputs. 
+ + Args: + inputs (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`): + The pipeline handles three types of images: + + - A string containing a http link pointing to an image + - A string containing a local path to an image + - An image loaded in PIL directly + + The pipeline accepts either a single image or a batch of images, which must then be passed as a string. + Images in a batch must all be in the same format: all as http links, all as local paths, or all as PIL + images. + function_to_apply (`str`, *optional*, defaults to `"default"`): + The function to apply to the model outputs in order to retrieve the scores. Accepts four different + values: + + If this argument is not specified, then it will apply the following functions according to the number + of labels: + + - If the model has a single label, will apply the sigmoid function on the output. + - If the model has several labels, will apply the softmax function on the output. + + Possible values are: + + - `"sigmoid"`: Applies the sigmoid function on the output. + - `"softmax"`: Applies the softmax function on the output. + - `"none"`: Does not apply any function on the output. + top_k (`int`, *optional*, defaults to 5): + The number of top labels that will be returned by the pipeline. If the provided number is higher than + the number of labels available in the model configuration, it will default to the number of labels. + timeout (`float`, *optional*, defaults to None): + The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and + the call may block forever. + + Return: + A dictionary or a list of dictionaries containing result. If the input is a single image, will return a + dictionary, if the input is a list of several images, will return a list of dictionaries corresponding to + the images. + + The dictionaries contain the following keys: + + - **label** (`str`) -- The label identified by the model. 
- **score** (`float`) -- The score attributed by the model for that label.
`function_to_apply` argument: {function_to_apply}") + + dict_scores = [ + {"label": self.model.config.id2label[i], "score": score.item()} for i, score in enumerate(scores) + ] + dict_scores.sort(key=lambda x: x["score"], reverse=True) + if top_k is not None: + dict_scores = dict_scores[:top_k] + + return dict_scores diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/image_feature_extraction.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/image_feature_extraction.py new file mode 100644 index 0000000000000000000000000000000000000000..391eb2b3aec714dbac61fe46bddc7ee74f10cd2f --- /dev/null +++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/image_feature_extraction.py @@ -0,0 +1,112 @@ +from typing import Dict + +from ..utils import add_end_docstrings, is_vision_available +from .base import GenericTensor, Pipeline, build_pipeline_init_args + + +if is_vision_available(): + from ..image_utils import load_image + + +@add_end_docstrings( + build_pipeline_init_args(has_image_processor=True), + """ + image_processor_kwargs (`dict`, *optional*): + Additional dictionary of keyword arguments passed along to the image processor e.g. + {"size": {"height": 100, "width": 100}} + pool (`bool`, *optional*, defaults to `False`): + Whether or not to return the pooled output. If `False`, the model will return the raw hidden states. + """, +) +class ImageFeatureExtractionPipeline(Pipeline): + """ + Image feature extraction pipeline uses no model head. This pipeline extracts the hidden states from the base + transformer, which can be used as features in downstream tasks. 
+ + Example: + + ```python + >>> from transformers import pipeline + + >>> extractor = pipeline(model="google/vit-base-patch16-224", task="image-feature-extraction") + >>> result = extractor("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png", return_tensors=True) + >>> result.shape # This is a tensor of shape [1, sequence_lenth, hidden_dimension] representing the input image. + torch.Size([1, 197, 768]) + ``` + + Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial) + + This image feature extraction pipeline can currently be loaded from [`pipeline`] using the task identifier: + `"image-feature-extraction"`. + + All vision models may be used for this pipeline. See a list of all models, including community-contributed models on + [huggingface.co/models](https://huggingface.co/models). + """ + + def _sanitize_parameters(self, image_processor_kwargs=None, return_tensors=None, pool=None, **kwargs): + preprocess_params = {} if image_processor_kwargs is None else image_processor_kwargs + + postprocess_params = {} + if pool is not None: + postprocess_params["pool"] = pool + if return_tensors is not None: + postprocess_params["return_tensors"] = return_tensors + + if "timeout" in kwargs: + preprocess_params["timeout"] = kwargs["timeout"] + + return preprocess_params, {}, postprocess_params + + def preprocess(self, image, timeout=None, **image_processor_kwargs) -> Dict[str, GenericTensor]: + image = load_image(image, timeout=timeout) + model_inputs = self.image_processor(image, return_tensors=self.framework, **image_processor_kwargs) + if self.framework == "pt": + model_inputs = model_inputs.to(self.torch_dtype) + return model_inputs + + def _forward(self, model_inputs): + model_outputs = self.model(**model_inputs) + return model_outputs + + def postprocess(self, model_outputs, pool=None, return_tensors=False): + pool = pool if pool is not None else False + + if pool: + if "pooler_output" not in model_outputs: 
+ raise ValueError( + "No pooled output was returned. Make sure the model has a `pooler` layer when using the `pool` option." + ) + outputs = model_outputs["pooler_output"] + else: + # [0] is the first available tensor, logits or last_hidden_state. + outputs = model_outputs[0] + + if return_tensors: + return outputs + if self.framework == "pt": + return outputs.tolist() + elif self.framework == "tf": + return outputs.numpy().tolist() + + def __call__(self, *args, **kwargs): + """ + Extract the features of the input(s). + + Args: + images (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`): + The pipeline handles three types of images: + + - A string containing a http link pointing to an image + - A string containing a local path to an image + - An image loaded in PIL directly + + The pipeline accepts either a single image or a batch of images, which must then be passed as a string. + Images in a batch must all be in the same format: all as http links, all as local paths, or all as PIL + images. + timeout (`float`, *optional*, defaults to None): + The maximum time in seconds to wait for fetching images from the web. If None, no timeout is used and + the call may block forever. + Return: + A nested list of `float`: The features computed by the model. 
+ """ + return super().__call__(*args, **kwargs) diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/image_segmentation.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/image_segmentation.py new file mode 100644 index 0000000000000000000000000000000000000000..d388e591bf9df45c4905a6c8ff86fdce1e123906 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/image_segmentation.py @@ -0,0 +1,220 @@ +from typing import Any, Dict, List, Union + +import numpy as np + +from ..utils import add_end_docstrings, is_torch_available, is_vision_available, logging, requires_backends +from .base import Pipeline, build_pipeline_init_args + + +if is_vision_available(): + from PIL import Image + + from ..image_utils import load_image + +if is_torch_available(): + from ..models.auto.modeling_auto import ( + MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES, + MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING_NAMES, + MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES, + MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES, + ) + + +logger = logging.get_logger(__name__) + + +Prediction = Dict[str, Any] +Predictions = List[Prediction] + + +@add_end_docstrings(build_pipeline_init_args(has_image_processor=True)) +class ImageSegmentationPipeline(Pipeline): + """ + Image segmentation pipeline using any `AutoModelForXXXSegmentation`. This pipeline predicts masks of objects and + their classes. + + Example: + + ```python + >>> from transformers import pipeline + + >>> segmenter = pipeline(model="facebook/detr-resnet-50-panoptic") + >>> segments = segmenter("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png") + >>> len(segments) + 2 + + >>> segments[0]["label"] + 'bird' + + >>> segments[1]["label"] + 'bird' + + >>> type(segments[0]["mask"]) # This is a black and white mask showing where is the bird on the original image. 
+ + + >>> segments[0]["mask"].size + (768, 512) + ``` + + + This image segmentation pipeline can currently be loaded from [`pipeline`] using the following task identifier: + `"image-segmentation"`. + + See the list of available models on + [huggingface.co/models](https://huggingface.co/models?filter=image-segmentation). + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + if self.framework == "tf": + raise ValueError(f"The {self.__class__} is only available in PyTorch.") + + requires_backends(self, "vision") + mapping = MODEL_FOR_IMAGE_SEGMENTATION_MAPPING_NAMES.copy() + mapping.update(MODEL_FOR_SEMANTIC_SEGMENTATION_MAPPING_NAMES) + mapping.update(MODEL_FOR_INSTANCE_SEGMENTATION_MAPPING_NAMES) + mapping.update(MODEL_FOR_UNIVERSAL_SEGMENTATION_MAPPING_NAMES) + self.check_model_type(mapping) + + def _sanitize_parameters(self, **kwargs): + preprocess_kwargs = {} + postprocess_kwargs = {} + if "subtask" in kwargs: + postprocess_kwargs["subtask"] = kwargs["subtask"] + preprocess_kwargs["subtask"] = kwargs["subtask"] + if "threshold" in kwargs: + postprocess_kwargs["threshold"] = kwargs["threshold"] + if "mask_threshold" in kwargs: + postprocess_kwargs["mask_threshold"] = kwargs["mask_threshold"] + if "overlap_mask_area_threshold" in kwargs: + postprocess_kwargs["overlap_mask_area_threshold"] = kwargs["overlap_mask_area_threshold"] + if "timeout" in kwargs: + preprocess_kwargs["timeout"] = kwargs["timeout"] + + return preprocess_kwargs, {}, postprocess_kwargs + + def __call__(self, inputs=None, **kwargs) -> Union[Predictions, List[Prediction]]: + """ + Perform segmentation (detect masks & classes) in the image(s) passed as inputs. 
+ + Args: + inputs (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`): + The pipeline handles three types of images: + + - A string containing an HTTP(S) link pointing to an image + - A string containing a local path to an image + - An image loaded in PIL directly + + The pipeline accepts either a single image or a batch of images. Images in a batch must all be in the + same format: all as HTTP(S) links, all as local paths, or all as PIL images. + subtask (`str`, *optional*): + Segmentation task to be performed, choose [`semantic`, `instance` and `panoptic`] depending on model + capabilities. If not set, the pipeline will attempt tp resolve in the following order: + `panoptic`, `instance`, `semantic`. + threshold (`float`, *optional*, defaults to 0.9): + Probability threshold to filter out predicted masks. + mask_threshold (`float`, *optional*, defaults to 0.5): + Threshold to use when turning the predicted masks into binary values. + overlap_mask_area_threshold (`float`, *optional*, defaults to 0.5): + Mask overlap threshold to eliminate small, disconnected segments. + timeout (`float`, *optional*, defaults to None): + The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and + the call may block forever. + + Return: + A dictionary or a list of dictionaries containing the result. If the input is a single image, will return a + list of dictionaries, if the input is a list of several images, will return a list of list of dictionaries + corresponding to each image. + + The dictionaries contain the mask, label and score (where applicable) of each detected object and contains + the following keys: + + - **label** (`str`) -- The class label identified by the model. + - **mask** (`PIL.Image`) -- A binary mask of the detected object as a Pil Image of shape (width, height) of + the original image. Returns a mask filled with zeros if no object is found. 
+ - **score** (*optional* `float`) -- Optionally, when the model is capable of estimating a confidence of the + "object" described by the label and the mask. + """ + # After deprecation of this is completed, remove the default `None` value for `images` + if "images" in kwargs: + inputs = kwargs.pop("images") + if inputs is None: + raise ValueError("Cannot call the image-classification pipeline without an inputs argument!") + return super().__call__(inputs, **kwargs) + + def preprocess(self, image, subtask=None, timeout=None): + image = load_image(image, timeout=timeout) + target_size = [(image.height, image.width)] + if self.model.config.__class__.__name__ == "OneFormerConfig": + if subtask is None: + kwargs = {} + else: + kwargs = {"task_inputs": [subtask]} + inputs = self.image_processor(images=[image], return_tensors="pt", **kwargs) + if self.framework == "pt": + inputs = inputs.to(self.torch_dtype) + inputs["task_inputs"] = self.tokenizer( + inputs["task_inputs"], + padding="max_length", + max_length=self.model.config.task_seq_len, + return_tensors=self.framework, + )["input_ids"] + else: + inputs = self.image_processor(images=[image], return_tensors="pt") + if self.framework == "pt": + inputs = inputs.to(self.torch_dtype) + inputs["target_size"] = target_size + return inputs + + def _forward(self, model_inputs): + target_size = model_inputs.pop("target_size") + model_outputs = self.model(**model_inputs) + model_outputs["target_size"] = target_size + return model_outputs + + def postprocess( + self, model_outputs, subtask=None, threshold=0.9, mask_threshold=0.5, overlap_mask_area_threshold=0.5 + ): + fn = None + if subtask in {"panoptic", None} and hasattr(self.image_processor, "post_process_panoptic_segmentation"): + fn = self.image_processor.post_process_panoptic_segmentation + elif subtask in {"instance", None} and hasattr(self.image_processor, "post_process_instance_segmentation"): + fn = self.image_processor.post_process_instance_segmentation + + if fn is 
not None: + outputs = fn( + model_outputs, + threshold=threshold, + mask_threshold=mask_threshold, + overlap_mask_area_threshold=overlap_mask_area_threshold, + target_sizes=model_outputs["target_size"], + )[0] + + annotation = [] + segmentation = outputs["segmentation"] + + for segment in outputs["segments_info"]: + mask = (segmentation == segment["id"]) * 255 + mask = Image.fromarray(mask.numpy().astype(np.uint8), mode="L") + label = self.model.config.id2label[segment["label_id"]] + score = segment["score"] + annotation.append({"score": score, "label": label, "mask": mask}) + + elif subtask in {"semantic", None} and hasattr(self.image_processor, "post_process_semantic_segmentation"): + outputs = self.image_processor.post_process_semantic_segmentation( + model_outputs, target_sizes=model_outputs["target_size"] + )[0] + + annotation = [] + segmentation = outputs.numpy() + labels = np.unique(segmentation) + + for label in labels: + mask = (segmentation == label) * 255 + mask = Image.fromarray(mask.astype(np.uint8), mode="L") + label = self.model.config.id2label[label] + annotation.append({"score": None, "label": label, "mask": mask}) + else: + raise ValueError(f"Subtask {subtask} is not supported for model {type(self.model)}") + return annotation diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/image_text_to_text.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/image_text_to_text.py new file mode 100644 index 0000000000000000000000000000000000000000..5afba0d7c0410ed5ee7a0f4d53d0f791b43c6f8c --- /dev/null +++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/image_text_to_text.py @@ -0,0 +1,432 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import enum +from typing import Dict, List, Optional, Union + +from ..processing_utils import ProcessingKwargs, Unpack +from ..utils import ( + add_end_docstrings, + is_torch_available, + is_vision_available, + logging, + requires_backends, +) +from .base import Pipeline, build_pipeline_init_args + + +if is_vision_available(): + from PIL import Image + + from ..image_utils import load_images, valid_images + + +if is_torch_available(): + from ..models.auto.modeling_auto import MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES + from .pt_utils import KeyDataset + +logger = logging.get_logger(__name__) + +IMAGE_TOKEN = "" + + +class ReturnType(enum.Enum): + TENSORS = 0 + NEW_TEXT = 1 + FULL_TEXT = 2 + + +class Chat: + """This class is intended to just be used internally in this pipeline and not exposed to users. 
We convert chats + to this format because the rest of the pipeline code tends to assume that lists of messages are + actually a batch of samples rather than messages in the same conversation.""" + + def __init__(self, messages: Dict, images: Union[str, List[str], "Image.Image", List["Image.Image"]]): + for message in messages: + if not ("role" in message and "content" in message): + raise ValueError("When passing chat dicts as input, each dict must have a 'role' and 'content' key.") + images = retrieve_images_in_messages(messages, images) + + self.messages = messages + self.images = images + + +def retrieve_images_in_messages( + messages: dict, images: Optional[Union[str, List[str], "Image.Image", List["Image.Image"]]] +): + """ + Retrieve and combine images from the chat and the images passed as input. + """ + if images is None: + images = [] + idx_images = 0 + retrieved_images = [] + for message in messages: + for content in message["content"]: + if isinstance(content, dict): + if content.get("type") == "image": + for key in ["image", "url", "path", "base64"]: + if key in content: + retrieved_images.append(content[key]) + break + else: + if idx_images < len(images): + retrieved_images.append(images[idx_images]) + idx_images += 1 + else: + raise ValueError( + "The number of images in the chat messages should be the same as the number of images passed to the pipeline." + ) + # Add support for OpenAI/TGI chat format + elif content.get("type") == "image_url": + if isinstance(content.get("image_url"), dict) and "url" in content["image_url"]: + retrieved_images.append(content["image_url"]["url"]) + # Rewrite content to be in the Transformers chat format + content["type"] = "image" + content["image"] = content["image_url"]["url"] + del content["image_url"] + else: + raise ValueError( + "Wrong format for 'image_url' content type. The content should have an 'image_url' dict with a 'url' key." 
+ ) + + # The number of images passed should be consistent with the number of images in the chat without an image key + if idx_images != len(images): + raise ValueError( + "The number of images in the chat messages should be the same as the number of images passed to the pipeline." + ) + + return retrieved_images + + +@add_end_docstrings(build_pipeline_init_args(has_processor=True)) +class ImageTextToTextPipeline(Pipeline): + """ + Image-text-to-text pipeline using an `AutoModelForImageTextToText`. This pipeline generates text given an image and text. + When the underlying model is a conversational model, it can also accept one or more chats, + in which case the pipeline will operate in chat mode and will continue the chat(s) by adding its response(s). + Each chat takes the form of a list of dicts, where each dict contains "role" and "content" keys. + + Example: + + ```python + >>> from transformers import pipeline + + >>> pipe = pipeline(task="image-text-to-text", model="Salesforce/blip-image-captioning-base") + >>> pipe("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png", text="A photo of") + [{'generated_text': 'a photo of two birds'}] + ``` + + ```python + >>> from transformers import pipeline + + >>> pipe = pipeline("image-text-to-text", model="llava-hf/llava-interleave-qwen-0.5b-hf") + >>> messages = [ + >>> { + >>> "role": "user", + >>> "content": [ + >>> { + >>> "type": "image", + >>> "url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg", + >>> }, + >>> {"type": "text", "text": "Describe this image."}, + >>> ], + >>> }, + >>> { + >>> "role": "assistant", + >>> "content": [ + >>> {"type": "text", "text": "There is a dog and"}, + >>> ], + >>> }, + >>> ] + >>> pipe(text=messages, max_new_tokens=20, return_full_text=False) + [{'input_text': [{'role': 'user', + 'content': [{'type': 'image', + 'url': 'https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg'}, + {'type': 'text', 'text': 'Describe 
this image.'}]}, + {'role': 'assistant', + 'content': [{'type': 'text', 'text': 'There is a dog and'}]}], + 'generated_text': ' a person in the image. The dog is sitting on the sand, and the person is sitting on'}] + ``` + + Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial) + + This image-text to text pipeline can currently be loaded from pipeline() using the following task identifier: + "image-text-to-text". + + See the list of available models on + [huggingface.co/models](https://huggingface.co/models?pipeline_tag=image-text-to-text). + """ + + _load_processor = True + _load_image_processor = False + _load_feature_extractor = False + _load_tokenizer = False + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + requires_backends(self, "vision") + self.check_model_type(MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES) + + def _sanitize_parameters( + self, + max_new_tokens=None, + generate_kwargs=None, + timeout=None, + return_full_text=None, + return_tensors=None, + return_type=None, + continue_final_message=None, + **kwargs: Unpack[ProcessingKwargs], + ): + forward_kwargs = {} + preprocess_params = {} + postprocess_params = {} + + preprocess_params["processing_kwargs"] = kwargs + + if timeout is not None: + preprocess_params["timeout"] = timeout + + if continue_final_message is not None: + preprocess_params["continue_final_message"] = continue_final_message + + if generate_kwargs is not None: + forward_kwargs["generate_kwargs"] = generate_kwargs + + if max_new_tokens is not None: + if "generate_kwargs" not in forward_kwargs: + forward_kwargs["generate_kwargs"] = {} + if "max_new_tokens" in forward_kwargs["generate_kwargs"]: + raise ValueError( + "'max_new_tokens' is defined twice, once in 'generate_kwargs' and once as a direct parameter," + " please use only one" + ) + forward_kwargs["generate_kwargs"]["max_new_tokens"] = max_new_tokens + + if return_full_text is not None and return_type is None: + 
if return_tensors is not None: + raise ValueError("`return_full_text` is mutually exclusive with `return_tensors`") + return_type = ReturnType.FULL_TEXT if return_full_text else ReturnType.NEW_TEXT + if return_tensors is not None and return_type is None: + return_type = ReturnType.TENSORS + if return_type is not None: + postprocess_params["return_type"] = return_type + if continue_final_message is not None: + postprocess_params["continue_final_message"] = continue_final_message + + return preprocess_params, forward_kwargs, postprocess_params + + def __call__( + self, + images: Optional[ + Union[str, List[str], List[List[str]], "Image.Image", List["Image.Image"], List[List["Image.Image"]]] + ] = None, + text: Optional[Union[str, List[str], List[dict]]] = None, + **kwargs, + ): + """ + Generate a text given text and the image(s) passed as inputs. + + Args: + images (`str`, `List[str]`, `PIL.Image or `List[PIL.Image]`): + The pipeline handles three types of images: + + - A string containing a HTTP(s) link pointing to an image + - A string containing a local path to an image + - An image loaded in PIL directly + + The pipeline accepts either a single image or a batch of images. + text (str, List[str], `List[Dict[str, Union[str, PIL.Image]]]`): + The text to be used for generation. If a list of strings is passed, the length of the list should be the + same as the number of images. Text can also follow the chat format: a list of dictionaries where each + dictionary represents a message in a conversation. Each dictionary should have two keys: 'role' and + 'content'. 'role' should be one of 'user', 'system' or 'assistant'. 'content' should be a list of dictionary + containing the text of the message and the type of the message. The type of the message can be either + 'text' or 'image'. If the type is 'image', no text is needed. + return_tensors (`bool`, *optional*, defaults to `False`): + Returns the tensors of predictions (as token indices) in the outputs. 
If set to + `True`, the decoded text is not returned. + return_text (`bool`, *optional*): + Returns the decoded texts in the outputs. + return_full_text (`bool`, *optional*, defaults to `True`): + If set to `False` only added text is returned, otherwise the full text is returned. Cannot be + specified at the same time as `return_text`. + continue_final_message( `bool`, *optional*): This indicates that you want the model to continue the + last message in the input chat rather than starting a new one, allowing you to "prefill" its response. + By default this is `True` when the final message in the input chat has the `assistant` role and + `False` otherwise, but you can manually override that behaviour by setting this flag. + + Return: + A list or a list of list of `dict`: Each result comes as a dictionary with the following key (cannot return a combination + of both `generated_text` and `generated_token_ids`): + + - **generated_text** (`str`, present when `return_text=True`) -- The generated text. + - **generated_token_ids** (`torch.Tensor`, present when `return_tensors=True`) -- The token + ids of the generated text. + - **input_text** (`str`) -- The input text. + """ + if images is None and text is None: + raise ValueError("You must at least provide either text or images.") + if images is not None and text is None and not valid_images(images): + """ + Supports the following format + - {"image": image, "text": text} + - [{"image": image, "text": text}] + - Generator and datasets + This is a common pattern in other multimodal pipelines, so we support it here as well. 
+ """ + return super().__call__(images, **kwargs) + + if isinstance(text, (list, tuple, KeyDataset)) and isinstance(text[0], (list, tuple, dict)): + # We have one or more prompts in list-of-dicts format, so this is chat mode + if isinstance(text[0], dict): + return super().__call__(Chat(text, images), **kwargs) + else: + if images is None: + images = [None] * len(text) + chats = [Chat(chat, image) for chat, image in zip(text, images)] # 🐈 🐈 🐈 + return super().__call__(chats, **kwargs) + + # encourage the user to use the chat format if supported + if getattr(self.processor, "chat_template", None) is not None: + logger.warning_once( + "The input data was not formatted as a chat with dicts containing 'role' and 'content' keys, even though this model supports chat. " + "Consider using the chat format for better results. For more information, see https://huggingface.co/docs/transformers/en/chat_templating" + ) + + # support text only generation + if images is None: + return super().__call__(text, **kwargs) + if text is None: + raise ValueError("You must provide text for this pipeline.") + + return super().__call__({"images": images, "text": text}, **kwargs) + + def preprocess(self, inputs=None, timeout=None, continue_final_message=None, processing_kwargs=None): + # In case we only have text inputs + if isinstance(inputs, (list, tuple, str)): + images = None + text = inputs + inputs_text = inputs + else: + if isinstance(inputs, Chat): + # If the user passes a chat that ends in an assistant message, we treat it as a prefill by default + # because very few models support multiple separate, consecutive assistant messages + if continue_final_message is None: + continue_final_message = inputs.messages[-1]["role"] == "assistant" + text = self.processor.apply_chat_template( + inputs.messages, + add_generation_prompt=not continue_final_message, + continue_final_message=continue_final_message, + return_tensors=self.framework, + ) + inputs_text = inputs + images = inputs.images + 
else: + text = inputs["text"] + inputs_text = inputs["text"] + images = inputs["images"] + + images = load_images(images) + + # if batched text inputs, we set padding to True unless specified otherwise + if isinstance(text, (list, tuple)) and len(text) > 1: + processing_kwargs.setdefault("padding", True) + model_inputs = self.processor( + images=images, text=text, return_tensors=self.framework, legacy=False, **processing_kwargs + ).to(dtype=self.torch_dtype) + + model_inputs["text"] = inputs_text + + return model_inputs + + def _forward(self, model_inputs, generate_kwargs=None): + generate_kwargs = {} if generate_kwargs is None else generate_kwargs + prompt_text = model_inputs.pop("text") + input_ids = ( + model_inputs["input_ids"] if "input_ids" in model_inputs else model_inputs["decoder_input_ids"] + ) # for decoder-only models + generated_sequence = self.model.generate(**model_inputs, **generate_kwargs) + + return {"generated_sequence": generated_sequence, "prompt_text": prompt_text, "input_ids": input_ids} + + def postprocess(self, model_outputs, return_type=ReturnType.FULL_TEXT, continue_final_message=None): + input_texts = model_outputs["prompt_text"] + input_texts = [input_texts] if isinstance(input_texts, (str, Chat)) else input_texts + generated_sequence = model_outputs["generated_sequence"] + input_ids = model_outputs["input_ids"] + if return_type == ReturnType.TENSORS: + return [ + {"input_text": input_texts[i], "generated_token_ids": generated_sequence[i]} + for i in range(len(input_texts)) + ] + + # Decode inputs and outputs the same way to remove input text from generated text if present + generated_texts = self.processor.post_process_image_text_to_text(generated_sequence) + decoded_inputs = self.processor.post_process_image_text_to_text(input_ids) + + # Force consistent behavior for including the input text in the output + if return_type in {ReturnType.NEW_TEXT, ReturnType.FULL_TEXT}: + # Remove the input text from the generated text if the generated 
text starts with the input text + # (accounting for the possibility of a space between the input and generated text) + new_generated_texts = [] + for text_generated, decoded_input in zip(generated_texts, decoded_inputs): + # There can be added characters before the input text, so we need to find the beginning of the input text in the generated text + index_input_text = text_generated.find(decoded_input) + # Limit the search to 2 residual characters, like spaces or new lines, to avoid removing a large part of the answer + if 0 <= index_input_text <= 2: + # If the input text is found, we remove it + new_generated_texts.append(text_generated[index_input_text + len(decoded_input) :]) + else: + new_generated_texts.append(text_generated) + generated_texts = new_generated_texts + if return_type == ReturnType.FULL_TEXT: + full_texts = [] + for prompt_text, generated_text in zip(input_texts, generated_texts): + if isinstance(prompt_text, str): + generated_text = prompt_text + generated_text + elif isinstance(prompt_text, Chat): + if continue_final_message is None: + # If the user passes a chat ending in an assistant message, we treat it as a prefill by + # default because very few models support multiple separate, consecutive assistant messages + continue_final_message = prompt_text.messages[-1]["role"] == "assistant" + if continue_final_message: + # With assistant prefill, concat onto the end of the last message + new_text = dict(prompt_text.messages[-1]["content"][-1].items()) + new_text["text"] += generated_text + generated_text = list(prompt_text.messages)[:-1] + [ + { + "role": prompt_text.messages[-1]["role"], + "content": prompt_text.messages[-1]["content"][:-1] + [new_text], + } + ] + else: + # When we're not starting from a prefill, the output is a new assistant message + generated_text = list(prompt_text.messages) + [ + {"role": "assistant", "content": generated_text} + ] + full_texts.append(generated_text) + generated_texts = full_texts + + records = [ + { + 
"input_text": input_text.messages if isinstance(input_text, Chat) else input_text, + "generated_text": generated_text, + } + for input_text, generated_text in zip(input_texts, generated_texts) + ] + + return records diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/image_to_image.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/image_to_image.py new file mode 100644 index 0000000000000000000000000000000000000000..cb66359a4dddea48519f2de2dc69e86cd4ac5645 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/image_to_image.py @@ -0,0 +1,136 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import List, Union + +import numpy as np + +from ..utils import ( + add_end_docstrings, + is_torch_available, + is_vision_available, + logging, + requires_backends, +) +from .base import Pipeline, build_pipeline_init_args + + +if is_vision_available(): + from PIL import Image + + from ..image_utils import load_image + +if is_torch_available(): + from ..models.auto.modeling_auto import MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES + +logger = logging.get_logger(__name__) + + +@add_end_docstrings(build_pipeline_init_args(has_image_processor=True)) +class ImageToImagePipeline(Pipeline): + """ + Image to Image pipeline using any `AutoModelForImageToImage`. This pipeline generates an image based on a previous + image input. 
+ + Example: + + ```python + >>> from PIL import Image + >>> import requests + + >>> from transformers import pipeline + + >>> upscaler = pipeline("image-to-image", model="caidas/swin2SR-classical-sr-x2-64") + >>> img = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw) + >>> img = img.resize((64, 64)) + >>> upscaled_img = upscaler(img) + >>> img.size + (64, 64) + + >>> upscaled_img.size + (144, 144) + ``` + + This image to image pipeline can currently be loaded from [`pipeline`] using the following task identifier: + `"image-to-image"`. + + See the list of available models on [huggingface.co/models](https://huggingface.co/models?filter=image-to-image). + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + requires_backends(self, "vision") + self.check_model_type(MODEL_FOR_IMAGE_TO_IMAGE_MAPPING_NAMES) + + def _sanitize_parameters(self, **kwargs): + preprocess_params = {} + postprocess_params = {} + forward_params = {} + + if "timeout" in kwargs: + preprocess_params["timeout"] = kwargs["timeout"] + if "head_mask" in kwargs: + forward_params["head_mask"] = kwargs["head_mask"] + + return preprocess_params, forward_params, postprocess_params + + def __call__( + self, images: Union[str, List[str], "Image.Image", List["Image.Image"]], **kwargs + ) -> Union["Image.Image", List["Image.Image"]]: + """ + Transform the image(s) passed as inputs. + + Args: + images (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`): + The pipeline handles three types of images: + + - A string containing a http link pointing to an image + - A string containing a local path to an image + - An image loaded in PIL directly + + The pipeline accepts either a single image or a batch of images, which must then be passed as a string. + Images in a batch must all be in the same format: all as http links, all as local paths, or all as PIL + images. 
+ timeout (`float`, *optional*, defaults to None): + The maximum time in seconds to wait for fetching images from the web. If None, no timeout is used and + the call may block forever. + + Return: + An image (Image.Image) or a list of images (List["Image.Image"]) containing result(s). If the input is a + single image, the return will be also a single image, if the input is a list of several images, it will + return a list of transformed images. + """ + return super().__call__(images, **kwargs) + + def _forward(self, model_inputs): + model_outputs = self.model(**model_inputs) + return model_outputs + + def preprocess(self, image, timeout=None): + image = load_image(image, timeout=timeout) + inputs = self.image_processor(images=[image], return_tensors="pt") + if self.framework == "pt": + inputs = inputs.to(self.torch_dtype) + return inputs + + def postprocess(self, model_outputs): + images = [] + if "reconstruction" in model_outputs.keys(): + outputs = model_outputs.reconstruction + for output in outputs: + output = output.data.squeeze().float().cpu().clamp_(0, 1).numpy() + output = np.moveaxis(output, source=0, destination=-1) + output = (output * 255.0).round().astype(np.uint8) # float32 to uint8 + images.append(Image.fromarray(output)) + + return images if len(images) > 1 else images[0] diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/image_to_text.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/image_to_text.py new file mode 100644 index 0000000000000000000000000000000000000000..32a3ec218dac305f93d8e41959200a78c590c8df --- /dev/null +++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/image_to_text.py @@ -0,0 +1,222 @@ +# coding=utf-8 +# Copyright 2024 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List, Union + +from ..utils import ( + add_end_docstrings, + is_tf_available, + is_torch_available, + is_vision_available, + logging, + requires_backends, +) +from .base import Pipeline, build_pipeline_init_args + + +if is_vision_available(): + from PIL import Image + + from ..image_utils import load_image + +if is_tf_available(): + from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES + +if is_torch_available(): + import torch + + from ..models.auto.modeling_auto import MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES + +logger = logging.get_logger(__name__) + + +@add_end_docstrings(build_pipeline_init_args(has_tokenizer=True, has_image_processor=True)) +class ImageToTextPipeline(Pipeline): + """ + Image To Text pipeline using a `AutoModelForVision2Seq`. This pipeline predicts a caption for a given image. + + Example: + + ```python + >>> from transformers import pipeline + + >>> captioner = pipeline(model="ydshieh/vit-gpt2-coco-en") + >>> captioner("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png") + [{'generated_text': 'two birds are standing next to each other '}] + ``` + + Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial) + + This image to text pipeline can currently be loaded from pipeline() using the following task identifier: + "image-to-text". + + See the list of available models on + [huggingface.co/models](https://huggingface.co/models?pipeline_tag=image-to-text). 
+ """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + requires_backends(self, "vision") + self.check_model_type( + TF_MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES if self.framework == "tf" else MODEL_FOR_VISION_2_SEQ_MAPPING_NAMES + ) + + def _sanitize_parameters(self, max_new_tokens=None, generate_kwargs=None, prompt=None, timeout=None): + forward_params = {} + preprocess_params = {} + + if prompt is not None: + preprocess_params["prompt"] = prompt + if timeout is not None: + preprocess_params["timeout"] = timeout + + if max_new_tokens is not None: + forward_params["max_new_tokens"] = max_new_tokens + if generate_kwargs is not None: + if max_new_tokens is not None and "max_new_tokens" in generate_kwargs: + raise ValueError( + "`max_new_tokens` is defined both as an argument and inside `generate_kwargs` argument, please use" + " only 1 version" + ) + forward_params.update(generate_kwargs) + + if self.assistant_model is not None: + forward_params["assistant_model"] = self.assistant_model + if self.assistant_tokenizer is not None: + forward_params["tokenizer"] = self.tokenizer + forward_params["assistant_tokenizer"] = self.assistant_tokenizer + + return preprocess_params, forward_params, {} + + def __call__(self, inputs: Union[str, List[str], "Image.Image", List["Image.Image"]] = None, **kwargs): + """ + Assign labels to the image(s) passed as inputs. + + Args: + inputs (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`): + The pipeline handles three types of images: + + - A string containing a HTTP(s) link pointing to an image + - A string containing a local path to an image + - An image loaded in PIL directly + + The pipeline accepts either a single image or a batch of images. + + max_new_tokens (`int`, *optional*): + The amount of maximum tokens to generate. By default it will use `generate` default. 
+ + generate_kwargs (`Dict`, *optional*): + Pass it to send all of these arguments directly to `generate` allowing full control of this function. + + timeout (`float`, *optional*, defaults to None): + The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and + the call may block forever. + + Return: + A list or a list of list of `dict`: Each result comes as a dictionary with the following key: + + - **generated_text** (`str`) -- The generated text. + """ + # After deprecation of this is completed, remove the default `None` value for `images` + if "images" in kwargs: + inputs = kwargs.pop("images") + if inputs is None: + raise ValueError("Cannot call the image-to-text pipeline without an inputs argument!") + return super().__call__(inputs, **kwargs) + + def preprocess(self, image, prompt=None, timeout=None): + image = load_image(image, timeout=timeout) + + if prompt is not None: + logger.warning_once( + "Passing `prompt` to the `image-to-text` pipeline is deprecated and will be removed in version 4.48" + " of 🤗 Transformers. Use the `image-text-to-text` pipeline instead", + ) + if not isinstance(prompt, str): + raise ValueError( + f"Received an invalid text input, got - {type(prompt)} - but expected a single string. " + "Note also that one single text can be provided for conditional image to text generation." 
+ ) + + model_type = self.model.config.model_type + + if model_type == "git": + model_inputs = self.image_processor(images=image, return_tensors=self.framework) + if self.framework == "pt": + model_inputs = model_inputs.to(self.torch_dtype) + input_ids = self.tokenizer(text=prompt, add_special_tokens=False).input_ids + input_ids = [self.tokenizer.cls_token_id] + input_ids + input_ids = torch.tensor(input_ids).unsqueeze(0) + model_inputs.update({"input_ids": input_ids}) + + elif model_type == "pix2struct": + model_inputs = self.image_processor(images=image, header_text=prompt, return_tensors=self.framework) + if self.framework == "pt": + model_inputs = model_inputs.to(self.torch_dtype) + + elif model_type != "vision-encoder-decoder": + # vision-encoder-decoder does not support conditional generation + model_inputs = self.image_processor(images=image, return_tensors=self.framework) + if self.framework == "pt": + model_inputs = model_inputs.to(self.torch_dtype) + text_inputs = self.tokenizer(prompt, return_tensors=self.framework) + model_inputs.update(text_inputs) + + else: + raise ValueError(f"Model type {model_type} does not support conditional text generation") + + else: + model_inputs = self.image_processor(images=image, return_tensors=self.framework) + if self.framework == "pt": + model_inputs = model_inputs.to(self.torch_dtype) + + if self.model.config.model_type == "git" and prompt is None: + model_inputs["input_ids"] = None + + return model_inputs + + def _forward(self, model_inputs, **generate_kwargs): + # Git model sets `model_inputs["input_ids"] = None` in `preprocess` (when `prompt=None`). In batch model, the + # pipeline will group them into a list of `None`, which fail `_forward`. Avoid this by checking it first. 
+ if ( + "input_ids" in model_inputs + and isinstance(model_inputs["input_ids"], list) + and all(x is None for x in model_inputs["input_ids"]) + ): + model_inputs["input_ids"] = None + + # User-defined `generation_config` passed to the pipeline call take precedence + if "generation_config" not in generate_kwargs: + generate_kwargs["generation_config"] = self.generation_config + + # FIXME: We need to pop here due to a difference in how `generation.py` and `generation.tf_utils.py` + # parse inputs. In the Tensorflow version, `generate` raises an error if we don't use `input_ids` whereas + # the PyTorch version matches it with `self.model.main_input_name` or `self.model.encoder.main_input_name` + # in the `_prepare_model_inputs` method. + inputs = model_inputs.pop(self.model.main_input_name) + model_outputs = self.model.generate(inputs, **model_inputs, **generate_kwargs) + return model_outputs + + def postprocess(self, model_outputs): + records = [] + for output_ids in model_outputs: + record = { + "generated_text": self.tokenizer.decode( + output_ids, + skip_special_tokens=True, + ) + } + records.append(record) + return records diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/mask_generation.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/mask_generation.py new file mode 100644 index 0000000000000000000000000000000000000000..f87e45b7f8ecb410ba5d0a088188256d59290f0f --- /dev/null +++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/mask_generation.py @@ -0,0 +1,287 @@ +from collections import defaultdict +from typing import Optional + +from ..image_utils import load_image +from ..utils import ( + add_end_docstrings, + is_torch_available, + logging, + requires_backends, +) +from .base import ChunkPipeline, build_pipeline_init_args + + +if is_torch_available(): + import torch + + from ..models.auto.modeling_auto import MODEL_FOR_MASK_GENERATION_MAPPING_NAMES + +logger = logging.get_logger(__name__) + + 
+@add_end_docstrings( + build_pipeline_init_args(has_image_processor=True), + r""" + points_per_batch (*optional*, int, default to 64): + Sets the number of points run simultaneously by the model. Higher numbers may be faster but use more GPU + memory. + output_bboxes_mask (`bool`, *optional*, default to `False`): + Whether or not to output the bounding box predictions. + output_rle_masks (`bool`, *optional*, default to `False`): + Whether or not to output the masks in `RLE` format""", +) +class MaskGenerationPipeline(ChunkPipeline): + """ + Automatic mask generation for images using `SamForMaskGeneration`. This pipeline predicts binary masks for an + image, given an image. It is a `ChunkPipeline` because you can seperate the points in a mini-batch in order to + avoid OOM issues. Use the `points_per_batch` argument to control the number of points that will be processed at the + same time. Default is `64`. + + The pipeline works in 3 steps: + 1. `preprocess`: A grid of 1024 points evenly separated is generated along with bounding boxes and point + labels. + For more details on how the points and bounding boxes are created, check the `_generate_crop_boxes` + function. The image is also preprocessed using the `image_processor`. This function `yields` a minibatch of + `points_per_batch`. + + 2. `forward`: feeds the outputs of `preprocess` to the model. The image embedding is computed only once. + Calls both `self.model.get_image_embeddings` and makes sure that the gradients are not computed, and the + tensors and models are on the same device. + + 3. `postprocess`: The most important part of the automatic mask generation happens here. Three steps + are induced: + - image_processor.postprocess_masks (run on each minibatch loop): takes in the raw output masks, + resizes them according + to the image size, and transforms there to binary masks. + - image_processor.filter_masks (on each minibatch loop): uses both `pred_iou_thresh` and + `stability_scores`. 
Also + applies a variety of filters based on non maximum suppression to remove bad masks. + - image_processor.postprocess_masks_for_amg applies the NSM on the mask to only keep relevant ones. + + Example: + + ```python + >>> from transformers import pipeline + + >>> generator = pipeline(model="facebook/sam-vit-base", task="mask-generation") + >>> outputs = generator( + ... "http://images.cocodataset.org/val2017/000000039769.jpg", + ... ) + + >>> outputs = generator( + ... "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png", points_per_batch=128 + ... ) + ``` + + Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial) + + This segmentation pipeline can currently be loaded from [`pipeline`] using the following task identifier: + `"mask-generation"`. + + See the list of available models on [huggingface.co/models](https://huggingface.co/models?filter=mask-generation). + """ + + def __init__(self, **kwargs): + super().__init__(**kwargs) + requires_backends(self, "vision") + requires_backends(self, "torch") + + if self.framework != "pt": + raise ValueError(f"The {self.__class__} is only available in PyTorch.") + + self.check_model_type(MODEL_FOR_MASK_GENERATION_MAPPING_NAMES) + + def _sanitize_parameters(self, **kwargs): + preprocess_kwargs = {} + postprocess_kwargs = {} + forward_params = {} + # preprocess args + if "points_per_batch" in kwargs: + preprocess_kwargs["points_per_batch"] = kwargs["points_per_batch"] + if "points_per_crop" in kwargs: + preprocess_kwargs["points_per_crop"] = kwargs["points_per_crop"] + if "crops_n_layers" in kwargs: + preprocess_kwargs["crops_n_layers"] = kwargs["crops_n_layers"] + if "crop_overlap_ratio" in kwargs: + preprocess_kwargs["crop_overlap_ratio"] = kwargs["crop_overlap_ratio"] + if "crop_n_points_downscale_factor" in kwargs: + preprocess_kwargs["crop_n_points_downscale_factor"] = kwargs["crop_n_points_downscale_factor"] + if "timeout" in kwargs: + 
preprocess_kwargs["timeout"] = kwargs["timeout"] + # postprocess args + if "pred_iou_thresh" in kwargs: + forward_params["pred_iou_thresh"] = kwargs["pred_iou_thresh"] + if "stability_score_offset" in kwargs: + forward_params["stability_score_offset"] = kwargs["stability_score_offset"] + if "mask_threshold" in kwargs: + forward_params["mask_threshold"] = kwargs["mask_threshold"] + if "stability_score_thresh" in kwargs: + forward_params["stability_score_thresh"] = kwargs["stability_score_thresh"] + if "crops_nms_thresh" in kwargs: + postprocess_kwargs["crops_nms_thresh"] = kwargs["crops_nms_thresh"] + if "output_rle_mask" in kwargs: + postprocess_kwargs["output_rle_mask"] = kwargs["output_rle_mask"] + if "output_bboxes_mask" in kwargs: + postprocess_kwargs["output_bboxes_mask"] = kwargs["output_bboxes_mask"] + return preprocess_kwargs, forward_params, postprocess_kwargs + + def __call__(self, image, *args, num_workers=None, batch_size=None, **kwargs): + """ + Generates binary segmentation masks + + Args: + inputs (`np.ndarray` or `bytes` or `str` or `dict`): + Image or list of images. + mask_threshold (`float`, *optional*, defaults to 0.0): + Threshold to use when turning the predicted masks into binary values. + pred_iou_thresh (`float`, *optional*, defaults to 0.88): + A filtering threshold in `[0,1]` applied on the model's predicted mask quality. + stability_score_thresh (`float`, *optional*, defaults to 0.95): + A filtering threshold in `[0,1]`, using the stability of the mask under changes to the cutoff used to + binarize the model's mask predictions. + stability_score_offset (`int`, *optional*, defaults to 1): + The amount to shift the cutoff when calculated the stability score. + crops_nms_thresh (`float`, *optional*, defaults to 0.7): + The box IoU cutoff used by non-maximal suppression to filter duplicate masks. + crops_n_layers (`int`, *optional*, defaults to 0): + If `crops_n_layers>0`, mask prediction will be run again on crops of the image. 
Sets the number of + layers to run, where each layer has 2**i_layer number of image crops. + crop_overlap_ratio (`float`, *optional*, defaults to `512 / 1500`): + Sets the degree to which crops overlap. In the first crop layer, crops will overlap by this fraction of + the image length. Later layers with more crops scale down this overlap. + crop_n_points_downscale_factor (`int`, *optional*, defaults to `1`): + The number of points-per-side sampled in layer n is scaled down by crop_n_points_downscale_factor**n. + timeout (`float`, *optional*, defaults to None): + The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and + the call may block forever. + + Return: + `Dict`: A dictionary with the following keys: + - **mask** (`PIL.Image`) -- A binary mask of the detected object as a PIL Image of shape `(width, + height)` of the original image. Returns a mask filled with zeros if no object is found. + - **score** (*optional* `float`) -- Optionally, when the model is capable of estimating a confidence of + the "object" described by the label and the mask. 
+ + """ + return super().__call__(image, *args, num_workers=num_workers, batch_size=batch_size, **kwargs) + + def preprocess( + self, + image, + points_per_batch=64, + crops_n_layers: int = 0, + crop_overlap_ratio: float = 512 / 1500, + points_per_crop: Optional[int] = 32, + crop_n_points_downscale_factor: Optional[int] = 1, + timeout: Optional[float] = None, + ): + image = load_image(image, timeout=timeout) + target_size = self.image_processor.size["longest_edge"] + crop_boxes, grid_points, cropped_images, input_labels = self.image_processor.generate_crop_boxes( + image, target_size, crops_n_layers, crop_overlap_ratio, points_per_crop, crop_n_points_downscale_factor + ) + model_inputs = self.image_processor(images=cropped_images, return_tensors="pt") + if self.framework == "pt": + model_inputs = model_inputs.to(self.torch_dtype) + + with self.device_placement(): + if self.framework == "pt": + inference_context = self.get_inference_context() + with inference_context(): + model_inputs = self._ensure_tensor_on_device(model_inputs, device=self.device) + image_embeddings = self.model.get_image_embeddings(model_inputs.pop("pixel_values")) + model_inputs["image_embeddings"] = image_embeddings + + n_points = grid_points.shape[1] + points_per_batch = points_per_batch if points_per_batch is not None else n_points + + if points_per_batch <= 0: + raise ValueError( + "Cannot have points_per_batch<=0. Must be >=1 to returned batched outputs. 
" + "To return all points at once, set points_per_batch to None" + ) + + for i in range(0, n_points, points_per_batch): + batched_points = grid_points[:, i : i + points_per_batch, :, :] + labels = input_labels[:, i : i + points_per_batch] + is_last = i == n_points - points_per_batch + yield { + "input_points": batched_points, + "input_labels": labels, + "input_boxes": crop_boxes, + "is_last": is_last, + **model_inputs, + } + + def _forward( + self, + model_inputs, + pred_iou_thresh=0.88, + stability_score_thresh=0.95, + mask_threshold=0, + stability_score_offset=1, + ): + input_boxes = model_inputs.pop("input_boxes") + is_last = model_inputs.pop("is_last") + original_sizes = model_inputs.pop("original_sizes").tolist() + reshaped_input_sizes = model_inputs.pop("reshaped_input_sizes").tolist() + + model_outputs = self.model(**model_inputs) + + # post processing happens here in order to avoid CPU GPU copies of ALL the masks + low_resolution_masks = model_outputs["pred_masks"] + masks = self.image_processor.post_process_masks( + low_resolution_masks, original_sizes, reshaped_input_sizes, mask_threshold, binarize=False + ) + iou_scores = model_outputs["iou_scores"] + masks, iou_scores, boxes = self.image_processor.filter_masks( + masks[0], + iou_scores[0], + original_sizes[0], + input_boxes[0], + pred_iou_thresh, + stability_score_thresh, + mask_threshold, + stability_score_offset, + ) + return { + "masks": masks, + "is_last": is_last, + "boxes": boxes, + "iou_scores": iou_scores, + } + + def postprocess( + self, + model_outputs, + output_rle_mask=False, + output_bboxes_mask=False, + crops_nms_thresh=0.7, + ): + all_scores = [] + all_masks = [] + all_boxes = [] + for model_output in model_outputs: + all_scores.append(model_output.pop("iou_scores")) + all_masks.extend(model_output.pop("masks")) + all_boxes.append(model_output.pop("boxes")) + + all_scores = torch.cat(all_scores) + all_boxes = torch.cat(all_boxes) + output_masks, iou_scores, rle_mask, bounding_boxes = 
self.image_processor.post_process_for_mask_generation( + all_masks, all_scores, all_boxes, crops_nms_thresh + ) + + extra = defaultdict(list) + for output in model_outputs: + for k, v in output.items(): + extra[k].append(v) + + optional = {} + if output_rle_mask: + optional["rle_mask"] = rle_mask + + if output_bboxes_mask: + optional["bounding_boxes"] = bounding_boxes + + return {"masks": output_masks, "scores": iou_scores, **optional, **extra} diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/object_detection.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/object_detection.py new file mode 100644 index 0000000000000000000000000000000000000000..c84f17b2bd6ad0ac2bbbe95a3421e7197a5744c6 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/object_detection.py @@ -0,0 +1,191 @@ +from typing import Any, Dict, List, Union + +from ..utils import add_end_docstrings, is_torch_available, is_vision_available, logging, requires_backends +from .base import Pipeline, build_pipeline_init_args + + +if is_vision_available(): + from ..image_utils import load_image + + +if is_torch_available(): + import torch + + from ..models.auto.modeling_auto import ( + MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES, + MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES, + ) + +logger = logging.get_logger(__name__) + + +Prediction = Dict[str, Any] +Predictions = List[Prediction] + + +@add_end_docstrings(build_pipeline_init_args(has_image_processor=True)) +class ObjectDetectionPipeline(Pipeline): + """ + Object detection pipeline using any `AutoModelForObjectDetection`. This pipeline predicts bounding boxes of objects + and their classes. 
+ + Example: + + ```python + >>> from transformers import pipeline + + >>> detector = pipeline(model="facebook/detr-resnet-50") + >>> detector("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png") + [{'score': 0.997, 'label': 'bird', 'box': {'xmin': 69, 'ymin': 171, 'xmax': 396, 'ymax': 507}}, {'score': 0.999, 'label': 'bird', 'box': {'xmin': 398, 'ymin': 105, 'xmax': 767, 'ymax': 507}}] + + >>> # x, y are expressed relative to the top left hand corner. + ``` + + Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial) + + This object detection pipeline can currently be loaded from [`pipeline`] using the following task identifier: + `"object-detection"`. + + See the list of available models on [huggingface.co/models](https://huggingface.co/models?filter=object-detection). + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + if self.framework == "tf": + raise ValueError(f"The {self.__class__} is only available in PyTorch.") + + requires_backends(self, "vision") + mapping = MODEL_FOR_OBJECT_DETECTION_MAPPING_NAMES.copy() + mapping.update(MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES) + self.check_model_type(mapping) + + def _sanitize_parameters(self, **kwargs): + preprocess_params = {} + if "timeout" in kwargs: + preprocess_params["timeout"] = kwargs["timeout"] + postprocess_kwargs = {} + if "threshold" in kwargs: + postprocess_kwargs["threshold"] = kwargs["threshold"] + return preprocess_params, {}, postprocess_kwargs + + def __call__(self, *args, **kwargs) -> Union[Predictions, List[Prediction]]: + """ + Detect objects (bounding boxes & classes) in the image(s) passed as inputs. 
+ + Args: + inputs (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`): + The pipeline handles three types of images: + + - A string containing an HTTP(S) link pointing to an image + - A string containing a local path to an image + - An image loaded in PIL directly + + The pipeline accepts either a single image or a batch of images. Images in a batch must all be in the + same format: all as HTTP(S) links, all as local paths, or all as PIL images. + threshold (`float`, *optional*, defaults to 0.5): + The probability necessary to make a prediction. + timeout (`float`, *optional*, defaults to None): + The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and + the call may block forever. + + Return: + A list of dictionaries or a list of list of dictionaries containing the result. If the input is a single + image, will return a list of dictionaries, if the input is a list of several images, will return a list of + list of dictionaries corresponding to each image. + + The dictionaries contain the following keys: + + - **label** (`str`) -- The class label identified by the model. + - **score** (`float`) -- The score attributed by the model for that label. + - **box** (`List[Dict[str, int]]`) -- The bounding box of detected object in image's original size. 
+ """ + # After deprecation of this is completed, remove the default `None` value for `images` + if "images" in kwargs and "inputs" not in kwargs: + kwargs["inputs"] = kwargs.pop("images") + return super().__call__(*args, **kwargs) + + def preprocess(self, image, timeout=None): + image = load_image(image, timeout=timeout) + target_size = torch.IntTensor([[image.height, image.width]]) + inputs = self.image_processor(images=[image], return_tensors="pt") + if self.framework == "pt": + inputs = inputs.to(self.torch_dtype) + if self.tokenizer is not None: + inputs = self.tokenizer(text=inputs["words"], boxes=inputs["boxes"], return_tensors="pt") + inputs["target_size"] = target_size + return inputs + + def _forward(self, model_inputs): + target_size = model_inputs.pop("target_size") + outputs = self.model(**model_inputs) + model_outputs = outputs.__class__({"target_size": target_size, **outputs}) + if self.tokenizer is not None: + model_outputs["bbox"] = model_inputs["bbox"] + return model_outputs + + def postprocess(self, model_outputs, threshold=0.5): + target_size = model_outputs["target_size"] + if self.tokenizer is not None: + # This is a LayoutLMForTokenClassification variant. + # The OCR got the boxes and the model classified the words. 
+ height, width = target_size[0].tolist() + + def unnormalize(bbox): + return self._get_bounding_box( + torch.Tensor( + [ + (width * bbox[0] / 1000), + (height * bbox[1] / 1000), + (width * bbox[2] / 1000), + (height * bbox[3] / 1000), + ] + ) + ) + + scores, classes = model_outputs["logits"].squeeze(0).softmax(dim=-1).max(dim=-1) + labels = [self.model.config.id2label[prediction] for prediction in classes.tolist()] + boxes = [unnormalize(bbox) for bbox in model_outputs["bbox"].squeeze(0)] + keys = ["score", "label", "box"] + annotation = [dict(zip(keys, vals)) for vals in zip(scores.tolist(), labels, boxes) if vals[0] > threshold] + else: + # This is a regular ForObjectDetectionModel + raw_annotations = self.image_processor.post_process_object_detection(model_outputs, threshold, target_size) + raw_annotation = raw_annotations[0] + scores = raw_annotation["scores"] + labels = raw_annotation["labels"] + boxes = raw_annotation["boxes"] + + raw_annotation["scores"] = scores.tolist() + raw_annotation["labels"] = [self.model.config.id2label[label.item()] for label in labels] + raw_annotation["boxes"] = [self._get_bounding_box(box) for box in boxes] + + # {"scores": [...], ...} --> [{"score":x, ...}, ...] + keys = ["score", "label", "box"] + annotation = [ + dict(zip(keys, vals)) + for vals in zip(raw_annotation["scores"], raw_annotation["labels"], raw_annotation["boxes"]) + ] + + return annotation + + def _get_bounding_box(self, box: "torch.Tensor") -> Dict[str, int]: + """ + Turns list [xmin, xmax, ymin, ymax] into dict { "xmin": xmin, ... } + + Args: + box (`torch.Tensor`): Tensor containing the coordinates in corners format. + + Returns: + bbox (`Dict[str, int]`): Dict containing the coordinates in corners format. 
+ """ + if self.framework != "pt": + raise ValueError("The ObjectDetectionPipeline is only available in PyTorch.") + xmin, ymin, xmax, ymax = box.int().tolist() + bbox = { + "xmin": xmin, + "ymin": ymin, + "xmax": xmax, + "ymax": ymax, + } + return bbox diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/pt_utils.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/pt_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..19663437cd691efb265770ae007871cafe1275ed --- /dev/null +++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/pt_utils.py @@ -0,0 +1,321 @@ +import numpy as np +import torch +from torch.utils.data import Dataset, IterableDataset + +from ..utils.generic import ModelOutput + + +class PipelineDataset(Dataset): + def __init__(self, dataset, process, params): + self.dataset = dataset + self.process = process + self.params = params + + def __len__(self): + return len(self.dataset) + + def __getitem__(self, i): + item = self.dataset[i] + processed = self.process(item, **self.params) + return processed + + +class PipelineIterator(IterableDataset): + def __init__(self, loader, infer, params, loader_batch_size=None): + """ + Roughly equivalent to + + ``` + for item in loader: + yield infer(item, **params) + ``` + + Arguments: + loader (`torch.utils.data.DataLoader` or `Iterable`): + The iterator that will be used to apply `infer` on. + infer (any function): + The function to apply of each element of `loader`. 
+ params (`dict`): + The parameters passed to `infer` along with every item + loader_batch_size (`int`, *optional*): + If specified, the items of `loader` are supposed to come as batch, and are loader_batched here + making it roughly behave as + + + ``` + for items in loader: + for i in loader_batch_size: + item = items[i] + yield infer(item, **params) + ```""" + self.loader = loader + self.infer = infer + self.params = params + if loader_batch_size == 1: + # Let's spare some time by deactivating altogether + loader_batch_size = None + self.loader_batch_size = loader_batch_size + + # Internal bookkeeping + self._loader_batch_index = None + self._loader_batch_data = None + + def __len__(self): + return len(self.loader) + + def __iter__(self): + self.iterator = iter(self.loader) + return self + + def loader_batch_item(self): + """ + Return item located at `loader_batch_index` within the current `loader_batch_data`. + """ + if isinstance(self._loader_batch_data, torch.Tensor): + # Batch data is simple tensor, just fetch the slice + result = self._loader_batch_data[self._loader_batch_index].unsqueeze(0) + else: + # Batch data is assumed to be BaseModelOutput (or dict) + loader_batched = {} + for k, element in self._loader_batch_data.items(): + if isinstance(element, ModelOutput): + # Convert ModelOutput to tuple first + element = element.to_tuple() + if isinstance(element[0], torch.Tensor): + loader_batched[k] = tuple(el[self._loader_batch_index].unsqueeze(0) for el in element) + elif isinstance(element[0], np.ndarray): + loader_batched[k] = tuple(np.expand_dims(el[self._loader_batch_index], 0) for el in element) + continue + if k in {"hidden_states", "past_key_values", "attentions"} and isinstance(element, tuple): + # Those are stored as lists of tensors so need specific unbatching. 
+ if isinstance(element[0], torch.Tensor): + loader_batched[k] = tuple(el[self._loader_batch_index].unsqueeze(0) for el in element) + elif isinstance(element[0], np.ndarray): + loader_batched[k] = tuple(np.expand_dims(el[self._loader_batch_index], 0) for el in element) + continue + if element is None: + # This can happen for optional data that get passed around + loader_batched[k] = None + elif isinstance(element[self._loader_batch_index], torch.Tensor): + # Take correct batch data, but make it looked like batch_size=1 + # For compatibility with other methods within transformers + + loader_batched[k] = element[self._loader_batch_index].unsqueeze(0) + elif isinstance(element[self._loader_batch_index], np.ndarray): + # Take correct batch data, but make it looked like batch_size=1 + # For compatibility with other methods within transformers + loader_batched[k] = np.expand_dims(element[self._loader_batch_index], 0) + else: + # This is typically a list, so no need to `unsqueeze`. + loader_batched[k] = element[self._loader_batch_index] + # Recreate the element by reusing the original class to make it look + # batch_size=1 + result = self._loader_batch_data.__class__(loader_batched) + self._loader_batch_index += 1 + return result + + def __next__(self): + if self._loader_batch_index is not None and self._loader_batch_index < self.loader_batch_size: + # We are currently unrolling a batch so we just need to return + # the current item within a batch + return self.loader_batch_item() + + # We're out of items within a batch + item = next(self.iterator) + processed = self.infer(item, **self.params) + # We now have a batch of "inferred things". 
+ if self.loader_batch_size is not None: + # Try to infer the size of the batch + if isinstance(processed, torch.Tensor): + first_tensor = processed + elif isinstance(processed, tuple): + first_tensor = processed[0] + else: + key = list(processed.keys())[0] + first_tensor = processed[key] + + if isinstance(first_tensor, list): + observed_batch_size = len(first_tensor) + else: + observed_batch_size = first_tensor.shape[0] + if 0 < observed_batch_size < self.loader_batch_size: + # could be last batch so we can't unroll as many + # elements. + self.loader_batch_size = observed_batch_size + # Setting internal index to unwrap the batch + self._loader_batch_data = processed[0] if isinstance(processed, tuple) else processed + self._loader_batch_index = 0 + return self.loader_batch_item() + else: + # We're not unrolling batches + return processed + + +class PipelineChunkIterator(PipelineIterator): + def __init__(self, loader, infer, params, loader_batch_size=None): + """ + Roughly equivalent to + + ``` + for iterator in loader: + for item in iterator: + yield infer(item, **params) + ``` + + Arguments: + loader (`torch.utils.data.DataLoader` or `Iterable`): + The iterator that will be used to apply `infer` on. + infer (any function): + The function to apply of each element of `loader`. + params (`dict`): + The parameters passed to `infer` along with every item + """ + super().__init__(loader, infer, params) + + def __iter__(self): + self.iterator = iter(self.loader) + self.subiterator = None + return self + + def __next__(self): + if self.subiterator is None: + "Subiterator None means we haven't started a `preprocess` iterator. 
so start it"
            self.subiterator = self.infer(next(self.iterator), **self.params)
        try:
            # Try to return next item
            processed = next(self.subiterator)
        except StopIteration:
            # When a preprocess iterator ends, we can start looking at the next item
            # ChunkIterator will keep feeding until ALL elements of iterator
            # all have created their subiterator and have been iterating against.
            #
            # Another way to look at it, is we're basically flattening lists of lists
            # into a single list, but with generators
            self.subiterator = self.infer(next(self.iterator), **self.params)
            processed = next(self.subiterator)
        return processed


class PipelinePackIterator(PipelineIterator):
    """
    Roughly equivalent to

    ```
    packed = []
    for item in loader:
        packed.append(item)
        if item["is_last"]:
            yield packed
            packed = []
    ```

    but it also handles cases where `item` are batched (meaning it's a dict of Tensor with first dimension > 1. In
    that case it does

    ```
    packed = []
    for batch in loader:
        # item is batched
        for item in batch:
            packed.append(item)
            if item["is_last"]:
                yield packed
                packed = []
    ```

    Arguments:
        loader (`torch.utils.data.DataLoader` or `Iterable`):
            The iterator that will be used to apply `infer` on.
        infer (any function):
            The function to apply of each element of `loader`.
        params (`dict`):
            The parameters passed to `infer` along with every item
        loader_batch_size (`int`, *optional*):
            If specified, the items of `loader` are supposed to come as batch, and are loader_batched here making
            it roughly behave as


    ```
    for items in loader:
        for i in loader_batch_size:
            item = items[i]
            yield infer(item, **params)
    ```"""

    def __iter__(self):
        # Reset iteration state on every fresh `iter()` call.
        self.iterator = iter(self.loader)
        return self

    def __next__(self):
        # Extremely similar to PipelineIterator in its unpacking mechanism
        # BUT, we have an extra required item which is the presence of `is_last`
        # That is because everything is flattened by `PipelineChunkIterator` we
        # need to keep track of how to regroup here in the original `process`
        # boundaries so that `process` and `postprocess` see the same data.

        # This iterator accumulates items (possibly while unbatching) until it
        # hits an `is_last` and then just passes the accumulated group on to the caller.
        is_last = False
        accumulator = []
        # First, drain any partially-consumed loader batch left over from a
        # previous `__next__` call before pulling new data from the iterator.
        if self._loader_batch_index is not None and self._loader_batch_index < self.loader_batch_size:
            while self._loader_batch_index < self.loader_batch_size:
                item = self.loader_batch_item()
                is_last = item.pop("is_last")
                accumulator.append(item)
                if is_last:
                    return accumulator

        while not is_last:
            processed = self.infer(next(self.iterator), **self.params)
            if self.loader_batch_size is not None:
                # `processed` is a batch: find a representative tensor (or list)
                # to discover the actual batch size observed.
                if isinstance(processed, torch.Tensor):
                    first_tensor = processed
                else:
                    key = list(processed.keys())[0]
                    first_tensor = processed[key]
                if isinstance(first_tensor, list):
                    observed_batch_size = len(first_tensor)
                else:
                    observed_batch_size = first_tensor.shape[0]
                if 0 < observed_batch_size < self.loader_batch_size:
                    # could be last batch so we can't unroll as many
                    # elements.
                    self.loader_batch_size = observed_batch_size
                # Unroll the batch element by element, grouping on `is_last`.
                self._loader_batch_data = processed
                self._loader_batch_index = 0
                while self._loader_batch_index < self.loader_batch_size:
                    item = self.loader_batch_item()
                    is_last = item.pop("is_last")
                    accumulator.append(item)
                    if is_last:
                        return accumulator
            else:
                # Unbatched case: each processed item is appended directly.
                item = processed
                is_last = item.pop("is_last")
                accumulator.append(item)
        return accumulator


class KeyDataset(Dataset):
    """Dataset view that yields a single `key` field of each underlying item."""

    def __init__(self, dataset: Dataset, key: str):
        self.dataset = dataset
        self.key = key

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, i):
        return self.dataset[i][self.key]


class KeyPairDataset(Dataset):
    """Dataset view that yields `{"text": item[key1], "text_pair": item[key2]}` pairs."""

    def __init__(self, dataset: Dataset, key1: str, key2: str):
        self.dataset = dataset
        self.key1 = key1
        self.key2 = key2

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, i):
        return {"text": self.dataset[i][self.key1], "text_pair": self.dataset[i][self.key2]}
import inspect
import types
import warnings
from collections.abc import Iterable
from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union

import numpy as np

from ..data import SquadExample, SquadFeatures, squad_convert_examples_to_features
from ..modelcard import ModelCard
from ..tokenization_utils import PreTrainedTokenizer
from ..utils import (
    PaddingStrategy,
    add_end_docstrings,
    is_tf_available,
    is_tokenizers_available,
    is_torch_available,
    logging,
)
from .base import ArgumentHandler, ChunkPipeline, build_pipeline_init_args


logger = logging.get_logger(__name__)
if TYPE_CHECKING:
    from ..modeling_tf_utils import TFPreTrainedModel
    from ..modeling_utils import PreTrainedModel

    if is_tokenizers_available():
        import tokenizers

if is_tf_available():
    import tensorflow as tf

    from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES

    Dataset = None

if is_torch_available():
    import torch
    from torch.utils.data import Dataset

    from ..models.auto.modeling_auto import MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES


def decode_spans(
    start: np.ndarray, end: np.ndarray, topk: int, max_answer_len: int, undesired_tokens: np.ndarray
) -> Tuple:
    """
    Take the output of any `ModelForQuestionAnswering` and will generate probabilities for each span to be the actual
    answer.

    In addition, it filters out some unwanted/impossible cases like answer len being greater than max_answer_len or
    answer end position being before the starting position. The method supports output the k-best answer through the
    topk argument.

    Args:
        start (`np.ndarray`): Individual start probabilities for each token.
        end (`np.ndarray`): Individual end probabilities for each token.
        topk (`int`): Indicates how many possible answer span(s) to extract from the model output.
        max_answer_len (`int`): Maximum size of the answer to extract from the model's output.
        undesired_tokens (`np.ndarray`): Mask determining tokens that can be part of the answer

    Returns:
        `Tuple[np.ndarray, np.ndarray, np.ndarray]`: start indices, end indices and span scores, sorted by
        decreasing score (at most `topk` of them, possibly fewer after filtering).
    """
    # Ensure we have batch axis
    if start.ndim == 1:
        start = start[None]

    if end.ndim == 1:
        end = end[None]

    # Compute the score of each tuple(start, end) to be the real answer
    outer = np.matmul(np.expand_dims(start, -1), np.expand_dims(end, 1))

    # Remove candidate with end < start and end - start > max_answer_len
    candidates = np.tril(np.triu(outer), max_answer_len - 1)

    # Inspired by Chen & al. (https://github.com/facebookresearch/DrQA)
    scores_flat = candidates.flatten()
    if topk == 1:
        idx_sort = [np.argmax(scores_flat)]
    elif len(scores_flat) <= topk:
        # Bug fix: this branch previously used `<`, so `topk == len(scores_flat)`
        # fell through to `np.argpartition(..., kth=topk)` which requires
        # `kth < n` and raised a ValueError. Sorting everything is correct here.
        idx_sort = np.argsort(-scores_flat)
    else:
        idx = np.argpartition(-scores_flat, topk)[0:topk]
        idx_sort = idx[np.argsort(-scores_flat[idx])]

    starts, ends = np.unravel_index(idx_sort, candidates.shape)[1:]
    # Keep only spans whose start AND end land on allowed (context) tokens.
    desired_spans = np.isin(starts, undesired_tokens.nonzero()) & np.isin(ends, undesired_tokens.nonzero())
    starts = starts[desired_spans]
    ends = ends[desired_spans]
    scores = candidates[0, starts, ends]

    return starts, ends, scores


def select_starts_ends(
    start,
    end,
    p_mask,
    attention_mask,
    min_null_score=1000000,
    top_k=1,
    handle_impossible_answer=False,
    max_answer_len=15,
):
    """
    Takes the raw output of any `ModelForQuestionAnswering` and first normalizes its outputs and then uses
    `decode_spans()` to generate probabilities for each span to be the actual answer.

    Args:
        start (`np.ndarray`): Individual start logits for each token.
        end (`np.ndarray`): Individual end logits for each token.
        p_mask (`np.ndarray`): A mask with 1 for values that cannot be in the answer
        attention_mask (`np.ndarray`): The attention mask generated by the tokenizer
        min_null_score (`float`): The minimum null (empty) answer score seen so far.
        top_k (`int`): Indicates how many possible answer span(s) to extract from the model output.
        handle_impossible_answer (`bool`): Whether to allow null (empty) answers
        max_answer_len (`int`): Maximum size of the answer to extract from the model's output.

    Returns:
        `Tuple`: `(starts, ends, scores, min_null_score)` where the first three come from `decode_spans` and
        `min_null_score` is the (possibly updated) running null-answer score.
    """
    # Ensure padded tokens & question tokens cannot belong to the set of candidate answers.
    undesired_tokens = np.abs(np.array(p_mask) - 1)

    if attention_mask is not None:
        undesired_tokens = undesired_tokens & attention_mask

    # Generate mask
    undesired_tokens_mask = undesired_tokens == 0.0

    # Make sure non-context indexes in the tensor cannot contribute to the softmax
    start = np.where(undesired_tokens_mask, -10000.0, start)
    end = np.where(undesired_tokens_mask, -10000.0, end)

    # Normalize logits and spans to retrieve the answer
    # NOTE(review): the denominator sums over ALL axes; this is only a proper
    # softmax because inputs here always carry a batch axis of size 1 — confirm
    # before reusing with batch > 1.
    start = np.exp(start - start.max(axis=-1, keepdims=True))
    start = start / start.sum()

    end = np.exp(end - end.max(axis=-1, keepdims=True))
    end = end / end.sum()

    if handle_impossible_answer:
        min_null_score = min(min_null_score, (start[0, 0] * end[0, 0]).item())

    # Mask CLS
    start[0, 0] = end[0, 0] = 0.0

    starts, ends, scores = decode_spans(start, end, top_k, max_answer_len, undesired_tokens)
    return starts, ends, scores, min_null_score


class QuestionAnsweringArgumentHandler(ArgumentHandler):
    """
    QuestionAnsweringPipeline requires the user to provide multiple arguments (i.e. question & context) to be mapped to
    internal [`SquadExample`].

    QuestionAnsweringArgumentHandler manages all the possible to create a [`SquadExample`] from the command-line
    supplied arguments.
    """
+ """ + + def normalize(self, item): + if isinstance(item, SquadExample): + return item + elif isinstance(item, dict): + for k in ["question", "context"]: + if k not in item: + raise KeyError("You need to provide a dictionary with keys {question:..., context:...}") + elif item[k] is None: + raise ValueError(f"`{k}` cannot be None") + elif isinstance(item[k], str) and len(item[k]) == 0: + raise ValueError(f"`{k}` cannot be empty") + + return QuestionAnsweringPipeline.create_sample(**item) + raise ValueError(f"{item} argument needs to be of type (SquadExample, dict)") + + def __call__(self, *args, **kwargs): + # Detect where the actual inputs are + if args is not None and len(args) > 0: + if len(args) == 1: + inputs = args[0] + elif len(args) == 2 and {type(el) for el in args} == {str}: + inputs = [{"question": args[0], "context": args[1]}] + else: + inputs = list(args) + # Generic compatibility with sklearn and Keras + # Batched data + elif "X" in kwargs: + warnings.warn( + "Passing the `X` argument to the pipeline is deprecated and will be removed in v5. Inputs should be passed using the `question` and `context` keyword arguments instead.", + FutureWarning, + ) + inputs = kwargs["X"] + elif "data" in kwargs: + warnings.warn( + "Passing the `data` argument to the pipeline is deprecated and will be removed in v5. 
Inputs should be passed using the `question` and `context` keyword arguments instead.", + FutureWarning, + ) + inputs = kwargs["data"] + elif "question" in kwargs and "context" in kwargs: + if isinstance(kwargs["question"], list) and isinstance(kwargs["context"], str): + inputs = [{"question": Q, "context": kwargs["context"]} for Q in kwargs["question"]] + elif isinstance(kwargs["question"], list) and isinstance(kwargs["context"], list): + if len(kwargs["question"]) != len(kwargs["context"]): + raise ValueError("Questions and contexts don't have the same lengths") + + inputs = [{"question": Q, "context": C} for Q, C in zip(kwargs["question"], kwargs["context"])] + elif isinstance(kwargs["question"], str) and isinstance(kwargs["context"], str): + inputs = [{"question": kwargs["question"], "context": kwargs["context"]}] + else: + raise ValueError("Arguments can't be understood") + else: + raise ValueError(f"Unknown arguments {kwargs}") + + # When user is sending a generator we need to trust it's a valid example + generator_types = (types.GeneratorType, Dataset) if Dataset is not None else (types.GeneratorType,) + if isinstance(inputs, generator_types): + return inputs + + # Normalize inputs + if isinstance(inputs, dict): + inputs = [inputs] + elif isinstance(inputs, Iterable): + # Copy to avoid overriding arguments + inputs = list(inputs) + else: + raise ValueError(f"Invalid arguments {kwargs}") + + for i, item in enumerate(inputs): + inputs[i] = self.normalize(item) + + return inputs + + +@add_end_docstrings(build_pipeline_init_args(has_tokenizer=True)) +class QuestionAnsweringPipeline(ChunkPipeline): + """ + Question Answering pipeline using any `ModelForQuestionAnswering`. See the [question answering + examples](../task_summary#question-answering) for more information. 
+ + Example: + + ```python + >>> from transformers import pipeline + + >>> oracle = pipeline(model="deepset/roberta-base-squad2") + >>> oracle(question="Where do I live?", context="My name is Wolfgang and I live in Berlin") + {'score': 0.9191, 'start': 34, 'end': 40, 'answer': 'Berlin'} + ``` + + Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial) + + This question answering pipeline can currently be loaded from [`pipeline`] using the following task identifier: + `"question-answering"`. + + The models that this pipeline can use are models that have been fine-tuned on a question answering task. See the + up-to-date list of available models on + [huggingface.co/models](https://huggingface.co/models?filter=question-answering). + """ + + default_input_names = "question,context" + handle_impossible_answer = False + + def __init__( + self, + model: Union["PreTrainedModel", "TFPreTrainedModel"], + tokenizer: PreTrainedTokenizer, + modelcard: Optional[ModelCard] = None, + framework: Optional[str] = None, + task: str = "", + **kwargs, + ): + super().__init__( + model=model, + tokenizer=tokenizer, + modelcard=modelcard, + framework=framework, + task=task, + **kwargs, + ) + + self._args_parser = QuestionAnsweringArgumentHandler() + self.check_model_type( + TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES + if self.framework == "tf" + else MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES + ) + + @staticmethod + def create_sample( + question: Union[str, List[str]], context: Union[str, List[str]] + ) -> Union[SquadExample, List[SquadExample]]: + """ + QuestionAnsweringPipeline leverages the [`SquadExample`] internally. This helper method encapsulate all the + logic for converting question(s) and context(s) to [`SquadExample`]. + + We currently support extractive question answering. + + Arguments: + question (`str` or `List[str]`): The question(s) asked. + context (`str` or `List[str]`): The context(s) in which we will look for the answer. 
+ + Returns: + One or a list of [`SquadExample`]: The corresponding [`SquadExample`] grouping question and context. + """ + if isinstance(question, list): + return [SquadExample(None, q, c, None, None, None) for q, c in zip(question, context)] + else: + return SquadExample(None, question, context, None, None, None) + + def _sanitize_parameters( + self, + padding=None, + topk=None, + top_k=None, + doc_stride=None, + max_answer_len=None, + max_seq_len=None, + max_question_len=None, + handle_impossible_answer=None, + align_to_words=None, + **kwargs, + ): + # Set defaults values + preprocess_params = {} + if padding is not None: + preprocess_params["padding"] = padding + if doc_stride is not None: + preprocess_params["doc_stride"] = doc_stride + if max_question_len is not None: + preprocess_params["max_question_len"] = max_question_len + if max_seq_len is not None: + preprocess_params["max_seq_len"] = max_seq_len + + postprocess_params = {} + if topk is not None and top_k is None: + warnings.warn("topk parameter is deprecated, use top_k instead", UserWarning) + top_k = topk + if top_k is not None: + if top_k < 1: + raise ValueError(f"top_k parameter should be >= 1 (got {top_k})") + postprocess_params["top_k"] = top_k + if max_answer_len is not None: + if max_answer_len < 1: + raise ValueError(f"max_answer_len parameter should be >= 1 (got {max_answer_len}") + if max_answer_len is not None: + postprocess_params["max_answer_len"] = max_answer_len + if handle_impossible_answer is not None: + postprocess_params["handle_impossible_answer"] = handle_impossible_answer + if align_to_words is not None: + postprocess_params["align_to_words"] = align_to_words + return preprocess_params, {}, postprocess_params + + def __call__(self, *args, **kwargs): + """ + Answer the question(s) given as inputs by using the context(s). + + Args: + question (`str` or `List[str]`): + One or several question(s) (must be used in conjunction with the `context` argument). 
+ context (`str` or `List[str]`): + One or several context(s) associated with the question(s) (must be used in conjunction with the + `question` argument). + top_k (`int`, *optional*, defaults to 1): + The number of answers to return (will be chosen by order of likelihood). Note that we return less than + top_k answers if there are not enough options available within the context. + doc_stride (`int`, *optional*, defaults to 128): + If the context is too long to fit with the question for the model, it will be split in several chunks + with some overlap. This argument controls the size of that overlap. + max_answer_len (`int`, *optional*, defaults to 15): + The maximum length of predicted answers (e.g., only answers with a shorter length are considered). + max_seq_len (`int`, *optional*, defaults to 384): + The maximum length of the total sentence (context + question) in tokens of each chunk passed to the + model. The context will be split in several chunks (using `doc_stride` as overlap) if needed. + max_question_len (`int`, *optional*, defaults to 64): + The maximum length of the question after tokenization. It will be truncated if needed. + handle_impossible_answer (`bool`, *optional*, defaults to `False`): + Whether or not we accept impossible as an answer. + align_to_words (`bool`, *optional*, defaults to `True`): + Attempts to align the answer to real words. Improves quality on space separated languages. Might hurt on + non-space-separated languages (like Japanese or Chinese) + + Return: + A `dict` or a list of `dict`: Each result comes as a dictionary with the following keys: + + - **score** (`float`) -- The probability associated to the answer. + - **start** (`int`) -- The character start index of the answer (in the tokenized version of the input). + - **end** (`int`) -- The character end index of the answer (in the tokenized version of the input). + - **answer** (`str`) -- The answer to the question. 
+ """ + + # Convert inputs to features + if args: + warnings.warn( + "Passing a list of SQuAD examples to the pipeline is deprecated and will be removed in v5. Inputs should be passed using the `question` and `context` keyword arguments instead.", + FutureWarning, + ) + + examples = self._args_parser(*args, **kwargs) + if isinstance(examples, (list, tuple)) and len(examples) == 1: + return super().__call__(examples[0], **kwargs) + return super().__call__(examples, **kwargs) + + def preprocess(self, example, padding="do_not_pad", doc_stride=None, max_question_len=64, max_seq_len=None): + # XXX: This is specal, args_parser will not handle anything generator or dataset like + # For those we expect user to send a simple valid example either directly as a SquadExample or simple dict. + # So we still need a little sanitation here. + if isinstance(example, dict): + example = SquadExample(None, example["question"], example["context"], None, None, None) + + if max_seq_len is None: + max_seq_len = min(self.tokenizer.model_max_length, 384) + if doc_stride is None: + doc_stride = min(max_seq_len // 2, 128) + + if doc_stride > max_seq_len: + raise ValueError(f"`doc_stride` ({doc_stride}) is larger than `max_seq_len` ({max_seq_len})") + + if not self.tokenizer.is_fast: + features = squad_convert_examples_to_features( + examples=[example], + tokenizer=self.tokenizer, + max_seq_length=max_seq_len, + doc_stride=doc_stride, + max_query_length=max_question_len, + padding_strategy=PaddingStrategy.MAX_LENGTH, + is_training=False, + tqdm_enabled=False, + ) + else: + # Define the side we want to truncate / pad and the text/pair sorting + question_first = self.tokenizer.padding_side == "right" + + encoded_inputs = self.tokenizer( + text=example.question_text if question_first else example.context_text, + text_pair=example.context_text if question_first else example.question_text, + padding=padding, + truncation="only_second" if question_first else "only_first", + max_length=max_seq_len, + 
stride=doc_stride, + return_token_type_ids=True, + return_overflowing_tokens=True, + return_offsets_mapping=True, + return_special_tokens_mask=True, + ) + # When the input is too long, it's converted in a batch of inputs with overflowing tokens + # and a stride of overlap between the inputs. If a batch of inputs is given, a special output + # "overflow_to_sample_mapping" indicate which member of the encoded batch belong to which original batch sample. + # Here we tokenize examples one-by-one so we don't need to use "overflow_to_sample_mapping". + # "num_span" is the number of output samples generated from the overflowing tokens. + num_spans = len(encoded_inputs["input_ids"]) + + # p_mask: mask with 1 for token than cannot be in the answer (0 for token which can be in an answer) + # We put 0 on the tokens from the context and 1 everywhere else (question and special tokens) + p_mask = [ + [tok != 1 if question_first else 0 for tok in encoded_inputs.sequence_ids(span_id)] + for span_id in range(num_spans) + ] + + features = [] + for span_idx in range(num_spans): + input_ids_span_idx = encoded_inputs["input_ids"][span_idx] + attention_mask_span_idx = ( + encoded_inputs["attention_mask"][span_idx] if "attention_mask" in encoded_inputs else None + ) + token_type_ids_span_idx = ( + encoded_inputs["token_type_ids"][span_idx] if "token_type_ids" in encoded_inputs else None + ) + # keep the cls_token unmasked (some models use it to indicate unanswerable questions) + if self.tokenizer.cls_token_id is not None: + cls_indices = np.nonzero(np.array(input_ids_span_idx) == self.tokenizer.cls_token_id)[0] + for cls_index in cls_indices: + p_mask[span_idx][cls_index] = 0 + submask = p_mask[span_idx] + features.append( + SquadFeatures( + input_ids=input_ids_span_idx, + attention_mask=attention_mask_span_idx, + token_type_ids=token_type_ids_span_idx, + p_mask=submask, + encoding=encoded_inputs[span_idx], + # We don't use the rest of the values - and actually + # for Fast tokenizer we 
could totally avoid using SquadFeatures and SquadExample + cls_index=None, + token_to_orig_map={}, + example_index=0, + unique_id=0, + paragraph_len=0, + token_is_max_context=0, + tokens=[], + start_position=0, + end_position=0, + is_impossible=False, + qas_id=None, + ) + ) + + for i, feature in enumerate(features): + fw_args = {} + others = {} + model_input_names = self.tokenizer.model_input_names + ["p_mask", "token_type_ids"] + + for k, v in feature.__dict__.items(): + if k in model_input_names: + if self.framework == "tf": + tensor = tf.constant(v) + if tensor.dtype == tf.int64: + tensor = tf.cast(tensor, tf.int32) + fw_args[k] = tf.expand_dims(tensor, 0) + elif self.framework == "pt": + tensor = torch.tensor(v) + if tensor.dtype == torch.int32: + tensor = tensor.long() + fw_args[k] = tensor.unsqueeze(0) + else: + others[k] = v + + is_last = i == len(features) - 1 + yield {"example": example, "is_last": is_last, **fw_args, **others} + + def _forward(self, inputs): + example = inputs["example"] + model_inputs = {k: inputs[k] for k in self.tokenizer.model_input_names} + # `XXXForSequenceClassification` models should not use `use_cache=True` even if it's supported + model_forward = self.model.forward if self.framework == "pt" else self.model.call + if "use_cache" in inspect.signature(model_forward).parameters.keys(): + model_inputs["use_cache"] = False + output = self.model(**model_inputs) + if isinstance(output, dict): + return {"start": output["start_logits"], "end": output["end_logits"], "example": example, **inputs} + else: + start, end = output[:2] + return {"start": start, "end": end, "example": example, **inputs} + + def postprocess( + self, + model_outputs, + top_k=1, + handle_impossible_answer=False, + max_answer_len=15, + align_to_words=True, + ): + min_null_score = 1000000 # large and positive + answers = [] + for output in model_outputs: + if self.framework == "pt" and output["start"].dtype == torch.bfloat16: + start_ = 
output["start"].to(torch.float32) + else: + start_ = output["start"] + if self.framework == "pt" and output["start"].dtype == torch.bfloat16: + end_ = output["end"].to(torch.float32) + else: + end_ = output["end"] + example = output["example"] + p_mask = output["p_mask"] + attention_mask = ( + output["attention_mask"].numpy() if output.get("attention_mask", None) is not None else None + ) + + starts, ends, scores, min_null_score = select_starts_ends( + start_, end_, p_mask, attention_mask, min_null_score, top_k, handle_impossible_answer, max_answer_len + ) + + if not self.tokenizer.is_fast: + char_to_word = np.array(example.char_to_word_offset) + + # Convert the answer (tokens) back to the original text + # Score: score from the model + # Start: Index of the first character of the answer in the context string + # End: Index of the character following the last character of the answer in the context string + # Answer: Plain text of the answer + for s, e, score in zip(starts, ends, scores): + token_to_orig_map = output["token_to_orig_map"] + answers.append( + { + "score": score.item(), + "start": np.where(char_to_word == token_to_orig_map[s])[0][0].item(), + "end": np.where(char_to_word == token_to_orig_map[e])[0][-1].item(), + "answer": " ".join(example.doc_tokens[token_to_orig_map[s] : token_to_orig_map[e] + 1]), + } + ) + else: + # Convert the answer (tokens) back to the original text + # Score: score from the model + # Start: Index of the first character of the answer in the context string + # End: Index of the character following the last character of the answer in the context string + # Answer: Plain text of the answer + question_first = bool(self.tokenizer.padding_side == "right") + enc = output["encoding"] + + # Encoding was *not* padded, input_ids *might*. + # It doesn't make a difference unless we're padding on + # the left hand side, since now we have different offsets + # everywhere. 
+ if self.tokenizer.padding_side == "left": + offset = (output["input_ids"] == self.tokenizer.pad_token_id).numpy().sum() + else: + offset = 0 + + # Sometimes the max probability token is in the middle of a word so: + # - we start by finding the right word containing the token with `token_to_word` + # - then we convert this word in a character span with `word_to_chars` + sequence_index = 1 if question_first else 0 + for s, e, score in zip(starts, ends, scores): + s = s - offset + e = e - offset + + start_index, end_index = self.get_indices(enc, s, e, sequence_index, align_to_words) + + answers.append( + { + "score": score.item(), + "start": start_index, + "end": end_index, + "answer": example.context_text[start_index:end_index], + } + ) + + if handle_impossible_answer: + answers.append({"score": min_null_score, "start": 0, "end": 0, "answer": ""}) + answers = sorted(answers, key=lambda x: x["score"], reverse=True)[:top_k] + if len(answers) == 1: + return answers[0] + return answers + + def get_indices( + self, enc: "tokenizers.Encoding", s: int, e: int, sequence_index: int, align_to_words: bool + ) -> Tuple[int, int]: + if align_to_words: + try: + start_word = enc.token_to_word(s) + end_word = enc.token_to_word(e) + start_index = enc.word_to_chars(start_word, sequence_index=sequence_index)[0] + end_index = enc.word_to_chars(end_word, sequence_index=sequence_index)[1] + except Exception: + # Some tokenizers don't really handle words. Keep to offsets then. + start_index = enc.offsets[s][0] + end_index = enc.offsets[e][1] + else: + start_index = enc.offsets[s][0] + end_index = enc.offsets[e][1] + return start_index, end_index + + def span_to_answer(self, text: str, start: int, end: int) -> Dict[str, Union[str, int]]: + """ + When decoding from token probabilities, this method maps token indexes to actual word in the initial context. + + Args: + text (`str`): The actual context to extract the answer from. + start (`int`): The answer starting token index. 
+ end (`int`): The answer end token index. + + Returns: + Dictionary like `{'answer': str, 'start': int, 'end': int}` + """ + words = [] + token_idx = char_start_idx = char_end_idx = chars_idx = 0 + + for i, word in enumerate(text.split(" ")): + token = self.tokenizer.tokenize(word) + + # Append words if they are in the span + if start <= token_idx <= end: + if token_idx == start: + char_start_idx = chars_idx + + if token_idx == end: + char_end_idx = chars_idx + len(word) + + words += [word] + + # Stop if we went over the end of the answer + if token_idx > end: + break + + # Append the subtokenization length to the running index + token_idx += len(token) + chars_idx += len(word) + 1 + + # Join text with spaces + return { + "answer": " ".join(words), + "start": max(0, char_start_idx), + "end": min(len(text), char_end_idx), + } diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/table_question_answering.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/table_question_answering.py new file mode 100644 index 0000000000000000000000000000000000000000..10ea7170fed40cbc6f14c8b712741ce570fbf3f7 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/table_question_answering.py @@ -0,0 +1,443 @@ +import collections +import types + +import numpy as np + +from ..utils import ( + add_end_docstrings, + is_tf_available, + is_torch_available, + requires_backends, +) +from .base import ArgumentHandler, Dataset, Pipeline, PipelineException, build_pipeline_init_args + + +if is_torch_available(): + import torch + + from ..models.auto.modeling_auto import ( + MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES, + MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING_NAMES, + ) + +if is_tf_available(): + import tensorflow as tf + + from ..models.auto.modeling_tf_auto import ( + TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES, + TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING_NAMES, + ) + + +class TableQuestionAnsweringArgumentHandler(ArgumentHandler): + 
""" + Handles arguments for the TableQuestionAnsweringPipeline + """ + + def __call__(self, table=None, query=None, **kwargs): + # Returns tqa_pipeline_inputs of shape: + # [ + # {"table": pd.DataFrame, "query": List[str]}, + # ..., + # {"table": pd.DataFrame, "query" : List[str]} + # ] + requires_backends(self, "pandas") + import pandas as pd + + if table is None: + raise ValueError("Keyword argument `table` cannot be None.") + elif query is None: + if isinstance(table, dict) and table.get("query") is not None and table.get("table") is not None: + tqa_pipeline_inputs = [table] + elif isinstance(table, list) and len(table) > 0: + if not all(isinstance(d, dict) for d in table): + raise ValueError( + f"Keyword argument `table` should be a list of dict, but is {(type(d) for d in table)}" + ) + + if table[0].get("query") is not None and table[0].get("table") is not None: + tqa_pipeline_inputs = table + else: + raise ValueError( + "If keyword argument `table` is a list of dictionaries, each dictionary should have a `table`" + f" and `query` key, but only dictionary has keys {table[0].keys()} `table` and `query` keys." + ) + elif Dataset is not None and isinstance(table, Dataset) or isinstance(table, types.GeneratorType): + return table + else: + raise ValueError( + "Invalid input. Keyword argument `table` should be either of type `dict` or `list`, but " + f"is {type(table)})" + ) + else: + tqa_pipeline_inputs = [{"table": table, "query": query}] + + for tqa_pipeline_input in tqa_pipeline_inputs: + if not isinstance(tqa_pipeline_input["table"], pd.DataFrame): + if tqa_pipeline_input["table"] is None: + raise ValueError("Table cannot be None.") + + tqa_pipeline_input["table"] = pd.DataFrame(tqa_pipeline_input["table"]) + + return tqa_pipeline_inputs + + +@add_end_docstrings(build_pipeline_init_args(has_tokenizer=True)) +class TableQuestionAnsweringPipeline(Pipeline): + """ + Table Question Answering pipeline using a `ModelForTableQuestionAnswering`. 
This pipeline is only available in + PyTorch. + + Example: + + ```python + >>> from transformers import pipeline + + >>> oracle = pipeline(model="google/tapas-base-finetuned-wtq") + >>> table = { + ... "Repository": ["Transformers", "Datasets", "Tokenizers"], + ... "Stars": ["36542", "4512", "3934"], + ... "Contributors": ["651", "77", "34"], + ... "Programming language": ["Python", "Python", "Rust, Python and NodeJS"], + ... } + >>> oracle(query="How many stars does the transformers repository have?", table=table) + {'answer': 'AVERAGE > 36542', 'coordinates': [(0, 1)], 'cells': ['36542'], 'aggregator': 'AVERAGE'} + ``` + + Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial) + + This tabular question answering pipeline can currently be loaded from [`pipeline`] using the following task + identifier: `"table-question-answering"`. + + The models that this pipeline can use are models that have been fine-tuned on a tabular question answering task. + See the up-to-date list of available models on + [huggingface.co/models](https://huggingface.co/models?filter=table-question-answering). 
+ """ + + default_input_names = "table,query" + + def __init__(self, args_parser=TableQuestionAnsweringArgumentHandler(), *args, **kwargs): + super().__init__(*args, **kwargs) + self._args_parser = args_parser + + if self.framework == "tf": + mapping = TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING_NAMES.copy() + mapping.update(TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES) + else: + mapping = MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING_NAMES.copy() + mapping.update(MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES) + self.check_model_type(mapping) + + self.aggregate = bool(getattr(self.model.config, "aggregation_labels", None)) and bool( + getattr(self.model.config, "num_aggregation_labels", None) + ) + self.type = "tapas" if hasattr(self.model.config, "aggregation_labels") else None + + def batch_inference(self, **inputs): + return self.model(**inputs) + + def sequential_inference(self, **inputs): + """ + Inference used for models that need to process sequences in a sequential fashion, like the SQA models which + handle conversational query related to a table. + """ + if self.framework == "pt": + all_logits = [] + all_aggregations = [] + prev_answers = None + batch_size = inputs["input_ids"].shape[0] + + input_ids = inputs["input_ids"].to(self.device) + attention_mask = inputs["attention_mask"].to(self.device) + token_type_ids = inputs["token_type_ids"].to(self.device) + token_type_ids_example = None + + for index in range(batch_size): + # If sequences have already been processed, the token type IDs will be created according to the previous + # answer. 
+ if prev_answers is not None: + prev_labels_example = token_type_ids_example[:, 3] # shape (seq_len,) + model_labels = np.zeros_like(prev_labels_example.cpu().numpy()) # shape (seq_len,) + + token_type_ids_example = token_type_ids[index] # shape (seq_len, 7) + for i in range(model_labels.shape[0]): + segment_id = token_type_ids_example[:, 0].tolist()[i] + col_id = token_type_ids_example[:, 1].tolist()[i] - 1 + row_id = token_type_ids_example[:, 2].tolist()[i] - 1 + + if row_id >= 0 and col_id >= 0 and segment_id == 1: + model_labels[i] = int(prev_answers[(col_id, row_id)]) + + token_type_ids_example[:, 3] = torch.from_numpy(model_labels).type(torch.long).to(self.device) + + input_ids_example = input_ids[index] + attention_mask_example = attention_mask[index] # shape (seq_len,) + token_type_ids_example = token_type_ids[index] # shape (seq_len, 7) + outputs = self.model( + input_ids=input_ids_example.unsqueeze(0), + attention_mask=attention_mask_example.unsqueeze(0), + token_type_ids=token_type_ids_example.unsqueeze(0), + ) + logits = outputs.logits + + if self.aggregate: + all_aggregations.append(outputs.logits_aggregation) + + all_logits.append(logits) + + dist_per_token = torch.distributions.Bernoulli(logits=logits) + probabilities = dist_per_token.probs * attention_mask_example.type(torch.float32).to( + dist_per_token.probs.device + ) + + coords_to_probs = collections.defaultdict(list) + for i, p in enumerate(probabilities.squeeze().tolist()): + segment_id = token_type_ids_example[:, 0].tolist()[i] + col = token_type_ids_example[:, 1].tolist()[i] - 1 + row = token_type_ids_example[:, 2].tolist()[i] - 1 + if col >= 0 and row >= 0 and segment_id == 1: + coords_to_probs[(col, row)].append(p) + + prev_answers = {key: np.array(coords_to_probs[key]).mean() > 0.5 for key in coords_to_probs} + + logits_batch = torch.cat(tuple(all_logits), 0) + + return (logits_batch,) if not self.aggregate else (logits_batch, torch.cat(tuple(all_aggregations), 0)) + else: + all_logits = 
[] + all_aggregations = [] + prev_answers = None + batch_size = inputs["input_ids"].shape[0] + + input_ids = inputs["input_ids"] + attention_mask = inputs["attention_mask"] + token_type_ids = inputs["token_type_ids"].numpy() + token_type_ids_example = None + + for index in range(batch_size): + # If sequences have already been processed, the token type IDs will be created according to the previous + # answer. + if prev_answers is not None: + prev_labels_example = token_type_ids_example[:, 3] # shape (seq_len,) + model_labels = np.zeros_like(prev_labels_example, dtype=np.int32) # shape (seq_len,) + + token_type_ids_example = token_type_ids[index] # shape (seq_len, 7) + for i in range(model_labels.shape[0]): + segment_id = token_type_ids_example[:, 0].tolist()[i] + col_id = token_type_ids_example[:, 1].tolist()[i] - 1 + row_id = token_type_ids_example[:, 2].tolist()[i] - 1 + + if row_id >= 0 and col_id >= 0 and segment_id == 1: + model_labels[i] = int(prev_answers[(col_id, row_id)]) + + token_type_ids_example[:, 3] = model_labels + + input_ids_example = input_ids[index] + attention_mask_example = attention_mask[index] # shape (seq_len,) + token_type_ids_example = token_type_ids[index] # shape (seq_len, 7) + outputs = self.model( + input_ids=np.expand_dims(input_ids_example, axis=0), + attention_mask=np.expand_dims(attention_mask_example, axis=0), + token_type_ids=np.expand_dims(token_type_ids_example, axis=0), + ) + logits = outputs.logits + + if self.aggregate: + all_aggregations.append(outputs.logits_aggregation) + + all_logits.append(logits) + + probabilities = tf.math.sigmoid(tf.cast(logits, tf.float32)) * tf.cast( + attention_mask_example, tf.float32 + ) + + coords_to_probs = collections.defaultdict(list) + token_type_ids_example = token_type_ids_example + for i, p in enumerate(tf.squeeze(probabilities).numpy().tolist()): + segment_id = token_type_ids_example[:, 0].tolist()[i] + col = token_type_ids_example[:, 1].tolist()[i] - 1 + row = token_type_ids_example[:, 
2].tolist()[i] - 1 + if col >= 0 and row >= 0 and segment_id == 1: + coords_to_probs[(col, row)].append(p) + + prev_answers = {key: np.array(coords_to_probs[key]).mean() > 0.5 for key in coords_to_probs} + + logits_batch = tf.concat(tuple(all_logits), 0) + + return (logits_batch,) if not self.aggregate else (logits_batch, tf.concat(tuple(all_aggregations), 0)) + + def __call__(self, *args, **kwargs): + r""" + Answers queries according to a table. The pipeline accepts several types of inputs which are detailed below: + + - `pipeline(table, query)` + - `pipeline(table, [query])` + - `pipeline(table=table, query=query)` + - `pipeline(table=table, query=[query])` + - `pipeline({"table": table, "query": query})` + - `pipeline({"table": table, "query": [query]})` + - `pipeline([{"table": table, "query": query}, {"table": table, "query": query}])` + + The `table` argument should be a dict or a DataFrame built from that dict, containing the whole table: + + Example: + + ```python + data = { + "actors": ["brad pitt", "leonardo di caprio", "george clooney"], + "age": ["56", "45", "59"], + "number of movies": ["87", "53", "69"], + "date of birth": ["7 february 1967", "10 june 1996", "28 november 1967"], + } + ``` + + This dictionary can be passed in as such, or can be converted to a pandas DataFrame: + + Example: + + ```python + import pandas as pd + + table = pd.DataFrame.from_dict(data) + ``` + + Args: + table (`pd.DataFrame` or `Dict`): + Pandas DataFrame or dictionary that will be converted to a DataFrame containing all the table values. + See above for an example of dictionary. + query (`str` or `List[str]`): + Query or list of queries that will be sent to the model alongside the table. + sequential (`bool`, *optional*, defaults to `False`): + Whether to do inference sequentially or as a batch. Batching is faster, but models like SQA require the + inference to be done sequentially to extract relations within sequences, given their conversational + nature. 
+ padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`): + Activates and controls padding. Accepts the following values: + + - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single + sequence if provided). + - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum + acceptable input length for the model if that argument is not provided. + - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different + lengths). + + truncation (`bool`, `str` or [`TapasTruncationStrategy`], *optional*, defaults to `False`): + Activates and controls truncation. Accepts the following values: + + - `True` or `'drop_rows_to_fit'`: Truncate to a maximum length specified with the argument `max_length` + or to the maximum acceptable input length for the model if that argument is not provided. This will + truncate row by row, removing rows from the table. + - `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths + greater than the model maximum admissible input size). + + + Return: + A dictionary or a list of dictionaries containing results: Each result is a dictionary with the following + keys: + + - **answer** (`str`) -- The answer of the query given the table. If there is an aggregator, the answer will + be preceded by `AGGREGATOR >`. + - **coordinates** (`List[Tuple[int, int]]`) -- Coordinates of the cells of the answers. + - **cells** (`List[str]`) -- List of strings made up of the answer cell values. + - **aggregator** (`str`) -- If the model has an aggregator, this returns the aggregator. 
+ """ + pipeline_inputs = self._args_parser(*args, **kwargs) + + results = super().__call__(pipeline_inputs, **kwargs) + if len(results) == 1: + return results[0] + return results + + def _sanitize_parameters(self, sequential=None, padding=None, truncation=None, **kwargs): + preprocess_params = {} + if padding is not None: + preprocess_params["padding"] = padding + if truncation is not None: + preprocess_params["truncation"] = truncation + + forward_params = {} + if sequential is not None: + forward_params["sequential"] = sequential + + if self.assistant_model is not None: + forward_params["assistant_model"] = self.assistant_model + if self.assistant_tokenizer is not None: + forward_params["tokenizer"] = self.tokenizer + forward_params["assistant_tokenizer"] = self.assistant_tokenizer + + return preprocess_params, forward_params, {} + + def preprocess(self, pipeline_input, sequential=None, padding=True, truncation=None): + if truncation is None: + if self.type == "tapas": + truncation = "drop_rows_to_fit" + else: + truncation = "do_not_truncate" + + table, query = pipeline_input["table"], pipeline_input["query"] + if table.empty: + raise ValueError("table is empty") + if query is None or query == "": + raise ValueError("query is empty") + inputs = self.tokenizer(table, query, return_tensors=self.framework, truncation=truncation, padding=padding) + inputs["table"] = table + return inputs + + def _forward(self, model_inputs, sequential=False, **generate_kwargs): + table = model_inputs.pop("table") + + if self.type == "tapas": + if sequential: + outputs = self.sequential_inference(**model_inputs) + else: + outputs = self.batch_inference(**model_inputs) + else: + # User-defined `generation_config` passed to the pipeline call take precedence + if "generation_config" not in generate_kwargs: + generate_kwargs["generation_config"] = self.generation_config + + outputs = self.model.generate(**model_inputs, **generate_kwargs) + model_outputs = {"model_inputs": model_inputs, 
"table": table, "outputs": outputs} + return model_outputs + + def postprocess(self, model_outputs): + inputs = model_outputs["model_inputs"] + table = model_outputs["table"] + outputs = model_outputs["outputs"] + if self.type == "tapas": + if self.aggregate: + logits, logits_agg = outputs[:2] + predictions = self.tokenizer.convert_logits_to_predictions(inputs, logits, logits_agg) + answer_coordinates_batch, agg_predictions = predictions + aggregators = {i: self.model.config.aggregation_labels[pred] for i, pred in enumerate(agg_predictions)} + + no_agg_label_index = self.model.config.no_aggregation_label_index + aggregators_prefix = { + i: aggregators[i] + " > " for i, pred in enumerate(agg_predictions) if pred != no_agg_label_index + } + else: + logits = outputs[0] + predictions = self.tokenizer.convert_logits_to_predictions(inputs, logits) + answer_coordinates_batch = predictions[0] + aggregators = {} + aggregators_prefix = {} + answers = [] + for index, coordinates in enumerate(answer_coordinates_batch): + cells = [table.iat[coordinate] for coordinate in coordinates] + aggregator = aggregators.get(index, "") + aggregator_prefix = aggregators_prefix.get(index, "") + answer = { + "answer": aggregator_prefix + ", ".join(cells), + "coordinates": coordinates, + "cells": [table.iat[coordinate] for coordinate in coordinates], + } + if aggregator: + answer["aggregator"] = aggregator + + answers.append(answer) + if len(answer) == 0: + raise PipelineException("Empty answer") + else: + answers = [{"answer": answer} for answer in self.tokenizer.batch_decode(outputs, skip_special_tokens=True)] + + return answers if len(answers) > 1 else answers[0] diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/text2text_generation.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/text2text_generation.py new file mode 100644 index 0000000000000000000000000000000000000000..9bc7544550286ecb2ad2108d7dffb142cc123877 --- /dev/null +++ 
b/.venv/lib/python3.11/site-packages/transformers/pipelines/text2text_generation.py @@ -0,0 +1,382 @@ +import enum +import warnings + +from ..tokenization_utils import TruncationStrategy +from ..utils import add_end_docstrings, is_tf_available, is_torch_available, logging +from .base import Pipeline, build_pipeline_init_args + + +if is_tf_available(): + import tensorflow as tf + + from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES + +if is_torch_available(): + from ..models.auto.modeling_auto import MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES + +logger = logging.get_logger(__name__) + + +class ReturnType(enum.Enum): + TENSORS = 0 + TEXT = 1 + + +@add_end_docstrings(build_pipeline_init_args(has_tokenizer=True)) +class Text2TextGenerationPipeline(Pipeline): + """ + Pipeline for text to text generation using seq2seq models. + + Example: + + ```python + >>> from transformers import pipeline + + >>> generator = pipeline(model="mrm8488/t5-base-finetuned-question-generation-ap") + >>> generator( + ... "answer: Manuel context: Manuel has created RuPERTa-base with the support of HF-Transformers and Google" + ... ) + [{'generated_text': 'question: Who created the RuPERTa-base?'}] + ``` + + Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial). You can pass text + generation parameters to this pipeline to control stopping criteria, decoding strategy, and more. Learn more about + text generation parameters in [Text generation strategies](../generation_strategies) and [Text + generation](text_generation). + + This Text2TextGenerationPipeline pipeline can currently be loaded from [`pipeline`] using the following task + identifier: `"text2text-generation"`. + + The models that this pipeline can use are models that have been fine-tuned on a translation task. See the + up-to-date list of available models on + [huggingface.co/models](https://huggingface.co/models?filter=text2text-generation). 
For a list of available + parameters, see the [following + documentation](https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.generation.GenerationMixin.generate) + + Usage: + + ```python + text2text_generator = pipeline("text2text-generation") + text2text_generator("question: What is 42 ? context: 42 is the answer to life, the universe and everything") + ```""" + + # Used in the return key of the pipeline. + return_name = "generated" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + self.check_model_type( + TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES + if self.framework == "tf" + else MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING_NAMES + ) + + def _sanitize_parameters( + self, + return_tensors=None, + return_text=None, + return_type=None, + clean_up_tokenization_spaces=None, + truncation=None, + stop_sequence=None, + **generate_kwargs, + ): + preprocess_params = {} + if truncation is not None: + preprocess_params["truncation"] = truncation + + forward_params = generate_kwargs + + postprocess_params = {} + if return_tensors is not None and return_type is None: + return_type = ReturnType.TENSORS if return_tensors else ReturnType.TEXT + if return_type is not None: + postprocess_params["return_type"] = return_type + + if clean_up_tokenization_spaces is not None: + postprocess_params["clean_up_tokenization_spaces"] = clean_up_tokenization_spaces + + if stop_sequence is not None: + stop_sequence_ids = self.tokenizer.encode(stop_sequence, add_special_tokens=False) + if len(stop_sequence_ids) > 1: + warnings.warn( + "Stopping on a multiple token sequence is not yet supported on transformers. The first token of" + " the stop sequence will be used as the stop sequence string in the interim." 
+ ) + generate_kwargs["eos_token_id"] = stop_sequence_ids[0] + + if self.assistant_model is not None: + forward_params["assistant_model"] = self.assistant_model + if self.assistant_tokenizer is not None: + forward_params["tokenizer"] = self.tokenizer + forward_params["assistant_tokenizer"] = self.assistant_tokenizer + + return preprocess_params, forward_params, postprocess_params + + def check_inputs(self, input_length: int, min_length: int, max_length: int): + """ + Checks whether there might be something wrong with given input with regard to the model. + """ + return True + + def _parse_and_tokenize(self, *args, truncation): + prefix = self.prefix if self.prefix is not None else "" + if isinstance(args[0], list): + if self.tokenizer.pad_token_id is None: + raise ValueError("Please make sure that the tokenizer has a pad_token_id when using a batch input") + args = ([prefix + arg for arg in args[0]],) + padding = True + + elif isinstance(args[0], str): + args = (prefix + args[0],) + padding = False + else: + raise ValueError( + f" `args[0]`: {args[0]} have the wrong format. The should be either of type `str` or type `list`" + ) + inputs = self.tokenizer(*args, padding=padding, truncation=truncation, return_tensors=self.framework) + # This is produced by tokenizers but is an invalid generate kwargs + if "token_type_ids" in inputs: + del inputs["token_type_ids"] + return inputs + + def __call__(self, *args, **kwargs): + r""" + Generate the output text(s) using text(s) given as inputs. + + Args: + args (`str` or `List[str]`): + Input text for the encoder. + return_tensors (`bool`, *optional*, defaults to `False`): + Whether or not to include the tensors of predictions (as token indices) in the outputs. + return_text (`bool`, *optional*, defaults to `True`): + Whether or not to include the decoded texts in the outputs. + clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`): + Whether or not to clean up the potential extra spaces in the text output. 
+ truncation (`TruncationStrategy`, *optional*, defaults to `TruncationStrategy.DO_NOT_TRUNCATE`): + The truncation strategy for the tokenization within the pipeline. `TruncationStrategy.DO_NOT_TRUNCATE` + (default) will never truncate, but it is sometimes desirable to truncate the input to fit the model's + max_length instead of throwing an error down the line. + generate_kwargs: + Additional keyword arguments to pass along to the generate method of the model (see the generate method + corresponding to your framework [here](./text_generation)). + + Return: + A list or a list of list of `dict`: Each result comes as a dictionary with the following keys: + + - **generated_text** (`str`, present when `return_text=True`) -- The generated text. + - **generated_token_ids** (`torch.Tensor` or `tf.Tensor`, present when `return_tensors=True`) -- The token + ids of the generated text. + """ + + result = super().__call__(*args, **kwargs) + if ( + isinstance(args[0], list) + and all(isinstance(el, str) for el in args[0]) + and all(len(res) == 1 for res in result) + ): + return [res[0] for res in result] + return result + + def preprocess(self, inputs, truncation=TruncationStrategy.DO_NOT_TRUNCATE, **kwargs): + inputs = self._parse_and_tokenize(inputs, truncation=truncation, **kwargs) + return inputs + + def _forward(self, model_inputs, **generate_kwargs): + if self.framework == "pt": + in_b, input_length = model_inputs["input_ids"].shape + elif self.framework == "tf": + in_b, input_length = tf.shape(model_inputs["input_ids"]).numpy() + + self.check_inputs( + input_length, + generate_kwargs.get("min_length", self.generation_config.min_length), + generate_kwargs.get("max_length", self.generation_config.max_length), + ) + + # User-defined `generation_config` passed to the pipeline call take precedence + if "generation_config" not in generate_kwargs: + generate_kwargs["generation_config"] = self.generation_config + + output_ids = self.model.generate(**model_inputs, 
**generate_kwargs) + out_b = output_ids.shape[0] + if self.framework == "pt": + output_ids = output_ids.reshape(in_b, out_b // in_b, *output_ids.shape[1:]) + elif self.framework == "tf": + output_ids = tf.reshape(output_ids, (in_b, out_b // in_b, *output_ids.shape[1:])) + return {"output_ids": output_ids} + + def postprocess(self, model_outputs, return_type=ReturnType.TEXT, clean_up_tokenization_spaces=False): + records = [] + for output_ids in model_outputs["output_ids"][0]: + if return_type == ReturnType.TENSORS: + record = {f"{self.return_name}_token_ids": output_ids} + elif return_type == ReturnType.TEXT: + record = { + f"{self.return_name}_text": self.tokenizer.decode( + output_ids, + skip_special_tokens=True, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, + ) + } + records.append(record) + return records + + +@add_end_docstrings(build_pipeline_init_args(has_tokenizer=True)) +class SummarizationPipeline(Text2TextGenerationPipeline): + """ + Summarize news articles and other documents. + + This summarizing pipeline can currently be loaded from [`pipeline`] using the following task identifier: + `"summarization"`. + + The models that this pipeline can use are models that have been fine-tuned on a summarization task, which is + currently, '*bart-large-cnn*', '*google-t5/t5-small*', '*google-t5/t5-base*', '*google-t5/t5-large*', '*google-t5/t5-3b*', '*google-t5/t5-11b*'. See the up-to-date + list of available models on [huggingface.co/models](https://huggingface.co/models?filter=summarization). 
For a list + of available parameters, see the [following + documentation](https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.generation.GenerationMixin.generate) + + Usage: + + ```python + # use bart in pytorch + summarizer = pipeline("summarization") + summarizer("An apple a day, keeps the doctor away", min_length=5, max_length=20) + + # use t5 in tf + summarizer = pipeline("summarization", model="google-t5/t5-base", tokenizer="google-t5/t5-base", framework="tf") + summarizer("An apple a day, keeps the doctor away", min_length=5, max_length=20) + ```""" + + # Used in the return key of the pipeline. + return_name = "summary" + + def __call__(self, *args, **kwargs): + r""" + Summarize the text(s) given as inputs. + + Args: + documents (*str* or `List[str]`): + One or several articles (or one list of articles) to summarize. + return_text (`bool`, *optional*, defaults to `True`): + Whether or not to include the decoded texts in the outputs + return_tensors (`bool`, *optional*, defaults to `False`): + Whether or not to include the tensors of predictions (as token indices) in the outputs. + clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`): + Whether or not to clean up the potential extra spaces in the text output. + generate_kwargs: + Additional keyword arguments to pass along to the generate method of the model (see the generate method + corresponding to your framework [here](./text_generation)). + + Return: + A list or a list of list of `dict`: Each result comes as a dictionary with the following keys: + + - **summary_text** (`str`, present when `return_text=True`) -- The summary of the corresponding input. + - **summary_token_ids** (`torch.Tensor` or `tf.Tensor`, present when `return_tensors=True`) -- The token + ids of the summary. 
+ """ + return super().__call__(*args, **kwargs) + + def check_inputs(self, input_length: int, min_length: int, max_length: int) -> bool: + """ + Checks whether there might be something wrong with given input with regard to the model. + """ + if max_length < min_length: + logger.warning(f"Your min_length={min_length} must be inferior than your max_length={max_length}.") + + if input_length < max_length: + logger.warning( + f"Your max_length is set to {max_length}, but your input_length is only {input_length}. Since this is " + "a summarization task, where outputs shorter than the input are typically wanted, you might " + f"consider decreasing max_length manually, e.g. summarizer('...', max_length={input_length//2})" + ) + + +@add_end_docstrings(build_pipeline_init_args(has_tokenizer=True)) +class TranslationPipeline(Text2TextGenerationPipeline): + """ + Translates from one language to another. + + This translation pipeline can currently be loaded from [`pipeline`] using the following task identifier: + `"translation_xx_to_yy"`. + + The models that this pipeline can use are models that have been fine-tuned on a translation task. See the + up-to-date list of available models on [huggingface.co/models](https://huggingface.co/models?filter=translation). + For a list of available parameters, see the [following + documentation](https://huggingface.co/docs/transformers/en/main_classes/text_generation#transformers.generation.GenerationMixin.generate) + + Usage: + + ```python + en_fr_translator = pipeline("translation_en_to_fr") + en_fr_translator("How old are you?") + ```""" + + # Used in the return key of the pipeline. + return_name = "translation" + + def check_inputs(self, input_length: int, min_length: int, max_length: int): + if input_length > 0.9 * max_length: + logger.warning( + f"Your input_length: {input_length} is bigger than 0.9 * max_length: {max_length}. You might consider " + "increasing your max_length manually, e.g. 
translator('...', max_length=400)" + ) + return True + + def preprocess(self, *args, truncation=TruncationStrategy.DO_NOT_TRUNCATE, src_lang=None, tgt_lang=None): + if getattr(self.tokenizer, "_build_translation_inputs", None): + return self.tokenizer._build_translation_inputs( + *args, return_tensors=self.framework, truncation=truncation, src_lang=src_lang, tgt_lang=tgt_lang + ) + else: + return super()._parse_and_tokenize(*args, truncation=truncation) + + def _sanitize_parameters(self, src_lang=None, tgt_lang=None, **kwargs): + preprocess_params, forward_params, postprocess_params = super()._sanitize_parameters(**kwargs) + if src_lang is not None: + preprocess_params["src_lang"] = src_lang + if tgt_lang is not None: + preprocess_params["tgt_lang"] = tgt_lang + if src_lang is None and tgt_lang is None: + # Backward compatibility, direct arguments use is preferred. + task = kwargs.get("task", self.task) + items = task.split("_") + if task and len(items) == 4: + # translation, XX, to YY + preprocess_params["src_lang"] = items[1] + preprocess_params["tgt_lang"] = items[3] + return preprocess_params, forward_params, postprocess_params + + def __call__(self, *args, **kwargs): + r""" + Translate the text(s) given as inputs. + + Args: + args (`str` or `List[str]`): + Texts to be translated. + return_tensors (`bool`, *optional*, defaults to `False`): + Whether or not to include the tensors of predictions (as token indices) in the outputs. + return_text (`bool`, *optional*, defaults to `True`): + Whether or not to include the decoded texts in the outputs. + clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`): + Whether or not to clean up the potential extra spaces in the text output. + src_lang (`str`, *optional*): + The language of the input. Might be required for multilingual models. Will not have any effect for + single pair translation models + tgt_lang (`str`, *optional*): + The language of the desired output. 
Might be required for multilingual models. Will not have any effect + for single pair translation models + generate_kwargs: + Additional keyword arguments to pass along to the generate method of the model (see the generate method + corresponding to your framework [here](./text_generation)). + + Return: + A list or a list of list of `dict`: Each result comes as a dictionary with the following keys: + + - **translation_text** (`str`, present when `return_text=True`) -- The translation. + - **translation_token_ids** (`torch.Tensor` or `tf.Tensor`, present when `return_tensors=True`) -- The + token ids of the translation. + """ + return super().__call__(*args, **kwargs) diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/text_classification.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/text_classification.py new file mode 100644 index 0000000000000000000000000000000000000000..dadb29c386b41e4ca3bd1a49ee103308c3f02174 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/text_classification.py @@ -0,0 +1,236 @@ +import inspect +import warnings +from typing import Dict + +import numpy as np + +from ..utils import ExplicitEnum, add_end_docstrings, is_tf_available, is_torch_available +from .base import GenericTensor, Pipeline, build_pipeline_init_args + + +if is_tf_available(): + from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES + +if is_torch_available(): + from ..models.auto.modeling_auto import MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES + + +def sigmoid(_outputs): + return 1.0 / (1.0 + np.exp(-_outputs)) + + +def softmax(_outputs): + maxes = np.max(_outputs, axis=-1, keepdims=True) + shifted_exp = np.exp(_outputs - maxes) + return shifted_exp / shifted_exp.sum(axis=-1, keepdims=True) + + +class ClassificationFunction(ExplicitEnum): + SIGMOID = "sigmoid" + SOFTMAX = "softmax" + NONE = "none" + + +@add_end_docstrings( + build_pipeline_init_args(has_tokenizer=True), 
+ r""" + return_all_scores (`bool`, *optional*, defaults to `False`): + Whether to return all prediction scores or just the one of the predicted class. + function_to_apply (`str`, *optional*, defaults to `"default"`): + The function to apply to the model outputs in order to retrieve the scores. Accepts four different values: + + - `"default"`: if the model has a single label, will apply the sigmoid function on the output. If the model + has several labels, will apply the softmax function on the output. In case of regression tasks, will not + apply any function on the output. + - `"sigmoid"`: Applies the sigmoid function on the output. + - `"softmax"`: Applies the softmax function on the output. + - `"none"`: Does not apply any function on the output.""", +) +class TextClassificationPipeline(Pipeline): + """ + Text classification pipeline using any `ModelForSequenceClassification`. See the [sequence classification + examples](../task_summary#sequence-classification) for more information. + + Example: + + ```python + >>> from transformers import pipeline + + >>> classifier = pipeline(model="distilbert/distilbert-base-uncased-finetuned-sst-2-english") + >>> classifier("This movie is disgustingly good !") + [{'label': 'POSITIVE', 'score': 1.0}] + + >>> classifier("Director tried too much.") + [{'label': 'NEGATIVE', 'score': 0.996}] + ``` + + Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial) + + This text classification pipeline can currently be loaded from [`pipeline`] using the following task identifier: + `"sentiment-analysis"` (for classifying sequences according to positive or negative sentiments). + + If multiple classification labels are available (`model.config.num_labels >= 2`), the pipeline will run a softmax + over the results. If there is a single label, the pipeline will run a sigmoid over the result. 
In case of regression + tasks (`model.config.problem_type == "regression"`), will not apply any function on the output. + + The models that this pipeline can use are models that have been fine-tuned on a sequence classification task. See + the up-to-date list of available models on + [huggingface.co/models](https://huggingface.co/models?filter=text-classification). + """ + + return_all_scores = False + function_to_apply = ClassificationFunction.NONE + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + self.check_model_type( + TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES + if self.framework == "tf" + else MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES + ) + + def _sanitize_parameters(self, return_all_scores=None, function_to_apply=None, top_k="", **tokenizer_kwargs): + # Using "" as default argument because we're going to use `top_k=None` in user code to declare + # "No top_k" + preprocess_params = tokenizer_kwargs + + postprocess_params = {} + if hasattr(self.model.config, "return_all_scores") and return_all_scores is None: + return_all_scores = self.model.config.return_all_scores + + if isinstance(top_k, int) or top_k is None: + postprocess_params["top_k"] = top_k + postprocess_params["_legacy"] = False + elif return_all_scores is not None: + warnings.warn( + "`return_all_scores` is now deprecated, if want a similar functionality use `top_k=None` instead of" + " `return_all_scores=True` or `top_k=1` instead of `return_all_scores=False`.", + UserWarning, + ) + if return_all_scores: + postprocess_params["top_k"] = None + else: + postprocess_params["top_k"] = 1 + + if isinstance(function_to_apply, str): + function_to_apply = ClassificationFunction[function_to_apply.upper()] + + if function_to_apply is not None: + postprocess_params["function_to_apply"] = function_to_apply + return preprocess_params, {}, postprocess_params + + def __call__(self, inputs, **kwargs): + """ + Classify the text(s) given as inputs. 
+ + Args: + inputs (`str` or `List[str]` or `Dict[str]`, or `List[Dict[str]]`): + One or several texts to classify. In order to use text pairs for your classification, you can send a + dictionary containing `{"text", "text_pair"}` keys, or a list of those. + top_k (`int`, *optional*, defaults to `1`): + How many results to return. + function_to_apply (`str`, *optional*, defaults to `"default"`): + The function to apply to the model outputs in order to retrieve the scores. Accepts four different + values: + + If this argument is not specified, then it will apply the following functions according to the number + of labels: + + - If problem type is regression, will not apply any function on the output. + - If the model has a single label, will apply the sigmoid function on the output. + - If the model has several labels, will apply the softmax function on the output. + + Possible values are: + + - `"sigmoid"`: Applies the sigmoid function on the output. + - `"softmax"`: Applies the softmax function on the output. + - `"none"`: Does not apply any function on the output. + + Return: + A list or a list of list of `dict`: Each result comes as list of dictionaries with the following keys: + + - **label** (`str`) -- The label predicted. + - **score** (`float`) -- The corresponding probability. + + If `top_k` is used, one such dictionary is returned per label. + """ + inputs = (inputs,) + result = super().__call__(*inputs, **kwargs) + # TODO try and retrieve it in a nicer way from _sanitize_parameters. 
+ _legacy = "top_k" not in kwargs + if isinstance(inputs[0], str) and _legacy: + # This pipeline is odd, and return a list when single item is run + return [result] + else: + return result + + def preprocess(self, inputs, **tokenizer_kwargs) -> Dict[str, GenericTensor]: + return_tensors = self.framework + if isinstance(inputs, dict): + return self.tokenizer(**inputs, return_tensors=return_tensors, **tokenizer_kwargs) + elif isinstance(inputs, list) and len(inputs) == 1 and isinstance(inputs[0], list) and len(inputs[0]) == 2: + # It used to be valid to use a list of list of list for text pairs, keeping this path for BC + return self.tokenizer( + text=inputs[0][0], text_pair=inputs[0][1], return_tensors=return_tensors, **tokenizer_kwargs + ) + elif isinstance(inputs, list): + # This is likely an invalid usage of the pipeline attempting to pass text pairs. + raise ValueError( + "The pipeline received invalid inputs, if you are trying to send text pairs, you can try to send a" + ' dictionary `{"text": "My text", "text_pair": "My pair"}` in order to send a text pair.' + ) + return self.tokenizer(inputs, return_tensors=return_tensors, **tokenizer_kwargs) + + def _forward(self, model_inputs): + # `XXXForSequenceClassification` models should not use `use_cache=True` even if it's supported + model_forward = self.model.forward if self.framework == "pt" else self.model.call + if "use_cache" in inspect.signature(model_forward).parameters.keys(): + model_inputs["use_cache"] = False + return self.model(**model_inputs) + + def postprocess(self, model_outputs, function_to_apply=None, top_k=1, _legacy=True): + # `_legacy` is used to determine if we're running the naked pipeline and in backward + # compatibility mode, or if running the pipeline with `pipeline(..., top_k=1)` we're running + # the more natural result containing the list. 
+ # Default value before `set_parameters` + if function_to_apply is None: + if self.model.config.problem_type == "regression": + function_to_apply = ClassificationFunction.NONE + elif self.model.config.problem_type == "multi_label_classification" or self.model.config.num_labels == 1: + function_to_apply = ClassificationFunction.SIGMOID + elif self.model.config.problem_type == "single_label_classification" or self.model.config.num_labels > 1: + function_to_apply = ClassificationFunction.SOFTMAX + elif hasattr(self.model.config, "function_to_apply") and function_to_apply is None: + function_to_apply = self.model.config.function_to_apply + else: + function_to_apply = ClassificationFunction.NONE + + outputs = model_outputs["logits"][0] + + if self.framework == "pt": + # To enable using fp16 and bf16 + outputs = outputs.float().numpy() + else: + outputs = outputs.numpy() + + if function_to_apply == ClassificationFunction.SIGMOID: + scores = sigmoid(outputs) + elif function_to_apply == ClassificationFunction.SOFTMAX: + scores = softmax(outputs) + elif function_to_apply == ClassificationFunction.NONE: + scores = outputs + else: + raise ValueError(f"Unrecognized `function_to_apply` argument: {function_to_apply}") + + if top_k == 1 and _legacy: + return {"label": self.model.config.id2label[scores.argmax().item()], "score": scores.max().item()} + + dict_scores = [ + {"label": self.model.config.id2label[i], "score": score.item()} for i, score in enumerate(scores) + ] + if not _legacy: + dict_scores.sort(key=lambda x: x["score"], reverse=True) + if top_k is not None: + dict_scores = dict_scores[:top_k] + return dict_scores diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/text_generation.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/text_generation.py new file mode 100644 index 0000000000000000000000000000000000000000..c0f14663ffdf5876d1aa4612cf54432974049606 --- /dev/null +++ 
b/.venv/lib/python3.11/site-packages/transformers/pipelines/text_generation.py @@ -0,0 +1,449 @@ +import enum +import itertools +import types +from typing import Dict + +from ..utils import add_end_docstrings, is_tf_available, is_torch_available +from .base import Pipeline, build_pipeline_init_args + + +if is_torch_available(): + from ..models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES + from .pt_utils import KeyDataset + +if is_tf_available(): + import tensorflow as tf + + from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_CAUSAL_LM_MAPPING_NAMES + + +class ReturnType(enum.Enum): + TENSORS = 0 + NEW_TEXT = 1 + FULL_TEXT = 2 + + +class Chat: + """This class is intended to just be used internally in this pipeline and not exposed to users. We convert chats + to this format because the rest of the pipeline code tends to assume that lists of messages are + actually a batch of samples rather than messages in the same conversation.""" + + def __init__(self, messages: Dict): + for message in messages: + if not ("role" in message and "content" in message): + raise ValueError("When passing chat dicts as input, each dict must have a 'role' and 'content' key.") + self.messages = messages + + +@add_end_docstrings(build_pipeline_init_args(has_tokenizer=True)) +class TextGenerationPipeline(Pipeline): + """ + Language generation pipeline using any `ModelWithLMHead`. This pipeline predicts the words that will follow a + specified text prompt. When the underlying model is a conversational model, it can also accept one or more chats, + in which case the pipeline will operate in chat mode and will continue the chat(s) by adding its response(s). + Each chat takes the form of a list of dicts, where each dict contains "role" and "content" keys. 
+ + Examples: + + ```python + >>> from transformers import pipeline + + >>> generator = pipeline(model="openai-community/gpt2") + >>> generator("I can't believe you did such a ", do_sample=False) + [{'generated_text': "I can't believe you did such a icky thing to me. I'm so sorry. I'm so sorry. I'm so sorry. I'm so sorry. I'm so sorry. I'm so sorry. I'm so sorry. I"}] + + >>> # These parameters will return suggestions, and only the newly created text making it easier for prompting suggestions. + >>> outputs = generator("My tart needs some", num_return_sequences=4, return_full_text=False) + ``` + + ```python + >>> from transformers import pipeline + + >>> generator = pipeline(model="HuggingFaceH4/zephyr-7b-beta") + >>> # Zephyr-beta is a conversational model, so let's pass it a chat instead of a single string + >>> generator([{"role": "user", "content": "What is the capital of France? Answer in one word."}], do_sample=False, max_new_tokens=2) + [{'generated_text': [{'role': 'user', 'content': 'What is the capital of France? Answer in one word.'}, {'role': 'assistant', 'content': 'Paris'}]}] + ``` + + Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial). You can pass text + generation parameters to this pipeline to control stopping criteria, decoding strategy, and more. Learn more about + text generation parameters in [Text generation strategies](../generation_strategies) and [Text + generation](text_generation). + + This language generation pipeline can currently be loaded from [`pipeline`] using the following task identifier: + `"text-generation"`. + + The models that this pipeline can use are models that have been trained with an autoregressive language modeling + objective. See the list of available [text completion models](https://huggingface.co/models?filter=text-generation) + and the list of [conversational models](https://huggingface.co/models?other=conversational) + on [huggingface.co/models]. 
+ """ + + # Prefix text to help Transformer-XL and XLNet with short prompts as proposed by Aman Rusia + # in https://github.com/rusiaaman/XLNet-gen#methodology + # and https://medium.com/@amanrusia/xlnet-speaks-comparison-to-gpt-2-ea1a4e9ba39e + + XL_PREFIX = """ + In 1991, the remains of Russian Tsar Nicholas II and his family (except for Alexei and Maria) are discovered. The + voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the remainder of the story. 1883 Western + Siberia, a young Grigori Rasputin is asked by his father and a group of men to perform magic. Rasputin has a vision + and denounces one of the men as a horse thief. Although his father initially slaps him for making such an + accusation, Rasputin watches as the man is chased outside and beaten. Twenty years later, Rasputin sees a vision of + the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous, with people, even a bishop, + begging for his blessing. + """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.check_model_type( + TF_MODEL_FOR_CAUSAL_LM_MAPPING_NAMES if self.framework == "tf" else MODEL_FOR_CAUSAL_LM_MAPPING_NAMES + ) + if "prefix" not in self._preprocess_params: + # This is very specific. The logic is quite complex and needs to be done + # as a "default". + # It also defines both some preprocess_kwargs and generate_kwargs + # which is why we cannot put them in their respective methods. + prefix = None + if self.prefix is not None: + prefix = self.prefix + if prefix is None and self.model.__class__.__name__ in [ + "XLNetLMHeadModel", + "TransfoXLLMHeadModel", + "TFXLNetLMHeadModel", + "TFTransfoXLLMHeadModel", + ]: + # For XLNet and TransformerXL we add an article to the prompt to give more state to the model. + prefix = self.XL_PREFIX + if prefix is not None: + # Recalculate some generate_kwargs linked to prefix. 
+ preprocess_params, forward_params, _ = self._sanitize_parameters(prefix=prefix, **self._forward_params) + self._preprocess_params = {**self._preprocess_params, **preprocess_params} + self._forward_params = {**self._forward_params, **forward_params} + + def _sanitize_parameters( + self, + return_full_text=None, + return_tensors=None, + return_text=None, + return_type=None, + clean_up_tokenization_spaces=None, + prefix=None, + handle_long_generation=None, + stop_sequence=None, + truncation=None, + max_length=None, + continue_final_message=None, + **generate_kwargs, + ): + preprocess_params = {} + + add_special_tokens = False + if "add_special_tokens" in generate_kwargs: + add_special_tokens = preprocess_params["add_special_tokens"] = generate_kwargs.pop("add_special_tokens") + + if "padding" in generate_kwargs: + preprocess_params["padding"] = generate_kwargs.pop("padding") + + if truncation is not None: + preprocess_params["truncation"] = truncation + + if max_length is not None: + preprocess_params["max_length"] = max_length + generate_kwargs["max_length"] = max_length + + if prefix is not None: + preprocess_params["prefix"] = prefix + if prefix: + prefix_inputs = self.tokenizer( + prefix, padding=False, add_special_tokens=add_special_tokens, return_tensors=self.framework + ) + generate_kwargs["prefix_length"] = prefix_inputs["input_ids"].shape[-1] + + if handle_long_generation is not None: + if handle_long_generation not in {"hole"}: + raise ValueError( + f"{handle_long_generation} is not a valid value for `handle_long_generation` parameter expected" + " [None, 'hole']" + ) + preprocess_params["handle_long_generation"] = handle_long_generation + + if continue_final_message is not None: + preprocess_params["continue_final_message"] = continue_final_message + + preprocess_params.update(generate_kwargs) + forward_params = generate_kwargs + + postprocess_params = {} + if return_full_text is not None and return_type is None: + if return_text is not None: + raise 
ValueError("`return_text` is mutually exclusive with `return_full_text`") + if return_tensors is not None: + raise ValueError("`return_full_text` is mutually exclusive with `return_tensors`") + return_type = ReturnType.FULL_TEXT if return_full_text else ReturnType.NEW_TEXT + if return_tensors is not None and return_type is None: + if return_text is not None: + raise ValueError("`return_text` is mutually exclusive with `return_tensors`") + return_type = ReturnType.TENSORS + if return_type is not None: + postprocess_params["return_type"] = return_type + if clean_up_tokenization_spaces is not None: + postprocess_params["clean_up_tokenization_spaces"] = clean_up_tokenization_spaces + if continue_final_message is not None: + postprocess_params["continue_final_message"] = continue_final_message + + if stop_sequence is not None: + stop_sequence_ids = self.tokenizer.encode(stop_sequence, add_special_tokens=False) + generate_kwargs["eos_token_id"] = stop_sequence_ids + + if self.assistant_model is not None: + forward_params["assistant_model"] = self.assistant_model + if self.assistant_tokenizer is not None: + forward_params["tokenizer"] = self.tokenizer + forward_params["assistant_tokenizer"] = self.assistant_tokenizer + + return preprocess_params, forward_params, postprocess_params + + # overriding _parse_and_tokenize to allow for unusual language-modeling tokenizer arguments + def _parse_and_tokenize(self, *args, **kwargs): + """ + Parse arguments and tokenize + """ + # Parse arguments + if self.model.__class__.__name__ in ["TransfoXLLMHeadModel"]: + kwargs.update({"add_space_before_punct_symbol": True}) + + return super()._parse_and_tokenize(*args, **kwargs) + + def __call__(self, text_inputs, **kwargs): + """ + Complete the prompt(s) given as inputs. + + Args: + text_inputs (`str`, `List[str]`, List[Dict[str, str]], or `List[List[Dict[str, str]]]`): + One or several prompts (or one list of prompts) to complete. 
If strings or a list of string are + passed, this pipeline will continue each prompt. Alternatively, a "chat", in the form of a list + of dicts with "role" and "content" keys, can be passed, or a list of such chats. When chats are passed, + the model's chat template will be used to format them before passing them to the model. + return_tensors (`bool`, *optional*, defaults to `False`): + Returns the tensors of predictions (as token indices) in the outputs. If set to + `True`, the decoded text is not returned. + return_text (`bool`, *optional*): + Returns the decoded texts in the outputs. + return_full_text (`bool`, *optional*, defaults to `True`): + If set to `False` only added text is returned, otherwise the full text is returned. Cannot be + specified at the same time as `return_text`. + clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`): + Whether or not to clean up the potential extra spaces in the text output. + continue_final_message( `bool`, *optional*): This indicates that you want the model to continue the + last message in the input chat rather than starting a new one, allowing you to "prefill" its response. + By default this is `True` when the final message in the input chat has the `assistant` role and + `False` otherwise, but you can manually override that behaviour by setting this flag. + prefix (`str`, *optional*): + Prefix added to prompt. + handle_long_generation (`str`, *optional*): + By default, this pipelines does not handle long generation (ones that exceed in one form or the other + the model maximum length). There is no perfect way to adress this (more info + :https://github.com/huggingface/transformers/issues/14033#issuecomment-948385227). This provides common + strategies to work around that problem depending on your use case. 
+ + - `None` : default strategy where nothing in particular happens + - `"hole"`: Truncates left of input, and leaves a gap wide enough to let generation happen (might + truncate a lot of the prompt and not suitable when generation exceed the model capacity) + generate_kwargs (`dict`, *optional*): + Additional keyword arguments to pass along to the generate method of the model (see the generate method + corresponding to your framework [here](./text_generation)). + + Return: + A list or a list of lists of `dict`: Returns one of the following dictionaries (cannot return a combination + of both `generated_text` and `generated_token_ids`): + + - **generated_text** (`str`, present when `return_text=True`) -- The generated text. + - **generated_token_ids** (`torch.Tensor` or `tf.Tensor`, present when `return_tensors=True`) -- The token + ids of the generated text. + """ + if isinstance( + text_inputs, + (list, tuple, types.GeneratorType, KeyDataset) + if is_torch_available() + else (list, tuple, types.GeneratorType), + ): + if isinstance(text_inputs, types.GeneratorType): + text_inputs, _ = itertools.tee(text_inputs) + text_inputs, first_item = (x for x in text_inputs), next(_) + else: + first_item = text_inputs[0] + if isinstance(first_item, (list, tuple, dict)): + # We have one or more prompts in list-of-dicts format, so this is chat mode + if isinstance(first_item, dict): + return super().__call__(Chat(text_inputs), **kwargs) + else: + chats = (Chat(chat) for chat in text_inputs) # 🐈 🐈 🐈 + if isinstance(text_inputs, types.GeneratorType): + return super().__call__(chats, **kwargs) + else: + return super().__call__(list(chats), **kwargs) + return super().__call__(text_inputs, **kwargs) + + def preprocess( + self, + prompt_text, + prefix="", + handle_long_generation=None, + add_special_tokens=None, + truncation=None, + padding=None, + max_length=None, + continue_final_message=None, + **generate_kwargs, + ): + # Only set non-None tokenizer kwargs, so as to rely on the 
tokenizer's defaults + tokenizer_kwargs = { + "add_special_tokens": add_special_tokens, + "truncation": truncation, + "padding": padding, + "max_length": max_length, + } + tokenizer_kwargs = {key: value for key, value in tokenizer_kwargs.items() if value is not None} + + if isinstance(prompt_text, Chat): + tokenizer_kwargs.pop("add_special_tokens", None) # ignore add_special_tokens on chats + # If the user passes a chat that ends in an assistant message, we treat it as a prefill by default + # because very few models support multiple separate, consecutive assistant messages + if continue_final_message is None: + continue_final_message = prompt_text.messages[-1]["role"] == "assistant" + inputs = self.tokenizer.apply_chat_template( + prompt_text.messages, + add_generation_prompt=not continue_final_message, + continue_final_message=continue_final_message, + return_dict=True, + return_tensors=self.framework, + **tokenizer_kwargs, + ) + else: + inputs = self.tokenizer(prefix + prompt_text, return_tensors=self.framework, **tokenizer_kwargs) + + inputs["prompt_text"] = prompt_text + + if handle_long_generation == "hole": + cur_len = inputs["input_ids"].shape[-1] + if "max_new_tokens" in generate_kwargs: + new_tokens = generate_kwargs["max_new_tokens"] + else: + new_tokens = generate_kwargs.get("max_length", self.generation_config.max_length) - cur_len + if new_tokens < 0: + raise ValueError("We cannot infer how many new tokens are expected") + if cur_len + new_tokens > self.tokenizer.model_max_length: + keep_length = self.tokenizer.model_max_length - new_tokens + if keep_length <= 0: + raise ValueError( + "We cannot use `hole` to handle this generation the number of desired tokens exceeds the" + " models max length" + ) + + inputs["input_ids"] = inputs["input_ids"][:, -keep_length:] + if "attention_mask" in inputs: + inputs["attention_mask"] = inputs["attention_mask"][:, -keep_length:] + + return inputs + + def _forward(self, model_inputs, **generate_kwargs): + input_ids 
= model_inputs["input_ids"] + attention_mask = model_inputs.get("attention_mask", None) + # Allow empty prompts + if input_ids.shape[1] == 0: + input_ids = None + attention_mask = None + in_b = 1 + else: + in_b = input_ids.shape[0] + prompt_text = model_inputs.pop("prompt_text") + + # If there is a prefix, we may need to adjust the generation length. Do so without permanently modifying + # generate_kwargs, as some of the parameterization may come from the initialization of the pipeline. + prefix_length = generate_kwargs.pop("prefix_length", 0) + if prefix_length > 0: + has_max_new_tokens = "max_new_tokens" in generate_kwargs or ( + "generation_config" in generate_kwargs + and generate_kwargs["generation_config"].max_new_tokens is not None + ) + if not has_max_new_tokens: + generate_kwargs["max_length"] = generate_kwargs.get("max_length") or self.generation_config.max_length + generate_kwargs["max_length"] += prefix_length + has_min_new_tokens = "min_new_tokens" in generate_kwargs or ( + "generation_config" in generate_kwargs + and generate_kwargs["generation_config"].min_new_tokens is not None + ) + if not has_min_new_tokens and "min_length" in generate_kwargs: + generate_kwargs["min_length"] += prefix_length + + # User-defined `generation_config` passed to the pipeline call take precedence + if "generation_config" not in generate_kwargs: + generate_kwargs["generation_config"] = self.generation_config + + generated_sequence = self.model.generate(input_ids=input_ids, attention_mask=attention_mask, **generate_kwargs) + out_b = generated_sequence.shape[0] + if self.framework == "pt": + generated_sequence = generated_sequence.reshape(in_b, out_b // in_b, *generated_sequence.shape[1:]) + elif self.framework == "tf": + generated_sequence = tf.reshape(generated_sequence, (in_b, out_b // in_b, *generated_sequence.shape[1:])) + return {"generated_sequence": generated_sequence, "input_ids": input_ids, "prompt_text": prompt_text} + + def postprocess( + self, + model_outputs, 
+ return_type=ReturnType.FULL_TEXT, + clean_up_tokenization_spaces=True, + continue_final_message=None, + ): + generated_sequence = model_outputs["generated_sequence"][0] + input_ids = model_outputs["input_ids"] + prompt_text = model_outputs["prompt_text"] + generated_sequence = generated_sequence.numpy().tolist() + records = [] + for sequence in generated_sequence: + if return_type == ReturnType.TENSORS: + record = {"generated_token_ids": sequence} + elif return_type in {ReturnType.NEW_TEXT, ReturnType.FULL_TEXT}: + # Decode text + text = self.tokenizer.decode( + sequence, + skip_special_tokens=True, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, + ) + + # Remove PADDING prompt of the sequence if XLNet or Transfo-XL model is used + if input_ids is None: + prompt_length = 0 + else: + prompt_length = len( + self.tokenizer.decode( + input_ids[0], + skip_special_tokens=True, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, + ) + ) + + all_text = text[prompt_length:] + if return_type == ReturnType.FULL_TEXT: + if isinstance(prompt_text, str): + all_text = prompt_text + all_text + elif isinstance(prompt_text, Chat): + if continue_final_message is None: + # If the user passes a chat ending in an assistant message, we treat it as a prefill by + # default because very few models support multiple separate, consecutive assistant messages + continue_final_message = prompt_text.messages[-1]["role"] == "assistant" + if continue_final_message: + # With assistant prefill, concat onto the end of the last message + all_text = list(prompt_text.messages)[:-1] + [ + { + "role": prompt_text.messages[-1]["role"], + "content": prompt_text.messages[-1]["content"] + all_text, + } + ] + else: + # When we're not starting from a prefill, the output is a new assistant message + all_text = list(prompt_text.messages) + [{"role": "assistant", "content": all_text}] + record = {"generated_text": all_text} + records.append(record) + + return records diff --git 
a/.venv/lib/python3.11/site-packages/transformers/pipelines/text_to_audio.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/text_to_audio.py new file mode 100644 index 0000000000000000000000000000000000000000..b7beca586d21957b2eb3ec2dbb7daa2c49453970 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/text_to_audio.py @@ -0,0 +1,219 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.from typing import List, Union +from typing import List, Union + +from ..utils import is_torch_available +from .base import Pipeline + + +if is_torch_available(): + from ..models.auto.modeling_auto import MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING + from ..models.speecht5.modeling_speecht5 import SpeechT5HifiGan + +DEFAULT_VOCODER_ID = "microsoft/speecht5_hifigan" + + +class TextToAudioPipeline(Pipeline): + """ + Text-to-audio generation pipeline using any `AutoModelForTextToWaveform` or `AutoModelForTextToSpectrogram`. This + pipeline generates an audio file from an input text and optional other conditional inputs. 
+ + Example: + + ```python + >>> from transformers import pipeline + + >>> pipe = pipeline(model="suno/bark-small") + >>> output = pipe("Hey it's HuggingFace on the phone!") + + >>> audio = output["audio"] + >>> sampling_rate = output["sampling_rate"] + ``` + + Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial) + + + + You can specify parameters passed to the model by using [`TextToAudioPipeline.__call__.forward_params`] or + [`TextToAudioPipeline.__call__.generate_kwargs`]. + + Example: + + ```python + >>> from transformers import pipeline + + >>> music_generator = pipeline(task="text-to-audio", model="facebook/musicgen-small", framework="pt") + + >>> # diversify the music generation by adding randomness with a high temperature and set a maximum music length + >>> generate_kwargs = { + ... "do_sample": True, + ... "temperature": 0.7, + ... "max_new_tokens": 35, + ... } + + >>> outputs = music_generator("Techno music with high melodic riffs", generate_kwargs=generate_kwargs) + ``` + + + + This pipeline can currently be loaded from [`pipeline`] using the following task identifiers: `"text-to-speech"` or + `"text-to-audio"`. + + See the list of available models on [huggingface.co/models](https://huggingface.co/models?filter=text-to-speech). 
+ """ + + def __init__(self, *args, vocoder=None, sampling_rate=None, **kwargs): + super().__init__(*args, **kwargs) + + if self.framework == "tf": + raise ValueError("The TextToAudioPipeline is only available in PyTorch.") + + self.vocoder = None + if self.model.__class__ in MODEL_FOR_TEXT_TO_SPECTROGRAM_MAPPING.values(): + self.vocoder = ( + SpeechT5HifiGan.from_pretrained(DEFAULT_VOCODER_ID).to(self.model.device) + if vocoder is None + else vocoder + ) + + self.sampling_rate = sampling_rate + if self.vocoder is not None: + self.sampling_rate = self.vocoder.config.sampling_rate + + if self.sampling_rate is None: + # get sampling_rate from config and generation config + + config = self.model.config + gen_config = self.model.__dict__.get("generation_config", None) + if gen_config is not None: + config.update(gen_config.to_dict()) + + for sampling_rate_name in ["sample_rate", "sampling_rate"]: + sampling_rate = getattr(config, sampling_rate_name, None) + if sampling_rate is not None: + self.sampling_rate = sampling_rate + + def preprocess(self, text, **kwargs): + if isinstance(text, str): + text = [text] + + if self.model.config.model_type == "bark": + # bark Tokenizer is called with BarkProcessor which uses those kwargs + new_kwargs = { + "max_length": self.generation_config.semantic_config.get("max_input_semantic_length", 256), + "add_special_tokens": False, + "return_attention_mask": True, + "return_token_type_ids": False, + "padding": "max_length", + } + + # priority is given to kwargs + new_kwargs.update(kwargs) + + kwargs = new_kwargs + + output = self.tokenizer(text, **kwargs, return_tensors="pt") + + return output + + def _forward(self, model_inputs, **kwargs): + # we expect some kwargs to be additional tensors which need to be on the right device + kwargs = self._ensure_tensor_on_device(kwargs, device=self.device) + forward_params = kwargs["forward_params"] + generate_kwargs = kwargs["generate_kwargs"] + + if self.model.can_generate(): + # we expect some 
kwargs to be additional tensors which need to be on the right device + generate_kwargs = self._ensure_tensor_on_device(generate_kwargs, device=self.device) + + # User-defined `generation_config` passed to the pipeline call take precedence + if "generation_config" not in generate_kwargs: + generate_kwargs["generation_config"] = self.generation_config + + # generate_kwargs get priority over forward_params + forward_params.update(generate_kwargs) + + output = self.model.generate(**model_inputs, **forward_params) + else: + if len(generate_kwargs): + raise ValueError( + "You're using the `TextToAudioPipeline` with a forward-only model, but `generate_kwargs` is non " + "empty. For forward-only TTA models, please use `forward_params` instead of `generate_kwargs`. " + f"For reference, the `generate_kwargs` used here are: {generate_kwargs.keys()}" + ) + output = self.model(**model_inputs, **forward_params)[0] + + if self.vocoder is not None: + # in that case, the output is a spectrogram that needs to be converted into a waveform + output = self.vocoder(output) + + return output + + def __call__(self, text_inputs: Union[str, List[str]], **forward_params): + """ + Generates speech/audio from the inputs. See the [`TextToAudioPipeline`] documentation for more information. + + Args: + text_inputs (`str` or `List[str]`): + The text(s) to generate. + forward_params (`dict`, *optional*): + Parameters passed to the model generation/forward method. `forward_params` are always passed to the + underlying model. + generate_kwargs (`dict`, *optional*): + The dictionary of ad-hoc parametrization of `generate_config` to be used for the generation call. For a + complete overview of generate, check the [following + guide](https://huggingface.co/docs/transformers/en/main_classes/text_generation). `generate_kwargs` are + only passed to the underlying model if the latter is a generative model. 
+ + Return: + A `dict` or a list of `dict`: The dictionaries have two keys: + + - **audio** (`np.ndarray` of shape `(nb_channels, audio_length)`) -- The generated audio waveform. + - **sampling_rate** (`int`) -- The sampling rate of the generated audio waveform. + """ + return super().__call__(text_inputs, **forward_params) + + def _sanitize_parameters( + self, + preprocess_params=None, + forward_params=None, + generate_kwargs=None, + ): + if self.assistant_model is not None: + generate_kwargs["assistant_model"] = self.assistant_model + if self.assistant_tokenizer is not None: + generate_kwargs["tokenizer"] = self.tokenizer + generate_kwargs["assistant_tokenizer"] = self.assistant_tokenizer + + params = { + "forward_params": forward_params if forward_params else {}, + "generate_kwargs": generate_kwargs if generate_kwargs else {}, + } + + if preprocess_params is None: + preprocess_params = {} + postprocess_params = {} + + return preprocess_params, params, postprocess_params + + def postprocess(self, waveform): + output_dict = {} + if isinstance(waveform, dict): + waveform = waveform["waveform"] + elif isinstance(waveform, tuple): + waveform = waveform[0] + output_dict["audio"] = waveform.cpu().float().numpy() + output_dict["sampling_rate"] = self.sampling_rate + + return output_dict diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/token_classification.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/token_classification.py new file mode 100644 index 0000000000000000000000000000000000000000..9256f238148476b4d923c84f884156b4564c93a7 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/token_classification.py @@ -0,0 +1,576 @@ +import types +import warnings +from typing import List, Optional, Tuple, Union + +import numpy as np + +from ..models.bert.tokenization_bert import BasicTokenizer +from ..utils import ( + ExplicitEnum, + add_end_docstrings, + is_tf_available, + is_torch_available, +) +from .base import 
ArgumentHandler, ChunkPipeline, Dataset, build_pipeline_init_args


if is_tf_available():
    import tensorflow as tf

    from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES
if is_torch_available():
    import torch

    from ..models.auto.modeling_auto import MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES


class TokenClassificationArgumentHandler(ArgumentHandler):
    """
    Handles arguments for token classification.

    Normalizes the pipeline inputs to a list of strings (streaming inputs such as
    `Dataset` objects and generators are passed through untouched) and validates any
    user-provided `offset_mapping` against the batch size.
    """

    def __call__(self, inputs: Union[str, List[str]], **kwargs):
        # Non-empty list/tuple of texts -> copy to a plain list and record the batch size.
        if inputs is not None and isinstance(inputs, (list, tuple)) and len(inputs) > 0:
            inputs = list(inputs)
            batch_size = len(inputs)
        elif isinstance(inputs, str):
            inputs = [inputs]
            batch_size = 1
        # NOTE: `and` binds tighter than `or` here, so this reads as
        # `(Dataset is not None and isinstance(inputs, Dataset)) or isinstance(inputs, GeneratorType)`.
        # Streaming inputs are returned as-is with no offset_mapping.
        elif Dataset is not None and isinstance(inputs, Dataset) or isinstance(inputs, types.GeneratorType):
            return inputs, None
        else:
            raise ValueError("At least one input is required.")

        offset_mapping = kwargs.get("offset_mapping")
        if offset_mapping:
            # A single sentence's mapping (list of tuples) is wrapped into a batch of one.
            if isinstance(offset_mapping, list) and isinstance(offset_mapping[0], tuple):
                offset_mapping = [offset_mapping]
            if len(offset_mapping) != batch_size:
                raise ValueError("offset_mapping should have the same batch size as the input")
        return inputs, offset_mapping


class AggregationStrategy(ExplicitEnum):
    """All the valid aggregation strategies for TokenClassificationPipeline"""

    NONE = "none"
    SIMPLE = "simple"
    FIRST = "first"
    AVERAGE = "average"
    MAX = "max"


@add_end_docstrings(
    build_pipeline_init_args(has_tokenizer=True),
    r"""
        ignore_labels (`List[str]`, defaults to `["O"]`):
            A list of labels to ignore.
        grouped_entities (`bool`, *optional*, defaults to `False`):
            DEPRECATED, use `aggregation_strategy` instead. Whether or not to group the tokens corresponding to the
            same entity together in the predictions or not.
        stride (`int`, *optional*):
            If stride is provided, the pipeline is applied on all the text.
The text is split into chunks of size + model_max_length. Works only with fast tokenizers and `aggregation_strategy` different from `NONE`. The + value of this argument defines the number of overlapping tokens between chunks. In other words, the model + will shift forward by `tokenizer.model_max_length - stride` tokens each step. + aggregation_strategy (`str`, *optional*, defaults to `"none"`): + The strategy to fuse (or not) tokens based on the model prediction. + + - "none" : Will simply not do any aggregation and simply return raw results from the model + - "simple" : Will attempt to group entities following the default schema. (A, B-TAG), (B, I-TAG), (C, + I-TAG), (D, B-TAG2) (E, B-TAG2) will end up being [{"word": ABC, "entity": "TAG"}, {"word": "D", + "entity": "TAG2"}, {"word": "E", "entity": "TAG2"}] Notice that two consecutive B tags will end up as + different entities. On word based languages, we might end up splitting words undesirably : Imagine + Microsoft being tagged as [{"word": "Micro", "entity": "ENTERPRISE"}, {"word": "soft", "entity": + "NAME"}]. Look for FIRST, MAX, AVERAGE for ways to mitigate that and disambiguate words (on languages + that support that meaning, which is basically tokens separated by a space). These mitigations will + only work on real words, "New york" might still be tagged with two different entities. + - "first" : (works only on word based models) Will use the `SIMPLE` strategy except that words, cannot + end up with different tags. Words will simply use the tag of the first token of the word when there + is ambiguity. + - "average" : (works only on word based models) Will use the `SIMPLE` strategy except that words, + cannot end up with different tags. scores will be averaged first across tokens, and then the maximum + label is applied. + - "max" : (works only on word based models) Will use the `SIMPLE` strategy except that words, cannot + end up with different tags. 
Word entity will simply be the token with the maximum score.""", +) +class TokenClassificationPipeline(ChunkPipeline): + """ + Named Entity Recognition pipeline using any `ModelForTokenClassification`. See the [named entity recognition + examples](../task_summary#named-entity-recognition) for more information. + + Example: + + ```python + >>> from transformers import pipeline + + >>> token_classifier = pipeline(model="Jean-Baptiste/camembert-ner", aggregation_strategy="simple") + >>> sentence = "Je m'appelle jean-baptiste et je vis à montréal" + >>> tokens = token_classifier(sentence) + >>> tokens + [{'entity_group': 'PER', 'score': 0.9931, 'word': 'jean-baptiste', 'start': 12, 'end': 26}, {'entity_group': 'LOC', 'score': 0.998, 'word': 'montréal', 'start': 38, 'end': 47}] + + >>> token = tokens[0] + >>> # Start and end provide an easy way to highlight words in the original text. + >>> sentence[token["start"] : token["end"]] + ' jean-baptiste' + + >>> # Some models use the same idea to do part of speech. 
+ >>> syntaxer = pipeline(model="vblagoje/bert-english-uncased-finetuned-pos", aggregation_strategy="simple") + >>> syntaxer("My name is Sarah and I live in London") + [{'entity_group': 'PRON', 'score': 0.999, 'word': 'my', 'start': 0, 'end': 2}, {'entity_group': 'NOUN', 'score': 0.997, 'word': 'name', 'start': 3, 'end': 7}, {'entity_group': 'AUX', 'score': 0.994, 'word': 'is', 'start': 8, 'end': 10}, {'entity_group': 'PROPN', 'score': 0.999, 'word': 'sarah', 'start': 11, 'end': 16}, {'entity_group': 'CCONJ', 'score': 0.999, 'word': 'and', 'start': 17, 'end': 20}, {'entity_group': 'PRON', 'score': 0.999, 'word': 'i', 'start': 21, 'end': 22}, {'entity_group': 'VERB', 'score': 0.998, 'word': 'live', 'start': 23, 'end': 27}, {'entity_group': 'ADP', 'score': 0.999, 'word': 'in', 'start': 28, 'end': 30}, {'entity_group': 'PROPN', 'score': 0.999, 'word': 'london', 'start': 31, 'end': 37}] + ``` + + Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial) + + This token recognition pipeline can currently be loaded from [`pipeline`] using the following task identifier: + `"ner"` (for predicting the classes of tokens in a sequence: person, organisation, location or miscellaneous). + + The models that this pipeline can use are models that have been fine-tuned on a token classification task. See the + up-to-date list of available models on + [huggingface.co/models](https://huggingface.co/models?filter=token-classification). 
    """

    default_input_names = "sequences"

    def __init__(self, args_parser=TokenClassificationArgumentHandler(), *args, **kwargs):
        # NOTE: the default `args_parser` instance is created once at definition time
        # and shared by every pipeline instance; this is safe because the handler as
        # defined above keeps no state between calls.
        super().__init__(*args, **kwargs)
        self.check_model_type(
            TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES
            if self.framework == "tf"
            else MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES
        )

        # NOTE(review): `_basic_tokenizer` is not referenced anywhere in this module's
        # visible code — presumably kept for backward compatibility; confirm before removing.
        self._basic_tokenizer = BasicTokenizer(do_lower_case=False)
        self._args_parser = args_parser

    def _sanitize_parameters(
        self,
        ignore_labels=None,
        grouped_entities: Optional[bool] = None,
        ignore_subwords: Optional[bool] = None,
        aggregation_strategy: Optional[AggregationStrategy] = None,
        offset_mapping: Optional[List[Tuple[int, int]]] = None,
        stride: Optional[int] = None,
    ):
        # Split the call kwargs into (preprocess, forward, postprocess) dicts as
        # required by the base Pipeline contract.
        preprocess_params = {}
        if offset_mapping is not None:
            preprocess_params["offset_mapping"] = offset_mapping

        postprocess_params = {}
        # Map the deprecated (grouped_entities, ignore_subwords) pair onto the
        # equivalent aggregation_strategy, warning about the deprecation.
        if grouped_entities is not None or ignore_subwords is not None:
            if grouped_entities and ignore_subwords:
                aggregation_strategy = AggregationStrategy.FIRST
            elif grouped_entities and not ignore_subwords:
                aggregation_strategy = AggregationStrategy.SIMPLE
            else:
                aggregation_strategy = AggregationStrategy.NONE

            if grouped_entities is not None:
                warnings.warn(
                    "`grouped_entities` is deprecated and will be removed in version v5.0.0, defaulted to"
                    f' `aggregation_strategy="{aggregation_strategy}"` instead.'
                )
            if ignore_subwords is not None:
                warnings.warn(
                    "`ignore_subwords` is deprecated and will be removed in version v5.0.0, defaulted to"
                    f' `aggregation_strategy="{aggregation_strategy}"` instead.'
                )

        if aggregation_strategy is not None:
            # Accept the strategy as a case-insensitive string as well as an enum member.
            if isinstance(aggregation_strategy, str):
                aggregation_strategy = AggregationStrategy[aggregation_strategy.upper()]
            # Word-aware strategies need token offsets, which only fast tokenizers provide.
            if (
                aggregation_strategy
                in {AggregationStrategy.FIRST, AggregationStrategy.MAX, AggregationStrategy.AVERAGE}
                and not self.tokenizer.is_fast
            ):
                raise ValueError(
                    "Slow tokenizers cannot handle subwords. Please set the `aggregation_strategy` option"
                    ' to `"simple"` or use a fast tokenizer.'
                )
            postprocess_params["aggregation_strategy"] = aggregation_strategy
        if ignore_labels is not None:
            postprocess_params["ignore_labels"] = ignore_labels
        if stride is not None:
            if stride >= self.tokenizer.model_max_length:
                raise ValueError(
                    "`stride` must be less than `tokenizer.model_max_length` (or even lower if the tokenizer adds special tokens)"
                )
            # NOTE(review): if `aggregation_strategy` was not passed at all it is None
            # here, so this equality check is False and `stride` is accepted even
            # though postprocess later defaults to NONE — confirm this is intended.
            if aggregation_strategy == AggregationStrategy.NONE:
                raise ValueError(
                    "`stride` was provided to process all the text but `aggregation_strategy="
                    f'"{aggregation_strategy}"`, please select another one instead.'
                )
            else:
                if self.tokenizer.is_fast:
                    # Overflowing-token mode: the tokenizer yields overlapping chunks
                    # of model_max_length with `stride` shared tokens between chunks.
                    tokenizer_params = {
                        "return_overflowing_tokens": True,
                        "padding": True,
                        "stride": stride,
                    }
                    preprocess_params["tokenizer_params"] = tokenizer_params
                else:
                    raise ValueError(
                        "`stride` was provided to process all the text but you're using a slow tokenizer."
                        " Please use a fast tokenizer."
                    )
        return preprocess_params, {}, postprocess_params

    def __call__(self, inputs: Union[str, List[str]], **kwargs):
        """
        Classify each token of the text(s) given as inputs.

        Args:
            inputs (`str` or `List[str]`):
                One or several texts (or one list of texts) for token classification.

        Return:
            A list or a list of list of `dict`: Each result comes as a list of dictionaries (one for each token in the
            corresponding input, or each entity if this pipeline was instantiated with an aggregation_strategy) with
            the following keys:

            - **word** (`str`) -- The token/word classified. This is obtained by decoding the selected tokens. If you
              want to have the exact string in the original sentence, use `start` and `end`.
            - **score** (`float`) -- The corresponding probability for `entity`.
            - **entity** (`str`) -- The entity predicted for that token/word (it is named *entity_group* when
              *aggregation_strategy* is not `"none"`.
            - **index** (`int`, only present when `aggregation_strategy="none"`) -- The index of the corresponding
              token in the sentence.
            - **start** (`int`, *optional*) -- The index of the start of the corresponding entity in the sentence. Only
              exists if the offsets are available within the tokenizer
            - **end** (`int`, *optional*) -- The index of the end of the corresponding entity in the sentence. Only
              exists if the offsets are available within the tokenizer
        """

        # The parser validates inputs / extracts offset_mapping; the parsed list
        # itself is discarded and the ORIGINAL `inputs` is forwarded, so Datasets
        # and generators stream through unmaterialized.
        _inputs, offset_mapping = self._args_parser(inputs, **kwargs)
        if offset_mapping:
            kwargs["offset_mapping"] = offset_mapping

        return super().__call__(inputs, **kwargs)

    def preprocess(self, sentence, offset_mapping=None, **preprocess_params):
        # Generator: yields one model-input dict per (possibly overlapping) chunk.
        tokenizer_params = preprocess_params.pop("tokenizer_params", {})
        # Only truncate when the tokenizer declares a positive max length.
        truncation = True if self.tokenizer.model_max_length and self.tokenizer.model_max_length > 0 else False
        inputs = self.tokenizer(
            sentence,
            return_tensors=self.framework,
            truncation=truncation,
            return_special_tokens_mask=True,
            return_offsets_mapping=self.tokenizer.is_fast,
            **tokenizer_params,
        )
        # Not needed downstream; would otherwise be split per-chunk below.
        inputs.pop("overflow_to_sample_mapping", None)
        num_chunks = len(inputs["input_ids"])

        for i in range(num_chunks):
            # Re-add the leading batch dimension for a single chunk.
            if self.framework == "tf":
                model_inputs = {k: tf.expand_dims(v[i], 0) for k, v in inputs.items()}
            else:
                model_inputs = {k: v[i].unsqueeze(0) for k, v in inputs.items()}
            if offset_mapping is not None:
                model_inputs["offset_mapping"] = offset_mapping
            # The raw sentence rides along only on the first chunk; postprocess
            # reads it from all_outputs[0].
            model_inputs["sentence"] = sentence if i == 0 else None
            model_inputs["is_last"] = i == num_chunks - 1

            yield model_inputs

    def _forward(self, model_inputs):
        # Forward
        # Pop bookkeeping entries so only genuine tensor inputs reach the model.
        special_tokens_mask = model_inputs.pop("special_tokens_mask")
        offset_mapping = model_inputs.pop("offset_mapping", None)
        sentence = model_inputs.pop("sentence")
        is_last = model_inputs.pop("is_last")
        if self.framework == "tf":
            logits = self.model(**model_inputs)[0]
        else:
            output = self.model(**model_inputs)
            logits = output["logits"] if isinstance(output, dict) else output[0]

        return {
            "logits": logits,
            "special_tokens_mask": special_tokens_mask,
            "offset_mapping": offset_mapping,
            "sentence": sentence,
            "is_last": is_last,
            **model_inputs,
        }

    def postprocess(self, all_outputs, aggregation_strategy=AggregationStrategy.NONE, ignore_labels=None):
        if ignore_labels is None:
            ignore_labels = ["O"]
        all_entities = []
        for model_outputs in all_outputs:
            # Half-precision tensors are upcast before the numpy round-trip.
            if self.framework == "pt" and model_outputs["logits"][0].dtype in (torch.bfloat16, torch.float16):
                logits = model_outputs["logits"][0].to(torch.float32).numpy()
            else:
                logits = model_outputs["logits"][0].numpy()

            # The sentence was attached to the first chunk only (see preprocess).
            sentence = all_outputs[0]["sentence"]
            input_ids = model_outputs["input_ids"][0]
            offset_mapping = (
                model_outputs["offset_mapping"][0] if model_outputs["offset_mapping"] is not None else None
            )
            special_tokens_mask = model_outputs["special_tokens_mask"][0].numpy()

            # Numerically stable softmax: subtract the row max before exponentiating.
            maxes = np.max(logits, axis=-1, keepdims=True)
            shifted_exp = np.exp(logits - maxes)
            scores = shifted_exp / shifted_exp.sum(axis=-1, keepdims=True)

            if self.framework == "tf":
                input_ids = input_ids.numpy()
                offset_mapping = offset_mapping.numpy() if offset_mapping is not None else None

            pre_entities = self.gather_pre_entities(
                sentence, input_ids, scores, offset_mapping, special_tokens_mask, aggregation_strategy
            )
            grouped_entities = self.aggregate(pre_entities, aggregation_strategy)
            # Filter anything that is in self.ignore_labels
            entities = [
                entity
                for entity in grouped_entities
                if entity.get("entity", None) not in ignore_labels
                and entity.get("entity_group", None) not in ignore_labels
            ]
            all_entities.extend(entities)
        num_chunks = len(all_outputs)
        # Overlapping stride chunks can report the same span twice; deduplicate.
        if num_chunks > 1:
            all_entities = self.aggregate_overlapping_entities(all_entities)
        return all_entities

    def aggregate_overlapping_entities(self, entities):
        # Among entities whose character spans overlap, keep the longest one
        # (ties broken by score); assumes "start"/"end" offsets are present.
        if len(entities) == 0:
            return entities
        entities = sorted(entities, key=lambda x: x["start"])
        aggregated_entities = []
        previous_entity = entities[0]
        for entity in entities:
            if previous_entity["start"] <= entity["start"] < previous_entity["end"]:
                current_length = entity["end"] - entity["start"]
                previous_length = previous_entity["end"] - previous_entity["start"]
                if current_length > previous_length:
                    previous_entity = entity
                elif current_length == previous_length and entity["score"] > previous_entity["score"]:
                    previous_entity = entity
            else:
                aggregated_entities.append(previous_entity)
                previous_entity = entity
        aggregated_entities.append(previous_entity)
        return aggregated_entities

    def gather_pre_entities(
        self,
        sentence: str,
        input_ids: np.ndarray,
        scores: np.ndarray,
        offset_mapping: Optional[List[Tuple[int, int]]],
        special_tokens_mask: np.ndarray,
        aggregation_strategy: AggregationStrategy,
    ) -> List[dict]:
        """Fuse various numpy arrays into dicts with all the information needed for aggregation"""
        pre_entities = []
        for idx, token_scores in enumerate(scores):
            # Filter special_tokens
            if special_tokens_mask[idx]:
                continue

            word = self.tokenizer.convert_ids_to_tokens(int(input_ids[idx]))
            if offset_mapping is not None:
                start_ind, end_ind = offset_mapping[idx]
                # Framework tensors need unwrapping to plain ints before slicing.
                if not isinstance(start_ind, int):
                    if self.framework == "pt":
                        start_ind = start_ind.item()
                        end_ind = end_ind.item()
                word_ref = sentence[start_ind:end_ind]
                if getattr(self.tokenizer, "_tokenizer", None) and getattr(
                    self.tokenizer._tokenizer.model, "continuing_subword_prefix", None
                ):
                    # This is a BPE, word aware tokenizer, there is a correct way
                    # to fuse tokens
                    is_subword = len(word) != len(word_ref)
                else:
                    # This is a fallback heuristic. This will fail most likely on any kind of text + punctuation
                    # mixtures that will be considered "words". Non word aware models cannot do better than this
                    # unfortunately.
                    # Word-aware strategies on a non-word-aware tokenizer: warn that
                    # the space-based heuristic below is all we can do.
                    if aggregation_strategy in {
                        AggregationStrategy.FIRST,
                        AggregationStrategy.AVERAGE,
                        AggregationStrategy.MAX,
                    }:
                        warnings.warn(
                            "Tokenizer does not support real words, using fallback heuristic",
                            UserWarning,
                        )
                    # Heuristic: a token is a subword if it is not preceded by a space.
                    is_subword = start_ind > 0 and " " not in sentence[start_ind - 1 : start_ind + 1]

                # Unknown tokens: fall back to the raw text span, never a subword.
                if int(input_ids[idx]) == self.tokenizer.unk_token_id:
                    word = word_ref
                    is_subword = False
            else:
                # No offsets available (slow tokenizer): character spans are unknown.
                start_ind = None
                end_ind = None
                is_subword = False

            pre_entity = {
                "word": word,
                "scores": token_scores,
                "start": start_ind,
                "end": end_ind,
                "index": idx,
                "is_subword": is_subword,
            }
            pre_entities.append(pre_entity)
        return pre_entities

    def aggregate(self, pre_entities: List[dict], aggregation_strategy: AggregationStrategy) -> List[dict]:
        # NONE/SIMPLE: one entity per token (argmax over label scores);
        # word-aware strategies first merge subword tokens into words.
        if aggregation_strategy in {AggregationStrategy.NONE, AggregationStrategy.SIMPLE}:
            entities = []
            for pre_entity in pre_entities:
                entity_idx = pre_entity["scores"].argmax()
                score = pre_entity["scores"][entity_idx]
                entity = {
                    "entity": self.model.config.id2label[entity_idx],
                    "score": score,
                    "index": pre_entity["index"],
                    "word": pre_entity["word"],
                    "start": pre_entity["start"],
                    "end": pre_entity["end"],
                }
                entities.append(entity)
        else:
            entities = self.aggregate_words(pre_entities, aggregation_strategy)

        # Only NONE returns raw token entities; everything else groups adjacent
        # same-tag entities into entity groups.
        if aggregation_strategy == AggregationStrategy.NONE:
            return entities
        return self.group_entities(entities)

    def aggregate_word(self, entities: List[dict], aggregation_strategy: AggregationStrategy) -> dict:
        # Collapse the subword tokens of one word into a single entity dict.
        word = self.tokenizer.convert_tokens_to_string([entity["word"] for entity in entities])
        if aggregation_strategy == AggregationStrategy.FIRST:
            # Label of the word = label of its first token.
            scores = entities[0]["scores"]
            idx = scores.argmax()
            score = scores[idx]
            entity = self.model.config.id2label[idx]
        elif aggregation_strategy == AggregationStrategy.MAX:
            # Label of the word = label of the token with the highest peak score.
            max_entity = max(entities, key=lambda entity: entity["scores"].max())
            scores = max_entity["scores"]
            idx = scores.argmax()
            score = scores[idx]
            entity = self.model.config.id2label[idx]
        elif aggregation_strategy == AggregationStrategy.AVERAGE:
            # Average the score vectors across tokens, then take the argmax label.
            scores = np.stack([entity["scores"] for entity in entities])
            average_scores = np.nanmean(scores, axis=0)
            entity_idx = average_scores.argmax()
            entity = self.model.config.id2label[entity_idx]
            score = average_scores[entity_idx]
        else:
            raise ValueError("Invalid aggregation_strategy")
        new_entity = {
            "entity": entity,
            "score": score,
            "word": word,
            "start": entities[0]["start"],
            "end": entities[-1]["end"],
        }
        return new_entity

    def aggregate_words(self, entities: List[dict], aggregation_strategy: AggregationStrategy) -> List[dict]:
        """
        Override tokens from a given word that disagree to force agreement on word boundaries.

        Example: micro|soft| com|pany| B-ENT I-NAME I-ENT I-ENT will be rewritten with first strategy as microsoft|
        company| B-ENT I-ENT
        """
        if aggregation_strategy in {
            AggregationStrategy.NONE,
            AggregationStrategy.SIMPLE,
        }:
            raise ValueError("NONE and SIMPLE strategies are invalid for word aggregation")

        # Walk the token stream: a non-subword token starts a new word group,
        # subword tokens extend the current group.
        word_entities = []
        word_group = None
        for entity in entities:
            if word_group is None:
                word_group = [entity]
            elif entity["is_subword"]:
                word_group.append(entity)
            else:
                word_entities.append(self.aggregate_word(word_group, aggregation_strategy))
                word_group = [entity]
        # Last item
        if word_group is not None:
            word_entities.append(self.aggregate_word(word_group, aggregation_strategy))
        return word_entities

    def group_sub_entities(self, entities: List[dict]) -> dict:
        """
        Group together the adjacent tokens with the same entity predicted.

        Args:
            entities (`dict`): The entities predicted by the pipeline.
        """
        # Get the first entity in the entity group
        # The tag is taken from the first member, with any "B-"/"I-" prefix stripped.
        entity = entities[0]["entity"].split("-", 1)[-1]
        # `np.nanmean` over the member scores already yields a scalar, so the
        # `np.mean(scores)` below is effectively an identity.
        scores = np.nanmean([entity["score"] for entity in entities])
        tokens = [entity["word"] for entity in entities]

        entity_group = {
            "entity_group": entity,
            "score": np.mean(scores),
            "word": self.tokenizer.convert_tokens_to_string(tokens),
            "start": entities[0]["start"],
            "end": entities[-1]["end"],
        }
        return entity_group

    def get_tag(self, entity_name: str) -> Tuple[str, str]:
        # Split a BIO label into its ("B"/"I") marker and bare tag name.
        if entity_name.startswith("B-"):
            bi = "B"
            tag = entity_name[2:]
        elif entity_name.startswith("I-"):
            bi = "I"
            tag = entity_name[2:]
        else:
            # It's not in B-, I- format
            # Default to I- for continuation.
            bi = "I"
            tag = entity_name
        return bi, tag

    def group_entities(self, entities: List[dict]) -> List[dict]:
        """
        Find and group together the adjacent tokens with the same entity predicted.

        Args:
            entities (`dict`): The entities predicted by the pipeline.
        """

        entity_groups = []
        entity_group_disagg = []

        for entity in entities:
            if not entity_group_disagg:
                entity_group_disagg.append(entity)
                continue

            # If the current entity is similar and adjacent to the previous entity,
            # append it to the disaggregated entity group
            # The split is meant to account for the "B" and "I" prefixes
            # Shouldn't merge if both entities are B-type
            bi, tag = self.get_tag(entity["entity"])
            last_bi, last_tag = self.get_tag(entity_group_disagg[-1]["entity"])

            if tag == last_tag and bi != "B":
                # Modify subword type to be previous_type
                entity_group_disagg.append(entity)
            else:
                # If the current entity is different from the previous entity
                # aggregate the disaggregated entity group
                entity_groups.append(self.group_sub_entities(entity_group_disagg))
                entity_group_disagg = [entity]
        if entity_group_disagg:
            # it's the last entity, add it to the entity groups
            entity_groups.append(self.group_sub_entities(entity_group_disagg))

        return
entity_groups + + +NerPipeline = TokenClassificationPipeline diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/video_classification.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/video_classification.py new file mode 100644 index 0000000000000000000000000000000000000000..057910098da20a1dfc02bf0d8b041e2d7af8cd09 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/video_classification.py @@ -0,0 +1,184 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import warnings +from io import BytesIO +from typing import List, Union + +import requests + +from ..utils import ( + add_end_docstrings, + is_av_available, + is_torch_available, + logging, + requires_backends, +) +from .base import Pipeline, build_pipeline_init_args + + +if is_av_available(): + import av + import numpy as np + + +if is_torch_available(): + from ..models.auto.modeling_auto import MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING_NAMES + +logger = logging.get_logger(__name__) + + +@add_end_docstrings(build_pipeline_init_args(has_image_processor=True)) +class VideoClassificationPipeline(Pipeline): + """ + Video classification pipeline using any `AutoModelForVideoClassification`. This pipeline predicts the class of a + video. + + This video classification pipeline can currently be loaded from [`pipeline`] using the following task identifier: + `"video-classification"`. 

    See the list of available models on
    [huggingface.co/models](https://huggingface.co/models?filter=video-classification).
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # PyAV is mandatory for decoding the video frames in `preprocess`.
        requires_backends(self, "av")
        self.check_model_type(MODEL_FOR_VIDEO_CLASSIFICATION_MAPPING_NAMES)

    def _sanitize_parameters(self, top_k=None, num_frames=None, frame_sampling_rate=None, function_to_apply=None):
        # Split the call kwargs into (preprocess, forward, postprocess) dicts.
        preprocess_params = {}
        if frame_sampling_rate is not None:
            preprocess_params["frame_sampling_rate"] = frame_sampling_rate
        if num_frames is not None:
            preprocess_params["num_frames"] = num_frames

        postprocess_params = {}
        if top_k is not None:
            postprocess_params["top_k"] = top_k
        if function_to_apply is not None:
            if function_to_apply not in ["softmax", "sigmoid", "none"]:
                raise ValueError(
                    f"Invalid value for `function_to_apply`: {function_to_apply}. "
                    "Valid options are ['softmax', 'sigmoid', 'none']"
                )
            postprocess_params["function_to_apply"] = function_to_apply
        else:
            # Default: softmax over the logits (string "none" disables post-processing).
            postprocess_params["function_to_apply"] = "softmax"
        return preprocess_params, {}, postprocess_params

    def __call__(self, inputs: Union[str, List[str]] = None, **kwargs):
        """
        Assign labels to the video(s) passed as inputs.

        Args:
            inputs (`str`, `List[str]`):
                The pipeline handles three types of videos:

                - A string containing a http link pointing to a video
                - A string containing a local path to a video

                The pipeline accepts either a single video or a batch of videos, which must then be passed as a string.
                Videos in a batch must all be in the same format: all as http links or all as local paths.
            top_k (`int`, *optional*, defaults to 5):
                The number of top labels that will be returned by the pipeline. If the provided number is higher than
                the number of labels available in the model configuration, it will default to the number of labels.
            num_frames (`int`, *optional*, defaults to `self.model.config.num_frames`):
                The number of frames sampled from the video to run the classification on. If not provided, will default
                to the number of frames specified in the model configuration.
            frame_sampling_rate (`int`, *optional*, defaults to 1):
                The sampling rate used to select frames from the video. If not provided, will default to 1, i.e. every
                frame will be used.
            function_to_apply(`str`, *optional*, defaults to "softmax"):
                The function to apply to the model output. By default, the pipeline will apply the softmax function to
                the output of the model. Valid options: ["softmax", "sigmoid", "none"]. Note that passing Python's
                built-in `None` will default to "softmax", so you need to pass the string "none" to disable any
                post-processing.

        Return:
            A dictionary or a list of dictionaries containing result. If the input is a single video, will return a
            dictionary, if the input is a list of several videos, will return a list of dictionaries corresponding to
            the videos.

            The dictionaries contain the following keys:

            - **label** (`str`) -- The label identified by the model.
            - **score** (`int`) -- The score attributed by the model for that label.
        """
        # After deprecation of this is completed, remove the default `None` value for `images`
        if "videos" in kwargs:
            warnings.warn(
                "The `videos` argument has been renamed to `inputs`. In version 5 of Transformers, `videos` will no longer be accepted",
                FutureWarning,
            )
            inputs = kwargs.pop("videos")
        if inputs is None:
            raise ValueError("Cannot call the video-classification pipeline without an inputs argument!")
        return super().__call__(inputs, **kwargs)

    def preprocess(self, video, num_frames=None, frame_sampling_rate=1):
        if num_frames is None:
            num_frames = self.model.config.num_frames

        # NOTE(review): this download has no timeout and no status check — a dead
        # URL can block the call; confirm whether a timeout should be added.
        if video.startswith("http://") or video.startswith("https://"):
            video = BytesIO(requests.get(video).content)

        container = av.open(video)

        # Evenly sample `num_frames` frame indices from the first
        # `num_frames * frame_sampling_rate` frames of the stream.
        start_idx = 0
        end_idx = num_frames * frame_sampling_rate - 1
        indices = np.linspace(start_idx, end_idx, num=num_frames, dtype=np.int64)

        # `video` is rebound from path/buffer to the decoded frame stack here.
        video = read_video_pyav(container, indices)
        video = list(video)

        model_inputs = self.image_processor(video, return_tensors=self.framework)
        if self.framework == "pt":
            model_inputs = model_inputs.to(self.torch_dtype)
        return model_inputs

    def _forward(self, model_inputs):
        model_outputs = self.model(**model_inputs)
        return model_outputs

    def postprocess(self, model_outputs, top_k=5, function_to_apply="softmax"):
        # Clamp top_k to the number of labels the model actually has.
        if top_k > self.model.config.num_labels:
            top_k = self.model.config.num_labels

        # Only the PyTorch path is implemented for this pipeline.
        if self.framework == "pt":
            if function_to_apply == "softmax":
                probs = model_outputs.logits[0].softmax(-1)
            elif function_to_apply == "sigmoid":
                probs = model_outputs.logits[0].sigmoid()
            else:
                probs = model_outputs.logits[0]
            scores, ids = probs.topk(top_k)
        else:
            raise ValueError(f"Unsupported framework: {self.framework}")

        scores = scores.tolist()
        ids = ids.tolist()
        return [{"score": score, "label": self.model.config.id2label[_id]} for score, _id in zip(scores, ids)]


def read_video_pyav(container, indices):
    # Decode only the frames whose positions appear in `indices`; stops as soon
    # as the last wanted index is passed. `i in indices` scans the numpy array
    # per frame — acceptable since `indices` is small (num_frames entries).
    frames = []
    container.seek(0)
    start_index = indices[0]
    end_index = indices[-1]
    for i, frame in enumerate(container.decode(video=0)):
        if i > end_index:
            break
        if i >= start_index and i in indices:
frames.append(frame) + return np.stack([x.to_ndarray(format="rgb24") for x in frames]) diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/visual_question_answering.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/visual_question_answering.py new file mode 100644 index 0000000000000000000000000000000000000000..6d600c9eaf50bc99f6810b0c2836b154cd62ed51 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/visual_question_answering.py @@ -0,0 +1,200 @@ +from typing import List, Union + +from ..utils import add_end_docstrings, is_torch_available, is_vision_available, logging +from .base import Pipeline, build_pipeline_init_args + + +if is_vision_available(): + from PIL import Image + + from ..image_utils import load_image + +if is_torch_available(): + from ..models.auto.modeling_auto import MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING_NAMES + from .pt_utils import KeyDataset + +logger = logging.get_logger(__name__) + + +@add_end_docstrings(build_pipeline_init_args(has_tokenizer=True, has_image_processor=True)) +class VisualQuestionAnsweringPipeline(Pipeline): + """ + Visual Question Answering pipeline using a `AutoModelForVisualQuestionAnswering`. This pipeline is currently only + available in PyTorch. 
+ + Example: + + ```python + >>> from transformers import pipeline + + >>> oracle = pipeline(model="dandelin/vilt-b32-finetuned-vqa") + >>> image_url = "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/lena.png" + >>> oracle(question="What is she wearing ?", image=image_url) + [{'score': 0.948, 'answer': 'hat'}, {'score': 0.009, 'answer': 'fedora'}, {'score': 0.003, 'answer': 'clothes'}, {'score': 0.003, 'answer': 'sun hat'}, {'score': 0.002, 'answer': 'nothing'}] + + >>> oracle(question="What is she wearing ?", image=image_url, top_k=1) + [{'score': 0.948, 'answer': 'hat'}] + + >>> oracle(question="Is this a person ?", image=image_url, top_k=1) + [{'score': 0.993, 'answer': 'yes'}] + + >>> oracle(question="Is this a man ?", image=image_url, top_k=1) + [{'score': 0.996, 'answer': 'no'}] + ``` + + Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial) + + This visual question answering pipeline can currently be loaded from [`pipeline`] using the following task + identifiers: `"visual-question-answering", "vqa"`. + + The models that this pipeline can use are models that have been fine-tuned on a visual question answering task. See + the up-to-date list of available models on + [huggingface.co/models](https://huggingface.co/models?filter=visual-question-answering). 
+ """ + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.check_model_type(MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING_NAMES) + + def _sanitize_parameters(self, top_k=None, padding=None, truncation=None, timeout=None, **kwargs): + preprocess_params, postprocess_params = {}, {} + if padding is not None: + preprocess_params["padding"] = padding + if truncation is not None: + preprocess_params["truncation"] = truncation + if timeout is not None: + preprocess_params["timeout"] = timeout + if top_k is not None: + postprocess_params["top_k"] = top_k + + forward_params = {} + if self.assistant_model is not None: + forward_params["assistant_model"] = self.assistant_model + if self.assistant_tokenizer is not None: + forward_params["tokenizer"] = self.tokenizer + forward_params["assistant_tokenizer"] = self.assistant_tokenizer + + return preprocess_params, forward_params, postprocess_params + + def __call__( + self, + image: Union["Image.Image", str, List["Image.Image"], List[str], "KeyDataset"], + question: Union[str, List[str]] = None, + **kwargs, + ): + r""" + Answers open-ended questions about images. The pipeline accepts several types of inputs which are detailed + below: + + - `pipeline(image=image, question=question)` + - `pipeline({"image": image, "question": question})` + - `pipeline([{"image": image, "question": question}])` + - `pipeline([{"image": image, "question": question}, {"image": image, "question": question}])` + + Args: + image (`str`, `List[str]`, `PIL.Image`, `List[PIL.Image]` or `KeyDataset`): + The pipeline handles three types of images: + + - A string containing a http link pointing to an image + - A string containing a local path to an image + - An image loaded in PIL directly + + The pipeline accepts either a single image or a batch of images. If given a single image, it can be + broadcasted to multiple questions. 
+ For dataset: the passed in dataset must be of type `transformers.pipelines.pt_utils.KeyDataset` + Example: + ```python + >>> from transformers.pipelines.pt_utils import KeyDataset + >>> from datasets import load_dataset + + >>> dataset = load_dataset("detection-datasets/coco") + >>> oracle(image=KeyDataset(dataset, "image"), question="What's in this image?") + + ``` + question (`str`, `List[str]`): + The question(s) asked. If given a single question, it can be broadcasted to multiple images. + If multiple images and questions are given, each and every question will be broadcasted to all images + (same effect as a Cartesian product) + top_k (`int`, *optional*, defaults to 5): + The number of top labels that will be returned by the pipeline. If the provided number is higher than + the number of labels available in the model configuration, it will default to the number of labels. + timeout (`float`, *optional*, defaults to None): + The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and + the call may block forever. + Return: + A dictionary or a list of dictionaries containing the result. The dictionaries contain the following keys: + + - **label** (`str`) -- The label identified by the model. + - **score** (`int`) -- The score attributed by the model for that label. 
+ """ + is_dataset = isinstance(image, KeyDataset) + is_image_batch = isinstance(image, list) and all(isinstance(item, (Image.Image, str)) for item in image) + is_question_batch = isinstance(question, list) and all(isinstance(item, str) for item in question) + + if isinstance(image, (Image.Image, str)) and isinstance(question, str): + inputs = {"image": image, "question": question} + elif (is_image_batch or is_dataset) and isinstance(question, str): + inputs = [{"image": im, "question": question} for im in image] + elif isinstance(image, (Image.Image, str)) and is_question_batch: + inputs = [{"image": image, "question": q} for q in question] + elif (is_image_batch or is_dataset) and is_question_batch: + question_image_pairs = [] + for q in question: + for im in image: + question_image_pairs.append({"image": im, "question": q}) + inputs = question_image_pairs + else: + """ + Supports the following format + - {"image": image, "question": question} + - [{"image": image, "question": question}] + - Generator and datasets + """ + inputs = image + results = super().__call__(inputs, **kwargs) + return results + + def preprocess(self, inputs, padding=False, truncation=False, timeout=None): + image = load_image(inputs["image"], timeout=timeout) + model_inputs = self.tokenizer( + inputs["question"], + return_tensors=self.framework, + padding=padding, + truncation=truncation, + ) + image_features = self.image_processor(images=image, return_tensors=self.framework) + if self.framework == "pt": + image_features = image_features.to(self.torch_dtype) + model_inputs.update(image_features) + return model_inputs + + def _forward(self, model_inputs, **generate_kwargs): + if self.model.can_generate(): + # User-defined `generation_config` passed to the pipeline call take precedence + if "generation_config" not in generate_kwargs: + generate_kwargs["generation_config"] = self.generation_config + + model_outputs = self.model.generate(**model_inputs, **generate_kwargs) + else: + 
model_outputs = self.model(**model_inputs) + return model_outputs + + def postprocess(self, model_outputs, top_k=5): + if self.model.can_generate(): + return [ + {"answer": self.tokenizer.decode(output_ids, skip_special_tokens=True).strip()} + for output_ids in model_outputs + ] + else: + if top_k > self.model.config.num_labels: + top_k = self.model.config.num_labels + + if self.framework == "pt": + probs = model_outputs.logits.sigmoid()[0] + scores, ids = probs.topk(top_k) + else: + raise ValueError(f"Unsupported framework: {self.framework}") + + scores = scores.tolist() + ids = ids.tolist() + return [{"score": score, "answer": self.model.config.id2label[_id]} for score, _id in zip(scores, ids)] diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/zero_shot_audio_classification.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/zero_shot_audio_classification.py new file mode 100644 index 0000000000000000000000000000000000000000..8ed339a5b7f889c21991eaec6901887ce97d90cd --- /dev/null +++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/zero_shot_audio_classification.py @@ -0,0 +1,164 @@ +# coding=utf-8 +# Copyright 2023 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+from collections import UserDict +from typing import Union + +import numpy as np +import requests + +from ..utils import ( + add_end_docstrings, + logging, +) +from .audio_classification import ffmpeg_read +from .base import Pipeline, build_pipeline_init_args + + +logger = logging.get_logger(__name__) + + +@add_end_docstrings(build_pipeline_init_args(has_feature_extractor=True, has_tokenizer=True)) +class ZeroShotAudioClassificationPipeline(Pipeline): + """ + Zero shot audio classification pipeline using `ClapModel`. This pipeline predicts the class of an audio when you + provide an audio and a set of `candidate_labels`. + + + + The default `hypothesis_template` is : `"This is a sound of {}."`. Make sure you update it for your usage. + + + + Example: + ```python + >>> from transformers import pipeline + >>> from datasets import load_dataset + + >>> dataset = load_dataset("ashraq/esc50") + >>> audio = next(iter(dataset["train"]["audio"]))["array"] + >>> classifier = pipeline(task="zero-shot-audio-classification", model="laion/clap-htsat-unfused") + >>> classifier(audio, candidate_labels=["Sound of a dog", "Sound of vaccum cleaner"]) + [{'score': 0.9996, 'label': 'Sound of a dog'}, {'score': 0.0004, 'label': 'Sound of vaccum cleaner'}] + ``` + + + Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial) This audio + classification pipeline can currently be loaded from [`pipeline`] using the following task identifier: + `"zero-shot-audio-classification"`. See the list of available models on + [huggingface.co/models](https://huggingface.co/models?filter=zero-shot-audio-classification). + """ + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + if self.framework != "pt": + raise ValueError(f"The {self.__class__} is only available in PyTorch.") + # No specific FOR_XXX available yet + + def __call__(self, audios: Union[np.ndarray, bytes, str], **kwargs): + """ + Assign labels to the audio(s) passed as inputs. 
+ + Args: + audios (`str`, `List[str]`, `np.array` or `List[np.array]`): + The pipeline handles three types of inputs: + - A string containing a http link pointing to an audio + - A string containing a local path to an audio + - An audio loaded in numpy + candidate_labels (`List[str]`): + The candidate labels for this audio. They will be formatted using *hypothesis_template*. + hypothesis_template (`str`, *optional*, defaults to `"This is a sound of {}"`): + The format used in conjunction with *candidate_labels* to attempt the audio classification by + replacing the placeholder with the candidate_labels. Pass "{}" if *candidate_labels* are + already formatted. + Return: + A list of dictionaries containing one entry per proposed label. Each dictionary contains the + following keys: + - **label** (`str`) -- One of the suggested *candidate_labels*. + - **score** (`float`) -- The score attributed by the model to that label. It is a value between + 0 and 1, computed as the `softmax` of `logits_per_audio`. 
+ """ + return super().__call__(audios, **kwargs) + + def _sanitize_parameters(self, **kwargs): + preprocess_params = {} + if "candidate_labels" in kwargs: + preprocess_params["candidate_labels"] = kwargs["candidate_labels"] + if "hypothesis_template" in kwargs: + preprocess_params["hypothesis_template"] = kwargs["hypothesis_template"] + + return preprocess_params, {}, {} + + def preprocess(self, audio, candidate_labels=None, hypothesis_template="This is a sound of {}."): + if isinstance(audio, str): + if audio.startswith("http://") or audio.startswith("https://"): + # We need to actually check for a real protocol, otherwise it's impossible to use a local file + # like http_huggingface_co.png + audio = requests.get(audio).content + else: + with open(audio, "rb") as f: + audio = f.read() + + if isinstance(audio, bytes): + audio = ffmpeg_read(audio, self.feature_extractor.sampling_rate) + + if not isinstance(audio, np.ndarray): + raise TypeError("We expect a numpy ndarray as input") + if len(audio.shape) != 1: + raise ValueError("We expect a single channel audio input for ZeroShotAudioClassificationPipeline") + + inputs = self.feature_extractor( + [audio], sampling_rate=self.feature_extractor.sampling_rate, return_tensors="pt" + ) + if self.framework == "pt": + inputs = inputs.to(self.torch_dtype) + inputs["candidate_labels"] = candidate_labels + sequences = [hypothesis_template.format(x) for x in candidate_labels] + text_inputs = self.tokenizer(sequences, return_tensors=self.framework, padding=True) + inputs["text_inputs"] = [text_inputs] + return inputs + + def _forward(self, model_inputs): + candidate_labels = model_inputs.pop("candidate_labels") + text_inputs = model_inputs.pop("text_inputs") + if isinstance(text_inputs[0], UserDict): + text_inputs = text_inputs[0] + else: + # Batching case. 
+ text_inputs = text_inputs[0][0] + + outputs = self.model(**text_inputs, **model_inputs) + + model_outputs = { + "candidate_labels": candidate_labels, + "logits": outputs.logits_per_audio, + } + return model_outputs + + def postprocess(self, model_outputs): + candidate_labels = model_outputs.pop("candidate_labels") + logits = model_outputs["logits"][0] + + if self.framework == "pt": + probs = logits.softmax(dim=0) + scores = probs.tolist() + else: + raise ValueError("`tf` framework not supported.") + + result = [ + {"score": score, "label": candidate_label} + for score, candidate_label in sorted(zip(scores, candidate_labels), key=lambda x: -x[0]) + ] + return result diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/zero_shot_classification.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/zero_shot_classification.py new file mode 100644 index 0000000000000000000000000000000000000000..f4aee3341e30d55691ea74d0e90dd00ba4567c8b --- /dev/null +++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/zero_shot_classification.py @@ -0,0 +1,268 @@ +import inspect +from typing import List, Union + +import numpy as np + +from ..tokenization_utils import TruncationStrategy +from ..utils import add_end_docstrings, logging +from .base import ArgumentHandler, ChunkPipeline, build_pipeline_init_args + + +logger = logging.get_logger(__name__) + + +class ZeroShotClassificationArgumentHandler(ArgumentHandler): + """ + Handles arguments for zero-shot for text classification by turning each possible label into an NLI + premise/hypothesis pair. 
+ """ + + def _parse_labels(self, labels): + if isinstance(labels, str): + labels = [label.strip() for label in labels.split(",") if label.strip()] + return labels + + def __call__(self, sequences, labels, hypothesis_template): + if len(labels) == 0 or len(sequences) == 0: + raise ValueError("You must include at least one label and at least one sequence.") + if hypothesis_template.format(labels[0]) == hypothesis_template: + raise ValueError( + ( + 'The provided hypothesis_template "{}" was not able to be formatted with the target labels. ' + "Make sure the passed template includes formatting syntax such as {{}} where the label should go." + ).format(hypothesis_template) + ) + + if isinstance(sequences, str): + sequences = [sequences] + + sequence_pairs = [] + for sequence in sequences: + sequence_pairs.extend([[sequence, hypothesis_template.format(label)] for label in labels]) + + return sequence_pairs, sequences + + +@add_end_docstrings(build_pipeline_init_args(has_tokenizer=True)) +class ZeroShotClassificationPipeline(ChunkPipeline): + """ + NLI-based zero-shot classification pipeline using a `ModelForSequenceClassification` trained on NLI (natural + language inference) tasks. Equivalent of `text-classification` pipelines, but these models don't require a + hardcoded number of potential classes, they can be chosen at runtime. It usually means it's slower but it is + **much** more flexible. + + Any combination of sequences and labels can be passed and each combination will be posed as a premise/hypothesis + pair and passed to the pretrained model. Then, the logit for *entailment* is taken as the logit for the candidate + label being valid. Any NLI model can be used, but the id of the *entailment* label must be included in the model + config's :attr:*~transformers.PretrainedConfig.label2id*. + + Example: + + ```python + >>> from transformers import pipeline + + >>> oracle = pipeline(model="facebook/bart-large-mnli") + >>> oracle( + ... 
"I have a problem with my iphone that needs to be resolved asap!!", + ... candidate_labels=["urgent", "not urgent", "phone", "tablet", "computer"], + ... ) + {'sequence': 'I have a problem with my iphone that needs to be resolved asap!!', 'labels': ['urgent', 'phone', 'computer', 'not urgent', 'tablet'], 'scores': [0.504, 0.479, 0.013, 0.003, 0.002]} + + >>> oracle( + ... "I have a problem with my iphone that needs to be resolved asap!!", + ... candidate_labels=["english", "german"], + ... ) + {'sequence': 'I have a problem with my iphone that needs to be resolved asap!!', 'labels': ['english', 'german'], 'scores': [0.814, 0.186]} + ``` + + Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial) + + This NLI pipeline can currently be loaded from [`pipeline`] using the following task identifier: + `"zero-shot-classification"`. + + The models that this pipeline can use are models that have been fine-tuned on an NLI task. See the up-to-date list + of available models on [huggingface.co/models](https://huggingface.co/models?search=nli). + """ + + def __init__(self, args_parser=ZeroShotClassificationArgumentHandler(), *args, **kwargs): + self._args_parser = args_parser + super().__init__(*args, **kwargs) + if self.entailment_id == -1: + logger.warning( + "Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to " + "-1. Define a descriptive label2id mapping in the model config to ensure correct outputs." 
+ ) + + @property + def entailment_id(self): + for label, ind in self.model.config.label2id.items(): + if label.lower().startswith("entail"): + return ind + return -1 + + def _parse_and_tokenize( + self, sequence_pairs, padding=True, add_special_tokens=True, truncation=TruncationStrategy.ONLY_FIRST, **kwargs + ): + """ + Parse arguments and tokenize only_first so that hypothesis (label) is not truncated + """ + return_tensors = self.framework + if self.tokenizer.pad_token is None: + # Override for tokenizers not supporting padding + logger.error( + "Tokenizer was not supporting padding necessary for zero-shot, attempting to use " + " `pad_token=eos_token`" + ) + self.tokenizer.pad_token = self.tokenizer.eos_token + try: + inputs = self.tokenizer( + sequence_pairs, + add_special_tokens=add_special_tokens, + return_tensors=return_tensors, + padding=padding, + truncation=truncation, + ) + except Exception as e: + if "too short" in str(e): + # tokenizers might yell that we want to truncate + # to a value that is not even reached by the input. + # In that case we don't want to truncate. + # It seems there's not a really better way to catch that + # exception. + + inputs = self.tokenizer( + sequence_pairs, + add_special_tokens=add_special_tokens, + return_tensors=return_tensors, + padding=padding, + truncation=TruncationStrategy.DO_NOT_TRUNCATE, + ) + else: + raise e + + return inputs + + def _sanitize_parameters(self, **kwargs): + if kwargs.get("multi_class", None) is not None: + kwargs["multi_label"] = kwargs["multi_class"] + logger.warning( + "The `multi_class` argument has been deprecated and renamed to `multi_label`. " + "`multi_class` will be removed in a future version of Transformers." 
+ ) + preprocess_params = {} + if "candidate_labels" in kwargs: + preprocess_params["candidate_labels"] = self._args_parser._parse_labels(kwargs["candidate_labels"]) + if "hypothesis_template" in kwargs: + preprocess_params["hypothesis_template"] = kwargs["hypothesis_template"] + + postprocess_params = {} + if "multi_label" in kwargs: + postprocess_params["multi_label"] = kwargs["multi_label"] + return preprocess_params, {}, postprocess_params + + def __call__( + self, + sequences: Union[str, List[str]], + *args, + **kwargs, + ): + """ + Classify the sequence(s) given as inputs. See the [`ZeroShotClassificationPipeline`] documentation for more + information. + + Args: + sequences (`str` or `List[str]`): + The sequence(s) to classify, will be truncated if the model input is too large. + candidate_labels (`str` or `List[str]`): + The set of possible class labels to classify each sequence into. Can be a single label, a string of + comma-separated labels, or a list of labels. + hypothesis_template (`str`, *optional*, defaults to `"This example is {}."`): + The template used to turn each label into an NLI-style hypothesis. This template must include a {} or + similar syntax for the candidate label to be inserted into the template. For example, the default + template is `"This example is {}."` With the candidate label `"sports"`, this would be fed into the + model like `" sequence to classify This example is sports . "`. The default template + works well in many cases, but it may be worthwhile to experiment with different templates depending on + the task setting. + multi_label (`bool`, *optional*, defaults to `False`): + Whether or not multiple candidate labels can be true. If `False`, the scores are normalized such that + the sum of the label likelihoods for each sequence is 1. If `True`, the labels are considered + independent and probabilities are normalized for each candidate by doing a softmax of the entailment + score vs. the contradiction score. 
+ + Return: + A `dict` or a list of `dict`: Each result comes as a dictionary with the following keys: + + - **sequence** (`str`) -- The sequence for which this is the output. + - **labels** (`List[str]`) -- The labels sorted by order of likelihood. + - **scores** (`List[float]`) -- The probabilities for each of the labels. + """ + if len(args) == 0: + pass + elif len(args) == 1 and "candidate_labels" not in kwargs: + kwargs["candidate_labels"] = args[0] + else: + raise ValueError(f"Unable to understand extra arguments {args}") + + return super().__call__(sequences, **kwargs) + + def preprocess(self, inputs, candidate_labels=None, hypothesis_template="This example is {}."): + sequence_pairs, sequences = self._args_parser(inputs, candidate_labels, hypothesis_template) + + for i, (candidate_label, sequence_pair) in enumerate(zip(candidate_labels, sequence_pairs)): + model_input = self._parse_and_tokenize([sequence_pair]) + + yield { + "candidate_label": candidate_label, + "sequence": sequences[0], + "is_last": i == len(candidate_labels) - 1, + **model_input, + } + + def _forward(self, inputs): + candidate_label = inputs["candidate_label"] + sequence = inputs["sequence"] + model_inputs = {k: inputs[k] for k in self.tokenizer.model_input_names} + # `XXXForSequenceClassification` models should not use `use_cache=True` even if it's supported + model_forward = self.model.forward if self.framework == "pt" else self.model.call + if "use_cache" in inspect.signature(model_forward).parameters.keys(): + model_inputs["use_cache"] = False + outputs = self.model(**model_inputs) + + model_outputs = { + "candidate_label": candidate_label, + "sequence": sequence, + "is_last": inputs["is_last"], + **outputs, + } + return model_outputs + + def postprocess(self, model_outputs, multi_label=False): + candidate_labels = [outputs["candidate_label"] for outputs in model_outputs] + sequences = [outputs["sequence"] for outputs in model_outputs] + if self.framework == "pt": + logits = 
np.concatenate([output["logits"].float().numpy() for output in model_outputs]) + else: + logits = np.concatenate([output["logits"].numpy() for output in model_outputs]) + N = logits.shape[0] + n = len(candidate_labels) + num_sequences = N // n + reshaped_outputs = logits.reshape((num_sequences, n, -1)) + + if multi_label or len(candidate_labels) == 1: + # softmax over the entailment vs. contradiction dim for each label independently + entailment_id = self.entailment_id + contradiction_id = -1 if entailment_id == 0 else 0 + entail_contr_logits = reshaped_outputs[..., [contradiction_id, entailment_id]] + scores = np.exp(entail_contr_logits) / np.exp(entail_contr_logits).sum(-1, keepdims=True) + scores = scores[..., 1] + else: + # softmax the "entailment" logits over all candidate labels + entail_logits = reshaped_outputs[..., self.entailment_id] + scores = np.exp(entail_logits) / np.exp(entail_logits).sum(-1, keepdims=True) + + top_inds = list(reversed(scores[0].argsort())) + return { + "sequence": sequences[0], + "labels": [candidate_labels[i] for i in top_inds], + "scores": scores[0, top_inds].tolist(), + } diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/zero_shot_image_classification.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/zero_shot_image_classification.py new file mode 100644 index 0000000000000000000000000000000000000000..c53b515dcccd9c1f277a3f8a8871be08661e7a1c --- /dev/null +++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/zero_shot_image_classification.py @@ -0,0 +1,193 @@ +import warnings +from collections import UserDict +from typing import List, Union + +from ..utils import ( + add_end_docstrings, + is_tf_available, + is_torch_available, + is_vision_available, + logging, + requires_backends, +) +from .base import Pipeline, build_pipeline_init_args + + +if is_vision_available(): + from PIL import Image + + from ..image_utils import load_image + +if is_torch_available(): + import torch + + from 
..models.auto.modeling_auto import MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING_NAMES + +if is_tf_available(): + from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING_NAMES + from ..tf_utils import stable_softmax + +logger = logging.get_logger(__name__) + + +@add_end_docstrings(build_pipeline_init_args(has_image_processor=True)) +class ZeroShotImageClassificationPipeline(Pipeline): + """ + Zero shot image classification pipeline using `CLIPModel`. This pipeline predicts the class of an image when you + provide an image and a set of `candidate_labels`. + + Example: + + ```python + >>> from transformers import pipeline + + >>> classifier = pipeline(model="google/siglip-so400m-patch14-384") + >>> classifier( + ... "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png", + ... candidate_labels=["animals", "humans", "landscape"], + ... ) + [{'score': 0.965, 'label': 'animals'}, {'score': 0.03, 'label': 'humans'}, {'score': 0.005, 'label': 'landscape'}] + + >>> classifier( + ... "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png", + ... candidate_labels=["black and white", "photorealist", "painting"], + ... ) + [{'score': 0.996, 'label': 'black and white'}, {'score': 0.003, 'label': 'photorealist'}, {'score': 0.0, 'label': 'painting'}] + ``` + + Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial) + + This image classification pipeline can currently be loaded from [`pipeline`] using the following task identifier: + `"zero-shot-image-classification"`. + + See the list of available models on + [huggingface.co/models](https://huggingface.co/models?filter=zero-shot-image-classification). 
+ """ + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + requires_backends(self, "vision") + self.check_model_type( + TF_MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING_NAMES + if self.framework == "tf" + else MODEL_FOR_ZERO_SHOT_IMAGE_CLASSIFICATION_MAPPING_NAMES + ) + + def __call__(self, image: Union[str, List[str], "Image", List["Image"]] = None, **kwargs): + """ + Assign labels to the image(s) passed as inputs. + + Args: + image (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`): + The pipeline handles three types of images: + + - A string containing a http link pointing to an image + - A string containing a local path to an image + - An image loaded in PIL directly + + candidate_labels (`List[str]`): + The candidate labels for this image. They will be formatted using *hypothesis_template*. + + hypothesis_template (`str`, *optional*, defaults to `"This is a photo of {}"`): + The format used in conjunction with *candidate_labels* to attempt the image classification by + replacing the placeholder with the candidate_labels. Pass "{}" if *candidate_labels* are + already formatted. + + timeout (`float`, *optional*, defaults to None): + The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and + the call may block forever. + + Return: + A list of dictionaries containing one entry per proposed label. Each dictionary contains the + following keys: + - **label** (`str`) -- One of the suggested *candidate_labels*. + - **score** (`float`) -- The score attributed by the model to that label. It is a value between + 0 and 1, computed as the `softmax` of `logits_per_image`. 
+ """ + # After deprecation of this is completed, remove the default `None` value for `image` + if "images" in kwargs: + image = kwargs.pop("images") + if image is None: + raise ValueError("Cannot call the zero-shot-image-classification pipeline without an images argument!") + return super().__call__(image, **kwargs) + + def _sanitize_parameters(self, tokenizer_kwargs=None, **kwargs): + preprocess_params = {} + if "candidate_labels" in kwargs: + preprocess_params["candidate_labels"] = kwargs["candidate_labels"] + if "timeout" in kwargs: + preprocess_params["timeout"] = kwargs["timeout"] + if "hypothesis_template" in kwargs: + preprocess_params["hypothesis_template"] = kwargs["hypothesis_template"] + if tokenizer_kwargs is not None: + warnings.warn( + "The `tokenizer_kwargs` argument is deprecated and will be removed in version 5 of Transformers", + FutureWarning, + ) + preprocess_params["tokenizer_kwargs"] = tokenizer_kwargs + + return preprocess_params, {}, {} + + def preprocess( + self, + image, + candidate_labels=None, + hypothesis_template="This is a photo of {}.", + timeout=None, + tokenizer_kwargs=None, + ): + if tokenizer_kwargs is None: + tokenizer_kwargs = {} + image = load_image(image, timeout=timeout) + inputs = self.image_processor(images=[image], return_tensors=self.framework) + if self.framework == "pt": + inputs = inputs.to(self.torch_dtype) + inputs["candidate_labels"] = candidate_labels + sequences = [hypothesis_template.format(x) for x in candidate_labels] + padding = "max_length" if self.model.config.model_type == "siglip" else True + text_inputs = self.tokenizer(sequences, return_tensors=self.framework, padding=padding, **tokenizer_kwargs) + inputs["text_inputs"] = [text_inputs] + return inputs + + def _forward(self, model_inputs): + candidate_labels = model_inputs.pop("candidate_labels") + text_inputs = model_inputs.pop("text_inputs") + if isinstance(text_inputs[0], UserDict): + text_inputs = text_inputs[0] + else: + # Batching case. 
+ text_inputs = text_inputs[0][0] + + outputs = self.model(**text_inputs, **model_inputs) + + model_outputs = { + "candidate_labels": candidate_labels, + "logits": outputs.logits_per_image, + } + return model_outputs + + def postprocess(self, model_outputs): + candidate_labels = model_outputs.pop("candidate_labels") + logits = model_outputs["logits"][0] + if self.framework == "pt" and self.model.config.model_type == "siglip": + probs = torch.sigmoid(logits).squeeze(-1) + scores = probs.tolist() + if not isinstance(scores, list): + scores = [scores] + elif self.framework == "pt": + probs = logits.softmax(dim=-1).squeeze(-1) + scores = probs.tolist() + if not isinstance(scores, list): + scores = [scores] + elif self.framework == "tf": + probs = stable_softmax(logits, axis=-1) + scores = probs.numpy().tolist() + else: + raise ValueError(f"Unsupported framework: {self.framework}") + + result = [ + {"score": score, "label": candidate_label} + for score, candidate_label in sorted(zip(scores, candidate_labels), key=lambda x: -x[0]) + ] + return result diff --git a/.venv/lib/python3.11/site-packages/transformers/pipelines/zero_shot_object_detection.py b/.venv/lib/python3.11/site-packages/transformers/pipelines/zero_shot_object_detection.py new file mode 100644 index 0000000000000000000000000000000000000000..ce8da7340bcce527f6ef8c013f1f609c341f9857 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/transformers/pipelines/zero_shot_object_detection.py @@ -0,0 +1,235 @@ +from typing import Any, Dict, List, Union + +from ..utils import add_end_docstrings, is_torch_available, is_vision_available, logging, requires_backends +from .base import ChunkPipeline, build_pipeline_init_args + + +if is_vision_available(): + from PIL import Image + + from ..image_utils import load_image, valid_images + +if is_torch_available(): + import torch + + from transformers.modeling_outputs import BaseModelOutput + + from ..models.auto.modeling_auto import 
MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES

logger = logging.get_logger(__name__)


@add_end_docstrings(build_pipeline_init_args(has_image_processor=True))
class ZeroShotObjectDetectionPipeline(ChunkPipeline):
    """
    Zero shot object detection pipeline using `OwlViTForObjectDetection`. This pipeline predicts bounding boxes of
    objects when you provide an image and a set of `candidate_labels`.

    Example:

    ```python
    >>> from transformers import pipeline

    >>> detector = pipeline(model="google/owlvit-base-patch32", task="zero-shot-object-detection")
    >>> detector(
    ...     "http://images.cocodataset.org/val2017/000000039769.jpg",
    ...     candidate_labels=["cat", "couch"],
    ... )
    [{'score': 0.287, 'label': 'cat', 'box': {'xmin': 324, 'ymin': 20, 'xmax': 640, 'ymax': 373}}, {'score': 0.254, 'label': 'cat', 'box': {'xmin': 1, 'ymin': 55, 'xmax': 315, 'ymax': 472}}, {'score': 0.121, 'label': 'couch', 'box': {'xmin': 4, 'ymin': 0, 'xmax': 642, 'ymax': 476}}]

    >>> detector(
    ...     "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png",
    ...     candidate_labels=["head", "bird"],
    ... )
    [{'score': 0.119, 'label': 'bird', 'box': {'xmin': 71, 'ymin': 170, 'xmax': 410, 'ymax': 508}}]
    ```

    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)

    This object detection pipeline can currently be loaded from [`pipeline`] using the following task identifier:
    `"zero-shot-object-detection"`.

    See the list of available models on
    [huggingface.co/models](https://huggingface.co/models?filter=zero-shot-object-detection).
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        # This pipeline has no TensorFlow implementation; bail out early.
        if self.framework == "tf":
            raise ValueError(f"The {self.__class__} is only available in PyTorch.")

        requires_backends(self, "vision")
        self.check_model_type(MODEL_FOR_ZERO_SHOT_OBJECT_DETECTION_MAPPING_NAMES)

    def __call__(
        self,
        image: Union[str, "Image.Image", List[Dict[str, Any]]],
        candidate_labels: Union[str, List[str]] = None,
        **kwargs,
    ):
        """
        Detect objects (bounding boxes & classes) in the image(s) passed as inputs.

        Args:
            image (`str`, `PIL.Image` or `List[Dict[str, Any]]`):
                The pipeline handles three types of images:

                - A string containing an http url pointing to an image
                - A string containing a local path to an image
                - An image loaded in PIL directly

                You can use this parameter to send directly a list of images, or a dataset or a generator like so:

                ```python
                >>> from transformers import pipeline

                >>> detector = pipeline(model="google/owlvit-base-patch32", task="zero-shot-object-detection")
                >>> detector(
                ...     [
                ...         {
                ...             "image": "http://images.cocodataset.org/val2017/000000039769.jpg",
                ...             "candidate_labels": ["cat", "couch"],
                ...         },
                ...         {
                ...             "image": "http://images.cocodataset.org/val2017/000000039769.jpg",
                ...             "candidate_labels": ["cat", "couch"],
                ...         },
                ...     ]
                ... )
                [[{'score': 0.287, 'label': 'cat', 'box': {'xmin': 324, 'ymin': 20, 'xmax': 640, 'ymax': 373}}, {'score': 0.25, 'label': 'cat', 'box': {'xmin': 1, 'ymin': 55, 'xmax': 315, 'ymax': 472}}, {'score': 0.121, 'label': 'couch', 'box': {'xmin': 4, 'ymin': 0, 'xmax': 642, 'ymax': 476}}], [{'score': 0.287, 'label': 'cat', 'box': {'xmin': 324, 'ymin': 20, 'xmax': 640, 'ymax': 373}}, {'score': 0.254, 'label': 'cat', 'box': {'xmin': 1, 'ymin': 55, 'xmax': 315, 'ymax': 472}}, {'score': 0.121, 'label': 'couch', 'box': {'xmin': 4, 'ymin': 0, 'xmax': 642, 'ymax': 476}}]]
                ```


            candidate_labels (`str` or `List[str]` or `List[List[str]]`):
                What the model should recognize in the image.

            threshold (`float`, *optional*, defaults to 0.1):
                The probability necessary to make a prediction.

            top_k (`int`, *optional*, defaults to None):
                The number of top predictions that will be returned by the pipeline. If the provided number is `None`
                or higher than the number of predictions available, it will default to the number of predictions.

            timeout (`float`, *optional*, defaults to None):
                The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
                the call may block forever.


        Return:
            A list of lists containing prediction results, one list per input image. Each list contains dictionaries
            with the following keys:

            - **label** (`str`) -- Text query corresponding to the found object.
            - **score** (`float`) -- Score corresponding to the object (between 0 and 1).
            - **box** (`Dict[str,int]`) -- Bounding box of the detected object in image's original size. It is a
              dictionary with `xmin`, `ymin`, `xmax`, `ymax` keys.
        """
        # Legacy alias: `text_queries` was the historical name of `candidate_labels`.
        if "text_queries" in kwargs:
            candidate_labels = kwargs.pop("text_queries")

        if isinstance(image, (str, Image.Image)):
            # Single image + its labels: normalize to the dict format used internally.
            inputs = {"image": image, "candidate_labels": candidate_labels}
        elif isinstance(image, (list, tuple)) and valid_images(image):
            # A list/tuple of images paired positionally with a list of label sets.
            return list(
                super().__call__(
                    ({"image": img, "candidate_labels": labels} for img, labels in zip(image, candidate_labels)),
                    **kwargs,
                )
            )
        else:
            # Supports the following formats:
            # - {"image": image, "candidate_labels": candidate_labels}
            # - [{"image": image, "candidate_labels": candidate_labels}]
            # - Generators and datasets
            # This is a common pattern in other multimodal pipelines, so we support it here as well.
            inputs = image

        results = super().__call__(inputs, **kwargs)
        return results

    def _sanitize_parameters(self, **kwargs):
        """Split pipeline kwargs into (preprocess, forward, postprocess) parameter dicts."""
        preprocess_params = {}
        if "timeout" in kwargs:
            preprocess_params["timeout"] = kwargs["timeout"]
        postprocess_params = {}
        if "threshold" in kwargs:
            postprocess_params["threshold"] = kwargs["threshold"]
        if "top_k" in kwargs:
            postprocess_params["top_k"] = kwargs["top_k"]
        return preprocess_params, {}, postprocess_params

    def preprocess(self, inputs, timeout=None):
        """Yield one model-input dict per candidate label for a single image (chunked processing)."""
        image = load_image(inputs["image"], timeout=timeout)
        candidate_labels = inputs["candidate_labels"]
        # A comma-separated string is accepted as a shorthand for a list of labels.
        if isinstance(candidate_labels, str):
            candidate_labels = candidate_labels.split(",")

        # Original (height, width), used by postprocess to rescale boxes back to image size.
        target_size = torch.tensor([[image.height, image.width]], dtype=torch.int32)
        # The image features do not depend on the candidate label, so compute them once
        # outside the loop instead of re-running the image processor per label.
        image_features = self.image_processor(image, return_tensors=self.framework)
        if self.framework == "pt":
            image_features = image_features.to(self.torch_dtype)
        for i, candidate_label in enumerate(candidate_labels):
            text_inputs = self.tokenizer(candidate_label, return_tensors=self.framework)
            yield {
                # `is_last` lets the ChunkPipeline know when all chunks of one image are done.
                "is_last": i == len(candidate_labels) - 1,
                "target_size": target_size,
                "candidate_label": candidate_label,
                **text_inputs,
                **image_features,
            }

    def _forward(self, model_inputs):
        """Run the model on one (image, candidate_label) chunk, carrying bookkeeping fields through."""
        # Pop bookkeeping keys so only genuine model inputs are forwarded.
        target_size = model_inputs.pop("target_size")
        candidate_label = model_inputs.pop("candidate_label")
        is_last = model_inputs.pop("is_last")

        outputs = self.model(**model_inputs)

        model_outputs = {"target_size": target_size, "candidate_label": candidate_label, "is_last": is_last, **outputs}
        return model_outputs

    def postprocess(self, model_outputs, threshold=0.1, top_k=None):
        """Convert per-label model outputs into a score-sorted list of {score, label, box} dicts."""
        results = []
        for model_output in model_outputs:
            label = model_output["candidate_label"]
            model_output = BaseModelOutput(model_output)
            outputs = self.image_processor.post_process_object_detection(
                outputs=model_output, threshold=threshold, target_sizes=model_output["target_size"]
            )[0]

            # `nonzero()` keeps only detections whose score survived the threshold.
            for index in outputs["scores"].nonzero():
                score = outputs["scores"][index].item()
                box = self._get_bounding_box(outputs["boxes"][index][0])

                result = {"score": score, "label": label, "box": box}
                results.append(result)

        results = sorted(results, key=lambda x: x["score"], reverse=True)
        if top_k:
            results = results[:top_k]

        return results

    def _get_bounding_box(self, box: "torch.Tensor") -> Dict[str, int]:
        """
        Turns list [xmin, ymin, xmax, ymax] into dict { "xmin": xmin, ... }

        Args:
            box (`torch.Tensor`): Tensor containing the coordinates in corners format.

        Returns:
            bbox (`Dict[str, int]`): Dict containing the coordinates in corners format.
        """
        if self.framework != "pt":
            raise ValueError("The ZeroShotObjectDetectionPipeline is only available in PyTorch.")
        xmin, ymin, xmax, ymax = box.int().tolist()
        bbox = {
            "xmin": xmin,
            "ymin": ymin,
            "xmax": xmax,
            "ymax": ymax,
        }
        return bbox
+ +from .trainer_sm import SageMakerTrainer +from .training_args_sm import SageMakerTrainingArguments, is_sagemaker_dp_enabled diff --git a/.venv/lib/python3.11/site-packages/transformers/sagemaker/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/sagemaker/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..099e01a2157fc76f0966eba131749abed573c936 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/sagemaker/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/transformers/sagemaker/__pycache__/trainer_sm.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/sagemaker/__pycache__/trainer_sm.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a88bfd9fa5aaca8b9e4ab2a9039c24821a8f6931 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/sagemaker/__pycache__/trainer_sm.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/transformers/sagemaker/__pycache__/training_args_sm.cpython-311.pyc b/.venv/lib/python3.11/site-packages/transformers/sagemaker/__pycache__/training_args_sm.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..364e39f0f6299f4340252c5fea617553c18a8087 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/transformers/sagemaker/__pycache__/training_args_sm.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/transformers/sagemaker/trainer_sm.py b/.venv/lib/python3.11/site-packages/transformers/sagemaker/trainer_sm.py new file mode 100644 index 0000000000000000000000000000000000000000..6ab4e01acdbcd3ade1afc2339a75850bc538bd7a --- /dev/null +++ b/.venv/lib/python3.11/site-packages/transformers/sagemaker/trainer_sm.py @@ -0,0 +1,30 @@ +# Copyright 2021 The HuggingFace Team. All rights reserved. 
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import warnings

from ..trainer import Trainer
from ..utils import logging


logger = logging.get_logger(__name__)


class SageMakerTrainer(Trainer):
    """Deprecated alias for `Trainer`, kept only for backward compatibility with SageMaker scripts."""

    def __init__(self, args=None, **kwargs):
        # Behaves exactly like `Trainer`; the only addition is the deprecation warning.
        deprecation_message = (
            "`SageMakerTrainer` is deprecated and will be removed in v5 of Transformers. "
            "You can use `Trainer` instead."
        )
        warnings.warn(deprecation_message, FutureWarning)
        super().__init__(args=args, **kwargs)
import importlib.util
import json
import os
import warnings
from dataclasses import dataclass, field

import torch

from ..training_args import TrainingArguments
from ..utils import cached_property, is_sagemaker_dp_enabled, logging


logger = logging.get_logger(__name__)

# TODO: should be moved to `utils` after refactoring of SageMakerTrainer


def is_sagemaker_model_parallel_available():
    """Return True if SageMaker model parallelism is configured and `smdistributed` is installed.

    Checks the SageMaker-injected environment variables `SM_HP_MP_PARAMETERS` and
    `SM_FRAMEWORK_PARAMS`; any missing/malformed JSON or missing flag means "not available".
    """
    # Get the sagemaker specific mp parameters from smp_options variable.
    smp_options = os.getenv("SM_HP_MP_PARAMETERS", "{}")
    try:
        # Parse it and check the field "partitions" is included, it is required for model parallel.
        smp_options = json.loads(smp_options)
        if "partitions" not in smp_options:
            return False
    except json.JSONDecodeError:
        return False

    # Get the sagemaker specific framework parameters from mpi_options variable.
    mpi_options = os.getenv("SM_FRAMEWORK_PARAMS", "{}")
    try:
        # Parse it and check the field "sagemaker_distributed_dataparallel_enabled".
        mpi_options = json.loads(mpi_options)
        if not mpi_options.get("sagemaker_mpi_enabled", False):
            return False
    except json.JSONDecodeError:
        return False
    # Lastly, check if the `smdistributed` module is present.
    return importlib.util.find_spec("smdistributed") is not None


# Import-time side effect: when model parallelism is configured, initialize smp once
# so that `smp.local_rank()` / `smp.dp_size()` below can be used.
if is_sagemaker_model_parallel_available():
    import smdistributed.modelparallel.torch as smp

    smp.init()


@dataclass
class SageMakerTrainingArguments(TrainingArguments):
    """Deprecated `TrainingArguments` subclass with SageMaker-specific device setup."""

    # Opaque string passed by the SageMaker launcher; parsed elsewhere, ignored by the trainer.
    mp_parameters: str = field(
        default="",
        metadata={"help": "Used by the SageMaker launcher to send mp-specific args. Ignored in SageMakerTrainer"},
    )

    def __post_init__(self):
        super().__post_init__()
        warnings.warn(
            "`SageMakerTrainingArguments` is deprecated and will be removed in v5 of Transformers. You can use "
            "`TrainingArguments` instead.",
            FutureWarning,
        )

    @cached_property
    def _setup_devices(self) -> "torch.device":
        """Pick the torch device (and init the distributed backend) for this process.

        Priority order: no_cuda (CPU) > SageMaker model parallel > SageMaker data
        parallel (smddp) > single-process CUDA/CPU > generic torch DDP (nccl).
        Also sets `self._n_gpu` as a side effect; cached after the first call.
        """
        logger.info("PyTorch: setting up devices")
        if torch.distributed.is_available() and torch.distributed.is_initialized() and self.local_rank == -1:
            logger.warning(
                "torch.distributed process group is initialized, but local_rank == -1. "
                "In order to use Torch DDP, launch your script with `python -m torch.distributed.launch"
            )
        if self.no_cuda:
            device = torch.device("cpu")
            self._n_gpu = 0
        elif is_sagemaker_model_parallel_available():
            # smp assigns each process a local GPU; one visible GPU per process.
            local_rank = smp.local_rank()
            device = torch.device("cuda", local_rank)
            self._n_gpu = 1
        elif is_sagemaker_dp_enabled():
            # Importing torch_smddp registers the "smddp" backend with torch.distributed.
            import smdistributed.dataparallel.torch.torch_smddp  # noqa: F401

            torch.distributed.init_process_group(backend="smddp", timeout=self.ddp_timeout_delta)
            # SageMaker DP provides the local rank via this env var.
            self.local_rank = int(os.getenv("SMDATAPARALLEL_LOCAL_RANK"))
            device = torch.device("cuda", self.local_rank)
            self._n_gpu = 1
        elif self.local_rank == -1:
            # if n_gpu is > 1 we'll use nn.DataParallel.
            # If you only want to use a specific subset of GPUs use `CUDA_VISIBLE_DEVICES=0`
            # Explicitly set CUDA to the first (index 0) CUDA device, otherwise `set_device` will
            # trigger an error that a device index is missing. Index 0 takes into account the
            # GPUs available in the environment, so `CUDA_VISIBLE_DEVICES=1,2` with `cuda:0`
            # will use the first GPU in that env, i.e. GPU#1
            device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
            # Sometimes the line in the postinit has not been run before we end up here, so just checking we're not at
            # the default value.
            self._n_gpu = torch.cuda.device_count()
        else:
            # Here, we'll use torch.distributed.
            # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
            if not torch.distributed.is_initialized():
                torch.distributed.init_process_group(backend="nccl", timeout=self.ddp_timeout_delta)
            device = torch.device("cuda", self.local_rank)
            self._n_gpu = 1

        if device.type == "cuda":
            torch.cuda.set_device(device)

        return device

    @property
    def world_size(self):
        """Total number of data-parallel processes (smp's dp_size under model parallelism)."""
        if is_sagemaker_model_parallel_available():
            return smp.dp_size()

        return super().world_size

    @property
    def place_model_on_device(self):
        # Under model parallelism, smp handles model placement itself.
        return not is_sagemaker_model_parallel_available()

    @property
    def _no_sync_in_gradient_accumulation(self):
        # Gradient-accumulation no_sync is disabled in the SageMaker setup.
        return False