# (stray export residue removed: "Spaces: / Paused / Paused")
| ##################################################### | |
| ### DOCUMENT PROCESSOR [MODELS] | |
| ##################################################### | |
| # Jonathan Wang | |
| # ABOUT: | |
| # This project creates an app to chat with PDFs. | |
| # This is the LANGUAGE MODELS | |
| # that are used in the document reader. | |
| ##################################################### | |
| ## TODOS: | |
| # <!> Add support for vLLM / AWQ / GPTQ models. (probably not going to be done due to lack of attention scores) | |
| # Add KTransformers backend? | |
| # https://github.com/kvcache-ai/ktransformers | |
| # https://github.com/Tada-AI/pdf_parser | |
| ##################################################### | |
| ## IMPORTS: | |
| from __future__ import annotations | |
| import gc | |
| import logging | |
| import sys | |
| from typing import ( | |
| Any, | |
| Callable, | |
| Dict, | |
| List, | |
| Optional, | |
| Protocol, | |
| Sequence, | |
| Union, | |
| cast, | |
| runtime_checkable, | |
| ) | |
| import streamlit as st | |
| import torch | |
| from llama_index.core.base.embeddings.base import BaseEmbedding | |
| from llama_index.core.base.llms.base import BaseLLM | |
| from llama_index.core.base.llms.generic_utils import ( | |
| messages_to_prompt as generic_messages_to_prompt, | |
| ) | |
| from llama_index.core.base.llms.types import ( | |
| ChatMessage, | |
| ChatResponse, | |
| ChatResponseGen, | |
| CompletionResponse, | |
| CompletionResponseGen, | |
| LLMMetadata, | |
| MessageRole, | |
| ) | |
from llama_index.core.bridge.pydantic import Field, PrivateAttr, WithJsonSchema
| from llama_index.core.callbacks import CallbackManager | |
| from llama_index.core.constants import DEFAULT_CONTEXT_WINDOW, DEFAULT_NUM_OUTPUTS | |
| from llama_index.core.llms.callbacks import ( | |
| llm_chat_callback, | |
| llm_completion_callback, | |
| ) | |
| from llama_index.core.multi_modal_llms import MultiModalLLM | |
| from llama_index.core.postprocessor import SentenceTransformerRerank | |
| from llama_index.core.prompts.base import PromptTemplate | |
| from llama_index.core.schema import ImageDocument, ImageNode | |
| from llama_index.core.types import BaseOutputParser, PydanticProgramMode | |
| from llama_index.embeddings.huggingface import HuggingFaceEmbedding | |
| from llama_index.llms.huggingface import HuggingFaceLLM | |
| from PIL import Image as PILImage | |
| from transformers import ( | |
| AutoImageProcessor, | |
| AutoModelForVision2Seq, | |
| AutoTokenizer, | |
| LogitsProcessor, | |
| QuantoConfig, | |
| StoppingCriteria, | |
| StoppingCriteriaList, | |
| ) | |
| from typing_extensions import Annotated | |
| # from wtpsplit import SaT # Sentence segmentation model. Dropping this. Requires adapters=0.2.1->Transformers=4.39.3 | Phi3 Vision requires Transformers 4.40.2 | |
| ## NOTE: Proposal for LAZY LOADING packages for running LLMS: | |
| # Currently not done because empahsis is on local inference w/ ability to get Attention Scores, which is not yet supported in non-HF Transformers methods. | |
| ## LLamacpp: | |
| # from llama_index.llms.llama_cpp import LlamaCPP | |
| # from llama_index.llms.llama_cpp.llama_utils import ( | |
| # messages_to_prompt, | |
| # completion_to_prompt | |
| # ) | |
| ## HF Transformers LLM: | |
| # from transformers import AutoTokenizer, BitsAndBytesConfig | |
| # from llama_index.llms.huggingface import HuggingFaceLLM | |
| ## GROQ | |
| # from llama_index.llms.groq import Groq | |
| ##################################################### | |
| ### SETTINGS: | |
# Default HF Hub id for the multimodal (vision) LLM; the only model this
# module's HuggingFaceMultiModalLLM wrapper is currently written against.
DEFAULT_HF_MULTIMODAL_LLM = "Salesforce/xgen-mm-phi3-mini-instruct-interleave-r-v1.5"
# Token budgets for the multimodal model: prompt (input) and generation (output).
DEFAULT_HF_MULTIMODAL_CONTEXT_WINDOW = 1024
DEFAULT_HF_MULTIMODAL_MAX_NEW_TOKENS = 1024
#####################################################
### CODE:
# Module-level logger; handlers/levels are configured by the host app.
logger = logging.getLogger(__name__)
def get_embedder(
    model_path: str = "mixedbread-ai/mxbai-embed-large-v1",
    device: str = "cuda",  # 'cpu' is unbearably slow
) -> BaseEmbedding:
    """Load the embedding model at `model_path` onto `device`.

    Thin convenience wrapper around `HuggingFaceEmbedding`; exists mainly so
    callers depend on this module rather than the llama-index class directly.
    """
    embedder = HuggingFaceEmbedding(model_path, device=device)
    return embedder
def get_reranker(
    model_path: str = "mixedbread-ai/mxbai-rerank-large-v1",
    top_n: int = 3,
    device: str = "cpu",  # 'cuda' if we were rich
) -> SentenceTransformerRerank:
    """Load the cross-encoder reranker at `model_path`.

    Returns a `SentenceTransformerRerank` (a concrete BaseNodePostprocessor)
    that keeps the `top_n` best nodes.
    """
    reranker = SentenceTransformerRerank(
        model=model_path,
        top_n=top_n,
        device=device,
    )
    return reranker
| ## LLM Options Below | |
| # def _get_llamacpp_llm( | |
| # model_path: str, | |
| # model_seed: int = 31415926, | |
| # model_temperature: float = 1e-64, # ideally 0, but HF-type doesn't allow that. # a good dev might use sys.float_info()['min'] | |
| # model_context_length: Optional[int] = 8192, | |
| # model_max_new_tokens: Optional[int] = 1024, | |
| # ) -> BaseLLM: | |
| # """Load a LlamaCPP model using GPU and other sane defaults.""" | |
| # # Lazy Loading | |
| # from llama_index.llms.llama_cpp import LlamaCPP | |
| # from llama_index.llms.llama_cpp.llama_utils import ( | |
| # messages_to_prompt, | |
| # completion_to_prompt | |
| # ) | |
| # # Arguments to Pass | |
| # llm = LlamaCPP( | |
| # model_path=model_path, | |
| # temperature=model_temperature, | |
| # max_new_tokens=model_max_new_tokens, | |
| # context_window=model_context_length, | |
| # # kwargs to pass to __call__() | |
| # generate_kwargs={'seed': model_seed}, # {'temperature': TEMPERATURE, 'top_p':0.7, 'min_p':0.1, 'seed': MODEL_SEED}, | |
| # # kwargs to pass to __init__() | |
| # # set to at least 1 to use GPU | |
| # model_kwargs={'n_gpu_layers': -1, 'n_threads': os.cpu_count()-1}, #, 'rope_freq_scale': 0.83, 'rope_freq_base': 20000}, | |
| # # transform inputs into model format | |
| # messages_to_prompt=messages_to_prompt, | |
| # completion_to_prompt=completion_to_prompt, | |
| # verbose=True, | |
| # ) | |
| # return (llm) | |
def _get_hf_llm(
    model_path: str,
    model_temperature: float = sys.float_info.min,  # smallest positive float ~= greedy
    model_context_length: int | None = 16384,
    model_max_new_tokens: int | None = 2048,
    hf_quant_level: int | None = 8,
) -> BaseLLM:
    """Load a HuggingFace-Transformers based model using sane defaults.

    Args:
        model_path: HF Hub id or local folder of the model.
        model_temperature: Sampling temperature. Values at or below
            `sys.float_info.min` are treated as greedy decoding.
        model_context_length: Max input tokens (None -> llama-index default).
        model_max_new_tokens: Max generated tokens (None -> llama-index default).
        hf_quant_level: Optional Quanto weight quantization level in bits
            (4 or 8); any other value loads the model unquantized.

    Returns:
        A llama-index compatible `HuggingFaceLLM`.
    """
    # The HF implementation complains about temperature == 0; clamp to the
    # smallest positive float instead.
    model_temperature = max(sys.float_info.min, model_temperature)

    # Optional weight-only quantization via Quanto.
    quanto_config = None  # NOTE: by default, no quantization.
    if hf_quant_level == 4:
        quanto_config = QuantoConfig(weights="int4")  # there's also 'int2' if you're crazy...
    elif hf_quant_level == 8:
        quanto_config = QuantoConfig(weights="int8")

    # Stopping tokens: Llama3-based models are /special/ and added a new
    # end-of-turn token alongside the regular EOS.
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    stopping_ids = [
        token_id
        for token_id in (
            tokenizer.eos_token_id,
            tokenizer.convert_tokens_to_ids("<|eot_id|>"),
        )
        # Non-Llama3 tokenizers may not define <|eot_id|> (or eos at all).
        if token_id is not None
    ]

    return HuggingFaceLLM(
        model_name=model_path,
        tokenizer_name=model_path,
        stopping_ids=stopping_ids,
        max_new_tokens=model_max_new_tokens or DEFAULT_NUM_OUTPUTS,
        context_window=model_context_length or DEFAULT_CONTEXT_WINDOW,
        tokenizer_kwargs={"trust_remote_code": True},
        model_kwargs={"trust_remote_code": True, "quantization_config": quanto_config},
        generate_kwargs={
            # BUGFIX: the original condition was inverted
            # (`not model_temperature > sys.float_info.min`), which disabled
            # sampling exactly when a non-greedy temperature was requested.
            "do_sample": model_temperature > sys.float_info.min,
            "temperature": model_temperature,
        },
        is_chat_model=True,
    )
def get_llm(
    model_path: str = "meta-llama/Meta-Llama-3.1-8B-Instruct",
    model_temperature: float = 0,  # HF backends reject exactly 0; clamped downstream.
    model_context_length: int | None = 8192,
    model_max_new_tokens: int | None = 1024,
    hf_quant_level: int | None = 8,  # 4-bit / 8-bit loading for HF models
) -> BaseLLM:
    """Load the LLM at `model_path` and wrap it as a llama-index `BaseLLM`.

    NOTE: Deliberately opinionated defaults, so it's probably not as flexible
    as some other dev would like. Everything currently routes through the
    HuggingFace-Transformers backend, since local inference with attention
    scores is the priority.
    """
    # TODO(Jonathan Wang): Consider non-HF-Transformers backends:
    #   - vLLM support for AWQ/GPTQ models
    #   - I guess reluctantly AutoAWQ and AutoGPTQ packages.
    #   - Exllamav2 is kinda dead IMO.
    # (A ".gguf" extension used to route to a LlamaCPP loader; see the
    #  commented-out `_get_llamacpp_llm` above.)
    return _get_hf_llm(
        model_path=model_path,
        model_temperature=model_temperature,
        model_context_length=model_context_length,
        model_max_new_tokens=model_max_new_tokens,
        hf_quant_level=hf_quant_level,
    )
| # @st.cache_resource | |
| # def get_llm() -> BaseLLM: | |
| # from llama_index.llms.groq import Groq | |
| # llm = Groq( | |
| # model='llama-3.1-8b-instant', # old: 'llama3-8b-8192' | |
| # api_key=os.environ.get('GROQ_API_KEY'), | |
| # ) | |
| # return (llm) | |
class EosLogitProcessor(LogitsProcessor):
    """Force EOS immediately after the model emits its <|end|> token.

    Special snowflake processor for the Salesforce XGen-MM (Phi3) vision model,
    whose <|end|> token differs from the tokenizer's EOS.
    """

    def __init__(self, eos_token_id: int, end_token_id: int):
        super().__init__()
        self.eos_token_id = eos_token_id
        self.end_token_id = end_token_id

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
        # Only act once at least one token has been generated.
        if input_ids.size(1) > 1:
            # Distribution placing all probability mass on the EOS token.
            eos_only_scores = torch.full(
                (scores.size(1),), -float("inf"), device=input_ids.device
            )
            eos_only_scores[self.eos_token_id] = 0
            # Rows whose most recent token is <|end|> must emit EOS next.
            just_ended = input_ids[:, -1] == self.end_token_id
            scores[just_ended] = eos_only_scores
        return scores
| # NOTE: These two protocols are needed to appease mypy | |
| # https://github.com/run-llama/llama_index/blob/5238b04c183119b3035b84e2663db115e63dcfda/llama-index-core/llama_index/core/llms/llm.py#L89 | |
# NOTE: These two protocols are needed to appease mypy
# https://github.com/run-llama/llama_index/blob/5238b04c183119b3035b84e2663db115e63dcfda/llama-index-core/llama_index/core/llms/llm.py#L89
class MessagesImagesToPromptType(Protocol):
    """Structural type for callables that render chat messages plus images into a prompt string."""
    def __call__(self, messages: Sequence[ChatMessage], images: Sequence[ImageDocument], **kwargs: Any) -> str:
        pass

# Pydantic-friendly alias for an optional messages+images formatter;
# `WithJsonSchema` makes it serialize as a plain string in the JSON schema
# instead of pydantic trying (and failing) to schema-ify a callable.
MessagesImagesToPromptCallable = Annotated[
    Optional[MessagesImagesToPromptType],
    WithJsonSchema({"type": "string"}),
]
| # https://huggingface.co/Salesforce/xgen-mm-phi3-mini-instruct-interleave-r-v1.5/blob/main/batch_inference.ipynb | |
class HuggingFaceMultiModalLLM(MultiModalLLM):
    """Wrapper around HuggingFace's Vision LLMs.

    Currently only supports one model type:
    `Salesforce/xgen-mm-phi3-mini-instruct-interleave-r-v1.5` (XGen-MM, Phi3-based).
    """

    model_name: str = Field(
        description='The multi-modal huggingface LLM to use. Currently only using Phi3.',
        default=DEFAULT_HF_MULTIMODAL_LLM
    )
    context_window: int = Field(
        default=DEFAULT_HF_MULTIMODAL_CONTEXT_WINDOW,
        description="The maximum number of tokens available for input.",
        gt=0,
    )
    max_new_tokens: int = Field(
        default=DEFAULT_HF_MULTIMODAL_MAX_NEW_TOKENS,
        description="The maximum number of tokens to generate.",
        gt=0,
    )
    system_prompt: str = Field(
        default="",
        description=(
            "The system prompt, containing any extra instructions or context. "
            "The model card on HuggingFace should specify if this is needed."
        ),
    )
    query_wrapper_prompt: PromptTemplate = Field(
        default=PromptTemplate("{query_str}"),
        description=(
            "The query wrapper prompt, containing the query placeholder. "
            "The model card on HuggingFace should specify if this is needed. "
            "Should contain a `{query_str}` placeholder."
        ),
    )
    tokenizer_name: str = Field(
        default=DEFAULT_HF_MULTIMODAL_LLM,
        description=(
            "The name of the tokenizer to use from HuggingFace. "
            "Unused if `tokenizer` is passed in directly."
        ),
    )
    processor_name: str = Field(
        default=DEFAULT_HF_MULTIMODAL_LLM,
        description=(
            "The name of the processor to use from HuggingFace. "
            "Unused if `processor` is passed in directly."
        ),
    )
    device_map: str = Field(
        default="auto", description="The device_map to use. Defaults to 'auto'."
    )
    stopping_ids: list[int] = Field(
        default_factory=list,
        description=(
            "The stopping ids to use. "
            "Generation stops when these token IDs are predicted."
        ),
    )
    tokenizer_outputs_to_remove: list = Field(
        default_factory=list,
        description=(
            "The outputs to remove from the tokenizer. "
            "Sometimes huggingface tokenizers return extra inputs that cause errors."
        ),
    )
    tokenizer_kwargs: dict = Field(
        default_factory=dict, description="The kwargs to pass to the tokenizer."
    )
    processor_kwargs: dict = Field(
        default_factory=dict, description="The kwargs to pass to the processor."
    )
    model_kwargs: dict = Field(
        default_factory=dict,
        description="The kwargs to pass to the model during initialization.",
    )
    generate_kwargs: dict = Field(
        default_factory=dict,
        description="The kwargs to pass to the model during generation.",
    )
    is_chat_model: bool = Field(
        default=False,
        description=(
            "Whether the model can have multiple messages passed at once, like the OpenAI chat API."
        ),
    )
    messages_images_to_prompt: MessagesImagesToPromptCallable = Field(
        default=generic_messages_to_prompt,
        description="A function that takes in a list of messages and images and returns a prompt string.",
    )

    # Private (non-pydantic) attributes holding the loaded HF objects.
    _model: Any = PrivateAttr()
    _tokenizer: Any = PrivateAttr()
    # TODO(Jonathan Wang): We need to add a separate field for AutoProcessor as opposed to ImageProcessors.
    _processor: Any = PrivateAttr()
    _stopping_criteria: Any = PrivateAttr()

    def __init__(
        self,
        context_window: int = DEFAULT_HF_MULTIMODAL_CONTEXT_WINDOW,
        max_new_tokens: int = DEFAULT_HF_MULTIMODAL_MAX_NEW_TOKENS,
        query_wrapper_prompt: Union[str, PromptTemplate] = "{query_str}",
        tokenizer_name: str = DEFAULT_HF_MULTIMODAL_LLM,
        processor_name: str = DEFAULT_HF_MULTIMODAL_LLM,
        model_name: str = DEFAULT_HF_MULTIMODAL_LLM,
        model: Any | None = None,
        tokenizer: Any | None = None,
        processor: Any | None = None,
        device_map: str = "auto",
        stopping_ids: list[int] | None = None,
        tokenizer_kwargs: dict[str, Any] | None = None,
        processor_kwargs: dict[str, Any] | None = None,
        tokenizer_outputs_to_remove: list[str] | None = None,
        model_kwargs: dict[str, Any] | None = None,
        generate_kwargs: dict[str, Any] | None = None,
        is_chat_model: bool = False,
        callback_manager: CallbackManager | None = None,
        system_prompt: str = "",
        messages_images_to_prompt: Callable[[Sequence[ChatMessage], Sequence[ImageDocument]], str] | None = None,
    ) -> None:
        """Load the vision model, processor, and tokenizer from HuggingFace
        (unless pre-loaded objects are passed in) and initialize the wrapper.
        """
        logger.info(f"CUDA Memory Pre-AutoModelForVision2Seq: {torch.cuda.mem_get_info()}")
        # Salesforce one is a AutoModelForVision2Seq, but not AutoCausalLM which is more common.
        model = model or AutoModelForVision2Seq.from_pretrained(
            model_name,
            device_map=device_map,
            trust_remote_code=True,
            **(model_kwargs or {})
        )
        logger.info(f"CUDA Memory Post-AutoModelForVision2Seq: {torch.cuda.mem_get_info()}")

        # Clamp context_window to the model's max input size.
        config_dict = model.config.to_dict()
        model_context_window = int(
            config_dict.get("max_position_embeddings", context_window)
        )
        if model_context_window < context_window:
            logger.warning(
                f"Supplied context_window {context_window} is greater "
                f"than the model's max input size {model_context_window}. "
                "Disable this warning by setting a lower context_window."
            )
            context_window = model_context_window

        processor_kwargs = processor_kwargs or {}
        if "max_length" not in processor_kwargs:
            processor_kwargs["max_length"] = context_window

        # NOTE: Sometimes models (phi-3) will use AutoProcessor and include the tokenizer within it.
        logger.info(f"CUDA Memory Pre-Processor: {torch.cuda.mem_get_info()}")
        processor = processor or AutoImageProcessor.from_pretrained(
            processor_name or model_name,
            trust_remote_code=True,
            **processor_kwargs
        )
        logger.info(f"CUDA Memory Post-Processor: {torch.cuda.mem_get_info()}")
        tokenizer = tokenizer or AutoTokenizer.from_pretrained(
            tokenizer_name or model_name,
            trust_remote_code=True,
            **(tokenizer_kwargs or {})
        )
        logger.info(f"CUDA Memory Post-Tokenizer: {torch.cuda.mem_get_info()}")

        # Tokenizer-Model disagreement
        if (hasattr(tokenizer, "name_or_path") and tokenizer.name_or_path != model_name):  # type: ignore (checked for attribute)
            # BUGFIX: this warning previously said "processor" while reporting
            # the tokenizer's name.
            logger.warning(
                f"The model `{model_name}` and tokenizer `{getattr(tokenizer, 'name_or_path', None)}` "
                f"are different, please ensure that they are compatible."
            )
        # Processor-Model disagreement
        if (hasattr(processor, "name_or_path") and getattr(processor, "name_or_path", None) != model_name):
            logger.warning(
                f"The model `{model_name}` and processor `{getattr(processor, 'name_or_path', None)}` "
                f"are different, please ensure that they are compatible."
            )

        # setup stopping criteria
        stopping_ids_list = stopping_ids or []

        class StopOnTokens(StoppingCriteria):
            # Stop generation once the latest predicted token is a stop id.
            def __call__(
                self,
                input_ids: torch.LongTensor,
                scores: torch.FloatTensor,
                **kwargs: Any,
            ) -> bool:
                return any(input_ids[0][-1] == stop_id for stop_id in stopping_ids_list)

        stopping_criteria = StoppingCriteriaList([StopOnTokens()])

        if isinstance(query_wrapper_prompt, str):
            query_wrapper_prompt = PromptTemplate(query_wrapper_prompt)

        messages_images_to_prompt = messages_images_to_prompt or self._processor_messages_to_prompt

        # Initiate standard LLM (pydantic fields get their defaults here;
        # they are overwritten with the real values below).
        super().__init__(
            callback_manager=callback_manager or CallbackManager([]),
        )
        logger.info(f"CUDA Memory Post-SuperInit: {torch.cuda.mem_get_info()}")

        # Initiate remaining fields
        self._model = model
        self._tokenizer = tokenizer
        self._processor = processor
        logger.info(f"CUDA Memory Post-Init: {torch.cuda.mem_get_info()}")
        self._stopping_criteria = stopping_criteria
        self.model_name = model_name  # (was previously assigned twice)
        self.context_window = context_window
        self.max_new_tokens = max_new_tokens
        self.system_prompt = system_prompt
        self.query_wrapper_prompt = query_wrapper_prompt
        self.tokenizer_name = tokenizer_name
        self.processor_name = processor_name
        self.device_map = device_map
        self.stopping_ids = stopping_ids or []
        self.tokenizer_outputs_to_remove = tokenizer_outputs_to_remove or []
        self.tokenizer_kwargs = tokenizer_kwargs or {}
        self.processor_kwargs = processor_kwargs or {}
        self.model_kwargs = model_kwargs or {}
        self.generate_kwargs = generate_kwargs or {}
        self.is_chat_model = is_chat_model
        self.messages_images_to_prompt = messages_images_to_prompt

    @classmethod
    def class_name(cls) -> str:
        """Class identifier used by llama-index serialization.

        BUGFIX: decorated as @classmethod — it takes `cls` but was previously
        declared as an instance method.
        """
        return "HuggingFace_MultiModal_LLM"

    @property
    def metadata(self) -> LLMMetadata:
        """LLM metadata.

        BUGFIX: exposed as a @property to match the `MultiModalLLM.metadata`
        interface (callers access `llm.metadata`, not `llm.metadata()`).
        """
        return LLMMetadata(
            context_window=self.context_window,
            num_output=self.max_new_tokens,
            model_name=self.model_name,
            is_chat_model=self.is_chat_model,
        )

    def _processor_messages_to_prompt(self, messages: Sequence[ChatMessage], images: Sequence[ImageDocument]) -> str:
        """Convert chat messages (and images) into a prompt for the multimodal LLM.

        Uses the tokenizer's chat template when available, falling back to the
        generic llama-index formatter otherwise.

        NOTE: For `Salesforce/xgen-mm-phi3-mini-instruct-interleave-r-v1.5` the
        `images` are actually ignored — no image placeholders are inserted. We
        assume for simplicity that the images are related (not the user
        bouncing between topics), so they are all sent at once.

        Args:
            messages (Sequence[ChatMessage]): Messages to convert (role + content each).
            images (Sequence[ImageDocument]): Images the user is passing (unused here).

        Returns:
            str: The prompt.
        """
        # TODO(Jonathan Wang): Make this work generically; currently built for Salesforce XGen-MM.
        if hasattr(self._tokenizer, "apply_chat_template"):
            messages_dict = [
                {"role": message.role.value, "content": message.content}
                for message in messages
            ]
            return self._tokenizer.apply_chat_template(
                messages_dict, tokenize=False, add_generation_prompt=True
            )
        return generic_messages_to_prompt(messages)

    def complete(
        self,
        prompt: str,
        image_documents: ImageNode | List[ImageNode] | ImageDocument | List[ImageDocument],  # ImageDocument inherits from ImageNode.
        formatted: bool = False,
        **kwargs: Any
    ) -> CompletionResponse:
        """Run one prompt plus image(s) through the vision model.

        Args:
            prompt: The user query; wrapped with `query_wrapper_prompt` unless
                `formatted` is True.
            image_documents: One or more images to condition on.
            formatted: Whether `prompt` is already fully formatted.

        Returns:
            CompletionResponse with the decoded text and raw output tokens.
        """
        # https://huggingface.co/Salesforce/xgen-mm-phi3-mini-instruct-interleave-r-v1.5/blob/main/demo.ipynb
        batch_image_list = []
        batch_image_sizes = []
        batch_prompt = []

        # Fix image_documents input typing.
        if not isinstance(image_documents, list):
            image_documents = [image_documents]
        image_documents = [cast(ImageDocument, image) for image in image_documents]  # we probably won't be using the Document features, so I think this is fine.

        # Convert input images into PIL images / pixel values for the model.
        image_list = []
        image_sizes = []
        for image in image_documents:
            # NOTE: ImageDocument inherits from ImageNode. We'll go extract the image.
            image_io = image.resolve_image()
            image_pil = PILImage.open(image_io)
            image_list.append(self._processor([image_pil], image_aspect_ratio='anyres')['pixel_values'].to(self._model.device))
            image_sizes.append(image_pil.size)
        # Single-sample "batch": all images belong to the one prompt.
        batch_image_list.append(image_list)
        batch_image_sizes.append(image_sizes)
        batch_prompt.append(prompt)  # only one question per image

        # Get the prompt (system prompt + wrapped query).
        if not formatted and self.query_wrapper_prompt:
            prompt = self.query_wrapper_prompt.format(query_str=prompt)
        prompt_sequence = []
        if self.system_prompt:
            prompt_sequence.append(ChatMessage(role=MessageRole.SYSTEM, content=self.system_prompt))
        prompt_sequence.append(ChatMessage(role=MessageRole.USER, content=prompt))
        prompt = self.messages_images_to_prompt(messages=prompt_sequence, images=image_documents)

        # Get the model input.
        batch_inputs = {
            "pixel_values": batch_image_list
        }
        language_inputs = self._tokenizer(
            [prompt],
            return_tensors="pt",
            padding='longest',  # probably not needed.
            max_length=self._tokenizer.model_max_length,
            truncation=True
        ).to(self._model.device)
        # TODO: why does the example cookbook have this weird conversion to Cuda instead of .to(device)?
        # language_inputs = {name: tensor.cuda() for name, tensor in language_inputs.items()}
        batch_inputs.update(language_inputs)
        gc.collect()
        torch.cuda.empty_cache()

        # remove keys from the tokenizer if needed, to avoid HF errors
        # TODO: this probably is broken and wouldn't work.
        for key in self.tokenizer_outputs_to_remove:
            if key in batch_inputs:
                batch_inputs.pop(key, None)

        # Get output.
        tokens = self._model.generate(
            **batch_inputs,
            image_sizes=batch_image_sizes,
            pad_token_id=self._tokenizer.pad_token_id,
            eos_token_id=self._tokenizer.eos_token_id,
            max_new_tokens=self.max_new_tokens,
            stopping_criteria=self._stopping_criteria,
            # NOTE: Special snowflake processor for Salesforce XGEN Phi3 Mini.
            logits_processor=[EosLogitProcessor(eos_token_id=self._tokenizer.eos_token_id, end_token_id=32007)],
            **self.generate_kwargs
        )
        gc.collect()
        torch.cuda.empty_cache()

        # NOTE: decode includes the prompt tokens; slice as below for completion-only.
        # completion_tokens = tokens[:, batch_inputs['input_ids'].shape[1]:]
        completion = self._tokenizer.batch_decode(
            tokens,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False
        )[0]
        gc.collect()
        torch.cuda.empty_cache()

        output = CompletionResponse(text=completion, raw={'model_output': tokens})

        # Clean stuff up: drop large intermediates before returning.
        del batch_image_list, batch_image_sizes, batch_inputs, tokens, completion
        gc.collect()
        torch.cuda.empty_cache()

        # Return the completion.
        return output

    def stream_complete(
        self, prompt: str, formatted: bool = False, **kwargs: Any
    ) -> CompletionResponseGen:
        raise NotImplementedError

    def chat(self, messages: Sequence[ChatMessage], **kwargs: Any) -> ChatResponse:
        raise NotImplementedError

    def stream_chat(self, messages: Sequence[ChatMessage], **kwargs: Any) -> ChatResponseGen:
        raise NotImplementedError

    async def acomplete(
        self,
        prompt: str,
        images: ImageNode | List[ImageNode],  # this also takes ImageDocument which inherits from ImageNode.
        formatted: bool = False,
        **kwargs: Any
    ) -> CompletionResponse:
        raise NotImplementedError

    async def astream_complete(
        self, prompt: str, formatted: bool = False, **kwargs: Any
    ) -> CompletionResponseGen:
        raise NotImplementedError

    async def achat(self, messages: Sequence[ChatMessage], **kwargs: Any) -> ChatResponse:
        raise NotImplementedError

    async def astream_chat(self, messages: Sequence[ChatMessage], **kwargs: Any) -> ChatResponseGen:
        raise NotImplementedError
| # @st.cache_resource() | |
| # def get_multimodal_llm(**kwargs) -> MultiModalLLM: | |
| # vision_llm = OpenAIMultiModal( | |
| # model='gpt-4o-mini', | |
| # temperature=0, | |
| # max_new_tokens=512, | |
| # image_detail='auto' | |
| # ) | |
| # return (vision_llm) | |
def get_multimodal_llm(
    model_name: str = DEFAULT_HF_MULTIMODAL_LLM,
    device_map: str = "cuda",  # does not support 'auto'
    processor_kwargs: dict[str, Any] | None = None,
    model_kwargs: dict[str, Any] | None = None,  # e.g. {'torch_dtype': torch.bfloat16} or {'torch_dtype': torch.float8_e5m2}
    generate_kwargs: dict[str, Any] | None = None,  # from the example cookbook
    hf_quant_level: int | None = 8,
) -> HuggingFaceMultiModalLLM:
    """Build the multimodal (vision) LLM wrapper with opinionated defaults.

    `hf_quant_level` of 4 or 8 enables Quanto weight-only quantization; any
    other value loads the model unquantized.
    """
    # Fill in default kwargs.
    processor_kwargs = {} if processor_kwargs is None else processor_kwargs
    model_kwargs = {} if model_kwargs is None else model_kwargs
    if generate_kwargs is None:
        generate_kwargs = {
            "temperature": sys.float_info.min,
            "top_p": None,
            "num_beams": 1,
            # NOTE: we hack in EosLogitProcessor in the HuggingFaceMultiModalLLM
            # because it allows us to get the tokenizer.eos_token_id
        }

    # Get Quantization with Quanto (default: none).
    quanto_config = None
    if hf_quant_level == 4:
        quanto_config = QuantoConfig(weights="int4")  # there's also 'int2' if you're crazy...
    elif hf_quant_level == 8:
        quanto_config = QuantoConfig(weights="int8")
    if quanto_config is not None:
        model_kwargs["quantization_config"] = quanto_config

    return HuggingFaceMultiModalLLM(
        model_name=model_name,
        device_map=device_map,
        processor_kwargs=processor_kwargs,
        model_kwargs=model_kwargs,
        generate_kwargs=generate_kwargs,
        max_new_tokens=1024  # from the example cookbook
    )