import io
import os
import time
from threading import Thread

import requests
import streamlit as st
import torch  # BUG FIX: was commented out, but torch.device() below requires it
from PIL import Image
from transformers import (
    AutoModelForImageTextToText,
    AutoProcessor,
    QuantoConfig,
    TextIteratorStreamer,
)

# Run on GPU when available; also used to move tokenized inputs to the model.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# End-of-turn marker emitted by the idefics2 chat template; it is stripped
# from the streamed text before display (it is 18 characters long, which is
# where the original's magic `[:-18]` slice came from).
END_OF_UTTERANCE = "<end_of_utterance>"


def reduce_image_size(img, scale_percent=50):
    """Return *img* resized to ``scale_percent`` percent of its dimensions.

    Args:
        img: a ``PIL.Image.Image`` to shrink.
        scale_percent: percentage of the original width/height to keep
            (default 50, i.e. half size in each dimension).

    Returns:
        A new resized ``PIL.Image.Image``; the input image is not modified.
    """
    width, height = img.size
    new_width = int(width * scale_percent / 100)
    new_height = int(height * scale_percent / 100)
    return img.resize((new_width, new_height))


def model_inference(user_prompt, chat_history, max_new_tokens, images):
    """Stream a model response for *user_prompt* grounded in *images*.

    Relies on the module-level ``model`` and ``processor`` globals set up by
    :func:`main`.

    Args:
        user_prompt: the user's question as a plain string.
        chat_history: list of chat-template message dicts; the new user turn
            is appended to it in place.
        max_new_tokens: generation budget forwarded to ``model.generate``.
        images: image (or images) forwarded to the processor.

    Yields:
        The accumulated generated text after each streamed token, suitable
        for live display in the UI.
    """
    chat_history.append(
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": user_prompt},
            ],
        }
    )

    streamer = TextIteratorStreamer(
        processor.tokenizer, skip_prompt=True, timeout=5.0
    )
    generation_args = {
        "max_new_tokens": max_new_tokens,
        "streamer": streamer,
        "do_sample": False,
    }

    prompt = processor.apply_chat_template(chat_history, add_generation_prompt=True)
    inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
    generation_args.update(inputs)

    # generate() blocks until completion, so run it in a worker thread and
    # consume the streamer incrementally from this one.
    thread = Thread(target=model.generate, kwargs=generation_args)
    thread.start()

    acc_text = ""
    for text_token in streamer:
        time.sleep(0.04)  # brief pause so the UI repaints smoothly
        acc_text += text_token
        # BUG FIX: the original tested endswith("") — always true — and then
        # unconditionally chopped 18 characters off every chunk. Strip the
        # end-of-utterance marker only when it is actually present.
        if acc_text.endswith(END_OF_UTTERANCE):
            acc_text = acc_text[: -len(END_OF_UTTERANCE)]
        yield acc_text
    thread.join()


def main():
    """Streamlit entry point: collect text + image and stream a prediction."""
    st.title("Text and Image Input App")

    # Load the model and processor once per session and cache them in
    # session_state; publish them as module globals for model_inference().
    global model, processor
    if "model" not in st.session_state:
        model_id = "HuggingFaceM4/idefics2-8b"
        quantization_config = QuantoConfig(weights="int8")
        processor = AutoProcessor.from_pretrained(model_id)
        model = AutoModelForImageTextToText.from_pretrained(
            model_id,
            device_map="cuda",
            quantization_config=quantization_config,
        )
        st.session_state["model"] = model
        st.session_state["processor"] = processor
    model = st.session_state["model"]
    processor = st.session_state["processor"]

    text_input = st.text_input("Enter your text:")

    # BUG FIX: processed_image was referenced before assignment whenever the
    # user clicked Predict with neither an upload nor a URL; initialize it
    # explicitly so the truthiness check below is safe.
    processed_image = None

    image_input = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])
    if image_input is not None:
        image = Image.open(image_input)
        st.image(image, caption='Uploaded Image')
        processed_image = reduce_image_size(image)
    else:
        image_url = st.text_input("Enter image URL:")
        if image_url:
            # BUG FIX: add a timeout (the original could hang forever) and
            # surface HTTP errors instead of letting PIL fail on an error page.
            response = requests.get(image_url, timeout=30)
            response.raise_for_status()
            img = Image.open(io.BytesIO(response.content))
            st.image(img, caption='Image from URL')
            processed_image = reduce_image_size(img)

    if st.button("Predict"):
        if text_input and processed_image:
            # BUG FIX: forward the user's actual question (the original
            # collected text_input but hard-coded the prompt), and render
            # the stream (the original built the generator and never
            # consumed it, so nothing was ever displayed).
            prediction = model_inference(
                user_prompt=text_input,
                chat_history=[],  # fresh conversation per prediction
                max_new_tokens=100,
                images=processed_image,
            )
            st.write_stream(prediction)


if __name__ == "__main__":
    # BUG FIX: main() was never invoked, so the app rendered nothing.
    # Under `streamlit run` the script executes as __main__, so this fires.
    main()