Spaces:
Build error
Build error
import io
import os
import time
from threading import Thread

import requests
import streamlit as st
# BUG FIX: torch was commented out but torch.device(...) below uses it,
# which makes the module fail to import (the Space's "Build error").
import torch
from PIL import Image
from transformers import (
    AutoModelForImageTextToText,
    AutoProcessor,
    QuantoConfig,
    TextIteratorStreamer,
)

# Device used to move processor inputs; the model itself is placed with
# device_map="cuda" at load time.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def reduce_image_size(img, scale_percent=50):
    """Return *img* resized to ``scale_percent`` percent of its dimensions.

    Parameters
    ----------
    img : PIL.Image.Image
        Image to shrink (only ``.size`` and ``.resize`` are used).
    scale_percent : int, optional
        Percentage of the original width/height to keep (default 50).

    Returns
    -------
    The resized image returned by ``img.resize``.

    Both target dimensions are clamped to at least 1 pixel so that very
    small inputs never produce an invalid zero-sized resize target.
    """
    width, height = img.size
    new_width = max(1, int(width * scale_percent / 100))
    new_height = max(1, int(height * scale_percent / 100))
    return img.resize((new_width, new_height))
def model_inference(user_prompt, chat_history, max_new_tokens, images):
    """Stream a model response for *user_prompt* about *images*.

    Appends the user turn to *chat_history* (mutated in place), launches
    ``model.generate`` on a background thread, and yields the accumulated
    response text after each streamed token so a UI can render it live.

    Relies on the module-level globals ``model``, ``processor`` and
    ``device`` being initialized (done in ``main``).

    Parameters
    ----------
    user_prompt : str
        The user's text question.
    chat_history : list
        Chat-template message list; the new user turn is appended to it.
    max_new_tokens : int
        Generation budget passed to ``model.generate``.
    images :
        Image(s) forwarded to the processor alongside the prompt.

    Yields
    ------
    str
        The full response accumulated so far, with the model's
        ``<end_of_utterance>`` marker stripped.
    """
    chat_history.append(
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": user_prompt},
            ],
        }
    )
    # skip_prompt=True so only newly generated tokens are streamed;
    # timeout guards against a hung generation thread.
    streamer = TextIteratorStreamer(
        processor.tokenizer, skip_prompt=True, timeout=5.0
    )
    generation_args = {
        "max_new_tokens": max_new_tokens,
        "streamer": streamer,
        "do_sample": False,  # deterministic (greedy) decoding
    }
    prompt = processor.apply_chat_template(chat_history, add_generation_prompt=True)
    inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
    generation_args.update(inputs)

    thread = Thread(target=model.generate, kwargs=generation_args)
    thread.start()

    acc_text = ""
    try:
        for text_token in streamer:
            time.sleep(0.04)  # small delay for a smoother typewriter effect
            acc_text += text_token
            # BUG FIX: removesuffix replaces the magic `[:-18]` slice, which
            # silently depended on the marker being exactly 18 chars long.
            acc_text = acc_text.removesuffix("<end_of_utterance>")
            yield acc_text
    finally:
        # BUG FIX: join the worker even if the consumer abandons the
        # generator mid-stream (the original only joined on normal exit).
        thread.join()
def main():
    """Main function of the Streamlit app.

    Renders text/image inputs, lazily loads the Idefics2 model into
    ``st.session_state`` (so Streamlit reruns don't reload it), and
    streams the model's answer when the user clicks Predict.
    """
    st.title("Text and Image Input App")

    # Load the model and processor once; session_state survives reruns.
    global model, processor
    if "model" not in st.session_state:
        model_id = "HuggingFaceM4/idefics2-8b"
        quantization_config = QuantoConfig(weights="int8")
        st.session_state["processor"] = AutoProcessor.from_pretrained(model_id)
        st.session_state["model"] = AutoModelForImageTextToText.from_pretrained(
            model_id, device_map="cuda", quantization_config=quantization_config
        )
    model = st.session_state["model"]
    processor = st.session_state["processor"]

    # Get text input
    text_input = st.text_input("Enter your text:")

    # Get image input (upload preferred, URL as fallback)
    image_input = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])

    # BUG FIX: processed_image must always be bound, otherwise clicking
    # Predict with no image raised NameError in the original code.
    processed_image = None
    if image_input is not None:
        image = Image.open(image_input)
        st.image(image, caption='Uploaded Image')
        processed_image = reduce_image_size(image)
    else:
        image_url = st.text_input("Enter image URL:")
        if image_url:
            response = requests.get(image_url)
            img = Image.open(io.BytesIO(response.content))
            st.image(img, caption='Image from URL')
            processed_image = reduce_image_size(img)

    if st.button("Predict"):
        if text_input and processed_image:
            # BUG FIX: use the user's text (the condition above already
            # requires it) instead of a hard-coded prompt.
            prediction = model_inference(
                user_prompt=text_input,
                chat_history=[],  # fresh history per prediction
                max_new_tokens=100,
                images=processed_image,
            )
            # BUG FIX: the generator was never consumed, so nothing was
            # ever displayed; render the stream as it arrives.
            st.write_stream(prediction)