IsraelSalgado committed on
Commit
2134a77
·
verified ·
1 Parent(s): 251c587

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +106 -106
app.py CHANGED
@@ -1,107 +1,107 @@
1
- import time
2
- from transformers import TextIteratorStreamer
3
- from threading import Thread
4
- import os
5
- from transformers import AutoModelForImageTextToText, QuantoConfig
6
- from PIL import Image
7
- import io
8
- import requests
9
- from transformers import AutoProcessor, AutoModelForImageTextToText
10
- import torch
11
- import streamlit as st
12
-
13
-
14
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
15
-
16
-
17
- def reduce_image_size(img, scale_percent=50):
18
- """Reduces the image size by a specified percentage."""
19
- width, height = img.size
20
- new_width = int(width * scale_percent / 100)
21
- new_height = int(height * scale_percent / 100)
22
- resized_img = img.resize((new_width, new_height))
23
- return resized_img
24
-
25
-
26
- def model_inference(
27
- user_prompt, chat_history, max_new_tokens, images
28
- ):
29
- """Performs model inference using the provided inputs."""
30
- user_prompt = {
31
- "role": "user",
32
- "content": [
33
- {"type": "image"},
34
- {"type": "text", "text": user_prompt},
35
- ],
36
- }
37
- chat_history.append(user_prompt)
38
- streamer = TextIteratorStreamer(
39
- processor.tokenizer, skip_prompt=True, timeout=5.0
40
- )
41
-
42
- generation_args = {
43
- "max_new_tokens": max_new_tokens,
44
- "streamer": streamer,
45
- "do_sample": False,
46
- }
47
-
48
- prompt = processor.apply_chat_template(chat_history, add_generation_prompt=True)
49
- inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
50
- generation_args.update(inputs)
51
-
52
- thread = Thread(target=model.generate, kwargs=generation_args)
53
- thread.start()
54
-
55
- acc_text = ""
56
- for text_token in streamer:
57
- time.sleep(0.04)
58
- acc_text += text_token
59
- if acc_text.endswith("<end_of_utterance>"):
60
- acc_text = acc_text[:-18]
61
- yield acc_text
62
-
63
- thread.join()
64
-
65
- def main():
66
- """Main function of the Streamlit app."""
67
- st.title("Text and Image Input App")
68
-
69
- # Load the model and processor outside the loop (once)
70
- global model, processor
71
- if "model" not in st.session_state:
72
- model_id = "HuggingFaceM4/idefics2-8b"
73
- quantization_config = QuantoConfig(weights="int8")
74
- processor = AutoProcessor.from_pretrained(model_id)
75
- model = AutoModelForImageTextToText.from_pretrained(
76
- model_id, device_map="cuda", quantization_config=quantization_config
77
- )
78
- st.session_state["model"] = model
79
- st.session_state["processor"] = processor
80
-
81
- model = st.session_state["model"]
82
- processor = st.session_state["processor"]
83
-
84
- # Get text input
85
- text_input = st.text_input("Enter your text:")
86
-
87
- # Get image input
88
- image_input = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])
89
- if image_input is not None:
90
- image = Image.open(image_input)
91
- st.image(image, caption='Uploaded Image')
92
- processed_image = reduce_image_size(image)
93
- else:
94
- image_url = st.text_input("Enter image URL:")
95
- if image_url:
96
- response = requests.get(image_url)
97
- img = Image.open(io.BytesIO(response.content))
98
- st.image(img, caption='Image from URL')
99
- processed_image = reduce_image_size(img)
100
-
101
- if st.button("Predict"):
102
- if text_input and processed_image:
103
- prediction = model_inference(
104
- user_prompt="And what is in this image?",
105
- chat_history=[], # Initialize chat history here
106
- max_new_tokens=100,
107
  images=processed_image)
 
1
import io
import os
import time
from threading import Thread

import requests
import streamlit as st
import torch  # required below by torch.device(...); must not be commented out
from PIL import Image
from transformers import (
    AutoModelForImageTextToText,
    AutoProcessor,
    QuantoConfig,
    TextIteratorStreamer,
)
12
+
13
+
14
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
15
+
16
+
17
+ def reduce_image_size(img, scale_percent=50):
18
+ """Reduces the image size by a specified percentage."""
19
+ width, height = img.size
20
+ new_width = int(width * scale_percent / 100)
21
+ new_height = int(height * scale_percent / 100)
22
+ resized_img = img.resize((new_width, new_height))
23
+ return resized_img
24
+
25
+
26
+ def model_inference(
27
+ user_prompt, chat_history, max_new_tokens, images
28
+ ):
29
+ """Performs model inference using the provided inputs."""
30
+ user_prompt = {
31
+ "role": "user",
32
+ "content": [
33
+ {"type": "image"},
34
+ {"type": "text", "text": user_prompt},
35
+ ],
36
+ }
37
+ chat_history.append(user_prompt)
38
+ streamer = TextIteratorStreamer(
39
+ processor.tokenizer, skip_prompt=True, timeout=5.0
40
+ )
41
+
42
+ generation_args = {
43
+ "max_new_tokens": max_new_tokens,
44
+ "streamer": streamer,
45
+ "do_sample": False,
46
+ }
47
+
48
+ prompt = processor.apply_chat_template(chat_history, add_generation_prompt=True)
49
+ inputs = processor(text=prompt, images=images, return_tensors="pt").to(device)
50
+ generation_args.update(inputs)
51
+
52
+ thread = Thread(target=model.generate, kwargs=generation_args)
53
+ thread.start()
54
+
55
+ acc_text = ""
56
+ for text_token in streamer:
57
+ time.sleep(0.04)
58
+ acc_text += text_token
59
+ if acc_text.endswith("<end_of_utterance>"):
60
+ acc_text = acc_text[:-18]
61
+ yield acc_text
62
+
63
+ thread.join()
64
+
65
+ def main():
66
+ """Main function of the Streamlit app."""
67
+ st.title("Text and Image Input App")
68
+
69
+ # Load the model and processor outside the loop (once)
70
+ global model, processor
71
+ if "model" not in st.session_state:
72
+ model_id = "HuggingFaceM4/idefics2-8b"
73
+ quantization_config = QuantoConfig(weights="int8")
74
+ processor = AutoProcessor.from_pretrained(model_id)
75
+ model = AutoModelForImageTextToText.from_pretrained(
76
+ model_id, device_map="cuda", quantization_config=quantization_config
77
+ )
78
+ st.session_state["model"] = model
79
+ st.session_state["processor"] = processor
80
+
81
+ model = st.session_state["model"]
82
+ processor = st.session_state["processor"]
83
+
84
+ # Get text input
85
+ text_input = st.text_input("Enter your text:")
86
+
87
+ # Get image input
88
+ image_input = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])
89
+ if image_input is not None:
90
+ image = Image.open(image_input)
91
+ st.image(image, caption='Uploaded Image')
92
+ processed_image = reduce_image_size(image)
93
+ else:
94
+ image_url = st.text_input("Enter image URL:")
95
+ if image_url:
96
+ response = requests.get(image_url)
97
+ img = Image.open(io.BytesIO(response.content))
98
+ st.image(img, caption='Image from URL')
99
+ processed_image = reduce_image_size(img)
100
+
101
+ if st.button("Predict"):
102
+ if text_input and processed_image:
103
+ prediction = model_inference(
104
+ user_prompt="And what is in this image?",
105
+ chat_history=[], # Initialize chat history here
106
+ max_new_tokens=100,
107
  images=processed_image)