Maham930 committed on
Commit
bd901c8
·
verified ·
1 Parent(s): 2ba5249

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +108 -24
app.py CHANGED
@@ -1,54 +1,138 @@
1
  import streamlit as st
2
- from PIL import Image
3
  import torch
 
 
4
  from transformers import Blip2Processor, Blip2ForConditionalGeneration
5
 
6
# Configure the page; Streamlit requires this to be the first st.* call.
st.set_page_config(page_title="Multimodal Image Understanding AI")
7
 
8
@st.cache_resource
def load_model():
    """Load and cache the BLIP-2 FLAN-T5-XL processor and model.

    Wrapped in ``st.cache_resource`` so the multi-GB checkpoint is loaded
    once per server process instead of on every Streamlit rerun.

    Returns:
        tuple: ``(Blip2Processor, Blip2ForConditionalGeneration)``.
    """
    processor = Blip2Processor.from_pretrained(
        "Salesforce/blip2-flan-t5-xl"
    )
    # BUG FIX: the original unconditionally requested float16 weights and
    # device_map="auto", which fails on CPU-only hosts (fp16 ops are not
    # supported on CPU). Choose dtype/placement from actual hardware.
    has_gpu = torch.cuda.is_available()
    model = Blip2ForConditionalGeneration.from_pretrained(
        "Salesforce/blip2-flan-t5-xl",
        torch_dtype=torch.float16 if has_gpu else torch.float32,
        device_map="auto" if has_gpu else None,
    )
    return processor, model
19
 
 
20
# Materialize the cached processor/model pair at import time.
processor, model = load_model()

st.title("πŸ“Έ Multimodal Image Understanding & Storytelling")

# File uploader returns None until the user drops an image.
image = st.file_uploader("Upload an image", type=["jpg", "png"])
25
 
26
def ask(prompt, image):
    """Run one prompted BLIP-2 generation over *image* and return the text.

    Args:
        prompt: Instruction string for the model.
        image: A PIL RGB image.

    Returns:
        str: Decoded model output with special tokens stripped.
    """
    # BUG FIX: the original moved inputs to "cuda" unconditionally, which
    # raises RuntimeError on machines without a GPU. Pick the device that
    # is actually available at call time.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    inputs = processor(images=image, text=prompt, return_tensors="pt").to(device)
    with torch.no_grad():  # inference only — skip autograd bookkeeping
        out = model.generate(**inputs, max_new_tokens=200)
    return processor.decode(out[0], skip_special_tokens=True)
30
 
31
if image:
    img = Image.open(image).convert("RGB")
    st.image(img, use_column_width=True)

    # Section header → prompt, in display order.
    prompt_plan = [
        ("πŸ“ Caption", "Describe this image in one factual sentence."),
        ("πŸ“„ Summary", "Give a 3–5 line descriptive summary of this image."),
        ("πŸ“¦ Objects Detected", "List the main objects and entities visible in this image."),
        ("😊 Emotion / Mood", "What emotional tone or mood does this image convey?"),
        ("πŸ“– Story", "Write a short fictional story (5–10 lines) inspired by this image."),
    ]

    # Generate all answers first, then render, matching the original
    # compute-then-display order.
    answers = [(header, ask(prompt, img)) for header, prompt in prompt_plan]

    for header, text in answers:
        st.subheader(header)
        st.write(text)
 
 
 
 
1
  import streamlit as st
 
2
  import torch
3
+ from PIL import Image
4
+ import os
5
  from transformers import Blip2Processor, Blip2ForConditionalGeneration
6
 
7
+ # --------------------------------------------------
8
+ # STREAMLIT CONFIG (MUST BE FIRST STREAMLIT CALL)
9
+ # --------------------------------------------------
10
# Page setup — must precede every other Streamlit call in the script.
st.set_page_config(
    page_title="Multimodal Image Understanding AI",
    layout="centered",
)

st.write("πŸš€ App is starting...")  # Debug indicator

# Model configuration, sized for Hugging Face Spaces free hardware.
MODEL_NAME = "Salesforce/blip2-flan-t5-large"  # NOT XL
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Optional access token pulled from the environment (recommended on Spaces).
HF_TOKEN = os.getenv("HF_TOKEN")
24
+
25
+ # --------------------------------------------------
26
+ # LOAD MODEL (STREAMLIT-SAFE)
27
+ # --------------------------------------------------
28
@st.cache_resource(show_spinner="πŸ”„ Loading AI model (first time only)...")
def load_model():
    """Load and cache the BLIP-2 processor and model for this process.

    Uses half precision and automatic placement only when a GPU is
    present; falls back to float32 on CPU.

    Returns:
        tuple: ``(Blip2Processor, Blip2ForConditionalGeneration)``.
    """
    processor = Blip2Processor.from_pretrained(MODEL_NAME, token=HF_TOKEN)

    dtype = torch.float16 if DEVICE == "cuda" else torch.float32
    placement = "auto" if DEVICE == "cuda" else None
    model = Blip2ForConditionalGeneration.from_pretrained(
        MODEL_NAME,
        torch_dtype=dtype,
        device_map=placement,
        token=HF_TOKEN,
    )

    model.eval()
    return processor, model
44
 
45
+
46
# Build the cached processor/model pair once per server process.
processor, model = load_model()
47
 
48
+ # --------------------------------------------------
49
+ # HELPER FUNCTION
50
+ # --------------------------------------------------
51
def ask_model(prompt, image):
    """Answer *prompt* about *image* with one BLIP-2 generation pass.

    Args:
        prompt: Instruction string for the model.
        image: A PIL RGB image.

    Returns:
        str: Decoded output text, special tokens stripped.
    """
    batch = processor(
        images=image,
        text=prompt,
        return_tensors="pt",
    ).to(DEVICE)

    # Inference only — no gradients needed.
    with torch.no_grad():
        generated = model.generate(**batch, max_new_tokens=150)

    return processor.decode(generated[0], skip_special_tokens=True)
 
 
 
65
 
66
+ # --------------------------------------------------
67
+ # UI
68
+ # --------------------------------------------------
69
# Page header and usage notes.
st.title("πŸ“Έ Multimodal Image Understanding & Storytelling AI")
st.markdown(
    """
Upload an image and the AI will generate:
- A factual caption
- A descriptive summary
- Detected objects
- Emotional tone
- A short story
"""
)

# Uploader yields None until the user supplies a file.
image_file = st.file_uploader(
    "Upload an image",
    type=["jpg", "jpeg", "png"],
)
 
85
 
86
+ # --------------------------------------------------
87
+ # IMAGE PROCESSING
88
+ # --------------------------------------------------
89
if image_file:
    image = Image.open(image_file).convert("RGB")
    st.image(image, caption="Uploaded Image", use_column_width=True)

    # Section header → prompt, in display order.
    prompt_plan = [
        ("πŸ“ Caption", "Describe this image in one factual sentence."),
        ("πŸ“„ Summary", "Give a concise 3–5 line descriptive summary of this image."),
        ("πŸ“¦ Detected Objects", "List the main objects and entities visible in this image."),
        ("😊 Emotional Tone", "What emotional tone or mood does this image convey?"),
        ("πŸ“– Short Story", "Write a short fictional story (5–10 lines) inspired by this image."),
    ]

    # All generations run inside one spinner; results render afterwards,
    # matching the original compute-then-display order.
    with st.spinner("🧠 Analyzing image..."):
        results = [(header, ask_model(prompt, image)) for header, prompt in prompt_plan]

    for header, answer in results:
        st.subheader(header)
        st.write(answer)
else:
    st.info("⬆️ Upload an image to begin.")