Maham930 committed on
Commit
e4866f8
·
verified ·
1 Parent(s): eb666b5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +52 -99
app.py CHANGED
@@ -1,124 +1,77 @@
1
  import streamlit as st
2
- import torch
3
  from PIL import Image
4
- import os
5
  from transformers import Blip2Processor, Blip2ForConditionalGeneration
 
6
 
7
- # --------------------------------------------------
8
- # STREAMLIT CONFIG (MUST BE FIRST STREAMLIT CALL)
9
- # --------------------------------------------------
10
- st.set_page_config(
11
- page_title="Multimodal Image Understanding AI",
12
- layout="centered"
 
 
 
 
 
 
13
  )
14
 
15
- st.write("πŸš€ App is starting...") # Debug indicator
16
-
17
- # --------------------------------------------------
18
- # MODEL CONFIG (SAFE FOR HF SPACES)
19
- # --------------------------------------------------
20
- MODEL_NAME = "TeichAI/Qwen3-4B-Instruct-2507-Gemini-3-Pro-Preview-Distill-GGUF" # NOT XL
21
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
22
 
23
- HF_TOKEN = os.getenv("HF_TOKEN") # Optional but recommended
24
-
25
- # --------------------------------------------------
26
- # LOAD MODEL (STREAMLIT-SAFE)
27
- # --------------------------------------------------
28
- @st.cache_resource(show_spinner="πŸ”„ Loading AI model (first time only)...")
29
  def load_model():
30
- processor = Blip2Processor.from_pretrained(
31
- MODEL_NAME,
32
- token=HF_TOKEN
33
- )
34
-
35
  model = Blip2ForConditionalGeneration.from_pretrained(
36
  MODEL_NAME,
37
- torch_dtype=torch.float16 if DEVICE == "cuda" else torch.float32,
38
- device_map="auto" if DEVICE == "cuda" else None,
39
  token=HF_TOKEN
40
  )
41
-
42
  model.eval()
43
  return processor, model
44
 
45
-
46
  processor, model = load_model()
47
 
48
- # --------------------------------------------------
49
- # HELPER FUNCTION
50
- # --------------------------------------------------
51
- def ask_model(prompt, image):
52
- inputs = processor(
53
- images=image,
54
- text=prompt,
55
- return_tensors="pt"
56
- ).to(DEVICE)
57
 
58
- with torch.no_grad():
59
- output = model.generate(
60
- **inputs,
61
- max_new_tokens=150
62
- )
63
-
64
- return processor.decode(output[0], skip_special_tokens=True)
65
-
66
- # --------------------------------------------------
67
- # UI
68
- # --------------------------------------------------
69
- st.title("πŸ“Έ Multimodal Image Understanding & Storytelling AI")
70
- st.markdown(
71
- """
72
- Upload an image and the AI will generate:
73
- - A factual caption
74
- - A descriptive summary
75
- - Detected objects
76
- - Emotional tone
77
- - A short story
78
- """
79
- )
80
-
81
- image_file = st.file_uploader(
82
- "Upload an image",
83
- type=["jpg", "jpeg", "png"]
84
- )
85
-
86
- # --------------------------------------------------
87
- # IMAGE PROCESSING
88
- # --------------------------------------------------
89
- if image_file:
90
  image = Image.open(image_file).convert("RGB")
91
- st.image(image, caption="Uploaded Image", use_column_width=True)
92
-
93
- with st.spinner("🧠 Analyzing image..."):
94
- caption = ask_model(
95
- "Describe this image in one factual sentence.",
96
- image
97
- )
98
 
99
- summary = ask_model(
100
- "Give a concise 3–5 line descriptive summary of this image.",
101
- image
102
- )
103
 
104
- objects = ask_model(
105
- "List the main objects and entities visible in this image.",
106
- image
107
- )
 
 
 
108
 
109
- emotion = ask_model(
110
- "What emotional tone or mood does this image convey?",
111
- image
112
- )
113
-
114
- story = ask_model(
115
- "Write a short fictional story (5–10 lines) inspired by this image.",
116
- image
117
- )
118
-
119
- # --------------------------------------------------
120
- # OUTPUT
121
- # --------------------------------------------------
122
  st.subheader("πŸ“ Caption")
123
  st.write(caption)
124
 
@@ -135,4 +88,4 @@ if image_file:
135
  st.write(story)
136
 
137
  else:
138
- st.info("⬆️ Upload an image to begin.")
 
1
  import streamlit as st
 
2
  from PIL import Image
3
+ import torch
4
  from transformers import Blip2Processor, Blip2ForConditionalGeneration
5
+ import os
6
 
7
# -----------------------
# Streamlit page configuration and static UI header
# -----------------------
st.set_page_config(
    page_title="Multimodal Image Understanding AI",
    layout="centered",
)
st.title("📸 Multimodal Image Understanding & Storytelling AI")
st.markdown(
    "\n".join(
        [
            "Upload an image or use live camera, and get:",
            "- Caption",
            "- Summary",
            "- Detected objects",
            "- Emotion/mood",
            "- Short story inspired by the image",
        ]
    )
)

# -----------------------
# Model settings
# -----------------------
MODEL_NAME = "Salesforce/blip2-flan-t5-large"  # BLIP-2 with a Flan-T5-large language head
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
HF_TOKEN = os.getenv("HF_TOKEN")  # Add HF_TOKEN as secret in Spaces (recommended)
27
 
28
# -----------------------
# Model loading (cached once per Streamlit session)
# -----------------------
@st.cache_resource(show_spinner="🔄 Loading AI model, please wait...")
def load_model():
    """Load the BLIP-2 processor and model and cache them across reruns.

    Returns:
        tuple: (Blip2Processor, Blip2ForConditionalGeneration) with the
        model switched to eval mode.
    """
    # use_fast=False pins the slow processor implementation; the token is
    # only required for gated/private checkpoints.
    processor = Blip2Processor.from_pretrained(MODEL_NAME, use_fast=False, token=HF_TOKEN)

    on_gpu = DEVICE == "cuda"
    model = Blip2ForConditionalGeneration.from_pretrained(
        MODEL_NAME,
        # Half precision and automatic device placement only on GPU.
        torch_dtype=torch.float16 if on_gpu else torch.float32,
        device_map="auto" if on_gpu else None,
        token=HF_TOKEN,
    )
    model.eval()  # inference only — disables dropout etc.
    return processor, model
39
 
 
40
# Materialize the cached processor/model pair for this session.
processor, model = load_model()

# -----------------------
# Image input
# -----------------------
image_file = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])
camera_image = st.camera_input("Or take a live picture")

# A fresh camera shot takes priority over a previously uploaded file.
source = camera_image if camera_image else image_file
image = Image.open(source).convert("RGB") if source else None
 
 
 
 
 
 
 
53
 
54
+ if image:
55
+ st.image(image, caption="Your Image", use_column_width=True)
 
 
56
 
57
# -----------------------
# Helper function
# -----------------------
def ask_model(prompt):
    """Ask the BLIP-2 model one question about the currently selected image.

    Args:
        prompt: Natural-language instruction/question for the model.

    Returns:
        The decoded text response with special tokens stripped.
    """
    inputs = processor(images=image, text=prompt, return_tensors="pt").to(DEVICE)
    # Inference only: torch.no_grad() avoids building an autograd graph and
    # wasting memory — this guard existed in the previous revision of this
    # file and was dropped in the rewrite.
    with torch.no_grad():
        out = model.generate(**inputs, max_new_tokens=150)
    return processor.decode(out[0], skip_special_tokens=True)
64
 
65
# Run all five analyses while a single spinner is displayed.
with st.spinner("🧠 Analyzing image..."):
    _prompts = (
        "Describe this image in one factual sentence.",
        "Give a concise 3–5 line descriptive summary of this image.",
        "List the main objects and entities visible in this image.",
        "Detect the emotional tone or mood of this image (happy, calm, tense, etc.).",
        "Write a short story (5–10 lines) inspired by this image.",
    )
    caption, summary, objects, emotion, story = (ask_model(p) for p in _prompts)
71
+
72
+ # -----------------------
73
+ # Output
74
+ # -----------------------
 
 
 
75
  st.subheader("πŸ“ Caption")
76
  st.write(caption)
77
 
 
88
  st.write(story)
89
 
90
  else:
91
+ st.info("⬆️ Upload an image or use the camera above to begin.")