hskwon7 commited on
Commit
639df53
·
verified ·
1 Parent(s): 1fd7c62

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -26
app.py CHANGED
@@ -1,13 +1,12 @@
1
  import streamlit as st
2
  from transformers import pipeline
3
  from PIL import Image
4
- import io
5
- from gtts import gTTS
6
- import tempfile
7
 
8
# App header: title plus a one-line usage hint shown above the uploader.
st.title("🖼️ → 📖 Image-to-Story Demo")
st.write("Upload an image and watch as it’s captioned, turned into a short story, and even read aloud!")
10
 
 
11
@st.cache_resource
def load_captioner():
    """Build (once per session, via Streamlit's resource cache) the
    BLIP long-caption image-to-text pipeline."""
    captioning_pipe = pipeline("image-to-text", model="unography/blip-large-long-cap")
    return captioning_pipe
@@ -16,49 +15,58 @@ def load_captioner():
16
def load_story_gen():
    """Build the GPT-2 text-generation pipeline used to turn a caption
    into a short story."""
    generator = pipeline("text-generation", model="gpt2", tokenizer="gpt2")
    return generator
18
 
 
 
 
 
 
19
# Instantiate the cached pipelines once per session.
captioner = load_captioner()
story_gen = load_story_gen()

uploaded = st.file_uploader("Upload an image", type=["png","jpg","jpeg"], key="image")
if uploaded:
    img = Image.open(uploaded)
    st.image(img, use_column_width=True)

    # Caption: the image-to-text pipeline returns [{"generated_text": "..."}],
    # so unwrap to a plain string. (The original stored the dict itself, which
    # then fed a dict — not text — into the story generator below.)
    if "caption" not in st.session_state:
        with st.spinner("Generating caption…"):
            caps = captioner(img)
            if isinstance(caps, list):
                caps = caps[0]
            st.session_state.caption = caps.get("generated_text", "") if isinstance(caps, dict) else caps
    st.write("**Caption:**", st.session_state.caption)

    # Story: sample a short GPT-2 continuation of the caption.
    if "story" not in st.session_state:
        with st.spinner("Spinning up a story…"):
            out = story_gen(
                st.session_state.caption,
                max_length=200,
                num_return_sequences=1,
                do_sample=True,
                top_p=0.9
            )
            st.session_state.story = out[0]["generated_text"]
    st.write("**Story:**", st.session_state.story)

    # Prepare audio bytes once: render the story to MP3 in memory with gTTS
    # and cache the raw bytes in session state so reruns don't re-synthesize.
    if "audio_bytes" not in st.session_state:
        with st.spinner("Generating audio…"):
            tts = gTTS(text=st.session_state.story, lang="en")
            buf = io.BytesIO()
            tts.write_to_fp(buf)
            st.session_state.audio_bytes = buf.getvalue()

    # Play button: stream the cached bytes directly. The original wrote them
    # to a NamedTemporaryFile(delete=False) that was never deleted (temp-file
    # leak on every click); st.audio accepts raw bytes, so no file is needed.
    if st.button("🔊 Play Story Audio"):
        st.audio(st.session_state.audio_bytes, format="audio/mp3")
 
1
  import streamlit as st
2
  from transformers import pipeline
3
  from PIL import Image
4
+ import numpy as np
 
 
5
 
6
# App header (Hugging Face SpeechT5 TTS variant of the demo).
st.title("🖼️ → 📖 Image-to-Story Demo (with HF TTS)")
st.write("Upload an image and watch as it’s captioned, turned into a short story, and even read aloud!")
8
 
9
+ # 1) load and cache pipelines
10
@st.cache_resource
def load_captioner():
    """Return a session-cached BLIP image-to-text pipeline (long captions)."""
    return pipeline("image-to-text", model="unography/blip-large-long-cap")
 
15
def load_story_gen():
    """Return the GPT-2 text-generation pipeline used for story writing."""
    story_pipeline = pipeline("text-generation", model="gpt2", tokenizer="gpt2")
    return story_pipeline
17
 
18
@st.cache_resource
def load_tts():
    """Return a session-cached SpeechT5 text-to-speech pipeline.

    NOTE(review): SpeechT5 generation normally requires speaker embeddings
    (passed via forward_params={"speaker_embeddings": ...}) — confirm a bare
    pipeline call works with the installed transformers version.
    """
    return pipeline("text-to-speech", model="microsoft/speecht5_tts")
22
+
23
# Instantiate the cached pipelines once per session.
captioner = load_captioner()
story_gen = load_story_gen()
tts = load_tts()

# 2) upload image
uploaded = st.file_uploader("Upload an image", type=["png","jpg","jpeg"], key="image")
if uploaded:
    img = Image.open(uploaded)
    st.image(img, use_column_width=True)

    # 3) generate caption. The image-to-text pipeline returns
    # [{"generated_text": "..."}] — NOT "a list of strings" as the original
    # comment claimed — so unwrap to a plain string; storing the dict would
    # break the story prompt below.
    if "caption" not in st.session_state:
        with st.spinner("Generating caption…"):
            cap = captioner(img)
            if isinstance(cap, list):
                cap = cap[0]
            st.session_state.caption = cap.get("generated_text", "") if isinstance(cap, dict) else cap
    st.write("**Caption:**", st.session_state.caption)

    # 4) generate story: sample a GPT-2 continuation of the caption.
    if "story" not in st.session_state:
        with st.spinner("Spinning up a story…"):
            out = story_gen(
                st.session_state.caption,
                max_length=200,
                do_sample=True,
                top_p=0.9,
                num_return_sequences=1
            )
            st.session_state.story = out[0]["generated_text"]
    st.write("**Story:**", st.session_state.story)

    # 5) generate TTS once. For a single input the text-to-speech pipeline
    # returns a dict {"audio": float32 ndarray in [-1, 1], "sampling_rate": int}
    # — the original's speech[0]["array"] would fail on the real output shape.
    if "tts_array" not in st.session_state:
        with st.spinner("Generating speech…"):
            speech = tts(st.session_state.story)
            if isinstance(speech, list):
                speech = speech[0]
            arr = np.asarray(speech["audio"]).squeeze()
            sr = speech["sampling_rate"]
            # Convert float32 [-1, 1] audio to int16 PCM for st.audio playback.
            st.session_state.tts_array = (arr * 32767).astype(np.int16)
            st.session_state.tts_sr = sr

    # 6) play on button
    if st.button("🔊 Play Story Audio"):
        st.audio(
            data=st.session_state.tts_array,
            format="audio/wav",
            sample_rate=st.session_state.tts_sr
        )