Spaces:

shingguy1
/

Assignment1

Sleeping

App Files Files Community

shingguy1 committed on Apr 29, 2025

Commit

19b4c5e

verified ·

1 Parent(s): 58abad5

Update app.py

Browse files

Files changed (1) hide show

app.py +124 -59

app.py CHANGED Viewed

@@ -1,63 +1,128 @@
 import streamlit as st
 from PIL import Image
-from transformers import BlipProcessor, BlipForConditionalGeneration, pipeline
-from gtts import gTTS
-import os
-import tempfile
-# Load models
 @st.cache_resource
-def load_models():
-    processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
-    blip_model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
-    gpt2_pipeline = pipeline("text-generation", model="gpt2")
-    return processor, blip_model, gpt2_pipeline
-processor, blip_model, gpt2 = load_models()
-# UI
-st.title("🖼️📖 Storyteller for Kids")
-st.write("Upload an image and let the app create and read a magical story just for kids!")
-uploaded_file = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])
-if uploaded_file:
-    image = Image.open(uploaded_file).convert("RGB")
-    st.image(image, caption="Uploaded Image", use_column_width=True)
-    with st.spinner("Generating image caption..."):
-        inputs = processor(images=image, return_tensors="pt")
-        out = blip_model.generate(**inputs)
-        caption = processor.decode(out[0], skip_special_tokens=True)
-        st.success("Caption generated!")
-        st.write(f"**Caption:** {caption}")
-    with st.spinner("Writing a children's story..."):
-        prompt = f"Write a short, imaginative story for children aged 3-10 about this: {caption}"
-        story_output = gpt2(
-            prompt,
-            max_length=100,
-            num_return_sequences=1,
-            do_sample=True,
-            temperature=0.9,
-            top_p=0.95,
-            top_k=50,
-            repetition_penalty=1.2,
-            pad_token_id=50256,
-            eos_token_id=50256,
-        )[0]["generated_text"]
-        story = story_output.strip().replace('\n', ' ')
-        # Truncate to ~100 words for safety
-        story = " ".join(story.split()[:100])
-        st.success("Story created!")
-        st.write(f"**Story:**\n\n{story}")
-    with st.spinner("Converting story to audio..."):
-        try:
-            tts = gTTS(text=story, lang='en')
-            with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as fp:
-                tts.save(fp.name)
-                st.audio(fp.name, format="audio/mp3")
-                st.success("Audio playback ready!")
-        except Exception as e:
-            st.error(f"Text-to-speech failed: {e}")

+# app.py
+import io
+import wave
 import streamlit as st
+from transformers import pipeline
 from PIL import Image
+import numpy as np
+# ——— 1) MODEL LOADING (cached) ————————————————
 @st.cache_resource
+def get_image_captioner(model_name="Salesforce/blip-image-captioning-base"):
+    """Build the image-captioning pipeline for ``model_name`` on the CPU.
+
+    Decorated with ``st.cache_resource`` so the pipeline object is created
+    once and reused instead of being rebuilt on every call.
+    """
+    captioner = pipeline("image-to-text", model=model_name, device="cpu")
+    return captioner
+@st.cache_resource
+def get_story_pipe(model_name="google/flan-t5-base"):
+    """Build the text2text-generation pipeline for ``model_name`` on the CPU.
+
+    Decorated with ``st.cache_resource`` so the pipeline object is created
+    once and reused instead of being rebuilt on every call.
+    """
+    story_generator = pipeline("text2text-generation", model=model_name, device="cpu")
+    return story_generator
+@st.cache_resource
+def get_tts_pipe(model_name="facebook/mms-tts-eng"):
+    """Build the text-to-speech pipeline for ``model_name`` on the CPU.
+
+    Decorated with ``st.cache_resource`` so the pipeline object is created
+    once and reused instead of being rebuilt on every call.
+    """
+    synthesizer = pipeline("text-to-speech", model=model_name, device="cpu")
+    return synthesizer
+# ——— 2) TRANSFORM FUNCTIONS ————————————————
+def part1_image_to_text(pil_img, captioner):
+    """Caption ``pil_img`` with ``captioner`` and return the caption text.
+
+    Returns an empty string when the captioner yields no results, or when
+    the first result has no ``"generated_text"`` entry.
+    """
+    predictions = captioner(pil_img)
+    if not predictions:
+        return ""
+    first = predictions[0]
+    return first.get("generated_text", "")
+def part2_text_to_story(
+    caption: str,
+    story_pipe,
+    target_words: int  = 100,
+    max_length:   int  = 100,
+    min_length:   int  = 80,
+    do_sample:    bool = True,
+    top_k:        int  = 100,
+    top_p:        float= 0.9,
+    temperature:  float= 0.7,
+    repetition_penalty: float = 1.1,
+    no_repeat_ngram_size: int = 4
+) -> str:
+    """Turn an image caption into a short story using ``story_pipe``.
+
+    Args:
+        caption: Scene description that is embedded in the prompt.
+        story_pipe: Callable taking the prompt plus generation keyword
+            arguments and returning a list of dicts that carry the text
+            under ``"generated_text"``.
+        target_words: Word count requested in the prompt wording only; it
+            is not enforced on the output.
+        max_length, min_length, do_sample, top_k, top_p, temperature,
+        repetition_penalty, no_repeat_ngram_size: Forwarded unchanged to
+            ``story_pipe`` as generation settings.
+
+    Returns:
+        The generated story with any echoed prompt removed and trimmed to
+        end at its last full stop (left as-is if it contains none), or an
+        empty string when nothing was generated.
+    """
+    prompt = (
+        f"Write a vivid, imaginative short story of about {target_words} words "
+        f"describing this scene: {caption}"
+    )
+    out = story_pipe(
+        prompt,
+        max_length=max_length,
+        min_length=min_length,
+        do_sample=do_sample,
+        top_k=top_k,
+        top_p=top_p,
+        temperature=temperature,
+        repetition_penalty=repetition_penalty,
+        no_repeat_ngram_size=no_repeat_ngram_size,
+        early_stopping=False
+    )
+    # An empty result list would make out[0] raise IndexError; treat it the
+    # same as "no text generated".
+    if not out:
+        return ""
+    raw = out[0].get("generated_text", "").strip()
+    if not raw:
+        return ""
+    # Strip the prompt if the model echoed it back at the start of its output.
+    if raw.lower().startswith(prompt.lower()):
+        story = raw[len(prompt):].strip()
+    else:
+        story = raw
+    # Cut at the last full stop so the story does not end mid-sentence.
+    idx = story.rfind(".")
+    if idx != -1:
+        story = story[:idx+1]
+    return story
+def part3_text_to_speech_bytes(text: str, tts_pipe) -> bytes:
+    """Synthesize ``text`` with ``tts_pipe`` and return it as WAV file bytes.
+
+    Args:
+        text: Text to speak.
+        tts_pipe: Callable returning a dict (or a list whose first item is
+            such a dict) with an ``"audio"`` NumPy array and an integer
+            ``"sampling_rate"``.
+
+    Returns:
+        A complete in-memory WAV file with 16-bit PCM samples.
+    """
+    out = tts_pipe(text)
+    if isinstance(out, list):
+        out = out[0]
+    audio_array = out["audio"]            # np.ndarray (channels, samples)
+    rate        = out["sampling_rate"]     # int
+    # WAV frames interleave channels, so a 2-D array is flipped to
+    # (samples, channels) before serialization.
+    data = audio_array.T if audio_array.ndim == 2 else audio_array
+    # Clip before scaling: samples outside [-1, 1] would otherwise overflow
+    # int16 and wrap around into loud artifacts.
+    pcm  = (np.clip(data, -1.0, 1.0) * 32767).astype(np.int16)
+    channels = 1 if data.ndim == 1 else data.shape[1]
+    buffer = io.BytesIO()
+    # The context manager finalizes the WAV header even if a write fails;
+    # closing the writer does not close the caller-supplied BytesIO.
+    with wave.open(buffer, "wb") as wf:
+        wf.setnchannels(channels)
+        wf.setsampwidth(2)                 # 2 bytes per sample = 16-bit PCM
+        wf.setframerate(rate)
+        wf.writeframes(pcm.tobytes())
+    return buffer.getvalue()
+# ——— 3) STREAMLIT UI ————————————————————————————
+# Page flow: upload an image -> show it -> caption it -> on button press,
+# generate a story from the caption and play it back as synthesized speech.
+st.set_page_config(
+    page_title="Image→Story→Speech",
+    page_icon="🖼️🎤",
+    layout="centered"
+)
+st.title("🖼️ ➡️ 📖 ➡️ 🎙️ Image → Story → Speech")
+uploaded = st.file_uploader("1️⃣ Upload an image", type=["jpg","jpeg","png"])
+if not uploaded:
+    # Nothing to process yet; halt this script run until a file is provided.
+    st.info("Please upload an image to begin.")
+    st.stop()
+# Show image
+with st.spinner("Rendering image..."):
+    pil_img = Image.open(uploaded)
+    st.image(pil_img, use_container_width=True)
+# Generate caption
+captioner = get_image_captioner()
+with st.spinner("Generating caption..."):
+    caption = part1_image_to_text(pil_img, captioner)
+st.markdown(f"**Caption:** {caption}")
+# Generate story & play audio
+if st.button("📝 Generate Story & Play Audio"):
+    # Story
+    story_pipe = get_story_pipe()
+    with st.spinner("Generating story..."):
+        story = part2_text_to_story(caption, story_pipe)
+    if not story:
+        # part2_text_to_story returns "" when no text was generated; do not
+        # hand an empty string to the speech synthesizer.
+        st.warning("No story could be generated for this image. Please try again.")
+    else:
+        st.markdown("**Story:**")
+        st.write(story)
+        # TTS
+        tts_pipe = get_tts_pipe()
+        with st.spinner("Synthesizing speech..."):
+            audio_bytes = part3_text_to_speech_bytes(story, tts_pipe)
+        st.audio(audio_bytes, format="audio/wav")
+        st.success("All done!")