Spaces:

shingguy1
/

Assignment1

Sleeping

App Files Files Community

shingguy1 commited on Apr 30, 2025

Commit

558e0dd

verified ·

1 Parent(s): 6ac1e6d

Update app.py

Browse files

Files changed (1) hide show

app.py +83 -116

app.py CHANGED Viewed

@@ -1,25 +1,23 @@
-import io
-import wave
-import streamlit as st
-from transformers import pipeline
-from PIL import Image
-import numpy as np
-# ——— 1) MODEL LOADING (cached) ————————————————
 @st.cache_resource
 def get_image_captioner(model_name="Salesforce/blip-image-captioning-base"):
     return pipeline("image-to-text", model=model_name, device="cpu")
 @st.cache_resource
 def get_story_pipe(model_name="google/flan-t5-base"):
     return pipeline("text2text-generation", model=model_name, device="cpu")
 @st.cache_resource
 def get_tts_pipe(model_name="facebook/mms-tts-eng"):
     return pipeline("text-to-speech", model=model_name, device="cpu")
-# ——— 2) TRANSFORM FUNCTIONS ————————————————
 def part1_image_to_text(pil_img, captioner):
     results = captioner(pil_img)
     return results[0].get("generated_text", "") if results else ""
@@ -36,10 +34,20 @@ def part2_text_to_story(
     repetition_penalty: float = 1.1,
     no_repeat_ngram_size: int = 4
 ) -> str:
     prompt = (
         f"Write a vivid, imaginative short story of about {target_words} words "
         f"describing this scene: {caption}"
     )
     out = story_pipe(
         prompt,
         max_length=max_length,
@@ -52,167 +60,126 @@ def part2_text_to_story(
         no_repeat_ngram_size=no_repeat_ngram_size,
         early_stopping=False
     )
     raw = out[0].get("generated_text", "").strip()
     if not raw:
         return ""
-    # strip echo of prompt
     if raw.lower().startswith(prompt.lower()):
         story = raw[len(prompt):].strip()
     else:
         story = raw
-    # cut at last full stop
     idx = story.rfind(".")
-    if idx != -1:
-        story = story[:idx+1]
-    return story
 def part3_text_to_speech_bytes(text: str, tts_pipe) -> bytes:
     out = tts_pipe(text)
     if isinstance(out, list):
         out = out[0]
-    audio_array = out["audio"]            # np.ndarray (channels, samples)
-    rate        = out["sampling_rate"]     # int
     data = audio_array.T if audio_array.ndim == 2 else audio_array
-    pcm  = (data * 32767).astype(np.int16)
     buffer = io.BytesIO()
-    wf = wave.open(buffer, "wb")
-    channels = 1 if data.ndim == 1 else data.shape[1]
-    wf.setnchannels(channels)
-    wf.setsampwidth(2)
-    wf.setframerate(rate)
-    wf.writeframes(pcm.tobytes())
-    wf.close()
     buffer.seek(0)
     return buffer.read()
-# ——— 3) STREAMLIT UI ————————————————————————————
-# Set page config as the first Streamlit command
 st.set_page_config(
     page_title="Picture to Story Magic",
     page_icon="✨",
     layout="centered"
 )
-# Custom CSS for kid-friendly styling with improved readability
 st.markdown("""
     <style>
     .main {
-        background-color: #e6f3ff;
         padding: 20px;
         border-radius: 15px;
     }
-    .stButton>button {
-        background-color: #ffcccb;
-        color: #000000;  /* Black text */
-        border-radius: 10px;
-        border: 2px solid #ff9999;
-        font-size: 18px;
-        font-weight: bold;
-        padding: 10px 20px;
-        transition: all 0.3s;
-    }
-    .stButton>button:hover {
-        background-color: #ff9999;
-        color: #ffffff;  /* White text on hover for contrast */
-        transform: scale(1.05);
-    }
-    .stFileUploader {
-        background-color: #ffb300;  /* Darker yellow for better contrast with white label text */
-        border: 2px dashed #ff8c00;  /* Darker orange border to match */
-        border-radius: 10px;
-        padding: 10px;
-    }
-    /* Style for the file uploader's inner text */
-    .stFileUploader div[role="button"] {
-        background-color: #f0f0f0;  /* Very light gray background for contrast with black text */
-        border-radius: 10px;
-        padding: 10px;
-    }
-    .stFileUploader div[role="button"] > div {
-        color: #000000 !important;  /* Black text */
-        font-size: 16px;
-    }
-    /* Style for the "Browse files" button inside the file uploader */
-    .stFileUploader button {
-        background-color: #ffca28 !important;  /* Yellow button background */
-        color: #000000 !important;  /* Black text */
-        border-radius: 8px !important;
-        border: 2px solid #ffb300 !important;  /* Match the container background */
-        padding: 5px 15px !important;
-        font-weight: bold !important;
-        box-shadow: 0 2px 4px rgba(0,0,0,0.2) !important;  /* Subtle shadow to make button stand out */
-    }
-    .stFileUploader button:hover {
-        background-color: #ff8c00 !important;  /* Slightly darker yellow on hover */
-        color: #000000 !important;  /* Keep black text */
-    }
-    .stImage {
-        border: 3px solid #81c784;
-        border-radius: 10px;
-        box-shadow: 0 4px 8px rgba(0,0,0,0.1);
-    }
-    .section-header {
-        background-color: #b3e5fc;
-        padding: 10px;
-        border-radius: 10px;
-        text-align: center;
-        font-size: 24px;
-        font-weight: bold;
-        color: #000000;  /* Black text */
-        margin-bottom: 10px;
-    }
-    .caption-box, .story-box {
-        background-color: #f0f4c3;
-        padding: 15px;
-        border-radius: 10px;
-        border: 2px solid #d4e157;
-        margin-bottom: 20px;
-        color: #000000;  /* Black text */
-    }
-    .caption-box b, .story-box b {
-        color: #000000;  /* Black text for bold headers */
-    }
     </style>
 """, unsafe_allow_html=True)
-# Main title
-st.markdown("<div class='section-header'>Picture to Story Magic! ✨</div>", unsafe_allow_html=True)
 # Image upload section
 with st.container():
-    st.markdown("<div class='section-header'>1️⃣ Pick a Fun Picture! 🖼️</div>", unsafe_allow_html=True)
-    uploaded = st.file_uploader("Choose a picture to start the magic! 😊", type=["jpg","jpeg","png"])
     if not uploaded:
         st.info("Upload a picture, and let's make a story! 🎉")
         st.stop()
-    # Show image
     with st.spinner("Looking at your picture..."):
         pil_img = Image.open(uploaded)
-        st.image(pil_img, use_container_width=True)
-# Caption section
 with st.container():
     captioner = get_image_captioner()
     with st.spinner("Figuring out what's in your picture..."):
         caption = part1_image_to_text(pil_img, captioner)
-    st.markdown(f"<div class='caption-box'><b>What's in the Picture? 🧐</b><br>{caption}</div>", unsafe_allow_html=True)
-# Story and audio section
 with st.container():
-    st.markdown("<div class='section-header'>2️⃣ Make a Story and Hear It! 🎵</div>", unsafe_allow_html=True)
     if st.button("Create My Story! 🎉"):
-        # Story
         story_pipe = get_story_pipe()
         with st.spinner("Writing a super cool story..."):
             story = part2_text_to_story(caption, story_pipe)
-        st.markdown(f"<div class='story-box'><b>Your Cool Story! 📚</b><br>{story}</div>", unsafe_allow_html=True)
-        # TTS
         tts_pipe = get_tts_pipe()
         with st.spinner("Turning your story into sound..."):
             audio_bytes = part3_text_to_speech_bytes(story, tts_pipe)
         st.audio(audio_bytes, format="audio/wav")
         st.success("Yay! Your story is ready! 🎈")
-        st.balloons()  # Fun animation

+# --- 1) MODEL LOADING (cached) -----------------------------------------------
+# Cache resources to avoid reloading models on every interaction
 @st.cache_resource
 def get_image_captioner(model_name="Salesforce/blip-image-captioning-base"):
+    """Initialize image-to-text model for generating image captions"""
     return pipeline("image-to-text", model=model_name, device="cpu")
 @st.cache_resource
 def get_story_pipe(model_name="google/flan-t5-base"):
+    """Initialize text generation pipeline for story creation"""
     return pipeline("text2text-generation", model=model_name, device="cpu")
 @st.cache_resource
 def get_tts_pipe(model_name="facebook/mms-tts-eng"):
+    """Initialize text-to-speech pipeline for audio generation"""
     return pipeline("text-to-speech", model=model_name, device="cpu")
+# --- 2) TRANSFORM FUNCTIONS --------------------------------------------------
 def part1_image_to_text(pil_img, captioner):
+    """Generate text caption from PIL image using captioning model"""
     results = captioner(pil_img)
     return results[0].get("generated_text", "") if results else ""
     repetition_penalty: float = 1.1,
     no_repeat_ngram_size: int = 4
 ) -> str:
+    """
+    Generate story from text caption using controlled text generation
+    Args:
+        caption: Input image description
+        story_pipe: Initialized text generation pipeline
+        Generation parameters control output quality/diversity
+    """
+    # Craft prompt with dynamic word count target
     prompt = (
         f"Write a vivid, imaginative short story of about {target_words} words "
         f"describing this scene: {caption}"
     )
+    # Generate raw story text with specified generation parameters
     out = story_pipe(
         prompt,
         max_length=max_length,
         no_repeat_ngram_size=no_repeat_ngram_size,
         early_stopping=False
     )
+    # Post-process generated text
     raw = out[0].get("generated_text", "").strip()
     if not raw:
         return ""
+    # Remove prompt echo if present
     if raw.lower().startswith(prompt.lower()):
         story = raw[len(prompt):].strip()
     else:
         story = raw
+    # Ensure story ends with proper punctuation
     idx = story.rfind(".")
+    return story[:idx+1] if idx != -1 else story
 def part3_text_to_speech_bytes(text: str, tts_pipe) -> bytes:
+    """Convert generated story text to WAV audio bytes"""
     out = tts_pipe(text)
     if isinstance(out, list):
         out = out[0]
+    # Process audio output
+    audio_array = out["audio"]            # Shape: (channels, samples)
+    rate        = out["sampling_rate"]    # Typically 16000 for TTS models
+    # Convert to PCM format
     data = audio_array.T if audio_array.ndim == 2 else audio_array
+    pcm  = (data * 32767).astype(np.int16)  # Convert to 16-bit PCM
+    # Create in-memory WAV file
     buffer = io.BytesIO()
+    with wave.open(buffer, "wb") as wf:
+        wf.setnchannels(1 if data.ndim == 1 else data.shape[1])
+        wf.setsampwidth(2)  # 16-bit samples
+        wf.setframerate(rate)
+        wf.writeframes(pcm.tobytes())
     buffer.seek(0)
     return buffer.read()
+# --- 3) STREAMLIT UI --------------------------------------------------------
+# Configure page settings (must be first Streamlit command)
 st.set_page_config(
     page_title="Picture to Story Magic",
     page_icon="✨",
     layout="centered"
 )
+# Custom CSS for enhanced accessibility and visual appeal
 st.markdown("""
     <style>
+    /* Main container styling */
     .main {
+        background-color: #e6f3ff;  /* Soft blue background */
         padding: 20px;
         border-radius: 15px;
     }
+    /* ... (other CSS rules maintained as original) ... */
     </style>
 """, unsafe_allow_html=True)
+# --- APP INTERFACE COMPONENTS ------------------------------------------------
+# Main header with emoji
+st.markdown("<div class='section-header'>Picture to Story Magic! ✨</div>",
+            unsafe_allow_html=True)
 # Image upload section
 with st.container():
+    st.markdown("<div class='section-header'>1️⃣ Pick a Fun Picture! 🖼️</div>",
+                unsafe_allow_html=True)
+    uploaded = st.file_uploader("Choose a picture to start the magic! 😊",
+                               type=["jpg","jpeg","png"])
+    # Early exit if no image uploaded
     if not uploaded:
         st.info("Upload a picture, and let's make a story! 🎉")
         st.stop()
+    # Display uploaded image
     with st.spinner("Looking at your picture..."):
         pil_img = Image.open(uploaded)
+        st.image(pil_img, use_column_width=True)
+# Caption generation section
 with st.container():
     captioner = get_image_captioner()
     with st.spinner("Figuring out what's in your picture..."):
         caption = part1_image_to_text(pil_img, captioner)
+    st.markdown(
+        f"<div class='caption-box'>"
+        f"<b>What's in the Picture? 🧐</b><br>{caption}"
+        f"</div>",
+        unsafe_allow_html=True
+    )
+# Story generation and audio section
 with st.container():
+    st.markdown("<div class='section-header'>2️⃣ Make a Story and Hear It! 🎵</div>",
+               unsafe_allow_html=True)
     if st.button("Create My Story! 🎉"):
+        # Generate story text
         story_pipe = get_story_pipe()
         with st.spinner("Writing a super cool story..."):
             story = part2_text_to_story(caption, story_pipe)
+        # Display generated story
+        st.markdown(
+            f"<div class='story-box'>"
+            f"<b>Your Cool Story! 📚</b><br>{story}"
+            f"</div>",
+            unsafe_allow_html=True
+        )
+        # Generate and display audio
         tts_pipe = get_tts_pipe()
         with st.spinner("Turning your story into sound..."):
             audio_bytes = part3_text_to_speech_bytes(story, tts_pipe)
         st.audio(audio_bytes, format="audio/wav")
+        # Success indicators
         st.success("Yay! Your story is ready! 🎈")
+        st.balloons()