# (removed: HTML scrape residue — Hugging Face Spaces page chrome, git blame hashes, and line-number gutter)
import io
import wave
import streamlit as st
from transformers import pipeline
from PIL import Image
import numpy as np
# ─── 1) MODEL LOADING (cached) ──────────────────
@st.cache_resource
def get_image_captioner(model_name="Salesforce/blip-image-captioning-base"):
    """Build (and cache across reruns) a CPU image-captioning pipeline."""
    captioner = pipeline("image-to-text", model=model_name, device="cpu")
    return captioner
@st.cache_resource
def get_story_pipe(model_name="google/flan-t5-base"):
    """Build (and cache across reruns) a CPU text-to-text generation pipeline."""
    story_pipe = pipeline("text2text-generation", model=model_name, device="cpu")
    return story_pipe
@st.cache_resource
def get_tts_pipe(model_name="facebook/mms-tts-eng"):
    """Build (and cache across reruns) a CPU text-to-speech pipeline."""
    tts = pipeline("text-to-speech", model=model_name, device="cpu")
    return tts
# ─── 2) TRANSFORM FUNCTIONS ──────────────────
def part1_image_to_text(pil_img, captioner):
    """Caption *pil_img* via *captioner*; return "" when nothing is produced."""
    outputs = captioner(pil_img)
    if not outputs:
        return ""
    return outputs[0].get("generated_text", "")
def part2_text_to_story(
caption: str,
story_pipe,
target_words: int = 100,
max_length: int = 100,
min_length: int = 80,
do_sample: bool = True,
top_k: int = 100,
top_p: float= 0.9,
temperature: float= 0.7,
repetition_penalty: float = 1.1,
no_repeat_ngram_size: int = 4
) -> str:
prompt = (
f"Write a vivid, imaginative short story of about {target_words} words "
f"describing this scene: {caption}"
)
out = story_pipe(
prompt,
max_length=max_length,
min_length=min_length,
do_sample=do_sample,
top_k=top_k,
top_p=top_p,
temperature=temperature,
repetition_penalty=repetition_penalty,
no_repeat_ngram_size=no_repeat_ngram_size,
early_stopping=False
)
raw = out[0].get("generated_text", "").strip()
if not raw:
return ""
# strip echo of prompt
if raw.lower().startswith(prompt.lower()):
story = raw[len(prompt):].strip()
else:
story = raw
# cut at last full stop
idx = story.rfind(".")
if idx != -1:
story = story[:idx+1]
return story
def part3_text_to_speech_bytes(text: str, tts_pipe) -> bytes:
    """Synthesize *text* with *tts_pipe* and return 16-bit PCM WAV bytes.

    The pipeline output is expected to carry a float waveform under "audio"
    (assumed (channels, samples) when 2-D — TODO confirm against the model)
    and an integer "sampling_rate".
    """
    out = tts_pipe(text)
    if isinstance(out, list):
        out = out[0]
    audio_array = out["audio"]
    rate = int(out["sampling_rate"])
    # Transpose to (samples, channels) so frames interleave correctly for wave.
    data = audio_array.T if audio_array.ndim == 2 else audio_array
    # Clip BEFORE scaling: samples outside [-1, 1] would otherwise wrap around
    # in int16 and produce loud glitches instead of mere clipping.
    pcm = (np.clip(data, -1.0, 1.0) * 32767).astype(np.int16)
    channels = 1 if data.ndim == 1 else data.shape[1]
    buffer = io.BytesIO()
    # Context manager finalizes the WAV header even if a write raises.
    with wave.open(buffer, "wb") as wf:
        wf.setnchannels(channels)
        wf.setsampwidth(2)  # 2 bytes per sample = 16-bit PCM
        wf.setframerate(rate)
        wf.writeframes(pcm.tobytes())
    return buffer.getvalue()
# ─── 3) STREAMLIT UI ────────────────────────────
# Set page config as the first Streamlit command
# NOTE(review): the page_icon literal is mojibake from a bad encoding pass
# (likely "✨") — confirm against the original file and restore.
st.set_page_config(
    page_title="Picture to Story Magic",
    page_icon="β¨",
    layout="centered"
)
# Custom CSS for kid-friendly styling with improved readability
# (the CSS below is a runtime string injected via unsafe_allow_html)
st.markdown("""
<style>
.main {
background-color: #e6f3ff;
padding: 20px;
border-radius: 15px;
}
.stButton>button {
background-color: #ffcccb;
color: #000000; /* Black text */
border-radius: 10px;
border: 2px solid #ff9999;
font-size: 18px;
font-weight: bold;
padding: 10px 20px;
transition: all 0.3s;
}
.stButton>button:hover {
background-color: #ff9999;
color: #ffffff; /* White text on hover for contrast */
transform: scale(1.05);
}
.stFileUploader {
background-color: #ffb300; /* Darker yellow for better contrast with white label text */
border: 2px dashed #ff8c00; /* Darker orange border to match */
border-radius: 10px;
padding: 10px;
}
/* Style for the file uploader's inner text */
.stFileUploader div[role="button"] {
background-color: #f0f0f0; /* Very light gray background for contrast with black text */
border-radius: 10px;
padding: 10px;
}
.stFileUploader div[role="button"] > div {
color: #000000 !important; /* Black text */
font-size: 16px;
}
/* Style for the "Browse files" button inside the file uploader */
.stFileUploader button {
background-color: #ffca28 !important; /* Yellow button background */
color: #000000 !important; /* Black text */
border-radius: 8px !important;
border: 2px solid #ffb300 !important; /* Match the container background */
padding: 5px 15px !important;
font-weight: bold !important;
box-shadow: 0 2px 4px rgba(0,0,0,0.2) !important; /* Subtle shadow to make button stand out */
}
.stFileUploader button:hover {
background-color: #ff8c00 !important; /* Slightly darker yellow on hover */
color: #000000 !important; /* Keep black text */
}
.stImage {
border: 3px solid #81c784;
border-radius: 10px;
box-shadow: 0 4px 8px rgba(0,0,0,0.1);
}
.section-header {
background-color: #b3e5fc;
padding: 10px;
border-radius: 10px;
text-align: center;
font-size: 24px;
font-weight: bold;
color: #000000; /* Black text */
margin-bottom: 10px;
}
.caption-box, .story-box {
background-color: #f0f4c3;
padding: 15px;
border-radius: 10px;
border: 2px solid #d4e157;
margin-bottom: 20px;
color: #000000; /* Black text */
}
.caption-box b, .story-box b {
color: #000000; /* Black text for bold headers */
}
</style>
""", unsafe_allow_html=True)
# Main title
# NOTE(review): emoji literals in the UI strings below are mojibake from a bad
# encoding pass — restore the originals from the source repository; they are
# left byte-identical here because the intended glyphs cannot be reconstructed.
st.markdown("<div class='section-header'>Picture to Story Magic! β¨</div>", unsafe_allow_html=True)
# Image upload section
with st.container():
    st.markdown("<div class='section-header'>1οΈβ£ Pick a Fun Picture! πΌοΈ</div>", unsafe_allow_html=True)
    uploaded = st.file_uploader("Choose a picture to start the magic! π", type=["jpg","jpeg","png"])
    if not uploaded:
        st.info("Upload a picture, and let's make a story! π")
        st.stop()  # halt the script run until a file is provided
# Show image
with st.spinner("Looking at your picture..."):
    pil_img = Image.open(uploaded)
    st.image(pil_img, use_container_width=True)
# Caption section: run the (cached) captioner on the uploaded image and show
# the result in a styled box.
with st.container():
    captioner = get_image_captioner()
    with st.spinner("Figuring out what's in your picture..."):
        caption = part1_image_to_text(pil_img, captioner)
    st.markdown(f"<div class='caption-box'><b>What's in the Picture? π§</b><br>{caption}</div>", unsafe_allow_html=True)
# Story and audio section: on button press, generate a story from the caption,
# display it, then synthesize it to WAV audio.
# (Fixed: removed stray trailing "|" scrape residue after st.audio(...),
# which was a syntax error.)
with st.container():
    st.markdown("<div class='section-header'>2οΈβ£ Make a Story and Hear It! π΅</div>", unsafe_allow_html=True)
    if st.button("Create My Story! π"):
        # Story
        story_pipe = get_story_pipe()
        with st.spinner("Writing a super cool story..."):
            story = part2_text_to_story(caption, story_pipe)
        st.markdown(f"<div class='story-box'><b>Your Cool Story! π</b><br>{story}</div>", unsafe_allow_html=True)
        # TTS
        tts_pipe = get_tts_pipe()
        with st.spinner("Turning your story into sound..."):
            audio_bytes = part3_text_to_speech_bytes(story, tts_pipe)
        st.audio(audio_bytes, format="audio/wav")