# (removed: HTML scrape residue — Hugging Face Spaces page chrome, git blame hashes, and line-number gutter)
import io
import wave
import streamlit as st
from transformers import pipeline
from PIL import Image
import numpy as np
# ─── 1) MODEL LOADING (cached) ──────────────────
@st.cache_resource
def get_image_captioner(model_name="Salesforce/blip-image-captioning-base"):
    """Build (and cache across reruns) a CPU image-captioning pipeline."""
    captioner = pipeline("image-to-text", model=model_name, device="cpu")
    return captioner
@st.cache_resource
def get_story_pipe(model_name="google/flan-t5-base"):
    """Build (and cache across reruns) a CPU text-to-text generation pipeline."""
    story_pipe = pipeline("text2text-generation", model=model_name, device="cpu")
    return story_pipe
@st.cache_resource
def get_tts_pipe(model_name="facebook/mms-tts-eng"):
    """Build (and cache across reruns) a CPU text-to-speech pipeline."""
    tts = pipeline("text-to-speech", model=model_name, device="cpu")
    return tts
# ─── 2) TRANSFORM FUNCTIONS ──────────────────
def part1_image_to_text(pil_img, captioner):
    """Caption *pil_img* via *captioner*; return "" when nothing is produced."""
    outputs = captioner(pil_img)
    if not outputs:
        return ""
    return outputs[0].get("generated_text", "")
def part2_text_to_story(
caption: str,
story_pipe,
target_words: int = 100,
max_length: int = 100,
min_length: int = 80,
do_sample: bool = True,
top_k: int = 100,
top_p: float= 0.9,
temperature: float= 0.7,
repetition_penalty: float = 1.1,
no_repeat_ngram_size: int = 4
) -> str:
prompt = (
f"Write a vivid, imaginative short story of about {target_words} words "
f"describing this scene: {caption}"
)
out = story_pipe(
prompt,
max_length=max_length,
min_length=min_length,
do_sample=do_sample,
top_k=top_k,
top_p=top_p,
temperature=temperature,
repetition_penalty=repetition_penalty,
no_repeat_ngram_size=no_repeat_ngram_size,
early_stopping=False
)
raw = out[0].get("generated_text", "").strip()
if not raw:
return ""
# strip echo of prompt
if raw.lower().startswith(prompt.lower()):
story = raw[len(prompt):].strip()
else:
story = raw
# cut at last full stop
idx = story.rfind(".")
if idx != -1:
story = story[:idx+1]
return story
def part3_text_to_speech_bytes(text: str, tts_pipe) -> bytes:
    """Synthesize *text* with *tts_pipe* and return 16-bit PCM WAV bytes.

    The pipeline output is expected to carry a float waveform under "audio"
    (assumed (channels, samples) when 2-D — TODO confirm against the model)
    and an integer "sampling_rate".
    """
    out = tts_pipe(text)
    if isinstance(out, list):
        out = out[0]
    audio_array = out["audio"]
    rate = int(out["sampling_rate"])
    # Transpose to (samples, channels) so frames interleave correctly for wave.
    data = audio_array.T if audio_array.ndim == 2 else audio_array
    # Clip BEFORE scaling: samples outside [-1, 1] would otherwise wrap around
    # in int16 and produce loud glitches instead of mere clipping.
    pcm = (np.clip(data, -1.0, 1.0) * 32767).astype(np.int16)
    channels = 1 if data.ndim == 1 else data.shape[1]
    buffer = io.BytesIO()
    # Context manager finalizes the WAV header even if a write raises.
    with wave.open(buffer, "wb") as wf:
        wf.setnchannels(channels)
        wf.setsampwidth(2)  # 2 bytes per sample = 16-bit PCM
        wf.setframerate(rate)
        wf.writeframes(pcm.tobytes())
    return buffer.getvalue()
# ─── 3) STREAMLIT UI ────────────────────────────
# Set page config as the first Streamlit command
# NOTE(review): the page_icon literal is mojibake from a bad encoding pass
# (likely "✨") — confirm against the original file and restore.
st.set_page_config(
    page_title="Picture to Story Magic",
    page_icon="β¨",
    layout="centered"
)
# Custom CSS for kid-friendly styling with improved readability
# (the CSS below is a runtime string injected via unsafe_allow_html)
st.markdown("""
<style>
.main {
background-color: #e6f3ff;
padding: 20px;
border-radius: 15px;
}
.stButton>button {
background-color: #ffcccb;
color: #000000; /* Black text */
border-radius: 10px;
border: 2px solid #ff9999;
font-size: 18px;
font-weight: bold;
padding: 10px 20px;
transition: all 0.3s;
}
.stButton>button:hover {
background-color: #ff9999;
color: #ffffff; /* White text on hover for contrast */
transform: scale(1.05);
}
.stFileUploader {
background-color: #ffb300; /* Darker yellow for better contrast with white label text */
border: 2px dashed #ff8c00; /* Darker orange border to match */
border-radius: 10px;
padding: 10px;
}
/* Style for the file uploader's inner text */
.stFileUploader div[role="button"] {
background-color: #f0f0f0; /* Very light gray background for contrast with black text */
border-radius: 10px;
padding: 10px;
}
.stFileUploader div[role="button"] > div {
color: #000000 !important; /* Black text */
font-size: 16px;
}
/* Style for the "Browse files" button inside the file uploader */
.stFileUploader button {
background-color: #ffca28 !important; /* Yellow button background */
color: #000000 !important; /* Black text */
border-radius: 8px !important;
border: 2px solid #ffb300 !important; /* Match the container background */
padding: 5px 15px !important;
font-weight: bold !important;
box-shadow: 0 2px 4px rgba(0,0,0,0.2) !important; /* Subtle shadow to make button stand out */
}
.stFileUploader button:hover {
background-color: #ff8c00 !important; /* Slightly darker yellow on hover */
color: #000000 !important; /* Keep black text */
}
.stImage {
border: 3px solid #81c784;
border-radius: 10px;
box-shadow: 0 4px 8px rgba(0,0,0,0.1);
}
.section-header {
background-color: #b3e5fc;
padding: 10px;
border-radius: 10px;
text-align: center;
font-size: 24px;
font-weight: bold;
color: #000000; /* Black text */
margin-bottom: 10px;
}
.caption-box, .story-box {
background-color: #f0f4c3;
padding: 15px;
border-radius: 10px;
border: 2px solid #d4e157;
margin-bottom: 20px;
color: #000000; /* Black text */
}
.caption-box b, .story-box b {
color: #000000; /* Black text for bold headers */
}
</style>
""", unsafe_allow_html=True)
# Main title
# NOTE(review): emoji literals in the UI strings below are mojibake from a bad
# encoding pass — restore the originals from the source repository; they are
# left byte-identical here because the intended glyphs cannot be reconstructed.
st.markdown("<div class='section-header'>Picture to Story Magic! β¨</div>", unsafe_allow_html=True)
# Image upload section
with st.container():
    st.markdown("<div class='section-header'>1οΈβ£ Pick a Fun Picture! πΌοΈ</div>", unsafe_allow_html=True)
    uploaded = st.file_uploader("Choose a picture to start the magic! π", type=["jpg","jpeg","png"])
    if not uploaded:
        st.info("Upload a picture, and let's make a story! π")
        st.stop()  # halt the script run until a file is provided
# Show image
with st.spinner("Looking at your picture..."):
    pil_img = Image.open(uploaded)
    st.image(pil_img, use_container_width=True)
# Caption section: run the (cached) captioner on the uploaded image and show
# the result in a styled box.
with st.container():
    captioner = get_image_captioner()
    with st.spinner("Figuring out what's in your picture..."):
        caption = part1_image_to_text(pil_img, captioner)
    st.markdown(f"<div class='caption-box'><b>What's in the Picture? π§</b><br>{caption}</div>", unsafe_allow_html=True)
# Story and audio section: on button press, generate a story from the caption,
# display it, then synthesize it to WAV audio.
# (Fixed: removed stray trailing "|" scrape residue after st.audio(...),
# which was a syntax error.)
with st.container():
    st.markdown("<div class='section-header'>2οΈβ£ Make a Story and Hear It! π΅</div>", unsafe_allow_html=True)
    if st.button("Create My Story! π"):
        # Story
        story_pipe = get_story_pipe()
        with st.spinner("Writing a super cool story..."):
            story = part2_text_to_story(caption, story_pipe)
        st.markdown(f"<div class='story-box'><b>Your Cool Story! π</b><br>{story}</div>", unsafe_allow_html=True)
        # TTS
        tts_pipe = get_tts_pipe()
        with st.spinner("Turning your story into sound..."):
            audio_bytes = part3_text_to_speech_bytes(story, tts_pipe)
        st.audio(audio_bytes, format="audio/wav")