Spaces:
Sleeping
Sleeping
File size: 10,491 Bytes
2e503f2 f97d7d0 2e503f2 6543b0a 4816af6 73ca6b2 19b4c5e 4816af6 04e49b0 19b4c5e 4816af6 04e49b0 19b4c5e 4816af6 04e49b0 19b4c5e 6543b0a 19b4c5e 4816af6 04e49b0 4816af6 04e49b0 19b4c5e 04e49b0 19b4c5e 4816af6 04e49b0 4816af6 04e49b0 4816af6 04e49b0 4816af6 04e49b0 4816af6 04e49b0 4816af6 04e49b0 19b4c5e 4816af6 04e49b0 4816af6 04e49b0 4816af6 04e49b0 4816af6 04e49b0 4816af6 04e49b0 4816af6 04e49b0 4816af6 04e49b0 4816af6 6543b0a 4816af6 c6134f9 04e49b0 c6134f9 4816af6 3520514 6543b0a 4816af6 04e49b0 6543b0a 4816af6 f97d7d0 04e49b0 4816af6 04e49b0 4816af6 6543b0a 4816af6 5c728a9 4816af6 6543b0a 4816af6 04e49b0 6543b0a 4816af6 04e49b0 6543b0a 4816af6 04e49b0 6543b0a 4816af6 5c728a9 04e49b0 4816af6 04e49b0 4816af6 6543b0a 4816af6 5c728a9 6543b0a 4816af6 04e49b0 4816af6 6543b0a 4816af6 04e49b0 5c728a9 04e49b0 6543b0a 4816af6 04e49b0 4816af6 04e49b0 5c728a9 6543b0a 4816af6 5c728a9 4816af6 6543b0a 3520514 4816af6 6543b0a 3520514 4816af6 04e49b0 4816af6 04e49b0 6543b0a 4816af6 6543b0a 04e49b0 4816af6 6543b0a 4816af6 3520514 5c728a9 4816af6 f97d7d0 4816af6 f97d7d0 04e49b0 f97d7d0 4816af6 5c728a9 4816af6 f97d7d0 4816af6 5c728a9 6543b0a 4816af6 3520514 5c728a9 4816af6 5c728a9 4816af6 f97d7d0 4816af6 f97d7d0 5c728a9 f97d7d0 4816af6 5c728a9 4816af6 f97d7d0 5c728a9 f97d7d0 4816af6 5c728a9 04e49b0 4816af6 5c728a9 4816af6 f97d7d0 4816af6 f97d7d0 5c728a9 f97d7d0 4816af6 5c728a9 4816af6 f97d7d0 5c728a9 f97d7d0 4816af6 5c728a9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 |
import io
import wave
import streamlit as st
from transformers import pipeline
from PIL import Image
import numpy as np
import time
import threading
# βββ 1) MODEL LOADING (cached) ββββββββββββββββ
# st.cache_resource keeps each model in memory across Streamlit reruns,
# so the (slow) download/initialization happens only once per session.
@st.cache_resource
def get_image_captioner(model_name="Salesforce/blip-image-captioning-base"):
    """Return a cached image-to-text pipeline for captioning uploaded pictures."""
    captioner = pipeline("image-to-text", model=model_name, device="cpu")
    return captioner
@st.cache_resource
def get_story_pipe(model_name="google/flan-t5-base"):
    """Return a cached text2text pipeline that expands captions into stories."""
    story_pipe = pipeline("text2text-generation", model=model_name, device="cpu")
    return story_pipe
@st.cache_resource
def get_tts_pipe(model_name="facebook/mms-tts-eng"):
    """Return a cached text-to-speech pipeline that narrates the story."""
    tts = pipeline("text-to-speech", model=model_name, device="cpu")
    return tts
# βββ 2) TRANSFORM FUNCTIONS ββββββββββββββββ
def part1_image_to_text(pil_img, captioner):
    """Caption *pil_img* with an image-to-text pipeline.

    Args:
        pil_img: PIL image to describe.
        captioner: callable pipeline; expected to return a list of dicts
            with a "generated_text" key.

    Returns:
        The first generated caption, or "" when the pipeline yields nothing.
    """
    outputs = captioner(pil_img)
    if not outputs:
        return ""
    return outputs[0].get("generated_text", "")
def part2_text_to_story(
    caption: str,
    story_pipe,
    target_words: int = 100,
    max_length: int = 100,
    min_length: int = 80,
    do_sample: bool = True,
    top_k: int = 100,
    top_p: float = 0.9,
    temperature: float = 0.7,
    repetition_penalty: float = 1.1,
    no_repeat_ngram_size: int = 4
) -> str:
    """Expand an image caption into a short story via a text2text pipeline.

    Args:
        caption: scene description to build the story around.
        story_pipe: callable text2text pipeline returning
            [{"generated_text": ...}].
        target_words: approximate story length requested in the prompt.
        Remaining keyword arguments are forwarded as generation settings.

    Returns:
        The cleaned story text, truncated at its last full stop; "" when
        the pipeline produces no text.
    """
    # Prompt the model for a story of roughly target_words words.
    prompt = (
        f"Write a vivid, imaginative short story of about {target_words} words "
        f"describing this scene: {caption}"
    )
    gen_kwargs = dict(
        max_length=max_length,                      # upper bound on output length
        min_length=min_length,                      # lower bound for enough content
        do_sample=do_sample,                        # sampling for creative variety
        top_k=top_k,                                # top-k filtering
        top_p=top_p,                                # nucleus sampling
        temperature=temperature,                    # randomness control
        repetition_penalty=repetition_penalty,      # discourage repeated phrases
        no_repeat_ngram_size=no_repeat_ngram_size,  # forbid repeated n-grams
        early_stopping=False,                       # generate up to max_length
    )
    outputs = story_pipe(prompt, **gen_kwargs)
    raw = outputs[0].get("generated_text", "").strip()
    if not raw:
        return ""
    # Some models echo the prompt back; strip that prefix when present.
    if raw.lower().startswith(prompt.lower()):
        story = raw[len(prompt):].strip()
    else:
        story = raw
    # Cut at the final sentence boundary so the story ends naturally.
    last_period = story.rfind(".")
    return story[:last_period + 1] if last_period != -1 else story
def part3_text_to_speech_bytes(text: str, tts_pipe) -> bytes:
    """Synthesize *text* to speech and return it as 16-bit PCM WAV bytes.

    Args:
        text: the text to narrate.
        tts_pipe: callable TTS pipeline returning a dict (or list of dicts)
            with "audio" (float np.ndarray, mono (samples,) or
            (channels, samples)) and "sampling_rate" (int, Hz).

    Returns:
        Complete WAV file contents as bytes.
    """
    out = tts_pipe(text)
    if isinstance(out, list):
        out = out[0]
    audio_array = out["audio"]        # np.ndarray, floats nominally in [-1, 1]
    rate = out["sampling_rate"]       # int sampling rate in Hz
    # (channels, samples) -> (samples, channels) so frames interleave for WAV
    data = audio_array.T if audio_array.ndim == 2 else audio_array
    # Clip before scaling: samples outside [-1, 1] would otherwise wrap around
    # on the int16 cast and produce loud clicks/artifacts.
    pcm = (np.clip(data, -1.0, 1.0) * 32767).astype(np.int16)
    buffer = io.BytesIO()
    # Context manager finalizes the WAV header even if a write fails.
    with wave.open(buffer, "wb") as wf:
        channels = 1 if data.ndim == 1 else data.shape[1]  # mono or multi-channel
        wf.setnchannels(channels)
        wf.setsampwidth(2)       # 2 bytes per sample = 16-bit PCM
        wf.setframerate(rate)
        wf.writeframes(pcm.tobytes())
    buffer.seek(0)  # rewind so the full file can be read back
    return buffer.read()
# βββ 3) STREAMLIT UI ββββββββββββββββββββββββββββ
# Configure the Streamlit page for a kid-friendly, centered layout
st.set_page_config(
    page_title="Picture to Story Magic",
    page_icon="β¨",
    layout="centered"
)
# Apply custom CSS for a colorful, engaging, and readable interface
st.markdown("""
<style>
.main {
    background-color: #e6f3ff; /* Light blue background for main area */
    padding: 20px;
    border-radius: 15px;
}
.stButton>button {
    background-color: #ffcccb; /* Pink button background */
    color: #000000; /* Black button text (fixed: "button-color" is not a CSS property) */
    border-radius: 10px;
    border: 2px solid #ff9999; /* Red border */
    font-size: 18px;
    font-weight: bold;
    padding: 10px 20px;
    transition: all 0.3s; /* Smooth hover effect */
}
.stButton>button:hover {
    background-color: #ff9999; /* Darker pink on hover */
    color: #ffffff;
    transform: scale(1.05); /* Slight zoom on hover */
}
.stFileUploader {
    background-color: #ffb300; /* Orange uploader background */
    border: 2px dashed #ff8c00; /* Dashed orange border */
    border-radius: 10px;
    padding: 10px;
}
.stFileUploader div[role="button"] {
    background-color: #f0f0f0; /* Light gray button */
    border-radius: 10px;
    padding: 10px;
}
.stFileUploader div[role="button"] > div {
    color: #000000 !important; /* Black text for readability */
    font-size: 16px;
}
.stFileUploader button {
    background-color: #ffca28 !important; /* Yellow button */
    color: #000000 !important;
    border-radius: 8px !important;
    border: 2px solid #ffb300 !important; /* Orange border */
    padding: 5px 15px !important;
    font-weight: bold !important;
    box-shadow: 0 2px 4px rgba(0,0,0,0.2) !important; /* Subtle shadow */
}
.stFileUploader button:hover {
    background-color: #ff8c00 !important; /* Orange on hover */
    color: #000000 !important;
}
.stImage {
    border: 3px solid #81c784; /* Green border for images */
    border-radius: 10px;
    box-shadow: 0 4px 8px rgba(0,0,0,0.1); /* Soft shadow */
}
.section-header {
    background-color: #b3e5fc; /* Light blue header background */
    padding: 10px;
    border-radius: 10px;
    text-align: center;
    font-size: 24px;
    font-weight: bold;
    color: #000000;
    margin-bottom: 10px;
}
.caption-box, .story-box {
    background-color: #f0f4c3; /* Light yellow for text boxes */
    padding: 15px;
    border-radius: 10px;
    border: 2px solid #d4e157; /* Green-yellow border */
    margin-bottom: 20px;
    color: #000000;
}
.caption-box b, .story-box b {
    color: #000000; /* Black for bold text */
}
.stProgress > div > div {
    background-color: #81c784; /* Green progress bar */
}
</style>
""", unsafe_allow_html=True)
# Display the main title with a fun, magical theme
st.markdown("<div class='section-header'>Picture to Story Magic! β¨</div>", unsafe_allow_html=True)
# Image upload section
with st.container():
    # Prompt user to upload an image (images only: jpg/jpeg/png)
    st.markdown("<div class='section-header'>1οΈβ£ Pick a Fun Picture! πΌοΈ</div>", unsafe_allow_html=True)
    uploaded = st.file_uploader("Choose a picture to start the magic! π", type=["jpg","jpeg","png"])
    if not uploaded:
        # Halt the script here on every rerun until a file is provided
        st.info("Upload a picture, and let's make a story! π")
        st.stop()
    # Display the uploaded image
    with st.spinner("Looking at your picture..."):
        pil_img = Image.open(uploaded)
        st.image(pil_img, use_container_width=True) # Show image scaled to container
# Caption generation section
with st.container():
    st.markdown("<div class='section-header'>2οΈβ£ What's in the Picture? π§</div>", unsafe_allow_html=True)
    captioner = get_image_captioner()  # Load (cached) captioning model
    progress_bar = st.progress(0)      # Initialize progress bar
    result = [None]                    # Mutable cell the worker thread writes into
    def run_caption():
        # Run captioning in a separate thread to avoid blocking the UI
        result[0] = part1_image_to_text(pil_img, captioner)
    with st.spinner("Figuring out what's in your picture..."):
        thread = threading.Thread(target=run_caption)
        thread.start()
        # Advance the bar only while the model is actually working, instead of
        # sleeping a fixed 5 s: this finishes as soon as the thread does and
        # never shows 100% before the real work is complete.
        pct = 0
        while thread.is_alive():
            pct = min(pct + 1, 99)  # hold at 99% until the work is done
            progress_bar.progress(pct)
            time.sleep(0.05)
        thread.join()  # Ensure captioning has fully completed
        progress_bar.progress(100)
    progress_bar.empty()  # Clear progress bar
    caption = result[0]
    # Display the generated caption in a styled box
    st.markdown(f"<div class='caption-box'><b>Picture Description:</b><br>{caption}</div>", unsafe_allow_html=True)
# Story and audio generation section
with st.container():
    st.markdown("<div class='section-header'>3οΈβ£ Your Story and Audio! π΅</div>", unsafe_allow_html=True)
    # Story generation
    story_pipe = get_story_pipe()  # Load (cached) story model
    progress_bar = st.progress(0)
    result = [None]  # Mutable cell the worker thread writes into
    def run_story():
        # Generate the story in a separate thread to keep the UI responsive
        result[0] = part2_text_to_story(caption, story_pipe)
    with st.spinner("Writing a super cool story..."):
        thread = threading.Thread(target=run_story)
        thread.start()
        # Tie the bar to the real work instead of a fixed 7 s sleep: stops as
        # soon as generation finishes and never hits 100% early.
        pct = 0
        while thread.is_alive():
            pct = min(pct + 1, 99)
            progress_bar.progress(pct)
            time.sleep(0.07)
        thread.join()
        progress_bar.progress(100)
    progress_bar.empty()
    story = result[0]
    # Display the generated story in a styled box
    st.markdown(f"<div class='story-box'><b>Your Cool Story! π</b><br>{story}</div>", unsafe_allow_html=True)
    # Text-to-speech conversion
    tts_pipe = get_tts_pipe()  # Load (cached) TTS model
    progress_bar = st.progress(0)
    result = [None]
    def run_tts():
        # Synthesize audio in a separate thread
        result[0] = part3_text_to_speech_bytes(story, tts_pipe)
    with st.spinner("Turning your story into sound..."):
        thread = threading.Thread(target=run_tts)
        thread.start()
        # Same work-driven progress loop instead of a fixed 10 s sleep
        pct = 0
        while thread.is_alive():
            pct = min(pct + 1, 99)
            progress_bar.progress(pct)
            time.sleep(0.10)
        thread.join()
        progress_bar.progress(100)
    progress_bar.empty()
    audio_bytes = result[0]
    # Play the generated audio in the UI
    st.audio(audio_bytes, format="audio/wav")