Update app.py
Browse files
app.py
CHANGED
|
@@ -66,7 +66,7 @@ def get_caption(image_bytes):
|
|
| 66 |
and generates a caption.
|
| 67 |
"""
|
| 68 |
image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
|
| 69 |
-
# Resize to 256x256 maximum for faster processing
|
| 70 |
image.thumbnail((256, 256))
|
| 71 |
caption = st.session_state.captioner(image)[0]["generated_text"]
|
| 72 |
return caption
|
|
@@ -96,12 +96,32 @@ def get_story(caption):
|
|
| 96 |
def get_audio(story):
|
| 97 |
"""
|
| 98 |
Converts the generated story text into audio.
|
| 99 |
-
Splits the text into 300-character chunks to reduce repeated TTS calls
|
| 100 |
-
|
| 101 |
"""
|
| 102 |
chunks = textwrap.wrap(story, width=300)
|
| 103 |
-
audio_chunks = [
|
| 104 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
buffer = io.BytesIO()
|
| 106 |
sf.write(buffer, audio, st.session_state.tts.model.config.sampling_rate, format="WAV")
|
| 107 |
buffer.seek(0)
|
|
@@ -111,7 +131,7 @@ def get_audio(story):
|
|
| 111 |
uploaded_file = st.file_uploader("Choose a Picture...", type=["jpg", "jpeg", "png"])
|
| 112 |
if uploaded_file is not None:
|
| 113 |
try:
|
| 114 |
-
load_models() #
|
| 115 |
image_bytes = uploaded_file.getvalue()
|
| 116 |
# Display the uploaded image
|
| 117 |
image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
|
|
|
|
| 66 |
and generates a caption.
|
| 67 |
"""
|
| 68 |
image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
|
| 69 |
+
# Resize image to 256x256 maximum for faster processing
|
| 70 |
image.thumbnail((256, 256))
|
| 71 |
caption = st.session_state.captioner(image)[0]["generated_text"]
|
| 72 |
return caption
|
|
|
|
| 96 |
def get_audio(story):
|
| 97 |
"""
|
| 98 |
Converts the generated story text into audio.
|
| 99 |
+
Splits the text into 300-character chunks to reduce repeated TTS calls.
|
| 100 |
+
Checks each chunk, and if no valid audio is produced, creates a brief default silent audio.
|
| 101 |
"""
|
| 102 |
chunks = textwrap.wrap(story, width=300)
|
| 103 |
+
audio_chunks = []
|
| 104 |
+
for chunk in chunks:
|
| 105 |
+
try:
|
| 106 |
+
output = st.session_state.tts(chunk)
|
| 107 |
+
# Some pipelines return a list; if so, use the first element.
|
| 108 |
+
if isinstance(output, list):
|
| 109 |
+
output = output[0]
|
| 110 |
+
if "audio" in output:
|
| 111 |
+
# Ensure the audio is a numpy array and squeeze any extra dimensions.
|
| 112 |
+
audio_array = np.array(output["audio"]).squeeze()
|
| 113 |
+
audio_chunks.append(audio_array)
|
| 114 |
+
except Exception as e:
|
| 115 |
+
# Skip any chunk that raises an error.
|
| 116 |
+
continue
|
| 117 |
+
|
| 118 |
+
# If no audio was generated, produce 1 second of silence as a fallback.
|
| 119 |
+
if not audio_chunks:
|
| 120 |
+
sr = st.session_state.tts.model.config.sampling_rate
|
| 121 |
+
audio = np.zeros(sr, dtype=np.float32)
|
| 122 |
+
else:
|
| 123 |
+
audio = np.concatenate(audio_chunks)
|
| 124 |
+
|
| 125 |
buffer = io.BytesIO()
|
| 126 |
sf.write(buffer, audio, st.session_state.tts.model.config.sampling_rate, format="WAV")
|
| 127 |
buffer.seek(0)
|
|
|
|
| 131 |
uploaded_file = st.file_uploader("Choose a Picture...", type=["jpg", "jpeg", "png"])
|
| 132 |
if uploaded_file is not None:
|
| 133 |
try:
|
| 134 |
+
load_models() # Ensure models are loaded
|
| 135 |
image_bytes = uploaded_file.getvalue()
|
| 136 |
# Display the uploaded image
|
| 137 |
image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
|