szeandlinkProject_Testing

Sleeping

App Files Files Community

Szeyu committed on Apr 30, 2025

Commit

95bff35

verified ·

1 Parent(s): d31e539

Update app.py

Browse files

Files changed (1) hide show

app.py +11 -13

app.py CHANGED Viewed

@@ -43,7 +43,6 @@ def load_models():
       3. TTS: Converts text into audio.
     """
     if "captioner" not in st.session_state:
-        # Use the "base" version for faster/cost-effective captioning.
         st.session_state.captioner = pipeline(
             "image-to-text",
             model="Salesforce/blip-image-captioning-base"
@@ -63,11 +62,11 @@ def load_models():
 @st.cache_data(show_spinner=False)
 def get_caption(image_bytes):
     """
-    Convert the image bytes into a smaller image to speed up captioning,
-    then return the generated caption.
     """
     image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
-    # Resize the image (preserving aspect ratio) to only 256x256 for faster processing.
     image.thumbnail((256, 256))
     caption = st.session_state.captioner(image)[0]["generated_text"]
     return caption
@@ -75,9 +74,8 @@ def get_caption(image_bytes):
 @st.cache_data(show_spinner=False)
 def get_story(caption):
     """
-    Generate a humorous and engaging children's story using the caption.
-    The prompt instructs the model to produce a playful story (50-100 words).
-    We lower max_new_tokens to 80 so that it generates its text faster.
     """
     prompt = (
         f"Write a funny, warm, and imaginative children's story for ages 3-10, 50-100 words, "
@@ -88,7 +86,7 @@ def get_story(caption):
     )
     raw_story = st.session_state.storyer(
         prompt,
-        max_new_tokens=80,   # Reduced token generation for faster response
         do_sample=True,
         temperature=0.7,
         top_p=0.9,
@@ -100,9 +98,9 @@ def get_story(caption):
 @st.cache_data(show_spinner=False)
 def get_audio(story):
     """
-    Convert the generated story text into audio.
-    The text is split into 300-character chunks to reduce repeated TTS calls,
-    the audio chunks are concatenated, and then stored in an in-memory WAV buffer.
     """
     chunks = textwrap.wrap(story, width=300)
     audio_chunks = [st.session_state.tts(chunk)["audio"].squeeze() for chunk in chunks]
@@ -116,9 +114,9 @@ def get_audio(story):
 uploaded_file = st.file_uploader("Choose a Picture...", type=["jpg", "jpeg", "png"])
 if uploaded_file is not None:
     try:
-        load_models()  # Ensure models are loaded once
         image_bytes = uploaded_file.getvalue()
-        # Display the user-uploaded image
         image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
         st.image(image, caption="Your Amazing Picture!", use_column_width=True)
         st.markdown("<h3 style='text-align: center;'>Ready for your story?</h3>", unsafe_allow_html=True)

       3. TTS: Converts text into audio.
     """
     if "captioner" not in st.session_state:
         st.session_state.captioner = pipeline(
             "image-to-text",
             model="Salesforce/blip-image-captioning-base"
 @st.cache_data(show_spinner=False)
 def get_caption(image_bytes):
     """
+    Converts image bytes into a lower resolution image (256x256 maximum)
+    and generates a caption.
     """
     image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
+    # Resize to speed up processing
     image.thumbnail((256, 256))
     caption = st.session_state.captioner(image)[0]["generated_text"]
     return caption
 @st.cache_data(show_spinner=False)
 def get_story(caption):
     """
+    Generates a humorous and engaging children's story based on the caption.
+    Uses a prompt to instruct the model and limits token generation to 80 tokens.
     """
     prompt = (
         f"Write a funny, warm, and imaginative children's story for ages 3-10, 50-100 words, "
     )
     raw_story = st.session_state.storyer(
         prompt,
+        max_new_tokens=80,
         do_sample=True,
         temperature=0.7,
         top_p=0.9,
 @st.cache_data(show_spinner=False)
 def get_audio(story):
     """
+    Converts the generated story text into audio.
+    Splits the text into 300-character chunks to reduce repeated TTS calls,
+    concatenates the resulting audio chunks, and returns an in-memory WAV buffer.
     """
     chunks = textwrap.wrap(story, width=300)
     audio_chunks = [st.session_state.tts(chunk)["audio"].squeeze() for chunk in chunks]
 uploaded_file = st.file_uploader("Choose a Picture...", type=["jpg", "jpeg", "png"])
 if uploaded_file is not None:
     try:
+        load_models()  # Make sure models are loaded
         image_bytes = uploaded_file.getvalue()
+        # Display the uploaded image
         image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
         st.image(image, caption="Your Amazing Picture!", use_column_width=True)
         st.markdown("<h3 style='text-align: center;'>Ready for your story?</h3>", unsafe_allow_html=True)