Spaces:

CR7CAD
/

Assignment1

Sleeping

App Files Files Community

CR7CAD committed on Mar 8

Commit

ab8ead3

verified ·

1 Parent(s): 1ebc71c

Update app.py

Browse files

Files changed (1) hide show

app.py +89 -80

app.py CHANGED Viewed

@@ -1,68 +1,72 @@
-# import part - only using the two requested imports
 import streamlit as st
 from transformers import pipeline
-# function part
-# img2text
-def img2text(image_path):
-    image_to_text = pipeline("image-to-text", model="sooh-j/blip-image-captioning-base")
-    text = image_to_text(image_path)[0]["generated_text"]
     return text
-# text2story - IMPROVED to end naturally
 def text2story(text):
-    # Using a smaller text generation model
-    generator = pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
-    # Create a prompt for the story generation
-    prompt = f"Write a fun children's story based on this: {text}. The story should be short and end naturally with a conclusion. Once upon a time, "
-    # Generate the story
     story_result = generator(
         prompt,
-        max_length=250,  # Increased to allow for a complete story
         num_return_sequences=1,
         temperature=0.7,
-        top_k=50,
-        top_p=0.95,
         do_sample=True
     )
-    # Extract the generated text
     story_text = story_result[0]['generated_text']
     story_text = story_text.replace(prompt, "Once upon a time, ")
-    # Find a natural ending point (end of sentence) before 100 words
-    words = story_text.split()
-    if len(words) > 100:
-        # Join the first 100 words
-        shortened_text = " ".join(words[:100])
-        # Find the last complete sentence
-        last_period = shortened_text.rfind('.')
-        last_question = shortened_text.rfind('?')
-        last_exclamation = shortened_text.rfind('!')
-        # Find the last sentence ending punctuation
-        last_end = max(last_period, last_question, last_exclamation)
-        if last_end > 0:
-            # Truncate at the end of the last complete sentence
-            story_text = shortened_text[:last_end + 1]
-        else:
-            # If no sentence ending found, just use the shortened text
-            story_text = shortened_text
     return story_text
-# text2audio - Using HelpingAI-TTS-v1 model
 def text2audio(story_text):
     try:
-        # Use the HelpingAI TTS model as requested
-        synthesizer = pipeline("text-to-speech", model="HelpingAI/HelpingAI-TTS-v1")
-        # Limit text length to avoid timeouts
-        max_chars = 500
         if len(story_text) > max_chars:
             last_period = story_text[:max_chars].rfind('.')
             if last_period > 0:
@@ -72,46 +76,57 @@ def text2audio(story_text):
         # Generate speech
         speech = synthesizer(story_text)
-        # Get output information
-        st.write(f"Speech output keys: {list(speech.keys())}")
         return speech
     except Exception as e:
         st.error(f"Error generating audio: {str(e)}")
         return None
-# main part
-st.set_page_config(page_title="Your Image to Audio Story", page_icon="🦜")
-st.header("Turn Your Image to Audio Story")
-uploaded_file = st.file_uploader("Select an Image...")
-if uploaded_file is not None:
-    # Display the uploaded image
-    st.image(uploaded_file, caption="Uploaded Image", use_container_width=True)
-    # Create a temporary file in memory from the uploaded file
-    image_bytes = uploaded_file.getvalue()
-    # Stage 1: Image to Text
-    st.text('Processing img2text...')
-    caption = img2text(image_bytes)  # Pass bytes directly to pipeline
-    st.write(caption)
-    # Stage 2: Text to Story
-    st.text('Generating a story...')
-    story = text2story(caption)
-    st.write(story)
-    # Stage 3: Story to Audio data
-    st.text('Generating audio data...')
-    speech_output = text2audio(story)
-    # Play button
-    if st.button("Play Audio"):
         if speech_output is not None:
-            # Try to play the audio directly
             try:
                 if 'audio' in speech_output and 'sampling_rate' in speech_output:
                     st.audio(speech_output['audio'], sample_rate=speech_output['sampling_rate'])
@@ -120,21 +135,15 @@ if uploaded_file is not None:
                 elif 'waveform' in speech_output and 'sample_rate' in speech_output:
                     st.audio(speech_output['waveform'], sample_rate=speech_output['sample_rate'])
                 else:
-                    # Try the first array-like value as audio data
                     for key, value in speech_output.items():
                         if hasattr(value, '__len__') and len(value) > 1000:
-                            if 'rate' in speech_output:
-                                st.audio(value, sample_rate=speech_output['rate'])
-                            elif 'sample_rate' in speech_output:
-                                st.audio(value, sample_rate=speech_output['sample_rate'])
-                            elif 'sampling_rate' in speech_output:
-                                st.audio(value, sample_rate=speech_output['sampling_rate'])
-                            else:
-                                st.audio(value, sample_rate=24000)  # Default sample rate
                             break
                     else:
-                        st.error(f"Could not find compatible audio format in: {list(speech_output.keys())}")
             except Exception as e:
                 st.error(f"Error playing audio: {str(e)}")
         else:
-            st.error("Audio generation failed. Please try again.")

+# import part
 import streamlit as st
 from transformers import pipeline
+from PIL import Image
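+# Enable caching globally. NOTE(review): `set_caching_enabled` does not appear to be a public `transformers` export (a helper of that name lived in `datasets`) - confirm this import succeeds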
+from transformers import set_caching_enabled
+set_caching_enabled(True)
+# Model loaders - each pipeline is built inside an @st.cache_resource function so it can be reused instead of rebuilt on every call
+@st.cache_resource
+def load_image_captioning_model():
+    return pipeline("image-to-text", model="sooh-j/blip-image-captioning-base")
+@st.cache_resource
+def load_text_generator():
+    return pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+@st.cache_resource
+def load_tts_model():
+    return pipeline("text-to-speech", model="HelpingAI/HelpingAI-TTS-v1")
+# img2text - caption `image` with the captioning pipeline and return the generated text
+def img2text(image):
+    # Reuse the pipeline returned by load_image_captioning_model()
+    image_to_text = load_image_captioning_model()
+    # Cap the caption at 15 new tokens to keep inference short; keep only the first result's text
+    text = image_to_text(image, max_new_tokens=15)[0]["generated_text"]
     return text
+# text2story - turn a caption into a short story that opens with "Once upon a time, "
 def text2story(text):
+    # Reuse the pipeline returned by load_text_generator()
+    generator = load_text_generator()
+    # Short prompt that already ends with the story's opening words
+    prompt = f"Short story about {text}: Once upon a time, "
+    # Sampling settings; max_new_tokens is the setting that bounds how much text is generated
     story_result = generator(
         prompt,
+        max_new_tokens=60,  # Hard cap on the number of generated tokens
         num_return_sequences=1,
         temperature=0.7,
+        top_k=10,  # Sample only among the 10 most likely tokens (narrows the output; not a speed setting)
+        top_p=0.9,  # Nucleus sampling threshold (narrows the output; not a speed setting)
         do_sample=True
     )
+    # generated_text is expected to echo the prompt; replace it with just the opening words
     story_text = story_result[0]['generated_text']
     story_text = story_text.replace(prompt, "Once upon a time, ")
+    # Trim to the last '.' so the story does not stop mid-sentence
+    last_period = story_text.rfind('.')
+    if last_period > 30:  # Only trim when the period lies past index 30, so some story text remains
+        story_text = story_text[:last_period + 1]
     return story_text
+# text2audio - synthesize speech for the story; returns the pipeline output, or None after reporting an error
 def text2audio(story_text):
     try:
+        # Reuse the pipeline returned by load_tts_model()
+        synthesizer = load_tts_model()
+        # Stories longer than max_chars are shortened; the cut point is searched at the last '.' within the limit
+        max_chars = 200  # Character budget for the TTS input
         if len(story_text) > max_chars:
             last_period = story_text[:max_chars].rfind('.')
             if last_period > 0:
         # Generate speech from the (possibly shortened) text
         speech = synthesizer(story_text)
         return speech
     except Exception as e:
         st.error(f"Error generating audio: {str(e)}")
         return None
+# Main UI: page setup, image upload, then caption -> story -> audio when the button is pressed
+st.set_page_config(page_title="Image to Story", page_icon="📚")
+st.header("Image to Audio Story")
+# Tell the user up front that processing can be slow
+st.info("Note: Processing may take some time as the models are loading. Please be patient.")
+# Initialise a session_state slot for the uploaded file (written below; not read in the visible code)
+if "uploaded_file" not in st.session_state:
+    st.session_state["uploaded_file"] = None
+uploaded_file = st.file_uploader("Select an Image...", key="file_uploader")
+# Everything below runs only once a file has been uploaded
+if uploaded_file is not None:
+    st.session_state["uploaded_file"] = uploaded_file
+    # Display the uploaded image. NOTE(review): use_column_width is deprecated in recent Streamlit in favour of use_container_width - confirm the target version
+    st.image(uploaded_file, caption="Uploaded Image", use_column_width=True)
+    # Open the upload as a PIL image for the captioning step
+    image = Image.open(uploaded_file)
+    # Run the three stages only when the user clicks the button
+    if st.button("Generate Story and Audio"):
+        col1, col2 = st.columns(2)
+        # Stage 1: image -> caption, shown in the left column
+        with col1:
+            with st.spinner('Captioning image...'):
+                caption = img2text(image)
+            st.write(f"**Caption:** {caption}")
+        # Stage 2: caption -> story, shown in the right column
+        with col2:
+            with st.spinner('Creating story...'):
+                story = text2story(caption)
+            st.write(f"**Story:** {story}")
+        # Stage 3: story -> speech (text2audio returns None on failure)
+        with st.spinner('Generating audio...'):
+            speech_output = text2audio(story)
+        # Play the audio right away, trying the known key layouts of the TTS output first
         if speech_output is not None:
             try:
                 if 'audio' in speech_output and 'sampling_rate' in speech_output:
                     st.audio(speech_output['audio'], sample_rate=speech_output['sampling_rate'])
                 elif 'waveform' in speech_output and 'sample_rate' in speech_output:
                     st.audio(speech_output['waveform'], sample_rate=speech_output['sample_rate'])
                 else:
+                    # Fallback: play the first value with more than 1000 elements, presumably the waveform
                     for key, value in speech_output.items():
                         if hasattr(value, '__len__') and len(value) > 1000:
+                            sample_rate = speech_output.get('sampling_rate', speech_output.get('sample_rate', 24000))
+                            st.audio(value, sample_rate=sample_rate)
                             break
                     else:
+                        st.error("Could not find audio data in the output")
             except Exception as e:
                 st.error(f"Error playing audio: {str(e)}")
         else:
+            st.error("Audio generation failed")