Spaces:

CR7CAD
/

Assignment1

Sleeping

App Files Community

CR7CAD committed on Mar 8

Commit

7c4bc18

verified ·

1 Parent(s): ad4186a

Update app.py

Browse files

Files changed (1) hide show

app.py +77 -23

app.py CHANGED Viewed

@@ -9,14 +9,15 @@ def img2text(image):
     text = image_to_text(image)[0]["generated_text"]
     return text
-# Simple text-to-story function
 def text2story(text):
     generator = pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
-    prompt = f"Write a short children's story based on this: {text}. Once upon a time, "
     story_result = generator(
         prompt,
-        max_length=150,
         num_return_sequences=1,
         temperature=0.7,
         do_sample=True
@@ -24,12 +25,60 @@ def text2story(text):
     story_text = story_result[0]['generated_text']
     story_text = story_text.replace(prompt, "Once upon a time, ")
     return story_text
-# Simple text-to-audio function
 def text2audio(story_text):
-    synthesizer = pipeline("text-to-speech", model="HelpingAI/HelpingAI-TTS-v1")
-    speech = synthesizer(story_text)
     return speech
 # Basic Streamlit interface
@@ -44,26 +93,31 @@ if uploaded_file is not None:
     image = Image.open(uploaded_file)
     # Image to Text
-    st.write("Generating caption...")
-    caption = img2text(image)
     st.write(f"Caption: {caption}")
     # Text to Story
-    st.write("Creating story...")
-    story = text2story(caption)
     st.write(f"Story: {story}")
     # Text to Audio
-    st.write("Generating audio...")
-    speech_output = text2audio(story)
-    # Play audio
-    try:
-        if 'audio' in speech_output and 'sampling_rate' in speech_output:
-            st.audio(speech_output['audio'], sample_rate=speech_output['sampling_rate'])
-        elif 'audio_array' in speech_output and 'sampling_rate' in speech_output:
-            st.audio(speech_output['audio_array'], sample_rate=speech_output['sampling_rate'])
-        else:
-            st.write("Audio generated but could not be played.")
-    except Exception as e:
-        st.error(f"Error playing audio: {e}")

     text = image_to_text(image)[0]["generated_text"]
     return text
+# Improved text-to-story function with natural ending
 def text2story(text):
     generator = pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
+    prompt = f"Write a short children's story based on this: {text}. The story should have a clear beginning, middle, and end. Keep it under 150 words. Once upon a time, "
+    # Generate a longer text to ensure we get a complete story
     story_result = generator(
         prompt,
+        max_length=300,
         num_return_sequences=1,
         temperature=0.7,
         do_sample=True
     story_text = story_result[0]['generated_text']
     story_text = story_text.replace(prompt, "Once upon a time, ")
+    # Find natural ending points (end of sentences)
+    periods = [i for i, char in enumerate(story_text) if char == '.']
+    question_marks = [i for i, char in enumerate(story_text) if char == '?']
+    exclamation_marks = [i for i, char in enumerate(story_text) if char == '!']
+    # Combine all ending punctuation and sort
+    all_endings = sorted(periods + question_marks + exclamation_marks)
+    # If we have any sentence endings
+    if all_endings:
+        # Get the index where the story should reasonably end (after at least 100 characters)
+        min_story_length = 100
+        suitable_endings = [i for i in all_endings if i >= min_story_length]
+        if suitable_endings:
+            # Find an ending that completes a thought (not just the first sentence)
+            if len(suitable_endings) > 2:
+                # Use the third sentence ending or later for a more complete story
+                return story_text[:suitable_endings[2]+1]
+            else:
+                # If we don't have many sentences, use the last one we found
+                return story_text[:suitable_endings[-1]+1]
+    # If no good ending is found, return as is
     return story_text
+# Updated text-to-audio function with a compatible model
 def text2audio(story_text):
+    # Use Microsoft's SpeechT5 model which is widely supported
+    synthesizer = pipeline("text-to-speech", model="microsoft/speecht5_tts")
+    # This model requires speaker embeddings
+    from transformers import SpeechT5HifiGan
+    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+    # Get speaker embeddings for a female voice
+    from transformers import SpeechT5Processor
+    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+    speaker_embeddings = processor.speaker_embeddings["female"]
+    # Limit text length to avoid issues
+    max_chars = 500
+    if len(story_text) > max_chars:
+        last_period = story_text[:max_chars].rfind('.')
+        if last_period > 0:
+            story_text = story_text[:last_period + 1]
+        else:
+            story_text = story_text[:max_chars]
+    # Generate speech with appropriate parameters
+    inputs = processor(text=story_text, return_tensors="pt")
+    speech = synthesizer(inputs["input_ids"][0], speaker_embeddings=speaker_embeddings, vocoder=vocoder)
     return speech
 # Basic Streamlit interface
     image = Image.open(uploaded_file)
     # Image to Text
+    with st.spinner("Generating caption..."):
+        caption = img2text(image)
     st.write(f"Caption: {caption}")
     # Text to Story
+    with st.spinner("Creating story..."):
+        story = text2story(caption)
     st.write(f"Story: {story}")
     # Text to Audio
+    with st.spinner("Generating audio..."):
+        try:
+            speech_output = text2audio(story)
+            # Play audio
+            if hasattr(speech_output, 'numpy') or hasattr(speech_output, 'audio'):
+                if hasattr(speech_output, 'numpy'):
+                    audio_data = speech_output.numpy()
+                else:
+                    audio_data = speech_output.audio
+                sample_rate = speech_output.sampling_rate if hasattr(speech_output, 'sampling_rate') else 16000
+                st.audio(audio_data, sample_rate=sample_rate)
+            else:
+                st.audio(speech_output['audio'], sample_rate=speech_output.get('sampling_rate', 16000))
+        except Exception as e:
+            st.error(f"Error generating or playing audio: {e}")
+            st.write("Try installing the latest transformers library with: pip install --upgrade transformers")