# Hugging Face Space: image -> caption -> story -> audio (Streamlit app)
# Imports
import os
import tempfile

import streamlit as st
import torch
from PIL import Image
from transformers import pipeline
# For TTS, try multiple options in order of preference.
try:
    # Preferred backend: gTTS (Google Text-to-Speech), which produces MP3 bytes.
    from gtts import gTTS

    def text2audio(story_text):
        """Convert story_text to speech with gTTS.

        Returns:
            tuple: ``(audio_bytes, 'audio/mp3')`` — raw MP3 bytes plus a MIME
            string, suitable for ``st.audio(data, format=...)``.
        """
        # delete=False so gTTS can write to the file by name after we close it
        # (reopening an open temp file fails on Windows).
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp3')
        temp_filename = temp_file.name
        temp_file.close()
        try:
            # Use gTTS to convert text to speech, then read the result back.
            tts = gTTS(text=story_text, lang='en', slow=False)
            tts.save(temp_filename)
            with open(temp_filename, 'rb') as audio_file:
                audio_bytes = audio_file.read()
        finally:
            # Fix: clean up inside `finally` so a failed gTTS call (e.g. no
            # network) does not leak a temp file on every attempt.
            os.unlink(temp_filename)
        return audio_bytes, 'audio/mp3'
except ImportError:
    st.warning("gTTS not available. Using alternative text-to-speech method.")

    # Fallback: a transformers text-to-speech pipeline.
    def text2audio(story_text):
        """Convert story_text to speech with a transformers TTS pipeline.

        Returns:
            tuple: ``(audio_array, sampling_rate)`` — the caller distinguishes
            the two backends by whether the second element is a MIME string.

        Raises:
            Exception: if the pipeline output has no recognizable audio key.
        """
        # `pipeline` is already imported at module level; no re-import needed.
        synthesizer = pipeline("text-to-speech", model="facebook/mms-tts-eng")
        speech = synthesizer(story_text)
        # Different pipeline versions expose the waveform under different keys.
        if 'audio' in speech:
            return speech['audio'], speech.get('sampling_rate', 16000)
        elif 'audio_array' in speech:
            return speech['audio_array'], speech.get('sampling_rate', 16000)
        else:
            raise Exception("Failed to generate audio with any available method")
# Simple image-to-text function
def img2text(image):
    """Generate a one-line caption for *image* with a BLIP captioning model."""
    captioner = pipeline("image-to-text", model="sooh-j/blip-image-captioning-base")
    results = captioner(image)
    return results[0]["generated_text"]
# Improved text-to-story function with longer stories (approaching 100 words)
def text2story(text):
    """Expand a caption into a ~100-word children's story.

    Generates 150-200 words with TinyLlama, strips the echoed prompt, then
    trims the result back to the sentence boundary whose word count is
    closest to 100 (accepting anything >= 80 words).

    Args:
        text: the image caption to build the story around.

    Returns:
        str: the trimmed story text.
    """
    generator = pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
    # Specifically ask for a longer story (150-200 words) so that trimming
    # down to ~100 words always has a complete story to work with.
    prompt = f"""Write a children's story based on this: {text}.
The story should have a clear beginning, middle, and end.
Make the story approximately 150-200 words long with descriptive language.
Start with "Once upon a time, "
"""
    story_result = generator(
        prompt,
        max_length=500,  # generous cap so the story is not cut mid-sentence
        num_return_sequences=1,
        temperature=0.7,
        do_sample=True,
    )
    story_text = story_result[0]['generated_text']
    # The generated text echoes the prompt; keep only the story itself.
    if "Once upon a time, " in story_text:
        story_text = story_text[story_text.find("Once upon a time, "):]
    else:
        # Fallback: drop the prompt and re-attach the opening phrase.
        story_text = story_text.replace(prompt, "Once upon a time, ")
    return _trim_to_sentence(story_text)


def _trim_to_sentence(story_text, target_word_count=100, min_word_count=80):
    """Trim *story_text* at the sentence boundary closest to the target length.

    Returns the prefix ending in '.', '?' or '!' whose word count is >=
    *min_word_count* and closest to *target_word_count*; returns the text
    unchanged when no boundary reaches the minimum.
    """
    # Indices of every sentence-ending punctuation mark, in order.
    endings = [i for i, ch in enumerate(story_text) if ch in '.?!']
    # Candidate cut points that give a long-enough story.
    suitable = []
    for idx in endings:
        candidate = story_text[:idx + 1]
        word_count = len(candidate.split())
        if word_count >= min_word_count:
            suitable.append((idx, word_count))
    if suitable:
        # Pick the cut whose word count is closest to the target. (The old
        # fallback loop after this point was unreachable: it re-checked the
        # same >= 80-word condition that already failed, so it is removed.)
        best_idx = min(suitable, key=lambda item: abs(item[1] - target_word_count))[0]
        return story_text[:best_idx + 1]
    # No sentence boundary reached the minimum; return the story as is.
    return story_text
# --- Basic Streamlit interface -----------------------------------------------
st.title("Image to Audio Story")

uploaded = st.file_uploader("Upload an image")
if uploaded is not None:
    # Show the raw upload, then open it as a PIL image for the caption model.
    st.image(uploaded, caption="Uploaded Image")
    pil_image = Image.open(uploaded)

    # Step 1: caption the image.
    with st.spinner("Generating caption..."):
        caption = img2text(pil_image)
    st.write(f"Caption: {caption}")

    # Step 2: expand the caption into a short story.
    with st.spinner("Creating story..."):
        story = text2story(caption)
    # Display word count for transparency.
    word_count = len(story.split())
    st.write(f"Story ({word_count} words):")
    st.write(story)

    # Step 3: narrate the story.
    with st.spinner("Generating audio..."):
        try:
            audio_data, audio_format = text2audio(story)
            # gTTS returns a MIME string ('audio/mp3'); the fallback backend
            # returns a numeric sampling rate instead.
            is_mime = isinstance(audio_format, str) and audio_format.startswith('audio/')
            if is_mime:
                st.audio(audio_data, format=audio_format)
            else:
                st.audio(audio_data, sample_rate=audio_format)
        except Exception as e:
            st.error(f"Error generating or playing audio: {e}")
            st.info("There was an issue with the text-to-speech conversion.")