# Imports
import streamlit as st
from transformers import pipeline
from PIL import Image
import torch
import os
import tempfile

# Phrase every story is expected to open with (the prompt asks for it).
STORY_OPENING = "Once upon a time"


@st.cache_resource(show_spinner=False)
def _load_pipeline(task, model):
    """Build a transformers pipeline once and reuse it on later calls.

    Constructing a pipeline loads the model weights, so the result is kept
    in Streamlit's resource cache instead of being rebuilt on every call.
    """
    return pipeline(task, model=model)


# For TTS, try multiple options in order of preference
try:
    # Try gTTS first
    from gtts import gTTS
except ImportError:
    st.warning("gTTS not available. Using alternative text-to-speech method.")

    # Define alternative TTS using built-in transformers pipeline
    def text2audio(story_text):
        """Convert text to speech with the transformers text-to-speech pipeline.

        Args:
            story_text: The text to read aloud.

        Returns:
            A ``(audio, sampling_rate)`` tuple, where ``audio`` is whatever the
            pipeline stored under ``'audio'`` (or ``'audio_array'``) and
            ``sampling_rate`` defaults to 16000 when the pipeline omits it.

        Raises:
            RuntimeError: If the pipeline output contains no audio entry.
        """
        synthesizer = _load_pipeline("text-to-speech", "facebook/mms-tts-eng")
        speech = synthesizer(story_text)

        sampling_rate = speech.get('sampling_rate', 16000)
        for key in ('audio', 'audio_array'):
            if key in speech:
                return speech[key], sampling_rate

        raise RuntimeError("Failed to generate audio with any available method")
else:
    def text2audio(story_text):
        """Convert text to speech with gTTS.

        Args:
            story_text: The text to read aloud.

        Returns:
            A ``(audio_bytes, 'audio/mp3')`` tuple holding the MP3 file content
            and its MIME type.
        """
        # Reserve a file name, then close the handle so gTTS can write to the
        # path itself.
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.mp3')
        temp_filename = temp_file.name
        temp_file.close()

        try:
            tts = gTTS(text=story_text, lang='en', slow=False)
            tts.save(temp_filename)
            with open(temp_filename, 'rb') as audio_file:
                audio_bytes = audio_file.read()
        finally:
            # Remove the temporary file even when synthesis or reading fails.
            os.unlink(temp_filename)

        return audio_bytes, 'audio/mp3'


# Simple image-to-text function
def img2text(image):
    """Return the caption generated for ``image`` by the image-to-text model."""
    image_to_text = _load_pipeline(
        "image-to-text", "sooh-j/blip-image-captioning-base"
    )
    return image_to_text(image)[0]["generated_text"]


def _trim_to_sentence(story_text, target_words=100, min_words=80):
    """Cut ``story_text`` at the sentence ending closest to ``target_words``.

    Only endings (``.``, ``?``, ``!``) that leave at least ``min_words`` words
    are considered. If there is none, the text is returned unchanged.
    """
    candidates = []
    for idx, char in enumerate(story_text):
        if char not in ".?!":
            continue
        word_count = len(story_text[:idx + 1].split())
        if word_count >= min_words:
            candidates.append((idx, word_count))

    if not candidates:
        # Too short, or no sentence ending far enough in: keep the story as is.
        return story_text

    # min() keeps the earliest candidate on ties, i.e. the shorter story.
    best_idx, _ = min(candidates, key=lambda item: abs(item[1] - target_words))
    return story_text[:best_idx + 1]


# Improved text-to-story function with longer stories (approaching 100 words)
def text2story(text):
    """Generate a short children's story from a caption.

    Args:
        text: The caption (or any short description) to base the story on.

    Returns:
        The story text, starting with "Once upon a time" and, when the model
        produced enough text, trimmed to end on a sentence boundary at roughly
        100 words (never fewer than 80 in that case).
    """
    generator = _load_pipeline(
        "text-generation", "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
    )

    # Ask for a longer story (150-200 words) so that at least ~100 come back.
    prompt = (
        f"Write a children's story based on this: {text}.\n"
        "The story should have a clear beginning, middle, and end.\n"
        "Make the story approximately 150-200 words long with descriptive "
        "language.\n"
        'Start with "Once upon a time, "\n'
    )

    story_result = generator(
        prompt,
        max_length=500,
        num_return_sequences=1,
        temperature=0.7,
        do_sample=True,
        # Only the continuation is wanted: the prompt itself contains
        # "Once upon a time", so searching the full text would match the
        # instruction instead of the story.
        return_full_text=False,
    )
    story_text = story_result[0]['generated_text']

    # Defensive: drop the prompt if it was echoed back anyway.
    if story_text.startswith(prompt):
        story_text = story_text[len(prompt):]

    start_idx = story_text.find(STORY_OPENING)
    if start_idx != -1:
        story_text = story_text[start_idx:]
    else:
        # The model skipped the requested opening; add it ourselves.
        story_text = "Once upon a time, " + story_text.lstrip()

    return _trim_to_sentence(story_text, target_words=100, min_words=80)


# Basic Streamlit interface
st.title("Image to Audio Story")
uploaded_file = st.file_uploader("Upload an image")

if uploaded_file is not None:
    # Display image
    st.image(uploaded_file, caption="Uploaded Image")

    # Convert to PIL Image
    image = Image.open(uploaded_file)

    # Image to Text
    with st.spinner("Generating caption..."):
        caption = img2text(image)
    st.write(f"Caption: {caption}")

    # Text to Story
    with st.spinner("Creating story..."):
        story = text2story(caption)

    # Display word count for transparency
    word_count = len(story.split())
    st.write(f"Story ({word_count} words):")
    st.write(story)

    # Text to Audio
    with st.spinner("Generating audio..."):
        try:
            audio_data, audio_format = text2audio(story)

            # text2audio returns either (bytes, MIME type) or
            # (samples, sampling rate); pick the matching st.audio call.
            if isinstance(audio_format, str) and audio_format.startswith('audio/'):
                st.audio(audio_data, format=audio_format)
            else:
                st.audio(audio_data, sample_rate=audio_format)
        except Exception as e:
            st.error(f"Error generating or playing audio: {e}")
            st.info("There was an issue with the text-to-speech conversion.")