"""Streamlit app: upload an image, caption it with BLIP, expand the caption
into a short children's story with TinyLlama, and read the story aloud with
Microsoft SpeechT5 text-to-speech."""

import streamlit as st
import torch
from PIL import Image
from transformers import pipeline


@st.cache_resource(show_spinner=False)
def _load_pipeline(task: str, model: str):
    """Load a transformers pipeline once and cache it across Streamlit reruns.

    Streamlit re-executes the whole script on every interaction; without
    caching, each rerun would re-load (and possibly re-download) the models.
    """
    return pipeline(task, model=model)


def img2text(image) -> str:
    """Return a one-line caption for a PIL image using a BLIP captioning model."""
    image_to_text = _load_pipeline("image-to-text", "sooh-j/blip-image-captioning-base")
    return image_to_text(image)[0]["generated_text"]


def _trim_to_sentence_end(story_text: str, min_story_length: int = 100) -> str:
    """Cut generated text at a natural sentence boundary ('.', '?' or '!').

    Only boundaries at or beyond ``min_story_length`` characters are
    considered; the third such boundary is preferred (so the story is more
    than one thought), falling back to the last one found. If no suitable
    boundary exists, the text is returned unchanged.
    """
    suitable_endings = [
        i for i, char in enumerate(story_text)
        if char in ".?!" and i >= min_story_length
    ]
    if suitable_endings:
        # Third sentence ending when available, otherwise the last one found.
        cut = suitable_endings[min(2, len(suitable_endings) - 1)]
        return story_text[:cut + 1]
    return story_text


def text2story(text: str) -> str:
    """Generate a short children's story from a caption and end it cleanly."""
    generator = _load_pipeline("text-generation", "TinyLlama/TinyLlama-1.1B-Chat-v1.0")
    prompt = (
        f"Write a short children's story based on this: {text}. "
        "The story should have a clear beginning, middle, and end. "
        "Keep it under 150 words. Once upon a time, "
    )
    # Generate a longer text than needed so a complete story can be trimmed out.
    story_result = generator(
        prompt,
        max_length=300,
        num_return_sequences=1,
        temperature=0.7,
        do_sample=True,
    )
    story_text = story_result[0]["generated_text"]
    # Drop the instruction part of the prompt but keep the story opener.
    story_text = story_text.replace(prompt, "Once upon a time, ")
    return _trim_to_sentence_end(story_text)


def text2audio(story_text: str) -> dict:
    """Synthesize speech for ``story_text`` with SpeechT5.

    Returns the pipeline's output dict: ``{"audio": np.ndarray,
    "sampling_rate": int}``.
    """
    synthesizer = _load_pipeline("text-to-speech", "microsoft/speecht5_tts")

    # SpeechT5 requires a (1, 512) x-vector speaker embedding. The previous
    # code read `processor.speaker_embeddings["female"]`, but SpeechT5Processor
    # has no such attribute, so synthesis failed every time. A fixed random
    # embedding gives a consistent (if generic) voice.
    # TODO(review): for a natural voice, load a real x-vector, e.g. from the
    # "Matthijs/cmu-arctic-xvectors" dataset.
    rng = torch.Generator().manual_seed(0)
    speaker_embeddings = torch.randn((1, 512), generator=rng)

    # Truncate long stories, preferably at a sentence boundary, to keep
    # synthesis fast and avoid model input-length issues.
    max_chars = 500
    if len(story_text) > max_chars:
        last_period = story_text[:max_chars].rfind(".")
        if last_period > 0:
            story_text = story_text[:last_period + 1]
        else:
            story_text = story_text[:max_chars]

    # The text-to-speech pipeline takes the raw string and handles
    # tokenization and the HiFi-GAN vocoder internally; speaker embeddings
    # are passed through forward_params.
    return synthesizer(
        story_text,
        forward_params={"speaker_embeddings": speaker_embeddings},
    )


# ---- Streamlit interface -------------------------------------------------
st.title("Image to Audio Story")

uploaded_file = st.file_uploader("Upload an image")

if uploaded_file is not None:
    # Display image
    st.image(uploaded_file, caption="Uploaded Image")
    image = Image.open(uploaded_file)

    # Image to Text
    with st.spinner("Generating caption..."):
        caption = img2text(image)
    st.write(f"Caption: {caption}")

    # Text to Story
    with st.spinner("Creating story..."):
        story = text2story(caption)
    st.write(f"Story: {story}")

    # Text to Audio
    with st.spinner("Generating audio..."):
        try:
            speech_output = text2audio(story)
            st.audio(
                speech_output["audio"],
                sample_rate=speech_output["sampling_rate"],
            )
        except Exception as e:
            st.error(f"Error generating or playing audio: {e}")
            st.write("Try installing the latest transformers library with: pip install --upgrade transformers")