# Streamlit app: upload an image, caption it, expand the caption into a
# short children's story, then read the story aloud with text-to-speech.
# Core dependencies: Streamlit for the UI, Hugging Face pipelines for the
# models, and PIL for image handling.
import streamlit as st
from transformers import pipeline
from PIL import Image
@st.cache_resource
def _load_captioner():
    """Load the BLIP image-captioning pipeline once and reuse it across reruns.

    Without caching, the original code re-downloaded/re-instantiated the model
    on every Streamlit rerun, which is very slow.
    """
    return pipeline("image-to-text", model="sooh-j/blip-image-captioning-base")


def img2text(image):
    """Return a one-sentence caption for *image* (a PIL.Image).

    The pipeline returns a list of dicts shaped [{"generated_text": "..."}];
    only the first candidate's text is used.
    """
    captioner = _load_captioner()
    return captioner(image)[0]["generated_text"]
@st.cache_resource
def _load_story_generator():
    """Load the TinyLlama text-generation pipeline once per session."""
    return pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")


def _trim_to_sentence_end(story_text, min_story_length=100):
    """Cut *story_text* at a natural sentence boundary.

    Prefers the third sentence-ending mark ('.', '?' or '!') that appears at or
    past *min_story_length* characters, so the result is more than a single
    sentence; otherwise falls back to the last suitable mark. Returns the text
    unchanged when no suitable boundary exists.
    """
    endings = [i for i, ch in enumerate(story_text) if ch in ".?!"]
    suitable = [i for i in endings if i >= min_story_length]
    if not suitable:
        return story_text
    cut = suitable[2] if len(suitable) > 2 else suitable[-1]
    return story_text[: cut + 1]


def text2story(text):
    """Generate a short children's story seeded by *text* (the image caption).

    Returns the generated story trimmed to a natural sentence ending.
    """
    generator = _load_story_generator()
    prompt = (
        f"Write a short children's story based on this: {text}. "
        "The story should have a clear beginning, middle, and end. "
        "Keep it under 150 words. Once upon a time, "
    )
    # max_new_tokens bounds only the continuation; the original max_length=300
    # also counted the (long) prompt's tokens and could cut the story short.
    story_result = generator(
        prompt,
        max_new_tokens=250,
        num_return_sequences=1,
        temperature=0.7,
        do_sample=True,
    )
    story_text = story_result[0]["generated_text"]
    # The pipeline echoes the prompt at the start of the output; replace it
    # with the story opener so the reader only sees the story itself.
    story_text = story_text.replace(prompt, "Once upon a time, ")
    return _trim_to_sentence_end(story_text)
def text2audio(story_text):
    """Synthesize *story_text* to speech with Microsoft SpeechT5.

    Returns the TTS pipeline output: a dict with "audio" (numpy waveform)
    and "sampling_rate" keys, which the caller passes to st.audio.
    """
    import torch  # transformers already depends on torch

    synthesizer = pipeline("text-to-speech", model="microsoft/speecht5_tts")

    # Keep the input short so it stays within the model's input limits;
    # prefer cutting at the last full sentence inside the budget.
    max_chars = 500
    if len(story_text) > max_chars:
        last_period = story_text[:max_chars].rfind('.')
        if last_period > 0:
            story_text = story_text[:last_period + 1]
        else:
            story_text = story_text[:max_chars]

    # SpeechT5 requires a 512-dim x-vector speaker embedding. A zero vector
    # yields a neutral voice; for a specific voice, load an embedding from
    # e.g. the "Matthijs/cmu-arctic-xvectors" dataset. (The original code read
    # processor.speaker_embeddings["female"], an attribute SpeechT5Processor
    # does not have, so it raised AttributeError before synthesizing anything.)
    speaker_embeddings = torch.zeros((1, 512))

    # The text-to-speech pipeline takes raw text plus forward_params; it
    # handles tokenization and vocoding internally, so no manual processor,
    # input_ids, or SpeechT5HifiGan vocoder wiring is needed.
    return synthesizer(
        story_text,
        forward_params={"speaker_embeddings": speaker_embeddings},
    )
# --- Streamlit UI: image upload -> caption -> story -> narrated audio ---
st.title("Image to Audio Story")

uploaded_file = st.file_uploader("Upload an image")

if uploaded_file is not None:
    # Show the raw upload, then decode it into a PIL image for the models.
    st.image(uploaded_file, caption="Uploaded Image")
    image = Image.open(uploaded_file)

    # Step 1: caption the image.
    with st.spinner("Generating caption..."):
        caption = img2text(image)
    st.write(f"Caption: {caption}")

    # Step 2: expand the caption into a story.
    with st.spinner("Creating story..."):
        story = text2story(caption)
    st.write(f"Story: {story}")

    # Step 3: narrate the story. TTS output shape varies between transformers
    # versions (tensor-like object vs plain dict), so probe for attributes
    # before falling back to dict access.
    with st.spinner("Generating audio..."):
        try:
            speech_output = text2audio(story)
            if hasattr(speech_output, 'numpy'):
                audio_data = speech_output.numpy()
                sample_rate = getattr(speech_output, 'sampling_rate', 16000)
                st.audio(audio_data, sample_rate=sample_rate)
            elif hasattr(speech_output, 'audio'):
                audio_data = speech_output.audio
                sample_rate = getattr(speech_output, 'sampling_rate', 16000)
                st.audio(audio_data, sample_rate=sample_rate)
            else:
                st.audio(
                    speech_output['audio'],
                    sample_rate=speech_output.get('sampling_rate', 16000),
                )
        except Exception as e:
            st.error(f"Error generating or playing audio: {e}")
            st.write("Try installing the latest transformers library with: pip install --upgrade transformers")