# Streamlit app: turn an uploaded image into a caption, a short story, and narrated audio.
| # import part | |
| import streamlit as st | |
| from transformers import pipeline | |
| import os | |
| import numpy as np | |
| import io | |
| import scipy.io.wavfile as wavfile | |
| # function part | |
| # img2text | |
def img2text(image_path):
    """Caption the image stored at *image_path* with a BLIP captioning model."""
    captioner = pipeline("image-to-text", model="sooh-j/blip-image-captioning-base")
    return captioner(image_path)[0]["generated_text"]
| # text2story | |
def text2story(text):
    """Generate a short children's story from *text* (an image caption).

    Returns the story text, starting with "Once upon a time, " and capped
    at 100 words so the downstream audio stage stays short.
    """
    # A small instruction-tuned model keeps generation feasible on CPU.
    generator = pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")

    # Prompt the model and seed the classic opening.
    prompt = f"Write a fun children's story based on this: {text}. Once upon a time, "

    # NOTE: max_new_tokens bounds only the *generated* continuation. The
    # original max_length=150 also counted the prompt tokens, so a long
    # caption left almost no room for the story itself.
    story_result = generator(
        prompt,
        max_new_tokens=150,
        num_return_sequences=1,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
        do_sample=True
    )

    # Drop the echoed prompt but keep the "Once upon a time, " opening.
    story_text = story_result[0]['generated_text']
    story_text = story_text.replace(prompt, "Once upon a time, ")

    # Cap the story at 100 words (the old comment claimed "at least 100
    # words", but the code has always truncated — the cap is the intent).
    words = story_text.split()
    if len(words) > 100:
        story_text = " ".join(words[:100])
    return story_text
| # text2audio - REVISED to use facebook/mms-tts-eng model | |
def text2audio(story_text):
    """Synthesize *story_text* to speech with the facebook/mms-tts-eng model.

    The text is split into ~200-character chunks on word boundaries (to avoid
    synthesizer timeouts on long stories), the chunk waveforms are concatenated,
    and the result is serialized to an in-memory WAV file.

    Returns a dict with "audio" (WAV bytes) and "sampling_rate", or None when
    synthesis fails and no fallback recording is available.
    """
    try:
        synthesizer = pipeline("text-to-speech", model="facebook/mms-tts-eng")

        # Split into word-aligned chunks. Advance by the *actual* chunk end:
        # the original advanced by a fixed max_chunk_size even after trimming
        # a chunk back to its last space, silently dropping the trimmed words.
        max_chunk_size = 200  # characters
        chunks = []
        pos = 0
        text_len = len(story_text)
        while pos < text_len:
            end = pos + max_chunk_size
            if end < text_len and story_text[end] != ' ':
                # Pull the boundary back to a word break when possible.
                last_space = story_text.rfind(' ', pos, end)
                if last_space > pos:
                    end = last_space
            chunks.append(story_text[pos:end])
            pos = end

        # Synthesize each non-empty chunk; remember the model's sample rate.
        audio_arrays = []
        sampling_rate = None
        for chunk in chunks:
            if not chunk.strip():  # skip whitespace-only chunks
                continue
            speech = synthesizer(chunk)
            if sampling_rate is None:
                sampling_rate = speech["sampling_rate"]
            audio_arrays.append(speech["audio"])

        # Whitespace-only input produces no chunks; np.concatenate would
        # raise a confusing error on an empty list, so fail explicitly.
        if not audio_arrays:
            raise ValueError("no audio could be generated from the input text")

        combined_audio = np.concatenate(audio_arrays)

        # Serialize to an in-memory WAV file.
        wav_buffer = io.BytesIO()
        wavfile.write(wav_buffer, sampling_rate, combined_audio)
        wav_buffer.seek(0)  # rewind the buffer
        return {
            "audio": wav_buffer.getvalue(),
            "sampling_rate": sampling_rate
        }
    except Exception as e:
        st.error(f"Error generating audio: {str(e)}")
        # Fall back to a pre-recorded audio file if one is available.
        try:
            with open("fallback_audio.wav", "rb") as f:
                return {
                    "audio": f.read(),
                    "sampling_rate": 22050  # common sample rate
                }
        except OSError:  # was a bare except: that also hid KeyboardInterrupt
            return None
| # Function to save temporary image file | |
def save_uploaded_image(uploaded_file):
    """Write the Streamlit upload to temp/<name> and return the file path.

    *uploaded_file* must expose ``name`` and ``getvalue()`` (as Streamlit's
    UploadedFile does).
    """
    # exist_ok avoids the check-then-create race of the original
    # os.path.exists() / os.makedirs() pair.
    os.makedirs("temp", exist_ok=True)
    image_path = os.path.join("temp", uploaded_file.name)
    with open(image_path, "wb") as f:
        f.write(uploaded_file.getvalue())
    return image_path
# main part
st.set_page_config(page_title="Your Image to Audio Story", page_icon="🦜")
st.header("Turn Your Image to Audio Story")

uploaded_file = st.file_uploader("Select an Image...")

if uploaded_file is not None:
    # Display the uploaded image.
    st.image(uploaded_file, caption="Uploaded Image", use_container_width=True)

    # Persist the upload so the captioning pipeline can read it from disk.
    image_path = save_uploaded_image(uploaded_file)

    # Stage 1: image -> caption.
    st.text('Processing img2text...')
    caption = img2text(image_path)
    st.write(caption)

    # Stage 2: caption -> story.
    st.text('Generating a story...')
    story = text2story(caption)
    st.write(story)

    # Stage 3: story -> audio bytes.
    st.text('Generating audio data...')
    audio_data = text2audio(story)

    # Play button. NOTE(review): Streamlit reruns the whole script on every
    # click, so the pipelines above execute again before the audio plays —
    # caching (st.cache_data/st.session_state) would avoid this; left as-is
    # to preserve behavior.
    if st.button("Play Audio"):
        if audio_data:
            st.audio(
                audio_data["audio"],
                format="audio/wav",
                start_time=0,
                sample_rate=audio_data["sampling_rate"]
            )
        else:
            st.error("Failed to generate audio. Please try again.")

    # Best-effort cleanup of the temporary image file.
    try:
        os.remove(image_path)
    except OSError:  # was a bare except: — only file-system errors are expected
        pass