"""Streamlit app that turns an uploaded image into a narrated short story.

The app runs three Hugging Face pipelines in sequence: image captioning,
story generation from the caption, and text-to-speech on the story.
"""

# import part
import os

import streamlit as st
from transformers import pipeline

# Directory (relative to the working directory) for the temporary image copy.
_TEMP_DIR = "temp"
# Upper bound on the number of words kept from the generated story.
_MAX_STORY_WORDS = 100
# st.session_state slot holding the results computed for the current upload.
_RESULTS_KEY = "story_results"


# function part
@st.cache_resource
def _load_pipeline(task: str, model: str):
    """Build a Hugging Face pipeline once and reuse it across script reruns.

    Streamlit re-executes the script on every widget interaction; without
    caching, each rerun would construct every model pipeline again.
    """
    return pipeline(task, model=model)


# img2text
def img2text(image_path: str) -> str:
    """Return the caption generated for the image stored at ``image_path``."""
    image_to_text = _load_pipeline(
        "image-to-text", "sooh-j/blip-image-captioning-base"
    )
    return image_to_text(image_path)[0]["generated_text"]


# text2story
def text2story(text: str) -> str:
    """Generate a short children's story seeded by ``text``.

    The returned story starts with "Once upon a time, " and is truncated to
    at most ``_MAX_STORY_WORDS`` whitespace-separated words.
    """
    # Using a smaller text generation model
    generator = _load_pipeline(
        "text-generation", "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
    )

    # Create a prompt for the story generation
    prompt = f"Write a fun children's story based on this: {text}. Once upon a time, "

    # Generate the story
    story_result = generator(
        prompt,
        max_length=150,
        num_return_sequences=1,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
        do_sample=True,
    )

    # Extract the generated text; the instruction part of the prompt is
    # swapped for just the story opener so only the story itself remains.
    story_text = story_result[0]["generated_text"]
    story_text = story_text.replace(prompt, "Once upon a time, ")

    # Cap the story length: keep only the first _MAX_STORY_WORDS words.
    words = story_text.split()
    if len(words) > _MAX_STORY_WORDS:
        story_text = " ".join(words[:_MAX_STORY_WORDS])

    return story_text


# text2audio
def text2audio(story_text: str) -> dict:
    """Synthesize speech for ``story_text``.

    Returns a dict with the keys ``"audio"`` and ``"sampling_rate"`` copied
    from the text-to-speech pipeline output.
    """
    # NOTE(review): confirm this espnet checkpoint actually loads through the
    # transformers text-to-speech pipeline in the deployed environment.
    tts = _load_pipeline("text-to-speech", "espnet/kan-bayashi_ljspeech_vits")
    audio_output = tts(story_text)
    return {
        "audio": audio_output["audio"],
        "sampling_rate": audio_output["sampling_rate"],
    }


# Function to save temporary image file
def save_uploaded_image(uploaded_file) -> str:
    """Write the uploaded file into ``_TEMP_DIR`` and return the saved path.

    ``uploaded_file`` must expose ``name`` and ``getvalue()``, as the object
    returned by ``st.file_uploader`` does.
    """
    os.makedirs(_TEMP_DIR, exist_ok=True)

    # The file name is supplied by the client; keep only its final component
    # so it cannot point outside the temp directory.
    safe_name = os.path.basename(uploaded_file.name) or "upload"
    image_path = os.path.join(_TEMP_DIR, safe_name)

    with open(image_path, "wb") as f:
        f.write(uploaded_file.getvalue())

    return image_path


def _remove_quietly(path: str) -> None:
    """Delete ``path`` as best-effort cleanup, ignoring filesystem errors."""
    try:
        os.remove(path)
    except OSError:
        # Cleanup must never break the page; a leftover temp file is harmless.
        pass


# main part
def main() -> None:
    """Render the page and run the image -> caption -> story -> audio flow."""
    st.set_page_config(page_title="Your Image to Audio Story", page_icon="🦜")
    st.header("Turn Your Image to Audio Story")
    uploaded_file = st.file_uploader("Select an Image...")

    if uploaded_file is None:
        return

    # Display the uploaded image
    st.image(uploaded_file, caption="Uploaded Image", use_container_width=True)

    # Clicking "Play Audio" reruns this script. Keep the results of each stage
    # in session state, keyed by the upload, so a rerun replays the audio of
    # the story already shown instead of sampling a new story.
    upload_key = (uploaded_file.name, uploaded_file.size)
    results = st.session_state.get(_RESULTS_KEY)
    if results is None or results.get("upload_key") != upload_key:
        results = {"upload_key": upload_key}
        st.session_state[_RESULTS_KEY] = results

    # Stage 1: Image to Text
    st.text('Processing img2text...')
    if "caption" not in results:
        # Save the image temporarily; it is only needed for captioning.
        image_path = save_uploaded_image(uploaded_file)
        try:
            results["caption"] = img2text(image_path)
        finally:
            # Clean up the temporary file even if captioning fails.
            _remove_quietly(image_path)
    st.write(results["caption"])

    # Stage 2: Text to Story
    st.text('Generating a story...')
    if "story" not in results:
        results["story"] = text2story(results["caption"])
    st.write(results["story"])

    # Stage 3: Story to Audio data
    st.text('Generating audio data...')
    if "audio_data" not in results:
        results["audio_data"] = text2audio(results["story"])
    audio_data = results["audio_data"]

    # Play button
    if st.button("Play Audio"):
        st.audio(
            audio_data["audio"],
            format="audio/wav",
            start_time=0,
            sample_rate=audio_data["sampling_rate"],
        )


# `streamlit run` executes the script with __name__ == "__main__".
if __name__ == "__main__":
    main()