# import part
import os
import tempfile

import numpy as np
import scipy.io.wavfile  # FIX: was used below but never imported -> NameError
import streamlit as st
from transformers import pipeline


# function part
# img2text
def img2text(image_path):
    """Caption the image at *image_path* with the BLIP captioning model.

    Returns the generated caption string.
    """
    image_to_text = pipeline("image-to-text", model="sooh-j/blip-image-captioning-base")
    text = image_to_text(image_path)[0]["generated_text"]
    return text


# text2story
def text2story(text):
    """Expand a short caption into a children's story (<= 100 words).

    Returns the story text, always beginning with "Once upon a time, ".
    """
    # Using a smaller text generation model
    generator = pipeline("text-generation", model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")

    # Create a prompt for the story generation
    prompt = f"Write a fun children's story based on this: {text}. Once upon a time, "

    # Generate the story
    story_result = generator(
        prompt,
        max_length=150,
        num_return_sequences=1,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
        do_sample=True,
    )

    # Extract the generated text; strip the instruction part of the prompt
    # but keep the "Once upon a time, " opener.
    story_text = story_result[0]['generated_text']
    story_text = story_text.replace(prompt, "Once upon a time, ")

    # Cap the story at 100 words (the original comment claimed "at least",
    # but the code truncates — this is the intended behavior).
    words = story_text.split()
    if len(words) > 100:
        # Simply truncate to 100 words
        story_text = " ".join(words[:100])

    return story_text


# text2audio - handles the pipeline's audio/sampling_rate output fields
def text2audio(story_text):
    """Synthesize *story_text* to a temporary WAV file.

    Returns the path of the WAV file on success, or None on failure
    (errors are surfaced in the Streamlit UI rather than raised).
    """
    try:
        # Use the MeloTTS model which has better audio quality
        synthesizer = pipeline("text-to-speech", model="myshell-ai/MeloTTS-English")

        # Limit text length to avoid timeouts; prefer cutting at a sentence end
        max_chars = 500
        if len(story_text) > max_chars:
            last_period = story_text[:max_chars].rfind('.')
            if last_period > 0:
                story_text = story_text[:last_period + 1]
            else:
                story_text = story_text[:max_chars]

        # Generate speech
        speech = synthesizer(story_text)

        # Create a temporary WAV file (delete=False so Streamlit can play it later)
        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
        temp_filename = temp_file.name
        temp_file.close()

        # Debug: Show what keys are available in the speech output
        st.write(f"Speech output keys: {list(speech.keys())}")

        # Write the audio data to the temporary file - MeloTTS should have audio and sampling_rate
        if 'audio' in speech and 'sampling_rate' in speech:
            # Convert numpy array to WAV file
            scipy.io.wavfile.write(
                temp_filename,
                speech['sampling_rate'],
                speech['audio'].astype(np.float32),
            )
            st.write("Audio successfully written to file")
        else:
            raise ValueError(f"Expected 'audio' and 'sampling_rate' in output, but got: {list(speech.keys())}")

        return temp_filename

    except Exception as e:
        # Best-effort: report the failure in the UI and signal it with None
        st.error(f"Error generating audio: {str(e)}")
        import traceback
        st.error(traceback.format_exc())
        return None


# Function to save temporary image file
def save_uploaded_image(uploaded_file):
    """Persist a Streamlit upload under ./temp and return its file path."""
    if not os.path.exists("temp"):
        os.makedirs("temp")
    image_path = os.path.join("temp", uploaded_file.name)
    with open(image_path, "wb") as f:
        f.write(uploaded_file.getvalue())
    return image_path


# main part
st.set_page_config(page_title="Your Image to Audio Story", page_icon="🦜")
st.header("Turn Your Image to Audio Story")
uploaded_file = st.file_uploader("Select an Image...")

if uploaded_file is not None:
    # Display the uploaded image
    st.image(uploaded_file, caption="Uploaded Image", use_container_width=True)

    # Save the image temporarily
    image_path = save_uploaded_image(uploaded_file)

    # Stage 1: Image to Text
    st.text('Processing img2text...')
    caption = img2text(image_path)
    st.write(caption)

    # Stage 2: Text to Story
    st.text('Generating a story...')
    story = text2story(caption)
    st.write(story)

    # Stage 3: Story to Audio data
    st.text('Generating audio data...')
    audio_file = text2audio(story)

    # Play button
    if st.button("Play Audio"):
        if audio_file and os.path.exists(audio_file):
            # Play the audio file
            st.audio(audio_file)
        else:
            st.error("Audio generation failed. Please try again.")

    # Clean up the temporary files
    try:
        os.remove(image_path)
        # Don't delete audio file immediately as it might still be playing
    except OSError:
        # FIX: was a bare `except:`; only file-system errors are expected here
        pass