import streamlit as st
from PIL import Image
from transformers import pipeline


def generate_caption(image_file):
    """Return a one-sentence description of the picture in *image_file*.

    Args:
        image_file: Anything ``PIL.Image.open`` accepts (a path or a binary
            file-like object such as a Streamlit upload).

    Returns:
        The ``generated_text`` of the first result produced by the
        ``Salesforce/blip-image-captioning-base`` image-to-text pipeline.
    """
    image = Image.open(image_file)
    # NOTE(review): the pipeline is rebuilt on every call; consider caching
    # it if the installed Streamlit version offers a resource cache.
    caption_generator = pipeline(
        "image-to-text", model="Salesforce/blip-image-captioning-base"
    )
    caption_results = caption_generator(image)
    return caption_results[0]["generated_text"]


def generate_story(caption):
    """Generate a short children's story from an image caption with GPT-2.

    The prompt itself is excluded from the returned text, so the result
    contains only what the model wrote. If the first attempt has fewer than
    100 words, a second sample is generated from the same prompt and appended.

    Args:
        caption: Text describing the picture; interpolated into the prompt.

    Returns:
        The generated story text (may be empty if the model produced nothing).
    """
    min_words = 100
    story_generator = pipeline("text-generation", model="gpt2")
    prompt = f"Based on the following image caption: '{caption}', generate a complete fairy tale story for children with at least 100 words. "

    def _sample(max_length):
        # return_full_text=False keeps the instruction prompt out of the
        # story; by default the pipeline returns prompt + continuation, which
        # would be shown to the user and read aloud.
        result = story_generator(
            prompt,
            max_length=max_length,
            num_return_sequences=1,
            return_full_text=False,
        )
        return result[0]["generated_text"].strip()

    story = _sample(300)
    if len(story.split()) < min_words:
        additional = _sample(350)
        # Join only non-empty parts so an empty first attempt does not leave
        # a stray leading space.
        story = " ".join(part for part in (story, additional) if part)
    return story


def text_to_speech(text, output_file="output.mp3"):
    """Synthesize English speech for *text* and save it as an MP3 file.

    Args:
        text: The text to read aloud.
        output_file: Destination path for the audio file.

    Returns:
        The path that was written (``output_file``).
    """
    # Imported lazily so the rest of the app loads without gTTS installed.
    from gtts import gTTS

    tts = gTTS(text=text, lang="en")
    tts.save(output_file)
    return output_file


def main():
    """Run the Streamlit app: upload a picture, caption it, tell a story, read it."""
    st.title("CREATE YOUR STORY FOR CHILDREN!")
    st.write("Upload a picture. We create a story and read it for you.")

    uploaded_file = st.file_uploader(
        "Choose a picture:", type=["png", "jpg", "jpeg"]
    )
    if uploaded_file is None:
        return

    image = Image.open(uploaded_file)
    st.image(image, caption="Picture you choose", use_column_width=True)

    with st.spinner("Generating..."):
        caption = generate_caption(uploaded_file)
    st.write("Picture description:", caption)

    with st.spinner("Generating..."):
        story = generate_story(caption)

    if not story.strip():
        # Text-to-speech cannot synthesize empty text, so stop here.
        st.warning("No story could be generated for this picture. Please try again.")
        return

    st.write("Your story:")
    st.write(story)

    # Text-to-speech
    with st.spinner("Ready to read..."):
        audio_file = text_to_speech(story)
    st.audio(audio_file, format="audio/mp3")


if __name__ == "__main__":
    main()