TLH01 committed on
Commit
146cc47
·
verified ·
1 Parent(s): 23456e5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +136 -45
app.py CHANGED
@@ -1,54 +1,145 @@
1
  import streamlit as st
2
  from PIL import Image
3
- from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
 
 
4
  import torch
5
- from TTS.api import TTS
 
 
 
6
 
7
- # Set page configuration
8
- st.set_page_config(page_title="Children's Image Storytelling", layout="wide")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
- # Load models
11
- @st.cache_resource
12
- def load_models():
13
- # Load image captioning model
14
- vision_model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
15
- processor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
16
- tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
17
- # Load text-to-speech model
18
- tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False, gpu=torch.cuda.is_available())
19
- return vision_model, processor, tokenizer, tts
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
- # Main function
22
- def main():
23
- # Display title
24
- st.title("🧒📖 AI Image Storytelling")
25
- st.write("Upload an image, and let AI generate a story for children aged 3–10 with voice narration.")
26
-
27
- # Upload image
28
- uploaded_file = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])
29
-
30
- if uploaded_file:
31
- image = Image.open(uploaded_file).convert("RGB")
32
- st.image(image, caption="Uploaded Image", use_column_width=True)
33
-
34
- if st.button("Generate Story"):
35
- vision_model, processor, tokenizer, tts_model = load_models()
 
 
 
 
 
36
 
37
- with st.spinner("Generating description..."):
38
- pixel_values = processor(images=image, return_tensors="pt").pixel_values
39
- output_ids = vision_model.generate(pixel_values, max_length=50, num_beams=4)
40
- caption = tokenizer.decode(output_ids[0], skip_special_tokens=True)
41
- st.success("Image Description: " + caption)
42
-
43
- with st.spinner("Generating story..."):
44
- story_prompt = f"Based on the following description, tell me a short children's story: {caption}"
45
- story = caption + " Once upon a time, " + caption.lower() + " entered a magical forest and met many new friends."
46
- st.success("Story: " + story)
47
-
48
- with st.spinner("Generating voice..."):
49
- tts_model.tts_to_file(text=story, file_path="story.wav")
50
- st.audio("story.wav", format="audio/wav")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
 
52
- # Run the main program
53
  if __name__ == "__main__":
54
- main()
 
1
  import streamlit as st
2
  from PIL import Image
3
+ import requests
4
+ from transformers import BlipProcessor, BlipForConditionalGeneration
5
+ from transformers import GPT2Tokenizer, GPT2LMHeadModel
6
  import torch
7
+ import io
8
+ import soundfile as sf
9
+ from speechbrain.pretrained import Tacotron2
10
+ from speechbrain.pretrained import HIFIGAN
11
 
12
# Stage 1: Image to Keyword/Caption
@st.cache_resource
def _load_caption_models():
    # Cached so the BLIP weights are downloaded/loaded once per session,
    # not on every button click / script rerun.
    processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
    model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
    return processor, model


def image_to_keyword(uploaded_image):
    """Generate a short caption for *uploaded_image* with BLIP.

    Args:
        uploaded_image: a file-like object readable by PIL (e.g. a
            Streamlit ``UploadedFile``).

    Returns:
        The decoded caption string, or ``None`` on failure (the error is
        surfaced to the user via ``st.error``).
    """
    try:
        processor, model = _load_caption_models()

        # Force RGB so grayscale/RGBA uploads don't break the processor.
        raw_image = Image.open(uploaded_image).convert('RGB')
        inputs = processor(raw_image, return_tensors="pt")

        # Generate and decode the caption.
        out = model.generate(**inputs)
        caption = processor.decode(out[0], skip_special_tokens=True)

        return caption
    except Exception as e:
        # UI boundary: report the error instead of crashing the app run.
        st.error(f"Error in image captioning: {str(e)}")
        return None
31
 
32
# Stage 2: Keyword to Story
@st.cache_resource
def _load_story_models():
    # Cached so GPT-2 weights are loaded once per session.
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    model = GPT2LMHeadModel.from_pretrained("gpt2")
    # GPT-2 ships without a pad token; reuse EOS so generate() can pad
    # without emitting "pad_token_id not set" warnings.
    tokenizer.pad_token = tokenizer.eos_token
    return tokenizer, model


def keyword_to_story(keyword):
    """Generate a ~50-100 word story from *keyword* using GPT-2.

    Args:
        keyword: caption/keyword string to base the story on.

    Returns:
        The generated story text (clipped to at most 100 words), or
        ``None`` on failure (error shown via ``st.error``).
    """
    try:
        tokenizer, model = _load_story_models()

        # Create prompt
        prompt = f"Write a short story between 50-100 words based on: {keyword}\n\nStory:"
        inputs = tokenizer(prompt, return_tensors="pt")

        # max_new_tokens (not max_length) so the token budget is spent on
        # the story rather than shared with the prompt; passing the
        # attention mask and pad_token_id silences HF warnings.
        # (early_stopping was dropped: it only applies to beam search.)
        outputs = model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_new_tokens=150,
            num_return_sequences=1,
            no_repeat_ngram_size=2,
            pad_token_id=tokenizer.eos_token_id,
        )

        story = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Clean up the story (remove prompt if it appears)
        story = story.replace(prompt, "").strip()

        # Keep the story length between roughly 50 and 100 words.
        words = story.split()
        if len(words) > 100:
            story = " ".join(words[:100])
        elif len(words) < 50:
            # Too short: resample once with temperature for a longer draft.
            outputs = model.generate(
                inputs.input_ids,
                attention_mask=inputs.attention_mask,
                max_new_tokens=150,
                num_return_sequences=1,
                no_repeat_ngram_size=2,
                do_sample=True,
                temperature=0.9,
                pad_token_id=tokenizer.eos_token_id,
            )
            story = tokenizer.decode(outputs[0], skip_special_tokens=True)
            story = story.replace(prompt, "").strip()

        return story
    except Exception as e:
        st.error(f"Error in story generation: {str(e)}")
        return None
79
 
80
# Stage 3: Story to Audio
@st.cache_resource
def _load_tts_models():
    # Cached so the Tacotron2 + HiFi-GAN checkpoints are fetched once
    # per session instead of on every synthesis request.
    tacotron2 = Tacotron2.from_hparams(source="speechbrain/tts-tacotron2-ljspeech", savedir="tmp_tts")
    hifi_gan = HIFIGAN.from_hparams(source="speechbrain/tts-hifigan-ljspeech", savedir="tmp_vocoder")
    return tacotron2, hifi_gan


def story_to_audio(story_text):
    """Synthesize *story_text* to WAV audio with Tacotron2 + HiFi-GAN.

    Args:
        story_text: the story string to narrate.

    Returns:
        An ``io.BytesIO`` positioned at 0 containing WAV data, or
        ``None`` on failure (error shown via ``st.error``).
    """
    try:
        tacotron2, hifi_gan = _load_tts_models()

        # Generate mel spectrogram, then vocode to a waveform batch.
        mel_output, mel_length, alignment = tacotron2.encode_text(story_text)
        waveforms = hifi_gan.decode_batch(mel_output)

        # Flatten to a 1-D sample array: soundfile treats a 2-D array as
        # (frames, channels), so a (1, N) batch would be written as one
        # N-channel frame. 22050 Hz matches the LJSpeech-trained models.
        samples = waveforms.squeeze().cpu().numpy()
        audio_bytes = io.BytesIO()
        sf.write(audio_bytes, samples, 22050, format='WAV')
        audio_bytes.seek(0)

        return audio_bytes
    except Exception as e:
        st.error(f"Error in audio generation: {str(e)}")
        return None
100
 
101
# Main App Function
def main():
    """Streamlit entry point: upload image -> caption -> story -> audio.

    Each stage only runs if the previous one succeeded; stage helpers
    report their own errors via st.error, so failures simply stop the
    pipeline here (guard-clause returns).
    """
    st.title("Image to Story Generator")
    st.write("Upload an image to generate a story and audio narration")

    # File uploader
    uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])
    if uploaded_file is None:
        return  # nothing to do until the user uploads an image

    # Display image (use_container_width replaces the deprecated
    # use_column_width flag).
    image = Image.open(uploaded_file)
    st.image(image, caption='Uploaded Image', use_container_width=True)

    # Stage 1: Image to Keyword.
    # Image.open above consumed the UploadedFile stream; rewind it so the
    # captioner's own Image.open can re-read the bytes.
    uploaded_file.seek(0)
    st.write("Generating caption from image...")
    caption = image_to_keyword(uploaded_file)
    if not caption:
        return  # error already shown by image_to_keyword

    st.success(f"Generated Caption: {caption}")

    # Stage 2: Keyword to Story
    st.write("Generating story from caption...")
    story = keyword_to_story(caption)
    if not story:
        return

    st.subheader("Generated Story")
    st.write(story)

    # Stage 3: Story to Audio
    st.write("Converting story to audio...")
    audio_bytes = story_to_audio(story)
    if not audio_bytes:
        return

    st.audio(audio_bytes, format='audio/wav')

    # Download button for audio
    st.download_button(
        label="Download Audio",
        data=audio_bytes,
        file_name="generated_story.wav",
        mime="audio/wav",
    )
143
 
 
144
# Script entry point: run the Streamlit app when executed directly.
if __name__ == "__main__":
    main()