TLH01 committed on
Commit
bed9467
·
verified ·
1 Parent(s): cf274d2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +70 -116
app.py CHANGED
@@ -1,145 +1,99 @@
 
1
  import streamlit as st
2
  from PIL import Image
3
- import requests
4
  from transformers import BlipProcessor, BlipForConditionalGeneration
5
  from transformers import GPT2Tokenizer, GPT2LMHeadModel
6
  import torch
 
7
  import io
8
- import soundfile as sf
9
- from speechbrain.pretrained import Tacotron2
10
- from speechbrain.pretrained import HIFIGAN
11
 
12
- # Stage 1: Image to Keyword/Caption
13
- def image_to_keyword(uploaded_image):
 
 
 
 
 
14
  try:
15
- # Load model
16
- processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
17
- model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
18
-
19
- # Process image
20
- raw_image = Image.open(uploaded_image).convert('RGB')
21
- inputs = processor(raw_image, return_tensors="pt")
22
-
23
- # Generate caption
24
- out = model.generate(**inputs)
25
- caption = processor.decode(out[0], skip_special_tokens=True)
26
-
27
- return caption
28
- except Exception as e:
29
- st.error(f"Error in image captioning: {str(e)}")
30
- return None
31
 
32
- # Stage 2: Keyword to Story
33
- def keyword_to_story(keyword):
 
 
 
 
 
 
 
 
 
 
 
34
  try:
35
- # Load model
36
- tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
37
- model = GPT2LMHeadModel.from_pretrained("gpt2")
38
-
39
- # Create prompt
40
- prompt = f"Write a short story between 50-100 words based on: {keyword}\n\nStory:"
41
-
42
- # Generate story
43
  inputs = tokenizer(prompt, return_tensors="pt")
44
  outputs = model.generate(
45
  inputs.input_ids,
46
- max_length=200,
47
  num_return_sequences=1,
48
  no_repeat_ngram_size=2,
49
  early_stopping=True
50
  )
51
-
52
  story = tokenizer.decode(outputs[0], skip_special_tokens=True)
53
-
54
- # Clean up the story (remove prompt if it appears)
55
- story = story.replace(prompt, "").strip()
56
-
57
- # Ensure story length is between 50-100 words
58
- words = story.split()
59
- if len(words) > 100:
60
- story = " ".join(words[:100])
61
- elif len(words) < 50:
62
- # If too short, try again with higher temperature
63
- outputs = model.generate(
64
- inputs.input_ids,
65
- max_length=200,
66
- num_return_sequences=1,
67
- no_repeat_ngram_size=2,
68
- do_sample=True,
69
- temperature=0.9,
70
- early_stopping=True
71
- )
72
- story = tokenizer.decode(outputs[0], skip_special_tokens=True)
73
- story = story.replace(prompt, "").strip()
74
-
75
- return story
76
- except Exception as e:
77
- st.error(f"Error in story generation: {str(e)}")
78
- return None
79
 
80
- # Stage 3: Story to Audio
81
- def story_to_audio(story_text):
 
 
82
  try:
83
- # Initialize TTS
84
- tacotron2 = Tacotron2.from_hparams(source="speechbrain/tts-tacotron2-ljspeech", savedir="tmp_tts")
85
- hifi_gan = HIFIGAN.from_hparams(source="speechbrain/tts-hifigan-ljspeech", savedir="tmp_vocoder")
86
-
87
- # Generate mel spectrogram and waveform
88
- mel_output, mel_length, alignment = tacotron2.encode_text(story_text)
89
- waveforms = hifi_gan.decode_batch(mel_output)
90
-
91
- # Convert to bytes
92
- audio_bytes = io.BytesIO()
93
- sf.write(audio_bytes, waveforms.squeeze(1).cpu().numpy(), 22050, format='WAV')
94
- audio_bytes.seek(0)
95
-
96
- return audio_bytes
97
- except Exception as e:
98
- st.error(f"Error in audio generation: {str(e)}")
99
- return None
100
 
101
- # Main App Function
 
 
102
  def main():
103
- st.title("Image to Story Generator")
104
- st.write("Upload an image to generate a story and audio narration")
105
 
106
- # File uploader
107
- uploaded_file = st.file_uploader("Choose an image...", type=["jpg", "jpeg", "png"])
108
 
109
- if uploaded_file is not None:
110
- # Display image
111
- image = Image.open(uploaded_file)
112
- st.image(image, caption='Uploaded Image', use_column_width=True)
 
 
 
113
 
114
- # Stage 1: Image to Keyword
115
- st.write("Generating caption from image...")
116
- caption = image_to_keyword(uploaded_file)
117
 
118
- if caption:
119
- st.success(f"Generated Caption: {caption}")
120
-
121
- # Stage 2: Keyword to Story
122
- st.write("Generating story from caption...")
123
- story = keyword_to_story(caption)
124
-
125
- if story:
126
- st.subheader("Generated Story")
127
- st.write(story)
128
-
129
- # Stage 3: Story to Audio
130
- st.write("Converting story to audio...")
131
- audio_bytes = story_to_audio(story)
132
-
133
- if audio_bytes:
134
- st.audio(audio_bytes, format='audio/wav')
135
-
136
- # Download button for audio
137
- st.download_button(
138
- label="Download Audio",
139
- data=audio_bytes,
140
- file_name="generated_story.wav",
141
- mime="audio/wav"
142
- )
143
 
144
  if __name__ == "__main__":
145
  main()
 
1
+ # app.py
2
  import streamlit as st
3
  from PIL import Image
 
4
  from transformers import BlipProcessor, BlipForConditionalGeneration
5
  from transformers import GPT2Tokenizer, GPT2LMHeadModel
6
  import torch
7
+ from gtts import gTTS
8
  import io
 
 
 
9
 
10
# ======================
# Stage 1: Image Captioning
# ======================
@st.cache_resource
def _load_caption_model():
    """Load and cache the BLIP captioning model once per session.

    Without caching, both processor and model were re-downloaded/re-built on
    every uploaded image, dominating the request latency.
    """
    processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
    model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
    return processor, model


def image_to_caption(uploaded_image):
    """Generate a short English caption for an uploaded image.

    Args:
        uploaded_image: File-like object (e.g. Streamlit UploadedFile)
            readable by PIL.

    Returns:
        str: BLIP-generated caption, or a fixed fallback caption if
        anything in the pipeline fails (keeps the app flow alive).
    """
    processor, model = _load_caption_model()
    try:
        img = Image.open(uploaded_image).convert("RGB")
        # padding/truncation/max_length are text-tokenizer options; for an
        # image-only call they do nothing useful, so pass only the image.
        inputs = processor(images=img, return_tensors="pt")
        with torch.no_grad():  # inference only — skip autograd bookkeeping
            outputs = model.generate(**inputs)
        return processor.decode(outputs[0], skip_special_tokens=True)
    except Exception as e:
        # Surface the failure instead of swallowing it with a bare except,
        # but keep the original best-effort fallback caption.
        st.warning(f"Captioning failed ({e}); using fallback caption.")
        return "a happy scene with children"  # Fallback caption
 
 
 
 
30
 
31
# ======================
# Stage 2: Story Generation
# ======================
@st.cache_resource
def _load_story_model():
    """Load and cache the GPT-2 tokenizer and model once per session."""
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    model = GPT2LMHeadModel.from_pretrained("gpt2")
    return tokenizer, model


def generate_story(caption):
    """Generate a short children's story about *caption* with GPT-2.

    Args:
        caption: Image caption used as the story topic.

    Returns:
        str: Generated story, trimmed to at most 500 characters, or a
        canned fallback story if generation fails.
    """
    tokenizer, model = _load_story_model()

    prompt = f"""Create a children's story (3-6 years old) about {caption} with:
1. Friendly animals
2. Happy ending
3. 50-100 words
Story:"""

    try:
        inputs = tokenizer(prompt, return_tensors="pt")
        with torch.no_grad():
            outputs = model.generate(
                inputs.input_ids,
                # max_new_tokens bounds only the generated text; the old
                # max_length=300 counted the prompt too, so a long caption
                # silently shrank the story budget.
                max_new_tokens=200,
                num_return_sequences=1,
                no_repeat_ngram_size=2,
                # GPT-2 has no pad token; without this, generate() warns and
                # falls back anyway — make the choice explicit.
                pad_token_id=tokenizer.eos_token_id,
            )
        # Decode only the newly generated tokens. String-replacing the prompt
        # breaks whenever decoding re-spaces the prompt text even slightly.
        new_tokens = outputs[0][inputs.input_ids.shape[1]:]
        story = tokenizer.decode(new_tokens, skip_special_tokens=True)
        return story.strip()[:500]  # Length control
    except Exception as e:
        st.warning(f"Story generation failed ({e}); using fallback story.")
        return """Once upon a time, there was a friendly bear who loved playing with children.
They had wonderful adventures every day, always ending with big hugs and happy smiles!"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
 
59
# ======================
# Stage 3: Text-to-Speech
# ======================
def create_audio(story_text):
    """Convert story text to spoken MP3 audio via Google TTS.

    Args:
        story_text: Story to narrate; only the first 500 characters are
            synthesized to keep the request small.

    Returns:
        io.BytesIO | None: MP3 buffer rewound to position 0, or None when
        synthesis fails (the caller skips the audio widgets on None).
    """
    try:
        tts = gTTS(text=story_text[:500], lang='en', slow=False)
        audio_buffer = io.BytesIO()
        tts.write_to_fp(audio_buffer)
        audio_buffer.seek(0)  # rewind so readers see the whole payload
        return audio_buffer
    except Exception as e:
        # Surface the failure instead of the original fully-silent bare
        # except; the None contract for callers is unchanged.
        st.warning(f"Audio generation failed: {e}")
        return None
 
 
 
 
 
 
 
 
 
 
71
 
72
# ======================
# Main Application
# ======================
def main():
    """Streamlit entry point: image upload -> caption -> story -> audio."""
    st.title("🎈 Children's Story Maker")

    # Accept .jpeg too — the old list took .jpg but rejected the identical
    # .jpeg extension.
    uploaded_file = st.file_uploader("Upload a child's photo", type=["jpg", "jpeg", "png"])

    if uploaded_file:
        img = Image.open(uploaded_file)
        st.image(img, use_column_width=True)

        # Processing pipeline — spinners give feedback during slow model calls.
        with st.spinner("Looking at the picture..."):
            caption = image_to_caption(uploaded_file)
        with st.spinner("Writing the story..."):
            story = generate_story(caption)

        st.subheader("Generated Story")
        st.write(story)

        if audio_data := create_audio(story):
            # Materialize the bytes once: handing the same BytesIO to both
            # widgets risks the second reader seeing a stream at EOF.
            audio_bytes = audio_data.getvalue()
            st.audio(audio_bytes, format="audio/mp3")
            st.download_button("Download Audio",
                               data=audio_bytes,
                               file_name="story.mp3",
                               mime="audio/mp3")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
 
# Script entry point: launch the Streamlit app when run directly.
if __name__ == "__main__":
    main()