ayush-goud committed on
Commit
646d3bf
·
verified ·
1 Parent(s): a626be8

Upload 3 files

Browse files
Files changed (4) hide show
  1. .gitattributes +1 -0
  2. .streamlit/config.toml +2 -0
  3. app.py +199 -0
  4. images/logo.png +3 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ images/logo.png filter=lfs diff=lfs merge=lfs -text
.streamlit/config.toml ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ [theme]
2
+ base="dark"
app.py ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import requests
3
+ from PIL import Image
4
+ from transformers import BlipProcessor, BlipForConditionalGeneration, SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
5
+ import os
6
+ import torch
7
+ import soundfile as sf
8
+ from datasets import load_dataset
9
+ import matplotlib.pyplot as plt
10
+ import numpy as np
11
# NOTE(review): presumably a workaround for the "duplicate OpenMP runtime"
# abort that can occur when several native libraries each bundle their own
# OpenMP copy -- confirm it is still required on the deployment target.
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

# Model Description
# Markdown text that main() wraps in a <div class='description'> and renders
# near the top of the page.
model_description = """
This application utilizes image captioning and text-to-speech models to generate a caption for an uploaded image
and convert the caption into speech.

The image captioning model is based on [Salesforce's BLIP architecture](https://huggingface.co/Salesforce/blip-image-captioning-base), which can generate descriptive captions for images.

The text-to-speech model, based on [Microsoft's SpeechT5](https://huggingface.co/microsoft/speecht5_tts), converts the generated caption into speech with the help of a
HiFiGAN vocoder.
"""
23
+
24
+
25
@st.cache_resource
def initialize_image_captioning():
    """Load the BLIP image-captioning processor and model.

    Both objects come from the ``Salesforce/blip-image-captioning-base``
    checkpoint. The function is decorated with ``st.cache_resource`` so the
    loaded objects are cached by Streamlit rather than rebuilt on each call.

    Returns:
        A ``(processor, model)`` tuple.
    """
    checkpoint = "Salesforce/blip-image-captioning-base"
    return (
        BlipProcessor.from_pretrained(checkpoint),
        BlipForConditionalGeneration.from_pretrained(checkpoint),
    )
30
+
31
@st.cache_resource
def initialize_speech_synthesis():
    """Load everything needed to turn text into audio.

    Loads the SpeechT5 processor and text-to-speech model, the HiFiGAN
    vocoder, and one speaker x-vector taken from the
    ``Matthijs/cmu-arctic-xvectors`` validation split. The function is
    decorated with ``st.cache_resource`` so the loaded objects are cached by
    Streamlit rather than rebuilt on each call.

    Returns:
        A ``(processor, model, vocoder, speaker_embeddings)`` tuple.
    """
    tts_checkpoint = "microsoft/speecht5_tts"
    tts_processor = SpeechT5Processor.from_pretrained(tts_checkpoint)
    tts_model = SpeechT5ForTextToSpeech.from_pretrained(tts_checkpoint)
    hifigan = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

    xvectors = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    # Row 7306 is the fixed speaker voice; unsqueeze(0) adds a leading
    # dimension to the x-vector before it is handed to the model.
    voice = torch.tensor(xvectors[7306]["xvector"]).unsqueeze(0)

    return tts_processor, tts_model, hifigan, voice
39
+
40
def generate_caption(processor, model, image):
    """Produce a text caption for *image*.

    Args:
        processor: Callable that converts the image into model inputs and
            also provides ``decode`` for the generated token ids.
        model: Object exposing ``generate(**inputs)``.
        image: The image to describe.

    Returns:
        The decoded caption with special tokens stripped.
    """
    model_inputs = processor(image, return_tensors="pt")
    generated = model.generate(**model_inputs)
    # Only the first generated sequence is decoded.
    return processor.decode(generated[0], skip_special_tokens=True)
45
+
46
def generate_speech(processor, model, vocoder, speaker_embeddings, caption,
                    output_path="speech.wav", samplerate=16000):
    """Synthesize *caption* as speech and write it to a WAV file.

    Args:
        processor: Text processor called as
            ``processor(text=..., return_tensors="pt")``.
        model: Text-to-speech model exposing ``generate_speech``.
        vocoder: Vocoder forwarded to ``model.generate_speech``.
        speaker_embeddings: Speaker embedding forwarded to the model.
        caption: Text to speak.
        output_path: Destination file. Defaults to ``"speech.wav"``, the
            path the other helpers in this file read from.
        samplerate: Sample rate written to the file header. Defaults to
            16000, the value that was previously hard-coded.

    Returns:
        None. The audio is written to *output_path*.
    """
    inputs = processor(text=caption, return_tensors="pt")
    speech = model.generate_speech(
        inputs["input_ids"], speaker_embeddings, vocoder=vocoder
    )
    # detach()/cpu() make the NumPy conversion safe even if the tensor is
    # attached to an autograd graph or lives on an accelerator.
    waveform = speech.detach().cpu().numpy()
    sf.write(output_path, waveform, samplerate=samplerate)
50
+
51
def play_sound():
    """Render an audio player for the generated ``speech.wav`` file.

    The file is read fully into memory and handed to ``st.audio`` as WAV
    bytes. The handle is closed as soon as the read completes instead of
    being left open.
    """
    with open("speech.wav", "rb") as audio_file:
        audio_bytes = audio_file.read()
    st.audio(audio_bytes, format='audio/wav')
55
+
56
def visualize_speech():
    """Plot the waveform stored in ``speech.wav`` and show it in the app.

    Reads the audio with ``soundfile``, builds a time axis spanning the
    clip's duration, draws amplitude against time and renders the figure
    with ``st.pyplot``.
    """
    data, samplerate = sf.read("speech.wav")
    duration = len(data) / samplerate

    # Create time axis: one point per sample, from 0 to the clip duration.
    time = np.linspace(0., duration, len(data))

    # Plot the speech waveform
    fig, ax = plt.subplots(figsize=(10, 4))
    try:
        ax.plot(time, data)
        ax.set(xlabel="Time (s)", ylabel="Amplitude", title="Speech Waveform")

        # Display the plot using st.pyplot()
        st.pyplot(fig)
    finally:
        # pyplot keeps every figure it creates alive until it is closed;
        # without this, each call leaves another figure in memory.
        plt.close(fig)
70
+
71
def _load_image_from_url(url):
    """Download *url* and open it as an image, reporting failures in the UI.

    Returns:
        The opened PIL image, or ``None`` after an ``st.error`` message has
        been shown (network failure, non-200 status, or undecodable data).
    """
    from io import BytesIO  # local import keeps this block self-contained

    # NOTE(review): the URL is user-supplied and fetched by the server;
    # consider restricting schemes/hosts and capping the download size.
    try:
        # A timeout prevents an unresponsive host from hanging the app.
        response = requests.get(url, timeout=10)
    except requests.exceptions.RequestException as e:
        st.error(f"Error loading image from URL: {e}")
        return None

    if response.status_code != 200:
        st.error("Error loading image from URL.")
        return None

    try:
        # response.content is the fully downloaded, content-decoded body,
        # unlike the raw socket stream.
        return Image.open(BytesIO(response.content))
    except OSError:
        # PIL raises an OSError subclass when the data is not a valid image.
        st.error("Error loading image from URL.")
        return None


def main():
    """Build the Streamlit page and run the image -> caption -> speech flow.

    Configures the page, renders the static header content, lets the user
    supply an image (upload or URL) and, once an image is available, shows
    it, generates a caption, synthesizes speech for the caption, and renders
    an audio player plus an optional waveform plot.
    """
    st.set_page_config(
        page_title="Image-to-Speech",
        page_icon="📸",
        initial_sidebar_state="collapsed",
        menu_items={
            'Get Help': 'https://www.extremelycoolapp.com/help',
            'Report a bug': "https://www.extremelycoolapp.com/bug",
            'About': "# This is a header. This is an *extremely* cool app!"
        }
    )

    st.sidebar.markdown("---")
    st.sidebar.markdown("Developed by Alim Tleuliyev")
    st.sidebar.markdown("Contact: [alim.tleuliyev@nu.edu.kz](mailto:alim.tleuliyev@nu.edu.kz)")
    st.sidebar.markdown("GitHub: [Repo](https://github.com/AlimTleuliyev/image-to-audio)")

    st.markdown(
        """
<style>
.container {
max-width: 800px;
}
.title {
text-align: center;
font-size: 32px;
font-weight: bold;
margin-bottom: 20px;
}
.description {
margin-bottom: 30px;
}
.instructions {
margin-bottom: 20px;
padding: 10px;
background-color: #f5f5f5;
border-radius: 5px;
}
</style>
""",
        unsafe_allow_html=True
    )

    # Title
    st.markdown("<div class='title'>Image Captioning and Text-to-Speech</div>", unsafe_allow_html=True)

    # Three columns with the logo in the wider middle one centers the image.
    col1, col2, col3 = st.columns([1, 2, 1])

    with col1:
        st.write("")

    with col2:
        st.image("images/logo.png", use_column_width=True, caption="Generated by DALL-E")

    with col3:
        st.write("")

    # Model Description
    st.markdown("<div class='description'>" + model_description + "</div>", unsafe_allow_html=True)

    # Instructions
    with st.expander("Instructions"):
        st.markdown("1. Upload an image or provide the URL of an image.")
        st.markdown("2. Click the 'Generate Caption and Speech' button.")
        st.markdown("3. The generated caption will be displayed, and the speech will start playing.")

    # Choose image source
    image_source = st.radio("Select Image Source:", ("Upload Image", "Open from URL"))

    image = None

    if image_source == "Upload Image":
        # File uploader for image
        uploaded_file = st.file_uploader("Upload an image", type=["png", "jpg", "jpeg"])
        if uploaded_file is not None:
            try:
                image = Image.open(uploaded_file)
            except OSError:
                # A file with an image extension may still be undecodable.
                st.error("Error loading uploaded image.")
    else:
        # Input box for image URL
        url = st.text_input("Enter the image URL:")
        if url:
            image = _load_image_from_url(url)

    # Nothing to process until an image has been supplied successfully.
    if image is None:
        return

    # Display the selected image
    st.image(image, caption='Uploaded Image', use_column_width=True)

    # Initialize image captioning models
    caption_processor, caption_model = initialize_image_captioning()

    # Initialize speech synthesis models
    speech_processor, speech_model, speech_vocoder, speaker_embeddings = initialize_speech_synthesis()

    # Generate caption
    with st.spinner("Generating Caption..."):
        output_caption = generate_caption(caption_processor, caption_model, image)

    # Display the caption
    st.subheader("Caption:")
    st.write(output_caption)

    # Generate speech from the caption
    with st.spinner("Generating Speech..."):
        generate_speech(speech_processor, speech_model, speech_vocoder, speaker_embeddings, output_caption)

    st.subheader("Audio:")
    # Play the generated sound
    play_sound()

    # Visualize the speech waveform
    with st.expander("See visualization"):
        visualize_speech()
196
+
197
+
198
# Entry point: build and run the app only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
images/logo.png ADDED

Git LFS Details

  • SHA256: 73504ab6020f847646bd02acd50aeb96492a640491e5952e76f82d77527ceed9
  • Pointer size: 132 Bytes
  • Size of remote file: 1.73 MB