sadimanna committed on
Commit
1168147
·
1 Parent(s): e41a14b

added files

Browse files
Files changed (3) hide show
  1. Dockerfile +34 -0
  2. app.py +204 -0
  3. requirements.txt +9 -0
Dockerfile ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Container image for the MMS-TTS Streamlit app (configured for Hugging Face Spaces).
FROM python:3.10-slim

# Prevent Python from buffering stdout/stderr
ENV PYTHONUNBUFFERED=1

# Install system dependencies.
# --no-install-recommends keeps optional packages out of the image; the apt
# lists are removed in the same layer so they never end up in the image.
RUN apt-get update && apt-get install -y --no-install-recommends \
        git \
        ffmpeg \
        libsndfile1 \
    && rm -rf /var/lib/apt/lists/*

# Hugging Face Spaces runs Docker containers as UID 1000. Create that user with
# a real home directory so libraries that write under $HOME (model cache,
# Streamlit config) do not fail with permission errors.
RUN useradd -m -u 1000 user

# Set working directory
WORKDIR /app

# Copy dependency files first so the pip layer is cached across code changes
COPY requirements.txt .

# Install Python dependencies (system-wide, as root, so `streamlit` is on PATH)
RUN pip install --no-cache-dir -r requirements.txt

# Copy application code
COPY --chown=user app.py .

# Drop privileges for runtime and point HOME at the writable home directory
USER user
ENV HOME=/home/user

# Expose Streamlit port
EXPOSE 7860

# Streamlit configuration for HF Spaces
ENV STREAMLIT_SERVER_HEADLESS=true
ENV STREAMLIT_SERVER_PORT=7860
ENV STREAMLIT_SERVER_ADDRESS=0.0.0.0

# Run Streamlit
CMD ["streamlit", "run", "app.py"]
app.py ADDED
@@ -0,0 +1,204 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import torch
3
+ import numpy as np
4
+ import tempfile
5
+ from transformers import VitsModel, AutoTokenizer
6
+ from scipy.io.wavfile import write
7
+ import librosa
8
+ from scipy.signal import butter, lfilter
9
+
10
+ #============================================
11
+ # Voice Presets
12
+ #============================================
13
+
14
# Voice presets offered in the "Voice Preset" select box, in display order.
# Each preset fixes a pitch shift, a speed factor and an optional named effect
# ("bass" or "robotic"). The "Custom (Manual)" entry maps to None, which the UI
# code treats as "take pitch and speed from the sliders".
VOICE_PRESETS = {
    "Custom (Manual)": None,  # special case
    "Neutral": dict(pitch=0, speed=1.0, effect=None),
    "Deep": dict(pitch=-4, speed=0.9, effect="bass"),
    "Child-like": dict(pitch=5, speed=1.15, effect=None),
    "Robotic": dict(pitch=0, speed=1.0, effect="robotic"),
}
37
+
38
+
39
+ #============================================
40
+ # Audio Post-Processing Functions
41
+ #============================================
42
+
43
def apply_pitch_speed(audio, sr, pitch=0, speed=1.0):
    """Time-stretch and then pitch-shift a waveform using librosa.

    Args:
        audio: Waveform samples, passed straight through to librosa.
        sr: Sampling rate of ``audio``; only used for the pitch shift.
        pitch: Pitch shift handed to librosa as ``n_steps``. A value of 0
            skips the pitch-shift step.
        speed: Rate handed to ``librosa.effects.time_stretch``. A value of
            exactly 1.0 skips the time-stretch step.

    Returns:
        The processed waveform. When both steps are skipped the input object
        itself is returned unchanged.
    """
    # Stretch first so the pitch shift operates on the already re-timed signal.
    stretched = (
        audio
        if speed == 1.0
        else librosa.effects.time_stretch(audio, rate=speed)
    )
    if pitch == 0:
        return stretched
    return librosa.effects.pitch_shift(stretched, sr=sr, n_steps=pitch)
49
+
50
+
51
def bass_boost(audio, sr, gain=1.5, cutoff=200):
    """Boost low frequencies by mixing a low-passed copy back into the signal.

    Args:
        audio: Waveform samples.
        sr: Sampling rate of ``audio``, used to normalise the cutoff.
        gain: Multiplier applied to the low-passed copy before it is added.
        cutoff: Low-pass cutoff, in the same frequency unit as ``sr``.

    Returns:
        ``audio`` plus ``gain`` times its low-passed version (not re-normalised).
    """
    # butter() expects the cutoff as a fraction of the Nyquist frequency.
    nyquist = sr / 2
    numerator, denominator = butter(2, cutoff / nyquist, btype="low")
    low_band = lfilter(numerator, denominator, audio)
    return audio + gain * low_band
55
+
56
+
57
def robotic_effect(audio, sr, freq=30):
    """Multiply the waveform by a sine wave to give it a robotic tremolo.

    Args:
        audio: Waveform samples.
        sr: Sampling rate of ``audio``; converts sample indices to time.
        freq: Frequency of the modulating sine, in cycles per unit of time
            implied by ``sr``.

    Returns:
        ``audio`` multiplied sample-by-sample with the sine modulator.
    """
    # Time stamp of every sample, then the sine carrier evaluated at each one.
    sample_times = np.arange(len(audio)) / sr
    carrier = np.sin(2 * np.pi * freq * sample_times)
    return audio * carrier
61
+
62
+
63
+ # ------------------------
64
+ # Page config
65
+ # ------------------------
66
# Browser tab title and a centered page layout.
st.set_page_config(page_title="MMS-TTS English", layout="centered")

# Page heading.
st.title("🔊 MMS-TTS English (Speed & Pitch Control)")

# Short introduction rendered as Markdown below the heading.
st.markdown(
    """
Generate English speech using **facebook/mms-tts-eng**
Post-process audio to control **speed** and **pitch**.
"""
)
79
+
80
+ # ------------------------
81
+ # Load model (cached)
82
+ # ------------------------
83
@st.cache_resource
def load_model():
    """Load the ``facebook/mms-tts-eng`` VITS model and its tokenizer.

    The function is wrapped in ``st.cache_resource`` so Streamlit caches the
    returned objects instead of reloading them on every script run.

    Returns:
        A ``(model, tokenizer)`` tuple; the model has been put in eval mode.
    """
    checkpoint = "facebook/mms-tts-eng"

    tts_model = VitsModel.from_pretrained(checkpoint)
    tts_model.eval()  # inference only
    tts_tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    return tts_model, tts_tokenizer


# Module-level handles used by the generation step further down the script.
model, tokenizer = load_model()
91
+
92
+ # ------------------------
93
+ # UI Controls
94
+ # ------------------------
95
st.caption(
    "Select a preset for fixed voice styles, or choose Custom (Manual) to control pitch and speed yourself."
)

# Preset picker; the first entry is selected by default.
preset_name = st.selectbox("Voice Preset", options=list(VOICE_PRESETS), index=0)

# A preset value of None marks manual mode (see VOICE_PRESETS).
preset = VOICE_PRESETS[preset_name]
is_custom = preset is None

text = st.text_area(
    "Input Text",
    height=150,
    placeholder="Enter English text here...",
)

# Both sliders are always rendered but are only editable in manual mode.
sliders_locked = not is_custom

speed = st.slider(
    "Speech Speed",
    min_value=0.5,
    max_value=1.5,
    value=1.0,
    step=0.05,
    disabled=sliders_locked,
)

pitch = st.slider(
    "Pitch Shift (semitones)",
    min_value=-6,
    max_value=6,
    value=0,
    step=1,
    disabled=sliders_locked,
)

if is_custom:
    # Manual mode: build an ad-hoc preset from the slider values, no effect.
    preset = {"pitch": pitch, "speed": speed, "effect": None}
else:
    # Fixed preset: show the user which settings will be applied.
    effect_label = preset["effect"] or "None"
    st.info(
        f"Preset selected: **{preset_name}**\n\n"
        f"- Pitch: {preset['pitch']} semitones\n"
        f"- Speed: {preset['speed']}x\n"
        f"- Effect: {effect_label}"
    )
145
+
146
+ # ------------------------
147
+ # Generate Button
148
+ # ------------------------
149
+
150
def _encode_wav(audio, sr):
    """Peak-normalise ``audio`` and return it as 16-bit PCM WAV bytes.

    Args:
        audio: Floating-point waveform samples.
        sr: Sampling rate written into the WAV header.

    Returns:
        The complete WAV file content as ``bytes``.
    """
    import io  # local import: only this helper needs it

    samples = np.atleast_1d(np.asarray(audio))

    # Guard the normalisation: a silent (all-zero) or empty signal has no peak
    # to scale by, and dividing by zero would fill the output with NaNs.
    peak = np.max(np.abs(samples)) if samples.size else 0.0
    if peak > 0:
        samples = samples / peak
    pcm = (samples * 32767).astype(np.int16)

    # Encode in memory so no temporary file is left behind on disk.
    buffer = io.BytesIO()
    write(buffer, sr, pcm)
    return buffer.getvalue()


generate = st.button("🎙️ Generate Audio")

# ------------------------
# Generation
# ------------------------
if generate:
    if not text.strip():
        st.warning("Please enter text.")
    else:
        with st.spinner("Generating speech..."):
            inputs = tokenizer(text, return_tensors="pt")

            # Inference only: no gradients needed.
            with torch.no_grad():
                waveform = model(**inputs).waveform

            audio = waveform.squeeze().cpu().numpy()
            sr = model.config.sampling_rate

            # Apply pitch + speed
            audio = apply_pitch_speed(
                audio,
                sr,
                pitch=preset["pitch"],
                speed=preset["speed"],
            )

            # Apply effect
            if preset["effect"] == "bass":
                audio = bass_boost(audio, sr)
            elif preset["effect"] == "robotic":
                audio = robotic_effect(audio, sr)

            # Normalize and encode once; the same bytes feed the player and
            # the download button.
            wav_bytes = _encode_wav(audio, sr)

        st.success("Audio generated successfully!")

        st.audio(wav_bytes, format="audio/wav")

        st.download_button(
            "⬇️ Download WAV",
            data=wav_bytes,
            file_name="mms_tts_output.wav",
            mime="audio/wav",
        )
204
+ # ------------------------
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ torch
2
+ transformers>=4.33
3
+ accelerate
4
+ streamlit
5
+ scipy
6
+ soundfile
7
+ librosa
8
+ numpy
9
+ pydub