EngrMuhammadBilal committed on
Commit
fd83c02
·
verified ·
1 Parent(s): bb84400

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +151 -39
app.py CHANGED
@@ -1,80 +1,192 @@
1
  import io
 
 
2
  from datetime import datetime
3
- from gtts import gTTS
 
 
 
4
  import streamlit as st
 
 
 
5
 
6
- st.set_page_config(page_title="Urdu Voice Over", page_icon="🔊", layout="centered")
 
7
 
8
- st.title("🔊 Urdu Text → Voice")
9
- st.caption("Type or paste Urdu text and get an MP3 voice over. Works great on Hugging Face Spaces.")
 
 
 
 
 
 
10
 
11
- # Sidebar options
 
 
 
 
12
  with st.sidebar:
13
  st.header("Options")
14
- slow = st.checkbox("Slow reading", False)
15
- file_basename = st.text_input("Output filename (without extension)", "urdu_voice")
 
 
 
 
 
 
16
  st.markdown("---")
17
- st.write("Tips")
18
- st.caption("1) Paste clean Urdu text\n2) Use Slow reading if the voice feels fast\n3) Download the MP3 for editing")
 
19
 
20
- # Main input
21
- default_sample = "یہ ایک سادہ مثال ہے، آپ یہاں اپنا متن لکھیں اور آڈیو حاصل کریں۔"
 
 
 
 
 
 
 
 
 
 
22
  text = st.text_area(
23
  "Urdu text",
24
- value=default_sample,
25
- height=200,
26
  placeholder="یہاں اردو میں ٹیکسٹ لکھیں یا پیسٹ کریں…"
27
  )
28
 
29
- def tts_to_bytes(urdu_text: str, slow_read: bool = False) -> bytes:
30
- # gTTS automatically chunks long text
31
- tts = gTTS(text=urdu_text, lang="ur", slow=slow_read)
32
- buf = io.BytesIO()
33
- tts.write_to_fp(buf)
34
- buf.seek(0)
35
- return buf.read()
36
-
37
- col1, col2 = st.columns([1,1])
38
  with col1:
39
- make_audio = st.button("🎙️ Generate Voice", use_container_width=True)
40
  with col2:
41
- clear = st.button("🧹 Clear", use_container_width=True)
42
 
43
- if clear:
44
  st.session_state.pop("audio_bytes", None)
45
- st.session_state.pop("last_text", None)
46
  st.experimental_rerun()
47
 
48
- if make_audio:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  if not text.strip():
50
- st.warning("براہ کرم آڈیو بنانے کے لیے اردو متن درج کریں")
 
 
51
  else:
52
  try:
53
- audio_bytes = tts_to_bytes(text.strip(), slow_read=slow)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  st.session_state["audio_bytes"] = audio_bytes
55
- st.session_state["last_text"] = text.strip()
56
- st.success("آڈیو تیار ہے")
 
 
 
 
 
 
 
 
 
57
  except Exception as e:
58
  st.error(f"کچھ مسئلہ آیا: {e}")
59
 
60
- # Playback and download
 
 
61
  if "audio_bytes" in st.session_state:
62
  st.markdown("### ▶️ Preview")
63
- st.audio(st.session_state["audio_bytes"], format="audio/mp3")
64
 
65
- timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
66
- base = file_basename.strip() or "urdu_voice"
67
- fname = f"{base}_{timestamp}.mp3"
68
 
69
  st.download_button(
70
- label="⬇️ Download MP3",
71
  data=st.session_state["audio_bytes"],
72
  file_name=fname,
73
- mime="audio/mpeg",
74
  use_container_width=True
75
  )
76
 
77
  st.markdown("---")
78
  st.caption(
79
- "Note: This app uses gTTS for Urdu speech synthesis. If your Space is very busy or internet is restricted, synthesis can fail. Try again after a short while."
 
80
  )
 
1
  import io
2
+ import os
3
+ import tempfile
4
  from datetime import datetime
5
+
6
+ import librosa
7
+ import numpy as np
8
+ import soundfile as sf
9
  import streamlit as st
10
+ from TTS.api import TTS
11
+
12
# Page chrome: browser-tab metadata first, then the visible heading and tagline.
st.set_page_config(
    page_title="Urdu Voice Cloner",
    page_icon="🗣️",
    layout="centered",
)

st.title("🗣️ Urdu Text → Your Voice (Voice Cloning)")
st.caption(
    "Upload a short sample of your voice, type Urdu text, and get audio in your voice."
)
16
 
17
+ # ----------------------------
18
+ # Caching the model to avoid reloading
19
+ # ----------------------------
20
@st.cache_resource(show_spinner=True)
def load_tts():
    """Build the Coqui XTTS v2 synthesizer once and reuse it across reruns.

    The ``st.cache_resource`` decorator caches the returned object, so the
    model is constructed only on the first call of the app process.

    NOTE(review): the earlier comment here stated that XTTS v2 covers Urdu
    (language code ``"ur"``). Nothing in this file establishes that; confirm
    against the model's published language list before relying on it.

    Returns:
        The ``TTS`` instance for the multilingual XTTS v2 model.
    """
    model_id = "tts_models/multilingual/multi-dataset/xtts_v2"
    synthesizer = TTS(model_name=model_id)
    return synthesizer
25
 
26
+ tts = load_tts()
27
+
28
+ # ----------------------------
29
+ # Sidebar: options
30
+ # ----------------------------
31
# Sidebar widgets. Each assignment below is read later by the generate and
# download sections, so the variable names are part of the script's contract.
with st.sidebar:
    st.header("Options")

    # --- Reference-voice guidance and cloning controls ---
    st.markdown("**Reference voice**")
    st.caption("Upload a clean 10–30 second sample with minimal noise.")
    similarity_boost = st.slider(
        "Similarity boost", min_value=0.0, max_value=1.0, value=0.75, step=0.05
    )
    stability = st.slider(
        "Stability", min_value=0.0, max_value=1.0, value=0.6, step=0.05
    )
    style = st.slider(
        "Style (expressiveness)", min_value=0.0, max_value=1.0, value=0.35, step=0.05
    )
    seed = st.number_input("Random seed (for reproducibility)", value=42, step=1)

    # --- Post-processing applied to the synthesized waveform ---
    st.markdown("---")
    st.markdown("**Post-processing**")
    rate = st.slider(
        "Speaking rate (time-stretch)",
        min_value=0.75,
        max_value=1.25,
        value=1.0,
        step=0.01,
    )
    normalize = st.checkbox("Normalize loudness", value=True)

    # --- Download naming ---
    st.markdown("---")
    base_name = st.text_input(
        "Output filename (no extension)", value="urdu_voice_clone"
    )
48
+
49
+ # ----------------------------
50
+ # Inputs
51
+ # ----------------------------
52
# Main-page inputs: the reference recording and the Urdu text to speak.
ref_file = st.file_uploader(
    label="Upload your voice sample (wav/mp3/m4a)",
    type=["wav", "mp3", "m4a", "ogg", "flac"],
)

# Pre-filled sample sentence so the app can be tried without typing.
default_text = "یہ میری آواز کی مثال ہے۔ آپ یہاں اپنا متن لکھیں اور آڈیو حاصل کریں۔"
text = st.text_area(
    label="Urdu text",
    value=default_text,
    height=180,
    placeholder="یہاں اردو میں ٹیکسٹ لکھیں یا پیسٹ کریں…",
)
64
 
65
# Two equal-width columns hold the primary action and the reset button.
col1, col2 = st.columns(2)
with col1:
    run_btn = st.button("🎙️ Generate", use_container_width=True)
with col2:
    clear_btn = st.button("🧹 Clear", use_container_width=True)

if clear_btn:
    # Drop everything the generate step stashed so the preview disappears.
    for stale_key in ("audio_bytes", "preview_sr"):
        st.session_state.pop(stale_key, None)
    # st.experimental_rerun was superseded by st.rerun and later removed from
    # Streamlit; prefer the stable name and fall back on older installs.
    rerun = getattr(st, "rerun", None) or st.experimental_rerun
    rerun()
75
 
76
+ # ----------------------------
77
+ # Helpers
78
+ # ----------------------------
79
def load_and_standardize(audio_file, target_sr=16000):
    """Convert an uploaded voice sample into a mono WAV file on disk.

    The audio is decoded at its native sample rate, resampled to
    ``target_sr``, lightly trimmed of leading/trailing silence, and written
    to a temporary ``.wav`` file.

    Args:
        audio_file: Path or file-like object passed to ``librosa.load``.
        target_sr: Sample rate in Hz of the file that is written.

    Returns:
        Path of the temporary WAV file. The file is created with
        ``delete=False``, so the caller is responsible for removing it.
    """
    y, sr = librosa.load(audio_file, sr=None, mono=True)
    # ``y`` is still at the native rate here, so the 3-second threshold must
    # be expressed in native-rate samples (sr), not target-rate samples.
    if len(y) < sr * 3:
        st.warning("Voice sample is very short. Try at least 5–10 seconds for better cloning.")
    y_res = librosa.resample(y, orig_sr=sr, target_sr=target_sr)
    # Light trim to remove leading/trailing silence
    yt, _ = librosa.effects.trim(y_res, top_db=30)
    if yt.size < target_sr:
        # Trimming left under one second of audio; keep the untrimmed signal.
        yt = y_res
    # Only the reserved name is needed: close the handle right away so it is
    # not leaked and the path can be reopened for writing on every platform.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_wav:
        out_path = tmp_wav.name
    sf.write(out_path, yt, target_sr)
    return out_path
92
+
93
def postprocess_rate_and_norm(wav, sr, rate_factor=1.0, do_norm=True):
    """Time-stretch and peak-normalize a waveform.

    Args:
        wav: Audio samples (array-like); converted to a float32 array copy.
        sr: Sample rate in Hz. Accepted for interface symmetry with the other
            helpers; it is not used by the processing below.
        rate_factor: Time-stretch rate; ``1.0`` skips stretching. Must be
            strictly positive.
        do_norm: When true, scale the signal so its peak magnitude is 0.98.

    Returns:
        The processed samples as a float32 array.

    Raises:
        ValueError: If ``rate_factor`` is not strictly positive.
    """
    if rate_factor <= 0:
        # librosa requires strictly positive values; fail early and clearly.
        raise ValueError(f"rate_factor must be positive, got {rate_factor!r}")

    y = np.asarray(wav).astype(np.float32)
    if y.size == 0:
        # Nothing to stretch, and np.max would raise on an empty array.
        return y

    if rate_factor != 1.0:
        # ``rate`` is keyword-only in current librosa releases; passing it by
        # name also works on older versions.
        y = librosa.effects.time_stretch(y, rate=rate_factor)
    if do_norm:
        # Epsilon keeps the division finite for an all-silent signal.
        peak = np.max(np.abs(y)) + 1e-9
        y = 0.98 * (y / peak)
    return y
103
+
104
def wav_bytes_from_array(y, sr):
    """Encode samples ``y`` at sample rate ``sr`` as WAV and return the bytes.

    The file is written to an in-memory buffer via ``sf.write``; nothing
    touches the disk.
    """
    with io.BytesIO() as sink:
        sf.write(sink, y, sr, format="WAV")
        return sink.getvalue()
109
+
110
+ # ----------------------------
111
+ # Run
112
+ # ----------------------------
113
# Generate: validate inputs, synthesize to a temp WAV, post-process, and
# stash the encoded bytes in session state for the preview section.
if run_btn:
    if not text.strip():
        st.warning("براہ کرم اردو متن درج کریں۔")
    elif ref_file is None:
        st.warning("براہ کرم اپنی آواز کی آڈیو فائل اپلوڈ کریں۔")
    else:
        # Track temp paths up front so cleanup also runs when a step fails.
        ref_path = None
        out_wav_path = None
        try:
            st.info("Preparing reference voice…")
            ref_path = load_and_standardize(ref_file)

            st.info("Cloning voice and synthesizing Urdu…")
            # Reserve an output path and close the handle immediately so it
            # is not leaked while the synthesizer writes to the same path.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as out_tmp:
                out_wav_path = out_tmp.name

            # Coqui XTTS generation, conditioned on the reference recording.
            # NOTE(review): speaker_similarity / stability / style / seed are
            # forwarded as extra keyword arguments; whether the installed TTS
            # build accepts them is not established here — confirm against
            # the TTS API before relying on these sliders.
            tts.tts_to_file(
                text=text.strip(),
                file_path=out_wav_path,
                speaker_wav=ref_path,
                language="ur",
                split_sentences=True,
                speed=1.0,  # base speed, we will also post-process rate
                speaker_similarity=similarity_boost,
                stability=stability,
                style_wav=None,
                style=style,
                seed=int(seed),
            )

            # Read back and post-process
            y, sr = sf.read(out_wav_path, dtype="float32")
            y = postprocess_rate_and_norm(y, sr, rate_factor=rate, do_norm=normalize)
            audio_bytes = wav_bytes_from_array(y, sr)

            # Stash in session for preview and download
            st.session_state["audio_bytes"] = audio_bytes
            st.session_state["preview_sr"] = sr

            st.success("آڈیو تیار ہے۔ نیچے سنیں یا ڈاؤن لوڈ کریں۔")

        except Exception as e:
            # Top-level UI boundary: surface any failure to the user.
            st.error(f"کچھ مسئلہ آیا: {e}")
        finally:
            # Best-effort cleanup, one file at a time so a failure on the
            # first path does not leave the second behind.
            for tmp_path in (ref_path, out_wav_path):
                if tmp_path is None:
                    continue
                try:
                    os.remove(tmp_path)
                except OSError:
                    pass
168
 
169
+ # ----------------------------
170
+ # Preview and download
171
+ # ----------------------------
172
# Preview player and download button, shown once audio has been generated.
if "audio_bytes" in st.session_state:
    st.markdown("### ▶️ Preview")
    st.audio(st.session_state["audio_bytes"], format="audio/wav")

    ts = datetime.now().strftime("%Y%m%d_%H%M%S")
    # Strip first, then fall back: a whitespace-only name must not produce
    # an empty base (which would yield a file called "_<timestamp>.wav").
    base = (base_name or "").strip() or "urdu_voice_clone"
    fname = f"{base}_{ts}.wav"

    st.download_button(
        "⬇️ Download WAV",
        data=st.session_state["audio_bytes"],
        file_name=fname,
        mime="audio/wav",
        use_container_width=True,
    )
187
 
188
# Closing divider and usage hints shown at the bottom of the page.
st.markdown("---")
footer_tips = (
    "Tips: Use a clear 10–30 second reference with low noise. Speak naturally. "
    "If cloning feels off, try a different sample, raise Similarity, or lower Stability a little."
)
st.caption(footer_tips)