EngrMuhammadBilal committed on
Commit
346ece7
·
verified ·
1 Parent(s): 86d16eb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +74 -105
app.py CHANGED
@@ -3,64 +3,72 @@ import os
3
  import tempfile
4
  from datetime import datetime
5
 
6
- import librosa
7
  import numpy as np
8
  import soundfile as sf
9
  import streamlit as st
10
  from TTS.api import TTS
11
 
12
- st.set_page_config(page_title="Urdu Voice Cloner", page_icon="🗣️", layout="centered")
13
 
14
  st.title("🗣️ Urdu Text → Your Voice (Voice Cloning)")
15
- st.caption("Upload a short sample of your voice, type Urdu text, and get audio in your voice.")
16
 
17
  # ----------------------------
18
- # Caching the model to avoid reloading
19
  # ----------------------------
20
  @st.cache_resource(show_spinner=True)
21
  def load_tts():
22
- # XTTS v2 supports multilingual zero-shot cloning, including Urdu (code: 'ur')
23
- # Model will download on first run and then be cached by the Space
24
  return TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2")
25
 
26
  tts = load_tts()
27
 
28
  # ----------------------------
29
- # Sidebar: options
30
  # ----------------------------
31
  with st.sidebar:
32
  st.header("Options")
33
- st.markdown("**Reference voice**")
34
- st.caption("Upload a clean 10–30 second sample with minimal noise.")
35
- # XTTS controls
36
  similarity_boost = st.slider("Similarity boost", 0.0, 1.0, 0.75, 0.05)
37
- stability = st.slider("Stability", 0.0, 1.0, 0.6, 0.05)
38
  style = st.slider("Style (expressiveness)", 0.0, 1.0, 0.35, 0.05)
39
- seed = st.number_input("Random seed (for reproducibility)", value=42, step=1)
40
-
41
- st.markdown("---")
42
- st.markdown("**Post-processing**")
43
- rate = st.slider("Speaking rate (time-stretch)", 0.75, 1.25, 1.00, 0.01)
44
  normalize = st.checkbox("Normalize loudness", True)
45
-
46
- st.markdown("---")
47
  base_name = st.text_input("Output filename (no extension)", "urdu_voice_clone")
 
48
 
49
  # ----------------------------
50
- # Inputs
51
  # ----------------------------
52
- ref_file = st.file_uploader(
53
- "Upload your voice sample (wav/mp3/m4a)",
54
- type=["wav", "mp3", "m4a", "ogg", "flac"]
55
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
 
 
 
 
 
57
  default_text = "یہ میری آواز کی مثال ہے۔ آپ یہاں اپنا متن لکھیں اور آڈیو حاصل کریں۔"
58
- text = st.text_area(
59
- "Urdu text",
60
- value=default_text,
61
- height=180,
62
- placeholder="یہاں اردو میں ٹیکسٹ لکھیں یا پیسٹ کریں…"
63
- )
64
 
65
  col1, col2 = st.columns(2)
66
  with col1:
@@ -70,45 +78,10 @@ with col2:
70
 
71
  if clear_btn:
72
  st.session_state.pop("audio_bytes", None)
73
- st.session_state.pop("preview_sr", None)
74
  st.experimental_rerun()
75
 
76
  # ----------------------------
77
- # Helpers
78
- # ----------------------------
79
def load_and_standardize(audio_file, target_sr=16000):
    """Load user audio, convert it to a mono WAV at ``target_sr`` and return a temp path.

    Args:
        audio_file: Path or file-like object handed to ``librosa.load``.
        target_sr: Sample rate (Hz) of the written WAV file.

    Returns:
        Path of a temporary ``.wav`` file holding the resampled, silence-trimmed
        audio. The file is created with ``delete=False``; the caller is
        responsible for removing it.
    """
    y, sr = librosa.load(audio_file, sr=None, mono=True)

    # ``y`` is still at the file's native rate here, so the 3-second check
    # must use ``sr``; comparing against ``target_sr`` mis-measures duration.
    if len(y) < sr * 3:
        st.warning("Voice sample is very short. Try at least 5–10 seconds for better cloning.")

    y_res = librosa.resample(y, orig_sr=sr, target_sr=target_sr)

    # Light trim to remove leading/trailing silence.
    yt, _ = librosa.effects.trim(y_res, top_db=30)
    if yt.size < target_sr:  # ensure at least 1s remains, else keep untrimmed audio
        yt = y_res

    # Reserve a unique path, then close the handle before soundfile writes to
    # it so no file descriptor is leaked.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_wav:
        wav_path = tmp_wav.name
    sf.write(wav_path, yt, target_sr)
    return wav_path
92
-
93
def postprocess_rate_and_norm(wav, sr, rate_factor=1.0, do_norm=True):
    """Optionally time-stretch and peak-normalize a waveform.

    Args:
        wav: NumPy array of audio samples.
        sr: Sample rate; accepted for interface compatibility, not used here.
        rate_factor: Stretch factor passed to ``librosa.effects.time_stretch``;
            ``1.0`` skips stretching entirely.
        do_norm: When true, scale so the absolute peak is 0.98.

    Returns:
        The processed samples as a float array (float32 input cast).
    """
    y = wav.astype(np.float32)
    if rate_factor != 1.0:
        # ``rate`` is keyword-only in librosa >= 0.10; the keyword form also
        # works on older releases.
        y = librosa.effects.time_stretch(y, rate=rate_factor)
    # Skip normalization for empty audio: np.max raises on zero-size arrays.
    if do_norm and y.size:
        peak = np.max(np.abs(y)) + 1e-9  # epsilon avoids division by zero on silence
        y = 0.98 * (y / peak)
    return y
103
-
104
def wav_bytes_from_array(y, sr):
    """Encode samples ``y`` at sample rate ``sr`` as WAV and return the raw bytes."""
    stream = io.BytesIO()
    sf.write(stream, y, sr, format="WAV")
    # getvalue() yields the whole buffer regardless of the current position.
    return stream.getvalue()
109
-
110
- # ----------------------------
111
- # Run
112
  # ----------------------------
113
  if run_btn:
114
  if not text.strip():
@@ -117,76 +90,72 @@ if run_btn:
117
  st.warning("براہ کرم اپنی آواز کی آڈیو فائل اپلوڈ کریں۔")
118
  else:
119
  try:
120
- st.info("Preparing reference voice…")
121
- ref_path = load_and_standardize(ref_file)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
 
123
- st.info("Cloning voice and synthesizing Urdu…")
124
- # Generate to a temporary file first
125
  out_wav_path = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
126
 
127
- # Coqui XTTS generation
128
- # Extra params passed via "speaker_wav" and "language"
129
- # Controls: "speaker_similarity", "style", "temperature", "length_scale" etc. are model dependent.
130
  tts.tts_to_file(
131
  text=text.strip(),
132
  file_path=out_wav_path,
133
- speaker_wav=ref_path,
134
  language="ur",
135
- # Extra inference kwargs routed to the model (supported by XTTS v2)
136
- # See: https://github.com/coqui-ai/TTS
137
- # Using similarity/stability/style through speaker conditioning
138
- # Some builds accept these as speaker_cfg; we forward common names:
139
  split_sentences=True,
140
- speed=1.0, # base speed, we will also post-process rate
141
- speaker_similarity=similarity_boost,
142
- stability=stability,
143
- style_wav=None,
144
- style=style,
145
- seed=int(seed)
146
  )
147
 
148
- # Read back and post-process
149
- y, sr = sf.read(out_wav_path, dtype="float32")
150
- y = postprocess_rate_and_norm(y, sr, rate_factor=rate, do_norm=normalize)
151
- audio_bytes = wav_bytes_from_array(y, sr)
152
 
153
- # Stash in session for preview and download
154
  st.session_state["audio_bytes"] = audio_bytes
155
- st.session_state["preview_sr"] = sr
156
 
157
- # Clean temp files
158
  try:
159
- os.remove(ref_path)
160
  os.remove(out_wav_path)
161
  except Exception:
162
  pass
163
 
164
- st.success("آڈیو تیار ہے۔ نیچے سنیں یا ڈاؤن لوڈ کریں۔")
165
 
166
  except Exception as e:
167
  st.error(f"کچھ مسئلہ آیا: {e}")
168
 
169
  # ----------------------------
170
- # Preview and download
171
  # ----------------------------
172
  if "audio_bytes" in st.session_state:
173
  st.markdown("### ▶️ Preview")
174
  st.audio(st.session_state["audio_bytes"], format="audio/wav")
175
 
176
  ts = datetime.now().strftime("%Y%m%d_%H%M%S")
177
- base = (base_name or "urdu_voice_clone").strip()
178
- fname = f"{base}_{ts}.wav"
179
-
180
- st.download_button(
181
- "⬇️ Download WAV",
182
- data=st.session_state["audio_bytes"],
183
- file_name=fname,
184
- mime="audio/wav",
185
- use_container_width=True
186
- )
187
 
188
  st.markdown("---")
189
  st.caption(
190
- "Tips: Use a clear 10–30 second reference with low noise. Speak naturally. "
191
- "If cloning feels off, try a different sample, raise Similarity, or lower Stability a little."
192
  )
 
3
  import tempfile
4
  from datetime import datetime
5
 
 
6
  import numpy as np
7
  import soundfile as sf
8
  import streamlit as st
9
  from TTS.api import TTS
10
 
11
+ st.set_page_config(page_title="Urdu Voice Cloner (XTTS v2)", page_icon="🗣️", layout="centered")
12
 
13
  st.title("🗣️ Urdu Text → Your Voice (Voice Cloning)")
14
+ st.caption("Upload a short sample of your voice, type Urdu text, and get audio in your voice (XTTS v2, CPU friendly).")
15
 
16
  # ----------------------------
17
+ # Cache the model so it loads once
18
  # ----------------------------
19
  @st.cache_resource(show_spinner=True)
20
  def load_tts():
21
+ # Multilingual zero-shot cloning, supports Urdu with language='ur'
 
22
  return TTS(model_name="tts_models/multilingual/multi-dataset/xtts_v2")
23
 
24
  tts = load_tts()
25
 
26
  # ----------------------------
27
+ # Sidebar options
28
  # ----------------------------
29
  with st.sidebar:
30
  st.header("Options")
31
+ st.caption("Upload a clean 10–30s clip, no background noise if possible.")
 
 
32
  similarity_boost = st.slider("Similarity boost", 0.0, 1.0, 0.75, 0.05)
33
+ stability = st.slider("Stability", 0.0, 1.0, 0.60, 0.05)
34
  style = st.slider("Style (expressiveness)", 0.0, 1.0, 0.35, 0.05)
 
 
 
 
 
35
  normalize = st.checkbox("Normalize loudness", True)
 
 
36
  base_name = st.text_input("Output filename (no extension)", "urdu_voice_clone")
37
+ seed = st.number_input("Random seed", value=42, step=1)
38
 
39
  # ----------------------------
40
+ # Simple helpers (no librosa)
41
  # ----------------------------
42
def simple_trim_silence(wave: np.ndarray, threshold: float = 1e-4, pad: int = 0) -> np.ndarray:
    """Trim leading and trailing near-silence from an audio signal.

    Input with more than one dimension is first averaged over axis 1 to a
    single channel. The result spans from the first to the last sample whose
    absolute amplitude exceeds ``threshold`` (both inclusive), widened by
    ``pad`` samples on each side and clamped to the signal bounds.

    Args:
        wave: Audio samples, 1-D or 2-D.
        threshold: Absolute amplitude above which a sample counts as sound.
        pad: Number of extra samples to keep on each side.

    Returns:
        The trimmed 1-D signal, or the (mono-averaged) input unchanged when no
        sample exceeds ``threshold``.
    """
    if wave.ndim > 1:
        wave = wave.mean(axis=1)

    loud = np.flatnonzero(np.abs(wave) > threshold)
    if loud.size == 0:
        return wave

    start = max(int(loud[0]) - pad, 0)
    # Slice ends are exclusive, so add 1 to keep the last loud sample itself.
    end = min(int(loud[-1]) + pad + 1, wave.shape[0])
    return wave[start:end]
55
+
56
def normalize_peak(wave: np.ndarray, peak: float = 0.98) -> np.ndarray:
    """Scale ``wave`` so its largest absolute sample is approximately ``peak``.

    Args:
        wave: Audio samples as a NumPy array.
        peak: Target absolute peak amplitude.

    Returns:
        The scaled samples as ``float32``. An empty input is returned as an
        empty ``float32`` array.
    """
    # np.max raises ValueError on zero-size arrays, so short-circuit them.
    if wave.size == 0:
        return wave.astype(np.float32)
    # The epsilon keeps an all-zero signal from dividing by zero.
    m = np.max(np.abs(wave)) + 1e-9
    return (peak * wave / m).astype(np.float32)
59
+
60
def wav_bytes_from_array(y: np.ndarray, sr: int) -> bytes:
    """Encode samples ``y`` at sample rate ``sr`` as WAV and return the raw bytes."""
    with io.BytesIO() as wav_buffer:
        sf.write(wav_buffer, y, sr, format="WAV")
        # getvalue() yields the whole buffer regardless of the current position.
        return wav_buffer.getvalue()
65
 
66
+ # ----------------------------
67
+ # Inputs
68
+ # ----------------------------
69
+ ref_file = st.file_uploader("Upload your voice sample (wav/mp3/m4a/ogg/flac)", type=["wav", "mp3", "m4a", "ogg", "flac"])
70
  default_text = "یہ میری آواز کی مثال ہے۔ آپ یہاں اپنا متن لکھیں اور آڈیو حاصل کریں۔"
71
+ text = st.text_area("Urdu text", value=default_text, height=180, placeholder="یہاں اردو میں ٹیکسٹ لکھیں یا پیسٹ کریں…")
 
 
 
 
 
72
 
73
  col1, col2 = st.columns(2)
74
  with col1:
 
78
 
79
  if clear_btn:
80
  st.session_state.pop("audio_bytes", None)
 
81
  st.experimental_rerun()
82
 
83
  # ----------------------------
84
+ # Run synthesis
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
  # ----------------------------
86
  if run_btn:
87
  if not text.strip():
 
90
  st.warning("براہ کرم اپنی آواز کی آڈیو فائل اپلوڈ کریں۔")
91
  else:
92
  try:
93
+ # Save uploaded file to a temp path (XTTS can accept various formats via soundfile/ffmpeg backend)
94
+ tmp_ref = tempfile.NamedTemporaryFile(delete=False, suffix=f".{ref_file.name.split('.')[-1]}")
95
+ tmp_ref.write(ref_file.read())
96
+ tmp_ref.flush()
97
+ tmp_ref.close()
98
+
99
+ # Optional: quick silence trim to reduce leading/trailing gaps
100
+ try:
101
+ y_ref, sr_ref = sf.read(tmp_ref.name, dtype="float32", always_2d=False)
102
+ y_ref = simple_trim_silence(y_ref)
103
+ sf.write(tmp_ref.name, y_ref, sr_ref) # overwrite trimmed
104
+ except Exception:
105
+ # If reading/trim fails, keep original file
106
+ pass
107
+
108
+ st.info("Cloning voice and synthesizing Urdu… (CPU can take a bit on first run)")
109
 
 
 
110
  out_wav_path = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
111
 
112
+ # Generate audio
 
 
113
  tts.tts_to_file(
114
  text=text.strip(),
115
  file_path=out_wav_path,
116
+ speaker_wav=tmp_ref.name,
117
  language="ur",
118
+ # Common conditioning knobs
119
+ speaker_similarity=float(similarity_boost),
120
+ stability=float(stability),
121
+ style=float(style),
122
  split_sentences=True,
123
+ seed=int(seed),
 
 
 
 
 
124
  )
125
 
126
+ # Load, optional normalize, then serve
127
+ y, sr = sf.read(out_wav_path, dtype="float32", always_2d=False)
128
+ if normalize:
129
+ y = normalize_peak(y)
130
 
131
+ audio_bytes = wav_bytes_from_array(y, sr)
132
  st.session_state["audio_bytes"] = audio_bytes
 
133
 
134
+ # Cleanup temp files
135
  try:
136
+ os.remove(tmp_ref.name)
137
  os.remove(out_wav_path)
138
  except Exception:
139
  pass
140
 
141
+ st.success("آڈیو تیار ہے۔")
142
 
143
  except Exception as e:
144
  st.error(f"کچھ مسئلہ آیا: {e}")
145
 
146
  # ----------------------------
147
+ # Preview & download
148
  # ----------------------------
149
  if "audio_bytes" in st.session_state:
150
  st.markdown("### ▶️ Preview")
151
  st.audio(st.session_state["audio_bytes"], format="audio/wav")
152
 
153
  ts = datetime.now().strftime("%Y%m%d_%H%M%S")
154
+ fname = f"{(base_name or 'urdu_voice_clone').strip()}_{ts}.wav"
155
+ st.download_button("⬇️ Download WAV", data=st.session_state["audio_bytes"], file_name=fname, mime="audio/wav", use_container_width=True)
 
 
 
 
 
 
 
 
156
 
157
  st.markdown("---")
158
  st.caption(
159
+ "Tips: Use a clear 10–30 second reference with low noise. If cloning feels off, try a different sample, "
160
+ "raise Similarity slightly, or lower Stability a little."
161
  )