mehdilaalali committed on
Commit
3cda682
·
verified ·
1 Parent(s): f83030e

feat: add ability to clone voice natively from URLs (YouTube, TikTok) using yt-dlp

Browse files
Files changed (1) hide show
  1. app.py +43 -8
app.py CHANGED
@@ -4,6 +4,7 @@ import tempfile
4
  import gradio as gr
5
  from pathlib import Path
6
  import base64
 
7
  from mistralai.client import Mistral
8
 
9
  def list_user_voices():
@@ -107,23 +108,53 @@ def synthesize_speech(text, voice_id_input, ref_audio_path, audio_format):
107
 
108
 
109
  # ─── Voice Cloning ────────────────────────────────────────────────────────────
110
- def clone_voice(audio_path, voice_name, gender, languages_str):
111
- """Upload a sample audio to create a reusable cloned voice."""
112
- if audio_path is None:
113
- return "⚠️ Please upload a sample audio clip."
114
  if not voice_name.strip():
115
- return "⚠️ Please enter a name for the voice."
 
 
 
116
  try:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
  client = get_client()
118
- sample_b64 = base64.b64encode(Path(audio_path).read_bytes()).decode()
119
  langs = [l.strip() for l in languages_str.split(",") if l.strip()] or ["en"]
120
  voice = client.audio.voices.create(
121
  name=voice_name.strip(),
122
  sample_audio=sample_b64,
123
- sample_filename=Path(audio_path).name,
124
  languages=langs,
125
  gender=gender.lower(),
126
  )
 
 
 
 
 
127
  # Build new choices specifically for this user session: Official Voices + Their new clone
128
  new_session_choices = get_voice_choices() + [(f"{voice.name} (Custom Session Clone)", voice.id)]
129
  return (
@@ -373,6 +404,10 @@ with gr.Blocks(title="Voxtral Studio — Mistral AI Audio") as demo:
373
  type="filepath",
374
  elem_classes=["audio-component"],
375
  )
 
 
 
 
376
  clone_name = gr.Textbox(
377
  label="Voice Name",
378
  placeholder="e.g. my-assistant-voice",
@@ -397,7 +432,7 @@ with gr.Blocks(title="Voxtral Studio — Mistral AI Audio") as demo:
397
 
398
  clone_btn.click(
399
  fn=clone_voice,
400
- inputs=[clone_audio, clone_name, clone_gender, clone_langs],
401
  outputs=[clone_result, tts_voice_id],
402
  )
403
 
 
4
  import gradio as gr
5
  from pathlib import Path
6
  import base64
7
+ import os
8
  from mistralai.client import Mistral
9
 
10
  def list_user_voices():
 
108
 
109
 
110
  # ─── Voice Cloning ────────────────────────────────────────────────────────────
111
+ def clone_voice(audio_path, url_input, voice_name, gender, languages_str):
112
+ """Upload a sample audio or provide a URL to create a reusable cloned voice."""
113
+ if not audio_path and not url_input.strip():
114
+ return "⚠️ Please upload an audio clip or provide a media URL.", gr.update()
115
  if not voice_name.strip():
116
+ return "⚠️ Please enter a name for the voice.", gr.update()
117
+
118
+ final_audio_path = audio_path
119
+
120
  try:
121
+ # If URL is provided, download it with yt-dlp
122
+ if url_input.strip():
123
+ import yt_dlp
124
+ base_out = tempfile.mktemp()
125
+ ydl_opts = {
126
+ 'format': 'bestaudio/best',
127
+ 'outtmpl': base_out + '.%(ext)s',
128
+ 'quiet': True,
129
+ 'postprocessors': [{
130
+ 'key': 'FFmpegExtractAudio',
131
+ 'preferredcodec': 'mp3',
132
+ 'preferredquality': '128',
133
+ }],
134
+ 'postprocessor_args': [
135
+ '-t', '60' # Limit to first 60 seconds to avoid exceeding API limits
136
+ ],
137
+ }
138
+ with yt_dlp.YoutubeDL(ydl_opts) as ydl:
139
+ info = ydl.extract_info(url_input.strip(), download=True)
140
+ # after postprocessing, file has .mp3 extension
141
+ final_audio_path = base_out + '.mp3'
142
+
143
  client = get_client()
144
+ sample_b64 = base64.b64encode(Path(final_audio_path).read_bytes()).decode()
145
  langs = [l.strip() for l in languages_str.split(",") if l.strip()] or ["en"]
146
  voice = client.audio.voices.create(
147
  name=voice_name.strip(),
148
  sample_audio=sample_b64,
149
+ sample_filename=Path(final_audio_path).name,
150
  languages=langs,
151
  gender=gender.lower(),
152
  )
153
+
154
+ # Clean up downloaded file
155
+ if url_input.strip() and os.path.exists(final_audio_path):
156
+ try: os.remove(final_audio_path)
157
+ except: pass
158
  # Build new choices specifically for this user session: Official Voices + Their new clone
159
  new_session_choices = get_voice_choices() + [(f"{voice.name} (Custom Session Clone)", voice.id)]
160
  return (
 
404
  type="filepath",
405
  elem_classes=["audio-component"],
406
  )
407
+ clone_url = gr.Textbox(
408
+ label="OR: Media URL (YouTube, TikTok, MP3, etc.)",
409
+ placeholder="https://www.youtube.com/watch?v=...",
410
+ )
411
  clone_name = gr.Textbox(
412
  label="Voice Name",
413
  placeholder="e.g. my-assistant-voice",
 
432
 
433
  clone_btn.click(
434
  fn=clone_voice,
435
+ inputs=[clone_audio, clone_url, clone_name, clone_gender, clone_langs],
436
  outputs=[clone_result, tts_voice_id],
437
  )
438