don0726 committed on
Commit
545eaa6
·
verified ·
1 Parent(s): 41ed8fc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +51 -62
app.py CHANGED
@@ -2,93 +2,82 @@ import gradio as gr
2
  import torch
3
  import torchaudio
4
  import tempfile
 
 
 
5
  import os
6
- from TTS.api import TTS
7
-
8
- # ---------------------------
9
- # Device setup (CPU only)
10
- # ---------------------------
11
- device = "cpu"
12
-
13
- print("Loading XTTS model...")
14
- tts = TTS(
15
- model_name="tts_models/multilingual/multi-dataset/xtts_v2",
16
- progress_bar=False
17
- ).to(device)
 
 
 
 
 
18
  print("Model loaded!")
19
 
20
- # ---------------------------
21
  # Voice cloning function
22
- # ---------------------------
23
  def clone_voice(audio_file, text, lang):
24
  try:
25
  if audio_file is None:
26
- return None, "❌ Please upload audio"
27
-
28
- if text.strip() == "":
29
- return None, "❌ Please enter text"
30
 
31
- # CPU safety limit
32
  if len(text) > 200:
33
- return None, "❌ Text too long (max 200 chars for CPU)"
34
 
35
  # Load audio
36
- waveform, sr = torchaudio.load(audio_file)
37
-
38
- # Convert to mono
39
- if waveform.shape[0] > 1:
40
- waveform = waveform.mean(dim=0, keepdim=True)
41
 
42
- # Save temp speaker audio
43
- with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
44
- speaker_path = tmp.name
45
- torchaudio.save(speaker_path, waveform, sr)
46
 
47
- # Output file
48
- output_path = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
 
49
 
50
- # Generate speech
51
- tts.tts_to_file(
52
- text=text,
 
53
  speaker_wav=speaker_path,
54
- language=lang,
55
- file_path=output_path,
56
- speed=1.1 # slight speed boost
57
  )
58
 
59
- return output_path, "✅ Success"
 
 
 
60
 
61
  except Exception as e:
62
- return None, f"❌ Error: {str(e)}"
63
 
64
 
65
- # ---------------------------
66
- # Gradio UI
67
- # ---------------------------
68
  with gr.Blocks() as demo:
69
- gr.Markdown("# 🎤 XTTS Voice Cloning (CPU Space)")
70
- gr.Markdown("Upload a voice sample, enter text, choose language")
71
-
72
- with gr.Row():
73
- audio_input = gr.Audio(type="filepath", label="🎙 Sample Voice")
74
- text_input = gr.Textbox(label="📝 Text", placeholder="Enter text here...")
75
 
76
- lang_input = gr.Textbox(
77
- label="🌐 Language Code",
78
- value="en",
79
- placeholder="en, hi, fr, de..."
80
- )
81
 
82
- generate_btn = gr.Button("🚀 Generate")
83
 
84
- output_audio = gr.Audio(label="🔊 Output")
85
- status = gr.Textbox(label="Status")
86
 
87
- generate_btn.click(
88
- fn=clone_voice,
89
- inputs=[audio_input, text_input, lang_input],
90
- outputs=[output_audio, status]
91
- )
92
 
93
- # Required for Hugging Face Spaces
94
  demo.launch(server_name="0.0.0.0", server_port=7860)
 
2
  import torch
3
  import torchaudio
4
  import tempfile
5
+ from TTS.tts.models.xtts import Xtts
6
+ from TTS.utils.audio import AudioProcessor
7
+ from TTS.config import load_config
8
  import os
9
+
10
+ # -------------------------
11
+ # Load model manually (no heavy install)
12
+ # -------------------------
13
+ MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"
14
+
15
+ print("Loading model...")
16
+
17
+ from huggingface_hub import snapshot_download
18
+
19
+ model_path = snapshot_download(repo_id="coqui/XTTS-v2")
20
+
21
+ config = load_config(os.path.join(model_path, "config.json"))
22
+ model = Xtts.init_from_config(config)
23
+ model.load_checkpoint(config, checkpoint_dir=model_path)
24
+ model.eval()
25
+
26
  print("Model loaded!")
27
 
28
+ # -------------------------
29
  # Voice cloning function
30
+ # -------------------------
31
  def clone_voice(audio_file, text, lang):
32
  try:
33
  if audio_file is None:
34
+ return None, "Upload audio"
 
 
 
35
 
 
36
  if len(text) > 200:
37
+ return None, "Text too long (max 200 chars)"
38
 
39
  # Load audio
40
+ wav, sr = torchaudio.load(audio_file)
 
 
 
 
41
 
42
+ if wav.shape[0] > 1:
43
+ wav = wav.mean(dim=0, keepdim=True)
 
 
44
 
45
+ # Save temp speaker
46
+ speaker_path = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
47
+ torchaudio.save(speaker_path, wav, sr)
48
 
49
+ # Generate
50
+ outputs = model.synthesize(
51
+ text,
52
+ config,
53
  speaker_wav=speaker_path,
54
+ language=lang
 
 
55
  )
56
 
57
+ out_path = tempfile.NamedTemporaryFile(delete=False, suffix=".wav").name
58
+ torchaudio.save(out_path, torch.tensor(outputs["wav"]).unsqueeze(0), 24000)
59
+
60
+ return out_path, "Success"
61
 
62
  except Exception as e:
63
+ return None, str(e)
64
 
65
 
66
+ # -------------------------
67
+ # UI
68
+ # -------------------------
69
  with gr.Blocks() as demo:
70
+ gr.Markdown("# XTTS Voice Cloning (CPU Fixed)")
 
 
 
 
 
71
 
72
+ audio = gr.Audio(type="filepath", label="Voice Sample")
73
+ text = gr.Textbox(label="Text")
74
+ lang = gr.Textbox(value="en", label="Language")
 
 
75
 
76
+ btn = gr.Button("Generate")
77
 
78
+ out_audio = gr.Audio()
79
+ status = gr.Textbox()
80
 
81
+ btn.click(clone_voice, [audio, text, lang], [out_audio, status])
 
 
 
 
82
 
 
83
  demo.launch(server_name="0.0.0.0", server_port=7860)