salomonsky committed on
Commit
90d9071
·
verified ·
1 Parent(s): 8132a8c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -54
app.py CHANGED
@@ -7,6 +7,10 @@ import torch
7
  import soundfile as sf
8
  import gradio as gr
9
  from pathlib import Path
 
 
 
 
10
 
11
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
12
  logger = logging.getLogger(__name__)
@@ -21,18 +25,13 @@ try:
21
  except ImportError:
22
  pass
23
  except Exception as e:
24
- logger.warning(f"No se pudo aplicar el parche de seguridad de TTS: {e}")
25
-
26
- from transformers import pipeline
27
- from demucs.pretrained import get_model
28
- from demucs.apply import apply_model
29
- import librosa
30
 
31
  class ProcessingManager:
32
  def __init__(self):
33
  self.device = "cuda" if torch.cuda.is_available() else "cpu"
34
  self.models = {}
35
- self.temp_dir = Path(tempfile.gettempdir()) / "song_translator"
36
  self.temp_dir.mkdir(exist_ok=True)
37
 
38
  def get_whisper(self, model_size="large-v3"):
@@ -46,23 +45,6 @@ class ProcessingManager:
46
  )
47
  return self.models[key]
48
 
49
- def get_translator(self, src, tgt):
50
- key = f"trans_{src}_{tgt}"
51
- if key not in self.models:
52
- try:
53
- model_name = f"Helsinki-NLP/opus-mt-{src}-{tgt}"
54
- self.models[key] = pipeline("translation", model=model_name, device=self.device)
55
- except Exception:
56
- # Fallback a NLLB si el par de idiomas no existe en Helsinki-NLP
57
- self.models[key] = pipeline(
58
- "translation",
59
- model="facebook/nllb-200-distilled-600M",
60
- device=self.device,
61
- src_lang=f"{src}_Latn",
62
- tgt_lang=f"{tgt}_Latn"
63
- )
64
- return self.models[key]
65
-
66
  def get_demucs(self):
67
  if "demucs" not in self.models:
68
  self.models["demucs"] = get_model("htdemucs")
@@ -78,8 +60,7 @@ manager = ProcessingManager()
78
 
79
  def process_audio_pipeline(
80
  audio_path,
81
- src_lang,
82
- tgt_lang,
83
  speaker_ref_path,
84
  voice_cleanup_slider,
85
  pitch_shift,
@@ -88,6 +69,9 @@ def process_audio_pipeline(
88
  try:
89
  if not audio_path:
90
  raise ValueError("No audio file provided")
 
 
 
91
 
92
  progress(0.1, desc="Separating Vocals...")
93
  demucs_model = manager.get_demucs()
@@ -109,26 +93,20 @@ def process_audio_pipeline(
109
  sf.write(vocal_path, vocals.T, 44100)
110
  sf.write(inst_path, instrumental.T, 44100)
111
 
112
- progress(0.3, desc="Transcribing...")
113
  whisper = manager.get_whisper()
114
- transcription = whisper(str(vocal_path), generate_kwargs={"task": "transcribe", "language": src_lang})
115
  original_text = transcription["text"]
116
 
117
- progress(0.5, desc="Translating...")
118
- translator = manager.get_translator(src_lang, tgt_lang)
119
- trans_output = translator(original_text)
120
- translated_text = trans_output[0]['translation_text'] if isinstance(trans_output, list) else trans_output['translation_text']
121
-
122
- progress(0.7, desc="Synthesizing Vocals...")
123
  tts_model = manager.get_tts()
124
 
125
- ref_audio = speaker_ref_path if speaker_ref_path else str(vocal_path)
126
  output_tts_path = manager.temp_dir / "tts_output.wav"
127
 
128
  tts_model.tts_to_file(
129
- text=translated_text,
130
- speaker_wav=ref_audio,
131
- language=tgt_lang,
132
  file_path=str(output_tts_path),
133
  split_sentences=True
134
  )
@@ -148,37 +126,34 @@ def process_audio_pipeline(
148
  str(vocal_path),
149
  str(inst_path),
150
  str(output_tts_path),
151
- original_text,
152
- translated_text
153
  )
154
 
155
  except Exception as e:
156
  logger.error(f"Pipeline failed: {str(e)}", exc_info=True)
157
- return None, None, None, None, f"Error: {str(e)}", ""
158
 
159
  custom_css = """
160
  .container { max_width: 900px; margin: auto; }
161
  .gr-box { border-radius: 10px !important; border: 1px solid #e0e0e0; box-shadow: 0 4px 6px rgba(0,0,0,0.05); }
162
  """
163
 
164
- with gr.Blocks(title="AI Song Translator") as demo:
165
- gr.Markdown("# 🎵 AI Song Translator Pro")
166
 
167
  with gr.Row():
168
  with gr.Column(scale=1, variant="panel"):
169
  gr.Markdown("### 1. Input & Settings")
170
  input_audio = gr.Audio(label="Source Song", type="filepath")
171
- ref_audio = gr.Audio(label="Voice Reference (Optional)", type="filepath")
172
 
173
- with gr.Row():
174
- src_lang = gr.Dropdown(["en", "es", "fr", "it", "de", "pt", "ja"], value="en", label="Source")
175
- tgt_lang = gr.Dropdown(["en", "es", "fr", "it", "de", "pt", "ja"], value="es", label="Target")
176
 
177
  with gr.Accordion("Advanced Audio", open=False):
178
  cleanup = gr.Slider(0, 1, value=0.5, label="Voice Cleanup")
179
  pitch = gr.Slider(-12, 12, value=0, step=1, label="Pitch Shift")
180
 
181
- btn_process = gr.Button("🚀 Start Processing", variant="primary", size="lg")
182
 
183
  with gr.Column(scale=1, variant="panel"):
184
  gr.Markdown("### 2. Output Results")
@@ -186,18 +161,17 @@ with gr.Blocks(title="AI Song Translator") as demo:
186
 
187
  with gr.Tabs():
188
  with gr.Tab("Lyrics"):
189
- orig_txt = gr.Textbox(label="Original Lyrics", lines=4, interactive=False)
190
- trans_txt = gr.Textbox(label="Translated Lyrics", lines=4, interactive=False)
191
 
192
  with gr.Tab("Stems"):
193
- voc_out = gr.Audio(label="Extracted Vocals")
194
  inst_out = gr.Audio(label="Instrumental")
195
- tts_out = gr.Audio(label="Raw TTS")
196
 
197
  btn_process.click(
198
  fn=process_audio_pipeline,
199
- inputs=[input_audio, src_lang, tgt_lang, ref_audio, cleanup, pitch],
200
- outputs=[final_output, voc_out, inst_out, tts_out, orig_txt, trans_txt]
201
  )
202
 
203
  if __name__ == "__main__":
 
7
  import soundfile as sf
8
  import gradio as gr
9
  from pathlib import Path
10
+ import librosa
11
+ from transformers import pipeline
12
+ from demucs.pretrained import get_model
13
+ from demucs.apply import apply_model
14
 
15
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
16
  logger = logging.getLogger(__name__)
 
25
  except ImportError:
26
  pass
27
  except Exception as e:
28
+ logger.warning(f"{e}")
 
 
 
 
 
29
 
30
  class ProcessingManager:
31
  def __init__(self):
32
  self.device = "cuda" if torch.cuda.is_available() else "cpu"
33
  self.models = {}
34
+ self.temp_dir = Path(tempfile.gettempdir()) / "voice_mask_pro"
35
  self.temp_dir.mkdir(exist_ok=True)
36
 
37
  def get_whisper(self, model_size="large-v3"):
 
45
  )
46
  return self.models[key]
47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  def get_demucs(self):
49
  if "demucs" not in self.models:
50
  self.models["demucs"] = get_model("htdemucs")
 
60
 
61
  def process_audio_pipeline(
62
  audio_path,
63
+ language,
 
64
  speaker_ref_path,
65
  voice_cleanup_slider,
66
  pitch_shift,
 
69
  try:
70
  if not audio_path:
71
  raise ValueError("No audio file provided")
72
+
73
+ if not speaker_ref_path:
74
+ raise ValueError("Reference voice (MP3) is required")
75
 
76
  progress(0.1, desc="Separating Vocals...")
77
  demucs_model = manager.get_demucs()
 
93
  sf.write(vocal_path, vocals.T, 44100)
94
  sf.write(inst_path, instrumental.T, 44100)
95
 
96
+ progress(0.4, desc="Transcribing...")
97
  whisper = manager.get_whisper()
98
+ transcription = whisper(str(vocal_path), generate_kwargs={"task": "transcribe", "language": language})
99
  original_text = transcription["text"]
100
 
101
+ progress(0.6, desc="Synthesizing with Reference Voice...")
 
 
 
 
 
102
  tts_model = manager.get_tts()
103
 
 
104
  output_tts_path = manager.temp_dir / "tts_output.wav"
105
 
106
  tts_model.tts_to_file(
107
+ text=original_text,
108
+ speaker_wav=speaker_ref_path,
109
+ language=language,
110
  file_path=str(output_tts_path),
111
  split_sentences=True
112
  )
 
126
  str(vocal_path),
127
  str(inst_path),
128
  str(output_tts_path),
129
+ original_text
 
130
  )
131
 
132
  except Exception as e:
133
  logger.error(f"Pipeline failed: {str(e)}", exc_info=True)
134
+ return None, None, None, None, f"Error: {str(e)}"
135
 
136
  custom_css = """
137
  .container { max_width: 900px; margin: auto; }
138
  .gr-box { border-radius: 10px !important; border: 1px solid #e0e0e0; box-shadow: 0 4px 6px rgba(0,0,0,0.05); }
139
  """
140
 
141
+ with gr.Blocks(title="AI Voice Masker") as demo:
142
+ gr.Markdown("# 🎤 AI Voice Masker")
143
 
144
  with gr.Row():
145
  with gr.Column(scale=1, variant="panel"):
146
  gr.Markdown("### 1. Input & Settings")
147
  input_audio = gr.Audio(label="Source Song", type="filepath")
148
+ ref_audio = gr.Audio(label="Reference Voice (MP3 Required)", type="filepath")
149
 
150
+ language = gr.Dropdown(["en", "es", "fr", "it", "de", "pt", "ja"], value="es", label="Song Language")
 
 
151
 
152
  with gr.Accordion("Advanced Audio", open=False):
153
  cleanup = gr.Slider(0, 1, value=0.5, label="Voice Cleanup")
154
  pitch = gr.Slider(-12, 12, value=0, step=1, label="Pitch Shift")
155
 
156
+ btn_process = gr.Button("🚀 Start Masking", variant="primary", size="lg")
157
 
158
  with gr.Column(scale=1, variant="panel"):
159
  gr.Markdown("### 2. Output Results")
 
161
 
162
  with gr.Tabs():
163
  with gr.Tab("Lyrics"):
164
+ orig_txt = gr.Textbox(label="Transcribed Lyrics", lines=8, interactive=False)
 
165
 
166
  with gr.Tab("Stems"):
167
+ voc_out = gr.Audio(label="Original Vocals")
168
  inst_out = gr.Audio(label="Instrumental")
169
+ tts_out = gr.Audio(label="Generated Vocals (Raw)")
170
 
171
  btn_process.click(
172
  fn=process_audio_pipeline,
173
+ inputs=[input_audio, language, ref_audio, cleanup, pitch],
174
+ outputs=[final_output, voc_out, inst_out, tts_out, orig_txt]
175
  )
176
 
177
  if __name__ == "__main__":