Spaces:

tee342
/

AudioMaster

Sleeping

App Files Files Community

tee342 committed on Jun 12, 2025

Commit

987f28e

verified ·

1 Parent(s): 7009896

Update app.py

Browse files

Files changed (1) hide show

app.py +68 -23

app.py CHANGED Viewed

@@ -24,6 +24,17 @@ from mutagen.id3 import ID3, TIT2, TPE1, TALB, TYER
 from TTS.api import TTS
 import pickle
 # Suppress warnings
 warnings.filterwarnings("ignore")
@@ -266,7 +277,7 @@ def batch_process_audio(files, selected_effects, isolate_vocals, preset_name, ex
     except Exception as e:
         return None, f"❌ Batch processing failed: {str(e)}"
-# === Whisper Transcription Tab ===
 whisper_model = WhisperModel("base")
 def transcribe_audio(audio_path):
@@ -274,7 +285,7 @@ def transcribe_audio(audio_path):
     text = " ".join([seg.text for seg in segments])
     return text
-# === TTS Voice Generator ===
 tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False)
 def generate_tts(text):
@@ -326,6 +337,31 @@ def mix_tracks(track1, track2, volume_offset=0):
     mixed.export(out_path, format="wav")
     return out_path
 # === Speaker Diarization ("Who Spoke When?") ===
 try:
     from pyannote.audio import Pipeline as DiarizationPipeline
@@ -334,31 +370,25 @@ try:
     hf_token = os.getenv("HF_TOKEN")
     if hf_token:
         login(token=hf_token)
-    else:
-        print("⚠️ HF_TOKEN not set – speaker diarization disabled")
     diarize_pipeline = DiarizationPipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token=hf_token or True)
-except ImportError:
     diarize_pipeline = None
-    print("⚠️ PyAnnote not installed – speaker diarization disabled")
 def diarize_and_transcribe(audio_path):
     if diarize_pipeline is None:
         return "⚠️ Diarization pipeline not loaded – check HF token or install pyannote.audio"
-    # Run diarization
     audio = AudioSegment.from_file(audio_path)
     temp_wav = os.path.join(tempfile.gettempdir(), "diarize.wav")
     audio.export(temp_wav, format="wav")
     try:
-        from pyannote.audio import Pipeline as DiarizationPipeline
         diarization = diarize_pipeline(temp_wav)
-        # Run transcription
         result = whisper.transcribe(temp_wav)
         segments = []
         for turn, _, speaker in diarization.itertracks(yield_label=True):
             text = " ".join([seg["text"] for seg in result["segments"] if seg["start"] >= turn.start and seg["end"] <= turn.end])
             segments.append({
@@ -462,6 +492,31 @@ with gr.Blocks(title="AI Audio Studio", css="style.css") as demo:
             description="Convert voice to text and edit it before exporting again."
         )
     # --- TTS Voice Generator ===
     with gr.Tab("💬 TTS Voice Generator"):
         gr.Interface(
@@ -472,16 +527,6 @@ with gr.Blocks(title="AI Audio Studio", css="style.css") as demo:
             description="Type anything and turn it into natural-sounding speech."
         )
-    # --- Speaker Diarization (Who Spoke When?) ===
-    with gr.Tab("🧏‍♂️ Who Spoke When?"):
-        gr.Interface(
-            fn=diarize_and_transcribe,
-            inputs=gr.Audio(label="Upload Interview/Podcast", type="filepath"),
-            outputs=gr.JSON(label="Diarized Transcript"),
-            title="Split By Speaker + Transcribe",
-            description="Detect speakers and transcribe their speech automatically."
-        )
     # --- Auto-Save / Resume Sessions ===
     session_state = gr.State()
@@ -517,7 +562,7 @@ with gr.Blocks(title="AI Audio Studio", css="style.css") as demo:
             outputs=[session_data, loaded_audio, loaded_preset, loaded_effects]
         )
-    # --- Trim Silence Automatically (VAD) ===
     with gr.Tab("✂️ Trim Silence Automatically"):
         gr.Interface(
             fn=detect_silence,
@@ -567,7 +612,7 @@ with gr.Blocks(title="AI Audio Studio", css="style.css") as demo:
             ],
             outputs=gr.File(label="Mixed Output"),
             title="Overlay Two Tracks",
-            description="Mix or subtract two audio files."
         )
 demo.launch()

 from TTS.api import TTS
 import pickle
+# Import OpenVoice, installing it from GitHub on first run if it is missing.
+# NOTE(review): confirm that `openvoice.api` really exports `TTS`; if it does not,
+# the retry below fails with the same ImportError even after a successful install.
+try:
+    from openvoice.api import TTS as OpenVoiceTTS, ToneColorConverter
+    from openvoice.se_extractor import get_se
+except ImportError:
+    print("Installing OpenVoice from GitHub...")
+    import importlib
+    import subprocess
+    import sys
+    # Use this interpreter's pip so the package lands in the environment that is
+    # actually running the app, and fail loudly if the install does not succeed
+    # instead of falling through to a confusing second ImportError.
+    subprocess.run(
+        [sys.executable, "-m", "pip", "install", "git+https://github.com/myshell-ai/OpenVoice.git"],
+        check=True,
+    )
+    # Make the freshly installed package visible to the import system.
+    importlib.invalidate_caches()
+    from openvoice.api import TTS as OpenVoiceTTS, ToneColorConverter
+    from openvoice.se_extractor import get_se
 # Suppress warnings
 warnings.filterwarnings("ignore")
     except Exception as e:
         return None, f"❌ Batch processing failed: {str(e)}"
+# === Transcribe & Edit Tab ===
 whisper_model = WhisperModel("base")
 def transcribe_audio(audio_path):
     text = " ".join([seg.text for seg in segments])
     return text
+# === TTS Tab ===
 tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False)
 def generate_tts(text):
     mixed.export(out_path, format="wav")
     return out_path
+# === Voice Cloning / Dubbing Tab ===
+def clone_voice(source_audio, target_audio, text):
+    """Synthesize `text` with the base TTS model, then apply tone-color conversion.
+
+    Args:
+        source_audio: Source voice clip, either a path string or an uploaded-file
+            object that exposes its path as `.name`.
+        target_audio: Target voice clip, in the same form as `source_audio`.
+        text: Text to synthesize.
+
+    Returns:
+        Path of the converted WAV file.
+
+    Raises:
+        gr.Error: If an input is missing or any step fails. The result feeds a
+            `gr.Audio(type="filepath")` output, so returning a message string
+            would be treated as a file path instead of being shown to the user.
+    """
+    if not source_audio or not target_audio:
+        raise gr.Error("⚠️ Cloning failed: both a source and a target voice clip are required")
+    if not text or not text.strip():
+        raise gr.Error("⚠️ Cloning failed: no text provided")
+    # gr.File may hand over a file wrapper rather than a plain path string.
+    source_path = getattr(source_audio, "name", source_audio)
+    target_path = getattr(target_audio, "name", target_audio)
+    try:
+        # get_se() needs the converter model to compute the speaker embedding.
+        source_se, _ = get_se(source_path, tone_converter)
+        target_se, _ = get_se(target_path, tone_converter)
+        # Unique files per call: a fixed name would be overwritten by overlapping
+        # requests, and converting a file onto itself risks reading a half-written input.
+        fd, base_path = tempfile.mkstemp(suffix="_base.wav")
+        os.close(fd)
+        fd, out_path = tempfile.mkstemp(suffix="_cloned.wav")
+        os.close(fd)
+        # Generate base TTS
+        tts.tts_to_file(text=text, file_path=base_path)
+        # Apply voice conversion
+        tone_converter.convert(
+            audio_src_path=base_path,
+            src_se=source_se,
+            tgt_se=target_se,
+            output_path=out_path
+        )
+        return out_path
+    except Exception as e:
+        raise gr.Error(f"⚠️ Cloning failed: {str(e)}") from e
+# Module-level OpenVoice objects. clone_voice() looks up `tone_converter` when it
+# is called, so defining it after the function body is fine.
+# NOTE(review): upstream OpenVoice examples build the converter from a checkpoint
+# config path and then load weights — confirm a no-argument constructor works here.
+tone_converter = ToneColorConverter().to("cuda" if torch.cuda.is_available() else "cpu")
+# NOTE(review): `openvoice_tts` is not referenced anywhere in the visible code.
+openvoice_tts = OpenVoiceTTS(lang='en')
 # === Speaker Diarization ("Who Spoke When?") ===
 try:
     from pyannote.audio import Pipeline as DiarizationPipeline
     hf_token = os.getenv("HF_TOKEN")
     if hf_token:
         login(token=hf_token)
     diarize_pipeline = DiarizationPipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token=hf_token or True)
+except Exception as e:
     diarize_pipeline = None
+    print(f"⚠️ Failed to load diarization: {e}")
 def diarize_and_transcribe(audio_path):
     if diarize_pipeline is None:
         return "⚠️ Diarization pipeline not loaded – check HF token or install pyannote.audio"
     audio = AudioSegment.from_file(audio_path)
     temp_wav = os.path.join(tempfile.gettempdir(), "diarize.wav")
     audio.export(temp_wav, format="wav")
     try:
         diarization = diarize_pipeline(temp_wav)
         result = whisper.transcribe(temp_wav)
         segments = []
         for turn, _, speaker in diarization.itertracks(yield_label=True):
             text = " ".join([seg["text"] for seg in result["segments"] if seg["start"] >= turn.start and seg["end"] <= turn.end])
             segments.append({
             description="Convert voice to text and edit it before exporting again."
         )
+    # --- Voice Cloning (Dubbing) ===
+    with gr.Tab("🎭 Voice Cloning (Dubbing)"):
+        gr.Interface(
+            fn=clone_voice,
+            inputs=[
+                gr.File(label="Source Voice Clip"),
+                gr.File(label="Target Voice Clip"),
+                gr.Textbox(label="Text to Clone", lines=5)
+            ],
+            outputs=gr.Audio(label="Cloned Output", type="filepath"),
+            title="Replace One Voice With Another",
+            description="Clone voice from source to target speaker using AI"
+        )
+    # --- Speaker Diarization (Who Spoke When?) ===
+    if diarize_pipeline:
+        with gr.Tab("🧏‍♂️ Who Spoke When?"):
+            gr.Interface(
+                fn=diarize_and_transcribe,
+                inputs=gr.Audio(label="Upload Interview/Podcast", type="filepath"),
+                outputs=gr.JSON(label="Diarized Transcript"),
+                title="Split By Speaker + Transcribe",
+                description="Detect speakers and transcribe their speech automatically."
+            )
     # --- TTS Voice Generator ===
     with gr.Tab("💬 TTS Voice Generator"):
         gr.Interface(
             description="Type anything and turn it into natural-sounding speech."
         )
     # --- Auto-Save / Resume Sessions ===
     session_state = gr.State()
             outputs=[session_data, loaded_audio, loaded_preset, loaded_effects]
         )
+    # --- VAD – Detect & Remove Silence ===
     with gr.Tab("✂️ Trim Silence Automatically"):
         gr.Interface(
             fn=detect_silence,
             ],
             outputs=gr.File(label="Mixed Output"),
             title="Overlay Two Tracks",
+            description="Mix, blend, or subtract two audio files."
         )
 demo.launch()