Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -24,6 +24,17 @@ from mutagen.id3 import ID3, TIT2, TPE1, TALB, TYER
|
|
| 24 |
from TTS.api import TTS
|
| 25 |
import pickle
|
| 26 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
# Suppress warnings
|
| 28 |
warnings.filterwarnings("ignore")
|
| 29 |
|
|
@@ -266,7 +277,7 @@ def batch_process_audio(files, selected_effects, isolate_vocals, preset_name, ex
|
|
| 266 |
except Exception as e:
|
| 267 |
return None, f"❌ Batch processing failed: {str(e)}"
|
| 268 |
|
| 269 |
-
# ===
|
| 270 |
whisper_model = WhisperModel("base")
|
| 271 |
|
| 272 |
def transcribe_audio(audio_path):
|
|
@@ -274,7 +285,7 @@ def transcribe_audio(audio_path):
|
|
| 274 |
text = " ".join([seg.text for seg in segments])
|
| 275 |
return text
|
| 276 |
|
| 277 |
-
# === TTS
|
| 278 |
tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False)
|
| 279 |
|
| 280 |
def generate_tts(text):
|
|
@@ -326,6 +337,31 @@ def mix_tracks(track1, track2, volume_offset=0):
|
|
| 326 |
mixed.export(out_path, format="wav")
|
| 327 |
return out_path
|
| 328 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 329 |
# === Speaker Diarization ("Who Spoke When?") ===
|
| 330 |
try:
|
| 331 |
from pyannote.audio import Pipeline as DiarizationPipeline
|
|
@@ -334,31 +370,25 @@ try:
|
|
| 334 |
hf_token = os.getenv("HF_TOKEN")
|
| 335 |
if hf_token:
|
| 336 |
login(token=hf_token)
|
| 337 |
-
else:
|
| 338 |
-
print("⚠️ HF_TOKEN not set – speaker diarization disabled")
|
| 339 |
-
|
| 340 |
diarize_pipeline = DiarizationPipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token=hf_token or True)
|
| 341 |
-
except
|
| 342 |
diarize_pipeline = None
|
| 343 |
-
print("⚠️
|
| 344 |
|
| 345 |
def diarize_and_transcribe(audio_path):
|
| 346 |
if diarize_pipeline is None:
|
| 347 |
return "⚠️ Diarization pipeline not loaded – check HF token or install pyannote.audio"
|
| 348 |
|
| 349 |
-
# Run diarization
|
| 350 |
audio = AudioSegment.from_file(audio_path)
|
| 351 |
temp_wav = os.path.join(tempfile.gettempdir(), "diarize.wav")
|
| 352 |
audio.export(temp_wav, format="wav")
|
| 353 |
|
| 354 |
try:
|
| 355 |
-
from pyannote.audio import Pipeline as DiarizationPipeline
|
| 356 |
diarization = diarize_pipeline(temp_wav)
|
| 357 |
|
| 358 |
-
# Run transcription
|
| 359 |
result = whisper.transcribe(temp_wav)
|
| 360 |
-
|
| 361 |
segments = []
|
|
|
|
| 362 |
for turn, _, speaker in diarization.itertracks(yield_label=True):
|
| 363 |
text = " ".join([seg["text"] for seg in result["segments"] if seg["start"] >= turn.start and seg["end"] <= turn.end])
|
| 364 |
segments.append({
|
|
@@ -462,6 +492,31 @@ with gr.Blocks(title="AI Audio Studio", css="style.css") as demo:
|
|
| 462 |
description="Convert voice to text and edit it before exporting again."
|
| 463 |
)
|
| 464 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 465 |
# --- TTS Voice Generator ===
|
| 466 |
with gr.Tab("💬 TTS Voice Generator"):
|
| 467 |
gr.Interface(
|
|
@@ -472,16 +527,6 @@ with gr.Blocks(title="AI Audio Studio", css="style.css") as demo:
|
|
| 472 |
description="Type anything and turn it into natural-sounding speech."
|
| 473 |
)
|
| 474 |
|
| 475 |
-
# --- Speaker Diarization (Who Spoke When?) ===
|
| 476 |
-
with gr.Tab("🧏♂️ Who Spoke When?"):
|
| 477 |
-
gr.Interface(
|
| 478 |
-
fn=diarize_and_transcribe,
|
| 479 |
-
inputs=gr.Audio(label="Upload Interview/Podcast", type="filepath"),
|
| 480 |
-
outputs=gr.JSON(label="Diarized Transcript"),
|
| 481 |
-
title="Split By Speaker + Transcribe",
|
| 482 |
-
description="Detect speakers and transcribe their speech automatically."
|
| 483 |
-
)
|
| 484 |
-
|
| 485 |
# --- Auto-Save / Resume Sessions ===
|
| 486 |
session_state = gr.State()
|
| 487 |
|
|
@@ -517,7 +562,7 @@ with gr.Blocks(title="AI Audio Studio", css="style.css") as demo:
|
|
| 517 |
outputs=[session_data, loaded_audio, loaded_preset, loaded_effects]
|
| 518 |
)
|
| 519 |
|
| 520 |
-
# ---
|
| 521 |
with gr.Tab("✂️ Trim Silence Automatically"):
|
| 522 |
gr.Interface(
|
| 523 |
fn=detect_silence,
|
|
@@ -567,7 +612,7 @@ with gr.Blocks(title="AI Audio Studio", css="style.css") as demo:
|
|
| 567 |
],
|
| 568 |
outputs=gr.File(label="Mixed Output"),
|
| 569 |
title="Overlay Two Tracks",
|
| 570 |
-
description="Mix or subtract two audio files."
|
| 571 |
)
|
| 572 |
|
| 573 |
demo.launch()
|
|
|
|
| 24 |
from TTS.api import TTS
|
| 25 |
import pickle
|
| 26 |
|
| 27 |
+
# Try to install OpenVoice from GitHub if not found
|
| 28 |
+
try:
|
| 29 |
+
from openvoice.api import TTS as OpenVoiceTTS, ToneColorConverter
|
| 30 |
+
from openvoice.se_extractor import get_se
|
| 31 |
+
except ImportError:
|
| 32 |
+
print("Installing OpenVoice from GitHub...")
|
| 33 |
+
import subprocess
|
| 34 |
+
subprocess.run(["pip", "install", "git+https://github.com/myshell-ai/OpenVoice.git"])
|
| 35 |
+
from openvoice.api import TTS as OpenVoiceTTS, ToneColorConverter
|
| 36 |
+
from openvoice.se_extractor import get_se
|
| 37 |
+
|
| 38 |
# Suppress warnings
|
| 39 |
warnings.filterwarnings("ignore")
|
| 40 |
|
|
|
|
| 277 |
except Exception as e:
|
| 278 |
return None, f"❌ Batch processing failed: {str(e)}"
|
| 279 |
|
| 280 |
+
# === Transcribe & Edit Tab ===
|
| 281 |
whisper_model = WhisperModel("base")
|
| 282 |
|
| 283 |
def transcribe_audio(audio_path):
|
|
|
|
| 285 |
text = " ".join([seg.text for seg in segments])
|
| 286 |
return text
|
| 287 |
|
| 288 |
+
# === TTS Tab ===
|
| 289 |
tts = TTS(model_name="tts_models/en/ljspeech/tacotron2-DDC", progress_bar=False)
|
| 290 |
|
| 291 |
def generate_tts(text):
|
|
|
|
| 337 |
mixed.export(out_path, format="wav")
|
| 338 |
return out_path
|
| 339 |
|
| 340 |
+
# === Voice Cloning / Dubbing Tab ===
|
| 341 |
+
def clone_voice(source_audio, target_audio, text):
|
| 342 |
+
try:
|
| 343 |
+
source_se, _ = get_se(source_audio)
|
| 344 |
+
target_se, _ = get_se(target_audio)
|
| 345 |
+
|
| 346 |
+
# Generate base TTS
|
| 347 |
+
out_path = os.path.join(tempfile.gettempdir(), "cloned_output.wav")
|
| 348 |
+
tts.tts_to_file(text=text, file_path=out_path)
|
| 349 |
+
|
| 350 |
+
# Apply voice conversion
|
| 351 |
+
tone_converter.convert(
|
| 352 |
+
audio_src_path=out_path,
|
| 353 |
+
src_se=source_se,
|
| 354 |
+
tgt_se=target_se,
|
| 355 |
+
output_path=out_path
|
| 356 |
+
)
|
| 357 |
+
|
| 358 |
+
return out_path
|
| 359 |
+
except Exception as e:
|
| 360 |
+
return f"⚠️ Cloning failed: {str(e)}"
|
| 361 |
+
|
| 362 |
+
tone_converter = ToneColorConverter().to("cuda" if torch.cuda.is_available() else "cpu")
|
| 363 |
+
openvoice_tts = OpenVoiceTTS(lang='en')
|
| 364 |
+
|
| 365 |
# === Speaker Diarization ("Who Spoke When?") ===
|
| 366 |
try:
|
| 367 |
from pyannote.audio import Pipeline as DiarizationPipeline
|
|
|
|
| 370 |
hf_token = os.getenv("HF_TOKEN")
|
| 371 |
if hf_token:
|
| 372 |
login(token=hf_token)
|
|
|
|
|
|
|
|
|
|
| 373 |
diarize_pipeline = DiarizationPipeline.from_pretrained("pyannote/speaker-diarization", use_auth_token=hf_token or True)
|
| 374 |
+
except Exception as e:
|
| 375 |
diarize_pipeline = None
|
| 376 |
+
print(f"⚠️ Failed to load diarization: {e}")
|
| 377 |
|
| 378 |
def diarize_and_transcribe(audio_path):
|
| 379 |
if diarize_pipeline is None:
|
| 380 |
return "⚠️ Diarization pipeline not loaded – check HF token or install pyannote.audio"
|
| 381 |
|
|
|
|
| 382 |
audio = AudioSegment.from_file(audio_path)
|
| 383 |
temp_wav = os.path.join(tempfile.gettempdir(), "diarize.wav")
|
| 384 |
audio.export(temp_wav, format="wav")
|
| 385 |
|
| 386 |
try:
|
|
|
|
| 387 |
diarization = diarize_pipeline(temp_wav)
|
| 388 |
|
|
|
|
| 389 |
result = whisper.transcribe(temp_wav)
|
|
|
|
| 390 |
segments = []
|
| 391 |
+
|
| 392 |
for turn, _, speaker in diarization.itertracks(yield_label=True):
|
| 393 |
text = " ".join([seg["text"] for seg in result["segments"] if seg["start"] >= turn.start and seg["end"] <= turn.end])
|
| 394 |
segments.append({
|
|
|
|
| 492 |
description="Convert voice to text and edit it before exporting again."
|
| 493 |
)
|
| 494 |
|
| 495 |
+
# --- Voice Cloning (Dubbing) ===
|
| 496 |
+
with gr.Tab("🎭 Voice Cloning (Dubbing)"):
|
| 497 |
+
gr.Interface(
|
| 498 |
+
fn=clone_voice,
|
| 499 |
+
inputs=[
|
| 500 |
+
gr.File(label="Source Voice Clip"),
|
| 501 |
+
gr.File(label="Target Voice Clip"),
|
| 502 |
+
gr.Textbox(label="Text to Clone", lines=5)
|
| 503 |
+
],
|
| 504 |
+
outputs=gr.Audio(label="Cloned Output", type="filepath"),
|
| 505 |
+
title="Replace One Voice With Another",
|
| 506 |
+
description="Clone voice from source to target speaker using AI"
|
| 507 |
+
)
|
| 508 |
+
|
| 509 |
+
# --- Speaker Diarization (Who Spoke When?) ===
|
| 510 |
+
if diarize_pipeline:
|
| 511 |
+
with gr.Tab("🧏♂️ Who Spoke When?"):
|
| 512 |
+
gr.Interface(
|
| 513 |
+
fn=diarize_and_transcribe,
|
| 514 |
+
inputs=gr.Audio(label="Upload Interview/Podcast", type="filepath"),
|
| 515 |
+
outputs=gr.JSON(label="Diarized Transcript"),
|
| 516 |
+
title="Split By Speaker + Transcribe",
|
| 517 |
+
description="Detect speakers and transcribe their speech automatically."
|
| 518 |
+
)
|
| 519 |
+
|
| 520 |
# --- TTS Voice Generator ===
|
| 521 |
with gr.Tab("💬 TTS Voice Generator"):
|
| 522 |
gr.Interface(
|
|
|
|
| 527 |
description="Type anything and turn it into natural-sounding speech."
|
| 528 |
)
|
| 529 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 530 |
# --- Auto-Save / Resume Sessions ===
|
| 531 |
session_state = gr.State()
|
| 532 |
|
|
|
|
| 562 |
outputs=[session_data, loaded_audio, loaded_preset, loaded_effects]
|
| 563 |
)
|
| 564 |
|
| 565 |
+
# --- VAD – Detect & Remove Silence ===
|
| 566 |
with gr.Tab("✂️ Trim Silence Automatically"):
|
| 567 |
gr.Interface(
|
| 568 |
fn=detect_silence,
|
|
|
|
| 612 |
],
|
| 613 |
outputs=gr.File(label="Mixed Output"),
|
| 614 |
title="Overlay Two Tracks",
|
| 615 |
+
description="Mix, blend, or subtract two audio files."
|
| 616 |
)
|
| 617 |
|
| 618 |
demo.launch()
|