|
|
import os |
|
|
import uuid |
|
|
import torch |
|
|
import torchaudio |
|
|
import torchaudio.transforms as T |
|
|
import soundfile as sf |
|
|
import gradio as gr |
|
|
import spaces |
|
|
from moviepy.editor import VideoFileClip, AudioFileClip, CompositeAudioClip |
|
|
import look2hear.models |
|
|
|
|
|
# Run on GPU when available; all models and input tensors are moved here.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# Pretrained TIGER checkpoints pulled from the Hugging Face Hub into ./cache:
# - TIGER-DnR splits a soundtrack into dialog / effects / music stems.
# - TIGER-speech separates overlapping speakers from a speech mixture.
dnr_model = look2hear.models.TIGERDNR.from_pretrained("JusperLee/TIGER-DnR", cache_dir="cache").to(device).eval()

sep_model = look2hear.models.TIGER.from_pretrained("JusperLee/TIGER-speech", cache_dir="cache").to(device).eval()


# Sample rate (Hz) the speech-separation model expects; inputs are resampled to this.
TARGET_SR = 16000

# Fixed number of speaker output widgets created in the UI; extra estimated
# speakers beyond this count are written to disk but not displayed.
MAX_SPEAKERS = 4
|
|
|
|
|
def extract_audio_from_video(video_path):
    """Extract a video's audio track to a temporary 44.1 kHz WAV file.

    Parameters
    ----------
    video_path : str
        Path to the input video file.

    Returns
    -------
    tuple
        ``(audio_path, video)`` — the path of the written WAV and the still-open
        ``VideoFileClip`` (kept open on purpose so callers can re-attach
        processed audio to the same frames later).

    Raises
    ------
    ValueError
        If the video contains no audio track.
    """
    video = VideoFileClip(video_path)
    # `.audio` is None for silent videos; fail early with a clear message
    # instead of an opaque AttributeError on write_audiofile below.
    if video.audio is None:
        raise ValueError("The uploaded video has no audio track to extract.")
    # Per-call random id so concurrent requests never clobber each other's files.
    session_id = uuid.uuid4().hex[:8]
    audio_path = f"temp_audio/{session_id}.wav"
    os.makedirs("temp_audio", exist_ok=True)
    video.audio.write_audiofile(audio_path, fps=44100, verbose=False, logger=None)
    return audio_path, video
|
|
|
|
|
def attach_audio_to_video(original_video, audio_path, out_path):
    """Write a copy of *original_video* whose soundtrack is replaced.

    Parameters
    ----------
    original_video : VideoFileClip
        The open source clip providing the frames.
    audio_path : str
        Path to the WAV file to use as the new soundtrack.
    out_path : str
        Destination path for the rendered MP4.

    Returns
    -------
    str
        *out_path*, for convenient chaining.
    """
    replacement_track = AudioFileClip(audio_path)
    dubbed_clip = original_video.set_audio(replacement_track)
    # AAC audio so the result plays in browsers / the Gradio video widget.
    dubbed_clip.write_videofile(out_path, audio_codec='aac', verbose=False, logger=None)
    return out_path
|
|
|
|
|
@spaces.GPU()
def separate_dnr(audio_file):
    """Split a soundtrack into dialog, effects, and music with TIGER-DnR.

    Parameters
    ----------
    audio_file : str
        Path to the input mixture audio file.

    Returns
    -------
    tuple[str, str, str]
        Paths to the saved dialog, effect, and music WAV files, in that order.
    """
    mixture, sr = torchaudio.load(audio_file)
    mixture = mixture.to(device)

    # The model expects a leading batch dimension.
    with torch.no_grad():
        stems = dnr_model(mixture[None])

    # Each request writes into its own session folder to avoid collisions.
    session_id = uuid.uuid4().hex[:8]
    output_dir = os.path.join("output_dnr", session_id)
    os.makedirs(output_dir, exist_ok=True)

    paths = {}
    for stem_name, stem in zip(("dialog", "effect", "music"), stems):
        stem_path = os.path.join(output_dir, stem_name + ".wav")
        # Saved at the original sample rate — no resampling happens here.
        torchaudio.save(stem_path, stem.cpu(), sr)
        paths[stem_name] = stem_path

    return paths["dialog"], paths["effect"], paths["music"]
|
|
|
|
|
@spaces.GPU()
def separate_speakers(audio_path):
    """Separate individual speakers from a speech mixture with TIGER-speech.

    Parameters
    ----------
    audio_path : str
        Path to the input speech mixture.

    Returns
    -------
    list
        MAX_SPEAKERS ``gr.update`` objects — one visible audio widget per
        estimated speaker, hidden placeholders for the rest.
    """
    mixture, source_sr = torchaudio.load(audio_path)

    # The separation model operates at TARGET_SR (16 kHz); resample anything else.
    if source_sr != TARGET_SR:
        mixture = T.Resample(orig_freq=source_sr, new_freq=TARGET_SR)(mixture)

    # Guarantee a (channels, time) layout, then prepend a batch dimension.
    if mixture.dim() == 1:
        mixture = mixture.unsqueeze(0)
    batch = mixture.unsqueeze(0).to(device)

    with torch.no_grad():
        estimates = sep_model(batch).squeeze(0)

    # Per-request output folder so concurrent sessions never collide.
    session_id = uuid.uuid4().hex[:8]
    out_dir = os.path.join("output_sep", session_id)
    os.makedirs(out_dir, exist_ok=True)

    speaker_files = []
    for idx in range(estimates.shape[0]):
        wav_path = os.path.join(out_dir, f"speaker_{idx+1}.wav")
        sf.write(wav_path, estimates[idx].cpu().numpy(), TARGET_SR)
        speaker_files.append(wav_path)

    # Fill the fixed bank of output widgets; hide any that go unused.
    return [
        gr.update(value=speaker_files[i], visible=True, label=f"Speaker {i+1}")
        if i < len(speaker_files)
        else gr.update(value=None, visible=False)
        for i in range(MAX_SPEAKERS)
    ]
|
|
|
|
|
@spaces.GPU()
def separate_dnr_video(video_path):
    """Run dialog/effects/music separation on a video's soundtrack.

    Extracts the audio, separates it with TIGER-DnR, and re-muxes each stem
    back onto the original frames.

    Parameters
    ----------
    video_path : str
        Path to the input video file.

    Returns
    -------
    tuple[str, str, str]
        Paths to the dialog, effects, and music videos, in that order.
    """
    audio_path, video = extract_audio_from_video(video_path)
    dialog_path, effect_path, music_path = separate_dnr(audio_path)

    # Per-request output folder so concurrent sessions never collide.
    session_id = uuid.uuid4().hex[:8]
    output_dir = os.path.join("output_dnr_video", session_id)
    os.makedirs(output_dir, exist_ok=True)

    # Attach each separated stem to the (shared) original video frames.
    stem_videos = []
    for stem_path, out_name in (
        (dialog_path, "dialog_video.mp4"),
        (effect_path, "effect_video.mp4"),
        (music_path, "music_video.mp4"),
    ):
        stem_videos.append(
            attach_audio_to_video(video, stem_path, os.path.join(output_dir, out_name))
        )

    return tuple(stem_videos)
|
|
|
|
|
@spaces.GPU()
def separate_speakers_video(video_path):
    """Separate the speakers in a video's soundtrack and re-mux each voice.

    Extracts the audio track, runs the TIGER speech-separation model on it,
    writes one WAV per estimated speaker, and attaches each back onto the
    original video frames.

    Parameters
    ----------
    video_path : str
        Path to the input video file.

    Returns
    -------
    list
        MAX_SPEAKERS ``gr.update`` objects — one visible video widget per
        separated speaker, hidden placeholders for the rest.
    """
    audio_path, video = extract_audio_from_video(video_path)

    waveform, original_sr = torchaudio.load(audio_path)
    # The separation model expects TARGET_SR (16 kHz) input.
    if original_sr != TARGET_SR:
        waveform = T.Resample(orig_freq=original_sr, new_freq=TARGET_SR)(waveform)

    # Guarantee a (channels, time) layout, then prepend a batch dimension.
    if waveform.dim() == 1:
        waveform = waveform.unsqueeze(0)
    audio_input = waveform.unsqueeze(0).to(device)

    with torch.no_grad():
        ests_speech = sep_model(audio_input).squeeze(0)

    # Per-request output folder so concurrent sessions never collide.
    session_id = uuid.uuid4().hex[:8]
    output_dir = os.path.join("output_sep_video", session_id)
    os.makedirs(output_dir, exist_ok=True)

    output_videos = []
    for i in range(ests_speech.shape[0]):
        path = os.path.join(output_dir, f"speaker_{i+1}.wav")
        sf.write(path, ests_speech[i].cpu().numpy(), TARGET_SR)
        # Fix: use a distinct local name here — the original loop reassigned
        # `video_path`, shadowing the function's input parameter.
        speaker_video_path = os.path.join(output_dir, f"speaker_{i+1}_video.mp4")
        attach_audio_to_video(video, path, speaker_video_path)
        output_videos.append(speaker_video_path)

    # Fill the fixed bank of output widgets; hide any that go unused.
    updates = []
    for i in range(MAX_SPEAKERS):
        if i < len(output_videos):
            updates.append(gr.update(value=output_videos[i], visible=True, label=f"Speaker {i+1}"))
        else:
            updates.append(gr.update(value=None, visible=False))
    return updates
|
|
|
|
|
|
|
|
# Gradio UI. NOTE: component creation order inside this context manager defines
# the rendered layout, so statements here must not be reordered.
with gr.Blocks() as demo:
    gr.Markdown("# TIGER: Time-frequency Interleaved Gain Extraction and Reconstruction for Efficient Speech Separation")
    gr.Markdown("TIGER is a lightweight model for speech separation which effectively extracts key acoustic features through frequency band-split, multi-scale and full-frequency-frame modeling.")

    # Project-page and duplicate-space badges.
    gr.HTML("""
    <div style="display:flex;column-gap:4px;">
        <a href="https://cslikai.cn/TIGER/">
            <img src='https://img.shields.io/badge/Project-Page-green'>
        </a>
        
        <a href="https://huggingface.co/spaces/fffiloni/TIGER-audio-extraction?duplicate=true">
            <img src="https://huggingface.co/datasets/huggingface/badges/resolve/main/duplicate-this-space-sm.svg" alt="Duplicate this Space">
        </a>
    </div>
    """)

    with gr.Tabs():
        # Tab 1: audio-only dialog / effects / music separation.
        with gr.Tab("Audio DnR"):
            dnr_input = gr.Audio(type="filepath", label="Upload Audio")
            dnr_btn = gr.Button("Separate")
            gr.Examples(
                examples = ["./test/test_mixture_466.wav"],
                inputs = dnr_input
            )
            dnr_output = [gr.Audio(label=l) for l in ["Dialog", "Effects", "Music"]]
            dnr_btn.click(separate_dnr, inputs=dnr_input, outputs=dnr_output)

        # Tab 2: audio-only speaker separation. Only the first widget is
        # visible initially; separate_speakers toggles visibility per speaker.
        with gr.Tab("Audio Speaker Separation"):
            sep_input = gr.Audio(type="filepath", label="Upload Speech Audio")
            sep_btn = gr.Button("Separate Speakers")
            gr.Examples(
                examples = ["./test/mix.wav"],
                inputs = sep_input
            )
            sep_outputs = [gr.Audio(label=f"Speaker {i+1}", visible=(i==0)) for i in range(MAX_SPEAKERS)]
            sep_btn.click(separate_speakers, inputs=sep_input, outputs=sep_outputs)

        # Tab 3: video variant of DnR — returns three re-muxed videos.
        with gr.Tab("Video DnR"):
            vdnr_input = gr.Video(label="Upload Video")
            vdnr_btn = gr.Button("Separate Audio Tracks")
            vdnr_output = [gr.Video(label=l) for l in ["Dialog Video", "Effects Video", "Music Video"]]
            vdnr_btn.click(separate_dnr_video, inputs=vdnr_input, outputs=vdnr_output)

        # Tab 4: video variant of speaker separation — one video per speaker.
        with gr.Tab("Video Speaker Separation"):
            vsep_input = gr.Video(label="Upload Video")
            vsep_btn = gr.Button("Separate Speakers")
            vsep_outputs = [gr.Video(label=f"Speaker {i+1}", visible=(i==0)) for i in range(MAX_SPEAKERS)]
            vsep_btn.click(separate_speakers_video, inputs=vsep_input, outputs=vsep_outputs)
|
|
|
|
if __name__ == "__main__":
    # Start the Gradio server when executed as a script.
    demo.launch()