Spaces:

ziqiangao
/

surroundify

Sleeping

App Files Community

ziqiangao committed on Aug 5, 2025

Commit

ee5922f

verified ·

1 Parent(s): ee1df14

Change Smartmode Workflow

Browse files

Files changed (1) hide show

app.py +69 -52

app.py CHANGED Viewed

@@ -251,14 +251,13 @@ def smart_mode_process(input_file, api_key, multi_singer=False):
     import shutil
     if not api_key:
-        raise gr.Error("An MVSep API Key Is Required For This. Get your key <a href=\"https://mvsep.com/user-api\">Here</a>.")
     # Load original
     wav = convert_to_wav_float(input_file)
     data, fs = sf.read(wav, dtype='float32')
     os.unlink(wav)
-    p((0,7), "Loading File")
-    print("Loading File")
     if data.ndim != 2:
         raise gr.Error("Expected stereo input")
@@ -266,96 +265,113 @@ def smart_mode_process(input_file, api_key, multi_singer=False):
     stereo = np.column_stack([L, R])
     # Step 1: LFE from lowpass
-    p((1,7), "Processing LFE")
-    print("Processing LFE")
     bass = sox_filter(0.5 * (L + R), fs, 'lowpass', 120)
-    # Step 2: Highpass for MVSep
-    p((2,7), "Processing Speech, Music and SFX")
-    print("Speech, Music, SFX")
     hp_left = sox_filter(L, fs, 'highpass', 120)
     hp_right = sox_filter(R, fs, 'highpass', 120)
     hp_stereo = np.column_stack([hp_left, hp_right])
-    hp_buf = tempfile.NamedTemporaryFile(suffix=".flac", delete=False)
-    sf.write(hp_buf.name, hp_stereo, fs, format='FLAC', subtype='PCM_16')
-    hp_buf.close()
-    # Send to MVSep
     demucs_resp = send_mvsep_audio_job(
-        api_key, open(hp_buf.name, 'rb').read(), os.path.basename(hp_buf.name), sep_type=24, output_format=2, addopt1=1
     )
-    os.unlink(hp_buf.name)
-    print(demucs_resp)
     dialog, _ = download_wav(demucs_resp['files'][0]['url'], target_fs=fs)
     sfx, _ = download_wav(demucs_resp['files'][2]['url'], target_fs=fs)
     music, _ = download_wav(demucs_resp['files'][1]['url'], target_fs=fs)
-    # Step 3: Extract crowd
-    p((3,7), "Extracting Crowd")
-    print("Crowd")
     music_buf = tempfile.NamedTemporaryFile(suffix=".flac", delete=False)
     sf.write(music_buf.name, music, fs, format='FLAC', subtype='PCM_16')
     music_buf.close()
-    crowd_resp = send_mvsep_audio_job(
-        api_key, open(music_buf.name, 'rb').read(), os.path.basename(music_buf.name), sep_type=34, output_format=2, addopt1=1
-    )
-    os.unlink(music_buf.name)
-    crowd, _ = download_wav(crowd_resp['files'][0]['url'], target_fs=fs)
-    other, _ = download_wav(crowd_resp['files'][1]['url'], target_fs=fs)
-    print(crowd_resp)
-    # Step 4: Extract vocals
-    p((4,7), "Extracting Vocals")
-    print("Vocals")
-    other_buf = tempfile.NamedTemporaryFile(suffix=".flac", delete=False)
-    sf.write(other_buf.name, other, fs, format='FLAC', subtype='PCM_16')
-    other_buf.close()
     karaoke_resp = send_mvsep_audio_job(
-        api_key, open(other_buf.name, 'rb').read(), os.path.basename(other_buf.name), sep_type=49, output_format=2, addopt1=3, addopt2=1
     )
-    os.unlink(other_buf.name)
     vocals_full, _ = download_wav(karaoke_resp['files'][0]['url'], target_fs=fs)
     vocals_lead, _ = download_wav(karaoke_resp['files'][1]['url'], target_fs=fs)
     vocals_back, _ = download_wav(karaoke_resp['files'][2]['url'], target_fs=fs)
     instr, _ = download_wav(karaoke_resp['files'][3]['url'], target_fs=fs)
-    print(karaoke_resp)
-    # Step 5: Phantom center for lead vocals
-    p((5,7), "Distributing Front Vocal Channels")
-    print("Front Vocal Channels")
     vl_buf = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
     sf.write(vl_buf.name, vocals_full if multi_singer else vocals_lead, fs, subtype='FLOAT')
     vl_buf.close()
     _, FL_vl, FR_vl, FC_vl = extract_phantom_center(vl_buf.name)
     os.unlink(vl_buf.name)
-    # Step 6: Map channels and pad
-    p((6,7), "Mapping Channels")
-    print("Mapping")
     def match_len(x, length): return np.pad(x, (0, length - len(x)))
-    lens = [len(FL_vl), len(FR_vl), len(FC_vl), len(bass), sfx.shape[0], crowd.shape[0], vocals_back.shape[0], instr.shape[0]]
     length = max(lens)
-    out_L = match_len(FL_vl, length) + match_len(instr[:,0], length)
-    out_R = match_len(FR_vl, length) + match_len(instr[:,1], length)
     out_C = match_len(FC_vl, length)
     out_LFE = match_len(bass, length)
-    if multi_singer:
-        SL = match_len(sfx[:,0], length) + match_len(crowd[:,0], length)
-        SR = match_len(sfx[:,1], length) + match_len(crowd[:,1], length)
-    else:
-        SL = match_len(vocals_back[:,0], length) + match_len(sfx[:,0], length) + match_len(crowd[:,0], length)
-        SR = match_len(vocals_back[:,1], length) + match_len(sfx[:,1], length) + match_len(crowd[:,1], length)
-    # Step 7: Encode to 5.1 OGG
-    p((7,7), "Processing Step 7, Encoding")
-    print("Encoding")
     multich = np.column_stack([out_L, out_R, out_C, out_LFE, SL, SR])
     out_wav = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
     sf.write(out_wav.name, multich, fs, subtype='FLOAT')
     out_wav.close()
     out_ogg = tempfile.NamedTemporaryFile(suffix='.ogg', delete=False)
     subprocess.run([
         "ffmpeg", "-y", "-i", out_wav.name,
@@ -365,6 +381,7 @@ def smart_mode_process(input_file, api_key, multi_singer=False):
     return out_ogg.name
 # ========== Gradio UI ==========
 with gr.Blocks(title="Stereo to 5.1 Surround") as demo:
     gr.Markdown("# 🎧 Stereo to 5.1 Converter")

     import shutil
     if not api_key:
+        raise gr.Error("An MVSep API Key Is Required For This. Get your key <a href=\"https://mvsep.com/user-api\">Here</a>. it's Free!")
     # Load original
     wav = convert_to_wav_float(input_file)
     data, fs = sf.read(wav, dtype='float32')
     os.unlink(wav)
+    p((0, 8), "Loading File")
     if data.ndim != 2:
         raise gr.Error("Expected stereo input")
     stereo = np.column_stack([L, R])
     # Step 1: LFE from lowpass
+    p((1, 8), "Processing LFE")
     bass = sox_filter(0.5 * (L + R), fs, 'lowpass', 120)
+    # Step 2: Highpass for crowd extraction
+    p((2, 8), "Extracting Crowd")
     hp_left = sox_filter(L, fs, 'highpass', 120)
     hp_right = sox_filter(R, fs, 'highpass', 120)
     hp_stereo = np.column_stack([hp_left, hp_right])
+    music_buf = tempfile.NamedTemporaryFile(suffix=".flac", delete=False)
+    sf.write(music_buf.name, hp_stereo, fs, format='FLAC', subtype='PCM_16')
+    music_buf.close()
+    crowd_resp = send_mvsep_audio_job(
+        api_key, open(music_buf.name, 'rb').read(), os.path.basename(music_buf.name),
+        sep_type=34, output_format=2, addopt1=1
+    )
+    os.unlink(music_buf.name)
+    crowd, _ = download_wav(crowd_resp['files'][0]['url'], target_fs=fs)
+    other_after_crowd, _ = download_wav(crowd_resp['files'][1]['url'], target_fs=fs)
+    # Step 3: Reverb removal on "other" part
+    p((3, 8), "Removing Reverb")
+    other_buf = tempfile.NamedTemporaryFile(suffix=".flac", delete=False)
+    sf.write(other_buf.name, other_after_crowd, fs, format='FLAC', subtype='PCM_16')
+    other_buf.close()
+    reverb_resp = send_mvsep_audio_job(
+        api_key, open(other_buf.name, 'rb').read(), os.path.basename(other_buf.name),
+        sep_type=22, output_format=2, addopt1=2, addopt2=1
+    )
+    os.unlink(other_buf.name)
+    # Ignore first file (no reverb), use second for SL/SR
+    reverb, _ = download_wav(reverb_resp['files'][1]['url'], target_fs=fs)
+    # Step 4: Speech, music, SFX separation from 'other_after_crowd'
+    p((4, 8), "Separating Speech, Music, and SFX")
+    demucs_input_buf = tempfile.NamedTemporaryFile(suffix=".flac", delete=False)
+    sf.write(demucs_input_buf.name, other_after_crowd, fs, format='FLAC', subtype='PCM_16')
+    demucs_input_buf.close()
     demucs_resp = send_mvsep_audio_job(
+        api_key, open(demucs_input_buf.name, 'rb').read(), os.path.basename(demucs_input_buf.name),
+        sep_type=24, output_format=2, addopt1=1
     )
+    os.unlink(demucs_input_buf.name)
     dialog, _ = download_wav(demucs_resp['files'][0]['url'], target_fs=fs)
     sfx, _ = download_wav(demucs_resp['files'][2]['url'], target_fs=fs)
     music, _ = download_wav(demucs_resp['files'][1]['url'], target_fs=fs)
+    # Step 5: Vocal Extraction from music
+    p((5, 8), "Extracting Vocals")
     music_buf = tempfile.NamedTemporaryFile(suffix=".flac", delete=False)
     sf.write(music_buf.name, music, fs, format='FLAC', subtype='PCM_16')
     music_buf.close()
     karaoke_resp = send_mvsep_audio_job(
+        api_key, open(music_buf.name, 'rb').read(), os.path.basename(music_buf.name),
+        sep_type=49, output_format=2, addopt1=3, addopt2=1
     )
+    os.unlink(music_buf.name)
     vocals_full, _ = download_wav(karaoke_resp['files'][0]['url'], target_fs=fs)
     vocals_lead, _ = download_wav(karaoke_resp['files'][1]['url'], target_fs=fs)
     vocals_back, _ = download_wav(karaoke_resp['files'][2]['url'], target_fs=fs)
     instr, _ = download_wav(karaoke_resp['files'][3]['url'], target_fs=fs)
+    # Step 6: Phantom center on vocals (lead or full)
+    p((6, 8), "Phantom Center for Lead Vocals")
     vl_buf = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
     sf.write(vl_buf.name, vocals_full if multi_singer else vocals_lead, fs, subtype='FLOAT')
     vl_buf.close()
     _, FL_vl, FR_vl, FC_vl = extract_phantom_center(vl_buf.name)
     os.unlink(vl_buf.name)
+    # Step 7: Mapping and stacking
+    p((7, 8), "Mapping Channels and Encoding")
     def match_len(x, length): return np.pad(x, (0, length - len(x)))
+    lens = [len(FL_vl), len(FR_vl), len(FC_vl), len(bass), len(sfx), crowd.shape[0], vocals_back.shape[0], instr.shape[0], len(reverb)]
     length = max(lens)
+    # FL and FR: Lead vocals + SFX + instruments
+    out_L = match_len(FL_vl, length) + match_len(sfx[:, 0], length) + match_len(instr[:, 0], length)
+    out_R = match_len(FR_vl, length) + match_len(sfx[:, 1], length) + match_len(instr[:, 1], length)
     out_C = match_len(FC_vl, length)
     out_LFE = match_len(bass, length)
+    # SL/SR: Use reverb output
+    SL = match_len(reverb[:, 0], length)
+    SR = match_len(reverb[:, 1], length)
+    # Optional: if multi_singer, don’t include backing vocals
+    if not multi_singer:
+        SL += match_len(vocals_back[:, 0], length)
+        SR += match_len(vocals_back[:, 1], length)
+    SL += match_len(crowd[:, 0], length)
+    SR += match_len(crowd[:, 1], length)
+    # Final multichannel stack
     multich = np.column_stack([out_L, out_R, out_C, out_LFE, SL, SR])
     out_wav = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
     sf.write(out_wav.name, multich, fs, subtype='FLOAT')
     out_wav.close()
     out_ogg = tempfile.NamedTemporaryFile(suffix='.ogg', delete=False)
     subprocess.run([
         "ffmpeg", "-y", "-i", out_wav.name,
     return out_ogg.name
 # ========== Gradio UI ==========
 with gr.Blocks(title="Stereo to 5.1 Surround") as demo:
     gr.Markdown("# 🎧 Stereo to 5.1 Converter")