Spaces:

ziqiangao
/

surroundify

Sleeping

App Files Files Community

ziqiangao committed on Aug 5, 2025

Commit

1da61b3

1 Parent(s): a37de4d

correct process

Browse files

Files changed (1) hide show

app.py +13 -15

app.py CHANGED Viewed

@@ -262,7 +262,6 @@ def smart_mode_process(input_file, api_key, multi_singer=False):
     if data.ndim != 2:
         raise gr.Error("Expected stereo input")
     L, R = data[:, 0], data[:, 1]
-    stereo = np.column_stack([L, R])
     # Step 1: LFE from lowpass
     p((1, 8), "Processing LFE")
@@ -285,19 +284,9 @@ def smart_mode_process(input_file, api_key, multi_singer=False):
     crowd, _ = download_wav(crowd_resp['files'][0]['url'], target_fs=fs)
     other_after_crowd, _ = download_wav(crowd_resp['files'][1]['url'], target_fs=fs)
-    # Step 3: Reverb using SoX (like regular mode)
-    p((3, 8), "Applying Reverb")
-    # Use the same reverb_args as 'open' preset from create_5_1_surround
-    reverb_args = ['70', '40', '100', '95', '10', '0']  # music preset
-    # Apply reverb to left and right channels separately
-    reverb_L = apply_reverb_wet_only(other_after_crowd[:, 0], fs, reverb_args)
-    reverb_R = apply_reverb_wet_only(other_after_crowd[:, 1], fs, reverb_args)
-    reverb = np.column_stack([reverb_L, reverb_R])
-    # Step 4: Speech, music, SFX separation from 'other_after_crowd'
-    p((4, 8), "Separating Speech, Music, and SFX")
     demucs_input_buf = tempfile.NamedTemporaryFile(suffix=".flac", delete=False)
     sf.write(demucs_input_buf.name, other_after_crowd, fs, format='FLAC', subtype='PCM_16')
     demucs_input_buf.close()
@@ -312,6 +301,14 @@ def smart_mode_process(input_file, api_key, multi_singer=False):
     sfx, _ = download_wav(demucs_resp['files'][2]['url'], target_fs=fs)
     music, _ = download_wav(demucs_resp['files'][1]['url'], target_fs=fs)
     # Step 5: Vocal Extraction from music
     p((5, 8), "Extracting Vocals")
     music_buf = tempfile.NamedTemporaryFile(suffix=".flac", delete=False)
@@ -338,6 +335,9 @@ def smart_mode_process(input_file, api_key, multi_singer=False):
     _, FL_vl, FR_vl, FC_vl = extract_phantom_center(vl_buf.name)
     os.unlink(vl_buf.name)
     # Step 7: Mapping and stacking
     p((7, 8), "Mapping Channels and Encoding")
     def match_len(x, length): return np.pad(x, (0, length - len(x)))
@@ -354,14 +354,12 @@ def smart_mode_process(input_file, api_key, multi_singer=False):
     SL = match_len(reverb[:, 0], length)
     SR = match_len(reverb[:, 1], length)
-    # Optional: if multi_singer, don't include backing vocals
     if not multi_singer:
         SL += match_len(vocals_back[:, 0], length)
         SR += match_len(vocals_back[:, 1], length)
     SL += match_len(crowd[:, 0], length)
     SR += match_len(crowd[:, 1], length)
-    # Final multichannel stack
     multich = np.column_stack([out_L, out_R, out_C, out_LFE, SL, SR])
     out_wav = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)

     if data.ndim != 2:
         raise gr.Error("Expected stereo input")
     L, R = data[:, 0], data[:, 1]
     # Step 1: LFE from lowpass
     p((1, 8), "Processing LFE")
     crowd, _ = download_wav(crowd_resp['files'][0]['url'], target_fs=fs)
     other_after_crowd, _ = download_wav(crowd_resp['files'][1]['url'], target_fs=fs)
+    # Step 3: Speech, music, SFX separation from 'other_after_crowd'
+    p((3, 8), "Separating Speech, Music, and SFX")
     demucs_input_buf = tempfile.NamedTemporaryFile(suffix=".flac", delete=False)
     sf.write(demucs_input_buf.name, other_after_crowd, fs, format='FLAC', subtype='PCM_16')
     demucs_input_buf.close()
     sfx, _ = download_wav(demucs_resp['files'][2]['url'], target_fs=fs)
     music, _ = download_wav(demucs_resp['files'][1]['url'], target_fs=fs)
+    # Step 4: Apply Reverb to the 'music' stem
+    p((4, 8), "Applying Reverb")
+    reverb_args = ['70', '40', '100', '95', '10', '0']  # music preset
+    reverb_L = apply_reverb_wet_only(music[:, 0], fs, reverb_args)
+    reverb_R = apply_reverb_wet_only(music[:, 1], fs, reverb_args)
+    reverb = np.column_stack([reverb_L, reverb_R])
     # Step 5: Vocal Extraction from music
     p((5, 8), "Extracting Vocals")
     music_buf = tempfile.NamedTemporaryFile(suffix=".flac", delete=False)
     _, FL_vl, FR_vl, FC_vl = extract_phantom_center(vl_buf.name)
     os.unlink(vl_buf.name)
+    # Mix dialog into the centre channel
+    FC_vl += dialog[:, 0] if dialog.ndim == 2 else dialog
     # Step 7: Mapping and stacking
     p((7, 8), "Mapping Channels and Encoding")
     def match_len(x, length): return np.pad(x, (0, length - len(x)))
     SL = match_len(reverb[:, 0], length)
     SR = match_len(reverb[:, 1], length)
     if not multi_singer:
         SL += match_len(vocals_back[:, 0], length)
         SR += match_len(vocals_back[:, 1], length)
     SL += match_len(crowd[:, 0], length)
     SR += match_len(crowd[:, 1], length)
     multich = np.column_stack([out_L, out_R, out_C, out_LFE, SL, SR])
     out_wav = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)