Spaces:
Sleeping
Sleeping
ziqiangao committed on
Commit ·
1da61b3
1
Parent(s): a37de4d
correct process
Browse files
app.py
CHANGED
|
@@ -262,7 +262,6 @@ def smart_mode_process(input_file, api_key, multi_singer=False):
|
|
| 262 |
if data.ndim != 2:
|
| 263 |
raise gr.Error("Expected stereo input")
|
| 264 |
L, R = data[:, 0], data[:, 1]
|
| 265 |
-
stereo = np.column_stack([L, R])
|
| 266 |
|
| 267 |
# Step 1: LFE from lowpass
|
| 268 |
p((1, 8), "Processing LFE")
|
|
@@ -285,19 +284,9 @@ def smart_mode_process(input_file, api_key, multi_singer=False):
|
|
| 285 |
crowd, _ = download_wav(crowd_resp['files'][0]['url'], target_fs=fs)
|
| 286 |
other_after_crowd, _ = download_wav(crowd_resp['files'][1]['url'], target_fs=fs)
|
| 287 |
|
| 288 |
-
# Step 3: Reverb using SoX (like regular mode)
|
| 289 |
-
p((3, 8), "Applying Reverb")
|
| 290 |
|
| 291 |
-
#
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
# Apply reverb to left and right channels separately
|
| 295 |
-
reverb_L = apply_reverb_wet_only(other_after_crowd[:, 0], fs, reverb_args)
|
| 296 |
-
reverb_R = apply_reverb_wet_only(other_after_crowd[:, 1], fs, reverb_args)
|
| 297 |
-
reverb = np.column_stack([reverb_L, reverb_R])
|
| 298 |
-
|
| 299 |
-
# Step 4: Speech, music, SFX separation from 'other_after_crowd'
|
| 300 |
-
p((4, 8), "Separating Speech, Music, and SFX")
|
| 301 |
demucs_input_buf = tempfile.NamedTemporaryFile(suffix=".flac", delete=False)
|
| 302 |
sf.write(demucs_input_buf.name, other_after_crowd, fs, format='FLAC', subtype='PCM_16')
|
| 303 |
demucs_input_buf.close()
|
|
@@ -312,6 +301,14 @@ def smart_mode_process(input_file, api_key, multi_singer=False):
|
|
| 312 |
sfx, _ = download_wav(demucs_resp['files'][2]['url'], target_fs=fs)
|
| 313 |
music, _ = download_wav(demucs_resp['files'][1]['url'], target_fs=fs)
|
| 314 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 315 |
# Step 5: Vocal Extraction from music
|
| 316 |
p((5, 8), "Extracting Vocals")
|
| 317 |
music_buf = tempfile.NamedTemporaryFile(suffix=".flac", delete=False)
|
|
@@ -338,6 +335,9 @@ def smart_mode_process(input_file, api_key, multi_singer=False):
|
|
| 338 |
_, FL_vl, FR_vl, FC_vl = extract_phantom_center(vl_buf.name)
|
| 339 |
os.unlink(vl_buf.name)
|
| 340 |
|
|
|
|
|
|
|
|
|
|
| 341 |
# Step 7: Mapping and stacking
|
| 342 |
p((7, 8), "Mapping Channels and Encoding")
|
| 343 |
def match_len(x, length): return np.pad(x, (0, length - len(x)))
|
|
@@ -354,14 +354,12 @@ def smart_mode_process(input_file, api_key, multi_singer=False):
|
|
| 354 |
SL = match_len(reverb[:, 0], length)
|
| 355 |
SR = match_len(reverb[:, 1], length)
|
| 356 |
|
| 357 |
-
# Optional: if multi_singer, don’t include backing vocals
|
| 358 |
if not multi_singer:
|
| 359 |
SL += match_len(vocals_back[:, 0], length)
|
| 360 |
SR += match_len(vocals_back[:, 1], length)
|
| 361 |
SL += match_len(crowd[:, 0], length)
|
| 362 |
SR += match_len(crowd[:, 1], length)
|
| 363 |
|
| 364 |
-
# Final multichannel stack
|
| 365 |
multich = np.column_stack([out_L, out_R, out_C, out_LFE, SL, SR])
|
| 366 |
|
| 367 |
out_wav = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
|
|
|
|
| 262 |
if data.ndim != 2:
|
| 263 |
raise gr.Error("Expected stereo input")
|
| 264 |
L, R = data[:, 0], data[:, 1]
|
|
|
|
| 265 |
|
| 266 |
# Step 1: LFE from lowpass
|
| 267 |
p((1, 8), "Processing LFE")
|
|
|
|
| 284 |
crowd, _ = download_wav(crowd_resp['files'][0]['url'], target_fs=fs)
|
| 285 |
other_after_crowd, _ = download_wav(crowd_resp['files'][1]['url'], target_fs=fs)
|
| 286 |
|
|
|
|
|
|
|
| 287 |
|
| 288 |
+
# Step 3: Speech, music, SFX separation from 'other_after_crowd'
|
| 289 |
+
p((3, 8), "Separating Speech, Music, and SFX")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 290 |
demucs_input_buf = tempfile.NamedTemporaryFile(suffix=".flac", delete=False)
|
| 291 |
sf.write(demucs_input_buf.name, other_after_crowd, fs, format='FLAC', subtype='PCM_16')
|
| 292 |
demucs_input_buf.close()
|
|
|
|
| 301 |
sfx, _ = download_wav(demucs_resp['files'][2]['url'], target_fs=fs)
|
| 302 |
music, _ = download_wav(demucs_resp['files'][1]['url'], target_fs=fs)
|
| 303 |
|
| 304 |
+
# Step 4: Apply Reverb to the 'music' stem
|
| 305 |
+
p((4, 8), "Applying Reverb")
|
| 306 |
+
reverb_args = ['70', '40', '100', '95', '10', '0'] # music preset
|
| 307 |
+
reverb_L = apply_reverb_wet_only(music[:, 0], fs, reverb_args)
|
| 308 |
+
reverb_R = apply_reverb_wet_only(music[:, 1], fs, reverb_args)
|
| 309 |
+
reverb = np.column_stack([reverb_L, reverb_R])
|
| 310 |
+
|
| 311 |
+
|
| 312 |
# Step 5: Vocal Extraction from music
|
| 313 |
p((5, 8), "Extracting Vocals")
|
| 314 |
music_buf = tempfile.NamedTemporaryFile(suffix=".flac", delete=False)
|
|
|
|
| 335 |
_, FL_vl, FR_vl, FC_vl = extract_phantom_center(vl_buf.name)
|
| 336 |
os.unlink(vl_buf.name)
|
| 337 |
|
| 338 |
+
# Mix dialog into the centre channel
|
| 339 |
+
FC_vl += dialog[:, 0] if dialog.ndim == 2 else dialog
|
| 340 |
+
|
| 341 |
# Step 7: Mapping and stacking
|
| 342 |
p((7, 8), "Mapping Channels and Encoding")
|
| 343 |
def match_len(x, length): return np.pad(x, (0, length - len(x)))
|
|
|
|
| 354 |
SL = match_len(reverb[:, 0], length)
|
| 355 |
SR = match_len(reverb[:, 1], length)
|
| 356 |
|
|
|
|
| 357 |
if not multi_singer:
|
| 358 |
SL += match_len(vocals_back[:, 0], length)
|
| 359 |
SR += match_len(vocals_back[:, 1], length)
|
| 360 |
SL += match_len(crowd[:, 0], length)
|
| 361 |
SR += match_len(crowd[:, 1], length)
|
| 362 |
|
|
|
|
| 363 |
multich = np.column_stack([out_L, out_R, out_C, out_LFE, SL, SR])
|
| 364 |
|
| 365 |
out_wav = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
|