Spaces:
Sleeping
Sleeping
Change Smartmode Workflow
Browse files
app.py
CHANGED
|
@@ -251,14 +251,13 @@ def smart_mode_process(input_file, api_key, multi_singer=False):
|
|
| 251 |
import shutil
|
| 252 |
|
| 253 |
if not api_key:
|
| 254 |
-
raise gr.Error("An MVSep API Key Is Required For This. Get your key <a href=\"https://mvsep.com/user-api\">Here</a>.")
|
| 255 |
|
| 256 |
# Load original
|
| 257 |
wav = convert_to_wav_float(input_file)
|
| 258 |
data, fs = sf.read(wav, dtype='float32')
|
| 259 |
os.unlink(wav)
|
| 260 |
-
p((0,
|
| 261 |
-
print("Loading File")
|
| 262 |
|
| 263 |
if data.ndim != 2:
|
| 264 |
raise gr.Error("Expected stereo input")
|
|
@@ -266,96 +265,113 @@ def smart_mode_process(input_file, api_key, multi_singer=False):
|
|
| 266 |
stereo = np.column_stack([L, R])
|
| 267 |
|
| 268 |
# Step 1: LFE from lowpass
|
| 269 |
-
p((1,
|
| 270 |
-
print("Processing LFE")
|
| 271 |
bass = sox_filter(0.5 * (L + R), fs, 'lowpass', 120)
|
| 272 |
|
| 273 |
-
# Step 2: Highpass for
|
| 274 |
-
p((2,
|
| 275 |
-
print("Speech, Music, SFX")
|
| 276 |
hp_left = sox_filter(L, fs, 'highpass', 120)
|
| 277 |
hp_right = sox_filter(R, fs, 'highpass', 120)
|
| 278 |
hp_stereo = np.column_stack([hp_left, hp_right])
|
| 279 |
-
|
| 280 |
-
sf.write(
|
| 281 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 282 |
|
| 283 |
-
# Send to MVSep
|
| 284 |
demucs_resp = send_mvsep_audio_job(
|
| 285 |
-
api_key, open(
|
|
|
|
| 286 |
)
|
| 287 |
-
os.unlink(
|
| 288 |
|
| 289 |
-
print(demucs_resp)
|
| 290 |
dialog, _ = download_wav(demucs_resp['files'][0]['url'], target_fs=fs)
|
| 291 |
sfx, _ = download_wav(demucs_resp['files'][2]['url'], target_fs=fs)
|
| 292 |
music, _ = download_wav(demucs_resp['files'][1]['url'], target_fs=fs)
|
| 293 |
|
| 294 |
-
# Step
|
| 295 |
-
p((
|
| 296 |
-
print("Crowd")
|
| 297 |
music_buf = tempfile.NamedTemporaryFile(suffix=".flac", delete=False)
|
| 298 |
sf.write(music_buf.name, music, fs, format='FLAC', subtype='PCM_16')
|
| 299 |
music_buf.close()
|
| 300 |
-
crowd_resp = send_mvsep_audio_job(
|
| 301 |
-
api_key, open(music_buf.name, 'rb').read(), os.path.basename(music_buf.name), sep_type=34, output_format=2, addopt1=1
|
| 302 |
-
)
|
| 303 |
-
os.unlink(music_buf.name)
|
| 304 |
-
crowd, _ = download_wav(crowd_resp['files'][0]['url'], target_fs=fs)
|
| 305 |
-
other, _ = download_wav(crowd_resp['files'][1]['url'], target_fs=fs)
|
| 306 |
-
print(crowd_resp)
|
| 307 |
|
| 308 |
-
# Step 4: Extract vocals
|
| 309 |
-
p((4,7), "Extracting Vocals")
|
| 310 |
-
print("Vocals")
|
| 311 |
-
other_buf = tempfile.NamedTemporaryFile(suffix=".flac", delete=False)
|
| 312 |
-
sf.write(other_buf.name, other, fs, format='FLAC', subtype='PCM_16')
|
| 313 |
-
other_buf.close()
|
| 314 |
karaoke_resp = send_mvsep_audio_job(
|
| 315 |
-
api_key, open(
|
|
|
|
| 316 |
)
|
| 317 |
-
os.unlink(
|
|
|
|
| 318 |
vocals_full, _ = download_wav(karaoke_resp['files'][0]['url'], target_fs=fs)
|
| 319 |
vocals_lead, _ = download_wav(karaoke_resp['files'][1]['url'], target_fs=fs)
|
| 320 |
vocals_back, _ = download_wav(karaoke_resp['files'][2]['url'], target_fs=fs)
|
| 321 |
instr, _ = download_wav(karaoke_resp['files'][3]['url'], target_fs=fs)
|
| 322 |
-
print(karaoke_resp)
|
| 323 |
|
| 324 |
-
# Step
|
| 325 |
-
p((
|
| 326 |
-
print("Front Vocal Channels")
|
| 327 |
vl_buf = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
|
| 328 |
sf.write(vl_buf.name, vocals_full if multi_singer else vocals_lead, fs, subtype='FLOAT')
|
| 329 |
vl_buf.close()
|
|
|
|
| 330 |
_, FL_vl, FR_vl, FC_vl = extract_phantom_center(vl_buf.name)
|
| 331 |
os.unlink(vl_buf.name)
|
| 332 |
|
| 333 |
-
# Step
|
| 334 |
-
p((
|
| 335 |
-
print("Mapping")
|
| 336 |
def match_len(x, length): return np.pad(x, (0, length - len(x)))
|
| 337 |
-
lens = [len(FL_vl), len(FR_vl), len(FC_vl), len(bass), sfx
|
| 338 |
length = max(lens)
|
| 339 |
|
| 340 |
-
|
| 341 |
-
|
|
|
|
| 342 |
out_C = match_len(FC_vl, length)
|
| 343 |
out_LFE = match_len(bass, length)
|
| 344 |
-
if multi_singer:
|
| 345 |
-
SL = match_len(sfx[:,0], length) + match_len(crowd[:,0], length)
|
| 346 |
-
SR = match_len(sfx[:,1], length) + match_len(crowd[:,1], length)
|
| 347 |
-
else:
|
| 348 |
-
SL = match_len(vocals_back[:,0], length) + match_len(sfx[:,0], length) + match_len(crowd[:,0], length)
|
| 349 |
-
SR = match_len(vocals_back[:,1], length) + match_len(sfx[:,1], length) + match_len(crowd[:,1], length)
|
| 350 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 351 |
|
| 352 |
-
#
|
| 353 |
-
p((7,7), "Processing Step 7, Encoding")
|
| 354 |
-
print("Encoding")
|
| 355 |
multich = np.column_stack([out_L, out_R, out_C, out_LFE, SL, SR])
|
|
|
|
| 356 |
out_wav = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
|
| 357 |
sf.write(out_wav.name, multich, fs, subtype='FLOAT')
|
| 358 |
out_wav.close()
|
|
|
|
| 359 |
out_ogg = tempfile.NamedTemporaryFile(suffix='.ogg', delete=False)
|
| 360 |
subprocess.run([
|
| 361 |
"ffmpeg", "-y", "-i", out_wav.name,
|
|
@@ -365,6 +381,7 @@ def smart_mode_process(input_file, api_key, multi_singer=False):
|
|
| 365 |
|
| 366 |
return out_ogg.name
|
| 367 |
|
|
|
|
| 368 |
# ========== Gradio UI ==========
|
| 369 |
with gr.Blocks(title="Stereo to 5.1 Surround") as demo:
|
| 370 |
gr.Markdown("# 🎧 Stereo to 5.1 Converter")
|
|
|
|
| 251 |
import shutil
|
| 252 |
|
| 253 |
if not api_key:
|
| 254 |
+
raise gr.Error("An MVSep API Key Is Required For This. Get your key <a href=\"https://mvsep.com/user-api\">Here</a>. it's Free!")
|
| 255 |
|
| 256 |
# Load original
|
| 257 |
wav = convert_to_wav_float(input_file)
|
| 258 |
data, fs = sf.read(wav, dtype='float32')
|
| 259 |
os.unlink(wav)
|
| 260 |
+
p((0, 8), "Loading File")
|
|
|
|
| 261 |
|
| 262 |
if data.ndim != 2:
|
| 263 |
raise gr.Error("Expected stereo input")
|
|
|
|
| 265 |
stereo = np.column_stack([L, R])
|
| 266 |
|
| 267 |
# Step 1: LFE from lowpass
|
| 268 |
+
p((1, 8), "Processing LFE")
|
|
|
|
| 269 |
bass = sox_filter(0.5 * (L + R), fs, 'lowpass', 120)
|
| 270 |
|
| 271 |
+
# Step 2: Highpass for crowd extraction
|
| 272 |
+
p((2, 8), "Extracting Crowd")
|
|
|
|
| 273 |
hp_left = sox_filter(L, fs, 'highpass', 120)
|
| 274 |
hp_right = sox_filter(R, fs, 'highpass', 120)
|
| 275 |
hp_stereo = np.column_stack([hp_left, hp_right])
|
| 276 |
+
music_buf = tempfile.NamedTemporaryFile(suffix=".flac", delete=False)
|
| 277 |
+
sf.write(music_buf.name, hp_stereo, fs, format='FLAC', subtype='PCM_16')
|
| 278 |
+
music_buf.close()
|
| 279 |
+
|
| 280 |
+
crowd_resp = send_mvsep_audio_job(
|
| 281 |
+
api_key, open(music_buf.name, 'rb').read(), os.path.basename(music_buf.name),
|
| 282 |
+
sep_type=34, output_format=2, addopt1=1
|
| 283 |
+
)
|
| 284 |
+
os.unlink(music_buf.name)
|
| 285 |
+
crowd, _ = download_wav(crowd_resp['files'][0]['url'], target_fs=fs)
|
| 286 |
+
other_after_crowd, _ = download_wav(crowd_resp['files'][1]['url'], target_fs=fs)
|
| 287 |
+
|
| 288 |
+
# Step 3: Reverb removal on "other" part
|
| 289 |
+
p((3, 8), "Removing Reverb")
|
| 290 |
+
other_buf = tempfile.NamedTemporaryFile(suffix=".flac", delete=False)
|
| 291 |
+
sf.write(other_buf.name, other_after_crowd, fs, format='FLAC', subtype='PCM_16')
|
| 292 |
+
other_buf.close()
|
| 293 |
+
|
| 294 |
+
reverb_resp = send_mvsep_audio_job(
|
| 295 |
+
api_key, open(other_buf.name, 'rb').read(), os.path.basename(other_buf.name),
|
| 296 |
+
sep_type=22, output_format=2, addopt1=2, addopt2=1
|
| 297 |
+
)
|
| 298 |
+
os.unlink(other_buf.name)
|
| 299 |
+
|
| 300 |
+
# Ignore first file (no reverb), use second for SL/SR
|
| 301 |
+
reverb, _ = download_wav(reverb_resp['files'][1]['url'], target_fs=fs)
|
| 302 |
+
|
| 303 |
+
# Step 4: Speech, music, SFX separation from 'other_after_crowd'
|
| 304 |
+
p((4, 8), "Separating Speech, Music, and SFX")
|
| 305 |
+
demucs_input_buf = tempfile.NamedTemporaryFile(suffix=".flac", delete=False)
|
| 306 |
+
sf.write(demucs_input_buf.name, other_after_crowd, fs, format='FLAC', subtype='PCM_16')
|
| 307 |
+
demucs_input_buf.close()
|
| 308 |
|
|
|
|
| 309 |
demucs_resp = send_mvsep_audio_job(
|
| 310 |
+
api_key, open(demucs_input_buf.name, 'rb').read(), os.path.basename(demucs_input_buf.name),
|
| 311 |
+
sep_type=24, output_format=2, addopt1=1
|
| 312 |
)
|
| 313 |
+
os.unlink(demucs_input_buf.name)
|
| 314 |
|
|
|
|
| 315 |
dialog, _ = download_wav(demucs_resp['files'][0]['url'], target_fs=fs)
|
| 316 |
sfx, _ = download_wav(demucs_resp['files'][2]['url'], target_fs=fs)
|
| 317 |
music, _ = download_wav(demucs_resp['files'][1]['url'], target_fs=fs)
|
| 318 |
|
| 319 |
+
# Step 5: Vocal Extraction from music
|
| 320 |
+
p((5, 8), "Extracting Vocals")
|
|
|
|
| 321 |
music_buf = tempfile.NamedTemporaryFile(suffix=".flac", delete=False)
|
| 322 |
sf.write(music_buf.name, music, fs, format='FLAC', subtype='PCM_16')
|
| 323 |
music_buf.close()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 324 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 325 |
karaoke_resp = send_mvsep_audio_job(
|
| 326 |
+
api_key, open(music_buf.name, 'rb').read(), os.path.basename(music_buf.name),
|
| 327 |
+
sep_type=49, output_format=2, addopt1=3, addopt2=1
|
| 328 |
)
|
| 329 |
+
os.unlink(music_buf.name)
|
| 330 |
+
|
| 331 |
vocals_full, _ = download_wav(karaoke_resp['files'][0]['url'], target_fs=fs)
|
| 332 |
vocals_lead, _ = download_wav(karaoke_resp['files'][1]['url'], target_fs=fs)
|
| 333 |
vocals_back, _ = download_wav(karaoke_resp['files'][2]['url'], target_fs=fs)
|
| 334 |
instr, _ = download_wav(karaoke_resp['files'][3]['url'], target_fs=fs)
|
|
|
|
| 335 |
|
| 336 |
+
# Step 6: Phantom center on vocals (lead or full)
|
| 337 |
+
p((6, 8), "Phantom Center for Lead Vocals")
|
|
|
|
| 338 |
vl_buf = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
|
| 339 |
sf.write(vl_buf.name, vocals_full if multi_singer else vocals_lead, fs, subtype='FLOAT')
|
| 340 |
vl_buf.close()
|
| 341 |
+
|
| 342 |
_, FL_vl, FR_vl, FC_vl = extract_phantom_center(vl_buf.name)
|
| 343 |
os.unlink(vl_buf.name)
|
| 344 |
|
| 345 |
+
# Step 7: Mapping and stacking
|
| 346 |
+
p((7, 8), "Mapping Channels and Encoding")
|
|
|
|
| 347 |
def match_len(x, length): return np.pad(x, (0, length - len(x)))
|
| 348 |
+
lens = [len(FL_vl), len(FR_vl), len(FC_vl), len(bass), len(sfx), crowd.shape[0], vocals_back.shape[0], instr.shape[0], len(reverb)]
|
| 349 |
length = max(lens)
|
| 350 |
|
| 351 |
+
# FL and FR: Lead vocals + SFX + instruments
|
| 352 |
+
out_L = match_len(FL_vl, length) + match_len(sfx[:, 0], length) + match_len(instr[:, 0], length)
|
| 353 |
+
out_R = match_len(FR_vl, length) + match_len(sfx[:, 1], length) + match_len(instr[:, 1], length)
|
| 354 |
out_C = match_len(FC_vl, length)
|
| 355 |
out_LFE = match_len(bass, length)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 356 |
|
| 357 |
+
# SL/SR: Use reverb output
|
| 358 |
+
SL = match_len(reverb[:, 0], length)
|
| 359 |
+
SR = match_len(reverb[:, 1], length)
|
| 360 |
+
|
| 361 |
+
# Optional: if multi_singer, don’t include backing vocals
|
| 362 |
+
if not multi_singer:
|
| 363 |
+
SL += match_len(vocals_back[:, 0], length)
|
| 364 |
+
SR += match_len(vocals_back[:, 1], length)
|
| 365 |
+
SL += match_len(crowd[:, 0], length)
|
| 366 |
+
SR += match_len(crowd[:, 1], length)
|
| 367 |
|
| 368 |
+
# Final multichannel stack
|
|
|
|
|
|
|
| 369 |
multich = np.column_stack([out_L, out_R, out_C, out_LFE, SL, SR])
|
| 370 |
+
|
| 371 |
out_wav = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
|
| 372 |
sf.write(out_wav.name, multich, fs, subtype='FLOAT')
|
| 373 |
out_wav.close()
|
| 374 |
+
|
| 375 |
out_ogg = tempfile.NamedTemporaryFile(suffix='.ogg', delete=False)
|
| 376 |
subprocess.run([
|
| 377 |
"ffmpeg", "-y", "-i", out_wav.name,
|
|
|
|
| 381 |
|
| 382 |
return out_ogg.name
|
| 383 |
|
| 384 |
+
|
| 385 |
# ========== Gradio UI ==========
|
| 386 |
with gr.Blocks(title="Stereo to 5.1 Surround") as demo:
|
| 387 |
gr.Markdown("# 🎧 Stereo to 5.1 Converter")
|