ziqiangao committed on
Commit
ee5922f
·
verified ·
1 Parent(s): ee1df14

Change Smartmode Workflow

Browse files
Files changed (1) hide show
  1. app.py +69 -52
app.py CHANGED
@@ -251,14 +251,13 @@ def smart_mode_process(input_file, api_key, multi_singer=False):
251
  import shutil
252
 
253
  if not api_key:
254
- raise gr.Error("An MVSep API Key Is Required For This. Get your key <a href=\"https://mvsep.com/user-api\">Here</a>.")
255
 
256
  # Load original
257
  wav = convert_to_wav_float(input_file)
258
  data, fs = sf.read(wav, dtype='float32')
259
  os.unlink(wav)
260
- p((0,7), "Loading File")
261
- print("Loading File")
262
 
263
  if data.ndim != 2:
264
  raise gr.Error("Expected stereo input")
@@ -266,96 +265,113 @@ def smart_mode_process(input_file, api_key, multi_singer=False):
266
  stereo = np.column_stack([L, R])
267
 
268
  # Step 1: LFE from lowpass
269
- p((1,7), "Processing LFE")
270
- print("Processing LFE")
271
  bass = sox_filter(0.5 * (L + R), fs, 'lowpass', 120)
272
 
273
- # Step 2: Highpass for MVSep
274
- p((2,7), "Processing Speech, Music and SFX")
275
- print("Speech, Music, SFX")
276
  hp_left = sox_filter(L, fs, 'highpass', 120)
277
  hp_right = sox_filter(R, fs, 'highpass', 120)
278
  hp_stereo = np.column_stack([hp_left, hp_right])
279
- hp_buf = tempfile.NamedTemporaryFile(suffix=".flac", delete=False)
280
- sf.write(hp_buf.name, hp_stereo, fs, format='FLAC', subtype='PCM_16')
281
- hp_buf.close()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
282
 
283
- # Send to MVSep
284
  demucs_resp = send_mvsep_audio_job(
285
- api_key, open(hp_buf.name, 'rb').read(), os.path.basename(hp_buf.name), sep_type=24, output_format=2, addopt1=1
 
286
  )
287
- os.unlink(hp_buf.name)
288
 
289
- print(demucs_resp)
290
  dialog, _ = download_wav(demucs_resp['files'][0]['url'], target_fs=fs)
291
  sfx, _ = download_wav(demucs_resp['files'][2]['url'], target_fs=fs)
292
  music, _ = download_wav(demucs_resp['files'][1]['url'], target_fs=fs)
293
 
294
- # Step 3: Extract crowd
295
- p((3,7), "Extracting Crowd")
296
- print("Crowd")
297
  music_buf = tempfile.NamedTemporaryFile(suffix=".flac", delete=False)
298
  sf.write(music_buf.name, music, fs, format='FLAC', subtype='PCM_16')
299
  music_buf.close()
300
- crowd_resp = send_mvsep_audio_job(
301
- api_key, open(music_buf.name, 'rb').read(), os.path.basename(music_buf.name), sep_type=34, output_format=2, addopt1=1
302
- )
303
- os.unlink(music_buf.name)
304
- crowd, _ = download_wav(crowd_resp['files'][0]['url'], target_fs=fs)
305
- other, _ = download_wav(crowd_resp['files'][1]['url'], target_fs=fs)
306
- print(crowd_resp)
307
 
308
- # Step 4: Extract vocals
309
- p((4,7), "Extracting Vocals")
310
- print("Vocals")
311
- other_buf = tempfile.NamedTemporaryFile(suffix=".flac", delete=False)
312
- sf.write(other_buf.name, other, fs, format='FLAC', subtype='PCM_16')
313
- other_buf.close()
314
  karaoke_resp = send_mvsep_audio_job(
315
- api_key, open(other_buf.name, 'rb').read(), os.path.basename(other_buf.name), sep_type=49, output_format=2, addopt1=3, addopt2=1
 
316
  )
317
- os.unlink(other_buf.name)
 
318
  vocals_full, _ = download_wav(karaoke_resp['files'][0]['url'], target_fs=fs)
319
  vocals_lead, _ = download_wav(karaoke_resp['files'][1]['url'], target_fs=fs)
320
  vocals_back, _ = download_wav(karaoke_resp['files'][2]['url'], target_fs=fs)
321
  instr, _ = download_wav(karaoke_resp['files'][3]['url'], target_fs=fs)
322
- print(karaoke_resp)
323
 
324
- # Step 5: Phantom center for lead vocals
325
- p((5,7), "Distributing Front Vocal Channels")
326
- print("Front Vocal Channels")
327
  vl_buf = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
328
  sf.write(vl_buf.name, vocals_full if multi_singer else vocals_lead, fs, subtype='FLOAT')
329
  vl_buf.close()
 
330
  _, FL_vl, FR_vl, FC_vl = extract_phantom_center(vl_buf.name)
331
  os.unlink(vl_buf.name)
332
 
333
- # Step 6: Map channels and pad
334
- p((6,7), "Mapping Channels")
335
- print("Mapping")
336
  def match_len(x, length): return np.pad(x, (0, length - len(x)))
337
- lens = [len(FL_vl), len(FR_vl), len(FC_vl), len(bass), sfx.shape[0], crowd.shape[0], vocals_back.shape[0], instr.shape[0]]
338
  length = max(lens)
339
 
340
- out_L = match_len(FL_vl, length) + match_len(instr[:,0], length)
341
- out_R = match_len(FR_vl, length) + match_len(instr[:,1], length)
 
342
  out_C = match_len(FC_vl, length)
343
  out_LFE = match_len(bass, length)
344
- if multi_singer:
345
- SL = match_len(sfx[:,0], length) + match_len(crowd[:,0], length)
346
- SR = match_len(sfx[:,1], length) + match_len(crowd[:,1], length)
347
- else:
348
- SL = match_len(vocals_back[:,0], length) + match_len(sfx[:,0], length) + match_len(crowd[:,0], length)
349
- SR = match_len(vocals_back[:,1], length) + match_len(sfx[:,1], length) + match_len(crowd[:,1], length)
350
 
 
 
 
 
 
 
 
 
 
 
351
 
352
- # Step 7: Encode to 5.1 OGG
353
- p((7,7), "Processing Step 7, Encoding")
354
- print("Encoding")
355
  multich = np.column_stack([out_L, out_R, out_C, out_LFE, SL, SR])
 
356
  out_wav = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
357
  sf.write(out_wav.name, multich, fs, subtype='FLOAT')
358
  out_wav.close()
 
359
  out_ogg = tempfile.NamedTemporaryFile(suffix='.ogg', delete=False)
360
  subprocess.run([
361
  "ffmpeg", "-y", "-i", out_wav.name,
@@ -365,6 +381,7 @@ def smart_mode_process(input_file, api_key, multi_singer=False):
365
 
366
  return out_ogg.name
367
 
 
368
  # ========== Gradio UI ==========
369
  with gr.Blocks(title="Stereo to 5.1 Surround") as demo:
370
  gr.Markdown("# 🎧 Stereo to 5.1 Converter")
 
251
  import shutil
252
 
253
  if not api_key:
254
+ raise gr.Error("An MVSep API Key Is Required For This. Get your key <a href=\"https://mvsep.com/user-api\">Here</a>. it's Free!")
255
 
256
  # Load original
257
  wav = convert_to_wav_float(input_file)
258
  data, fs = sf.read(wav, dtype='float32')
259
  os.unlink(wav)
260
+ p((0, 8), "Loading File")
 
261
 
262
  if data.ndim != 2:
263
  raise gr.Error("Expected stereo input")
 
265
  stereo = np.column_stack([L, R])
266
 
267
  # Step 1: LFE from lowpass
268
+ p((1, 8), "Processing LFE")
 
269
  bass = sox_filter(0.5 * (L + R), fs, 'lowpass', 120)
270
 
271
+ # Step 2: Highpass for crowd extraction
272
+ p((2, 8), "Extracting Crowd")
 
273
  hp_left = sox_filter(L, fs, 'highpass', 120)
274
  hp_right = sox_filter(R, fs, 'highpass', 120)
275
  hp_stereo = np.column_stack([hp_left, hp_right])
276
+ music_buf = tempfile.NamedTemporaryFile(suffix=".flac", delete=False)
277
+ sf.write(music_buf.name, hp_stereo, fs, format='FLAC', subtype='PCM_16')
278
+ music_buf.close()
279
+
280
+ crowd_resp = send_mvsep_audio_job(
281
+ api_key, open(music_buf.name, 'rb').read(), os.path.basename(music_buf.name),
282
+ sep_type=34, output_format=2, addopt1=1
283
+ )
284
+ os.unlink(music_buf.name)
285
+ crowd, _ = download_wav(crowd_resp['files'][0]['url'], target_fs=fs)
286
+ other_after_crowd, _ = download_wav(crowd_resp['files'][1]['url'], target_fs=fs)
287
+
288
+ # Step 3: Reverb removal on "other" part
289
+ p((3, 8), "Removing Reverb")
290
+ other_buf = tempfile.NamedTemporaryFile(suffix=".flac", delete=False)
291
+ sf.write(other_buf.name, other_after_crowd, fs, format='FLAC', subtype='PCM_16')
292
+ other_buf.close()
293
+
294
+ reverb_resp = send_mvsep_audio_job(
295
+ api_key, open(other_buf.name, 'rb').read(), os.path.basename(other_buf.name),
296
+ sep_type=22, output_format=2, addopt1=2, addopt2=1
297
+ )
298
+ os.unlink(other_buf.name)
299
+
300
+ # Ignore first file (no reverb), use second for SL/SR
301
+ reverb, _ = download_wav(reverb_resp['files'][1]['url'], target_fs=fs)
302
+
303
+ # Step 4: Speech, music, SFX separation from 'other_after_crowd'
304
+ p((4, 8), "Separating Speech, Music, and SFX")
305
+ demucs_input_buf = tempfile.NamedTemporaryFile(suffix=".flac", delete=False)
306
+ sf.write(demucs_input_buf.name, other_after_crowd, fs, format='FLAC', subtype='PCM_16')
307
+ demucs_input_buf.close()
308
 
 
309
  demucs_resp = send_mvsep_audio_job(
310
+ api_key, open(demucs_input_buf.name, 'rb').read(), os.path.basename(demucs_input_buf.name),
311
+ sep_type=24, output_format=2, addopt1=1
312
  )
313
+ os.unlink(demucs_input_buf.name)
314
 
 
315
  dialog, _ = download_wav(demucs_resp['files'][0]['url'], target_fs=fs)
316
  sfx, _ = download_wav(demucs_resp['files'][2]['url'], target_fs=fs)
317
  music, _ = download_wav(demucs_resp['files'][1]['url'], target_fs=fs)
318
 
319
+ # Step 5: Vocal Extraction from music
320
+ p((5, 8), "Extracting Vocals")
 
321
  music_buf = tempfile.NamedTemporaryFile(suffix=".flac", delete=False)
322
  sf.write(music_buf.name, music, fs, format='FLAC', subtype='PCM_16')
323
  music_buf.close()
 
 
 
 
 
 
 
324
 
 
 
 
 
 
 
325
  karaoke_resp = send_mvsep_audio_job(
326
+ api_key, open(music_buf.name, 'rb').read(), os.path.basename(music_buf.name),
327
+ sep_type=49, output_format=2, addopt1=3, addopt2=1
328
  )
329
+ os.unlink(music_buf.name)
330
+
331
  vocals_full, _ = download_wav(karaoke_resp['files'][0]['url'], target_fs=fs)
332
  vocals_lead, _ = download_wav(karaoke_resp['files'][1]['url'], target_fs=fs)
333
  vocals_back, _ = download_wav(karaoke_resp['files'][2]['url'], target_fs=fs)
334
  instr, _ = download_wav(karaoke_resp['files'][3]['url'], target_fs=fs)
 
335
 
336
+ # Step 6: Phantom center on vocals (lead or full)
337
+ p((6, 8), "Phantom Center for Lead Vocals")
 
338
  vl_buf = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
339
  sf.write(vl_buf.name, vocals_full if multi_singer else vocals_lead, fs, subtype='FLOAT')
340
  vl_buf.close()
341
+
342
  _, FL_vl, FR_vl, FC_vl = extract_phantom_center(vl_buf.name)
343
  os.unlink(vl_buf.name)
344
 
345
+ # Step 7: Mapping and stacking
346
+ p((7, 8), "Mapping Channels and Encoding")
 
347
  def match_len(x, length): return np.pad(x, (0, length - len(x)))
348
+ lens = [len(FL_vl), len(FR_vl), len(FC_vl), len(bass), len(sfx), crowd.shape[0], vocals_back.shape[0], instr.shape[0], len(reverb)]
349
  length = max(lens)
350
 
351
+ # FL and FR: Lead vocals + SFX + instruments
352
+ out_L = match_len(FL_vl, length) + match_len(sfx[:, 0], length) + match_len(instr[:, 0], length)
353
+ out_R = match_len(FR_vl, length) + match_len(sfx[:, 1], length) + match_len(instr[:, 1], length)
354
  out_C = match_len(FC_vl, length)
355
  out_LFE = match_len(bass, length)
 
 
 
 
 
 
356
 
357
+ # SL/SR: Use reverb output
358
+ SL = match_len(reverb[:, 0], length)
359
+ SR = match_len(reverb[:, 1], length)
360
+
361
+ # Backing vocals are added to SL/SR only when multi_singer is off (with multi_singer the full vocal mix is already used for the front channels)
362
+ if not multi_singer:
363
+ SL += match_len(vocals_back[:, 0], length)
364
+ SR += match_len(vocals_back[:, 1], length)
365
+ SL += match_len(crowd[:, 0], length)
366
+ SR += match_len(crowd[:, 1], length)
367
 
368
+ # Final multichannel stack
 
 
369
  multich = np.column_stack([out_L, out_R, out_C, out_LFE, SL, SR])
370
+
371
  out_wav = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
372
  sf.write(out_wav.name, multich, fs, subtype='FLOAT')
373
  out_wav.close()
374
+
375
  out_ogg = tempfile.NamedTemporaryFile(suffix='.ogg', delete=False)
376
  subprocess.run([
377
  "ffmpeg", "-y", "-i", out_wav.name,
 
381
 
382
  return out_ogg.name
383
 
384
+
385
  # ========== Gradio UI ==========
386
  with gr.Blocks(title="Stereo to 5.1 Surround") as demo:
387
  gr.Markdown("# 🎧 Stereo to 5.1 Converter")