Spaces:

anggars
/

tunebase

Sleeping

App Files Files Community

github-actions[bot] committed on Jan 26

Commit

61f32ab

1 Parent(s): 207cecb

Sync from GitHub: d66c6dbf67237f862d611254033ad1302afdafd1

Browse files

Files changed (1) hide show

services/audio_separator.py +67 -54

services/audio_separator.py CHANGED Viewed

@@ -142,69 +142,82 @@ class AudioSeparator:
              self._save_audio(source, sr, path)
              return {"guitar_rhythm": path, "guitar_lead": path}
-        # MID-SIDE PROCESSING
-        # Mid = (L + R) / 2 -> Center content (usually rhythm)
-        # Side = (L - R) / 2 -> Stereo difference (usually lead)
-        left = source[0:1, :]   # (1, samples)
-        right = source[1:2, :]  # (1, samples)
-        mid = (left + right) / 2.0    # Center content -> Rhythm
-        side = (left - right) / 2.0   # Stereo diff -> Lead
-        # Apply subtle frequency filtering for better separation
-        # Rhythm: Emphasize low-mid (100-2000Hz) for chunky rhythm tones
-        # Lead: Emphasize mid-high (800-8000Hz) for melodic clarity
-        try:
-            import scipy.signal as signal
-            # Design filters
-            nyquist = sr / 2
-            # Rhythm: Low-pass + slight presence boost (keep fundamentals)
-            rhythm_lowcut = 80 / nyquist
-            rhythm_highcut = 4000 / nyquist
-            b_rhythm, a_rhythm = signal.butter(4, [rhythm_lowcut, rhythm_highcut], btype='band')
-            # Lead: Band-pass for melodic range
-            lead_lowcut = 200 / nyquist
-            lead_highcut = 8000 / nyquist
-            b_lead, a_lead = signal.butter(4, [lead_lowcut, lead_highcut], btype='band')
-            # Apply filters
-            mid_filtered = signal.filtfilt(b_rhythm, a_rhythm, mid.numpy())
-            side_filtered = signal.filtfilt(b_lead, a_lead, side.numpy())
-            mid = torch.from_numpy(mid_filtered).float()
-            side = torch.from_numpy(side_filtered).float()
-        except Exception as e:
-            print(f"Warning: Frequency filtering failed ({e}), using raw Mid-Side")
-        # Make stereo for output (center both)
-        # SWAPPED: Side = Rhythm (strumming often panned wide), Mid = Lead (melody often center)
-        rhythm_stereo = torch.cat([side, side], dim=0)
-        lead_stereo = torch.cat([mid, mid], dim=0)
-        # If side is too quiet (song has no stereo separation), mix some mid into lead
-        side_rms = torch.sqrt(torch.mean(side ** 2))
-        mid_rms = torch.sqrt(torch.mean(mid ** 2))
-        if side_rms < mid_rms * 0.1:  # Side is <10% of mid -> almost mono mix
-            print("Notice: Audio appears to be mostly mono. Rhythm separation may be limited.")
-            # Create pseudo-separation using low frequencies for rhythm
             try:
-                rhythm_lowpass = 2000 / nyquist
-                b_lp, a_lp = signal.butter(4, rhythm_lowpass, btype='low')
-                rhythm_from_mid = signal.filtfilt(b_lp, a_lp, mid.numpy())
-                rhythm_stereo = torch.from_numpy(rhythm_from_mid).float()
                 rhythm_stereo = torch.cat([rhythm_stereo, rhythm_stereo], dim=0)
-            except:
-                pass
-        # Normalize to -3dB to prevent clipping
         def normalize(tensor):
             peak = tensor.abs().max()
             if peak > 0:
-                target_peak = 0.707  # -3dB
                 return tensor * (target_peak / peak)
             return tensor

              self._save_audio(source, sr, path)
              return {"guitar_rhythm": path, "guitar_lead": path}
+        # 1. Smart Spatial Split Check
+        # Calculate correlation between L and R to detect Hard Panning (Math Rock Style)
+        # If correlation is low, it means L and R are playing different things.
+        # We assume Left = Rhythm (often dropped D/lower), Right = Lead (often ornate/higher) - OR provide both as is.
+        # Calculate cross-correlation at lag 0
+        mean_l = left.mean()
+        mean_r = right.mean()
+        var_l = ((left - mean_l)**2).mean()
+        var_r = ((right - mean_r)**2).mean()
+        cov = ((left - mean_l) * (right - mean_r)).mean()
+        correlation = 0.0
+        if var_l > 0 and var_r > 0:
+            correlation = cov / torch.sqrt(var_l * var_r)
+        print(f"Guitar Stereo Correlation: {correlation:.4f}")
+        # Threshold for "Wide Stereo"
+        if abs(correlation) < 0.6:
+            print("Detected Wide Stereo Guitar (Math Rock Style). Using Spatial Split (L=Rhythm, R=Lead).")
+            # Force Hard Split
+            # Rhythm = Left Channel (Duplicated to Stereo)
+            # Lead = Right Channel (Duplicated to Stereo)
+            rhythm_stereo = torch.cat([left, left], dim=0)
+            lead_stereo = torch.cat([right, right], dim=0)
+        else:
+            print("Detected Narrow/Mono Guitar. Using Mid-Side Frequency Split.")
+            # Standard Mid-Side with Tighter Filters
+            mid = (left + right) / 2.0
+            side = (left - right) / 2.0
             try:
+                import scipy.signal as signal
+                nyquist = sr / 2
+                # Rhythm: Low-Mid focus (80-1200Hz) - tighter top end
+                # To distinguish from lead which often occupies 800+
+                rhythm_low = 80 / nyquist
+                rhythm_high = 1200 / nyquist
+                b_r, a_r = signal.butter(4, [rhythm_low, rhythm_high], btype='band')
+                # Lead: High-Mid focus (1000-8000Hz)
+                lead_low = 1000 / nyquist
+                lead_high = 8000 / nyquist
+                b_l, a_l = signal.butter(4, [lead_low, lead_high], btype='band')
+                # Apply to Mid (Center info usually has both, but we try to separate by freq)
+                # We interpret 'Mid' as the main source.
+                rhythm_from_mid = signal.filtfilt(b_r, a_r, mid.numpy())
+                lead_from_mid = signal.filtfilt(b_l, a_l, mid.numpy())
+                # Reconstruct
+                # If Side exists (some stereo), add it to Lead (often spatial effects are on lead)
+                side_np = side.numpy()
+                rhythm_final = rhythm_from_mid
+                lead_final = lead_from_mid + (side_np * 1.5) # Boost side for lead
+                rhythm_stereo = torch.from_numpy(rhythm_final).float()
                 rhythm_stereo = torch.cat([rhythm_stereo, rhythm_stereo], dim=0)
+                lead_stereo = torch.from_numpy(lead_final).float()
+                lead_stereo = torch.cat([lead_stereo, lead_stereo], dim=0)
+            except Exception as e:
+                print(f"Filter failed: {e}. Fallback to raw.")
+                rhythm_stereo = torch.cat([left, left], dim=0)
+                lead_stereo = torch.cat([right, right], dim=0)
+        # Normalize
         def normalize(tensor):
             peak = tensor.abs().max()
             if peak > 0:
+                target_peak = 0.89  # -1dB
                 return tensor * (target_peak / peak)
             return tensor