github-actions[bot] committed on
Commit ·
61f32ab
1
Parent(s): 207cecb
Sync from GitHub: d66c6dbf67237f862d611254033ad1302afdafd1
Browse files- services/audio_separator.py +67 -54
services/audio_separator.py
CHANGED
|
@@ -142,69 +142,82 @@ class AudioSeparator:
|
|
| 142 |
self._save_audio(source, sr, path)
|
| 143 |
return {"guitar_rhythm": path, "guitar_lead": path}
|
| 144 |
|
| 145 |
-
#
|
| 146 |
-
#
|
| 147 |
-
#
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
|
|
|
| 159 |
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
|
|
|
| 172 |
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
side_filtered = signal.filtfilt(b_lead, a_lead, side.numpy())
|
| 176 |
|
| 177 |
-
mid = torch.from_numpy(mid_filtered).float()
|
| 178 |
-
side = torch.from_numpy(side_filtered).float()
|
| 179 |
-
except Exception as e:
|
| 180 |
-
print(f"Warning: Frequency filtering failed ({e}), using raw Mid-Side")
|
| 181 |
-
|
| 182 |
-
# Make stereo for output (center both)
|
| 183 |
-
# SWAPPED: Side = Rhythm (strumming often panned wide), Mid = Lead (melody often center)
|
| 184 |
-
rhythm_stereo = torch.cat([side, side], dim=0)
|
| 185 |
-
lead_stereo = torch.cat([mid, mid], dim=0)
|
| 186 |
-
|
| 187 |
-
# If side is too quiet (song has no stereo separation), mix some mid into lead
|
| 188 |
-
side_rms = torch.sqrt(torch.mean(side ** 2))
|
| 189 |
-
mid_rms = torch.sqrt(torch.mean(mid ** 2))
|
| 190 |
-
|
| 191 |
-
if side_rms < mid_rms * 0.1: # Side is <10% of mid -> almost mono mix
|
| 192 |
-
print("Notice: Audio appears to be mostly mono. Rhythm separation may be limited.")
|
| 193 |
-
# Create pseudo-separation using low frequencies for rhythm
|
| 194 |
try:
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 199 |
rhythm_stereo = torch.cat([rhythm_stereo, rhythm_stereo], dim=0)
|
| 200 |
-
|
| 201 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 202 |
|
| 203 |
-
# Normalize
|
| 204 |
def normalize(tensor):
|
| 205 |
peak = tensor.abs().max()
|
| 206 |
if peak > 0:
|
| 207 |
-
target_peak = 0.
|
| 208 |
return tensor * (target_peak / peak)
|
| 209 |
return tensor
|
| 210 |
|
|
|
|
| 142 |
self._save_audio(source, sr, path)
|
| 143 |
return {"guitar_rhythm": path, "guitar_lead": path}
|
| 144 |
|
| 145 |
+
# 1. Smart Spatial Split Check
|
| 146 |
+
# Calculate correlation between L and R to detect Hard Panning (Math Rock Style)
|
| 147 |
+
# If correlation is low, it means L and R are playing different things.
|
| 148 |
+
# We assume Left = Rhythm (often dropped D/lower), Right = Lead (often ornate/higher) - OR provide both as is.
|
| 149 |
+
|
| 150 |
+
# Calculate the normalized cross-correlation (Pearson coefficient) between L and R at lag 0
|
| 151 |
+
mean_l = left.mean()
|
| 152 |
+
mean_r = right.mean()
|
| 153 |
+
var_l = ((left - mean_l)**2).mean()
|
| 154 |
+
var_r = ((right - mean_r)**2).mean()
|
| 155 |
+
cov = ((left - mean_l) * (right - mean_r)).mean()
|
| 156 |
+
|
| 157 |
+
correlation = 0.0
|
| 158 |
+
if var_l > 0 and var_r > 0:
|
| 159 |
+
correlation = cov / torch.sqrt(var_l * var_r)
|
| 160 |
|
| 161 |
+
print(f"Guitar Stereo Correlation: {correlation:.4f}")
|
| 162 |
+
|
| 163 |
+
# Threshold for "Wide Stereo"
|
| 164 |
+
if abs(correlation) < 0.6:
|
| 165 |
+
print("Detected Wide Stereo Guitar (Math Rock Style). Using Spatial Split (L=Rhythm, R=Lead).")
|
| 166 |
+
# Force Hard Split
|
| 167 |
+
# Rhythm = Left Channel (Duplicated to Stereo)
|
| 168 |
+
# Lead = Right Channel (Duplicated to Stereo)
|
| 169 |
+
rhythm_stereo = torch.cat([left, left], dim=0)
|
| 170 |
+
lead_stereo = torch.cat([right, right], dim=0)
|
| 171 |
+
else:
|
| 172 |
+
print("Detected Narrow/Mono Guitar. Using Mid-Side Frequency Split.")
|
| 173 |
+
# Standard Mid-Side with Tighter Filters
|
| 174 |
|
| 175 |
+
mid = (left + right) / 2.0
|
| 176 |
+
side = (left - right) / 2.0
|
|
|
|
| 177 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 178 |
try:
|
| 179 |
+
import scipy.signal as signal
|
| 180 |
+
nyquist = sr / 2
|
| 181 |
+
|
| 182 |
+
# Rhythm: Low-Mid focus (80-1200Hz) - tighter top end
|
| 183 |
+
# To distinguish from lead which often occupies 800+
|
| 184 |
+
rhythm_low = 80 / nyquist
|
| 185 |
+
rhythm_high = 1200 / nyquist
|
| 186 |
+
b_r, a_r = signal.butter(4, [rhythm_low, rhythm_high], btype='band')
|
| 187 |
+
|
| 188 |
+
# Lead: High-Mid focus (1000-8000Hz)
|
| 189 |
+
lead_low = 1000 / nyquist
|
| 190 |
+
lead_high = 8000 / nyquist
|
| 191 |
+
b_l, a_l = signal.butter(4, [lead_low, lead_high], btype='band')
|
| 192 |
+
|
| 193 |
+
# Apply to Mid (Center info usually has both, but we try to separate by freq)
|
| 194 |
+
# We interpret 'Mid' as the main source.
|
| 195 |
+
rhythm_from_mid = signal.filtfilt(b_r, a_r, mid.numpy())
|
| 196 |
+
lead_from_mid = signal.filtfilt(b_l, a_l, mid.numpy())
|
| 197 |
+
|
| 198 |
+
# Reconstruct
|
| 199 |
+
# If Side exists (some stereo), add it to Lead (often spatial effects are on lead)
|
| 200 |
+
side_np = side.numpy()
|
| 201 |
+
|
| 202 |
+
rhythm_final = rhythm_from_mid
|
| 203 |
+
lead_final = lead_from_mid + (side_np * 1.5) # Boost side for lead
|
| 204 |
+
|
| 205 |
+
rhythm_stereo = torch.from_numpy(rhythm_final).float()
|
| 206 |
rhythm_stereo = torch.cat([rhythm_stereo, rhythm_stereo], dim=0)
|
| 207 |
+
|
| 208 |
+
lead_stereo = torch.from_numpy(lead_final).float()
|
| 209 |
+
lead_stereo = torch.cat([lead_stereo, lead_stereo], dim=0)
|
| 210 |
+
|
| 211 |
+
except Exception as e:
|
| 212 |
+
print(f"Filter failed: {e}. Fallback to raw.")
|
| 213 |
+
rhythm_stereo = torch.cat([left, left], dim=0)
|
| 214 |
+
lead_stereo = torch.cat([right, right], dim=0)
|
| 215 |
|
| 216 |
+
# Normalize
|
| 217 |
def normalize(tensor):
|
| 218 |
peak = tensor.abs().max()
|
| 219 |
if peak > 0:
|
| 220 |
+
target_peak = 0.89 # -1dB
|
| 221 |
return tensor * (target_peak / peak)
|
| 222 |
return tensor
|
| 223 |
|