Spaces:
Build error
Build error
Update audio_processing.py
Browse files- audio_processing.py +56 -2
audio_processing.py
CHANGED
|
@@ -98,6 +98,51 @@ class AudioProcessor:
|
|
| 98 |
|
| 99 |
return translation
|
| 100 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
@spaces.GPU(duration=60)
|
| 102 |
def process_audio(self, audio_path, translate=False):
|
| 103 |
"""Main processing function"""
|
|
@@ -106,10 +151,19 @@ class AudioProcessor:
|
|
| 106 |
waveform, sample_rate = torchaudio.load(audio_path)
|
| 107 |
if waveform.shape[0] > 1:
|
| 108 |
waveform = torch.mean(waveform, dim=0)
|
| 109 |
-
|
|
|
|
|
|
|
| 110 |
# Resample if necessary
|
| 111 |
if sample_rate != self.sample_rate:
|
| 112 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
|
| 114 |
# Load models
|
| 115 |
models = self.load_models()
|
|
|
|
| 98 |
|
| 99 |
return translation
|
| 100 |
|
| 101 |
+
def preprocess_audio(self, audio):
    """
    Split a 1-D audio tensor into overlapping chunks with timing metadata.

    Each chunk is ``self.chunk_size`` seconds long; consecutive chunks share
    ``self.overlap`` seconds so transcriptions can be stitched without gaps.
    The first chunk is prepended with one second of silence (some ASR models
    clip the very start of an utterance otherwise).

    Parameters
    ----------
    audio : torch.Tensor
        Mono waveform, shape (num_samples,), sampled at ``self.sample_rate``.

    Returns
    -------
    list[dict]
        One dict per chunk with keys:
        - 'chunk': the (padded) sample tensor
        - 'start_time' / 'end_time': exact source span in seconds
        - 'transcribe_start' / 'transcribe_end': span widened by the overlap,
          used when aligning transcripts across chunk boundaries

    Raises
    ------
    ValueError
        If ``self.overlap >= self.chunk_size`` — the stride would be <= 0
        and the loop below would never terminate.
    """
    chunk_samples = int(self.chunk_size * self.sample_rate)
    overlap_samples = int(self.overlap * self.sample_rate)

    # Guard against a non-positive stride: with overlap >= chunk_size the
    # original loop advanced by <= 0 samples and hung forever.
    step = chunk_samples - overlap_samples
    if step <= 0:
        raise ValueError(
            f"overlap ({self.overlap}s) must be smaller than "
            f"chunk_size ({self.chunk_size}s)"
        )

    chunks_with_times = []
    start_idx = 0
    total_samples = len(audio)

    while start_idx < total_samples:
        end_idx = min(start_idx + chunk_samples, total_samples)

        if start_idx == 0:
            # Prepend 1 s of silence so the model does not clip the onset.
            chunk = audio[start_idx:end_idx]
            padding = torch.zeros(int(1 * self.sample_rate))
            chunk = torch.cat([padding, chunk])
        else:
            # Re-read the tail of the previous chunk for continuity.
            actual_start = max(0, start_idx - overlap_samples)
            chunk = audio[actual_start:end_idx]

        # Right-pad short (final) chunks up to the nominal chunk length.
        if len(chunk) < chunk_samples:
            chunk = torch.nn.functional.pad(chunk, (0, chunk_samples - len(chunk)))

        # Widen the transcription window by the overlap on both sides,
        # clamped to the physical extent of the audio.
        chunk_start_time = max(0, (start_idx / self.sample_rate) - self.overlap)
        chunk_end_time = min(
            (end_idx / self.sample_rate) + self.overlap,
            total_samples / self.sample_rate,
        )

        chunks_with_times.append({
            'chunk': chunk,
            'start_time': start_idx / self.sample_rate,
            'end_time': end_idx / self.sample_rate,
            'transcribe_start': chunk_start_time,
            'transcribe_end': chunk_end_time
        })

        # Advance by chunk length minus overlap for better continuity.
        start_idx += step

    return chunks_with_times
|
| 144 |
+
|
| 145 |
+
|
| 146 |
@spaces.GPU(duration=60)
|
| 147 |
def process_audio(self, audio_path, translate=False):
|
| 148 |
"""Main processing function"""
|
|
|
|
| 151 |
waveform, sample_rate = torchaudio.load(audio_path)
|
| 152 |
if waveform.shape[0] > 1:
|
| 153 |
waveform = torch.mean(waveform, dim=0)
|
| 154 |
+
else:
|
| 155 |
+
waveform = waveform.squeeze(0)
|
| 156 |
+
|
| 157 |
# Resample if necessary
|
| 158 |
if sample_rate != self.sample_rate:
|
| 159 |
+
resampler = torchaudio.transforms.Resample(
|
| 160 |
+
orig_freq=sample_rate,
|
| 161 |
+
new_freq=self.sample_rate
|
| 162 |
+
)
|
| 163 |
+
waveform = resampler(waveform)
|
| 164 |
+
|
| 165 |
+
# if sample_rate != self.sample_rate:
|
| 166 |
+
# waveform = torchaudio.transforms.Resample(sample_rate, self.sample_rate)(waveform)
|
| 167 |
|
| 168 |
# Load models
|
| 169 |
models = self.load_models()
|