crackuser committed on
Commit
0d1b7fe
·
verified ·
1 Parent(s): 7fb0a37

Create audio_processor.py

Browse files
Files changed (1) hide show
  1. audio_processor.py +226 -0
audio_processor.py ADDED
@@ -0,0 +1,226 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import librosa
3
+ import soundfile as sf
4
+ import noisereduce as nr
5
+ from scipy import signal
6
+ from scipy.signal import butter, filtfilt
7
+ import tempfile
8
+ import os
9
+ from typing import Tuple, Optional
10
+ import io
11
+
12
class AudioProcessor:
    """Audio clean-up and enhancement helpers for voice cloning.

    All sample-rate dependent methods (filters, fades, noise reduction,
    pitch shifting, voice-activity detection) assume the signal is sampled
    at ``self.target_sr``.
    """

    def __init__(self, target_sr: int = 22050):
        """Create a processor.

        Args:
            target_sr: Sample rate in Hz that ``preprocess_audio`` resamples
                to and that every other method assumes. Defaults to 22050.
        """
        self.target_sr = target_sr

    def preprocess_audio(self, audio: np.ndarray, sr: int) -> np.ndarray:
        """Run the full preprocessing chain on a signal.

        The steps, in order, are: resample to ``self.target_sr`` (only when
        ``sr`` differs), RMS normalization, silence trimming, noise
        reduction and pre-emphasis.

        Args:
            audio: Input samples.
            sr: Sample rate of ``audio`` in Hz.

        Returns:
            The processed samples at ``self.target_sr``.
        """
        if sr != self.target_sr:
            audio = librosa.resample(audio, orig_sr=sr, target_sr=self.target_sr)

        audio = self.normalize_audio(audio)
        audio = self.trim_silence(audio)
        audio = self.reduce_noise(audio)
        audio = self.apply_preemphasis(audio)

        return audio

    def normalize_audio(self, audio: np.ndarray, target_db: float = -20.0) -> np.ndarray:
        """Scale audio so that its RMS level matches ``target_db``.

        Args:
            audio: Input samples.
            target_db: Desired RMS level in dB relative to 1.0.

        Returns:
            The scaled samples. Empty and all-zero inputs are returned
            unscaled. If the scaled peak exceeds 0.95 the signal is scaled
            down again so the peak sits at 0.95.
        """
        # np.max() raises on an empty array, so there is nothing to do here.
        if audio.size == 0:
            return audio

        rms = np.sqrt(np.mean(audio**2))

        if rms > 0:
            # Convert the dB target to a linear amplitude.
            target_rms = 10**(target_db / 20)
            audio = audio * (target_rms / rms)

        # Prevent clipping.
        max_val = np.max(np.abs(audio))
        if max_val > 0.95:
            audio = audio * (0.95 / max_val)

        return audio

    def trim_silence(self, audio: np.ndarray, threshold_db: float = -40.0) -> np.ndarray:
        """Trim leading and trailing silence.

        Args:
            audio: Input samples.
            threshold_db: Silence threshold as a negative dB value; it is
                negated and passed to ``librosa.effects.trim`` as ``top_db``.

        Returns:
            The trimmed samples.
        """
        trimmed_audio, _ = librosa.effects.trim(
            audio,
            top_db=-threshold_db,
            frame_length=2048,
            hop_length=512,
        )

        return trimmed_audio

    def reduce_noise(self, audio: np.ndarray) -> np.ndarray:
        """Apply noise reduction, falling back to a high-pass filter.

        Args:
            audio: Input samples at ``self.target_sr``.

        Returns:
            The output of ``noisereduce.reduce_noise``; if that call raises,
            the result of an 80 Hz high-pass filter instead.
        """
        try:
            return nr.reduce_noise(y=audio, sr=self.target_sr)
        except Exception:
            # Deliberate best-effort fallback. Only Exception is caught so
            # KeyboardInterrupt / SystemExit still propagate.
            return self.apply_highpass_filter(audio, cutoff=80)

    def apply_preemphasis(self, audio: np.ndarray, coeff: float = 0.97) -> np.ndarray:
        """Apply the pre-emphasis filter ``y[n] = x[n] - coeff * x[n-1]``."""
        return signal.lfilter([1, -coeff], [1], audio)

    def apply_deemphasis(self, audio: np.ndarray, coeff: float = 0.97) -> np.ndarray:
        """Apply the de-emphasis filter ``y[n] = x[n] + coeff * y[n-1]``.

        This is the inverse of ``apply_preemphasis`` for the same ``coeff``.
        """
        return signal.lfilter([1], [1, -coeff], audio)

    def _butter_filter(self, audio: np.ndarray, cutoff: float, btype: str) -> np.ndarray:
        """Run a zero-phase 5th-order Butterworth filter of the given type."""
        nyquist = self.target_sr * 0.5
        normal_cutoff = cutoff / nyquist
        b, a = butter(5, normal_cutoff, btype=btype, analog=False)
        return filtfilt(b, a, audio)

    def apply_highpass_filter(self, audio: np.ndarray, cutoff: float = 80) -> np.ndarray:
        """Apply a zero-phase 5th-order Butterworth high-pass filter.

        Args:
            audio: Input samples at ``self.target_sr``.
            cutoff: Cutoff frequency in Hz.

        Returns:
            The filtered samples.
        """
        return self._butter_filter(audio, cutoff, 'high')

    def apply_lowpass_filter(self, audio: np.ndarray, cutoff: float = 8000) -> np.ndarray:
        """Apply a zero-phase 5th-order Butterworth low-pass filter.

        Args:
            audio: Input samples at ``self.target_sr``.
            cutoff: Cutoff frequency in Hz.

        Returns:
            The filtered samples.
        """
        return self._butter_filter(audio, cutoff, 'low')

    def apply_fade(self, audio: np.ndarray, fade_duration: float = 0.01) -> np.ndarray:
        """Apply a linear fade-in and fade-out.

        The input array is not modified; a faded copy is returned.

        Args:
            audio: Input samples at ``self.target_sr``.
            fade_duration: Length of each fade in seconds.

        Returns:
            A copy of ``audio`` with both ends faded. The copy is left
            unfaded when the fade rounds to zero samples or when the signal
            is not longer than the two fades combined.
        """
        fade_samples = int(fade_duration * self.target_sr)

        # Work on a floating-point copy: the caller's buffer stays intact
        # and integer input no longer fails on the in-place multiply.
        faded = audio.astype(np.result_type(audio.dtype, np.float32), copy=True)

        # fade_samples == 0 must be skipped: faded[-0:] is the WHOLE array,
        # which cannot be multiplied by an empty ramp.
        if fade_samples > 0 and len(faded) > 2 * fade_samples:
            faded[:fade_samples] *= np.linspace(0, 1, fade_samples)
            faded[-fade_samples:] *= np.linspace(1, 0, fade_samples)

        return faded

    def enhance_audio(self, audio: np.ndarray) -> np.ndarray:
        """Enhance audio quality.

        The steps, in order, are: noise reduction, compression, band
        limiting (``apply_eq_boost``), RMS normalization and a short fade.

        Args:
            audio: Input samples at ``self.target_sr``.

        Returns:
            The enhanced samples.
        """
        enhanced = self.reduce_noise(audio)
        enhanced = self.apply_compression(enhanced)
        enhanced = self.apply_eq_boost(enhanced)
        enhanced = self.normalize_audio(enhanced)
        enhanced = self.apply_fade(enhanced)

        return enhanced

    def apply_compression(self, audio: np.ndarray, threshold: float = 0.5, ratio: float = 4.0) -> np.ndarray:
        """Apply simple per-sample dynamic range compression.

        Samples whose magnitude exceeds ``threshold`` keep their sign, and
        the part of the magnitude above the threshold is divided by
        ``ratio``. There is no attack or release smoothing.

        Args:
            audio: Input samples.
            threshold: Linear amplitude above which compression starts.
            ratio: Compression ratio applied to the excess above threshold.

        Returns:
            A compressed copy of ``audio``.
        """
        compressed = audio.copy()

        above_threshold = np.abs(compressed) > threshold
        loud = compressed[above_threshold]
        compressed[above_threshold] = np.sign(loud) * (
            threshold + (np.abs(loud) - threshold) / ratio
        )

        return compressed

    def apply_eq_boost(self, audio: np.ndarray) -> np.ndarray:
        """Band-limit the signal for vocal clarity.

        Despite the name, nothing is boosted: this simplified EQ applies an
        85 Hz high-pass filter followed by a 7500 Hz low-pass filter.

        Args:
            audio: Input samples at ``self.target_sr``.

        Returns:
            The band-limited samples.
        """
        # Remove low-frequency noise.
        audio = self.apply_highpass_filter(audio, cutoff=85)

        # Gentle low-pass to prevent harsh highs.
        audio = self.apply_lowpass_filter(audio, cutoff=7500)

        return audio

    def pitch_shift(self, audio: np.ndarray, semitones: float) -> np.ndarray:
        """Shift pitch by ``semitones`` using ``librosa.effects.pitch_shift``."""
        return librosa.effects.pitch_shift(audio, sr=self.target_sr, n_steps=semitones)

    def time_stretch(self, audio: np.ndarray, rate: float) -> np.ndarray:
        """Change speed by ``rate`` using ``librosa.effects.time_stretch``."""
        return librosa.effects.time_stretch(audio, rate=rate)

    def detect_voice_activity(self, audio: np.ndarray, frame_duration: float = 0.025) -> np.ndarray:
        """Detect voice activity with an energy threshold.

        The signal is cut into frames with 50% overlap. A frame counts as
        active when its energy (sum of squared samples) exceeds 10% of the
        mean frame energy.

        Args:
            audio: Input samples at ``self.target_sr``.
            frame_duration: Frame length in seconds.

        Returns:
            A boolean array with one entry per complete frame. It is empty
            when ``audio`` is shorter than one frame.

        Raises:
            ValueError: If ``frame_duration`` is shorter than one sample.
        """
        frame_length = int(frame_duration * self.target_sr)
        if frame_length < 1:
            raise ValueError("frame_duration is shorter than one sample")
        # A one-sample frame would otherwise give a hop of 0, which range()
        # rejects.
        hop_length = max(1, frame_length // 2)

        energy = np.array([
            np.sum(audio[start:start + frame_length] ** 2)
            for start in range(0, len(audio) - frame_length + 1, hop_length)
        ])

        # No complete frame: avoid taking the mean of an empty array.
        if energy.size == 0:
            return np.zeros(0, dtype=bool)

        # Simple threshold-based VAD.
        threshold = np.mean(energy) * 0.1
        return energy > threshold

    @staticmethod
    def audio_to_bytes(audio: np.ndarray, sample_rate: int) -> bytes:
        """Encode an audio array as WAV bytes for streaming.

        The file is encoded in memory, so no temporary file is created.

        Args:
            audio: Samples to encode.
            sample_rate: Sample rate in Hz written to the WAV header.

        Returns:
            The complete WAV file contents.
        """
        buffer = io.BytesIO()
        # A file-like target has no extension, so the format must be given.
        sf.write(buffer, audio, sample_rate, format='WAV')
        return buffer.getvalue()

    @staticmethod
    def bytes_to_audio(audio_bytes: bytes) -> Tuple[np.ndarray, int]:
        """Decode audio file bytes into a sample array.

        Args:
            audio_bytes: Contents of an audio file.

        Returns:
            A ``(samples, sample_rate)`` tuple as returned by
            ``librosa.load`` with ``sr=None`` (no resampling).
        """
        tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
        try:
            # Close the handle before loading so the path can be reopened
            # on platforms that forbid opening an open temp file twice.
            with tmp_file:
                tmp_file.write(audio_bytes)

            audio, sr = librosa.load(tmp_file.name, sr=None)
        finally:
            # Always remove the temp file, even when decoding fails.
            os.unlink(tmp_file.name)

        return audio, sr