crackuser committed on
Commit
4ec8b62
·
verified ·
1 Parent(s): 02e1276

Delete audio_processor.py

Browse files
Files changed (1) hide show
  1. audio_processor.py +0 -226
audio_processor.py DELETED
@@ -1,226 +0,0 @@
1
- import numpy as np
2
- import librosa
3
- import soundfile as sf
4
- import noisereduce as nr
5
- from scipy import signal
6
- from scipy.signal import butter, filtfilt
7
- import tempfile
8
- import os
9
- from typing import Tuple, Optional
10
- import io
11
-
12
class AudioProcessor:
    """Audio clean-up, enhancement and conversion helpers for voice cloning.

    Every sample-rate dependent helper (filters, fades, noise reduction,
    pitch shifting, voice-activity detection) uses ``self.target_sr``, so
    audio should be resampled to that rate (see ``preprocess_audio``)
    before those helpers are called.
    """

    def __init__(self, target_sr: int = 22050):
        """Create a processor.

        Args:
            target_sr: Sample rate in Hz that all processing assumes.
                Defaults to 22050, the value that used to be hard-coded.
        """
        self.target_sr = target_sr

    def preprocess_audio(self, audio: np.ndarray, sr: int) -> np.ndarray:
        """Run the full preprocessing chain on raw audio.

        The steps are, in order: resample to ``self.target_sr`` (only when
        ``sr`` differs), RMS normalization, silence trimming, noise
        reduction and pre-emphasis.

        Args:
            audio: Input samples.
            sr: Sample rate of ``audio`` in Hz.

        Returns:
            The processed samples at ``self.target_sr``.
        """
        # Resample only when needed; resampling is the expensive step.
        if sr != self.target_sr:
            audio = librosa.resample(audio, orig_sr=sr, target_sr=self.target_sr)

        audio = self.normalize_audio(audio)
        audio = self.trim_silence(audio)
        audio = self.reduce_noise(audio)
        audio = self.apply_preemphasis(audio)

        return audio

    def normalize_audio(self, audio: np.ndarray, target_db: float = -20.0) -> np.ndarray:
        """Scale audio so that its RMS level matches ``target_db``.

        Args:
            audio: Input samples (presumably floats in [-1, 1]; the 0.95
                peak limit below only makes sense on that scale).
            target_db: Desired RMS level in dB, converted to a linear
                amplitude with ``10 ** (target_db / 20)``.

        Returns:
            The scaled samples. Empty or all-zero input is returned
            unchanged. If the scaled peak exceeds 0.95 the signal is
            scaled down again so the peak is exactly 0.95.
        """
        # np.max() raises on an empty array, so bail out early.
        if audio.size == 0:
            return audio

        rms = np.sqrt(np.mean(audio**2))

        # Silence (rms == 0) cannot be scaled to a target level.
        if rms > 0:
            target_rms = 10 ** (target_db / 20)
            audio = audio * (target_rms / rms)

        # Prevent clipping: cap the peak at 0.95.
        max_val = np.max(np.abs(audio))
        if max_val > 0.95:
            audio = audio * (0.95 / max_val)

        return audio

    def trim_silence(self, audio: np.ndarray, threshold_db: float = -40.0) -> np.ndarray:
        """Trim leading and trailing silence with ``librosa.effects.trim``.

        Args:
            audio: Input samples.
            threshold_db: Silence threshold in dB, expressed as a negative
                number. It is negated before being passed as ``top_db``.

        Returns:
            The trimmed samples.
        """
        trimmed_audio, _ = librosa.effects.trim(
            audio,
            top_db=-threshold_db,
            frame_length=2048,
            hop_length=512,
        )

        return trimmed_audio

    def reduce_noise(self, audio: np.ndarray) -> np.ndarray:
        """Reduce background noise.

        Uses ``noisereduce`` at ``self.target_sr``. If that call fails for
        any reason, falls back to an 80 Hz high-pass filter.

        Args:
            audio: Input samples.

        Returns:
            The denoised (or high-pass filtered) samples.
        """
        try:
            return nr.reduce_noise(y=audio, sr=self.target_sr)
        except Exception:
            # Deliberate best-effort fallback. ``Exception`` (not a bare
            # ``except``) so KeyboardInterrupt/SystemExit still propagate.
            return self.apply_highpass_filter(audio, cutoff=80)

    def apply_preemphasis(self, audio: np.ndarray, coeff: float = 0.97) -> np.ndarray:
        """Apply the pre-emphasis filter ``y[n] = x[n] - coeff * x[n-1]``."""
        return signal.lfilter([1, -coeff], [1], audio)

    def apply_deemphasis(self, audio: np.ndarray, coeff: float = 0.97) -> np.ndarray:
        """Apply the de-emphasis filter ``y[n] = x[n] + coeff * y[n-1]``.

        This is the inverse of ``apply_preemphasis`` for the same ``coeff``.
        """
        return signal.lfilter([1], [1, -coeff], audio)

    def _apply_butterworth(self, audio: np.ndarray, cutoff: float, btype: str) -> np.ndarray:
        """Run a 5th-order Butterworth filter forward and backward.

        ``cutoff`` is in Hz and is normalized by the Nyquist frequency
        derived from ``self.target_sr``.
        """
        nyquist = self.target_sr * 0.5
        normal_cutoff = cutoff / nyquist
        b, a = butter(5, normal_cutoff, btype=btype, analog=False)
        return filtfilt(b, a, audio)

    def apply_highpass_filter(self, audio: np.ndarray, cutoff: float = 80) -> np.ndarray:
        """Apply a 5th-order Butterworth high-pass filter at ``cutoff`` Hz."""
        return self._apply_butterworth(audio, cutoff, 'high')

    def apply_lowpass_filter(self, audio: np.ndarray, cutoff: float = 8000) -> np.ndarray:
        """Apply a 5th-order Butterworth low-pass filter at ``cutoff`` Hz."""
        return self._apply_butterworth(audio, cutoff, 'low')

    def apply_fade(self, audio: np.ndarray, fade_duration: float = 0.01) -> np.ndarray:
        """Apply a linear fade-in and fade-out.

        Args:
            audio: Input samples. The input array is not modified.
            fade_duration: Length of each fade in seconds.

        Returns:
            A faded copy of ``audio``. The input itself is returned
            unchanged when the fade is zero samples long or when the audio
            is not longer than both fades combined.
        """
        fade_samples = int(fade_duration * self.target_sr)

        # fade_samples == 0 must be skipped: ``audio[-0:]`` is the whole
        # array and multiplying it by an empty ramp fails to broadcast.
        if fade_samples <= 0 or len(audio) <= 2 * fade_samples:
            return audio

        # Work on a copy so the caller's buffer is left untouched.
        faded = audio.copy()
        faded[:fade_samples] *= np.linspace(0, 1, fade_samples)
        faded[-fade_samples:] *= np.linspace(1, 0, fade_samples)

        return faded

    def enhance_audio(self, audio: np.ndarray) -> np.ndarray:
        """Run the enhancement chain.

        The steps are, in order: noise reduction, compression, band
        limiting (``apply_eq_boost``), normalization and fade in/out.

        Args:
            audio: Input samples, assumed to be at ``self.target_sr``.

        Returns:
            The enhanced samples.
        """
        enhanced = self.reduce_noise(audio)
        enhanced = self.apply_compression(enhanced)
        enhanced = self.apply_eq_boost(enhanced)
        enhanced = self.normalize_audio(enhanced)
        enhanced = self.apply_fade(enhanced)

        return enhanced

    def apply_compression(self, audio: np.ndarray, threshold: float = 0.5, ratio: float = 4.0) -> np.ndarray:
        """Apply simple sample-wise dynamic range compression.

        Args:
            audio: Input samples. The input array is not modified.
            threshold: Absolute amplitude above which samples are compressed.
            ratio: Factor by which the excess over ``threshold`` is divided.

        Returns:
            A copy of ``audio`` in which every sample whose magnitude
            exceeds ``threshold`` keeps its sign and has its excess divided
            by ``ratio``.
        """
        compressed = audio.copy()

        above_threshold = np.abs(compressed) > threshold
        loud = compressed[above_threshold]
        compressed[above_threshold] = np.sign(loud) * (
            threshold + (np.abs(loud) - threshold) / ratio
        )

        return compressed

    def apply_eq_boost(self, audio: np.ndarray) -> np.ndarray:
        """Band-limit audio to the speech range.

        Despite the name, nothing is boosted. This is a simplified EQ that
        applies an 85 Hz high-pass followed by a 7500 Hz low-pass.
        """
        # Remove low-frequency noise.
        audio = self.apply_highpass_filter(audio, cutoff=85)

        # Gentle low-pass to tame harsh highs.
        audio = self.apply_lowpass_filter(audio, cutoff=7500)

        return audio

    def pitch_shift(self, audio: np.ndarray, semitones: float) -> np.ndarray:
        """Shift pitch by ``semitones`` using ``librosa.effects.pitch_shift``."""
        return librosa.effects.pitch_shift(audio, sr=self.target_sr, n_steps=semitones)

    def time_stretch(self, audio: np.ndarray, rate: float) -> np.ndarray:
        """Change playback speed by ``rate`` using ``librosa.effects.time_stretch``."""
        return librosa.effects.time_stretch(audio, rate=rate)

    def detect_voice_activity(self, audio: np.ndarray, frame_duration: float = 0.025) -> np.ndarray:
        """Detect voice activity with a simple energy threshold.

        The audio is cut into frames of ``frame_duration`` seconds with a
        hop of half a frame. A frame is marked active when its energy
        (sum of squared samples) exceeds 10% of the mean frame energy.

        Args:
            audio: Input samples, assumed to be at ``self.target_sr``.
            frame_duration: Frame length in seconds.

        Returns:
            A boolean array with one entry per frame. It is empty when the
            audio is shorter than one frame.

        Raises:
            ValueError: If ``frame_duration`` yields a frame shorter than
                one sample.
        """
        frame_length = int(frame_duration * self.target_sr)
        if frame_length < 1:
            raise ValueError("frame_duration is too short for the target sample rate")

        # A one-sample frame would otherwise give a zero hop, which
        # ``range`` rejects.
        hop_length = max(1, frame_length // 2)

        frame_starts = range(0, len(audio) - frame_length + 1, hop_length)
        energy = np.array(
            [np.sum(audio[start:start + frame_length] ** 2) for start in frame_starts]
        )

        # Avoid taking the mean of an empty array (NaN plus a warning).
        if energy.size == 0:
            return np.zeros(0, dtype=bool)

        threshold = np.mean(energy) * 0.1
        return energy > threshold

    @staticmethod
    def audio_to_bytes(audio: np.ndarray, sample_rate: int) -> bytes:
        """Encode an audio array as WAV file bytes.

        The audio is written to a temporary ``.wav`` file with
        ``soundfile`` and read back. The temporary file is removed even
        when writing or reading fails.

        Args:
            audio: Samples to encode.
            sample_rate: Sample rate in Hz to store in the file.

        Returns:
            The content of the WAV file.
        """
        # Only the name is needed. Close the handle right away so the path
        # can be reopened on every platform.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file:
            tmp_path = tmp_file.name

        try:
            sf.write(tmp_path, audio, sample_rate)
            with open(tmp_path, 'rb') as f:
                return f.read()
        finally:
            os.unlink(tmp_path)

    @staticmethod
    def bytes_to_audio(audio_bytes: bytes) -> Tuple[np.ndarray, int]:
        """Decode audio file bytes into an array.

        The bytes are written to a temporary ``.wav`` file and loaded with
        ``librosa.load(..., sr=None)``, so the native sample rate is kept.
        The temporary file is removed even when writing or loading fails.

        Args:
            audio_bytes: Content of an audio file.

        Returns:
            A ``(audio, sample_rate)`` tuple.
        """
        tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
        tmp_path = tmp_file.name

        try:
            # Leaving the ``with`` flushes and closes the file before it is
            # reopened by name.
            with tmp_file:
                tmp_file.write(audio_bytes)
            audio, sr = librosa.load(tmp_path, sr=None)
        finally:
            os.unlink(tmp_path)

        return audio, sr