File size: 7,978 Bytes
312e168
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
import librosa
import numpy as np
import soundfile as sf
from typing import Union, Tuple, Optional
import logging
import os
from pathlib import Path

class AudioProcessor:
    """
    Audio processing utilities for speech-to-text preprocessing.
    Optimizes audio for better transcription accuracy.

    All public methods are best-effort: failures are logged or reported
    through the return value (fallback array, ``False``, ``[]`` or an
    ``"error"`` entry) instead of being raised to the caller.
    """

    # Number of zero samples returned when no usable audio can be produced.
    _FALLBACK_SAMPLES = 1024
    # Largest file size (in bytes) accepted by validate_audio.
    _MAX_FILE_BYTES = 100 * 1024 * 1024

    def __init__(self, target_sr: int = 16000):
        """
        Create a processor.

        Args:
            target_sr: Sample rate (Hz) that loaded audio is resampled to
        """
        self.target_sr = target_sr
        self.logger = logging.getLogger(__name__)

    def preprocess_audio(self, audio_input: Union[str, os.PathLike, np.ndarray],
                        normalize: bool = True,
                        trim_silence: bool = True,
                        noise_reduction: bool = False,
                        input_sr: Optional[int] = None) -> np.ndarray:
        """
        Preprocess audio for optimal speech recognition.

        Args:
            audio_input: Path to audio file (str or path-like) or numpy array
            normalize: Whether to normalize audio amplitude
            trim_silence: Whether to trim silence from beginning/end
            noise_reduction: Whether to apply basic noise reduction
            input_sr: Sample rate of ``audio_input`` when it is an array.
                ``None`` means the array is already at ``target_sr``.
                Ignored for file paths, which are resampled while loading.

        Returns:
            Preprocessed audio as numpy array. If processing fails, the
            original array is returned (when one was given and it is not
            empty); otherwise a short array of zeros is returned.
        """
        try:
            # Load audio if it's a file path
            if isinstance(audio_input, (str, os.PathLike)):
                audio, sr = librosa.load(os.fspath(audio_input), sr=self.target_sr)
            else:
                audio = audio_input
                sr = self.target_sr if input_sr is None else input_sr

            # Resample if needed (only possible for arrays with input_sr set)
            if sr != self.target_sr:
                audio = librosa.resample(audio, orig_sr=sr, target_sr=self.target_sr)

            # Normalize audio
            if normalize:
                audio = librosa.util.normalize(audio)

            # Noise reduction runs BEFORE trimming: the noise profile is
            # estimated from the leading frames, which must still contain
            # the background noise rather than the start of speech.
            if noise_reduction:
                audio = self._reduce_noise(audio)

            # Trim silence
            if trim_silence:
                audio, _ = librosa.effects.trim(audio, top_db=20)

            # Ensure audio is not empty
            if len(audio) == 0:
                self.logger.warning("Audio is empty after preprocessing")
                return np.zeros(self._FALLBACK_SAMPLES)  # Return minimal audio

            return audio

        except Exception as e:
            self.logger.error(f"Audio preprocessing error: {e}")
            # Return original audio when usable, otherwise minimal fallback,
            # so callers never receive an empty array.
            if isinstance(audio_input, np.ndarray) and audio_input.size > 0:
                return audio_input
            return np.zeros(self._FALLBACK_SAMPLES)

    def _reduce_noise(self, audio: np.ndarray, noise_factor: float = 0.1) -> np.ndarray:
        """
        Simple noise reduction using spectral subtraction.

        The noise spectrum is estimated as the mean magnitude of the first
        few STFT frames and a scaled copy of it is subtracted from every
        frame. The result is floored at 10% of the original magnitude.

        Args:
            audio: Input audio signal
            noise_factor: Factor for noise reduction (0.0 to 1.0)

        Returns:
            Noise-reduced audio with the same number of samples as the
            input, or the unmodified input if the reduction fails
        """
        try:
            # Compute STFT
            stft = librosa.stft(audio)
            magnitude = np.abs(stft)
            phase = np.angle(stft)

            total_frames = magnitude.shape[-1]
            if total_frames == 0:
                return audio

            # Estimate noise from first few frames. Always use at least one
            # frame: an empty slice would average to NaN and poison the
            # whole signal for very short clips.
            noise_frames = max(1, min(10, total_frames // 4))
            noise_profile = np.mean(magnitude[..., :noise_frames], axis=-1, keepdims=True)

            # Spectral subtraction, floored so no bin is removed entirely
            clean_magnitude = np.maximum(magnitude - noise_factor * noise_profile,
                                         0.1 * magnitude)

            # Reconstruct audio with the original phase; `length` keeps the
            # output exactly as long as the input.
            clean_stft = clean_magnitude * np.exp(1j * phase)
            return librosa.istft(clean_stft, length=audio.shape[-1])

        except Exception as e:
            self.logger.warning(f"Noise reduction failed: {e}")
            return audio

    def validate_audio(self, audio_path: str) -> Tuple[bool, str]:
        """
        Validate audio file for processing.

        Checks that the path is an existing regular file, that it is
        neither empty nor above the size limit, and that its first second
        can be decoded.

        Args:
            audio_path: Path to audio file

        Returns:
            Tuple of (is_valid, message)
        """
        try:
            if not os.path.exists(audio_path):
                return False, "Audio file does not exist"

            # Directories exist too, but can never be decoded as audio.
            if not os.path.isfile(audio_path):
                return False, "Audio path is not a file"

            # Check file size
            file_size = os.path.getsize(audio_path)
            if file_size == 0:
                return False, "Audio file is empty"

            if file_size > self._MAX_FILE_BYTES:
                limit_mb = self._MAX_FILE_BYTES // (1024 * 1024)
                return False, f"Audio file too large (>{limit_mb}MB)"

            # Try to load audio
            try:
                # Load first second at the native rate; only decodability
                # matters here, so resampling would be wasted work.
                audio, _ = librosa.load(audio_path, sr=None, duration=1.0)
                if len(audio) == 0:
                    return False, "Audio file contains no audio data"
            except Exception as e:
                return False, f"Cannot load audio file: {str(e)}"

            return True, "Audio file is valid"

        except Exception as e:
            return False, f"Audio validation error: {str(e)}"

    def get_audio_info(self, audio_path: str) -> dict:
        """
        Get information about audio file.

        Args:
            audio_path: Path to audio file

        Returns:
            Dictionary with the keys ``file_path``, ``file_size_mb``,
            ``duration_seconds``, ``sample_rate``, ``channels``,
            ``samples`` (per channel) and ``format`` (lower-cased file
            suffix). On failure a dictionary with a single ``error`` key
            is returned instead.
        """
        try:
            # Get file info
            file_size = os.path.getsize(audio_path)

            # Load at the native rate WITHOUT down-mixing; with librosa's
            # default mono=True the channel count would always be 1.
            audio, sr = librosa.load(audio_path, sr=None, mono=False)
            num_samples = audio.shape[-1]
            num_channels = 1 if audio.ndim == 1 else audio.shape[0]

            return {
                "file_path": audio_path,
                "file_size_mb": file_size / (1024 * 1024),
                "duration_seconds": num_samples / sr,
                "sample_rate": sr,
                "channels": num_channels,
                "samples": num_samples,
                "format": Path(audio_path).suffix.lower()
            }

        except Exception as e:
            return {
                "error": f"Cannot get audio info: {str(e)}"
            }

    def convert_audio_format(self, input_path: str, output_path: str,
                           target_format: str = "wav") -> bool:
        """
        Convert audio to different format.

        The audio is loaded through librosa at ``target_sr`` (librosa's
        default also down-mixes to mono) and written with soundfile.

        Args:
            input_path: Input audio file path
            output_path: Output audio file path
            target_format: Target format (wav, mp3, flac, etc.); a leading
                dot is accepted. NOTE(review): which formats can actually
                be written depends on the installed libsndfile - confirm.

        Returns:
            Success status
        """
        try:
            # Load audio
            audio, sr = librosa.load(input_path, sr=self.target_sr)

            # Save in target format (".wav" and "wav" are both accepted)
            sf.write(output_path, audio, sr, format=target_format.lstrip(".").upper())

            return True

        except Exception as e:
            self.logger.error(f"Audio conversion error: {e}")
            return False

    def split_audio(self, audio_path: Union[str, os.PathLike, np.ndarray],
                    chunk_duration: Union[int, float] = 30) -> list:
        """
        Split long audio into chunks for processing.

        Args:
            audio_path: Path to audio file, or an already loaded signal as
                numpy array (assumed to be sampled at ``target_sr``, with
                time on the last axis)
            chunk_duration: Duration of each chunk in seconds (may be
                fractional, must be positive)

        Returns:
            List of audio chunks as numpy arrays. Full-length chunks are
            always kept; a shorter trailing chunk is kept only when it is
            longer than 1 second. An empty list is returned on error.
        """
        try:
            # Load full audio (or use the array as given)
            if isinstance(audio_path, np.ndarray):
                audio, sr = audio_path, self.target_sr
            else:
                audio, sr = librosa.load(audio_path, sr=self.target_sr)

            # Calculate chunk size in samples; range() needs a positive int
            chunk_samples = int(round(chunk_duration * sr))
            if chunk_samples <= 0:
                raise ValueError("chunk_duration must be positive")

            # Split audio into chunks
            chunks = []
            for start in range(0, audio.shape[-1], chunk_samples):
                chunk = audio[..., start:start + chunk_samples]
                chunk_len = chunk.shape[-1]
                # The 1-second minimum is meant to drop tiny leftovers; it
                # must not discard full chunks when chunk_duration <= 1.
                if chunk_len == chunk_samples or chunk_len > sr:
                    chunks.append(chunk)

            return chunks

        except Exception as e:
            self.logger.error(f"Audio splitting error: {e}")
            return []