aryan12345ark committed on
Commit
e051bf8
·
verified ·
1 Parent(s): 0001b53

Create fix_audio_processor.py

Browse files
Files changed (1) hide show
  1. fix_audio_processor.py +356 -0
fix_audio_processor.py ADDED
@@ -0,0 +1,356 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
#!/usr/bin/env python3
"""
fix_audio_processor.py

Regenerates two modules of the voice-detection engine so that base64
audio payloads with missing padding (or embedded line breaks) decode
correctly:

  * app/preprocessing/audio_processor.py
  * app/core/detector.py

Run this from the root of your voice-detection-engine folder; both paths
are resolved relative to the current working directory and any existing
files at those paths are overwritten.
"""

import os

# Full source of app/preprocessing/audio_processor.py, written verbatim below.
content = '''"""
Voice Detection Engine - Audio Processor

Handles Base64 decoding, format conversion, resampling.
"""

import io
import logging
import base64
from typing import Optional

import numpy as np
import librosa
import soundfile as sf
from pydub import AudioSegment

from app.config import settings

logger = logging.getLogger("engine.audio_processor")


class AudioProcessor:
    """
    Process audio from Base64 to normalized numpy array.
    """

    def __init__(self):
        self.target_sr = settings.TARGET_SAMPLE_RATE
        self.max_seconds = settings.MAX_AUDIO_SECONDS
        # int() keeps the value usable as a slice bound even when the
        # configured maximum duration is fractional.
        self.max_samples = int(self.target_sr * self.max_seconds)

    def decode_base64(self, audio_base64: str) -> bytes:
        """
        Decode base64 string to bytes with padding fix.
        """
        # Remove ALL whitespace, not just leading/trailing: line breaks
        # inside a wrapped payload would otherwise be counted in the
        # padding calculation below and produce the wrong number of "=".
        audio_base64 = "".join(audio_base64.split())

        # Remove data URL prefix if present
        if "," in audio_base64:
            audio_base64 = audio_base64.split(",", 1)[1]

        # Fix padding - base64 must be divisible by 4
        missing_padding = len(audio_base64) % 4
        if missing_padding:
            audio_base64 += "=" * (4 - missing_padding)

        # Decode
        return base64.b64decode(audio_base64)

    def process(self, audio_bytes: bytes) -> np.ndarray:
        """
        Process raw audio bytes to normalized numpy array.
        """
        logger.debug(f"Processing audio: {len(audio_bytes)} bytes")

        audio_array = None

        # Method 1: Try pydub
        try:
            audio_array = self._decode_with_pydub(audio_bytes)
            logger.debug("Decoded with pydub")
        except Exception as e:
            logger.debug(f"Pydub failed: {e}")

        # Method 2: Try soundfile
        if audio_array is None:
            try:
                audio_array = self._decode_with_soundfile(audio_bytes)
                logger.debug("Decoded with soundfile")
            except Exception as e:
                logger.debug(f"Soundfile failed: {e}")

        # Method 3: Try librosa
        if audio_array is None:
            try:
                audio_array = self._decode_with_librosa(audio_bytes)
                logger.debug("Decoded with librosa")
            except Exception as e:
                logger.debug(f"Librosa failed: {e}")

        if audio_array is None:
            raise ValueError("Failed to decode audio with any method")

        # Ensure mono
        if len(audio_array.shape) > 1:
            audio_array = np.mean(audio_array, axis=1)

        # Ensure float32
        audio_array = audio_array.astype(np.float32)

        # Normalize to [-1, 1]
        max_val = np.abs(audio_array).max()
        if max_val > 0:
            audio_array = audio_array / max_val

        # Trim to max duration
        if len(audio_array) > self.max_samples:
            audio_array = audio_array[:self.max_samples]

        logger.debug(f"Processed: {len(audio_array)} samples, {len(audio_array)/self.target_sr:.2f}s")

        return audio_array

    def _decode_with_pydub(self, audio_bytes: bytes) -> np.ndarray:
        audio_io = io.BytesIO(audio_bytes)
        audio_segment = AudioSegment.from_file(audio_io)
        audio_segment = audio_segment.set_channels(1)
        audio_segment = audio_segment.set_frame_rate(self.target_sr)
        samples = np.array(audio_segment.get_array_of_samples())
        sample_width = audio_segment.sample_width
        if sample_width == 2:
            samples = samples.astype(np.float32) / 32768.0
        elif sample_width == 4:
            samples = samples.astype(np.float32) / 2147483648.0
        else:
            samples = samples.astype(np.float32) / 128.0
        return samples

    def _decode_with_soundfile(self, audio_bytes: bytes) -> np.ndarray:
        audio_io = io.BytesIO(audio_bytes)
        audio_array, sr = sf.read(audio_io)
        # soundfile returns multi-channel audio as (frames, channels).
        # Mix down before resampling: librosa.resample works along the
        # last axis by default and would otherwise resample across the
        # channels instead of across time.
        if audio_array.ndim > 1:
            audio_array = np.mean(audio_array, axis=1)
        if sr != self.target_sr:
            audio_array = librosa.resample(audio_array, orig_sr=sr, target_sr=self.target_sr)
        return audio_array

    def _decode_with_librosa(self, audio_bytes: bytes) -> np.ndarray:
        audio_io = io.BytesIO(audio_bytes)
        audio_array, sr = librosa.load(audio_io, sr=self.target_sr, mono=True)
        return audio_array
'''

# Write file (newline="\n" keeps LF endings regardless of platform)
filepath = "app/preprocessing/audio_processor.py"
os.makedirs(os.path.dirname(filepath), exist_ok=True)

with open(filepath, "w", encoding="utf-8", newline="\n") as f:
    f.write(content)

print(f"[OK] Updated {filepath}")
print()
print("Now update the detector to use the new decode method...")

# Also update detector.py so it calls AudioProcessor.decode_base64()
detector_content = '''"""
Voice Detection Engine - Main Detector
"""

import logging
from typing import Dict, Any, List, Tuple
from dataclasses import dataclass

import numpy as np

from app.config import settings
from app.preprocessing.audio_processor import AudioProcessor
from app.models.embeddings import EmbeddingExtractor
from app.features.acoustic import AcousticFeatureExtractor

logger = logging.getLogger("engine.detector")


@dataclass
class RuleHit:
    name: str
    delta: float
    detail: str


class VoiceDetector:
    def __init__(self):
        logger.info("Initializing VoiceDetector...")
        self.audio_processor = AudioProcessor()
        self.embedding_extractor = EmbeddingExtractor()
        self.acoustic_extractor = AcousticFeatureExtractor()
        logger.info("VoiceDetector initialized")

    def warmup(self):
        logger.info("Warming up detector...")
        dummy_audio = np.zeros(settings.TARGET_SAMPLE_RATE, dtype=np.float32)
        self.embedding_extractor.warmup(dummy_audio)
        self.acoustic_extractor.extract(dummy_audio, settings.TARGET_SAMPLE_RATE)
        logger.info("Detector warmup complete")

    def analyze(self, audio_base64: str, language: str, request_id: str = "") -> Dict[str, Any]:
        logger.info(f"[{request_id}] Starting analysis for language: {language}")

        # Decode and Process Audio
        try:
            # Use the new decode method with padding fix
            audio_bytes = self.audio_processor.decode_base64(audio_base64)
            audio_array = self.audio_processor.process(audio_bytes)
            duration = len(audio_array) / settings.TARGET_SAMPLE_RATE

            logger.info(f"[{request_id}] Audio duration: {duration:.2f}s")

            if duration < settings.MIN_AUDIO_SECONDS:
                logger.warning(f"[{request_id}] Audio too short: {duration:.2f}s")
                return {
                    "classification": "HUMAN",
                    "confidence": 0.50,
                    "explanation": "Audio too short for reliable analysis."
                }

        except Exception as e:
            logger.error(f"[{request_id}] Audio processing failed: {e}")
            return {
                "classification": "HUMAN",
                "confidence": 0.50,
                "explanation": f"Audio processing failed: {str(e)[:100]}"
            }

        # Extract Features
        try:
            acoustic_features = self.acoustic_extractor.extract(audio_array, settings.TARGET_SAMPLE_RATE)
            embedding_features = self.embedding_extractor.extract(audio_array)
        except Exception as e:
            logger.error(f"[{request_id}] Feature extraction failed: {e}")
            return {
                "classification": "HUMAN",
                "confidence": 0.50,
                "explanation": "Feature extraction failed."
            }

        # Apply Heuristics
        score, rule_hits = self._apply_heuristics(acoustic_features, embedding_features, duration, request_id)

        # Determine Classification
        if score > 0.5:
            classification = "AI_GENERATED"
        else:
            classification = "HUMAN"

        confidence = abs(score - 0.5) * 2
        confidence = max(0.0, min(1.0, confidence))

        explanation = self._generate_explanation(classification, rule_hits, acoustic_features, embedding_features)

        logger.info(f"[{request_id}] Result: {classification} (score={score:.3f}, confidence={confidence:.3f})")

        return {
            "classification": classification,
            "confidence": round(confidence, 4),
            "explanation": explanation
        }

    def _apply_heuristics(self, acoustic: Dict, embeddings: Dict, duration: float, request_id: str) -> Tuple[float, List[RuleHit]]:
        score = 0.5
        rule_hits = []
        inc = settings.SCORE_INCREMENT
        dec = settings.SCORE_DECREMENT

        # Pitch Analysis
        pitch_std = acoustic.get("pitch_std", 30.0)
        pitch_range = acoustic.get("pitch_range", 80.0)

        if pitch_std < settings.PITCH_STD_LOW:
            score += inc
            rule_hits.append(RuleHit("low_pitch_std", inc, f"pitch_std={pitch_std:.1f}Hz"))
        elif pitch_std > settings.PITCH_STD_HIGH:
            score -= dec
            rule_hits.append(RuleHit("high_pitch_std", -dec, f"pitch_std={pitch_std:.1f}Hz"))

        if pitch_range < settings.PITCH_RANGE_LOW:
            score += inc
            rule_hits.append(RuleHit("low_pitch_range", inc, f"pitch_range={pitch_range:.1f}Hz"))
        elif pitch_range > settings.PITCH_RANGE_HIGH:
            score -= dec
            rule_hits.append(RuleHit("high_pitch_range", -dec, f"pitch_range={pitch_range:.1f}Hz"))

        # Jitter
        jitter = acoustic.get("jitter", 0.020)
        if jitter < settings.JITTER_LOW:
            score += inc
            rule_hits.append(RuleHit("low_jitter", inc, f"jitter={jitter:.4f}"))
        elif jitter > settings.JITTER_HIGH:
            score -= dec
            rule_hits.append(RuleHit("high_jitter", -dec, f"jitter={jitter:.4f}"))

        # Shimmer
        shimmer = acoustic.get("shimmer", 0.040)
        if shimmer < settings.SHIMMER_LOW:
            score += inc
            rule_hits.append(RuleHit("low_shimmer", inc, f"shimmer={shimmer:.4f}"))
        elif shimmer > settings.SHIMMER_HIGH:
            score -= dec
            rule_hits.append(RuleHit("high_shimmer", -dec, f"shimmer={shimmer:.4f}"))

        # Embedding variability
        wav2vec_var = embeddings.get("wav2vec_var_ratio", 0.50)
        whisper_var = embeddings.get("whisper_var_ratio", 0.50)

        if wav2vec_var < settings.EMBEDDING_VAR_LOW:
            score += inc
            rule_hits.append(RuleHit("low_wav2vec_var", inc, f"wav2vec_var={wav2vec_var:.3f}"))
        elif wav2vec_var > settings.EMBEDDING_VAR_HIGH:
            score -= dec
            rule_hits.append(RuleHit("high_wav2vec_var", -dec, f"wav2vec_var={wav2vec_var:.3f}"))

        if whisper_var < settings.EMBEDDING_VAR_LOW:
            score += inc
            rule_hits.append(RuleHit("low_whisper_var", inc, f"whisper_var={whisper_var:.3f}"))
        elif whisper_var > settings.EMBEDDING_VAR_HIGH:
            score -= dec
            rule_hits.append(RuleHit("high_whisper_var", -dec, f"whisper_var={whisper_var:.3f}"))

        score = max(0.0, min(1.0, score))
        return score, rule_hits

    def _generate_explanation(self, classification: str, rule_hits: List[RuleHit], acoustic: Dict, embeddings: Dict) -> str:
        if not rule_hits:
            if classification == "AI_GENERATED":
                return "Audio characteristics suggest synthetic generation."
            else:
                return "Audio characteristics suggest natural human speech."

        sorted_hits = sorted(rule_hits, key=lambda x: abs(x.delta), reverse=True)

        if classification == "AI_GENERATED":
            relevant = [h for h in sorted_hits if h.delta > 0]
            prefix = "Synthetic indicators"
        else:
            relevant = [h for h in sorted_hits if h.delta < 0]
            prefix = "Human speech indicators"

        if not relevant:
            relevant = sorted_hits[:3]

        details = [h.detail for h in relevant[:3]]
        return f"{prefix}: {'; '.join(details)}."
'''

filepath2 = "app/core/detector.py"
os.makedirs(os.path.dirname(filepath2), exist_ok=True)

with open(filepath2, "w", encoding="utf-8", newline="\n") as f:
    f.write(detector_content)

print(f"[OK] Updated {filepath2}")
print()
print("=" * 50)
print("Now push to HuggingFace:")
print(" git add .")
print(' git commit -m "Fix base64 padding issue"')
print(" git push")
print("=" * 50)