File size: 1,346 Bytes
b3f89f5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import librosa
import numpy as np

class FeatureExtractor:
    """Compute a flat dictionary of handcrafted audio descriptors with librosa."""

    def extract(self, audio: np.ndarray, sr: int) -> dict:
        """
        Build the handcrafted feature set used for rule-based detection.
        Ported from AI-Generated-Voice-Detection reference.

        Args:
            audio: Waveform samples, handed to every librosa call as ``y``.
            sr: Handed to librosa as ``sr`` (presumably the sample rate in
                Hz -- confirm against the caller).

        Returns:
            A dict of Python floats, inserted in this order: ``pitch_mean``,
            ``pitch_std``, one ``mfcc_<k>`` entry per MFCC row (13
            coefficients are requested, numbered from 1),
            ``spectral_centroid_mean``, ``rms_std`` and ``zcr_mean``.
        """
        # Pitch: only strictly positive entries of the piptrack output count.
        pitch_track, _ = librosa.piptrack(y=audio, sr=sr)
        voiced = pitch_track[pitch_track > 0]
        if len(voiced) > 0:
            pitch_mean = float(np.mean(voiced))
            pitch_std = float(np.std(voiced))
        else:
            # Nothing positive was found: report zeros for both statistics.
            pitch_mean = pitch_std = 0.0
        result = {"pitch_mean": pitch_mean, "pitch_std": pitch_std}

        # MFCCs: average each requested coefficient along axis 1.
        coeff_matrix = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)
        result.update(
            (f"mfcc_{number}", float(average))
            for number, average in enumerate(np.mean(coeff_matrix, axis=1), start=1)
        )

        # Spectral centroid, averaged over everything librosa returns.
        centroid_track = librosa.feature.spectral_centroid(y=audio, sr=sr)
        result["spectral_centroid_mean"] = float(np.mean(centroid_track))

        # Energy variation: spread of the RMS values.
        energy_track = librosa.feature.rms(y=audio)
        result["rms_std"] = float(np.std(energy_track))

        # Zero crossing rate, averaged.
        crossing_track = librosa.feature.zero_crossing_rate(y=audio)
        result["zcr_mean"] = float(np.mean(crossing_track))

        return result