File size: 4,885 Bytes
ac9c2ba
 
 
be464b4
ac9c2ba
 
 
 
dd91a12
 
 
 
 
 
ac9c2ba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
be464b4
ac9c2ba
 
 
 
 
 
 
 
 
dd91a12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ac9c2ba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8b1487a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ac9c2ba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dd91a12
ac9c2ba
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import sonogram_utility as su
from pyannote.audio import Pipeline
import pickle
import torch

class Sonogram():
    '''
    Wraps the pyannote speaker-diarization pipeline plus a pickled SVM
    ("group classifier") to diarize an audio file and relabel detected
    speakers as "silence" / "group" / individual speakers.
    '''

    def __init__(self, enableDenoise=False):
        '''
        Initialize Sonogram Class

        enableDenoise : False|True
            Legacy code to support denoise, which has currently been removed.  Consider removing if denoise will not be reimplemented in the future.
        '''
        #TODO: Should these be adjustable via initialization, or constants?
        self.secondDifference = 5
        self.gainWindow = 4       # window passed to equalizeVolume — units presumably seconds; TODO confirm
        self.minimumGain = -45    # dB floor for equalization
        self.maximumGain = -5     # dB ceiling for equalization
        self.attenLimDB = 3
        self.earlyCleanup = True  # free intermediate waveforms early to reduce peak memory

        self.isTPU = False
        self.isGPU = False
        try:
            # HACK: deliberately raise so the TPU branch is skipped; restore the
            # xm.xla_device() call below to re-enable TPU detection.
            raise RuntimeError("Not an error")
            #device = xm.xla_device()
            print("TPU is available.")
            self.isTPU = True
        except RuntimeError as e:
            print(f"TPU is not available: {e}")
            # Fallback to CPU or other devices if needed
            self.isGPU = torch.cuda.is_available()
            if not self.isGPU:
                print("GPU is not available")
            self.device = torch.device("cuda" if self.isGPU else "cpu")
            print(f"Using {self.device} instead.")

        self.pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1")
        self.pipeline.to(self.device)

        # Load SVM classifier (used by __call__ to map speakers to silence/group)
        with open('groupClassifier.pkl', 'rb') as f:
            # NOTE(review): pickle.load on an untrusted file executes arbitrary
            # code — ensure groupClassifier.pkl comes from a trusted source.
            self.groupClassifier = pickle.load(f)

    def processFile(self, filePath):
        '''
        Processes audio file to generate diarization output

        filePath : string
            Path to the audio file

        Returns
        --------
        diarizationOutput : DiarizeOutput 
            found here https://github.com/pyannote/pyannote-audio/blob/main/src/pyannote/audio/pipelines/speaker_diarization.py#L64

        totalTimeInSeconds : int
            Approximate total seconds of audio file

        waveformGainAdjusted : np.array
            The waveform of the audio file after equalization

        sampleRate : int
            The sample rate of the audio file
        '''
        print(f"Loading file : {filePath}")
        # Split into 600-second segments, then recombine into one waveform.
        waveformList, sampleRate = su.splitIntoTimeSegments(filePath, 600)
        print("File loaded")
        waveformEnhanced = su.combineWaveforms(waveformList)
        if self.earlyCleanup:
            del waveformList
        print("Equalizing Audio")
        # NOTE(review): equalizeVolume() is called with no args and the result
        # is then called — presumably it returns a callable transform; confirm
        # against sonogram_utility.
        waveformGainAdjusted = su.equalizeVolume()(waveformEnhanced, sampleRate, self.gainWindow, self.minimumGain, self.maximumGain)
        if self.earlyCleanup:
            del waveformEnhanced
        print("Audio Equalized")
        print("Detecting speakers")
        # NOTE(review): with return_embeddings=True some pyannote versions
        # return (annotation, embeddings) while __call__ below reads
        # diarizationOutput.speaker_diarization / .speaker_embeddings
        # attributes — verify which API this pipeline version uses.
        diarizationOutput, embeddings = self.pipeline({"waveform": waveformGainAdjusted, "sample_rate": sampleRate}, return_embeddings=True)
        print("Speakers Detected")
        # Samples divided by sample rate, truncated to whole seconds.
        totalTimeInSeconds = int(waveformGainAdjusted.shape[-1] / sampleRate)
        print("Time in seconds calculated")
        # Fix: previously returned undefined name (waveformGainAdjusted vs
        # waveform_gain_adjusted), raising NameError on every call.
        return diarizationOutput, totalTimeInSeconds, waveformGainAdjusted, sampleRate

    def __call__(self, audioPath):
        '''
        Processes audio file to generate results necessary for app

        audioPath : string
            Path to the audio file

        Returns
        --------
        annotation : pyannote.core.annotation 
            found here https://pyannote.github.io/pyannote-core/_modules/pyannote/core/annotation.html

        totalTimeInSeconds : int
            Approximate total seconds of audio file

        waveformGainAdjusted : np.array
            The waveform of the audio file after equalization

        sampleRate : int
            The sample rate of the audio file
        '''
        diarizationOutput, totalTimeInSeconds, waveformGainAdjusted, sampleRate = self.processFile(audioPath)
        annotation = diarizationOutput.speaker_diarization

        # Relabel any existing silence and group speakers.
        # Classifier classes: 0 -> silence, 2 -> group, anything else -> keep label.
        labelMapping = {}
        # Fix: loop previously iterated over undefined name `output`.
        for s, speaker in enumerate(diarizationOutput.speaker_diarization.labels()):
            prediction = self.groupClassifier.predict(diarizationOutput.speaker_embeddings[s].reshape(1, -1))
            if prediction == 0:
                labelMapping[speaker] = "silence"
            elif prediction == 2:
                labelMapping[speaker] = "group"
            else:
                # May not be necessary, consider using to reformat default names away from SPEAKER_XX
                labelMapping[speaker] = speaker
        # Fix: rename_labels returns a relabeled copy by default (copy=True);
        # the original discarded the result, so labels were never renamed.
        annotation = annotation.rename_labels(labelMapping)
        return annotation, totalTimeInSeconds, waveformGainAdjusted, sampleRate