File size: 4,885 Bytes
ac9c2ba
 
 
be464b4
ac9c2ba
 
 
 
dd91a12
 
 
 
 
 
ac9c2ba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
be464b4
ac9c2ba
 
 
 
 
 
 
 
 
dd91a12
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ac9c2ba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8b1487a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ac9c2ba
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dd91a12
ac9c2ba
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import sonogram_utility as su
from pyannote.audio import Pipeline
import pickle
import torch

class Sonogram():
    '''
    Wraps the pyannote speaker-diarization pipeline plus a pickled SVM
    ("group classifier") to diarize an audio file and relabel detected
    speakers as "silence" / "group" / individual speakers.
    '''

    def __init__(self, enableDenoise=False):
        '''
        Initialize Sonogram Class

        enableDenoise : False|True
            Legacy code to support denoise, which has currently been removed.  Consider removing if denoise will not be reimplemented in the future.
        '''
        #TODO: Should these be adjustable via initialization, or constants?
        self.secondDifference = 5
        self.gainWindow = 4       # window passed to equalizeVolume — units presumably seconds; TODO confirm
        self.minimumGain = -45    # dB floor for equalization
        self.maximumGain = -5     # dB ceiling for equalization
        self.attenLimDB = 3
        self.earlyCleanup = True  # free intermediate waveforms early to reduce peak memory

        self.isTPU = False
        self.isGPU = False
        try:
            # HACK: deliberately raise so the TPU branch is skipped; restore the
            # xm.xla_device() call below to re-enable TPU detection.
            raise RuntimeError("Not an error")
            #device = xm.xla_device()
            print("TPU is available.")
            self.isTPU = True
        except RuntimeError as e:
            print(f"TPU is not available: {e}")
            # Fallback to CPU or other devices if needed
            self.isGPU = torch.cuda.is_available()
            if not self.isGPU:
                print("GPU is not available")
            self.device = torch.device("cuda" if self.isGPU else "cpu")
            print(f"Using {self.device} instead.")

        self.pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1")
        self.pipeline.to(self.device)

        # Load SVM classifier (used by __call__ to map speakers to silence/group)
        with open('groupClassifier.pkl', 'rb') as f:
            # NOTE(review): pickle.load on an untrusted file executes arbitrary
            # code — ensure groupClassifier.pkl comes from a trusted source.
            self.groupClassifier = pickle.load(f)

    def processFile(self, filePath):
        '''
        Processes audio file to generate diarization output

        filePath : string
            Path to the audio file

        Returns
        --------
        diarizationOutput : DiarizeOutput 
            found here https://github.com/pyannote/pyannote-audio/blob/main/src/pyannote/audio/pipelines/speaker_diarization.py#L64

        totalTimeInSeconds : int
            Approximate total seconds of audio file

        waveformGainAdjusted : np.array
            The waveform of the audio file after equalization

        sampleRate : int
            The sample rate of the audio file
        '''
        print(f"Loading file : {filePath}")
        # Split into 600-second segments, then recombine into one waveform.
        waveformList, sampleRate = su.splitIntoTimeSegments(filePath, 600)
        print("File loaded")
        waveformEnhanced = su.combineWaveforms(waveformList)
        if self.earlyCleanup:
            del waveformList
        print("Equalizing Audio")
        # NOTE(review): equalizeVolume() is called with no args and the result
        # is then called — presumably it returns a callable transform; confirm
        # against sonogram_utility.
        waveformGainAdjusted = su.equalizeVolume()(waveformEnhanced, sampleRate, self.gainWindow, self.minimumGain, self.maximumGain)
        if self.earlyCleanup:
            del waveformEnhanced
        print("Audio Equalized")
        print("Detecting speakers")
        # NOTE(review): with return_embeddings=True some pyannote versions
        # return (annotation, embeddings) while __call__ below reads
        # diarizationOutput.speaker_diarization / .speaker_embeddings
        # attributes — verify which API this pipeline version uses.
        diarizationOutput, embeddings = self.pipeline({"waveform": waveformGainAdjusted, "sample_rate": sampleRate}, return_embeddings=True)
        print("Speakers Detected")
        # Samples divided by sample rate, truncated to whole seconds.
        totalTimeInSeconds = int(waveformGainAdjusted.shape[-1] / sampleRate)
        print("Time in seconds calculated")
        # Fix: previously returned undefined name (waveformGainAdjusted vs
        # waveform_gain_adjusted), raising NameError on every call.
        return diarizationOutput, totalTimeInSeconds, waveformGainAdjusted, sampleRate

    def __call__(self, audioPath):
        '''
        Processes audio file to generate results necessary for app

        audioPath : string
            Path to the audio file

        Returns
        --------
        annotation : pyannote.core.annotation 
            found here https://pyannote.github.io/pyannote-core/_modules/pyannote/core/annotation.html

        totalTimeInSeconds : int
            Approximate total seconds of audio file

        waveformGainAdjusted : np.array
            The waveform of the audio file after equalization

        sampleRate : int
            The sample rate of the audio file
        '''
        diarizationOutput, totalTimeInSeconds, waveformGainAdjusted, sampleRate = self.processFile(audioPath)
        annotation = diarizationOutput.speaker_diarization

        # Relabel any existing silence and group speakers.
        # Classifier classes: 0 -> silence, 2 -> group, anything else -> keep label.
        labelMapping = {}
        # Fix: loop previously iterated over undefined name `output`.
        for s, speaker in enumerate(diarizationOutput.speaker_diarization.labels()):
            prediction = self.groupClassifier.predict(diarizationOutput.speaker_embeddings[s].reshape(1, -1))
            if prediction == 0:
                labelMapping[speaker] = "silence"
            elif prediction == 2:
                labelMapping[speaker] = "group"
            else:
                # May not be necessary, consider using to reformat default names away from SPEAKER_XX
                labelMapping[speaker] = speaker
        # Fix: rename_labels returns a relabeled copy by default (copy=True);
        # the original discarded the result, so labels were never renamed.
        annotation = annotation.rename_labels(labelMapping)
        return annotation, totalTimeInSeconds, waveformGainAdjusted, sampleRate