import sonogram_utility as su
from pyannote.audio import Pipeline
import pickle
import torch
class Sonogram():
    '''
    Speaker-diarization wrapper around the pyannote "speaker-diarization-3.1"
    pipeline.

    On construction it places the pipeline on GPU when available (CPU
    otherwise) and loads a pickled SVM classifier ('groupClassifier.pkl')
    that is used to relabel detected speakers as "silence" or "group".
    '''

    def __init__(self, enableDenoise=False):
        '''
        Initialize Sonogram Class

        enableDenoise : False|True
            Legacy code to support denoise, which has currently been removed.
            Consider removing if denoise will not be reimplemented in the future.
        '''
        # TODO: Should these be adjustable via initialization, or constants?
        self.secondDifference = 5
        self.gainWindow = 4        # window passed to su.equalizeVolume -- units assumed seconds, TODO confirm
        self.minimumGain = -45     # equalization floor (dB)
        self.maximumGain = -5      # equalization ceiling (dB)
        self.attenLimDB = 3
        self.earlyCleanup = True   # free intermediate waveforms as soon as possible
        # TPU support was removed together with the torch_xla import; the old
        # probe deliberately raised a fake RuntimeError to force this fallback.
        self.isTPU = False
        print("TPU is not available: torch_xla support has been removed")
        # Fall back to GPU when present, otherwise CPU.
        self.isGPU = torch.cuda.is_available()
        if not self.isGPU:
            print(f"GPU is not available")
        self.device = torch.device("cuda" if self.isGPU else "cpu")
        print(f"Using {self.device} instead.")
        self.pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1")
        self.pipeline.to(self.device)
        # Load SVM classifier used by __call__ to relabel speakers.
        # NOTE(review): pickle.load is unsafe on untrusted data; this assumes
        # groupClassifier.pkl ships with the app.
        with open('groupClassifier.pkl', 'rb') as f:
            self.groupClassifier = pickle.load(f)

    def processFile(self, filePath):
        '''
        Processes audio file to generate diarization output

        filePath : string
            Path to the audio file

        Returns
        --------
        diarizationOutput : DiarizeOutput
            found here https://github.com/pyannote/pyannote-audio/blob/main/src/pyannote/audio/pipelines/speaker_diarization.py#L64
        totalTimeInSeconds : int
            Approximate total seconds of audio file
        waveformGainAdjusted : np.array
            The waveform of the audio file after equalization
        sampleRate : int
            The sample rate of the audio file
        '''
        print(f"Loading file : {filePath}")
        # Load in 600-second segments to bound peak memory on long files.
        waveformList, sampleRate = su.splitIntoTimeSegments(filePath, 600)
        print("File loaded")
        waveformEnhanced = su.combineWaveforms(waveformList)
        if self.earlyCleanup:
            del waveformList
        print("Equalizing Audio")
        # NOTE(review): su.equalizeVolume() is invoked with no arguments and
        # the result is then called -- assumed to be a factory returning a
        # callable; confirm against sonogram_utility.
        waveformGainAdjusted = su.equalizeVolume()(waveformEnhanced, sampleRate, self.gainWindow, self.minimumGain, self.maximumGain)
        if self.earlyCleanup:
            del waveformEnhanced
        print("Audio Equalized")
        print("Detecting speakers")
        # BUG FIX: the result was previously unpacked into two values and then
        # ALSO treated as an object with .speaker_diarization /
        # .speaker_embeddings attributes. __call__ consumes the attribute
        # style, so keep the single output object.
        diarizationOutput = self.pipeline(
            {"waveform": waveformGainAdjusted, "sample_rate": sampleRate},
            return_embeddings=True)
        print("Speakers Detected")
        totalTimeInSeconds = int(waveformGainAdjusted.shape[-1] / sampleRate)
        print("Time in seconds calculated")
        # BUG FIX: previously returned the undefined name 'waveformGainAdjusted'
        # while the local was 'waveform_gain_adjusted' (NameError).
        return diarizationOutput, totalTimeInSeconds, waveformGainAdjusted, sampleRate

    def __call__(self, audioPath):
        '''
        Processes audio file to generate results necessary for app

        audioPath : string
            Path to the audio file

        Returns
        --------
        annotation : pyannote.core.annotation
            found here https://pyannote.github.io/pyannote-core/_modules/pyannote/core/annotation.html
        totalTimeInSeconds : int
            Approximate total seconds of audio file
        waveformGainAdjusted : np.array
            The waveform of the audio file after equalization
        sampleRate : int
            The sample rate of the audio file
        '''
        diarizationOutput, totalTimeInSeconds, waveformGainAdjusted, sampleRate = self.processFile(audioPath)
        annotation = diarizationOutput.speaker_diarization
        # Relabel any existing silence and group speakers:
        # classifier prediction 0 -> "silence", 2 -> "group",
        # anything else keeps its original label.
        labelMapping = {}
        # BUG FIX: the loop previously iterated 'output', an undefined name,
        # and contained a no-op bare attribute access.
        for s, speaker in enumerate(annotation.labels()):
            prediction = self.groupClassifier.predict(
                diarizationOutput.speaker_embeddings[s].reshape(1, -1))
            if prediction == 0:
                labelMapping[speaker] = "silence"
            elif prediction == 2:
                labelMapping[speaker] = "group"
            else:
                # May not be necessary, consider using to reformat default names away from SPEAKER_XX
                labelMapping[speaker] = speaker
        # BUG FIX: rename_labels returns a renamed COPY by default; the
        # original discarded the return value, so labels were never renamed.
        annotation = annotation.rename_labels(labelMapping)
        return annotation, totalTimeInSeconds, waveformGainAdjusted, sampleRate