Spaces:
Running on CPU Upgrade
| import sonogram_utility as su | |
| from pyannote.audio import Pipeline | |
| import pickle | |
| import torch | |
class Sonogram():
    def __init__(self, enableDenoise=False):
        '''
        Initialize Sonogram Class

        Parameters
        ----------
        enableDenoise : bool
            Legacy flag to support denoise, which has currently been removed.
            Consider removing if denoise will not be reimplemented in the future.
        '''
        # TODO: Should these be adjustable via initialization, or constants?
        self.secondDifference = 5
        self.gainWindow = 4
        self.minimumGain = -45
        self.maximumGain = -5
        self.attenLimDB = 3
        self.earlyCleanup = True
        # TPU support is currently disabled. The original code forced a
        # RuntimeError to skip the (unreachable) xla lookup; re-enable by
        # restoring the line below and setting self.isTPU accordingly.
        # device = xm.xla_device()
        self.isTPU = False
        print("TPU is not available")
        # Fall back to GPU if present, otherwise CPU.
        self.isGPU = torch.cuda.is_available()
        if not self.isGPU:
            print("GPU is not available")
        self.device = torch.device("cuda" if self.isGPU else "cpu")
        print(f"Using {self.device} instead.")
        self.pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1")
        self.pipeline.to(self.device)
        # Load SVM classifier used by __call__ to group/relabel speakers.
        # NOTE(review): pickle.load is only safe on trusted local artifacts;
        # never unpickle untrusted input.
        with open('groupClassifier.pkl', 'rb') as f:
            self.groupClassifier = pickle.load(f)

    def processFile(self, filePath):
        '''
        Processes audio file to generate diarization output

        Parameters
        ----------
        filePath : string
            Path to the audio file

        Returns
        --------
        diarizationOutput : DiarizeOutput
            found here https://github.com/pyannote/pyannote-audio/blob/main/src/pyannote/audio/pipelines/speaker_diarization.py#L64
        totalTimeInSeconds : int
            Approximate total seconds of audio file
        waveformGainAdjusted : np.array
            The waveform of the audio file after equalization
        sampleRate : int
            The sample rate of the audio file
        '''
        print(f"Loading file : {filePath}")
        # Load in 600-second segments, presumably to bound peak memory — TODO confirm.
        waveformList, sampleRate = su.splitIntoTimeSegments(filePath, 600)
        print("File loaded")
        waveformEnhanced = su.combineWaveforms(waveformList)
        if self.earlyCleanup:
            # Release the segment list as early as possible to reduce peak memory.
            del waveformList
        print("Equalizing Audio")
        # NOTE(review): su.equalizeVolume() is invoked as a factory returning a
        # callable — confirm this double-call is the intended API and not a typo.
        # Fixed: result is now bound to the same name the method returns
        # (the original bound 'waveform_gain_adjusted' but returned the
        # undefined 'waveformGainAdjusted', a guaranteed NameError).
        waveformGainAdjusted = su.equalizeVolume()(waveformEnhanced, sampleRate, self.gainWindow, self.minimumGain, self.maximumGain)
        if self.earlyCleanup:
            del waveformEnhanced
        print("Audio Equalized")
        print("Detecting speakers")
        # Removed two dead assignments that read diarizationOutput attributes
        # into unused locals (one clobbered the unpacked 'embeddings').
        diarizationOutput, embeddings = self.pipeline({"waveform": waveformGainAdjusted, "sample_rate": sampleRate}, return_embeddings=True)
        print("Speakers Detected")
        totalTimeInSeconds = int(waveformGainAdjusted.shape[-1] / sampleRate)
        print("Time in seconds calculated")
        return diarizationOutput, totalTimeInSeconds, waveformGainAdjusted, sampleRate

    def __call__(self, audioPath):
        '''
        Processes audio file to generate results necessary for app

        Parameters
        ----------
        audioPath : string
            Path to the audio file

        Returns
        --------
        annotation : pyannote.core.annotation
            found here https://pyannote.github.io/pyannote-core/_modules/pyannote/core/annotation.html
        totalTimeInSeconds : int
            Approximate total seconds of audio file
        waveformGainAdjusted : np.array
            The waveform of the audio file after equalization
        sampleRate : int
            The sample rate of the audio file
        '''
        diarizationOutput, totalTimeInSeconds, waveformGainAdjusted, sampleRate = self.processFile(audioPath)
        annotation = diarizationOutput.speaker_diarization
        # Relabel any existing silence and group speakers using the SVM
        # classifier: prediction 0 -> "silence", 2 -> "group", else keep label.
        labelMapping = {}
        # Fixed: the original iterated 'output.speaker_diarization.labels()'
        # where 'output' was undefined (NameError); it clearly meant the
        # diarization result loaded above.
        for s, speaker in enumerate(annotation.labels()):
            prediction = self.groupClassifier.predict(diarizationOutput.speaker_embeddings[s].reshape(1, -1))
            if prediction == 0:
                labelMapping[speaker] = "silence"
            elif prediction == 2:
                labelMapping[speaker] = "group"
            else:
                # May not be necessary, consider using to reformat default names away from SPEAKER_XX
                labelMapping[speaker] = speaker
        # Fixed: rename_labels returns a relabeled *copy* by default (it does
        # not rename in place), so the result must be bound — the original
        # discarded it and returned the unrenamed annotation.
        annotation = annotation.rename_labels(labelMapping)
        return annotation, totalTimeInSeconds, waveformGainAdjusted, sampleRate