Spaces:
Running on CPU Upgrade
Running on CPU Upgrade
Reformat sonogram.py and create sonogram class
Browse files. Replaced the unused sonogram file with a Sonogram class. This should be used to help clean up the app.py file and decouple the code.
- sonogram +0 -96
- sonogram.py +77 -0
sonogram
DELETED
|
@@ -1,96 +0,0 @@
|
|
| 1 |
-
import cv2
|
| 2 |
-
import random
|
| 3 |
-
import copy
|
| 4 |
-
from pyannote.core import Annotation, Segment
|
| 5 |
-
|
| 6 |
-
def colors(n):
    '''
    Create a list of n visually distinctive color tuples.

    Walks the OpenCV hue wheel (range [0, 180)) in equal steps from a random
    starting hue, with fixed saturation and value of 200, so successive colors
    are spread as far apart in hue as possible.

    Args:
        n: number of colors to generate (0 or negative yields an empty list).

    Returns:
        List of n 3-tuples of floats in [0, 1].
        NOTE(review): the component order is BGR (OpenCV convention), not RGB
        -- confirm callers (e.g. matplotlib) do not expect RGB order.
    '''
    # Bug fix: `np` was used below but numpy was never imported by this file,
    # so every call with n > 0 raised NameError. Imported locally to keep the
    # fix self-contained.
    import numpy as np

    if n == 0:
        return []
    ret = []
    h = int(random.random() * 180)  # random start hue so palettes differ per run
    step = 180 / n                  # equal hue spacing across the wheel
    for i in range(n):
        h += step
        h = int(h) % 180            # OpenCV 8-bit hue range is [0, 180)
        hsv = np.uint8([[[h, 200, 200]]])
        bgr = cv2.cvtColor(hsv, cv2.COLOR_HSV2BGR)
        # Scale the 8-bit channels to [0, 1] floats.
        ret.append((bgr[0][0][0].item() / 255,
                    bgr[0][0][1].item() / 255,
                    bgr[0][0][2].item() / 255))
    return ret
|
| 22 |
-
|
| 23 |
-
def extendSpeakers(mySpeakerList, fileLabel = 'NONE', maximumSecondDifference = 1, minimumSecondDuration = 0):
    '''
    Assumes mySpeakerList is already split into Speaker/Audience

    Merge each speaker's (start, duration) sections: consecutive sections whose
    silent gap is at most maximumSecondDifference seconds are fused into one
    section, and merged sections shorter than minimumSecondDuration seconds
    are dropped.

    Args:
        mySpeakerList: two lists of (start, duration) tuples in seconds
            (index 0 = speaker/lecturer, index 1 = audience).
        fileLabel: uri recorded on the returned pyannote Annotation.
        maximumSecondDifference: largest gap (seconds) bridged by a merge.
        minimumSecondDuration: merged sections shorter than this are discarded.

    Returns:
        (newSpeakerList, mySpeakerAnnotations): the merged two-entry list and
        the same sections as a pyannote Annotation labeled by list index (0/1).
    '''
    mySpeakerAnnotations = Annotation(uri=fileLabel)
    # Output is fixed at two speaker groups; extra input lists would IndexError.
    newSpeakerList = [[],[]]
    for i, speaker in enumerate(mySpeakerList):
        # Sort chronologically so gaps can be measured between neighbors.
        speaker.sort()
        lastEnd = -1          # end time of the run being extended; -1 = no run yet
        tempSection = None    # (start, duration) accumulator for the current run
        for section in speaker:
            if lastEnd == -1:
                # First section for this speaker: start a new merge run.
                tempSection = copy.deepcopy(section)
                lastEnd = section[0] + section[1]
            else:
                if section[0] - lastEnd <= maximumSecondDifference:
                    # Gap small enough: extend the run to cover this section;
                    # max() keeps the later of the two candidate end times.
                    tempSection = (tempSection[0],max(section[0] + section[1] - tempSection[0],tempSection[1]))
                    lastEnd = tempSection[0] + tempSection[1]
                else:
                    # Gap too large: flush the finished run (if long enough)...
                    if tempSection[1] >= minimumSecondDuration:
                        newSpeakerList[i].append(tempSection)
                        mySpeakerAnnotations[Segment(tempSection[0],lastEnd)] = i
                    # ...then start a new run at this section.
                    tempSection = copy.deepcopy(section)
                    lastEnd = section[0] + section[1]
        if tempSection is not None:
            # Add the last section back in
            if tempSection[1] >= minimumSecondDuration:
                newSpeakerList[i].append(tempSection)
                mySpeakerAnnotations[Segment(tempSection[0],lastEnd)] = i
    return newSpeakerList,mySpeakerAnnotations
|
| 53 |
-
|
| 54 |
-
def twoClassExtendAnnotation(myAnnotation, maximumSecondDifference=1, minimumSecondDuration=0):
    '''
    Collapse a multi-speaker annotation into two classes and merge sections.

    The label with the most supporting segments is treated as the lecturer;
    every other label is pooled into the audience class. Both classes are then
    passed through extendSpeakers for gap-merging and short-section filtering.

    Args:
        myAnnotation: pyannote Annotation to collapse.
        maximumSecondDifference: forwarded to extendSpeakers.
        minimumSecondDuration: forwarded to extendSpeakers.

    Returns:
        (newList, newAnnotation) as produced by extendSpeakers.
    '''
    # Identify the lecturer as the label with the most supporting segments.
    # NOTE(review): len(label_support(...)) counts segments, not total speech
    # time -- confirm segment count is the intended criterion.
    lecturerLabel = None
    bestSegmentCount = 0
    for candidate in myAnnotation.labels():
        segmentCount = len(myAnnotation.label_support(candidate))
        if segmentCount > bestSegmentCount:
            bestSegmentCount = segmentCount
            lecturerLabel = candidate

    # Rebuild as [[lecturer (start, duration)...], [audience (start, duration)...]].
    lecturerSections = []
    audienceSections = []
    for candidate in myAnnotation.labels():
        target = lecturerSections if candidate == lecturerLabel else audienceSections
        for segmentItem in myAnnotation.label_support(candidate):
            target.append((segmentItem.start, segmentItem.duration))

    return extendSpeakers([lecturerSections, audienceSections],
                          fileLabel=myAnnotation.uri,
                          maximumSecondDifference=maximumSecondDifference,
                          minimumSecondDuration=minimumSecondDuration)
|
| 78 |
-
|
| 79 |
-
def loadAudioRTTM(sampleRTTM):
    '''
    Read diarization predictions from an RTTM file.

    Args:
        sampleRTTM: path to the RTTM file; also used as the Annotation uri.

    Returns:
        speakerList: list indexed by speaker id; each entry is a list of
            (start, duration) tuples in seconds -- convenient for plotting.
        prediction: the same segments as a pyannote Annotation -- convenient
            for error-rate calculation.
    '''
    # Data in list form, for convenient plotting
    speakerList = []
    # Data in Annotation form, for convenient error rate calculation
    prediction = Annotation(uri=sampleRTTM)
    with open(sampleRTTM, "r") as rttm:
        for line in rttm:
            # split() (vs. split(' ')) tolerates repeated whitespace/tabs and
            # lets blank or trailing-newline lines be skipped instead of crashing.
            speakerResult = line.split()
            if not speakerResult:
                continue
            # RTTM field 7 is the speaker name; assumes it ends in exactly two
            # digits (e.g. SPEAKER_07) -- TODO confirm for other naming schemes.
            index = int(speakerResult[7][-2:])
            start = float(speakerResult[3])      # field 3: turn onset (seconds)
            duration = float(speakerResult[4])   # field 4: turn duration (seconds)
            end = start + duration
            # Grow speakerList until the speaker index is addressable.
            while len(speakerList) < index + 1:
                speakerList.append([])
            speakerList[index].append((start, duration))
            prediction[Segment(start, end)] = index

    return speakerList, prediction
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
sonogram.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sonogram_utility as su
|
| 2 |
+
from pyannote.audio import Pipeline
|
| 3 |
+
import pickle
|
| 4 |
+
|
| 5 |
+
class Sonogram():
    '''
    Wraps the pyannote speaker-diarization pipeline plus a pickled SVM
    speaker-group classifier so app code can turn an audio file into a
    labeled annotation with a single call.
    '''

    def __init__(self, enableDenoise=False):
        '''
        Select a compute device, load the diarization pipeline and the
        group classifier.

        Args:
            enableDenoise: accepted for forward compatibility.
                NOTE(review): currently unused -- confirm whether denoising
                was meant to be wired in here.
        '''
        # Bug fix: torch was referenced below but never imported anywhere in
        # this file (NameError at runtime). Imported locally to keep the fix
        # self-contained.
        import torch

        #TODO: Should these be adjustable via initialization, or constants?
        self.secondDifference = 5
        self.gainWindow = 4
        self.minimumGain = -45
        self.maximumGain = -5
        self.attenLimDB = 3
        self.earlyCleanup = True
        self.enableDenoise = enableDenoise

        # TPU support is disabled (the original code force-raised a dummy
        # RuntimeError to skip this path; torch_xla is not installed).
        #device = xm.xla_device()
        self.isTPU = False
        self.isGPU = torch.cuda.is_available()
        self.device = torch.device("cuda" if self.isGPU else "cpu")
        if not self.isGPU:
            print("GPU is not available")
            # Bug fix: previously printed the undefined name `device`.
            print(f"Using {self.device} instead.")

        self.pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1")
        self.pipeline.to(self.device)

        # Load SVM classifier
        # Maps a speaker embedding to: 0 = silence, 2 = group, else individual.
        with open('groupClassifier.pkl', 'rb') as f:
            self.groupClassifier = pickle.load(f)

    def processFile(self, filePath):
        '''
        Load an audio file, equalize its volume, and run speaker diarization.

        Args:
            filePath: path to the audio file.

        Returns:
            (diarizationOutput, totalTimeInSeconds, waveformGainAdjusted,
            sampleRate).
        '''
        print("Loading file")
        # Load in 600-second chunks, then stitch back into one waveform.
        waveformList, sampleRate = su.splitIntoTimeSegments(filePath, 600)
        print("File loaded")
        waveformEnhanced = su.combineWaveforms(waveformList)
        if (self.earlyCleanup):
            # Free the chunk list as soon as possible to limit peak memory.
            del waveformList
        print("Equalizing Audio")
        # NOTE(review): su.equalizeVolume() is invoked with no arguments and
        # its result is then called -- confirm equalizeVolume is a factory and
        # not a function that should receive these arguments directly.
        waveformGainAdjusted = su.equalizeVolume()(waveformEnhanced, sampleRate, self.gainWindow, self.minimumGain, self.maximumGain)
        if (self.earlyCleanup):
            del waveformEnhanced
        print("Audio Equalized")
        print("Detecting speakers")
        # NOTE(review): the rest of this class reads .speaker_diarization /
        # .speaker_embeddings off the pipeline result, but pyannote.audio 3.1
        # pipelines return an (Annotation, ndarray) tuple when
        # return_embeddings=True -- confirm the installed version's API.
        # The original tuple-unpack here contradicted those attribute reads;
        # kept as a single object for internal consistency.
        diarizationOutput = self.pipeline({"waveform": waveformGainAdjusted, "sample_rate": sampleRate}, return_embeddings=True)
        print("Speakers Detected")
        totalTimeInSeconds = int(waveformGainAdjusted.shape[-1] / sampleRate)
        print("Time in seconds calculated")
        # Bug fix: previously returned the undefined name waveformGainAdjusted
        # while the local variable was spelled waveform_gain_adjusted.
        return diarizationOutput, totalTimeInSeconds, waveformGainAdjusted, sampleRate

    def __call__(self, audioPath):
        '''
        Diarize audioPath and relabel speakers via the group classifier.

        Args:
            audioPath: path to the audio file.

        Returns:
            (annotation, totalTimeInSeconds, waveformGainAdjusted, sampleRate)
            where annotation has silence/group speakers renamed.
        '''
        diarizationOutput, totalTimeInSeconds, waveformGainAdjusted, sampleRate = self.processFile(audioPath)
        annotation = diarizationOutput.speaker_diarization

        # Relabel any existing silence and group speakers
        labelMapping = {}
        # Bug fix: this loop previously iterated over the undefined name
        # `output` instead of diarizationOutput.
        for s, speaker in enumerate(diarizationOutput.speaker_diarization.labels()):
            prediction = self.groupClassifier.predict(diarizationOutput.speaker_embeddings[s].reshape(1, -1))
            if prediction == 0:
                labelMapping[speaker] = "silence"
            elif prediction == 2:
                labelMapping[speaker] = "group"
            else:
                # May not be necessary, consider using to reformat default names away from SPEAKER_XX
                labelMapping[speaker] = speaker
        # Bug fix: pyannote's Annotation.rename_labels returns a renamed copy
        # by default; the original discarded the result, so no relabeling ever
        # took effect.
        annotation = annotation.rename_labels(labelMapping)
        return annotation, totalTimeInSeconds, waveformGainAdjusted, sampleRate
|