Spaces:

Sonogram
/

Instructor-Support-Tool

Sleeping

App Files Files Community

Instructor-Support-Tool / sonogram_utility.py

czyoung

Update sonogram_utility.py

a1d4ef1 verified 9 months ago

raw

history blame contribute delete

19.1 kB

	import cv2
	import random
	import copy
	from pyannote.core import Annotation, Segment
	import numpy as np
	import torch
	import torchaudio
	import pandas as pd
	import datetime as dt

	def colors(n):
	    '''
	    Creates a list size n of distinctive colors

	    A random starting hue is picked, then the hue is advanced by 180 / n
	    (truncated to an int and wrapped modulo 180) before each color is
	    produced. Saturation and value are fixed at 200.

	    Returns a list of n 3-tuples holding the channels produced by
	    cv2.COLOR_HSV2BGR, in that (b, g, r) order, each divided by 255.
	    An n of 0 gives an empty list.
	    NOTE(review): consumers expecting (r, g, b) will see swapped channels - confirm.
	    '''
	    if n == 0:
	        return []
	    hue = int(random.random() * 180)
	    hueStep = 180 / n
	    palette = []
	    for _ in range(n):
	        # Advance first, then truncate and wrap into [0, 180).
	        hue = int(hue + hueStep) % 180
	        hsvPixel = np.uint8([[[hue, 200, 200]]])
	        bgrPixel = cv2.cvtColor(hsvPixel, cv2.COLOR_HSV2BGR)[0][0]
	        palette.append(tuple(channel.item() / 255 for channel in bgrPixel))
	    return palette

	def colorsCSS(n):
	    '''
	    Creates a list size n of distinctive colors based on CSS formatting

	    A random starting hue is picked, then the hue is advanced by 180 / n
	    (truncated to an int and wrapped modulo 180) before each color is
	    produced. Saturation and value are fixed at 200.

	    Returns a list of n strings of the form '#rrggbb' (two lowercase hex
	    digits per channel). An n of 0 gives an empty list.
	    '''
	    if n == 0:
	        return []
	    ret = []
	    h = int(random.random() * 180)
	    step = 180 / n
	    for _ in range(n):
	        h = int(h + step) % 180
	        hsv = np.uint8([[[h, 200, 200]]])
	        bgr = cv2.cvtColor(hsv, cv2.COLOR_HSV2BGR)
	        blue, green, red = (channel.item() for channel in bgr[0][0])
	        # cv2 hands the channels back as b, g, r; CSS hex notation is
	        # #rrggbb, so the digits are emitted red first.
	        ret.append(f'#{red:02x}{green:02x}{blue:02x}')
	    return ret

	def extendSpeakers(mySpeakerList, fileLabel = 'NONE', maximumSecondDifference = 1, minimumSecondDuration = 0):
	    '''
	    Merge nearby sections of each speaker and drop merged sections that are too short.

	    Assumes mySpeakerList is already split into Speaker/Audience

	    mySpeakerList: list of per-speaker lists of (start, duration) pairs.
	        Each inner list is sorted in place.
	    fileLabel: uri given to the returned Annotation.
	    maximumSecondDifference: a section that starts no later than this many
	        seconds after the end of the running merged section is folded into it.
	    minimumSecondDuration: merged sections shorter than this are discarded.

	    Returns (newSpeakerList, mySpeakerAnnotations). newSpeakerList holds one
	    list of merged (start, duration) sections per input list (never fewer
	    than two lists). mySpeakerAnnotations holds the same sections as
	    Segments labelled with the index of the list they came from.
	    '''
	    mySpeakerAnnotations = Annotation(uri=fileLabel)
	    # Keep the two Speaker/Audience slots callers expect, but grow when more
	    # lists are supplied instead of failing with an IndexError.
	    newSpeakerList = [[] for _ in range(max(2, len(mySpeakerList)))]

	    def keepSection(slot, merged):
	        # Record a finished merged section if it is long enough.
	        if merged[1] >= minimumSecondDuration:
	            newSpeakerList[slot].append(merged)
	            mySpeakerAnnotations[Segment(merged[0], merged[0] + merged[1])] = slot

	    for i, speaker in enumerate(mySpeakerList):
	        speaker.sort()
	        # None means "no section started yet" (safer than a -1 end-time sentinel).
	        tempSection = None
	        for section in speaker:
	            if tempSection is None:
	                tempSection = copy.deepcopy(section)
	                continue
	            lastEnd = tempSection[0] + tempSection[1]
	            if section[0] - lastEnd <= maximumSecondDifference:
	                # Close enough: stretch the running section, never shrinking it.
	                tempSection = (tempSection[0], max(section[0] + section[1] - tempSection[0], tempSection[1]))
	            else:
	                keepSection(i, tempSection)
	                tempSection = copy.deepcopy(section)
	        if tempSection is not None:
	            # Add the last section back in
	            keepSection(i, tempSection)
	    return newSpeakerList, mySpeakerAnnotations

	def twoClassExtendAnnotation(myAnnotation,maximumSecondDifference = 1, minimumSecondDuration = 0):
	    '''
	    Collapse an annotation into lecturer / audience and pass it to extendSpeakers.

	    The label whose label_support() has the most entries is treated as the
	    lecturer (the first such label wins ties); every other label is pooled
	    as audience.
	    NOTE(review): the lecturer is chosen by number of support entries, not
	    by total speaking time - confirm this is intended.

	    Returns whatever extendSpeakers returns for [[lecturer sections],
	    [audience sections]], using the annotation's uri as fileLabel.
	    '''
	    allLabels = myAnnotation.labels()

	    # Identify lecturer
	    lecturerID = None
	    mostEntries = 0
	    for candidate in allLabels:
	        entryCount = len(myAnnotation.label_support(candidate))
	        if entryCount > mostEntries:
	            lecturerID, mostEntries = candidate, entryCount

	    # Slot 0 collects the lecturer's sections, slot 1 everybody else's.
	    lecturerSections, audienceSections = [], []
	    for candidate in allLabels:
	        target = lecturerSections if candidate == lecturerID else audienceSections
	        for piece in myAnnotation.label_support(candidate):
	            target.append((piece.start, piece.duration))

	    return extendSpeakers(
	        [lecturerSections, audienceSections],
	        fileLabel = myAnnotation.uri,
	        maximumSecondDifference = maximumSecondDifference,
	        minimumSecondDuration = minimumSecondDuration,
	    )

	def loadAudioRTTM(sampleRTTM):
	    '''
	    Read speaker turns from an RTTM file.

	    Every non-blank line is split on whitespace: field 3 is the start time,
	    field 4 the duration and field 7 the speaker label. The last two
	    characters of the label are parsed as the speaker's integer index, so a
	    label that does not end in two digits raises ValueError.

	    Returns (speakerList, prediction). speakerList[index] is the list of
	    (start, duration) pairs of that speaker, for convenient plotting.
	    prediction is an Annotation (uri = the file path) mapping each
	    Segment(start, start + duration) to its label.
	    '''
	    # Data in list form, for convenient plotting
	    speakerList = []
	    # Data in Annotation form, for convenient error rate calculation
	    prediction = Annotation(uri=sampleRTTM)
	    with open(sampleRTTM, "r") as rttm:
	        for line in rttm:
	            # split() tolerates repeated spaces/tabs and strips the newline.
	            speakerResult = line.split()
	            if not speakerResult:
	                # Blank lines (e.g. at the end of the file) carry no data.
	                continue
	            label = speakerResult[7]
	            index = int(label[-2:])
	            start = float(speakerResult[3])
	            duration = float(speakerResult[4])
	            # Grow the list so that speakerList[index] exists.
	            while len(speakerList) <= index:
	                speakerList.append([])
	            speakerList[index].append((start, duration))
	            prediction[Segment(start, start + duration)] = label

	    return speakerList, prediction

	def loadAudioTXT(sampleTXT):
	    '''
	    Read labelled time ranges from a tab-separated text file.

	    Each line is expected to hold start<TAB>end<TAB>label, with start and
	    end parsed as floats. Lines with fewer than three tab-separated fields
	    are skipped.

	    Returns ([], prediction): the list is always empty, and prediction is
	    an Annotation (uri = the file path) mapping each Segment(start, end)
	    to its label.
	    '''
	    # Data in Annotation form, for convenient error rate calculation
	    prediction = Annotation(uri=sampleTXT)
	    with open(sampleTXT, "r") as txt:
	        for line in txt:
	            # Drop the line ending first so it does not end up inside the label.
	            speakerResult = line.rstrip('\r\n').split('\t')
	            if len(speakerResult) < 3:
	                continue
	            start = float(speakerResult[0])
	            end = float(speakerResult[1])
	            prediction[Segment(start, end)] = speakerResult[2]

	    return [], prediction

	def loadAudioCSV(sampleCSV):
	    '''
	    Read labelled time ranges from a CSV file.

	    The file is loaded with pandas and must provide Start, Finish and
	    Resource columns.

	    Returns ([], prediction): the list is always empty, and prediction is
	    an Annotation (uri = the file path) mapping Segment(Start, Finish) of
	    every row to that row's Resource value.
	    '''
	    # Read in prediction data
	    table = pd.read_csv(sampleCSV).reset_index() # make sure indexes pair with number of rows

	    # Data in Annotation form, for convenient error rate calculation
	    prediction = Annotation(uri=sampleCSV)
	    for _, record in table.iterrows():
	        prediction[Segment(record['Start'], record['Finish'])] = record['Resource']

	    return [], prediction

	def splitIntoTimeSegments(testFile,maxDurationInSeconds=60):
	    '''
	    Load an audio file and cut it into consecutive chunks.

	    Each chunk spans at most maxDurationInSeconds * sample_rate samples
	    and is a slice of the loaded waveform along its second axis.

	    Returns (audioSegments, sample_rate): the chunks in file order and the
	    sample rate reported by torchaudio.load. At least one chunk is always
	    returned.
	    NOTE(review): a maxDurationInSeconds of 0 never advances on a non-empty
	    waveform - callers should pass a positive value.
	    '''
	    waveform, sample_rate = torchaudio.load(testFile)
	    totalSamples = waveform.shape[-1]
	    samplesPerChunk = maxDurationInSeconds * sample_rate

	    audioSegments = []
	    chunkStart = 0
	    while True:
	        chunkEnd = min(chunkStart + samplesPerChunk, totalSamples)
	        audioSegments.append(waveform[:,chunkStart:chunkEnd])
	        if chunkEnd >= totalSamples:
	            break
	        chunkStart = chunkEnd
	    return audioSegments, sample_rate

	def audioNormalize(waveform,sampleRate,stepSizeInSeconds = 2,dbThreshold = -50,dbTarget = -5):
	    '''
	    Boost a waveform window by window so that loud-enough windows peak at dbTarget.

	    The waveform is walked in consecutive windows of stepSizeInSeconds.
	    Each window is converted to decibel values with torchaudio's
	    AmplitudeToDB (stype="amplitude", top_db=80). For every channel whose
	    largest decibel value in the window exceeds dbThreshold, the gain
	    min(dbTarget - values), i.e. dbTarget minus that largest value, is
	    applied to the window with torchaudio's Vol transform. Windows at or
	    below the threshold are left unchanged.

	    waveform: tensor indexed as [channel, sample]; it is not modified.
	    sampleRate: samples per second, used to size the windows.
	    stepSizeInSeconds: window length in seconds.
	    dbThreshold: windows peaking at or below this value are skipped.
	    dbTarget: decibel value the peak of a boosted window is moved to.

	    Returns a new tensor of the same shape holding the adjusted samples.
	    Raises ValueError when the window would be shorter than one sample.
	    '''
	    print("In audioNormalize")
	    copyWaveform = waveform.clone().detach()
	    print("Waveform copy made")
	    transform = torchaudio.transforms.AmplitudeToDB(stype="amplitude", top_db=80)

	    stepSamples = int(stepSizeInSeconds * sampleRate)
	    if stepSamples <= 0:
	        # A zero-length window would never advance through the waveform.
	        raise ValueError("stepSizeInSeconds * sampleRate must be at least one sample")
	    totalSamples = copyWaveform.shape[-1]

	    # range() covers every sample exactly once, including the final one,
	    # and never yields an empty window.
	    for currStart in range(0, totalSamples, stepSamples):
	        currEnd = min(currStart + stepSamples, totalSamples)
	        # All channels of the window are converted together, from the
	        # untouched input rather than from the partially adjusted copy.
	        copyWaveform_db = transform(waveform[:,currStart:currEnd].clone().detach())
	        if currStart == 0:
	            print("First DB level calculated")

	        for channel in range(copyWaveform_db.shape[0]):
	            channelDb = copyWaveform_db[channel]
	            if torch.max(channelDb).item() > dbThreshold:
	                gain = torch.min(dbTarget - channelDb)
	                adjustGain = torchaudio.transforms.Vol(gain,'db')
	                copyWaveform[channel, currStart:currEnd] = adjustGain(copyWaveform[channel, currStart:currEnd])
	    print("Waveform enhanced")
	    return copyWaveform

	class equalizeVolume(torch.nn.Module):
	    '''
	    torch.nn.Module wrapper that runs audioNormalize on a waveform.
	    '''
	    def forward(self, waveform,sampleRate,stepSizeInSeconds,dbThreshold,dbTarget):
	        '''
	        Print a progress message and return the result of audioNormalize
	        called with the given arguments.
	        '''
	        print("In equalizeVolume")
	        return audioNormalize(
	            waveform,
	            sampleRate,
	            stepSizeInSeconds,
	            dbThreshold,
	            dbTarget,
	        )

	def combineWaveforms(waveformList):
	    '''
	    Concatenate a list of tensors along dimension 1 and return the result.
	    '''
	    joined = torch.cat(waveformList, dim=1)
	    return joined

	def annotationToSpeakerList(myAnnotation):
	    '''
	    Convert an annotation into per-speaker lists of (start, duration) pairs.

	    Returns one list per label, in the order the labels are first seen in
	    myAnnotation.labels(); each list holds the (start, duration) of every
	    segment in that label's label_support().
	    '''
	    sectionsBySpeaker = {}
	    for speakerName in myAnnotation.labels():
	        # dicts keep insertion order, so positions follow first appearance.
	        sections = sectionsBySpeaker.setdefault(speakerName, [])
	        for piece in myAnnotation.label_support(speakerName):
	            sections.append((piece.start, piece.duration))
	    return list(sectionsBySpeaker.values())

	def speakerListToDataFrame(speakerList):
	    '''
	    Convert per-speaker (start, duration) pairs into a DataFrame of time ranges.

	    speakerList: list indexed by speaker, each entry a list of
	        (start, duration) pairs in seconds.

	    Returns a DataFrame with one row per pair and the columns
	    Task ("Speaker <speaker index>.<pair index>"), Start, Finish and
	    Resource ("Speaker <speaker index + 1>"). Start and Finish are
	    datetimes: midnight of the current date plus the start / end offset,
	    truncated to whole microseconds. Offsets of 24 hours or more roll over
	    into the following days. Empty input gives an empty frame that still
	    has the four columns.
	    '''
	    # Resolve "today" once so every row shares the same base date.
	    midnight = dt.datetime.combine(dt.date.today(), dt.time())

	    def toDatetime(seconds):
	        # timedelta arithmetic has no 24 hour limit, unlike dt.time(h, m, s, us).
	        return midnight + dt.timedelta(microseconds=int(seconds * 1000000))

	    dataList = []
	    for j, row in enumerate(speakerList):
	        for k, speakingPoint in enumerate(row):
	            startPoint = speakingPoint[0]
	            endPoint = startPoint + speakingPoint[1]
	            dataList.append(dict(Task=f"Speaker {j}.{k}", Start=toDatetime(startPoint), Finish=toDatetime(endPoint), Resource=f"Speaker {j+1}"))
	    return pd.DataFrame(dataList, columns=['Task', 'Start', 'Finish', 'Resource'])

	def removeOverlap(timeSegment,overlap):
	    '''
	    Return the parts of timeSegment that lie outside overlap.

	    The result is a list of zero, one or two Segments: the part before
	    overlap starts (when timeSegment starts earlier) followed by the part
	    after overlap ends (when timeSegment ends later).
	    '''
	    startsEarlier = timeSegment.start < overlap.start
	    endsLater = timeSegment.end > overlap.end

	    leftovers = []
	    if startsEarlier:
	        leadingEnd = min(overlap.start, timeSegment.end)
	        leftovers.append(Segment(timeSegment.start, leadingEnd))
	    if endsLater:
	        trailingStart = max(timeSegment.start, overlap.end)
	        leftovers.append(Segment(trailingStart, timeSegment.end))
	    return leftovers

	def checkForOverlap(time1, time2):
	    '''
	    Return time1 & time2 when that intersection is truthy, otherwise None.
	    '''
	    shared = time1 & time2
	    return shared if shared else None

	def sumSegments(segmentList):
	    '''
	    Add up the duration attribute of every item in segmentList.

	    Returns 0 for an empty list.
	    '''
	    runningTotal = 0
	    for piece in segmentList:
	        runningTotal = runningTotal + piece.duration
	    return runningTotal

	def sumTimes(myAnnotation):
	    '''
	    Return the duration() of the annotation's timeline.

	    The timeline is requested with get_timeline(False).
	    '''
	    timeline = myAnnotation.get_timeline(False)
	    return timeline.duration()

	def sumTimesPerSpeaker(myAnnotation):
	    '''
	    Total the time of every label in an annotation.

	    For each label, sumTimes is applied to the annotation restricted to
	    that label with subset([label]).

	    Returns (speakerList, timeList): the labels in first-seen order and,
	    at matching positions, their totals.
	    '''
	    totals = {}
	    for speaker in myAnnotation.labels():
	        # dicts keep insertion order, so both returned lists line up.
	        totals[speaker] = totals.get(speaker, 0) + sumTimes(myAnnotation.subset([speaker]))
	    return list(totals.keys()), list(totals.values())

	def sumMultiTimesPerSpeaker(myAnnotation):
	    '''
	    Total the time of every individual speaker in an annotation whose
	    labels are '+'-joined groups of speakers.

	    Each label's total from sumTimesPerSpeaker is credited in full to
	    every name obtained by splitting the label on '+'.

	    Returns (speakerList, timeList): the individual names in first-seen
	    order and, at matching positions, their totals.
	    '''
	    groupNames, groupTimes = sumTimesPerSpeaker(myAnnotation)
	    totals = {}
	    for groupName, groupTime in zip(groupNames, groupTimes):
	        for member in groupName.split('+'):
	            totals[member] = totals.get(member, 0) + groupTime
	    return list(totals.keys()), list(totals.values())

	def annotationToDataFrame(myAnnotation):
	    '''
	    Convert an annotation into a DataFrame of datetime ranges plus per-speaker totals.

	    Returns (df, timeSummary). df has one row per segment, grouped by
	    label in the order of myAnnotation.labels(), with the columns
	    Task ("<label>.<segment index>"), Start, Finish and Resource (the
	    label). Start and Finish are datetimes: midnight of the current date
	    plus the segment's start / end in seconds, truncated to whole
	    microseconds. Offsets of 24 hours or more roll over into the following
	    days. timeSummary maps each label to the sum of its segment durations.
	    An annotation without labels gives an empty frame that still has the
	    four columns.
	    '''
	    # Resolve "today" once so every row shares the same base date.
	    midnight = dt.datetime.combine(dt.date.today(), dt.time())

	    def toDatetime(seconds):
	        # timedelta arithmetic has no 24 hour limit, unlike dt.time(h, m, s, us).
	        return midnight + dt.timedelta(microseconds=int(seconds * 1000000))

	    dataList = []
	    timeSummary = {}
	    for speakerName in myAnnotation.labels():
	        total = timeSummary.get(speakerName, 0)
	        for k, speakingSegment in enumerate(myAnnotation.subset([speakerName]).itersegments()):
	            total += speakingSegment.duration
	            dataList.append(dict(
	                Task=speakerName + f".{k}",
	                Start=toDatetime(speakingSegment.start),
	                Finish=toDatetime(speakingSegment.end),
	                Resource=speakerName,
	            ))
	        timeSummary[speakerName] = total
	    df = pd.DataFrame(dataList, columns=['Task', 'Start', 'Finish', 'Resource'])
	    return df, timeSummary

	def annotationToSimpleDataFrame(myAnnotation):
	    '''
	    Convert an annotation into a DataFrame of raw time ranges plus per-speaker totals.

	    Returns (df, timeSummary). df has one row per segment, grouped by
	    label in the order of myAnnotation.labels(), with the columns
	    Task ("<label>.<segment index>"), Start and Finish (the segment's
	    start and end, unchanged) and Resource (the label). timeSummary maps
	    each label to the sum of its segment durations.
	    '''
	    # Gather every label's segments first.
	    segmentsBySpeaker = {}
	    for speakerName in myAnnotation.labels():
	        bucket = segmentsBySpeaker.setdefault(speakerName, [])
	        bucket.extend(myAnnotation.subset([speakerName]).itersegments())

	    timeSummary = {}
	    dataList = []
	    for speakerName, segments in segmentsBySpeaker.items():
	        spoken = 0
	        for k, piece in enumerate(segments):
	            spoken += piece.duration
	            dataList.append(dict(Task=speakerName + f".{k}", Start=piece.start, Finish=piece.end, Resource=speakerName))
	        timeSummary[speakerName] = spoken

	    return pd.DataFrame(dataList), timeSummary

	def calcCategories(myAnnotation,categories):
	    '''
	    Group an annotation's segments by category and merge overlapping segments inside each category.

	    myAnnotation: annotation whose labels are speaker ids (strings).
	    categories: sequence of containers of speaker ids. A speaker goes to
	        the last category that contains it; a speaker found in no category
	        gets a slot of its own, appended after the given categories.

	    Returns (cleanCategories, extraCategories). cleanCategories holds one
	    list per slot of (id, Segment) tuples ordered by start time, where
	    overlapping segments are replaced by their union and labelled with the
	    '+'-joined ids of the segments merged into it. extraCategories lists
	    the speakers that received their own slot, in slot order.
	    '''
	    categorySlots = [[] for _ in categories]
	    extraCategories = []
	    for speaker in myAnnotation.labels():
	        targetCategory = None
	        for i, category in enumerate(categories):
	            if speaker in category:
	                targetCategory = i
	        if targetCategory is None:
	            targetCategory = len(categorySlots)
	            categorySlots.append([])
	            extraCategories.append(speaker)

	        for timeSegment in myAnnotation.subset([speaker]).itersegments():
	            categorySlots[targetCategory].append((speaker,timeSegment))

	    # Clean up categories
	    cleanCategories = []
	    for category in categorySlots:
	        newCategory = []
	        catSorted = copy.deepcopy(sorted(category,key=lambda cSegment: cSegment[1].start))
	        # currID / currSegment track the merged run currently being built.
	        currID, currSegment = None, None
	        for sp, segmentSlot in catSorted:
	            if currSegment is None:
	                currID, currSegment = sp, segmentSlot
	            elif checkForOverlap(currSegment,segmentSlot) is None:
	                # No overlap: the run is finished, start a new one here.
	                newCategory.append((currID,currSegment))
	                currID, currSegment = sp, segmentSlot
	            else:
	                currID = currID + "+" + sp
	                # Union of segments
	                currSegment = currSegment | segmentSlot
	        if currSegment is not None:
	            newCategory.append((currID,currSegment))
	        cleanCategories.append(newCategory)
	    return cleanCategories,extraCategories

	def calcSpeakingTypes(myAnnotation,maxTime):
	    '''
	    Split the span from 0 to maxTime into silence, single-speaker and multi-speaker time.

	    myAnnotation: annotation whose labels are speaker ids (joined with '+'
	        below, so they are expected to be strings).
	    maxTime: end of the span that starts out as "no voice".

	    Returns (nvAnnotation, ovAnnotation, mvAnnotation):
	    nvAnnotation labels the remaining silent segments "None",
	    ovAnnotation labels single-speaker segments with the speaker id, and
	    mvAnnotation labels multi-speaker segments with the '+'-joined ids.
	    '''
	    # Working lists: noVoice holds Segments, oneVoice holds (id, Segment)
	    # tuples, multiVoice holds ([ids], Segment) tuples.
	    noVoice = [Segment(0,maxTime)]
	    oneVoice = []
	    multiVoice = []
	    for speaker in myAnnotation.labels():
	        # Queue of this speaker's pieces that still need to be classified.
	        timesToProcess = []
	        for timeSegment in myAnnotation.subset([speaker]).itersegments():
	            timesToProcess.append((speaker,timeSegment))
	        while len(timesToProcess) > 0:
	            currID, currSegment = timesToProcess[0]
	            timesToProcess.remove(timesToProcess[0])
	            # Set when the piece was split; its remainders were re-queued.
	            resetCheck = False
	            # Check in multi
	            for compareID,timeSegment in multiVoice:
	                overlapTime = checkForOverlap(currSegment,timeSegment)
	                if overlapTime is None:
	                    continue
	                else:
	                    # NOTE(review): the id is added to the whole existing
	                    # multi-voice segment, which is not split even when
	                    # currSegment covers only part of it - confirm intended.
	                    compareID.append(currID)
	                    # Re-queue the parts of the piece outside that segment.
	                    newTimes = removeOverlap(currSegment,timeSegment)
	                    for i in range(len(newTimes)):
	                        newTimes[i] = (currID,newTimes[i])
	                    timesToProcess += newTimes
	                    resetCheck = True
	                    break
	            if resetCheck:
	                continue
	            # Check in one voice
	            for timeSlot in oneVoice:
	                tID = timeSlot[0]
	                tSegment = timeSlot[1]
	                overlapTime = checkForOverlap(currSegment,tSegment)
	                if overlapTime is None:
	                    continue
	                else:
	                    # Safe although oneVoice is being iterated: the loop is
	                    # left with break right after these changes.
	                    oneVoice.remove(timeSlot)
	                    # Add back non overlap
	                    newTimes = removeOverlap(tSegment,currSegment)
	                    for i in range(len(newTimes)):
	                        newTimes[i] = (tID,newTimes[i])
	                    oneVoice += newTimes
	                    # Add overlap time to multivoice
	                    multiVoice.append(([tID,currID],overlapTime))
	                    # Add new times back to process
	                    newTimes = removeOverlap(currSegment,tSegment)
	                    for i in range(len(newTimes)):
	                        newTimes[i] = (currID,newTimes[i])
	                    timesToProcess += newTimes
	                    resetCheck = True
	                    break
	            if resetCheck:
	                continue
	            # Add to one voice
	            oneVoice.append((currID,currSegment))
	    ovAnnotation = Annotation()
	    mvAnnotation = Annotation()
	    for currID,timeSlot in multiVoice:
	        currIDString = '+'.join(currID)
	        mvAnnotation[timeSlot] = currIDString
	        # Carve this segment out of the silent time; a copy is edited so
	        # that noVoice can be iterated safely.
	        copyOfNo = copy.deepcopy(noVoice)
	        for emptySlot in noVoice:
	            if checkForOverlap(timeSlot,emptySlot) is None:
	                continue
	            else:
	                copyOfNo.remove(emptySlot)
	                copyOfNo += removeOverlap(emptySlot,timeSlot)
	        noVoice = copyOfNo
	    for currID,timeSlot in oneVoice:
	        ovAnnotation[timeSlot] = currID
	        # Same carving for the single-speaker segments.
	        copyOfNo = copy.deepcopy(noVoice)
	        for emptySlot in noVoice:
	            if checkForOverlap(timeSlot,emptySlot) is None:
	                continue
	            else:
	                copyOfNo.remove(emptySlot)
	                copyOfNo += removeOverlap(emptySlot,timeSlot)
	        noVoice = copyOfNo
	    nvAnnotation = Annotation()
	    for emptySlot in noVoice:
	        nvAnnotation[emptySlot] = "None"

	    return nvAnnotation, ovAnnotation, mvAnnotation

	def timeToString(timeInSeconds):
	    '''
	    Format a time in seconds as 'hh::mm::ss.ss'.

	    timeInSeconds: a number of seconds, or a list / tuple of such values
	        (possibly nested), which is converted element by element.

	    Returns the formatted string, or a list of formatted strings for a
	    list / tuple input. Hours, minutes and seconds are zero-padded to two
	    digits, and seconds carry two decimals.
	    '''
	    if isinstance(timeInSeconds,(list,tuple)):
	        return [timeToString(t) for t in timeInSeconds]
	    # Work in whole hundredths of a second so that a value such as 59.999
	    # rolls over to the next minute instead of printing "60.00".
	    hundredths = int(round(timeInSeconds * 100))
	    h, rest = divmod(hundredths, 360000)
	    m, rest = divmod(rest, 6000)
	    s, fraction = divmod(rest, 100)
	    return f'{h:02d}::{m:02d}::{s:02d}.{fraction:02d}'