# talknet-asd/demoTalkNet.py
import sys, time, os, tqdm, torch, argparse, glob, subprocess, warnings, cv2, pickle, numpy, pdb, math, python_speech_features
from scipy import signal
from shutil import rmtree
from scipy.io import wavfile
from scipy.interpolate import interp1d
from sklearn.metrics import accuracy_score, f1_score
from scenedetect.video_manager import VideoManager
from scenedetect.scene_manager import SceneManager
from scenedetect.frame_timecode import FrameTimecode
from scenedetect.stats_manager import StatsManager
from scenedetect.detectors import ContentDetector
from model.faceDetector.s3fd import S3FD
from talkNet import talkNet
warnings.filterwarnings("ignore")
parser = argparse.ArgumentParser(description="TalkNet Demo or Columbia ASD Evaluation")
parser.add_argument("--videoName", type=str, default="001", help="Demo video name")
parser.add_argument(
"--videoFolder", type=str, default="demo", help="Path for inputs, tmps and outputs"
)
parser.add_argument(
"--pretrainModel",
type=str,
default="pretrain_TalkSet.model",
help="Path for the pretrained TalkNet model",
)
parser.add_argument(
"--nDataLoaderThread", type=int, default=10, help="Number of workers"
)
parser.add_argument(
"--facedetScale",
type=float,
default=0.25,
help="Scale factor for face detection, the frames will be scale to 0.25 orig",
)
parser.add_argument(
"--minTrack", type=int, default=10, help="Number of min frames for each shot"
)
parser.add_argument(
"--numFailedDet",
type=int,
default=10,
help="Number of missed detections allowed before tracking is stopped",
)
parser.add_argument(
"--minFaceSize", type=int, default=1, help="Minimum face size in pixels"
)
parser.add_argument("--cropScale", type=float, default=0.40, help="Scale bounding box")
parser.add_argument("--start", type=int, default=0, help="The start time of the video")
parser.add_argument(
"--duration",
type=int,
default=0,
help="The duration of the video, when set as 0, will extract the whole video",
)
parser.add_argument(
"--evalCol",
dest="evalCol",
action="store_true",
help="Evaluate on Columnbia dataset",
)
parser.add_argument(
"--colSavePath",
type=str,
default="/data08/col",
help="Path for inputs, tmps and outputs",
)
args = parser.parse_args()
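# Example invocations (illustrative; adjust names and paths to your setup):
#   python demoTalkNet.py --videoName 001 --videoFolder demo
#   python demoTalkNet.py --evalCol --colSavePath /data08/col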
if os.path.isfile(args.pretrainModel) == False: # Download the pretrained model
Link = "1AbN9fCf9IexMxEKXLQY2KYBlb-IhSEea"
cmd = "gdown --id %s -O %s" % (Link, args.pretrainModel)
subprocess.call(cmd, shell=True, stdout=None)
if args.evalCol == True:
# The process is: 1. Download the video and labels (the label format has been modified to make it easier to use)
# 2. Extract the audio and the video frames
# 3. Scene detection, face detection and face tracking
# 4. Active speaker detection for the detected face clips
# 5. Use IoU to match each face clip to an identity and compute the F1 results
# Steps 1 to 3 take some time (a one-time process), depending on your CPU and GPU speed; for reference, it took about 1.5 hours
# Steps 4 and 5 take less than 10 minutes
# About 20 GB of disk space is needed in total
args.videoName = "col"
args.videoFolder = args.colSavePath
args.savePath = os.path.join(args.videoFolder, args.videoName)
args.videoPath = os.path.join(args.videoFolder, args.videoName + ".mp4")
args.duration = 0
if os.path.isfile(args.videoPath) == False: # Download video
link = "https://www.youtube.com/watch?v=6GzxbrO0DHM&t=2s"
cmd = "youtube-dl -f best -o %s '%s'" % (args.videoPath, link)
output = subprocess.call(cmd, shell=True, stdout=None)
if os.path.isdir(args.videoFolder + "/col_labels") == False: # Download label
link = "1Tto5JBt6NsEOLFRWzyZEeV6kCCddc6wv"
cmd = "gdown --id %s -O %s" % (link, args.videoFolder + "/col_labels.tar.gz")
subprocess.call(cmd, shell=True, stdout=None)
cmd = "tar -xzvf %s -C %s" % (
args.videoFolder + "/col_labels.tar.gz",
args.videoFolder,
)
subprocess.call(cmd, shell=True, stdout=None)
os.remove(args.videoFolder + "/col_labels.tar.gz")
else:
args.videoPath = glob.glob(os.path.join(args.videoFolder, args.videoName + ".*"))[0]
args.savePath = os.path.join(args.videoFolder, args.videoName)
def scene_detect(args):
# CPU: Scene detection, output is the list of each shot's time duration
videoManager = VideoManager([args.videoFilePath])
statsManager = StatsManager()
sceneManager = SceneManager(statsManager)
sceneManager.add_detector(ContentDetector())
baseTimecode = videoManager.get_base_timecode()
videoManager.set_downscale_factor()
videoManager.start()
sceneManager.detect_scenes(frame_source=videoManager)
sceneList = sceneManager.get_scene_list(baseTimecode)
savePath = os.path.join(args.pyworkPath, "scene.pckl")
if sceneList == []:
sceneList = [
(videoManager.get_base_timecode(), videoManager.get_current_timecode())
]
with open(savePath, "wb") as fil:
pickle.dump(sceneList, fil)
sys.stderr.write(
"%s - scenes detected %d\n" % (args.videoFilePath, len(sceneList))
)
return sceneList
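# scene_detect returns a list of (start, end) FrameTimecode pairs, one per shot;
# main() uses shot[1].frame_num - shot[0].frame_num as the shot length in frames.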
def inference_video(args):
# GPU: Face detection; output is a per-frame list of face locations and detection scores
DET = S3FD(device="cuda")
flist = glob.glob(os.path.join(args.pyframesPath, "*.jpg"))
flist.sort()
dets = []
for fidx, fname in enumerate(flist):
image = cv2.imread(fname)
imageNumpy = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
bboxes = DET.detect_faces(imageNumpy, conf_th=0.9, scales=[args.facedetScale])
dets.append([])
for bbox in bboxes:
dets[-1].append(
{"frame": fidx, "bbox": (bbox[:-1]).tolist(), "conf": bbox[-1]}
) # dets has the frames info, bbox info, conf info
sys.stderr.write(
"%s-%05d; %d dets\r" % (args.videoFilePath, fidx, len(dets[-1]))
)
savePath = os.path.join(args.pyworkPath, "faces.pckl")
with open(savePath, "wb") as fil:
pickle.dump(dets, fil)
return dets
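# inference_video returns one list per frame; each detection is a dict such as
# {"frame": 12, "bbox": [x1, y1, x2, y2], "conf": 0.99} (values shown are illustrative).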
def bb_intersection_over_union(boxA, boxB, evalCol=False):
# CPU: IoU function to calculate the overlap between two boxes
xA = max(boxA[0], boxB[0])
yA = max(boxA[1], boxB[1])
xB = min(boxA[2], boxB[2])
yB = min(boxA[3], boxB[3])
interArea = max(0, xB - xA) * max(0, yB - yA)
boxAArea = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1])
boxBArea = (boxB[2] - boxB[0]) * (boxB[3] - boxB[1])
if evalCol == True:
iou = interArea / float(boxAArea)
else:
iou = interArea / float(boxAArea + boxBArea - interArea)
return iou
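# Worked example: bb_intersection_over_union([0, 0, 10, 10], [5, 5, 15, 15])
# = 25 / (100 + 100 - 25) ≈ 0.14; with evalCol=True it is 25 / 100 = 0.25 (intersection over boxA only).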
def track_shot(args, sceneFaces):
# CPU: Face tracking
iouThres = 0.5 # Minimum IOU between consecutive face detections
tracks = []
while True:
track = []
for frameFaces in sceneFaces:
for face in frameFaces:
if track == []:
track.append(face)
frameFaces.remove(face)
elif face["frame"] - track[-1]["frame"] <= args.numFailedDet:
iou = bb_intersection_over_union(face["bbox"], track[-1]["bbox"])
if iou > iouThres:
track.append(face)
frameFaces.remove(face)
continue
else:
break
if track == []:
break
elif len(track) > args.minTrack:
frameNum = numpy.array([f["frame"] for f in track])
bboxes = numpy.array([numpy.array(f["bbox"]) for f in track])
frameI = numpy.arange(frameNum[0], frameNum[-1] + 1)
bboxesI = []
for ij in range(0, 4):
interpfn = interp1d(frameNum, bboxes[:, ij])
bboxesI.append(interpfn(frameI))
bboxesI = numpy.stack(bboxesI, axis=1)
if (
max(
numpy.mean(bboxesI[:, 2] - bboxesI[:, 0]),
numpy.mean(bboxesI[:, 3] - bboxesI[:, 1]),
)
> args.minFaceSize
):
tracks.append({"frame": frameI, "bbox": bboxesI})
return tracks
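# track_shot returns a list of tracks; each track is a dict with "frame" (1-D array of
# frame indices) and "bbox" (N x 4 array of interpolated [x1, y1, x2, y2] boxes).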
def crop_video(args, track, cropFile):
# CPU: crop the face clips
flist = glob.glob(os.path.join(args.pyframesPath, "*.jpg")) # Read the frames
flist.sort()
vOut = cv2.VideoWriter(
cropFile + "t.avi", cv2.VideoWriter_fourcc(*"XVID"), 25, (224, 224)
) # Write video
dets = {"x": [], "y": [], "s": []}
for det in track["bbox"]: # Read the tracks
dets["s"].append(max((det[3] - det[1]), (det[2] - det[0])) / 2)
dets["y"].append((det[1] + det[3]) / 2) # crop center x
dets["x"].append((det[0] + det[2]) / 2) # crop center y
dets["s"] = signal.medfilt(dets["s"], kernel_size=13) # Smooth detections
dets["x"] = signal.medfilt(dets["x"], kernel_size=13)
dets["y"] = signal.medfilt(dets["y"], kernel_size=13)
for fidx, frame in enumerate(track["frame"]):
cs = args.cropScale
bs = dets["s"][fidx] # Detection box size
bsi = int(bs * (1 + 2 * cs)) # Pad videos by this amount
image = cv2.imread(flist[frame])
frame = numpy.pad(
image,
((bsi, bsi), (bsi, bsi), (0, 0)),
"constant",
constant_values=(110, 110),
)
my = dets["y"][fidx] + bsi # BBox center Y
mx = dets["x"][fidx] + bsi # BBox center X
face = frame[
int(my - bs) : int(my + bs * (1 + 2 * cs)),
int(mx - bs * (1 + cs)) : int(mx + bs * (1 + cs)),
]
vOut.write(cv2.resize(face, (224, 224)))
audioTmp = cropFile + ".wav"
audioStart = (track["frame"][0]) / 25
audioEnd = (track["frame"][-1] + 1) / 25
vOut.release()
command = (
"ffmpeg -y -i %s -async 1 -ac 1 -vn -acodec pcm_s16le -ar 16000 -threads %d -ss %.3f -to %.3f %s -loglevel panic"
% (args.audioFilePath, args.nDataLoaderThread, audioStart, audioEnd, audioTmp)
)
output = subprocess.call(command, shell=True, stdout=None) # Crop audio file
_, audio = wavfile.read(audioTmp)
command = (
"ffmpeg -y -i %st.avi -i %s -threads %d -c:v copy -c:a copy %s.avi -loglevel panic"
% (cropFile, audioTmp, args.nDataLoaderThread, cropFile)
) # Combine audio and video file
output = subprocess.call(command, shell=True, stdout=None)
os.remove(cropFile + "t.avi")
return {"track": track, "proc_track": dets}
def extract_MFCC(file, outPath):
# CPU: extract mfcc
sr, audio = wavfile.read(file)
mfcc = python_speech_features.mfcc(audio, sr) # (N_frames, 13) [1s = 100 frames]
featuresPath = os.path.join(outPath, file.split("/")[-1].replace(".wav", ".npy"))
numpy.save(featuresPath, mfcc)
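# Note: extract_MFCC is not called from main() in this script; it is presumably kept as a
# helper for precomputing MFCC features (one .npy per .wav) when needed elsewhere.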
def evaluate_network(files, args):
# GPU: active speaker detection by pretrained TalkNet
s = talkNet()
s.loadParameters(args.pretrainModel)
sys.stderr.write("Model %s loaded from previous state! \r\n" % args.pretrainModel)
s.eval()
allScores = []
# durationSet = {1,2,4,6} # To make the result more reliable
durationSet = {1, 2, 3, 4, 5, 6} # Averaging over several window durations gives a more reliable result
for file in tqdm.tqdm(files, total=len(files)):
fileName = os.path.splitext(file.split("/")[-1])[0] # Load audio and video
_, audio = wavfile.read(os.path.join(args.pycropPath, fileName + ".wav"))
audioFeature = python_speech_features.mfcc(
audio, 16000, numcep=13, winlen=0.025, winstep=0.010
)
video = cv2.VideoCapture(os.path.join(args.pycropPath, fileName + ".avi"))
videoFeature = []
while video.isOpened():
ret, frames = video.read()
if ret == True:
face = cv2.cvtColor(frames, cv2.COLOR_BGR2GRAY)
face = cv2.resize(face, (224, 224))
face = face[
int(112 - (112 / 2)) : int(112 + (112 / 2)),
int(112 - (112 / 2)) : int(112 + (112 / 2)),
]
videoFeature.append(face)
else:
break
video.release()
videoFeature = numpy.array(videoFeature)
length = min(
(audioFeature.shape[0] - audioFeature.shape[0] % 4) / 100,
videoFeature.shape[0] / 25,
)
audioFeature = audioFeature[: int(round(length * 100)), :]
videoFeature = videoFeature[: int(round(length * 25)), :, :]
allScore = [] # Evaluation using TalkNet
for duration in durationSet:
batchSize = int(math.ceil(length / duration))
scores = []
with torch.no_grad():
for i in range(batchSize):
inputA = (
torch.FloatTensor(
audioFeature[
i * duration * 100 : (i + 1) * duration * 100, :
]
)
.unsqueeze(0)
.cuda()
)
inputV = (
torch.FloatTensor(
videoFeature[
i * duration * 25 : (i + 1) * duration * 25, :, :
]
)
.unsqueeze(0)
.cuda()
)
embedA = s.model.forward_audio_frontend(inputA)
embedV = s.model.forward_visual_frontend(inputV)
embedA, embedV = s.model.forward_cross_attention(embedA, embedV)
out = s.model.forward_audio_visual_backend(embedA, embedV)
score = s.lossAV.forward(out, labels=None)
scores.extend(score)
allScore.append(scores)
allScore = numpy.round((numpy.mean(numpy.array(allScore), axis=0)), 1).astype(
float
)
allScores.append(allScore)
return allScores
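# evaluate_network returns one score array per cropped track (one value per video frame),
# averaged over the durations in durationSet; scores >= 0 are treated as "speaking" in visualization().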
def visualization(tracks, scores, args):
# CPU: visualize the result in video format
flist = glob.glob(os.path.join(args.pyframesPath, "*.jpg"))
flist.sort()
faces = [[] for i in range(len(flist))]
for tidx, track in enumerate(tracks):
score = scores[tidx]
for fidx, frame in enumerate(track["track"]["frame"].tolist()):
s = score[
max(fidx - 2, 0) : min(fidx + 3, len(score) - 1)
] # average smoothing
s = numpy.mean(s)
faces[frame].append(
{
"track": tidx,
"score": float(s),
"s": track["proc_track"]["s"][fidx],
"x": track["proc_track"]["x"][fidx],
"y": track["proc_track"]["y"][fidx],
}
)
firstImage = cv2.imread(flist[0])
fw = firstImage.shape[1]
fh = firstImage.shape[0]
vOut = cv2.VideoWriter(
os.path.join(args.pyaviPath, "video_only.avi"),
cv2.VideoWriter_fourcc(*"XVID"),
25,
(fw, fh),
)
colorDict = {0: 0, 1: 255}
for fidx, fname in tqdm.tqdm(enumerate(flist), total=len(flist)):
image = cv2.imread(fname)
for face in faces[fidx]:
clr = colorDict[int((face["score"] >= 0))]
txt = round(face["score"], 1)
cv2.rectangle(
image,
(int(face["x"] - face["s"]), int(face["y"] - face["s"])),
(int(face["x"] + face["s"]), int(face["y"] + face["s"])),
(0, clr, 255 - clr),
10,
)
cv2.putText(
image,
"%s" % (txt),
(int(face["x"] - face["s"]), int(face["y"] - face["s"])),
cv2.FONT_HERSHEY_SIMPLEX,
1.5,
(0, clr, 255 - clr),
5,
)
vOut.write(image)
vOut.release()
command = (
"ffmpeg -y -i %s -i %s -threads %d -c:v copy -c:a copy %s -loglevel panic"
% (
os.path.join(args.pyaviPath, "video_only.avi"),
os.path.join(args.pyaviPath, "audio.wav"),
args.nDataLoaderThread,
os.path.join(args.pyaviPath, "video_out.avi"),
)
)
output = subprocess.call(command, shell=True, stdout=None)
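# visualization writes pyavi/video_only.avi with the annotated boxes and scores, then muxes
# the extracted audio into pyavi/video_out.avi; faces with score >= 0 are drawn in green, the rest in red.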
def evaluate_col_ASD(tracks, scores, args):
txtPath = args.videoFolder + "/col_labels/fusion/*.txt" # Load labels
predictionSet = {}
for name in {"long", "bell", "boll", "lieb", "sick", "abbas"}:
predictionSet[name] = [[], []]
dictGT = {}
txtFiles = glob.glob("%s" % txtPath)
for file in txtFiles:
lines = open(file).read().splitlines()
idName = file.split("/")[-1][:-4]
for line in lines:
data = line.split("\t")
frame = int(int(data[0]) / 29.97 * 25)
x1 = int(data[1])
y1 = int(data[2])
x2 = int(data[1]) + int(data[3])
y2 = int(data[2]) + int(data[3])
gt = int(data[4])
if frame in dictGT:
dictGT[frame].append([x1, y1, x2, y2, gt, idName])
else:
dictGT[frame] = [[x1, y1, x2, y2, gt, idName]]
flist = glob.glob(os.path.join(args.pyframesPath, "*.jpg")) # Load files
flist.sort()
faces = [[] for i in range(len(flist))]
for tidx, track in enumerate(tracks):
score = scores[tidx]
for fidx, frame in enumerate(track["track"]["frame"].tolist()):
s = numpy.mean(
score[max(fidx - 2, 0) : min(fidx + 3, len(score) - 1)]
) # average smoothing
faces[frame].append(
{
"track": tidx,
"score": float(s),
"s": track["proc_track"]["s"][fidx],
"x": track["proc_track"]["x"][fidx],
"y": track["proc_track"]["y"][fidx],
}
)
for fidx, fname in tqdm.tqdm(enumerate(flist), total=len(flist)):
if fidx in dictGT: # This frame has label
for gtThisFrame in dictGT[fidx]: # What is this label?
faceGT = gtThisFrame[0:4]
labelGT = gtThisFrame[4]
idGT = gtThisFrame[5]
ious = []
for face in faces[fidx]: # Find the right face in my result
faceLocation = [
int(face["x"] - face["s"]),
int(face["y"] - face["s"]),
int(face["x"] + face["s"]),
int(face["y"] + face["s"]),
]
faceLocation_new = [
int(face["x"] - face["s"]) // 2,
int(face["y"] - face["s"]) // 2,
int(face["x"] + face["s"]) // 2,
int(face["y"] + face["s"]) // 2,
]
iou = bb_intersection_over_union(
faceLocation_new, faceGT, evalCol=True
)
if iou > 0.5:
ious.append([iou, round(face["score"], 2)])
if len(ious) > 0: # Find my result
ious.sort()
labelPredict = ious[-1][1]
else:
labelPredict = 0
x1 = faceGT[0]
y1 = faceGT[1]
width = faceGT[2] - faceGT[0]
predictionSet[idGT][0].append(labelPredict)
predictionSet[idGT][1].append(labelGT)
names = ["long", "bell", "boll", "lieb", "sick", "abbas"] # Evaluate
names.sort()
F1s = 0
for i in names:
scores = numpy.array(predictionSet[i][0])
labels = numpy.array(predictionSet[i][1])
scores = numpy.int64(scores > 0)
F1 = f1_score(labels, scores)
ACC = accuracy_score(labels, scores)
if i != "abbas":
F1s += F1
print("%s, ACC:%.2f, F1:%.2f" % (i, 100 * ACC, 100 * F1))
print("Average F1:%.2f" % (100 * (F1s / 5)))
# Main function
def main():
# This preprocessing pipeline is adapted from this [repository](https://github.com/joonson/syncnet_python).
# ```
# .
# ├── pyavi
# │   ├── audio.wav (Audio from input video)
# │   ├── video.avi (Copy of the input video)
# │   ├── video_only.avi (Output video without audio)
# │   └── video_out.avi (Output video with audio)
# ├── pycrop (The detected face videos and audios)
# │   ├── 000000.avi
# │   ├── 000000.wav
# │   ├── 000001.avi
# │   ├── 000001.wav
# │   └── ...
# ├── pyframes (All the video frames in this video)
# │   ├── 000001.jpg
# │   ├── 000002.jpg
# │   └── ...
# └── pywork
#     ├── faces.pckl (face detection result)
#     ├── scene.pckl (scene detection result)
#     ├── scores.pckl (ASD result)
#     └── tracks.pckl (face tracking result)
# ```
# Initialization
args.pyaviPath = os.path.join(args.savePath, "pyavi")
args.pyframesPath = os.path.join(args.savePath, "pyframes")
args.pyworkPath = os.path.join(args.savePath, "pywork")
args.pycropPath = os.path.join(args.savePath, "pycrop")
if os.path.exists(args.savePath):
rmtree(args.savePath)
os.makedirs(
args.pyaviPath, exist_ok=True
) # The path for the input video, input audio, output video
os.makedirs(args.pyframesPath, exist_ok=True) # Save all the video frames
os.makedirs(
args.pyworkPath, exist_ok=True
) # Save the intermediate results of this process as pickle (.pckl) files
os.makedirs(
args.pycropPath, exist_ok=True
) # Save the detected face clips (audio+video) in this process
# Extract video
args.videoFilePath = os.path.join(args.pyaviPath, "video.avi")
# If duration is not set, extract the whole video; otherwise extract from 'args.start' to 'args.start + args.duration'
if args.duration == 0:
command = (
"ffmpeg -y -i %s -qscale:v 2 -threads %d -async 1 -r 25 %s -loglevel panic"
% (args.videoPath, args.nDataLoaderThread, args.videoFilePath)
)
else:
command = (
"ffmpeg -y -i %s -qscale:v 2 -threads %d -ss %.3f -to %.3f -async 1 -r 25 %s -loglevel panic"
% (
args.videoPath,
args.nDataLoaderThread,
args.start,
args.start + args.duration,
args.videoFilePath,
)
)
subprocess.call(command, shell=True, stdout=None)
sys.stderr.write(
time.strftime("%Y-%m-%d %H:%M:%S")
+ " Extract the video and save in %s \r\n" % (args.videoFilePath)
)
# Extract audio
args.audioFilePath = os.path.join(args.pyaviPath, "audio.wav")
command = (
"ffmpeg -y -i %s -qscale:a 0 -ac 1 -vn -threads %d -ar 16000 %s -loglevel panic"
% (args.videoFilePath, args.nDataLoaderThread, args.audioFilePath)
)
subprocess.call(command, shell=True, stdout=None)
sys.stderr.write(
time.strftime("%Y-%m-%d %H:%M:%S")
+ " Extract the audio and save in %s \r\n" % (args.audioFilePath)
)
# Extract the video frames
command = "ffmpeg -y -i %s -qscale:v 2 -threads %d -f image2 %s -loglevel panic" % (
args.videoFilePath,
args.nDataLoaderThread,
os.path.join(args.pyframesPath, "%06d.jpg"),
)
subprocess.call(command, shell=True, stdout=None)
sys.stderr.write(
time.strftime("%Y-%m-%d %H:%M:%S")
+ " Extract the frames and save in %s \r\n" % (args.pyframesPath)
)
# Scene detection for the video frames
scene = scene_detect(args)
sys.stderr.write(
time.strftime("%Y-%m-%d %H:%M:%S")
+ " Scene detection and save in %s \r\n" % (args.pyworkPath)
)
# Face detection for the video frames
faces = inference_video(args)
sys.stderr.write(
time.strftime("%Y-%m-%d %H:%M:%S")
+ " Face detection and save in %s \r\n" % (args.pyworkPath)
)
# Face tracking
allTracks, vidTracks = [], []
for shot in scene:
if (
shot[1].frame_num - shot[0].frame_num >= args.minTrack
): # Discard shots shorter than minTrack frames
allTracks.extend(
track_shot(args, faces[shot[0].frame_num : shot[1].frame_num])
) # 'frame' gives the track's frame indices, 'bbox' gives the face locations
sys.stderr.write(
time.strftime("%Y-%m-%d %H:%M:%S")
+ " Face track and detected %d tracks \r\n" % len(allTracks)
)
# Face clips cropping
for ii, track in tqdm.tqdm(enumerate(allTracks), total=len(allTracks)):
vidTracks.append(
crop_video(args, track, os.path.join(args.pycropPath, "%05d" % ii))
)
savePath = os.path.join(args.pyworkPath, "tracks.pckl")
with open(savePath, "wb") as fil:
pickle.dump(vidTracks, fil)
sys.stderr.write(
time.strftime("%Y-%m-%d %H:%M:%S")
+ " Face Crop and saved in %s tracks \r\n" % args.pycropPath
)
fil = open(savePath, "rb")
vidTracks = pickle.load(fil)
# Active Speaker Detection by TalkNet
files = glob.glob("%s/*.avi" % args.pycropPath)
files.sort()
scores = evaluate_network(files, args)
savePath = os.path.join(args.pyworkPath, "scores.pckl")
with open(savePath, "wb") as fil:
pickle.dump(scores, fil)
sys.stderr.write(
time.strftime("%Y-%m-%d %H:%M:%S")
+ " Scores extracted and saved in %s \r\n" % args.pyworkPath
)
if args.evalCol == True:
evaluate_col_ASD(
vidTracks, scores, args
) # The Columbia video is too big for visualization; you can still call the visualization function here if you want
quit()
else:
# Visualization, save the result as the new video
visualization(vidTracks, scores, args)
if __name__ == "__main__":
main()