"""TalkNet demo / Columbia ASD evaluation script.

Pipeline: extract video, audio and frames with ffmpeg, detect scenes, detect
and track faces, crop one clip per face track, score every clip with the
pretrained TalkNet model, then either render a visualization video or
evaluate against the Columbia labels.
"""

import argparse
import glob
import math
import os
import pdb
import pickle
import subprocess
import sys
import time
import warnings
from shutil import rmtree

import tqdm
import torch
import cv2
import numpy
import python_speech_features
from scipy import signal
from scipy.io import wavfile
from scipy.interpolate import interp1d
from sklearn.metrics import accuracy_score, f1_score
from scenedetect.video_manager import VideoManager
from scenedetect.scene_manager import SceneManager
from scenedetect.frame_timecode import FrameTimecode
from scenedetect.stats_manager import StatsManager
from scenedetect.detectors import ContentDetector

from model.faceDetector.s3fd import S3FD
from talkNet import talkNet

warnings.filterwarnings("ignore")

parser = argparse.ArgumentParser(description="TalkNet Demo or Columnbia ASD Evaluation")
parser.add_argument("--videoName", type=str, default="001", help="Demo video name")
parser.add_argument(
    "--videoFolder", type=str, default="demo", help="Path for inputs, tmps and outputs"
)
parser.add_argument(
    "--pretrainModel",
    type=str,
    default="pretrain_TalkSet.model",
    help="Path for the pretrained TalkNet model",
)
parser.add_argument(
    "--nDataLoaderThread", type=int, default=10, help="Number of workers"
)
parser.add_argument(
    "--facedetScale",
    type=float,
    default=0.25,
    help="Scale factor for face detection, the frames will be scale to 0.25 orig",
)
parser.add_argument(
    "--minTrack", type=int, default=10, help="Number of min frames for each shot"
)
parser.add_argument(
    "--numFailedDet",
    type=int,
    default=10,
    help="Number of missed detections allowed before tracking is stopped",
)
parser.add_argument(
    "--minFaceSize", type=int, default=1, help="Minimum face size in pixels"
)
parser.add_argument("--cropScale", type=float, default=0.40, help="Scale bounding box")
parser.add_argument("--start", type=int, default=0, help="The start time of the video")
parser.add_argument(
    "--duration",
    type=int,
    default=0,
    help="The duration of the video, when set as 0, will extract the whole video",
)
parser.add_argument(
    "--evalCol",
    dest="evalCol",
    action="store_true",
    help="Evaluate on Columnbia dataset",
)
parser.add_argument(
    "--colSavePath",
    type=str,
    default="/data08/col",
    help="Path for inputs, tmps and outputs",
)


def _run_command(cmd):
    """Run an external program given as an argument list and return its exit code.

    The command is executed without a shell, so paths containing spaces or
    shell metacharacters are passed through unchanged. Failures are reported
    on stderr rather than raised, keeping the best-effort behaviour the
    pipeline had before; -1 is returned when the program cannot be launched.
    """
    try:
        returnCode = subprocess.call(cmd, stdout=None)
    except OSError as err:
        sys.stderr.write("Failed to launch %s: %s\r\n" % (cmd[0], err))
        return -1
    if returnCode != 0:
        sys.stderr.write("Command %s exited with status %d\r\n" % (cmd[0], returnCode))
    return returnCode


def _log_step(message):
    """Write a timestamp followed by *message* to stderr."""
    sys.stderr.write(time.strftime("%Y-%m-%d %H:%M:%S") + message)


args = parser.parse_args()

if not os.path.isfile(args.pretrainModel):  # Download the pretrained model
    Link = "1AbN9fCf9IexMxEKXLQY2KYBlb-IhSEea"
    _run_command(["gdown", "--id", Link, "-O", args.pretrainModel])

if args.evalCol:
    # The process is:
    # 1. download the video and labels (the label format was modified by the
    #    original author to make it easier to use)
    # 2. extract audio, extract video frames
    # 3. scene detection, face detection and face tracking
    # 4. active speaker detection for the detected face clips
    # 5. use IOU to find the identity of each face clip, compute the F1 results
    # Steps 1 to 3 take some time (a one-time process) and depend on CPU and GPU
    # speed; the original author reports about 1.5 hours.
    # Steps 4 and 5 need less than 10 minutes.
    # About 20G of disk space is needed in the end.
    args.videoName = "col"
    args.videoFolder = args.colSavePath
    args.savePath = os.path.join(args.videoFolder, args.videoName)
    args.videoPath = os.path.join(args.videoFolder, args.videoName + ".mp4")
    args.duration = 0
    if not os.path.isfile(args.videoPath):  # Download video
        link = "https://www.youtube.com/watch?v=6GzxbrO0DHM&t=2s"
        _run_command(["youtube-dl", "-f", "best", "-o", args.videoPath, link])
    if not os.path.isdir(args.videoFolder + "/col_labels"):  # Download label
        link = "1Tto5JBt6NsEOLFRWzyZEeV6kCCddc6wv"
        labelArchive = args.videoFolder + "/col_labels.tar.gz"
        _run_command(["gdown", "--id", link, "-O", labelArchive])
        _run_command(["tar", "-xzvf", labelArchive, "-C", args.videoFolder])
        os.remove(labelArchive)
else:
    videoCandidates = glob.glob(os.path.join(args.videoFolder, args.videoName + ".*"))
    if not videoCandidates:
        # Fail with an explicit message instead of an IndexError on [0].
        raise FileNotFoundError(
            "No input video matching %s.* found in %s"
            % (args.videoName, args.videoFolder)
        )
    args.videoPath = videoCandidates[0]
    args.savePath = os.path.join(args.videoFolder, args.videoName)


def scene_detect(args):
    """CPU: scene detection.

    Runs the PySceneDetect content detector on ``args.videoFilePath``, pickles
    the resulting scene list to ``<args.pyworkPath>/scene.pckl`` and returns it.
    When no cut is detected the whole video is returned as a single scene.
    """
    videoManager = VideoManager([args.videoFilePath])
    statsManager = StatsManager()
    sceneManager = SceneManager(statsManager)
    sceneManager.add_detector(ContentDetector())
    baseTimecode = videoManager.get_base_timecode()
    videoManager.set_downscale_factor()
    videoManager.start()
    try:
        sceneManager.detect_scenes(frame_source=videoManager)
        sceneList = sceneManager.get_scene_list(baseTimecode)
        if not sceneList:
            sceneList = [
                (videoManager.get_base_timecode(), videoManager.get_current_timecode())
            ]
    finally:
        # Release the underlying capture even when detection raises.
        videoManager.release()
    savePath = os.path.join(args.pyworkPath, "scene.pckl")
    with open(savePath, "wb") as fil:
        pickle.dump(sceneList, fil)
    sys.stderr.write(
        "%s - scenes detected %d\n" % (args.videoFilePath, len(sceneList))
    )
    return sceneList


def inference_video(args):
    """GPU: face detection on every extracted frame.

    Returns a list with one entry per frame (frames sorted by file name); each
    entry is a list of dicts with keys ``frame`` (frame index), ``bbox`` and
    ``conf``. The result is also pickled to ``<args.pyworkPath>/faces.pckl``.
    """
    detector = S3FD(device="cuda")
    flist = sorted(glob.glob(os.path.join(args.pyframesPath, "*.jpg")))
    dets = []
    for fidx, fname in enumerate(flist):
        image = cv2.imread(fname)
        imageNumpy = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        bboxes = detector.detect_faces(
            imageNumpy, conf_th=0.9, scales=[args.facedetScale]
        )
        # The last element of each detection is used as the confidence, the
        # preceding elements as the box.
        dets.append(
            [
                {"frame": fidx, "bbox": (bbox[:-1]).tolist(), "conf": bbox[-1]}
                for bbox in bboxes
            ]
        )
        sys.stderr.write(
            "%s-%05d; %d dets\r" % (args.videoFilePath, fidx, len(dets[-1]))
        )
    savePath = os.path.join(args.pyworkPath, "faces.pckl")
    with open(savePath, "wb") as fil:
        pickle.dump(dets, fil)
    return dets


def bb_intersection_over_union(boxA, boxB, evalCol=False):
    """CPU: overlap between two boxes given as ``[x1, y1, x2, y2]``.

    With ``evalCol`` false the result is the usual intersection over union.
    With ``evalCol`` true the intersection is divided by the area of ``boxA``
    only. Returns 0.0 when the denominator is not positive (degenerate boxes)
    instead of dividing by zero.
    """
    xA = max(boxA[0], boxB[0])
    yA = max(boxA[1], boxB[1])
    xB = min(boxA[2], boxB[2])
    yB = min(boxA[3], boxB[3])
    interArea = max(0, xB - xA) * max(0, yB - yA)
    boxAArea = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1])
    boxBArea = (boxB[2] - boxB[0]) * (boxB[3] - boxB[1])
    if evalCol:
        denominator = float(boxAArea)
    else:
        denominator = float(boxAArea + boxBArea - interArea)
    if denominator <= 0:
        return 0.0
    return interArea / denominator


def track_shot(args, sceneFaces):
    """CPU: face tracking inside one shot.

    ``sceneFaces`` is a list of per-frame detection lists as produced by
    ``inference_video``. Detections are greedily chained into tracks: a face
    joins the current track when it is at most ``args.numFailedDet`` frames
    after the track's last face and their IOU exceeds 0.5. Tracks longer than
    ``args.minTrack`` detections are linearly interpolated over missing frames
    and kept when their mean width or height exceeds ``args.minFaceSize``.

    Note: detections are removed from the lists in ``sceneFaces`` as they are
    assigned, so the caller's lists are consumed.

    Returns a list of dicts with keys ``frame`` (array of frame indices) and
    ``bbox`` (array with one interpolated box per frame).
    """
    iouThres = 0.5  # Minimum IOU between consecutive face detections
    tracks = []
    while True:
        track = []
        for frameFaces in sceneFaces:
            for faceIdx, face in enumerate(frameFaces):
                if not track:
                    matched = True
                elif face["frame"] - track[-1]["frame"] <= args.numFailedDet:
                    iou = bb_intersection_over_union(face["bbox"], track[-1]["bbox"])
                    matched = iou > iouThres
                else:
                    break
                if matched:
                    track.append(face)
                    del frameFaces[faceIdx]
                    # Stop scanning this frame: a track takes at most one face
                    # per frame, and the list must not be iterated further
                    # after it has been modified.
                    break
        if not track:
            break
        if len(track) > args.minTrack:
            frameNum = numpy.array([f["frame"] for f in track])
            bboxes = numpy.array([numpy.array(f["bbox"]) for f in track])
            frameI = numpy.arange(frameNum[0], frameNum[-1] + 1)
            bboxesI = numpy.stack(
                [interp1d(frameNum, bboxes[:, ij])(frameI) for ij in range(4)],
                axis=1,
            )
            meanWidth = numpy.mean(bboxesI[:, 2] - bboxesI[:, 0])
            meanHeight = numpy.mean(bboxesI[:, 3] - bboxesI[:, 1])
            if max(meanWidth, meanHeight) > args.minFaceSize:
                tracks.append({"frame": frameI, "bbox": bboxesI})
    return tracks


def crop_video(args, track, cropFile):
    """CPU: crop the face clip for one track.

    Writes ``<cropFile>.avi`` (224x224 face video muxed with the matching
    audio) and ``<cropFile>.wav`` (16 kHz mono audio for the track's time
    span, assuming 25 frames per second). Returns a dict with the original
    ``track`` and ``proc_track``, the median-filtered box half-size ``s`` and
    centre ``x`` / ``y`` per frame.
    """
    flist = sorted(glob.glob(os.path.join(args.pyframesPath, "*.jpg")))  # Read the frames
    vOut = cv2.VideoWriter(
        cropFile + "t.avi", cv2.VideoWriter_fourcc(*"XVID"), 25, (224, 224)
    )  # Write video
    dets = {"x": [], "y": [], "s": []}
    for det in track["bbox"]:  # Read the tracks
        dets["s"].append(max((det[3] - det[1]), (det[2] - det[0])) / 2)
        dets["y"].append((det[1] + det[3]) / 2)  # crop center y
        dets["x"].append((det[0] + det[2]) / 2)  # crop center x
    dets["s"] = signal.medfilt(dets["s"], kernel_size=13)  # Smooth detections
    dets["x"] = signal.medfilt(dets["x"], kernel_size=13)
    dets["y"] = signal.medfilt(dets["y"], kernel_size=13)
    cs = args.cropScale
    for fidx, frame in enumerate(track["frame"]):
        bs = dets["s"][fidx]  # Detection box size
        bsi = int(bs * (1 + 2 * cs))  # Pad videos by this amount
        image = cv2.imread(flist[frame])
        padded = numpy.pad(
            image,
            ((bsi, bsi), (bsi, bsi), (0, 0)),
            "constant",
            constant_values=(110, 110),
        )
        my = dets["y"][fidx] + bsi  # BBox center Y
        mx = dets["x"][fidx] + bsi  # BBox center X
        face = padded[
            int(my - bs) : int(my + bs * (1 + 2 * cs)),
            int(mx - bs * (1 + cs)) : int(mx + bs * (1 + cs)),
        ]
        vOut.write(cv2.resize(face, (224, 224)))
    audioTmp = cropFile + ".wav"
    audioStart = (track["frame"][0]) / 25
    audioEnd = (track["frame"][-1] + 1) / 25
    vOut.release()
    # Crop audio file
    _run_command(
        [
            "ffmpeg", "-y", "-i", args.audioFilePath,
            "-async", "1", "-ac", "1", "-vn",
            "-acodec", "pcm_s16le", "-ar", "16000",
            "-threads", str(args.nDataLoaderThread),
            "-ss", "%.3f" % audioStart, "-to", "%.3f" % audioEnd,
            audioTmp, "-loglevel", "panic",
        ]
    )
    # Combine audio and video file
    _run_command(
        [
            "ffmpeg", "-y", "-i", cropFile + "t.avi", "-i", audioTmp,
            "-threads", str(args.nDataLoaderThread),
            "-c:v", "copy", "-c:a", "copy",
            cropFile + ".avi", "-loglevel", "panic",
        ]
    )
    os.remove(cropFile + "t.avi")
    return {"track": track, "proc_track": dets}


def extract_MFCC(file, outPath):
    """CPU: extract MFCC features from a wav file and save them as ``.npy``.

    The output file keeps the input's base name and is written to ``outPath``.
    """
    sr, audio = wavfile.read(file)
    # Per the original author's note: (N_frames, 13) [1s = 100 frames]
    mfcc = python_speech_features.mfcc(audio, sr)
    featuresPath = os.path.join(
        outPath, os.path.basename(file).replace(".wav", ".npy")
    )
    numpy.save(featuresPath, mfcc)


def evaluate_network(files, args):
    """GPU: active speaker detection with the pretrained TalkNet model.

    For every clip in ``files`` the matching ``.wav`` / ``.avi`` pair in
    ``args.pycropPath`` is loaded, scored in chunks of several durations, and
    the per-frame scores are averaged over those durations and rounded to one
    decimal. Returns one score array per clip, in the order of ``files``.
    """
    s = talkNet()
    s.loadParameters(args.pretrainModel)
    sys.stderr.write("Model %s loaded from previous state! \r\n" % args.pretrainModel)
    s.eval()
    allScores = []
    # Chunk lengths in seconds. The original literal listed some values several
    # times inside a set, which collapses to exactly these six durations; it is
    # spelled out here so the code says what it does. Results are unchanged.
    durationSet = (1, 2, 3, 4, 5, 6)
    for file in tqdm.tqdm(files, total=len(files)):
        fileName = os.path.splitext(os.path.basename(file))[0]  # Load audio and video
        _, audio = wavfile.read(os.path.join(args.pycropPath, fileName + ".wav"))
        audioFeature = python_speech_features.mfcc(
            audio, 16000, numcep=13, winlen=0.025, winstep=0.010
        )
        video = cv2.VideoCapture(os.path.join(args.pycropPath, fileName + ".avi"))
        videoFeature = []
        while video.isOpened():
            ret, frames = video.read()
            if not ret:
                break
            face = cv2.cvtColor(frames, cv2.COLOR_BGR2GRAY)
            face = cv2.resize(face, (224, 224))
            # Keep the central 112x112 region of the 224x224 frame.
            face = face[
                int(112 - (112 / 2)) : int(112 + (112 / 2)),
                int(112 - (112 / 2)) : int(112 + (112 / 2)),
            ]
            videoFeature.append(face)
        video.release()
        videoFeature = numpy.array(videoFeature)
        # Common length in seconds: 100 audio feature frames and 25 video
        # frames per second; audio is truncated to a multiple of 4 frames.
        length = min(
            (audioFeature.shape[0] - audioFeature.shape[0] % 4) / 100,
            videoFeature.shape[0] / 25,
        )
        audioFeature = audioFeature[: int(round(length * 100)), :]
        videoFeature = videoFeature[: int(round(length * 25)), :, :]
        allScore = []  # Evaluation use TalkNet
        for duration in durationSet:
            batchSize = int(math.ceil(length / duration))
            scores = []
            with torch.no_grad():
                for i in range(batchSize):
                    audioChunk = audioFeature[
                        i * duration * 100 : (i + 1) * duration * 100, :
                    ]
                    videoChunk = videoFeature[
                        i * duration * 25 : (i + 1) * duration * 25, :, :
                    ]
                    inputA = torch.FloatTensor(audioChunk).unsqueeze(0).cuda()
                    inputV = torch.FloatTensor(videoChunk).unsqueeze(0).cuda()
                    embedA = s.model.forward_audio_frontend(inputA)
                    embedV = s.model.forward_visual_frontend(inputV)
                    embedA, embedV = s.model.forward_cross_attention(embedA, embedV)
                    out = s.model.forward_audio_visual_backend(embedA, embedV)
                    score = s.lossAV.forward(out, labels=None)
                    scores.extend(score)
            allScore.append(scores)
        allScore = numpy.round((numpy.mean(numpy.array(allScore), axis=0)), 1).astype(
            float
        )
        allScores.append(allScore)
    return allScores


def _smoothed_score(score, fidx, radius=2):
    """Return the mean of ``score`` over a window of ``radius`` around ``fidx``.

    The window is clipped to the valid index range. When ``fidx`` lies past
    the end of ``score`` the window is centred on the last available entry;
    NaN is returned only when ``score`` is empty.
    """
    if len(score) == 0:
        return float("nan")
    center = min(fidx, len(score) - 1)
    # Slice end is exclusive, so +1 is needed to include center + radius.
    window = score[max(center - radius, 0) : center + radius + 1]
    return float(numpy.mean(window))


def _faces_per_frame(tracks, scores, numFrames):
    """Group track results by frame.

    Returns a list of length ``numFrames``; entry *i* holds one dict per track
    covering frame *i*, with the track index, the smoothed score and the
    processed box half-size ``s`` and centre ``x`` / ``y``.
    """
    faces = [[] for _ in range(numFrames)]
    for tidx, track in enumerate(tracks):
        score = scores[tidx]
        procTrack = track["proc_track"]
        for fidx, frame in enumerate(track["track"]["frame"].tolist()):
            faces[frame].append(
                {
                    "track": tidx,
                    "score": _smoothed_score(score, fidx),  # average smoothing
                    "s": procTrack["s"][fidx],
                    "x": procTrack["x"][fidx],
                    "y": procTrack["y"][fidx],
                }
            )
    return faces


def visualization(tracks, scores, args):
    """CPU: visualize the result as a video.

    Draws a box and the smoothed score for every tracked face on each frame
    (one colour for scores >= 0, another for scores below 0), writes
    ``video_only.avi`` and muxes it with ``audio.wav`` into ``video_out.avi``
    inside ``args.pyaviPath``.
    """
    flist = sorted(glob.glob(os.path.join(args.pyframesPath, "*.jpg")))
    faces = _faces_per_frame(tracks, scores, len(flist))
    firstImage = cv2.imread(flist[0])
    fw = firstImage.shape[1]
    fh = firstImage.shape[0]
    videoOnlyPath = os.path.join(args.pyaviPath, "video_only.avi")
    vOut = cv2.VideoWriter(
        videoOnlyPath,
        cv2.VideoWriter_fourcc(*"XVID"),
        25,
        (fw, fh),
    )
    for fidx, fname in tqdm.tqdm(enumerate(flist), total=len(flist)):
        image = cv2.imread(fname)
        for face in faces[fidx]:
            clr = 255 if face["score"] >= 0 else 0
            txt = round(face["score"], 1)
            topLeft = (int(face["x"] - face["s"]), int(face["y"] - face["s"]))
            bottomRight = (int(face["x"] + face["s"]), int(face["y"] + face["s"]))
            cv2.rectangle(image, topLeft, bottomRight, (0, clr, 255 - clr), 10)
            cv2.putText(
                image,
                "%s" % (txt),
                topLeft,
                cv2.FONT_HERSHEY_SIMPLEX,
                1.5,
                (0, clr, 255 - clr),
                5,
            )
        vOut.write(image)
    vOut.release()
    _run_command(
        [
            "ffmpeg", "-y", "-i", videoOnlyPath,
            "-i", os.path.join(args.pyaviPath, "audio.wav"),
            "-threads", str(args.nDataLoaderThread),
            "-c:v", "copy", "-c:a", "copy",
            os.path.join(args.pyaviPath, "video_out.avi"),
            "-loglevel", "panic",
        ]
    )


def evaluate_col_ASD(tracks, scores, args):
    """Evaluate the predictions against the Columbia labels and print ACC / F1.

    Labels are read from ``<args.videoFolder>/col_labels/fusion/*.txt``; label
    frame numbers are converted from 29.97 to 25 frames per second. For each
    labelled face the prediction is the score of the detected face (box
    coordinates halved) whose overlap with the label exceeds 0.5, or 0 when
    there is none. The printed average F1 excludes the identity "abbas".
    """
    txtPath = args.videoFolder + "/col_labels/fusion/*.txt"  # Load labels
    names = ["long", "bell", "boll", "lieb", "sick", "abbas"]
    predictionSet = {name: [[], []] for name in names}
    dictGT = {}
    for file in glob.glob("%s" % txtPath):
        with open(file) as labelFile:
            lines = labelFile.read().splitlines()
        idName = os.path.basename(file)[:-4]
        for line in lines:
            data = line.split("\t")
            frame = int(int(data[0]) / 29.97 * 25)
            x1 = int(data[1])
            y1 = int(data[2])
            # The same field (data[3]) is used as both width and height.
            x2 = int(data[1]) + int(data[3])
            y2 = int(data[2]) + int(data[3])
            gt = int(data[4])
            dictGT.setdefault(frame, []).append([x1, y1, x2, y2, gt, idName])
    flist = sorted(glob.glob(os.path.join(args.pyframesPath, "*.jpg")))  # Load files
    faces = _faces_per_frame(tracks, scores, len(flist))
    for fidx in tqdm.tqdm(range(len(flist)), total=len(flist)):
        if fidx not in dictGT:  # This frame has no label
            continue
        for gtThisFrame in dictGT[fidx]:
            faceGT = gtThisFrame[0:4]
            labelGT = gtThisFrame[4]
            idGT = gtThisFrame[5]
            ious = []
            for face in faces[fidx]:  # Find the right face in my result
                faceLocation_new = [
                    int(face["x"] - face["s"]) // 2,
                    int(face["y"] - face["s"]) // 2,
                    int(face["x"] + face["s"]) // 2,
                    int(face["y"] + face["s"]) // 2,
                ]
                iou = bb_intersection_over_union(
                    faceLocation_new, faceGT, evalCol=True
                )
                if iou > 0.5:
                    ious.append([iou, round(face["score"], 2)])
            # Use the score of the best-overlapping face, if any.
            labelPredict = max(ious)[1] if ious else 0
            predictionSet[idGT][0].append(labelPredict)
            predictionSet[idGT][1].append(labelGT)
    names.sort()  # Evaluate
    F1s = 0
    for name in names:
        predictions = numpy.int64(numpy.array(predictionSet[name][0]) > 0)
        labels = numpy.array(predictionSet[name][1])
        F1 = f1_score(labels, predictions)
        ACC = accuracy_score(labels, predictions)
        if name != "abbas":
            F1s += F1
        print("%s, ACC:%.2f, F1:%.2f" % (name, 100 * ACC, 100 * F1))
    print("Average F1:%.2f" % (100 * (F1s / 5)))


# Main function
def main():
    """Run the whole pipeline using the module-level ``args``.

    This preprocessing is modified based on this
    [repository](https://github.com/joonson/syncnet_python).

    Directory layout created under ``args.savePath`` (any existing directory
    at that path is deleted first):

        .
        ├── pyavi
        │   ├── audio.wav (Audio from input video)
        │   ├── video.avi (Copy of the input video)
        │   ├── video_only.avi (Output video without audio)
        │   └── video_out.avi  (Output video with audio)
        ├── pycrop (The detected face videos and audios)
        │   ├── 000000.avi
        │   ├── 000000.wav
        │   ├── 000001.avi
        │   ├── 000001.wav
        │   └── ...
        ├── pyframes (All the video frames in this video)
        │   ├── 000001.jpg
        │   ├── 000002.jpg
        │   └── ...
        └── pywork
            ├── faces.pckl (face detection result)
            ├── scene.pckl (scene detection result)
            ├── scores.pckl (ASD result)
            └── tracks.pckl (face tracking result)
    """
    # Initialization
    args.pyaviPath = os.path.join(args.savePath, "pyavi")
    args.pyframesPath = os.path.join(args.savePath, "pyframes")
    args.pyworkPath = os.path.join(args.savePath, "pywork")
    args.pycropPath = os.path.join(args.savePath, "pycrop")
    if os.path.exists(args.savePath):
        rmtree(args.savePath)
    # The path for the input video, input audio, output video
    os.makedirs(args.pyaviPath, exist_ok=True)
    # Save all the video frames
    os.makedirs(args.pyframesPath, exist_ok=True)
    # Save the results in this process by the pckl method
    os.makedirs(args.pyworkPath, exist_ok=True)
    # Save the detected face clips (audio+video) in this process
    os.makedirs(args.pycropPath, exist_ok=True)

    # Extract video
    args.videoFilePath = os.path.join(args.pyaviPath, "video.avi")
    # If duration is not set, extract the whole video, otherwise extract the
    # video from 'args.start' to 'args.start + args.duration'
    command = [
        "ffmpeg", "-y", "-i", args.videoPath,
        "-qscale:v", "2", "-threads", str(args.nDataLoaderThread),
    ]
    if args.duration != 0:
        command += [
            "-ss", "%.3f" % args.start,
            "-to", "%.3f" % (args.start + args.duration),
        ]
    command += ["-async", "1", "-r", "25", args.videoFilePath, "-loglevel", "panic"]
    _run_command(command)
    _log_step(" Extract the video and save in %s \r\n" % (args.videoFilePath))

    # Extract audio
    args.audioFilePath = os.path.join(args.pyaviPath, "audio.wav")
    _run_command(
        [
            "ffmpeg", "-y", "-i", args.videoFilePath,
            "-qscale:a", "0", "-ac", "1", "-vn",
            "-threads", str(args.nDataLoaderThread),
            "-ar", "16000", args.audioFilePath, "-loglevel", "panic",
        ]
    )
    _log_step(" Extract the audio and save in %s \r\n" % (args.audioFilePath))

    # Extract the video frames
    _run_command(
        [
            "ffmpeg", "-y", "-i", args.videoFilePath,
            "-qscale:v", "2", "-threads", str(args.nDataLoaderThread),
            "-f", "image2", os.path.join(args.pyframesPath, "%06d.jpg"),
            "-loglevel", "panic",
        ]
    )
    _log_step(" Extract the frames and save in %s \r\n" % (args.pyframesPath))

    # Scene detection for the video frames
    scene = scene_detect(args)
    _log_step(" Scene detection and save in %s \r\n" % (args.pyworkPath))

    # Face detection for the video frames
    faces = inference_video(args)
    _log_step(" Face detection and save in %s \r\n" % (args.pyworkPath))

    # Face tracking
    allTracks, vidTracks = [], []
    for shot in scene:
        firstFrame = shot[0].frame_num
        lastFrame = shot[1].frame_num
        # Discard the shots with fewer than minTrack frames
        if lastFrame - firstFrame >= args.minTrack:
            # 'frame' holds the track's timesteps, 'bbox' the face locations
            allTracks.extend(track_shot(args, faces[firstFrame:lastFrame]))
    _log_step(" Face track and detected %d tracks \r\n" % len(allTracks))

    # Face clips cropping
    for ii, track in tqdm.tqdm(enumerate(allTracks), total=len(allTracks)):
        vidTracks.append(
            crop_video(args, track, os.path.join(args.pycropPath, "%05d" % ii))
        )
    savePath = os.path.join(args.pyworkPath, "tracks.pckl")
    with open(savePath, "wb") as fil:
        pickle.dump(vidTracks, fil)
    _log_step(" Face Crop and saved in %s tracks \r\n" % args.pycropPath)

    # Active Speaker Detection by TalkNet
    files = sorted(glob.glob("%s/*.avi" % args.pycropPath))
    scores = evaluate_network(files, args)
    savePath = os.path.join(args.pyworkPath, "scores.pckl")
    with open(savePath, "wb") as fil:
        pickle.dump(scores, fil)
    _log_step(" Scores extracted and saved in %s \r\n" % args.pyworkPath)

    if args.evalCol:
        # The Columbia video is too big for visualization. You can still call
        # `visualization` here if you want.
        evaluate_col_ASD(vidTracks, scores, args)
    else:
        # Visualization, save the result as the new video
        visualization(vidTracks, scores, args)


if __name__ == "__main__":
    main()