| | import sys, time, os, tqdm, torch, argparse, glob, subprocess, warnings, cv2, pickle, numpy, pdb, math, python_speech_features
|
| |
|
| | from scipy import signal
|
| | from shutil import rmtree
|
| | from scipy.io import wavfile
|
| | from scipy.interpolate import interp1d
|
| | from sklearn.metrics import accuracy_score, f1_score
|
| |
|
| | from scenedetect.video_manager import VideoManager
|
| | from scenedetect.scene_manager import SceneManager
|
| | from scenedetect.frame_timecode import FrameTimecode
|
| | from scenedetect.stats_manager import StatsManager
|
| | from scenedetect.detectors import ContentDetector
|
| |
|
| | from model.faceDetector.s3fd import S3FD
|
| | from talkNet import talkNet
|
| |
|
# Silence noisy third-party deprecation/user warnings so the progress output stays readable.
warnings.filterwarnings("ignore")

parser = argparse.ArgumentParser(description="TalkNet Demo or Columbia ASD Evaluation")

# Input selection.
parser.add_argument("--videoName", type=str, default="001", help="Demo video name")
parser.add_argument(
    "--videoFolder", type=str, default="demo", help="Path for inputs, tmps and outputs"
)
parser.add_argument(
    "--pretrainModel",
    type=str,
    default="pretrain_TalkSet.model",
    help="Path for the pretrained TalkNet model",
)

# Processing knobs.
parser.add_argument(
    "--nDataLoaderThread", type=int, default=10, help="Number of workers"
)
parser.add_argument(
    "--facedetScale",
    type=float,
    default=0.25,
    help="Scale factor for face detection, the frames will be scale to 0.25 orig",
)
parser.add_argument(
    "--minTrack", type=int, default=10, help="Number of min frames for each shot"
)
parser.add_argument(
    "--numFailedDet",
    type=int,
    default=10,
    help="Number of missed detections allowed before tracking is stopped",
)
parser.add_argument(
    "--minFaceSize", type=int, default=1, help="Minimum face size in pixels"
)
parser.add_argument("--cropScale", type=float, default=0.40, help="Scale bounding box")

# Temporal clipping of the input video.
parser.add_argument("--start", type=int, default=0, help="The start time of the video")
parser.add_argument(
    "--duration",
    type=int,
    default=0,
    help="The duration of the video, when set as 0, will extract the whole video",
)

# Columbia dataset evaluation mode.
parser.add_argument(
    "--evalCol",
    dest="evalCol",
    action="store_true",
    help="Evaluate on Columbia dataset",
)
parser.add_argument(
    "--colSavePath",
    type=str,
    default="/data08/col",
    help="Path for inputs, tmps and outputs",
)

args = parser.parse_args()

# Download the pretrained model from Google Drive on first run.
if not os.path.isfile(args.pretrainModel):
    Link = "1AbN9fCf9IexMxEKXLQY2KYBlb-IhSEea"
    cmd = "gdown --id %s -O %s" % (Link, args.pretrainModel)
    subprocess.call(cmd, shell=True, stdout=None)

if args.evalCol:
    # Columbia evaluation: fixed video name/folder; fetch the video and the
    # ground-truth labels if they are not present locally.
    args.videoName = "col"
    args.videoFolder = args.colSavePath
    args.savePath = os.path.join(args.videoFolder, args.videoName)
    args.videoPath = os.path.join(args.videoFolder, args.videoName + ".mp4")
    args.duration = 0
    if not os.path.isfile(args.videoPath):
        link = "https://www.youtube.com/watch?v=6GzxbrO0DHM&t=2s"
        cmd = "youtube-dl -f best -o %s '%s'" % (args.videoPath, link)
        output = subprocess.call(cmd, shell=True, stdout=None)
    if not os.path.isdir(args.videoFolder + "/col_labels"):
        link = "1Tto5JBt6NsEOLFRWzyZEeV6kCCddc6wv"
        cmd = "gdown --id %s -O %s" % (link, args.videoFolder + "/col_labels.tar.gz")
        subprocess.call(cmd, shell=True, stdout=None)
        cmd = "tar -xzvf %s -C %s" % (
            args.videoFolder + "/col_labels.tar.gz",
            args.videoFolder,
        )
        subprocess.call(cmd, shell=True, stdout=None)
        os.remove(args.videoFolder + "/col_labels.tar.gz")
else:
    # Demo mode: locate the input video by name (any extension).
    args.videoPath = glob.glob(os.path.join(args.videoFolder, args.videoName + ".*"))[0]
    args.savePath = os.path.join(args.videoFolder, args.videoName)
|
| |
|
| |
|
def scene_detect(args):
    """Segment the extracted video into shots with PySceneDetect.

    Runs a ContentDetector over args.videoFilePath, pickles the resulting
    scene list to <pyworkPath>/scene.pckl and returns it. When no cut is
    found, a single scene spanning the whole video is used instead.
    """
    video_manager = VideoManager([args.videoFilePath])
    stats_manager = StatsManager()
    scene_manager = SceneManager(stats_manager)
    scene_manager.add_detector(ContentDetector())
    base_timecode = video_manager.get_base_timecode()
    video_manager.set_downscale_factor()
    video_manager.start()
    scene_manager.detect_scenes(frame_source=video_manager)
    scene_list = scene_manager.get_scene_list(base_timecode)
    save_path = os.path.join(args.pyworkPath, "scene.pckl")
    if not scene_list:
        # No cut detected: treat the whole clip as one scene.
        scene_list = [
            (video_manager.get_base_timecode(), video_manager.get_current_timecode())
        ]
    with open(save_path, "wb") as fil:
        pickle.dump(scene_list, fil)
    sys.stderr.write(
        "%s - scenes detected %d\n" % (args.videoFilePath, len(scene_list))
    )
    return scene_list
|
| |
|
| |
|
def inference_video(args):
    """Run S3FD face detection on every extracted frame.

    Returns a list with one entry per frame, each a list of detections
    {'frame': index, 'bbox': [x1, y1, x2, y2], 'conf': score}, and pickles
    the result to <pyworkPath>/faces.pckl.
    """
    detector = S3FD(device="cuda")
    frame_files = sorted(glob.glob(os.path.join(args.pyframesPath, "*.jpg")))
    dets = []
    for fidx, fname in enumerate(frame_files):
        # S3FD expects RGB input; OpenCV loads BGR.
        rgb = cv2.cvtColor(cv2.imread(fname), cv2.COLOR_BGR2RGB)
        bboxes = detector.detect_faces(rgb, conf_th=0.9, scales=[args.facedetScale])
        frame_dets = [
            {"frame": fidx, "bbox": (bbox[:-1]).tolist(), "conf": bbox[-1]}
            for bbox in bboxes
        ]
        dets.append(frame_dets)
        sys.stderr.write(
            "%s-%05d; %d dets\r" % (args.videoFilePath, fidx, len(frame_dets))
        )
    save_path = os.path.join(args.pyworkPath, "faces.pckl")
    with open(save_path, "wb") as fil:
        pickle.dump(dets, fil)
    return dets
|
| |
|
| |
|
def bb_intersection_over_union(boxA, boxB, evalCol=False):
    """Return the overlap ratio of two [x1, y1, x2, y2] boxes.

    With evalCol=False this is the standard IoU (intersection / union).
    With evalCol=True the intersection is normalised by boxA's area only,
    matching how the Columbia evaluation compares predicted boxes to
    ground truth.

    Raises ZeroDivisionError for degenerate (zero-area) boxes.
    """
    # Corners of the intersection rectangle.
    xA = max(boxA[0], boxB[0])
    yA = max(boxA[1], boxB[1])
    xB = min(boxA[2], boxB[2])
    yB = min(boxA[3], boxB[3])
    # Clamp at zero so disjoint boxes yield no intersection.
    interArea = max(0, xB - xA) * max(0, yB - yA)
    boxAArea = (boxA[2] - boxA[0]) * (boxA[3] - boxA[1])
    boxBArea = (boxB[2] - boxB[0]) * (boxB[3] - boxB[1])
    if evalCol:
        iou = interArea / float(boxAArea)
    else:
        iou = interArea / float(boxAArea + boxBArea - interArea)
    return iou
|
| |
|
| |
|
def track_shot(args, sceneFaces):
    """Greedily link per-frame face detections within one scene into tracks.

    sceneFaces is a list (one entry per frame) of detection dicts
    {'frame', 'bbox', 'conf'} and is consumed destructively: every
    detection assigned to a track is removed from it, so repeated passes
    of the outer while-loop pick up the remaining detections.

    Returns a list of tracks {'frame': contiguous frame indices,
    'bbox': per-frame interpolated boxes}, keeping only tracks longer
    than args.minTrack frames whose mean box size exceeds
    args.minFaceSize.
    """
    iouThres = 0.5  # minimum IOU between consecutive face detections to link them
    tracks = []
    while True:
        track = []
        for frameFaces in sceneFaces:
            # NOTE(review): this removes elements from frameFaces while
            # iterating over it, which skips the element after each removal.
            # The greedy linking behaviour below depends on this exact
            # pattern, so it is preserved as-is.
            for face in frameFaces:
                if track == []:
                    # Seed a new track with the first unassigned detection.
                    track.append(face)
                    frameFaces.remove(face)
                elif face["frame"] - track[-1]["frame"] <= args.numFailedDet:
                    # Within the allowed detection gap: link if boxes overlap enough.
                    iou = bb_intersection_over_union(face["bbox"], track[-1]["bbox"])
                    if iou > iouThres:
                        track.append(face)
                        frameFaces.remove(face)
                        continue
                else:
                    # Gap too large -- this track cannot continue in this frame list.
                    break
        if track == []:
            # No unassigned detections left: all tracks formed.
            break
        elif len(track) > args.minTrack:
            frameNum = numpy.array([f["frame"] for f in track])
            bboxes = numpy.array([numpy.array(f["bbox"]) for f in track])
            # Fill detection gaps by linear interpolation of each box coordinate.
            frameI = numpy.arange(frameNum[0], frameNum[-1] + 1)
            bboxesI = []
            for ij in range(0, 4):
                interpfn = interp1d(frameNum, bboxes[:, ij])
                bboxesI.append(interpfn(frameI))
            bboxesI = numpy.stack(bboxesI, axis=1)
            # Keep the track only if the mean width or height is large enough.
            if (
                max(
                    numpy.mean(bboxesI[:, 2] - bboxesI[:, 0]),
                    numpy.mean(bboxesI[:, 3] - bboxesI[:, 1]),
                )
                > args.minFaceSize
            ):
                tracks.append({"frame": frameI, "bbox": bboxesI})
    return tracks
|
| |
|
| |
|
def crop_video(args, track, cropFile):
    """Crop one face track into a 224x224 video clip with its audio.

    Writes <cropFile>t.avi (silent face crop at 25 fps), extracts the
    matching audio span to <cropFile>.wav, then muxes both into
    <cropFile>.avi via ffmpeg and deletes the temporary silent video.
    Returns {'track': the input track, 'proc_track': the smoothed
    crop-centre/size trajectory}.
    """
    flist = glob.glob(os.path.join(args.pyframesPath, "*.jpg"))
    flist.sort()
    vOut = cv2.VideoWriter(
        cropFile + "t.avi", cv2.VideoWriter_fourcc(*"XVID"), 25, (224, 224)
    )
    dets = {"x": [], "y": [], "s": []}
    for det in track["bbox"]:
        # s: half of the larger box side; x/y: box centre.
        dets["s"].append(max((det[3] - det[1]), (det[2] - det[0])) / 2)
        dets["y"].append((det[1] + det[3]) / 2)
        dets["x"].append((det[0] + det[2]) / 2)
    # Median filter the crop trajectory to suppress per-frame detector jitter.
    dets["s"] = signal.medfilt(dets["s"], kernel_size=13)
    dets["x"] = signal.medfilt(dets["x"], kernel_size=13)
    dets["y"] = signal.medfilt(dets["y"], kernel_size=13)
    for fidx, frame in enumerate(track["frame"]):
        cs = args.cropScale
        bs = dets["s"][fidx]  # detection box half-size
        bsi = int(bs * (1 + 2 * cs))  # padding width so the enlarged crop never leaves the image
        image = cv2.imread(flist[frame])
        frame = numpy.pad(
            image,
            ((bsi, bsi), (bsi, bsi), (0, 0)),
            "constant",
            constant_values=(110, 110),  # grey padding
        )
        my = dets["y"][fidx] + bsi  # crop centre in padded coordinates
        mx = dets["x"][fidx] + bsi
        face = frame[
            int(my - bs) : int(my + bs * (1 + 2 * cs)),
            int(mx - bs * (1 + cs)) : int(mx + bs * (1 + cs)),
        ]
        vOut.write(cv2.resize(face, (224, 224)))
    audioTmp = cropFile + ".wav"
    # Frame indices -> seconds at the fixed 25 fps frame rate.
    audioStart = (track["frame"][0]) / 25
    audioEnd = (track["frame"][-1] + 1) / 25
    vOut.release()
    # Cut the matching audio span as 16 kHz mono PCM.
    command = (
        "ffmpeg -y -i %s -async 1 -ac 1 -vn -acodec pcm_s16le -ar 16000 -threads %d -ss %.3f -to %.3f %s -loglevel panic"
        % (args.audioFilePath, args.nDataLoaderThread, audioStart, audioEnd, audioTmp)
    )
    output = subprocess.call(command, shell=True, stdout=None)
    _, audio = wavfile.read(audioTmp)
    # Mux the silent crop video with the audio (stream copy, no re-encode).
    command = (
        "ffmpeg -y -i %st.avi -i %s -threads %d -c:v copy -c:a copy %s.avi -loglevel panic"
        % (cropFile, audioTmp, args.nDataLoaderThread, cropFile)
    )
    output = subprocess.call(command, shell=True, stdout=None)
    os.remove(cropFile + "t.avi")
    return {"track": track, "proc_track": dets}
|
| |
|
| |
|
def extract_MFCC(file, outPath):
    """Compute MFCC features for one wav file and save them as .npy in outPath."""
    sample_rate, waveform = wavfile.read(file)
    features = python_speech_features.mfcc(waveform, sample_rate)
    out_name = file.split("/")[-1].replace(".wav", ".npy")
    numpy.save(os.path.join(outPath, out_name), features)
|
| |
|
| |
|
def evaluate_network(files, args):
    """Score every cropped face-track clip with the pretrained TalkNet model.

    For each <name>.avi in files, loads the paired <name>.wav from
    args.pycropPath, builds MFCC audio features and 112x112 grayscale
    face frames, runs the audio-visual model over several window
    durations, and averages the per-frame scores across durations.
    Returns one score array per input file.
    """
    s = talkNet()
    s.loadParameters(args.pretrainModel)
    sys.stderr.write("Model %s loaded from previous state! \r\n" % args.pretrainModel)
    s.eval()
    allScores = []

    # NOTE(review): this is a set literal, so the duplicated entries collapse
    # to {1, 2, 3, 4, 5, 6}. If the repeats were meant to weight the short
    # durations more heavily in the average, this should be a list -- confirm.
    durationSet = {
        1,
        1,
        1,
        2,
        2,
        2,
        3,
        3,
        4,
        5,
        6,
    }
    for file in tqdm.tqdm(files, total=len(files)):
        fileName = os.path.splitext(file.split("/")[-1])[0]
        _, audio = wavfile.read(os.path.join(args.pycropPath, fileName + ".wav"))
        # winstep=0.010 gives 100 MFCC frames per second of audio.
        audioFeature = python_speech_features.mfcc(
            audio, 16000, numcep=13, winlen=0.025, winstep=0.010
        )
        video = cv2.VideoCapture(os.path.join(args.pycropPath, fileName + ".avi"))
        videoFeature = []
        while video.isOpened():
            ret, frames = video.read()
            if ret == True:
                face = cv2.cvtColor(frames, cv2.COLOR_BGR2GRAY)
                face = cv2.resize(face, (224, 224))
                # Take the central 112x112 region of the 224x224 crop.
                face = face[
                    int(112 - (112 / 2)) : int(112 + (112 / 2)),
                    int(112 - (112 / 2)) : int(112 + (112 / 2)),
                ]
                videoFeature.append(face)
            else:
                break
        video.release()
        videoFeature = numpy.array(videoFeature)
        # Common length in seconds: audio at 100 feature-frames/s (rounded to a
        # multiple of 4), video at 25 frames/s.
        length = min(
            (audioFeature.shape[0] - audioFeature.shape[0] % 4) / 100,
            videoFeature.shape[0] / 25,
        )
        audioFeature = audioFeature[: int(round(length * 100)), :]
        videoFeature = videoFeature[: int(round(length * 25)), :, :]
        allScore = []
        for duration in durationSet:
            # Split the clip into duration-second batches.
            batchSize = int(math.ceil(length / duration))
            scores = []
            with torch.no_grad():
                for i in range(batchSize):
                    # 100 audio feature-frames and 25 video frames per second.
                    inputA = (
                        torch.FloatTensor(
                            audioFeature[
                                i * duration * 100 : (i + 1) * duration * 100, :
                            ]
                        )
                        .unsqueeze(0)
                        .cuda()
                    )
                    inputV = (
                        torch.FloatTensor(
                            videoFeature[
                                i * duration * 25 : (i + 1) * duration * 25, :, :
                            ]
                        )
                        .unsqueeze(0)
                        .cuda()
                    )
                    embedA = s.model.forward_audio_frontend(inputA)
                    embedV = s.model.forward_visual_frontend(inputV)
                    embedA, embedV = s.model.forward_cross_attention(embedA, embedV)
                    out = s.model.forward_audio_visual_backend(embedA, embedV)
                    score = s.lossAV.forward(out, labels=None)
                    scores.extend(score)
            allScore.append(scores)
        # Average the per-frame scores over all window durations.
        allScore = numpy.round((numpy.mean(numpy.array(allScore), axis=0)), 1).astype(
            float
        )
        allScores.append(allScore)
    return allScores
|
| |
|
| |
|
def visualization(tracks, scores, args):
    """Draw per-face speaking scores onto the frames and render the output video.

    Writes <pyaviPath>/video_only.avi with annotated boxes, then muxes the
    original audio back in to produce <pyaviPath>/video_out.avi via ffmpeg.
    """
    flist = glob.glob(os.path.join(args.pyframesPath, "*.jpg"))
    flist.sort()
    # faces[frame] collects every tracked face visible in that frame.
    faces = [[] for i in range(len(flist))]
    for tidx, track in enumerate(tracks):
        score = scores[tidx]
        for fidx, frame in enumerate(track["track"]["frame"].tolist()):
            # Smooth the score over a window of up to 5 neighbouring frames.
            s = score[
                max(fidx - 2, 0) : min(fidx + 3, len(score) - 1)
            ]
            s = numpy.mean(s)
            faces[frame].append(
                {
                    "track": tidx,
                    "score": float(s),
                    "s": track["proc_track"]["s"][fidx],
                    "x": track["proc_track"]["x"][fidx],
                    "y": track["proc_track"]["y"][fidx],
                }
            )
    # Output video size taken from the first frame.
    firstImage = cv2.imread(flist[0])
    fw = firstImage.shape[1]
    fh = firstImage.shape[0]
    vOut = cv2.VideoWriter(
        os.path.join(args.pyaviPath, "video_only.avi"),
        cv2.VideoWriter_fourcc(*"XVID"),
        25,
        (fw, fh),
    )
    # BGR channel value: score >= 0 -> (0, 255, 0) green, otherwise (0, 0, 255) red.
    colorDict = {0: 0, 1: 255}
    for fidx, fname in tqdm.tqdm(enumerate(flist), total=len(flist)):
        image = cv2.imread(fname)
        for face in faces[fidx]:
            clr = colorDict[int((face["score"] >= 0))]
            txt = round(face["score"], 1)
            cv2.rectangle(
                image,
                (int(face["x"] - face["s"]), int(face["y"] - face["s"])),
                (int(face["x"] + face["s"]), int(face["y"] + face["s"])),
                (0, clr, 255 - clr),
                10,
            )
            cv2.putText(
                image,
                "%s" % (txt),
                (int(face["x"] - face["s"]), int(face["y"] - face["s"])),
                cv2.FONT_HERSHEY_SIMPLEX,
                1.5,
                (0, clr, 255 - clr),
                5,
            )
        vOut.write(image)
    vOut.release()
    # Mux the annotated video with the original audio (stream copy).
    command = (
        "ffmpeg -y -i %s -i %s -threads %d -c:v copy -c:a copy %s -loglevel panic"
        % (
            os.path.join(args.pyaviPath, "video_only.avi"),
            os.path.join(args.pyaviPath, "audio.wav"),
            args.nDataLoaderThread,
            os.path.join(args.pyaviPath, "video_out.avi"),
        )
    )
    output = subprocess.call(command, shell=True, stdout=None)
|
| |
|
| |
|
def evaluate_col_ASD(tracks, scores, args):
    """Evaluate predicted speaking scores against the Columbia dataset labels.

    Matches each ground-truth face box to the best-overlapping predicted
    face, thresholds the smoothed score at 0, and prints per-speaker
    accuracy / F1 plus the average F1 over five speakers ('abbas' is
    excluded from the average).
    """
    txtPath = args.videoFolder + "/col_labels/fusion/*.txt"  # ground-truth label files
    predictionSet = {}
    for name in {"long", "bell", "boll", "lieb", "sick", "abbas"}:
        predictionSet[name] = [[], []]  # [predicted labels, ground-truth labels]
    dictGT = {}
    txtFiles = glob.glob("%s" % txtPath)
    # Parse the label files into dictGT: frame index -> list of [x1, y1, x2, y2, label, speaker].
    for file in txtFiles:
        lines = open(file).read().splitlines()
        idName = file.split("/")[-1][:-4]
        for line in lines:
            data = line.split("\t")
            # Labels are indexed at 29.97 fps; convert to this pipeline's 25 fps.
            frame = int(int(data[0]) / 29.97 * 25)
            x1 = int(data[1])
            y1 = int(data[2])
            x2 = int(data[1]) + int(data[3])
            y2 = int(data[2]) + int(data[3])
            gt = int(data[4])
            if frame in dictGT:
                dictGT[frame].append([x1, y1, x2, y2, gt, idName])
            else:
                dictGT[frame] = [[x1, y1, x2, y2, gt, idName]]
    flist = glob.glob(os.path.join(args.pyframesPath, "*.jpg"))
    flist.sort()
    # faces[frame] collects every tracked face visible in that frame.
    faces = [[] for i in range(len(flist))]
    for tidx, track in enumerate(tracks):
        score = scores[tidx]
        for fidx, frame in enumerate(track["track"]["frame"].tolist()):
            # Smooth the score over a window of up to 5 neighbouring frames.
            s = numpy.mean(
                score[max(fidx - 2, 0) : min(fidx + 3, len(score) - 1)]
            )
            faces[frame].append(
                {
                    "track": tidx,
                    "score": float(s),
                    "s": track["proc_track"]["s"][fidx],
                    "x": track["proc_track"]["x"][fidx],
                    "y": track["proc_track"]["y"][fidx],
                }
            )
    for fidx, fname in tqdm.tqdm(enumerate(flist), total=len(flist)):
        if fidx in dictGT:  # only frames that carry ground-truth labels count
            for gtThisFrame in dictGT[fidx]:
                faceGT = gtThisFrame[0:4]
                labelGT = gtThisFrame[4]
                idGT = gtThisFrame[5]
                ious = []
                for face in faces[fidx]:
                    # NOTE(review): predicted boxes are halved before the IOU --
                    # presumably the labels were annotated at half the demo
                    # resolution; confirm against the dataset description.
                    faceLocation_new = [
                        int(face["x"] - face["s"]) // 2,
                        int(face["y"] - face["s"]) // 2,
                        int(face["x"] + face["s"]) // 2,
                        int(face["y"] + face["s"]) // 2,
                    ]
                    iou = bb_intersection_over_union(
                        faceLocation_new, faceGT, evalCol=True
                    )
                    if iou > 0.5:
                        ious.append([iou, round(face["score"], 2)])
                if len(ious) > 0:
                    # Take the score of the best-overlapping predicted face.
                    ious.sort()
                    labelPredict = ious[-1][1]
                else:
                    labelPredict = 0
                predictionSet[idGT][0].append(labelPredict)
                predictionSet[idGT][1].append(labelGT)
    names = ["long", "bell", "boll", "lieb", "sick", "abbas"]
    names.sort()
    F1s = 0
    for i in names:
        scores = numpy.array(predictionSet[i][0])
        labels = numpy.array(predictionSet[i][1])
        scores = numpy.int64(scores > 0)  # binarise: speaking iff score > 0
        F1 = f1_score(labels, scores)
        ACC = accuracy_score(labels, scores)
        if i != "abbas":
            F1s += F1
        print("%s, ACC:%.2f, F1:%.2f" % (i, 100 * ACC, 100 * F1))
    print("Average F1:%.2f" % (100 * (F1s / 5)))
|
| |
|
| |
|
| |
|
def main():
    """Run the full active-speaker-detection pipeline on the configured video.

    Output layout under args.savePath:
      pyavi/    -- input video (25 fps), audio, annotated output video
      pyframes/ -- all extracted video frames (jpg)
      pywork/   -- intermediate results pickled per stage
      pycrop/   -- cropped face-track clips (audio + video)
    """
    # Initialization: wipe any previous run and recreate the work directories.
    args.pyaviPath = os.path.join(args.savePath, "pyavi")
    args.pyframesPath = os.path.join(args.savePath, "pyframes")
    args.pyworkPath = os.path.join(args.savePath, "pywork")
    args.pycropPath = os.path.join(args.savePath, "pycrop")
    if os.path.exists(args.savePath):
        rmtree(args.savePath)
    os.makedirs(
        args.pyaviPath, exist_ok=True
    )  # input video, audio and the annotated output video
    os.makedirs(args.pyframesPath, exist_ok=True)  # all extracted frames
    os.makedirs(
        args.pyworkPath, exist_ok=True
    )  # pickled intermediate results
    os.makedirs(
        args.pycropPath, exist_ok=True
    )  # cropped face clips (audio + video)

    # Re-encode the input to a fixed 25 fps working copy.
    args.videoFilePath = os.path.join(args.pyaviPath, "video.avi")
    # duration == 0 means "extract the whole video".
    if args.duration == 0:
        command = (
            "ffmpeg -y -i %s -qscale:v 2 -threads %d -async 1 -r 25 %s -loglevel panic"
            % (args.videoPath, args.nDataLoaderThread, args.videoFilePath)
        )
    else:
        command = (
            "ffmpeg -y -i %s -qscale:v 2 -threads %d -ss %.3f -to %.3f -async 1 -r 25 %s -loglevel panic"
            % (
                args.videoPath,
                args.nDataLoaderThread,
                args.start,
                args.start + args.duration,
                args.videoFilePath,
            )
        )
    subprocess.call(command, shell=True, stdout=None)
    sys.stderr.write(
        time.strftime("%Y-%m-%d %H:%M:%S")
        + " Extract the video and save in %s \r\n" % (args.videoFilePath)
    )

    # Extract the audio track as 16 kHz mono wav.
    args.audioFilePath = os.path.join(args.pyaviPath, "audio.wav")
    command = (
        "ffmpeg -y -i %s -qscale:a 0 -ac 1 -vn -threads %d -ar 16000 %s -loglevel panic"
        % (args.videoFilePath, args.nDataLoaderThread, args.audioFilePath)
    )
    subprocess.call(command, shell=True, stdout=None)
    sys.stderr.write(
        time.strftime("%Y-%m-%d %H:%M:%S")
        + " Extract the audio and save in %s \r\n" % (args.audioFilePath)
    )

    # Dump every frame as a numbered jpg for the detector/tracker stages.
    command = "ffmpeg -y -i %s -qscale:v 2 -threads %d -f image2 %s -loglevel panic" % (
        args.videoFilePath,
        args.nDataLoaderThread,
        os.path.join(args.pyframesPath, "%06d.jpg"),
    )
    subprocess.call(command, shell=True, stdout=None)
    sys.stderr.write(
        time.strftime("%Y-%m-%d %H:%M:%S")
        + " Extract the frames and save in %s \r\n" % (args.pyframesPath)
    )

    # Scene (shot) detection.
    scene = scene_detect(args)
    sys.stderr.write(
        time.strftime("%Y-%m-%d %H:%M:%S")
        + " Scene detection and save in %s \r\n" % (args.pyworkPath)
    )

    # Per-frame face detection.
    faces = inference_video(args)
    sys.stderr.write(
        time.strftime("%Y-%m-%d %H:%M:%S")
        + " Face detection and save in %s \r\n" % (args.pyworkPath)
    )

    # Face tracking per scene; scenes shorter than minTrack frames are skipped.
    allTracks, vidTracks = [], []
    for shot in scene:
        if (
            shot[1].frame_num - shot[0].frame_num >= args.minTrack
        ):
            allTracks.extend(
                track_shot(args, faces[shot[0].frame_num : shot[1].frame_num])
            )
    sys.stderr.write(
        time.strftime("%Y-%m-%d %H:%M:%S")
        + " Face track and detected %d tracks \r\n" % len(allTracks)
    )

    # Crop each track into its own audio+video clip and pickle the metadata.
    for ii, track in tqdm.tqdm(enumerate(allTracks), total=len(allTracks)):
        vidTracks.append(
            crop_video(args, track, os.path.join(args.pycropPath, "%05d" % ii))
        )
    savePath = os.path.join(args.pyworkPath, "tracks.pckl")
    with open(savePath, "wb") as fil:
        pickle.dump(vidTracks, fil)
    sys.stderr.write(
        time.strftime("%Y-%m-%d %H:%M:%S")
        + " Face Crop and saved in %s tracks \r\n" % args.pycropPath
    )
    fil = open(savePath, "rb")
    vidTracks = pickle.load(fil)

    # Active speaker detection: score every cropped clip with TalkNet.
    files = glob.glob("%s/*.avi" % args.pycropPath)
    files.sort()
    scores = evaluate_network(files, args)
    savePath = os.path.join(args.pyworkPath, "scores.pckl")
    with open(savePath, "wb") as fil:
        pickle.dump(scores, fil)
    sys.stderr.write(
        time.strftime("%Y-%m-%d %H:%M:%S")
        + " Scores extracted and saved in %s \r\n" % args.pyworkPath
    )

    # Either evaluate against Columbia labels, or render the annotated video.
    if args.evalCol == True:
        evaluate_col_ASD(
            vidTracks, scores, args
        )
        quit()
    else:

        visualization(vidTracks, scores, args)
|
| |
|
| |
|
# Run the full pipeline only when executed as a script (not on import).
if __name__ == "__main__":
    main()
|
| |
|