shopsmart

Sleeping

App Files Files Community

Spanicin committed on Aug 20, 2024

Commit

d8431dd

verified ·

1 Parent(s): 5b1ae50

Upload 6 files

Browse files

Files changed (6) hide show

videoretalking/utils/alignment_stit.py +211 -0
videoretalking/utils/audio.py +136 -0
videoretalking/utils/ffhq_preprocess.py +140 -0
videoretalking/utils/flow_util.py +56 -0
videoretalking/utils/hparams.py +137 -0
videoretalking/utils/inference_utils.py +254 -0

videoretalking/utils/alignment_stit.py ADDED Viewed

	@@ -0,0 +1,211 @@

+import PIL
+import PIL.Image
+import dlib
+import face_alignment
+import numpy as np
+import scipy
+import scipy.ndimage
+import skimage.io as io
+import torch
+from PIL import Image
+from scipy.ndimage import gaussian_filter1d
+from tqdm import tqdm
+# from configs import paths_config
def paste_image(inverse_transform, img, orig_image):
    """Warp ``img`` onto the frame of ``orig_image`` and composite the two.

    ``img`` is converted to RGBA and perspective-transformed to the size of
    ``orig_image`` with the coefficients in ``inverse_transform``; the warped
    image is then pasted at the origin of an RGBA copy of ``orig_image``,
    using the warped image itself as the paste mask.

    :return: the composited RGBA image (``orig_image`` is not modified)
    """
    target_size = orig_image.size
    warped = img.convert('RGBA').transform(
        target_size, Image.PERSPECTIVE, inverse_transform, Image.BILINEAR)
    canvas = orig_image.copy().convert('RGBA')
    canvas.paste(warped, (0, 0), mask=warped)
    return canvas
def get_landmark(filepath, predictor, detector=None, fa=None):
    """Return the landmarks of the first face found in an image.

    :param filepath: image path; the dlib branch also accepts a PIL image
    :param predictor: dlib shape predictor (unused when ``fa`` is given)
    :param detector: dlib face detector; a frontal face detector is created
        when omitted
    :param fa: optional face_alignment object; when given it replaces dlib
    :return: landmarks of the first detected face (np.array shape=(68, 2) in
        the dlib branch), or None when no face is detected
    """
    if fa is not None:
        image = io.imread(filepath)
        lms, _, _ = fa.get_landmarks(image, return_bboxes=True)
        # "No face" may come back as None instead of an empty list
        # (NOTE(review): depends on the installed face_alignment version).
        if lms is None or len(lms) == 0:
            return None
        return lms[0]

    if detector is None:
        detector = dlib.get_frontal_face_detector()
    if isinstance(filepath, PIL.Image.Image):
        img = np.array(filepath)
    else:
        img = dlib.load_rgb_image(filepath)

    # Only the first detection is used.
    first_det = next(iter(detector(img)), None)
    if first_det is None:
        return None
    shape = predictor(img, first_det)
    return np.array([[part.x, part.y] for part in shape.parts()])
def align_face(filepath_or_image, predictor, output_size, detector=None,
               enable_padding=False, scale=1.0):
    """Detect a face in an image and return an aligned square crop of it.

    ``compute_transform`` expects precomputed landmarks, so they are detected
    here first instead of handing it the image itself.

    :param filepath_or_image: str path or PIL image
    :param predictor: dlib shape predictor used for landmark detection
    :param output_size: side length of the returned square crop
    :param detector: optional dlib face detector
    :param enable_padding: forwarded to ``crop_image``
    :param scale: multiplier applied to the crop rectangle
    :return: PIL Image
    :raises ValueError: if no face is detected
    """
    lm = get_landmark(filepath_or_image, predictor, detector)
    if lm is None:
        raise ValueError(f'Did not detect any faces in image: {filepath_or_image}')
    c, x, y = compute_transform(lm, predictor, detector=detector, scale=scale)
    quad = np.stack([c - x - y, c - x + y, c + x + y, c + x - y])
    return crop_image(filepath_or_image, output_size, quad, enable_padding=enable_padding)
def crop_image(filepath, output_size, quad, enable_padding=False):
    """Cut the quadrilateral ``quad`` out of an image as a square crop.

    :param filepath: str path or PIL image
    :param output_size: side length of the returned square image
    :param quad: 4x2 array of corner points; it is copied, the caller's array
        is never modified
    :param enable_padding: reflect-pad and blur the borders when the quad
        reaches outside the image
    :return: PIL Image of size (output_size, output_size)
    """
    # Private float copy: the steps below rescale and shift the quad in place.
    quad = np.array(quad, dtype=float)
    x = (quad[3] - quad[1]) / 2
    qsize = np.hypot(*x) * 2

    # read image
    if isinstance(filepath, PIL.Image.Image):
        img = filepath
    else:
        img = PIL.Image.open(filepath)

    # Shrink.
    shrink = int(np.floor(qsize / output_size * 0.5))
    if shrink > 1:
        rsize = (int(np.rint(float(img.size[0]) / shrink)), int(np.rint(float(img.size[1]) / shrink)))
        # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
        img = img.resize(rsize, PIL.Image.LANCZOS)
        quad /= shrink
        qsize /= shrink

    # Crop.
    border = max(int(np.rint(qsize * 0.1)), 3)
    crop = (int(np.floor(min(quad[:, 0]))), int(np.floor(min(quad[:, 1]))), int(np.ceil(max(quad[:, 0]))),
            int(np.ceil(max(quad[:, 1]))))
    crop = (max(crop[0] - border, 0), max(crop[1] - border, 0), min(crop[2] + border, img.size[0]),
            min(crop[3] + border, img.size[1]))
    if crop[2] - crop[0] < img.size[0] or crop[3] - crop[1] < img.size[1]:
        img = img.crop(crop)
        quad -= crop[0:2]

    # Pad.
    pad = (int(np.floor(min(quad[:, 0]))), int(np.floor(min(quad[:, 1]))), int(np.ceil(max(quad[:, 0]))),
           int(np.ceil(max(quad[:, 1]))))
    pad = (max(-pad[0] + border, 0), max(-pad[1] + border, 0), max(pad[2] - img.size[0] + border, 0),
           max(pad[3] - img.size[1] + border, 0))
    if enable_padding and max(pad) > border - 4:
        pad = np.maximum(pad, int(np.rint(qsize * 0.3)))
        img = np.pad(np.float32(img), ((pad[1], pad[3]), (pad[0], pad[2]), (0, 0)), 'reflect')
        h, w, _ = img.shape
        y, x, _ = np.ogrid[:h, :w, :1]
        # Mask is 1 at the outer edge of the padding and falls off towards the interior.
        mask = np.maximum(1.0 - np.minimum(np.float32(x) / pad[0], np.float32(w - 1 - x) / pad[2]),
                          1.0 - np.minimum(np.float32(y) / pad[1], np.float32(h - 1 - y) / pad[3]))
        blur = qsize * 0.02
        img += (scipy.ndimage.gaussian_filter(img, [blur, blur, 0]) - img) * np.clip(mask * 3.0 + 1.0, 0.0, 1.0)
        img += (np.median(img, axis=(0, 1)) - img) * np.clip(mask, 0.0, 1.0)
        img = PIL.Image.fromarray(np.uint8(np.clip(np.rint(img), 0, 255)), 'RGB')
        quad += pad[:2]

    # Transform. The crop is produced directly at output_size, so no final resize is needed.
    return img.transform((output_size, output_size), PIL.Image.QUAD, (quad + 0.5).flatten(),
                         PIL.Image.BILINEAR)
def compute_transform(lm, predictor, detector=None, scale=1.0, fa=None):
    """Compute the oriented crop rectangle for one set of 68 face landmarks.

    :param lm: landmarks, array-like of shape (68, 2)
    :param predictor: unused; kept for call compatibility
    :param detector: unused; kept for call compatibility
    :param scale: multiplier applied to the rectangle's half-axes
    :param fa: unused; kept for call compatibility
    :return: ``(c, x, y)`` - rectangle centre and its two half-axis vectors;
        the corners are ``c - x - y, c - x + y, c + x + y, c + x - y``
    :raises ValueError: if ``lm`` is None
    """
    if lm is None:
        raise ValueError('No facial landmarks given; was a face detected?')
    lm = np.asarray(lm, dtype=float)

    lm_eye_left = lm[36:42]
    lm_eye_right = lm[42:48]
    lm_mouth_outer = lm[48:60]

    # Calculate auxiliary vectors.
    eye_left = np.mean(lm_eye_left, axis=0)
    eye_right = np.mean(lm_eye_right, axis=0)
    eye_avg = (eye_left + eye_right) * 0.5
    eye_to_eye = eye_right - eye_left
    mouth_avg = (lm_mouth_outer[0] + lm_mouth_outer[6]) * 0.5
    eye_to_mouth = mouth_avg - eye_avg

    # Choose oriented crop rectangle.
    x = eye_to_eye - np.flipud(eye_to_mouth) * [-1, 1]
    x /= np.hypot(*x)
    x *= max(np.hypot(*eye_to_eye) * 2.0, np.hypot(*eye_to_mouth) * 1.8)
    x *= scale
    y = np.flipud(x) * [-1, 1]
    c = eye_avg + eye_to_mouth * 0.1
    return c, x, y
def crop_faces(IMAGE_SIZE, files, scale, center_sigma=0.0, xy_sigma=0.0, use_fa=False, fa=None):
    """Crop an aligned face from every frame, optionally smoothing over time.

    :param IMAGE_SIZE: side length of each square crop
    :param files: re-iterable sequence of ``(landmarks, image)`` pairs
    :param scale: multiplier applied to every crop rectangle
    :param center_sigma: Gaussian sigma used to smooth rectangle centres
        across frames (0 disables smoothing)
    :param xy_sigma: Gaussian sigma used to smooth rectangle axes across
        frames (0 disables smoothing)
    :param use_fa: when true and ``fa`` is None, a face_alignment model is
        constructed
    :param fa: optional face_alignment object
    :return: ``(crops, orig_images, quads)``; three empty lists when
        ``files`` is empty
    """
    if use_fa:
        if fa is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
            fa = face_alignment.FaceAlignment(face_alignment.LandmarksType.TWO_D, flip_input=True, device=device)
    else:
        fa = None
    # No dlib objects are built here: the landmarks arrive precomputed in `files`.
    predictor = None
    detector = None

    cs, xs, ys = [], [], []
    for lm, _ in tqdm(files):
        c, x, y = compute_transform(lm, predictor, detector=detector,
                                    scale=scale, fa=fa)
        cs.append(c)
        xs.append(x)
        ys.append(y)
    if not cs:
        # np.stack cannot stack an empty list.
        return [], [], []

    cs = np.stack(cs)
    xs = np.stack(xs)
    ys = np.stack(ys)
    if center_sigma != 0:
        cs = gaussian_filter1d(cs, sigma=center_sigma, axis=0)
    if xy_sigma != 0:
        xs = gaussian_filter1d(xs, sigma=xy_sigma, axis=0)
        ys = gaussian_filter1d(ys, sigma=xy_sigma, axis=0)

    quads = list(np.stack([cs - xs - ys, cs - xs + ys, cs + xs + ys, cs + xs - ys], axis=1))
    crops, orig_images = crop_faces_by_quads(IMAGE_SIZE, files, quads)
    return crops, orig_images, quads
def crop_faces_by_quads(IMAGE_SIZE, files, quads):
    """Crop each frame in ``files`` with its matching quad.

    :param IMAGE_SIZE: side length of each square crop
    :param files: sequence of ``(landmarks, image)`` pairs; only the image is used
    :param quads: one 4x2 corner array per frame
    :return: ``(crops, orig_images)`` where ``orig_images`` holds the input
        images unchanged
    """
    crops, orig_images = [], []
    paired = zip(quads, files)
    for quad, (_, image) in tqdm(paired, total=len(quads)):
        # crop_image receives a copy so the quad in `quads` is never altered
        crops.append(crop_image(image, IMAGE_SIZE, quad.copy()))
        orig_images.append(image)
    return crops, orig_images
def calc_alignment_coefficients(pa, pb):
    """Solve for the 8 perspective coefficients that map ``pa`` onto ``pb``.

    For coefficients ``(a, b, c, d, e, f, g, h)`` and each pair
    ``(x, y) -> (u, v)``::

        u = (a*x + b*y + c) / (g*x + h*y + 1)
        v = (d*x + e*y + f) / (g*x + h*y + 1)

    :param pa: four source points ``(x, y)``
    :param pb: four destination points ``(u, v)``
    :return: np.array of shape (8,)
    :raises numpy.linalg.LinAlgError: if the system is singular
    """
    rows = []
    for p1, p2 in zip(pa, pb):
        rows.append([p1[0], p1[1], 1, 0, 0, 0, -p2[0] * p1[0], -p2[0] * p1[1]])
        rows.append([0, 0, 0, p1[0], p1[1], 1, -p2[1] * p1[0], -p2[1] * p1[1]])
    a = np.array(rows, dtype=float)
    b = np.array(pb, dtype=float).reshape(8)
    # Four point pairs give a square 8x8 system; solve it directly rather than
    # inverting the normal equations through the deprecated np.matrix type.
    return np.linalg.solve(a, b)

videoretalking/utils/audio.py ADDED Viewed

	@@ -0,0 +1,136 @@

+import librosa
+import librosa.filters
+import numpy as np
+# import tensorflow as tf
+from scipy import signal
+from scipy.io import wavfile
+from .hparams import hparams as hp
def load_wav(path, sr):
    """Load the audio at ``path`` through librosa with sampling rate ``sr``.

    Only the first element of librosa's return value (the samples) is returned.
    """
    loaded = librosa.core.load(path, sr=sr)
    samples = loaded[0]
    return samples
def save_wav(wav, path, sr):
    """Peak-normalise ``wav`` to the int16 range and write it as a WAV file.

    The input array is left untouched; scaling happens on a copy.

    :param wav: audio samples (array-like)
    :param path: output filename or open file object
    :param sr: sampling rate in Hz
    """
    wav = np.asarray(wav)
    # The 0.01 floor keeps (near-)silent audio from being amplified without bound.
    peak = max(0.01, np.max(np.abs(wav))) if wav.size else 0.01
    scaled = wav * (32767 / peak)
    # proposed by @dsmiller
    wavfile.write(path, sr, scaled.astype(np.int16))
def save_wavenet_wav(wav, path, sr):
    """Write ``wav`` to ``path`` as a WAV file without rescaling.

    ``librosa.output.write_wav`` no longer exists in current librosa, so the
    samples are written with ``scipy.io.wavfile`` directly; the sample dtype
    decides the on-disk format.

    :param wav: audio samples
    :param path: output filename or open file object
    :param sr: sampling rate in Hz
    """
    samples = np.asarray(wav)
    # wavfile expects (n_samples, n_channels); accept channel-first stereo too.
    if samples.ndim > 1 and samples.shape[0] == 2:
        samples = samples.T
    wavfile.write(path, sr, samples)
def preemphasis(wav, k, preemphasize=True):
    """Apply the filter ``y[n] = x[n] - k * x[n-1]`` to ``wav``.

    When ``preemphasize`` is false the input is returned as-is.
    """
    if not preemphasize:
        return wav
    numerator = [1, -k]
    denominator = [1]
    return signal.lfilter(numerator, denominator, wav)
def inv_preemphasis(wav, k, inv_preemphasize=True):
    """Undo pre-emphasis with the filter ``y[n] = x[n] + k * y[n-1]``.

    When ``inv_preemphasize`` is false the input is returned as-is.
    """
    if not inv_preemphasize:
        return wav
    numerator = [1]
    denominator = [1, -k]
    return signal.lfilter(numerator, denominator, wav)
def get_hop_size():
    """Return the STFT hop size in samples.

    ``hp.hop_size`` wins when set; otherwise the hop is derived from
    ``hp.frame_shift_ms`` and ``hp.sample_rate``.
    """
    configured = hp.hop_size
    if configured is not None:
        return configured
    assert hp.frame_shift_ms is not None
    return int(hp.frame_shift_ms / 1000 * hp.sample_rate)
def linearspectrogram(wav):
    """Return the dB-scaled linear spectrogram of ``wav``.

    The signal is pre-emphasised, transformed with ``_stft``, converted to dB
    relative to ``hp.ref_level_db`` and, if ``hp.signal_normalization`` is set,
    normalised.
    """
    emphasized = preemphasis(wav, hp.preemphasis, hp.preemphasize)
    magnitudes = np.abs(_stft(emphasized))
    spec_db = _amp_to_db(magnitudes) - hp.ref_level_db
    return _normalize(spec_db) if hp.signal_normalization else spec_db
def melspectrogram(wav):
    """Return the dB-scaled mel spectrogram of ``wav``.

    Same pipeline as ``linearspectrogram`` with the magnitudes projected onto
    the mel basis before the dB conversion.
    """
    emphasized = preemphasis(wav, hp.preemphasis, hp.preemphasize)
    mel_magnitudes = _linear_to_mel(np.abs(_stft(emphasized)))
    spec_db = _amp_to_db(mel_magnitudes) - hp.ref_level_db
    return _normalize(spec_db) if hp.signal_normalization else spec_db
def _lws_processor():
    """Build an ``lws`` processor from the configured FFT, hop and window sizes.

    ``lws`` is imported lazily so the module is only required when used.
    """
    import lws
    hop = get_hop_size()
    return lws.lws(hp.n_fft, hop, fftsize=hp.win_size, mode="speech")
def _stft(y):
    """Short-time Fourier transform of ``y``.

    Uses the ``lws`` processor when ``hp.use_lws`` is set (result transposed),
    librosa otherwise.
    """
    if hp.use_lws:
        # _lws_processor takes no arguments; it reads its settings from hp itself.
        return _lws_processor().stft(y).T
    return librosa.stft(y=y, n_fft=hp.n_fft, hop_length=get_hop_size(), win_length=hp.win_size)
+##########################################################
+#Those are only correct when using lws!!! (This was messing with Wavenet quality for a long time!)
def num_frames(length, fsize, fshift):
    """Compute the number of spectrogram time frames.

    :param length: number of samples
    :param fsize: frame (window) size in samples
    :param fshift: frame shift (hop) in samples
    """
    pad = fsize - fshift
    full_steps = (length + pad * 2 - fsize) // fshift
    # One extra frame is counted when the length is not a multiple of the shift.
    extra = 1 if length % fshift == 0 else 2
    return full_steps + extra
def pad_lr(x, fsize, fshift):
    """Compute the left and right padding for ``x``.

    :return: ``(left, right)`` padding in samples
    """
    n_samples = len(x)
    frames = num_frames(n_samples, fsize, fshift)
    left = fsize - fshift
    padded_len = n_samples + 2 * left
    # Samples still missing to fill the last frame completely.
    shortfall = (frames - 1) * fshift + fsize - padded_len
    return left, left + shortfall
+##########################################################
+#Librosa correct padding
def librosa_pad_lr(x, fsize, fshift):
    """Return ``(0, right)`` where ``right`` pads ``x`` up to the next multiple of ``fshift``.

    A full ``fshift`` is added when the length is already a multiple.
    ``fsize`` is not used.
    """
    n_samples = x.shape[0]
    target_len = (n_samples // fshift + 1) * fshift
    return 0, target_len - n_samples
+# Conversions
# Mel filterbank, built on first use and then reused.
_mel_basis = None


def _linear_to_mel(spectogram):
    """Project a linear-frequency spectrogram onto the mel filterbank."""
    global _mel_basis
    basis = _mel_basis
    if basis is None:
        basis = _mel_basis = _build_mel_basis()
    return np.dot(basis, spectogram)
def _build_mel_basis():
    """Build the mel filterbank from the configured sample rate, FFT size and band limits."""
    assert hp.fmax <= hp.sample_rate // 2
    # sr / n_fft are passed by keyword: recent librosa releases reject them
    # positionally, while keywords work on old and new versions alike.
    return librosa.filters.mel(sr=hp.sample_rate, n_fft=hp.n_fft, n_mels=hp.num_mels,
                               fmin=hp.fmin, fmax=hp.fmax)
def _amp_to_db(x):
    """Convert amplitudes to decibels, flooring them at ``hp.min_level_db``."""
    # Amplitude that corresponds to hp.min_level_db.
    floor_amp = np.exp(hp.min_level_db / 20 * np.log(10))
    floored = np.maximum(floor_amp, x)
    return 20 * np.log10(floored)
def _db_to_amp(x):
    """Convert decibels back to amplitude: ``10 ** (x * 0.05)``."""
    exponent = x * 0.05
    return np.power(10.0, exponent)
def _normalize(S):
    """Rescale a dB spectrogram from ``[hp.min_level_db, 0]``.

    The target range is ``[-max_abs_value, max_abs_value]`` when
    ``hp.symmetric_mels`` is set and ``[0, max_abs_value]`` otherwise. With
    ``hp.allow_clipping_in_normalization`` the result is clipped to that
    range; without it the input is asserted to lie in range already.
    """
    floor_db = hp.min_level_db
    peak = hp.max_abs_value
    clipping = hp.allow_clipping_in_normalization

    if not clipping:
        assert S.max() <= 0 and S.min() - floor_db >= 0

    if hp.symmetric_mels:
        scaled = (2 * peak) * ((S - floor_db) / (-floor_db)) - peak
        lower = -peak
    else:
        scaled = peak * ((S - floor_db) / (-floor_db))
        lower = 0

    if clipping:
        return np.clip(scaled, lower, peak)
    return scaled
def _denormalize(D):
    """Invert ``_normalize``: map normalised values back to decibels.

    With ``hp.allow_clipping_in_normalization`` the input is first clipped to
    the normalised range.
    """
    floor_db = hp.min_level_db
    peak = hp.max_abs_value
    clipping = hp.allow_clipping_in_normalization

    if hp.symmetric_mels:
        if clipping:
            D = np.clip(D, -peak, peak)
        return (((D + peak) * -floor_db / (2 * peak)) + floor_db)

    if clipping:
        D = np.clip(D, 0, peak)
    return ((D * -floor_db / peak) + floor_db)

videoretalking/utils/ffhq_preprocess.py ADDED Viewed

	@@ -0,0 +1,140 @@

+import os
+import cv2
+import time
+import glob
+import argparse
+import scipy
+import numpy as np
+from PIL import Image
+from tqdm import tqdm
+from itertools import cycle
+from torch.multiprocessing import Pool, Process, set_start_method
+"""
+brief: face alignment with FFHQ method (https://github.com/NVlabs/ffhq-dataset)
+author: lzhbrian (https://lzhbrian.me)
+date: 2020.1.5
+note: code is heavily borrowed from
+    https://github.com/NVlabs/ffhq-dataset
+    http://dlib.net/face_landmark_detection.py.html
+requirements:
+    apt install cmake
+    conda install Pillow numpy scipy
+    pip install dlib
+    # download face landmark model from:
+    # http://dlib.net/files/shape_predictor_68_face_landmarks.dat.bz2
+"""
+import numpy as np
+from PIL import Image
+import dlib
class Croper:
    """FFHQ-style face cropper based on dlib's 68-point landmark predictor."""

    def __init__(self, path_of_lm):
        """
        :param path_of_lm: path to the dlib shape predictor model
        """
        # download model from: http://dlib.net/files/shape_predictor_68_face_landmarks.dat.bz2
        self.predictor = dlib.shape_predictor(path_of_lm)

    def get_landmark(self, img_np):
        """get landmark with dlib

        :param img_np: image as a numpy array
        :return: np.array shape=(68, 2) for the first detected face, or None
            when no face is detected
        """
        # Build the face detector once and reuse it for every frame.
        detector = getattr(self, '_detector', None)
        if detector is None:
            detector = self._detector = dlib.get_frontal_face_detector()

        dets = detector(img_np, 1)
        if len(dets) == 0:
            return None
        # Get the landmarks/parts for the first detected face.
        shape = self.predictor(img_np, dets[0])
        return np.array([[part.x, part.y] for part in shape.parts()])

    def align_face(self, img, lm, output_size=1024):
        """Compute the FFHQ crop window for one set of landmarks.

        No pixels are changed here; only ``img.size`` (and ``img.resize`` when
        the face is much larger than ``output_size``) is used.

        :param img: PIL Image
        :param lm: landmarks, np.array shape=(68, 2)
        :param output_size: target size used to decide whether to shrink
        :return: ``(crop, [lx, ly, rx, ry])`` - the outer crop box
            ``(left, top, right, bottom)`` and the bounds of the face quad
            inside it
        """
        lm_eye_left = lm[36: 42]  # left-clockwise
        lm_eye_right = lm[42: 48]  # left-clockwise
        lm_mouth_outer = lm[48: 60]  # left-clockwise

        # Calculate auxiliary vectors.
        eye_left = np.mean(lm_eye_left, axis=0)
        eye_right = np.mean(lm_eye_right, axis=0)
        eye_avg = (eye_left + eye_right) * 0.5
        eye_to_eye = eye_right - eye_left
        mouth_left = lm_mouth_outer[0]
        mouth_right = lm_mouth_outer[6]
        mouth_avg = (mouth_left + mouth_right) * 0.5
        eye_to_mouth = mouth_avg - eye_avg

        # Choose oriented crop rectangle.
        x = eye_to_eye - np.flipud(eye_to_mouth) * [-1, 1]
        x /= np.hypot(*x)
        x *= max(np.hypot(*eye_to_eye) * 2.0, np.hypot(*eye_to_mouth) * 1.8)
        y = np.flipud(x) * [-1, 1]
        c = eye_avg + eye_to_mouth * 0.1
        quad = np.stack([c - x - y, c - x + y, c + x + y, c + x - y])
        qsize = np.hypot(*x) * 2

        # Shrink.
        # NOTE(review): after shrinking, the returned boxes are in the shrunken
        # image's coordinates, yet crop() applies them to full-size frames.
        shrink = int(np.floor(qsize / output_size * 0.5))
        if shrink > 1:
            rsize = (int(np.rint(float(img.size[0]) / shrink)), int(np.rint(float(img.size[1]) / shrink)))
            # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
            img = img.resize(rsize, Image.LANCZOS)
            quad /= shrink
            qsize /= shrink

        # Crop.
        border = max(int(np.rint(qsize * 0.1)), 3)
        crop = (int(np.floor(min(quad[:, 0]))), int(np.floor(min(quad[:, 1]))), int(np.ceil(max(quad[:, 0]))),
                int(np.ceil(max(quad[:, 1]))))
        crop = (max(crop[0] - border, 0), max(crop[1] - border, 0), min(crop[2] + border, img.size[0]),
                min(crop[3] + border, img.size[1]))
        if crop[2] - crop[0] < img.size[0] or crop[3] - crop[1] < img.size[1]:
            quad -= crop[0:2]

        # Bounds of the quad, clamped to the image.
        quad = (quad + 0.5).flatten()
        lx = max(min(quad[0], quad[2]), 0)
        ly = max(min(quad[1], quad[7]), 0)
        rx = min(max(quad[4], quad[6]), img.size[0])
        # The vertical bound is limited by the image height, not its width.
        ry = min(max(quad[3], quad[5]), img.size[1])
        return crop, [lx, ly, rx, ry]

    def crop(self, img_np_list, xsize=512):    # first frame for all video
        """Crop every frame with the window found on the first detectable frame.

        Frames in the first half of the list (always at least the first frame)
        are probed until a face is found.

        :param img_np_list: list of frames as numpy arrays; modified in place
        :param xsize: forwarded to ``align_face`` as ``output_size``
        :return: ``(img_np_list, crop, quad)``, or None when no face is found
        """
        lm = None
        img_np = None
        n_frames = len(img_np_list)
        n_probe = min(n_frames, max(1, n_frames // 2))   # TODO
        for idx in range(n_probe):
            img_np = img_np_list[idx]
            lm = self.get_landmark(img_np)
            if lm is not None:
                break   # can detect face
        if lm is None:
            return None

        crop, quad = self.align_face(img=Image.fromarray(img_np), lm=lm, output_size=xsize)
        clx, cly, crx, cry = crop
        lx, ly, rx, ry = (int(v) for v in quad)
        for _i in range(n_frames):
            _inp = img_np_list[_i]
            _inp = _inp[cly:cry, clx:crx]
            _inp = _inp[ly:ry, lx:rx]
            img_np_list[_i] = _inp
        return img_np_list, crop, quad

videoretalking/utils/flow_util.py ADDED Viewed

	@@ -0,0 +1,56 @@

+import torch
def convert_flow_to_deformation(flow):
    r"""Turn a flow field into a sampling grid (deformation).

    Args:
        flow (tensor): Flow field obtained by the model, indexed as
            (batch, channel, height, width) with channel 0 scaled by the
            width and the remaining channel(s) by the height
    Returns:
        deformation (tensor): The deformation used for warping, channel-last
    """
    _, _, height, width = flow.shape
    flow_x = flow[:, :1, ...] / (width - 1)
    flow_y = flow[:, 1:, ...] / (height - 1)
    # Factor 2 brings the offsets to the [-1, 1] scale of the coordinate grid.
    flow_norm = 2 * torch.cat([flow_x, flow_y], 1)
    base_grid = make_coordinate_grid(flow)
    return base_grid + flow_norm.permute(0, 2, 3, 1)
def make_coordinate_grid(flow):
    r"""Build a normalised coordinate grid matching the size of the flow field.

    Args:
        flow (tensor): Flow field obtained by the model, (batch, channel, height, width)
    Returns:
        grid (tensor): (batch, height, width, 2) grid whose last axis holds
            (x, y) coordinates running from -1 to 1, with the dtype and
            device of ``flow``
    """
    batch, _, height, width = flow.shape
    xs = torch.arange(width).to(flow)
    ys = torch.arange(height).to(flow)
    xs = 2 * (xs / (width - 1)) - 1
    ys = 2 * (ys / (height - 1)) - 1

    grid_x = xs.unsqueeze(0).repeat(height, 1)
    grid_y = ys.unsqueeze(1).repeat(1, width)
    grid = torch.stack((grid_x, grid_y), dim=2)
    # Share the same grid across the batch without copying it.
    return grid.expand(batch, -1, -1, -1)
def warp_image(source_image, deformation):
    r"""Warp the input image according to the deformation.

    Args:
        source_image (tensor): source images to be warped, (batch, channel, height, width)
        deformation (tensor): deformations used to warp the images,
            (batch, height, width, 2); value in range (-1, 1)
    Returns:
        output (tensor): the warped images
    """
    functional = torch.nn.functional
    grid_size = tuple(deformation.shape[1:3])
    image_size = tuple(source_image.shape[2:4])
    if grid_size != image_size:
        # Resample the grid to the image resolution (interpolate wants channel-first).
        channel_first = deformation.permute(0, 3, 1, 2)
        resized = functional.interpolate(channel_first, size=image_size, mode='bilinear')
        deformation = resized.permute(0, 2, 3, 1)
    return functional.grid_sample(source_image, deformation)

videoretalking/utils/hparams.py ADDED Viewed

	@@ -0,0 +1,137 @@

+import os
class HParams:
    """Minimal hyperparameter container with attribute-style read access.

    Settings live in the plain dict ``data``; ``hp.name`` reads ``data['name']``.
    """

    def __init__(self, **kwargs):
        self.data = dict(kwargs)

    def __getattr__(self, key):
        # Only called when normal lookup fails. `data` itself may be missing
        # while copy/pickle rebuild an instance without running __init__;
        # reading it through the instance dict avoids infinite recursion then.
        data = self.__dict__.get('data')
        if data is None or key not in data:
            raise AttributeError("'HParams' object has no attribute %s" % key)
        return data[key]

    def set_hparam(self, key, value):
        """Set or overwrite the hyperparameter ``key``."""
        self.data[key] = value

    def values(self):
        """Return a copy of all hyperparameters as a dict."""
        return dict(self.data)
# Default hyperparameters
_HPARAM_DEFAULTS = {
    # Number of mel-spectrogram channels and local conditioning dimensionality
    'num_mels': 80,

    # --- network ---
    'rescale': True,  # Whether to rescale audio prior to preprocessing
    'rescaling_max': 0.9,  # Rescaling value

    # Use LWS (https://github.com/Jonathan-LeRoux/lws) for STFT and phase reconstruction.
    # It's preferred to set True to use with https://github.com/r9y9/wavenet_vocoder
    # Does not work if n_fft is not a multiple of hop_size!!
    'use_lws': False,

    'n_fft': 800,  # Extra window size is filled with 0 paddings to match this parameter
    'hop_size': 200,  # For 16000Hz, 200 = 12.5 ms (0.0125 * sample_rate)
    'win_size': 800,  # For 16000Hz, 800 = 50 ms (If None, win_size = n_fft) (0.05 * sample_rate)
    'sample_rate': 16000,  # 16000Hz (corresponding to librispeech) (sox --i <filename>)
    'frame_shift_ms': None,  # Can replace hop_size parameter. (Recommended: 12.5)

    # --- mel and linear spectrogram normalization / scaling / clipping ---
    # Whether to normalize spectrograms to a predefined range (set by the parameters below)
    'signal_normalization': True,
    'allow_clipping_in_normalization': True,  # Only relevant if signal_normalization = True
    # Whether to scale the data to be symmetric around 0. (Also multiplies the output
    # range by 2, faster and cleaner convergence)
    'symmetric_mels': True,
    # Max absolute value of data. If symmetric, data will be [-max, max] else [0, max]
    # (must not be too big to avoid gradient explosion, not too small for fast convergence)
    'max_abs_value': 4.,

    # Contribution by @begeekmyfriend
    # Spectrogram pre-emphasis (lfilter: reduces spectrogram noise and helps model
    # certitude levels. Also allows for better G&L phase reconstruction)
    'preemphasize': True,  # whether to apply filter
    'preemphasis': 0.97,  # filter coefficient.

    # --- limits ---
    'min_level_db': -100,
    'ref_level_db': 20,
    # Set this to 55 if your speaker is male! If female, 95 should help taking off noise.
    # (To test depending on dataset. Pitch info: male~[65, 260], female~[100, 525])
    'fmin': 55,
    'fmax': 7600,  # To be increased/reduced depending on data.

    ###################### Our training parameters #################################
    'img_size': 96,
    'fps': 25,

    'batch_size': 8,
    'initial_learning_rate': 1e-4,
    # ctrl + c, stop whenever eval loss is consistently greater than train loss for ~10 epochs
    'nepochs': 300000,
    'num_workers': 20,
    'checkpoint_interval': 3000,
    'eval_interval': 3000,
    'writer_interval': 300,
    'save_optimizer_state': True,

    # Is initially zero, will be set automatically to 0.03 later. Leads to faster convergence.
    'syncnet_wt': 0.0,
    'syncnet_batch_size': 64,
    'syncnet_lr': 1e-4,
    'syncnet_eval_interval': 10000,
    'syncnet_checkpoint_interval': 10000,

    'disc_wt': 0.07,
    'disc_initial_learning_rate': 1e-4,
}
hparams = HParams(**_HPARAM_DEFAULTS)
# Audio-only hyperparameter set (same audio values as `hparams`, no training entries)
_DEBUG_HPARAM_DEFAULTS = {
    # Number of mel-spectrogram channels and local conditioning dimensionality
    'num_mels': 80,

    # --- network ---
    'rescale': True,  # Whether to rescale audio prior to preprocessing
    'rescaling_max': 0.9,  # Rescaling value

    # Use LWS (https://github.com/Jonathan-LeRoux/lws) for STFT and phase reconstruction.
    # It's preferred to set True to use with https://github.com/r9y9/wavenet_vocoder
    # Does not work if n_fft is not a multiple of hop_size!!
    'use_lws': False,

    'n_fft': 800,  # Extra window size is filled with 0 paddings to match this parameter
    'hop_size': 200,  # For 16000Hz, 200 = 12.5 ms (0.0125 * sample_rate)
    'win_size': 800,  # For 16000Hz, 800 = 50 ms (If None, win_size = n_fft) (0.05 * sample_rate)
    'sample_rate': 16000,  # 16000Hz (corresponding to librispeech) (sox --i <filename>)
    'frame_shift_ms': None,  # Can replace hop_size parameter. (Recommended: 12.5)

    # --- mel and linear spectrogram normalization / scaling / clipping ---
    # Whether to normalize spectrograms to a predefined range (set by the parameters below)
    'signal_normalization': True,
    'allow_clipping_in_normalization': True,  # Only relevant if signal_normalization = True
    # Whether to scale the data to be symmetric around 0. (Also multiplies the output
    # range by 2, faster and cleaner convergence)
    'symmetric_mels': True,
    # Max absolute value of data. If symmetric, data will be [-max, max] else [0, max]
    # (must not be too big to avoid gradient explosion, not too small for fast convergence)
    'max_abs_value': 4.,

    # Contribution by @begeekmyfriend
    # Spectrogram pre-emphasis (lfilter: reduces spectrogram noise and helps model
    # certitude levels. Also allows for better G&L phase reconstruction)
    'preemphasize': True,  # whether to apply filter
    'preemphasis': 0.97,  # filter coefficient.

    # --- limits ---
    'min_level_db': -100,
    'ref_level_db': 20,
    # Set this to 55 if your speaker is male! If female, 95 should help taking off noise.
    # (To test depending on dataset. Pitch info: male~[65, 260], female~[100, 525])
    'fmin': 55,
    'fmax': 7600,  # To be increased/reduced depending on data.
}
hparamsdebug = HParams(**_DEBUG_HPARAM_DEFAULTS)
def hparams_debug_string():
    """Return the default hyperparameters as a sorted, human-readable listing.

    The entry named "sentences", if present, is left out.
    """
    # HParams keeps its settings in the plain dict `data`; asking it for
    # `values` would be treated as a missing hyperparameter and raise.
    values = hparams.data
    lines = ["  %s: %s" % (name, values[name]) for name in sorted(values) if name != "sentences"]
    return "Hyperparameters:\n" + "\n".join(lines)

videoretalking/utils/inference_utils.py ADDED Viewed

	@@ -0,0 +1,254 @@

+import numpy as np
+import cv2, argparse, torch
+import torchvision.transforms.functional as TF
+from models import load_network, load_DNet
+from tqdm import tqdm
+from PIL import Image
+from scipy.spatial import ConvexHull
+from third_part import face_detection
+from third_part.face3d.models import networks
+import warnings
+warnings.filterwarnings("ignore")
def options(argv=None):
    """Parse the command-line options of the lip-sync inference script.

    :param argv: optional list of argument strings; ``sys.argv[1:]`` is used
        when omitted
    :return: the parsed ``argparse.Namespace``
    """
    parser = argparse.ArgumentParser(description='Inference code to lip-sync videos in the wild using Wav2Lip models')

    # Model checkpoints
    parser.add_argument('--DNet_path', type=str, default='checkpoints/DNet.pt')
    parser.add_argument('--LNet_path', type=str, default='checkpoints/LNet.pth')
    parser.add_argument('--ENet_path', type=str, default='checkpoints/ENet.pth')
    parser.add_argument('--face3d_net_path', type=str, default='checkpoints/face3d_pretrain_epoch_20.pth')

    # Inputs / outputs
    parser.add_argument('--face', type=str, help='Filepath of video/image that contains faces to use', required=True)
    parser.add_argument('--audio', type=str, help='Filepath of video/audio file to use as raw audio source', required=True)
    parser.add_argument('--exp_img', type=str, help='Expression template. neutral, smile or image path', default='neutral')
    parser.add_argument('--outfile', type=str, help='Video path to save result')

    # Processing settings
    parser.add_argument('--fps', type=float, help='Can be specified only if input is a static image (default: 25)', default=25., required=False)
    parser.add_argument('--pads', nargs='+', type=int, default=[0, 20, 0, 0], help='Padding (top, bottom, left, right). Please adjust to include chin at least')
    parser.add_argument('--face_det_batch_size', type=int, help='Batch size for face detection', default=4)
    parser.add_argument('--LNet_batch_size', type=int, help='Batch size for LNet', default=16)
    parser.add_argument('--img_size', type=int, default=384)
    parser.add_argument('--crop', nargs='+', type=int, default=[0, -1, 0, -1],
                        help='Crop video to a smaller region (top, bottom, left, right). Applied after resize_factor and rotate arg. '
                        'Useful if multiple face present. -1 implies the value will be auto-inferred based on height, width')
    # The first literal ends in a space so the two help sentences do not run together.
    parser.add_argument('--box', nargs='+', type=int, default=[-1, -1, -1, -1],
                        help='Specify a constant bounding box for the face. Use only as a last resort if the face is not detected. '
                        'Also, might work only if the face is not moving around much. Syntax: (top, bottom, left, right).')
    parser.add_argument('--nosmooth', default=False, action='store_true', help='Prevent smoothing face detections over a short temporal window')
    parser.add_argument('--static', default=False, action='store_true')

    parser.add_argument('--up_face', default='original')
    parser.add_argument('--one_shot', action='store_true')
    parser.add_argument('--without_rl1', default=False, action='store_true', help='Do not use the relative l1')
    parser.add_argument('--tmp_dir', type=str, default='temp', help='Folder to save tmp results')
    parser.add_argument('--re_preprocess', action='store_true')

    return parser.parse_args(argv)
# Order of the 17 action-unit intensities in every preset row.
_AU_ORDER = (
    'AU01_r', 'AU02_r', 'AU04_r', 'AU05_r', 'AU06_r', 'AU07_r', 'AU09_r', 'AU10_r', 'AU12_r',
    'AU14_r', 'AU15_r', 'AU17_r', 'AU20_r', 'AU23_r', 'AU25_r', 'AU26_r', 'AU45_r',
)


def _au_preset(**active):
    """Return a (1, 17) tensor that is zero except for the named action units."""
    return torch.Tensor([[active.get(name, 0) for name in _AU_ORDER]])


# Expression presets; note that 'sad' activates no action unit at all.
exp_aus_dict = {
    'sad': _au_preset(),
    'angry': _au_preset(AU04_r=0.3),
    'surprise': _au_preset(AU05_r=0.2),
}
def mask_postprocess(mask, thres=20):
    """Zero a ``thres``-pixel border of ``mask`` and blur the result twice.

    The border is zeroed on the passed-in array itself; the blurred mask is
    returned as a new float32 array.
    """
    mask[:thres, :] = 0
    mask[-thres:, :] = 0
    mask[:, :thres] = 0
    mask[:, -thres:] = 0
    for _ in range(2):
        mask = cv2.GaussianBlur(mask, (101, 101), 11)
    return mask.astype(np.float32)
def trans_image(image):
    """Resize ``image`` to size 256 (bicubic), convert it to a tensor and
    normalise each of the three channels with mean 0.5 and std 0.5.
    """
    resized = TF.resize(image, size=256, interpolation=Image.BICUBIC)
    tensor = TF.to_tensor(resized)
    half = (0.5, 0.5, 0.5)
    return TF.normalize(tensor, mean=half, std=half)
def obtain_seq_index(index, num_frames):
    """Return the 26 frame indices ``index-13 .. index+12``, each clamped to
    the valid range ``[0, num_frames - 1]``.
    """
    last = num_frames - 1
    clamped = []
    for frame in range(index - 13, index + 13):
        clamped.append(min(max(frame, 0), last))
    return clamped
def transform_semantic(semantic, frame_index, crop_norm_ratio=None):
    """Select the coefficient window around one frame and pack it as a tensor.

    :param semantic: per-frame coefficient array, one row per frame
    :param frame_index: centre frame of the window
    :param crop_norm_ratio: optional factor applied to the first crop
        parameter (column 259); scalar or array-like
    :return: torch.Tensor with the window on the last axis; rows are
        expression (64), angles (3), translation (3) and crop (3) values
    """
    index = obtain_seq_index(frame_index, semantic.shape[0])
    # For a NumPy array, indexing with a list copies the rows, so the scaling
    # below does not write back into `semantic`.
    coeff_3dmm = semantic[index, ...]
    ex_coeff = coeff_3dmm[:, 80:144]  # expression # 64
    angles = coeff_3dmm[:, 224:227]  # euler angles for pose
    translation = coeff_3dmm[:, 254:257]  # translation
    crop = coeff_3dmm[:, 259:262]  # crop param

    # Explicit None test: truth-testing an array-valued ratio is ambiguous.
    if crop_norm_ratio is not None:
        crop[:, -3] = crop[:, -3] * crop_norm_ratio

    coeff_3dmm = np.concatenate([ex_coeff, angles, translation, crop], 1)
    return torch.Tensor(coeff_3dmm).permute(1, 0)
def find_crop_norm_ratio(source_coeff, target_coeffs):
    """Ratio between the source's third-from-last coefficient and that of the
    most similar target frame.

    Similarity is a weighted mean absolute difference of the expression
    columns (80:144, weight 0.3) and the angle columns (224:227, weight 0.7).
    """
    alpha = 0.3
    exp_cols = slice(80, 144)
    angle_cols = slice(224, 227)

    exp_diff = np.mean(np.abs(target_coeffs[:, exp_cols] - source_coeff[:, exp_cols]), 1)  # mean expression difference
    angle_diff = np.mean(np.abs(target_coeffs[:, angle_cols] - source_coeff[:, angle_cols]), 1)  # mean angle difference
    closest = np.argmin(alpha * exp_diff + (1 - alpha) * angle_diff)  # index of the closest frame
    return source_coeff[:, -3] / target_coeffs[closest:closest + 1, -3]
def get_smoothened_boxes(boxes, T):
    """Smooth ``boxes`` in place with a forward-looking mean over ``T`` entries.

    Entry ``i`` becomes the mean of ``boxes[i:i+T]``; near the end, where that
    window would run past the array, the last ``T`` entries are averaged
    instead. The same array is returned.
    """
    count = len(boxes)
    for start in range(count):
        stop = start + T
        window = boxes[count - T:] if stop > count else boxes[start:stop]
        boxes[start] = np.mean(window, axis=0)
    return boxes
def face_detect(images, face_det_batch_size, nosmooth, pads, jaw_correction, detector=None):
    """Detect one face per frame and return the padded face crops.

    :param images: list of frames as numpy arrays
    :param face_det_batch_size: initial detection batch size; halved after
        every RuntimeError until it reaches 1
    :param nosmooth: when true, boxes are not smoothed across frames
    :param pads: padding ``(top, bottom, left, right)``; only used when
        ``jaw_correction`` is true, otherwise ``(0, 20, 0, 0)`` applies
    :param jaw_correction: selects between ``pads`` and the fixed padding
    :param detector: optional face detector; one is created when omitted
    :return: list of ``[face_crop, (y1, y2, x1, x2)]``, one per frame
    :raises RuntimeError: if detection fails even with batch size 1
    :raises ValueError: if a frame contains no detectable face
    """
    if detector is None:
        device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
        detector = face_detection.FaceAlignment(face_detection.LandmarksType._2D,
                                                flip_input=False, device=device)

    batch_size = face_det_batch_size
    while True:
        predictions = []
        try:
            for i in tqdm(range(0, len(images), batch_size), desc='FaceDet:'):
                predictions.extend(detector.get_detections_for_batch(np.array(images[i:i + batch_size])))
        except RuntimeError as err:
            if batch_size == 1:
                # NOTE(review): options() in this file defines no --resize_factor flag.
                raise RuntimeError('Image too big to run face detection on GPU. Please use the --resize_factor argument') from err
            batch_size //= 2
            print('Recovering from OOM error; New batch size: {}'.format(batch_size))
            continue
        break

    results = []
    pady1, pady2, padx1, padx2 = pads if jaw_correction else (0, 20, 0, 0)
    for rect, image in zip(predictions, images):
        if rect is None:
            cv2.imwrite('temp/faulty_frame.jpg', image)  # check this frame where the face was not detected.
            raise ValueError('Face not detected! Ensure the video contains a face in all the frames.')

        # Pad the detection and clamp it to the frame.
        y1 = max(0, rect[1] - pady1)
        y2 = min(image.shape[0], rect[3] + pady2)
        x1 = max(0, rect[0] - padx1)
        x2 = min(image.shape[1], rect[2] + padx2)
        results.append([x1, y1, x2, y2])

    boxes = np.array(results)
    if not nosmooth:
        boxes = get_smoothened_boxes(boxes, T=5)
    results = [[image[y1: y2, x1:x2], (y1, y2, x1, x2)] for image, (x1, y1, x2, y2) in zip(images, boxes)]

    # Drop the local reference to the detector and release cached GPU memory.
    del detector
    torch.cuda.empty_cache()
    return results
+def _load(checkpoint_path, device):
+    if device == 'cuda':
+        checkpoint = torch.load(checkpoint_path)
+    else:
+        checkpoint = torch.load(checkpoint_path,
+                                map_location=lambda storage, loc: storage)
+    return checkpoint
def split_coeff(coeffs):
    """
    Split a batch of coefficient vectors into named groups.

    Return:
        coeffs_dict     -- a dict with keys 'id', 'exp', 'tex', 'angle',
                           'gamma', 'trans'; each value is a column slice
                           of the input
    Parameters:
        coeffs          -- 2-D tensor/array, size (B, N); columns 0:254 form
                           the five fixed-width groups and every column from
                           254 onward goes to 'trans'
    """
    # (key, first column, one-past-last column); None means "to the end".
    layout = (
        ('id', 0, 80),
        ('exp', 80, 144),
        ('tex', 144, 224),
        ('angle', 224, 227),
        ('gamma', 227, 254),
        ('trans', 254, None),
    )
    return {key: coeffs[:, start:stop] for key, start, stop in layout}
def Laplacian_Pyramid_Blending_with_mask(A, B, m, num_levels = 6):
    """Blend image ``A`` over image ``B`` with mask ``m`` via Laplacian pyramids.

    At every pyramid level the result is ``A * mask + B * (1 - mask)``, so a
    mask value of 1 selects ``A`` and 0 selects ``B``.  Each up-sampling step
    is sized to the level it is combined with, so inputs whose height or
    width is odd at some level are handled as well.

    :param A: first image; must support ``.copy()`` and ``cv2.pyrDown``
        (presumably H x W x C -- the mask gets a trailing axis to match).
    :param B: second image, same shape as ``A``.
    :param m: 2-D blending mask with the same height and width as the images.
    :param num_levels: number of pyramid levels that are blended.
    :return: the blended image at the resolution of the inputs.
    """
    def pyr_up_like(img, reference):
        # Up-sample img to exactly the height/width of reference.
        height, width = reference.shape[:2]
        return cv2.pyrUp(img, dstsize=(width, height))

    # generate Gaussian pyramid for A, B and mask (level 0 is the input itself)
    GA = A.copy()
    GB = B.copy()
    GM = m.copy()
    gpA = [GA]
    gpB = [GB]
    gpM = [GM]
    # Only levels 0 .. num_levels-1 are ever read, so stop one level early.
    for _ in range(num_levels - 1):
        GA = cv2.pyrDown(GA)
        GB = cv2.pyrDown(GB)
        GM = cv2.pyrDown(GM)
        gpA.append(np.float32(GA))
        gpB.append(np.float32(GB))
        gpM.append(np.float32(GM))

    # generate Laplacian pyramids for A, B and the matching (reversed) masks;
    # the first entry of each list is the smallest Gaussian level
    lpA = [gpA[num_levels-1]]
    lpB = [gpB[num_levels-1]]
    gpMr = [gpM[num_levels-1]]
    for i in range(num_levels-1, 0, -1):
        # Laplacian: subtract the up-scaled coarser level from the current
        # level to keep only the high frequencies
        LA = np.subtract(gpA[i-1], pyr_up_like(gpA[i], gpA[i-1]))
        LB = np.subtract(gpB[i-1], pyr_up_like(gpB[i], gpB[i-1]))
        lpA.append(LA)
        lpB.append(LB)
        gpMr.append(gpM[i-1])

    # blend the two pyramids level by level according to the mask
    LS = []
    for la, lb, gm in zip(lpA, lpB, gpMr):
        gm = gm[:, :, np.newaxis]
        LS.append(la * gm + lb * (1.0 - gm))

    # reconstruct from the coarsest blended level upwards
    ls_ = LS[0]
    for i in range(1, num_levels):
        ls_ = pyr_up_like(ls_, LS[i])
        ls_ = cv2.add(ls_, LS[i])
    return ls_
def load_model(device,DNet_path,LNet_path,ENet_path):
    """Load both networks and move them to ``device``.

    :param device: target passed to each network's ``.to()``.
    :param DNet_path: checkpoint path handed to ``load_DNet``.
    :param LNet_path: first checkpoint path handed to ``load_network``.
    :param ENet_path: second checkpoint path handed to ``load_network``.
    :return: tuple ``(D_Net, model)``.
    """
    # Tuple elements are evaluated left to right, so D-Net is loaded first.
    return (
        load_DNet(DNet_path).to(device),
        load_network(LNet_path, ENet_path).to(device),
    )
def normalize_kp(kp_source, kp_driving, kp_driving_initial, adapt_movement_scale=False,
                 use_relative_movement=False, use_relative_jacobian=False):
    """Re-express driving keypoints relative to the source keypoints.

    :param kp_source: dict with a ``'value'`` tensor (and ``'jacobian'`` when
        relative jacobians are requested).
    :param kp_driving: dict with the same keys for the current driving frame.
    :param kp_driving_initial: dict with the same keys for the first driving frame.
    :param adapt_movement_scale: when truthy, the keypoint displacement is
        scaled by ``sqrt(source hull volume) / sqrt(initial driving hull
        volume)``, using the first batch element of each ``'value'``.
    :param use_relative_movement: when truthy, ``'value'`` becomes the scaled
        displacement from the initial driving frame added to the source value.
    :param use_relative_jacobian: when truthy (and relative movement is on),
        ``'jacobian'`` becomes ``driving @ inverse(initial) @ source``.
    :return: a new dict; entries that are not replaced are the same objects
        as in ``kp_driving``.
    """
    if adapt_movement_scale:
        source_hull = ConvexHull(kp_source['value'][0].data.cpu().numpy())
        initial_hull = ConvexHull(kp_driving_initial['value'][0].data.cpu().numpy())
        movement_scale = np.sqrt(source_hull.volume) / np.sqrt(initial_hull.volume)
    else:
        movement_scale = 1

    kp_new = dict(kp_driving.items())
    if not use_relative_movement:
        return kp_new

    displacement = kp_driving['value'] - kp_driving_initial['value']
    displacement *= movement_scale
    kp_new['value'] = displacement + kp_source['value']

    if use_relative_jacobian:
        initial_inverse = torch.inverse(kp_driving_initial['jacobian'])
        relative = torch.matmul(kp_driving['jacobian'], initial_inverse)
        kp_new['jacobian'] = torch.matmul(relative, kp_source['jacobian'])

    return kp_new
def load_face3d_net(ckpt_path, device):
    """Build the reconstruction network and load its weights.

    The network comes from ``networks.define_net_recon`` (``'resnet50'``,
    ``use_last_fc=False``, empty ``init_path``); its weights are read from
    the ``'net_recon'`` entry of the checkpoint and it is switched to eval
    mode before being returned.

    NOTE(review): ``torch.load`` unpickles the file; only load checkpoints
    from trusted sources.

    :param ckpt_path: checkpoint file passed to ``torch.load``.
    :param device: device for both the network and ``map_location``.
    :return: the network, on ``device``, in eval mode.
    """
    recon_net = networks.define_net_recon(
        net_recon='resnet50',
        use_last_fc=False,
        init_path='',
    )
    recon_net = recon_net.to(device)

    state = torch.load(ckpt_path, map_location=device)
    recon_net.load_state_dict(state['net_recon'])
    recon_net.eval()
    return recon_net