Upload 49 files
- .gitattributes +2 -0
- app.py +66 -0
- assets/id02548.0pAkJZmlFqc.00001_id04570.0YMGn6BI9rg.00001.gif +3 -0
- assets/website_gif_v2.gif +3 -0
- audio/__init__.py +0 -0
- audio/audio.py +136 -0
- audio/hparams.py +66 -0
- checkpoints/checkpoint.pt +3 -0
- dataset/LRW/lrw_fullpath.py +25 -0
- dataset/filelists/lrw_cross.txt +0 -0
- dataset/filelists/lrw_cross_relative_path.txt +0 -0
- dataset/filelists/lrw_reconstruction.txt +0 -0
- dataset/filelists/lrw_reconstruction_relative_path.txt +0 -0
- dataset/filelists/voxceleb2_test_n_5000_reconstruction_5k.txt +0 -0
- dataset/filelists/voxceleb2_test_n_5000_seed_797_cross_5K.txt +0 -0
- dataset/filelists/voxceleb2_test_n_500_reconstruction.txt +500 -0
- dataset/filelists/voxceleb2_test_n_500_seed_797_cross.txt +500 -0
- face_detection/README.md +1 -0
- face_detection/__init__.py +7 -0
- face_detection/api.py +98 -0
- face_detection/detection/__init__.py +1 -0
- face_detection/detection/core.py +130 -0
- face_detection/detection/sfd/__init__.py +1 -0
- face_detection/detection/sfd/bbox.py +129 -0
- face_detection/detection/sfd/detect.py +112 -0
- face_detection/detection/sfd/net_s3fd.py +129 -0
- face_detection/detection/sfd/sfd_detector.py +59 -0
- face_detection/models.py +261 -0
- face_detection/utils.py +313 -0
- generate.py +398 -0
- generate_dist.py +428 -0
- guided-diffusion/LICENSE +21 -0
- guided-diffusion/guided_diffusion/__init__.py +3 -0
- guided-diffusion/guided_diffusion/dist_util.py +94 -0
- guided-diffusion/guided_diffusion/fp16_util.py +237 -0
- guided-diffusion/guided_diffusion/gaussian_diffusion.py +843 -0
- guided-diffusion/guided_diffusion/image_datasets.py +167 -0
- guided-diffusion/guided_diffusion/logger.py +491 -0
- guided-diffusion/guided_diffusion/losses.py +77 -0
- guided-diffusion/guided_diffusion/lpips.py +20 -0
- guided-diffusion/guided_diffusion/nn.py +170 -0
- guided-diffusion/guided_diffusion/resample.py +154 -0
- guided-diffusion/guided_diffusion/respace.py +128 -0
- guided-diffusion/guided_diffusion/script_util.py +614 -0
- guided-diffusion/guided_diffusion/tfg_data_util.py +75 -0
- guided-diffusion/guided_diffusion/unet.py +1275 -0
- guided-diffusion/setup.py +7 -0
- requirements.txt +11 -0
- scripts/inference.sh +40 -0
- scripts/inference_single_video.sh +35 -0
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+assets/id02548.0pAkJZmlFqc.00001_id04570.0YMGn6BI9rg.00001.gif filter=lfs diff=lfs merge=lfs -text
+assets/website_gif_v2.gif filter=lfs diff=lfs merge=lfs -text
app.py
ADDED
import gradio as gr
import subprocess
import os

def process_video(audio_file, video_file):
    # Gradio file inputs arrive as temporary files already saved to disk;
    # only their paths are needed here.
    audio_path = audio_file.name
    video_path = video_file.name
    out_path = "output_video.mp4"

    # Define command flags
    sample_mode = "cross"  # or "reconstruction"
    generate_from_filelist = 0
    model_path = "checkpoints/checkpoint.pt"
    pads = "0,0,0,0"

    if sample_mode == "reconstruction":
        sample_input_flags = "--sampling_input_type=first_frame --sampling_ref_type=first_frame"
    elif sample_mode == "cross":
        sample_input_flags = "--sampling_input_type=gt --sampling_ref_type=gt"
    else:
        return "Error: sample_mode can only be \"cross\" or \"reconstruction\""

    MODEL_FLAGS = "--attention_resolutions 32,16,8 --class_cond False --learn_sigma True --num_channels 128 --num_head_channels 64 --num_res_blocks 2 --resblock_updown True --use_fp16 True --use_scale_shift_norm False"
    DIFFUSION_FLAGS = "--predict_xstart False --diffusion_steps 1000 --noise_schedule linear --rescale_timesteps False"
    SAMPLE_FLAGS = f"--sampling_seed=7 {sample_input_flags} --timestep_respacing ddim25 --use_ddim True --model_path={model_path}"
    DATA_FLAGS = "--nframes 5 --nrefer 1 --image_size 128 --sampling_batch_size=32"
    TFG_FLAGS = "--face_hide_percentage 0.5 --use_ref=True --use_audio=True --audio_as_style=True"
    GEN_FLAGS = f"--generate_from_filelist {generate_from_filelist} --video_path={video_path} --audio_path={audio_path} --out_path={out_path} --save_orig=False --face_det_batch_size 16 --pads {pads} --is_voxceleb2=False"

    # Combine all flags into one command
    command = f"python your_model_script.py {MODEL_FLAGS} {DIFFUSION_FLAGS} {SAMPLE_FLAGS} {DATA_FLAGS} {TFG_FLAGS} {GEN_FLAGS}"

    # Execute the command and return the generated video on success.
    # out_path is intentionally left on disk so Gradio can stream it back
    # to the user after this function returns.
    try:
        subprocess.run(command, shell=True, check=True)
        return out_path
    except subprocess.CalledProcessError as e:
        return f"Error processing video: {e}"
    finally:
        # Clean up the uploaded temporary files, whether or not generation succeeded.
        for path in (audio_path, video_path):
            if os.path.exists(path):
                os.remove(path)

# Create a Gradio interface
iface = gr.Interface(
    fn=process_video,
    inputs=[
        gr.inputs.Audio(label="Input Audio", type="file"),
        gr.inputs.Video(label="Input Video", type="file")
    ],
    outputs=gr.outputs.Video(label="Processed Video"),
    title="Audio-Video Processing",
    description="Upload an audio file and a video file to process the video based on the audio input."
)

# Launch the interface
iface.launch()
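One design note on the command execution above: interpolating the Gradio temp-file paths into a single string run with shell=True means any unusual characters in those paths reach the shell unescaped. A minimal sketch of the list-based alternative, under the same placeholder script name (your_model_script.py) and with hypothetical paths standing in for the uploaded files:

import subprocess

# Build the command as an argument list so no shell parsing is involved;
# only a reduced subset of the flags is shown for illustration.
cmd = [
    "python", "your_model_script.py",
    "--model_path=checkpoints/checkpoint.pt",
    "--video_path=/tmp/input_video.mp4",   # hypothetical upload path
    "--audio_path=/tmp/input_audio.wav",   # hypothetical upload path
    "--out_path=output_video.mp4",
]
subprocess.run(cmd, check=True)  # raises CalledProcessError on a non-zero exit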
assets/id02548.0pAkJZmlFqc.00001_id04570.0YMGn6BI9rg.00001.gif
ADDED
Git LFS Details

assets/website_gif_v2.gif
ADDED
Git LFS Details

audio/__init__.py
ADDED
File without changes
audio/audio.py
ADDED
import librosa
import librosa.filters
import numpy as np
# import tensorflow as tf
from scipy import signal
from scipy.io import wavfile
from .hparams import hparams as hp

def load_wav(path, sr):
    return librosa.core.load(path, sr=sr)[0]

def save_wav(wav, path, sr):
    wav *= 32767 / max(0.01, np.max(np.abs(wav)))
    # proposed by @dsmiller
    wavfile.write(path, sr, wav.astype(np.int16))

def save_wavenet_wav(wav, path, sr):
    librosa.output.write_wav(path, wav, sr=sr)

def preemphasis(wav, k, preemphasize=True):
    if preemphasize:
        return signal.lfilter([1, -k], [1], wav)
    return wav

def inv_preemphasis(wav, k, inv_preemphasize=True):
    if inv_preemphasize:
        return signal.lfilter([1], [1, -k], wav)
    return wav

def get_hop_size():
    hop_size = hp.hop_size
    if hop_size is None:
        assert hp.frame_shift_ms is not None
        hop_size = int(hp.frame_shift_ms / 1000 * hp.sample_rate)
    return hop_size

def linearspectrogram(wav):
    D = _stft(preemphasis(wav, hp.preemphasis, hp.preemphasize))
    S = _amp_to_db(np.abs(D)) - hp.ref_level_db

    if hp.signal_normalization:
        return _normalize(S)
    return S

def melspectrogram(wav):
    D = _stft(preemphasis(wav, hp.preemphasis, hp.preemphasize))
    S = _amp_to_db(_linear_to_mel(np.abs(D))) - hp.ref_level_db

    if hp.signal_normalization:
        return _normalize(S)
    return S

def _lws_processor():
    import lws
    return lws.lws(hp.n_fft, get_hop_size(), fftsize=hp.win_size, mode="speech")

def _stft(y):
    if hp.use_lws:
        return _lws_processor().stft(y).T
    else:
        return librosa.stft(y=y, n_fft=hp.n_fft, hop_length=get_hop_size(), win_length=hp.win_size)

##########################################################
# Those are only correct when using lws!!! (This was messing with Wavenet quality for a long time!)
def num_frames(length, fsize, fshift):
    """Compute number of time frames of spectrogram"""
    pad = (fsize - fshift)
    if length % fshift == 0:
        M = (length + pad * 2 - fsize) // fshift + 1
    else:
        M = (length + pad * 2 - fsize) // fshift + 2
    return M


def pad_lr(x, fsize, fshift):
    """Compute left and right padding"""
    M = num_frames(len(x), fsize, fshift)
    pad = (fsize - fshift)
    T = len(x) + 2 * pad
    r = (M - 1) * fshift + fsize - T
    return pad, pad + r
##########################################################
# Librosa correct padding
def librosa_pad_lr(x, fsize, fshift):
    return 0, (x.shape[0] // fshift + 1) * fshift - x.shape[0]

# Conversions
_mel_basis = None

def _linear_to_mel(spectogram):
    global _mel_basis
    if _mel_basis is None:
        _mel_basis = _build_mel_basis()
    return np.dot(_mel_basis, spectogram)

def _build_mel_basis():
    assert hp.fmax <= hp.sample_rate // 2
    return librosa.filters.mel(sr=hp.sample_rate, n_fft=hp.n_fft, n_mels=hp.num_mels,
                               fmin=hp.fmin, fmax=hp.fmax)

def _amp_to_db(x):
    min_level = np.exp(hp.min_level_db / 20 * np.log(10))
    return 20 * np.log10(np.maximum(min_level, x))

def _db_to_amp(x):
    return np.power(10.0, (x) * 0.05)

def _normalize(S):
    if hp.allow_clipping_in_normalization:
        if hp.symmetric_mels:
            return np.clip((2 * hp.max_abs_value) * ((S - hp.min_level_db) / (-hp.min_level_db)) - hp.max_abs_value,
                           -hp.max_abs_value, hp.max_abs_value)
        else:
            return np.clip(hp.max_abs_value * ((S - hp.min_level_db) / (-hp.min_level_db)), 0, hp.max_abs_value)

    assert S.max() <= 0 and S.min() - hp.min_level_db >= 0
    if hp.symmetric_mels:
        return (2 * hp.max_abs_value) * ((S - hp.min_level_db) / (-hp.min_level_db)) - hp.max_abs_value
    else:
        return hp.max_abs_value * ((S - hp.min_level_db) / (-hp.min_level_db))

def _denormalize(D):
    if hp.allow_clipping_in_normalization:
        if hp.symmetric_mels:
            return (((np.clip(D, -hp.max_abs_value,
                              hp.max_abs_value) + hp.max_abs_value) * -hp.min_level_db / (2 * hp.max_abs_value))
                    + hp.min_level_db)
        else:
            return ((np.clip(D, 0, hp.max_abs_value) * -hp.min_level_db / hp.max_abs_value) + hp.min_level_db)

    if hp.symmetric_mels:
        return (((D + hp.max_abs_value) * -hp.min_level_db / (2 * hp.max_abs_value)) + hp.min_level_db)
    else:
        return ((D * -hp.min_level_db / hp.max_abs_value) + hp.min_level_db)
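For orientation, a minimal usage sketch of this module as imported from the repository root (the input wav path is a hypothetical example):

from audio import audio as audio_lib  # package layout as uploaded: audio/audio.py

wav = audio_lib.load_wav("example.wav", sr=16000)  # hypothetical input file
mel = audio_lib.melspectrogram(wav)
# With the default hparams (num_mels=80, hop_size=200 at 16 kHz),
# mel has shape (80, n_frames), roughly 80 frames per second of audio.
print(mel.shape)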
audio/hparams.py
ADDED
from glob import glob
import os


class HParams:
    def __init__(self, **kwargs):
        self.data = {}

        for key, value in kwargs.items():
            self.data[key] = value

    def __getattr__(self, key):
        if key not in self.data:
            raise AttributeError("'HParams' object has no attribute %s" % key)
        return self.data[key]

    def set_hparam(self, key, value):
        self.data[key] = value


# Default hyperparameters
hparams = HParams(
    num_mels=80,  # Number of mel-spectrogram channels and local conditioning dimensionality
    # network
    rescale=True,  # Whether to rescale audio prior to preprocessing
    rescaling_max=0.9,  # Rescaling value

    # Use LWS (https://github.com/Jonathan-LeRoux/lws) for STFT and phase reconstruction
    # It's preferred to set True to use with https://github.com/r9y9/wavenet_vocoder
    # Does not work if n_fft is not a multiple of hop_size!!
    use_lws=False,

    n_fft=800,  # Extra window size is filled with 0 paddings to match this parameter
    hop_size=200,  # For 16000 Hz, 200 = 12.5 ms (0.0125 * sample_rate)
    win_size=800,  # For 16000 Hz, 800 = 50 ms (If None, win_size = n_fft) (0.05 * sample_rate)
    sample_rate=16000,  # 16000 Hz (corresponding to librispeech) (sox --i <filename>)

    frame_shift_ms=None,  # Can replace the hop_size parameter. (Recommended: 12.5)

    # Mel and Linear spectrograms normalization/scaling and clipping
    signal_normalization=True,
    # Whether to normalize mel spectrograms to some predefined range (following below parameters)
    allow_clipping_in_normalization=True,  # Only relevant if mel_normalization = True
    symmetric_mels=True,
    # Whether to scale the data to be symmetric around 0. (Also multiplies the output range by 2,
    # faster and cleaner convergence)
    max_abs_value=4.,
    # Max absolute value of data. If symmetric, data will be [-max, max], else [0, max]. (Must not
    # be too big to avoid gradient explosion, not too small for fast convergence)
    # Contribution by @begeekmyfriend
    # Spectrogram pre-emphasis (lfilter: reduces spectrogram noise and helps model certitude
    # levels. Also allows for better G&L phase reconstruction)
    preemphasize=True,  # whether to apply the filter
    preemphasis=0.97,  # filter coefficient

    # Limits
    min_level_db=-100,
    ref_level_db=20,
    fmin=55,
    # Set this to 55 if your speaker is male! If female, 95 should help take off noise. (To
    # test depending on dataset. Pitch info: male ~ [65, 260], female ~ [100, 525])
    fmax=7600,  # To be increased/reduced depending on data.
)
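The timing relationships encoded in these defaults can be checked directly; a small sketch, assuming the defaults above are left untouched:

from audio.hparams import hparams as hp

# A hop_size of 200 samples at 16 kHz gives a 12.5 ms frame shift,
# i.e. 80 mel frames per second of audio.
frame_shift_ms = hp.hop_size / hp.sample_rate * 1000
frames_per_second = hp.sample_rate / hp.hop_size
assert frame_shift_ms == 12.5 and frames_per_second == 80.0

# A win_size of 800 samples is a 50 ms analysis window, matching n_fft.
assert hp.win_size / hp.sample_rate == 0.05 and hp.win_size == hp.n_fft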
checkpoints/checkpoint.pt
ADDED
version https://git-lfs.github.com/spec/v1
oid sha256:8c71166482d2b893f2f77450563a1bb31d805f3048c7213b974fd9201e9aa4b3
size 406815527
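Once Git LFS has materialized the real checkpoint (e.g. after a git lfs pull), the download can be verified against the pointer's oid and size; a sketch using only the values from the pointer above:

import hashlib

# Expected values from the Git LFS pointer file above.
EXPECTED_SHA256 = "8c71166482d2b893f2f77450563a1bb31d805f3048c7213b974fd9201e9aa4b3"
EXPECTED_SIZE = 406815527

h = hashlib.sha256()
size = 0
with open("checkpoints/checkpoint.pt", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # read in 1 MiB chunks
        h.update(chunk)
        size += len(chunk)

assert size == EXPECTED_SIZE, "size mismatch: file may still be an LFS pointer"
assert h.hexdigest() == EXPECTED_SHA256, "sha256 mismatch"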
dataset/LRW/lrw_fullpath.py
ADDED
'''Converts the LRW video names in filelists to LRW relative paths and dumps them into new filelists'''
import os

# Each line holds "<audio_name> <video_name>"; the LRW word is the token before
# the first "_", and the relative path is "<WORD>/test/<name>". The same rewrite
# is applied to both filelists.
for filelist in ("../filelists/lrw_cross.txt", "../filelists/lrw_reconstruction.txt"):
    filelist_split_path = filelist.replace(".txt", "_relative_path.txt")
    with open(filelist, 'r') as f:
        lines = f.readlines()
    with open(filelist_split_path, 'w') as f:
        for line in lines:
            audio_name, video_name = line.strip().split(' ')
            audio_word = audio_name.split('_')[0]
            video_word = video_name.split('_')[0]
            f.write(os.path.join(audio_word, 'test', audio_name) + ' '
                    + os.path.join(video_word, 'test', video_name) + '\n')
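As an illustration of the rewrite (the file names here are hypothetical; real LRW entries follow the WORD_xxxxx pattern):

import os

# Input line:  "ABOUT_00001.mp4 WORLD_00002.mp4"
# Output line: "ABOUT/test/ABOUT_00001.mp4 WORLD/test/WORLD_00002.mp4"
audio_name, video_name = "ABOUT_00001.mp4 WORLD_00002.mp4".split(' ')
print(os.path.join(audio_name.split('_')[0], 'test', audio_name),
      os.path.join(video_name.split('_')[0], 'test', video_name))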
dataset/filelists/lrw_cross.txt
ADDED
The diff for this file is too large to render. See raw diff.

dataset/filelists/lrw_cross_relative_path.txt
ADDED
The diff for this file is too large to render. See raw diff.

dataset/filelists/lrw_reconstruction.txt
ADDED
The diff for this file is too large to render. See raw diff.

dataset/filelists/lrw_reconstruction_relative_path.txt
ADDED
The diff for this file is too large to render. See raw diff.

dataset/filelists/voxceleb2_test_n_5000_reconstruction_5k.txt
ADDED
The diff for this file is too large to render. See raw diff.

dataset/filelists/voxceleb2_test_n_5000_seed_797_cross_5K.txt
ADDED
The diff for this file is too large to render. See raw diff.
dataset/filelists/voxceleb2_test_n_500_reconstruction.txt
ADDED
id09017/SjCgiXBHfNU/00111 id09017/SjCgiXBHfNU/00111
id05055/HobsYUHmgr0/00138 id05055/HobsYUHmgr0/00138
id01567/M47d5UckOV8/00099 id01567/M47d5UckOV8/00099
id01228/SH3eBLMsRwY/00211 id01228/SH3eBLMsRwY/00211
id07312/m1VY1sC_P_o/00093 id07312/m1VY1sC_P_o/00093
id08696/dJH9aBSs1nE/00370 id08696/dJH9aBSs1nE/00370
id07312/9PCY4xwxgcE/00006 id07312/9PCY4xwxgcE/00006
id07494/1A8ZDo11tzY/00006 id07494/1A8ZDo11tzY/00006
id00061/thHLZ8tDJ-M/00276 id00061/thHLZ8tDJ-M/00276
id03862/YQMdTzyG-P8/00297 id03862/YQMdTzyG-P8/00297
id04570/zsnG6eKzOGE/00406 id04570/zsnG6eKzOGE/00406
id07414/1m1C-CdhmZ0/00016 id07414/1m1C-CdhmZ0/00016
id01509/2uKpHd-euIo/00038 id01509/2uKpHd-euIo/00038
id04276/qHXwXqxL0mk/00401 id04276/qHXwXqxL0mk/00401
id04366/x-VQ6z2QC4w/00252 id04366/x-VQ6z2QC4w/00252
id07620/um6sY627GaE/00475 id07620/um6sY627GaE/00475
id01000/RvjbLfo3XDM/00052 id01000/RvjbLfo3XDM/00052
id07868/fnWDbUI_Zbg/00289 id07868/fnWDbUI_Zbg/00289
id01333/cymDCPEhalE/00351 id01333/cymDCPEhalE/00351
id02317/Mv16h1Bx7HE/00241 id02317/Mv16h1Bx7HE/00241
id02317/Vi4k3cuwfgc/00342 id02317/Vi4k3cuwfgc/00342
id01000/eGeGHhuOJJ0/00077 id01000/eGeGHhuOJJ0/00077
id03980/zaDLb12pDBQ/00130 id03980/zaDLb12pDBQ/00130
id05124/c-Pa7b81coQ/00354 id05124/c-Pa7b81coQ/00354
id04478/nhLuGj0vGb8/00234 id04478/nhLuGj0vGb8/00234
id01541/3su8tn9nwi4/00007 id01541/3su8tn9nwi4/00007
id06484/cmIyVotzXiE/00125 id06484/cmIyVotzXiE/00125
id06209/oxofNHGCj7s/00139 id06209/oxofNHGCj7s/00139
id02181/rxX3t2rzLbg/00146 id02181/rxX3t2rzLbg/00146
id02286/YL75-u9XYUM/00105 id02286/YL75-u9XYUM/00105
id04276/v9mSslwD0Kg/00470 id04276/v9mSslwD0Kg/00470
id07802/6qBSFfV_Mig/00042 id07802/6qBSFfV_Mig/00042
id04295/DtC2X1KG8TE/00057 id04295/DtC2X1KG8TE/00057
id00866/shG_183xFlw/00243 id00866/shG_183xFlw/00243
id03862/2nagLhV_Yvw/00012 id03862/2nagLhV_Yvw/00012
id04119/Yndoy1jgHWs/00042 id04119/Yndoy1jgHWs/00042
id04295/mTCDT_Fv5So/00203 id04295/mTCDT_Fv5So/00203
id08456/o-5hKwhGqac/00354 id08456/o-5hKwhGqac/00354
id07494/tv6GJkx_Wy4/00331 id07494/tv6GJkx_Wy4/00331
id04295/mClPHVzTCLI/00196 id04295/mClPHVzTCLI/00196
id04478/81Tb6kjlNIk/00019 id04478/81Tb6kjlNIk/00019
id00812/NeNXGI8mox8/00158 id00812/NeNXGI8mox8/00158
id04536/sfldoEPrFPI/00438 id04536/sfldoEPrFPI/00438
id07620/aJVbccKJwEw/00327 id07620/aJVbccKJwEw/00327
id02286/4LAIxvdvguc/00001 id02286/4LAIxvdvguc/00001
id07802/BfQUBDw7TiM/00080 id07802/BfQUBDw7TiM/00080
id01066/65k0p7fUBVI/00026 id01066/65k0p7fUBVI/00026
id03862/w97YzyPYm1k/00460 id03862/w97YzyPYm1k/00460
id05816/njFBkJSpUrY/00414 id05816/njFBkJSpUrY/00414
id05124/_Oxp6absIhY/00341 id05124/_Oxp6absIhY/00341
id07663/mUw-kxAavdM/00192 id07663/mUw-kxAavdM/00192
id05999/Ls440srvfR4/00127 id05999/Ls440srvfR4/00127
id02548/Hmlw5PIf64o/00098 id02548/Hmlw5PIf64o/00098
id04276/Pbo_nlcZ0Lc/00190 id04276/Pbo_nlcZ0Lc/00190
id07802/FhKML4dLE60/00115 id07802/FhKML4dLE60/00115
id07621/1L2IUy6gqaM/00012 id07621/1L2IUy6gqaM/00012
id05654/veGIQ7p2ZSk/00130 id05654/veGIQ7p2ZSk/00130
id04094/0z1JYPKGBI8/00007 id04094/0z1JYPKGBI8/00007
id02576/wWUREnOwYo0/00136 id02576/wWUREnOwYo0/00136
id09017/PLNK1g5w4FY/00099 id09017/PLNK1g5w4FY/00099
id06484/USbx34RUkVI/00096 id06484/USbx34RUkVI/00096
id03030/FXbzdRO7t98/00101 id03030/FXbzdRO7t98/00101
id02057/VCXnx-ozS8c/00263 id02057/VCXnx-ozS8c/00263
id02542/JUodrwt9ucI/00033 id02542/JUodrwt9ucI/00033
id03030/DM_Z5D2fkRA/00068 id03030/DM_Z5D2fkRA/00068
id08552/irj3SqKAe0c/00196 id08552/irj3SqKAe0c/00196
id03030/YxBoufnVIMw/00177 id03030/YxBoufnVIMw/00177
id07868/Eaf-dgA59Gs/00061 id07868/Eaf-dgA59Gs/00061
id08456/6xVSlQDr7-w/00031 id08456/6xVSlQDr7-w/00031
id06811/OYFkt_n18hg/00128 id06811/OYFkt_n18hg/00128
id00817/tCnW5E8cMow/00383 id00817/tCnW5E8cMow/00383
id02542/fXQbNcIbcek/00053 id02542/fXQbNcIbcek/00053
id01567/oi2g17EF55s/00377 id01567/oi2g17EF55s/00377
id04366/HsG3OGE22DY/00117 id04366/HsG3OGE22DY/00117
id01509/1y0aWmgYDtw/00006 id01509/1y0aWmgYDtw/00006
id04295/pYfyopS672Y/00213 id04295/pYfyopS672Y/00213
id01989/6JfW9CPAoGY/00006 id01989/6JfW9CPAoGY/00006
id04366/tbcKV-IjZdI/00243 id04366/tbcKV-IjZdI/00243
id01298/UY0fkYSUFrY/00208 id01298/UY0fkYSUFrY/00208
id00817/GAs8WnyFKJM/00120 id00817/GAs8WnyFKJM/00120
id06484/TCp2-XVatIE/00079 id06484/TCp2-XVatIE/00079
id08374/Kf9N5AWprG8/00150 id08374/Kf9N5AWprG8/00150
id01822/QDWgjZqOkvM/00065 id01822/QDWgjZqOkvM/00065
id03030/pTz652Dx_6w/00230 id03030/pTz652Dx_6w/00230
id01460/chrI43l2Nuw/00201 id01460/chrI43l2Nuw/00201
id08374/85f-qB_KJP8/00041 id08374/85f-qB_KJP8/00041
id07961/PoSkUxZ4ags/00172 id07961/PoSkUxZ4ags/00172
id01437/uFPYqotT7tU/00233 id01437/uFPYqotT7tU/00233
id07621/Aan8MoozxII/00095 id07621/Aan8MoozxII/00095
id08456/fWTULQWYVoA/00250 id08456/fWTULQWYVoA/00250
id05055/da7Z8oWhFPY/00351 id05055/da7Z8oWhFPY/00351
id02181/hIvctbfcBx8/00106 id02181/hIvctbfcBx8/00106
id01541/dEmuPb4A7do/00184 id01541/dEmuPb4A7do/00184
id00419/a3Y7pQzcn40/00305 id00419/a3Y7pQzcn40/00305
id07354/dsDxN33xvL0/00262 id07354/dsDxN33xvL0/00262
id04478/MZh3AEgJ9pc/00092 id04478/MZh3AEgJ9pc/00092
id05124/UBUFmICrT-I/00281 id05124/UBUFmICrT-I/00281
id03127/SmGJu-t24hY/00195 id03127/SmGJu-t24hY/00195
id02465/coOp_DnsmEI/00150 id02465/coOp_DnsmEI/00150
id01618/qrOl1aaXBH0/00187 id01618/qrOl1aaXBH0/00187
id03969/WZVnB-m0X9g/00038 id03969/WZVnB-m0X9g/00038
id05202/s0m_4-SCn44/00186 id05202/s0m_4-SCn44/00186
id04657/SYVkfHq-pro/00172 id04657/SYVkfHq-pro/00172
id05176/p2IOP5_s_LM/00093 id05176/p2IOP5_s_LM/00093
id04950/XJS6SLQuCNM/00169 id04950/XJS6SLQuCNM/00169
id02019/anSrwA_9RPE/00152 id02019/anSrwA_9RPE/00152
id04570/Q-faEy1VXxQ/00140 id04570/Q-faEy1VXxQ/00140
id07621/bMvG2mQMZZw/00303 id07621/bMvG2mQMZZw/00303
id06811/vC3yQiWuuOI/00354 id06811/vC3yQiWuuOI/00354
id03839/aWMP8xzq2BE/00292 id03839/aWMP8xzq2BE/00292
id04094/j1ajUkR6_Q4/00326 id04094/j1ajUkR6_Q4/00326
id08149/o0Zdr9Jla7U/00047 id08149/o0Zdr9Jla7U/00047
id00017/hcr4tT9y3xs/00117 id00017/hcr4tT9y3xs/00117
id04950/Cu4jGRmYa4c/00064 id04950/Cu4jGRmYa4c/00064
id01567/TMozlhoPMfI/00223 id01567/TMozlhoPMfI/00223
id08374/QltFme-lqeI/00226 id08374/QltFme-lqeI/00226
id06816/tHor4VN8090/00259 id06816/tHor4VN8090/00259
id07494/xQ0YMPe-9u8/00413 id07494/xQ0YMPe-9u8/00413
id08374/FwR1K1rL3QI/00110 id08374/FwR1K1rL3QI/00110
id06692/Hlahj5abifM/00257 id06692/Hlahj5abifM/00257
id00419/J2LscHjRX7Q/00154 id00419/J2LscHjRX7Q/00154
id02057/CI5-q_qTR5I/00112 id02057/CI5-q_qTR5I/00112
id03862/7IccaH4HXRs/00069 id03862/7IccaH4HXRs/00069
id04656/ar3rKrkbjqI/00257 id04656/ar3rKrkbjqI/00257
id07494/XMEIdqio6ic/00184 id07494/XMEIdqio6ic/00184
id04657/dn4XY5c6mEw/00265 id04657/dn4XY5c6mEw/00265
id04570/SFKt669qIqs/00156 id04570/SFKt669qIqs/00156
id01541/sMDYdAB0MPs/00306 id01541/sMDYdAB0MPs/00306
id08456/F2O-frqyr9c/00101 id08456/F2O-frqyr9c/00101
id08701/_Ysb9mVibbk/00253 id08701/_Ysb9mVibbk/00253
id01333/e4FoER8nqx0/00365 id01333/e4FoER8nqx0/00365
id05124/F0Xpd6OoiDY/00161 id05124/F0Xpd6OoiDY/00161
id01593/AVmZf6Kl1So/00071 id01593/AVmZf6Kl1So/00071
id01567/fOlxxDqdrgc/00299 id01567/fOlxxDqdrgc/00299
id06484/2KVWoftPf2o/00001 id06484/2KVWoftPf2o/00001
id01224/g4jVqkEm1Gs/00274 id01224/g4jVqkEm1Gs/00274
id02445/ZX_6RMrTEP0/00066 id02445/ZX_6RMrTEP0/00066
id04656/5TR-W77XgF4/00032 id04656/5TR-W77XgF4/00032
id01618/F_ExF9xDajc/00060 id01618/F_ExF9xDajc/00060
id08392/gPX4IC53KwI/00355 id08392/gPX4IC53KwI/00355
id00866/pNbDtfW1JW4/00221 id00866/pNbDtfW1JW4/00221
id00812/b3dBqOtzsx0/00276 id00812/b3dBqOtzsx0/00276
id08701/61Al05HARgA/00001 id08701/61Al05HARgA/00001
id07663/FFo4JwVXeUM/00119 id07663/FFo4JwVXeUM/00119
id02057/22zJ50ky7CQ/00013 id02057/22zJ50ky7CQ/00013
id05055/2onVoeSgouI/00028 id05055/2onVoeSgouI/00028
id04006/zvUZFL0NyhM/00260 id04006/zvUZFL0NyhM/00260
id04950/EpOnsaBin0A/00077 id04950/EpOnsaBin0A/00077
id05015/RhBpC9Fc7a4/00154 id05015/RhBpC9Fc7a4/00154
id04656/Z_JFBDW9eZE/00251 id04656/Z_JFBDW9eZE/00251
id01509/2sb83ZBlbJg/00034 id01509/2sb83ZBlbJg/00034
id04030/JbcD0P6KGe0/00036 id04030/JbcD0P6KGe0/00036
id02542/cwgUjse_REU/00040 id02542/cwgUjse_REU/00040
id07620/xFc9X6EXtRM/00478 id07620/xFc9X6EXtRM/00478
id07354/Qrg89rvtZ1k/00217 id07354/Qrg89rvtZ1k/00217
id03839/wSQMEZMxxx4/00461 id03839/wSQMEZMxxx4/00461
id03127/iWeklsXc0H8/00268 id03127/iWeklsXc0H8/00268
id07663/54qlJ2HZ08s/00096 id07663/54qlJ2HZ08s/00096
id07961/Orp8s5aHYc8/00158 id07961/Orp8s5aHYc8/00158
id03347/y_F4aAkN0d8/00417 id03347/y_F4aAkN0d8/00417
id06913/KNDyf594xQg/00056 id06913/KNDyf594xQg/00056
id04366/DIgAc22fq9c/00080 id04366/DIgAc22fq9c/00080
id07396/uJPtbxlXi2c/00187 id07396/uJPtbxlXi2c/00187
id07868/gVspdH-U2XE/00290 id07868/gVspdH-U2XE/00290
id05594/u7qCFBP1nH4/00184 id05594/u7qCFBP1nH4/00184
id01541/mDoT5mpo_2c/00241 id01541/mDoT5mpo_2c/00241
id07354/0y9b8qlM170/00011 id07354/0y9b8qlM170/00011
id01460/DnnphhTlRPE/00075 id01460/DnnphhTlRPE/00075
id02548/1CNhmMmirfA/00009 id02548/1CNhmMmirfA/00009
id03127/k8z6DxdyF9w/00291 id03127/k8z6DxdyF9w/00291
id01437/zLRJ_8_M5Wg/00263 id01437/zLRJ_8_M5Wg/00263
id02576/WnbNQuJzErQ/00086 id02576/WnbNQuJzErQ/00086
id01333/M0UD9g1x18c/00128 id01333/M0UD9g1x18c/00128
id04295/1fSjOItVYVg/00001 id04295/1fSjOItVYVg/00001
id08456/8tt1LbCoU0E/00054 id08456/8tt1LbCoU0E/00054
id07494/r-ToqH_EJNs/00318 id07494/r-ToqH_EJNs/00318
id06816/XBKj9XWlZCw/00123 id06816/XBKj9XWlZCw/00123
id03030/haoNit7a4W0/00201 id03030/haoNit7a4W0/00201
id03839/aeObhOJLQzQ/00293 id03839/aeObhOJLQzQ/00293
id07868/COb1gFHXsBQ/00059 id07868/COb1gFHXsBQ/00059
id01224/eYWcMCsgkLY/00255 id01224/eYWcMCsgkLY/00255
id04006/K5ueXBlS6rc/00049 id04006/K5ueXBlS6rc/00049
id07620/G5-1CUbaz0c/00107 id07620/G5-1CUbaz0c/00107
id06104/cj0TAnwndoc/00230 id06104/cj0TAnwndoc/00230
id00061/STX1ycPt8fU/00076 id00061/STX1ycPt8fU/00076
id04478/wMbobxEQ7j8/00336 id04478/wMbobxEQ7j8/00336
id01106/7X_xtnJhEc0/00031 id01106/7X_xtnJhEc0/00031
id08374/zaYzRbE_2C8/00494 id08374/zaYzRbE_2C8/00494
id04276/MgOqCfwKE70/00173 id04276/MgOqCfwKE70/00173
id03127/Lgd5qn2-kMo/00079 id03127/Lgd5qn2-kMo/00079
id00154/xH3Pp_5yxOk/00153 id00154/xH3Pp_5yxOk/00153
id04030/7mXUMuo5_NE/00001 id04030/7mXUMuo5_NE/00001
id02542/p7bvjcLbZm4/00097 id02542/p7bvjcLbZm4/00097
id04232/T7dROCqmwNQ/00235 id04232/T7dROCqmwNQ/00235
id02548/KrXU-_jrtxY/00147 id02548/KrXU-_jrtxY/00147
id01567/SZyTC5dxJOY/00219 id01567/SZyTC5dxJOY/00219
id03524/2DD4Np7SaWw/00007 id03524/2DD4Np7SaWw/00007
id04094/DRq5F2261Ko/00072 id04094/DRq5F2261Ko/00072
id07802/HrpJg06dowY/00152 id07802/HrpJg06dowY/00152
id06816/pBt-DxsTFc8/00231 id06816/pBt-DxsTFc8/00231
id00154/2pSNL5YdcoQ/00002 id00154/2pSNL5YdcoQ/00002
id01541/C29fUBtimOE/00038 id01541/C29fUBtimOE/00038
id06310/b6qPjJ0isPI/00155 id06310/b6qPjJ0isPI/00155
id05714/wFGNufaMbDY/00025 id05714/wFGNufaMbDY/00025
id03980/m-8Ffv2RqYs/00092 id03980/m-8Ffv2RqYs/00092
id01437/uXAe0vbNWeo/00238 id01437/uXAe0vbNWeo/00238
id04232/tPZ-zVT67gs/00479 id04232/tPZ-zVT67gs/00479
id06811/ImzUwwYU6SQ/00067 id06811/ImzUwwYU6SQ/00067
id05459/wq3Z0I944wU/00436 id05459/wq3Z0I944wU/00436
id03969/Evoldg-U2_c/00024 id03969/Evoldg-U2_c/00024
id08548/BSChFozahbU/00019 id08548/BSChFozahbU/00019
id04950/PQEAck-3wcA/00134 id04950/PQEAck-3wcA/00134
id04295/G4YnExZSzlM/00066 id04295/G4YnExZSzlM/00066
id05176/mc7rFp2B1j0/00092 id05176/mc7rFp2B1j0/00092
id00812/1Xfgvdu7oDo/00001 id00812/1Xfgvdu7oDo/00001
id05459/UPSPGawaVsg/00233 id05459/UPSPGawaVsg/00233
id04656/7nG3rOv0oBw/00050 id04656/7nG3rOv0oBw/00050
id02548/nvYBpt14BrQ/00309 id02548/nvYBpt14BrQ/00309
id02317/A3AvljK8Upk/00102 id02317/A3AvljK8Upk/00102
id04478/qLNvRwMkhik/00242 id04478/qLNvRwMkhik/00242
id01228/lCDMC8JvKyU/00295 id01228/lCDMC8JvKyU/00295
id03041/5CfnYwQCW48/00001 id03041/5CfnYwQCW48/00001
id04950/LnsriCjCIV4/00116 id04950/LnsriCjCIV4/00116
id04094/plxNYSFgDTM/00384 id04094/plxNYSFgDTM/00384
id01460/30_QmGw7lmE/00030 id01460/30_QmGw7lmE/00030
id04366/6rX7hCNSjaw/00056 id04366/6rX7hCNSjaw/00056
id01041/m-xolqIq8p4/00370 id01041/m-xolqIq8p4/00370
id04950/BG4CCg2RiuQ/00052 id04950/BG4CCg2RiuQ/00052
id01989/7g0A7pF94r0/00018 id01989/7g0A7pF94r0/00018
id03382/b_NJ2Xz3G4Y/00030 id03382/b_NJ2Xz3G4Y/00030
id00812/IteHRVKyzaE/00138 id00812/IteHRVKyzaE/00138
id00061/bdkqfVtDZVY/00121 id00061/bdkqfVtDZVY/00121
id03839/YkYIh4cYwwg/00275 id03839/YkYIh4cYwwg/00275
id07354/wyTuCRGjUIQ/00477 id07354/wyTuCRGjUIQ/00477
id02057/TddnW2TaXrc/00246 id02057/TddnW2TaXrc/00246
id01989/gHVHtKTQBsw/00128 id01989/gHVHtKTQBsw/00128
id08374/bXlUHb5hxxA/00266 id08374/bXlUHb5hxxA/00266
id03862/TE2zQc8_W-g/00252 id03862/TE2zQc8_W-g/00252
id08696/86-k8TuowAE/00033 id08696/86-k8TuowAE/00033
id05176/K8yZYHg_4ro/00050 id05176/K8yZYHg_4ro/00050
id04253/SKsPkHMGHYY/00240 id04253/SKsPkHMGHYY/00240
id07874/2KK4ozkjaEE/00002 id07874/2KK4ozkjaEE/00002
id08392/g-SJYYaaLgE/00352 id08392/g-SJYYaaLgE/00352
id02542/glhCf1hwJhE/00065 id02542/glhCf1hwJhE/00065
id00817/FsL-bTbDTyw/00112 id00817/FsL-bTbDTyw/00112
id04862/IuXPj9VhUVA/00100 id04862/IuXPj9VhUVA/00100
id06811/f9-8d3lNNcw/00237 id06811/f9-8d3lNNcw/00237
id04094/JUYMzfVp8zI/00113 id04094/JUYMzfVp8zI/00113
id03347/r-xJUB0A4ok/00346 id03347/r-xJUB0A4ok/00346
id07868/MNibTv_ODQ8/00148 id07868/MNibTv_ODQ8/00148
id08392/3e5zvNaT-eU/00020 id08392/3e5zvNaT-eU/00020
id04295/bKMKvAr440A/00141 id04295/bKMKvAr440A/00141
id04295/l62YPD0ZkZI/00185 id04295/l62YPD0ZkZI/00185
id07312/RO9DsspwXiE/00047 id07312/RO9DsspwXiE/00047
id03030/rmFsUV5ICKk/00267 id03030/rmFsUV5ICKk/00267
id03677/nVWTTopGQdU/00181 id03677/nVWTTopGQdU/00181
id00866/xQ1Yy0kjvjA/00256 id00866/xQ1Yy0kjvjA/00256
id01333/fRnqtJR0rws/00371 id01333/fRnqtJR0rws/00371
id05055/AZoIKG33E8s/00115 id05055/AZoIKG33E8s/00115
id01822/_CkfCmQXII8/00098 id01822/_CkfCmQXII8/00098
id01593/_gyaAyVi6SA/00344 id01593/_gyaAyVi6SA/00344
id04295/DS3RDwf2xI8/00049 id04295/DS3RDwf2xI8/00049
id00812/EjO-VORTv_o/00098 id00812/EjO-VORTv_o/00098
id04657/WdJ_DuU0ack/00236 id04657/WdJ_DuU0ack/00236
id04232/AB9fk1MH2rA/00035 id04232/AB9fk1MH2rA/00035
id00419/chfgCUm9-Mg/00364 id00419/chfgCUm9-Mg/00364
id02577/Az0BGrX_TwI/00021 id02577/Az0BGrX_TwI/00021
id01437/hyj4OYm0cvA/00195 id01437/hyj4OYm0cvA/00195
id01593/tLFWX-IdAwI/00431 id01593/tLFWX-IdAwI/00431
id04536/MNDmkEXRS7s/00312 id04536/MNDmkEXRS7s/00312
id03789/7qhkM8qY3Fw/00077 id03789/7qhkM8qY3Fw/00077
id01593/neAk6K8BvTA/00397 id01593/neAk6K8BvTA/00397
id06484/jTHSVo6NvS4/00151 id06484/jTHSVo6NvS4/00151
id07414/cAudd_5Yv2I/00256 id07414/cAudd_5Yv2I/00256
id00866/ADzqaRZtJNA/00087 id00866/ADzqaRZtJNA/00087
id06484/ZySpn0Aj09k/00108 id06484/ZySpn0Aj09k/00108
id07312/ZHBjHQENqW8/00053 id07312/ZHBjHQENqW8/00053
id04656/LDuq2UPHKoA/00157 id04656/LDuq2UPHKoA/00157
id01509/UZL8Obdt--8/00181 id01509/UZL8Obdt--8/00181
id05816/7jt8zGB27QQ/00017 id05816/7jt8zGB27QQ/00017
id08456/7PKsuBS5LQI/00050 id08456/7PKsuBS5LQI/00050
id06913/Tx0vAZhSPuE/00077 id06913/Tx0vAZhSPuE/00077
id02465/UEmI4r5G-5Y/00117 id02465/UEmI4r5G-5Y/00117
id01460/9sefvU9y4Kw/00046 id01460/9sefvU9y4Kw/00046
id01567/uYDx0vIVy_A/00429 id01567/uYDx0vIVy_A/00429
id07961/qott7SmhA-A/00351 id07961/qott7SmhA-A/00351
id00866/Awi1Q0yib1s/00092 id00866/Awi1Q0yib1s/00092
id02086/CqJKcn8m_Xo/00152 id02086/CqJKcn8m_Xo/00152
id05015/Obbv73CqtmQ/00137 id05015/Obbv73CqtmQ/00137
id01041/1UYZqPpavtk/00001 id01041/1UYZqPpavtk/00001
id01593/GiLxqKSI68o/00188 id01593/GiLxqKSI68o/00188
id02317/IR0psXbOjdc/00176 id02317/IR0psXbOjdc/00176
id01066/X33aJxc3Kt0/00112 id01066/X33aJxc3Kt0/00112
id08456/VU3fkD-QqPw/00206 id08456/VU3fkD-QqPw/00206
id04536/wat5sbCSs0k/00470 id04536/wat5sbCSs0k/00470
id01066/4KOSmyAMipc/00020 id01066/4KOSmyAMipc/00020
id02445/f5u3ktNPHAk/00074 id02445/f5u3ktNPHAk/00074
id03041/NJUcU7j30JI/00011 id03041/NJUcU7j30JI/00011
id00817/vUezvJDh_tA/00394 id00817/vUezvJDh_tA/00394
id04478/sw50KQMY8vw/00298 id04478/sw50KQMY8vw/00298
id04657/hMrgeYf5ToQ/00267 id04657/hMrgeYf5ToQ/00267
id02548/VdjlKRtLD_w/00206 id02548/VdjlKRtLD_w/00206
id06310/4oJF1NW2bIg/00006 id06310/4oJF1NW2bIg/00006
id01509/jqbtAt91alI/00329 id01509/jqbtAt91alI/00329
id07414/oXx9CvIeFFY/00407 id07414/oXx9CvIeFFY/00407
id04570/mwhiZtTZYX0/00271 id04570/mwhiZtTZYX0/00271
id00812/AzDjo0Uyk4Y/00061 id00812/AzDjo0Uyk4Y/00061
id05999/MJwLq17VoMA/00146 id05999/MJwLq17VoMA/00146
id07414/dsqrI97WQHE/00319 id07414/dsqrI97WQHE/00319
id05015/C3KsCD-pUgs/00046 id05015/C3KsCD-pUgs/00046
id06484/Gh6H7Md_L2k/00053 id06484/Gh6H7Md_L2k/00053
id00081/xlwJqdrzeMA/00291 id00081/xlwJqdrzeMA/00291
id05055/RLN5nKfza4A/00219 id05055/RLN5nKfza4A/00219
id05055/OKw_hph-hK8/00197 id05055/OKw_hph-hK8/00197
id03839/xtBkY9xYpjA/00464 id03839/xtBkY9xYpjA/00464
id07620/HEX00yF8LTs/00117 id07620/HEX00yF8LTs/00117
id05816/hjrZgsKuvpw/00349 id05816/hjrZgsKuvpw/00349
id02548/6LPbT49zy38/00050 id02548/6LPbT49zy38/00050
id01000/7eYakM6qrTs/00006 id01000/7eYakM6qrTs/00006
id02181/cNCj0pLxR24/00084 id02181/cNCj0pLxR24/00084
id02086/sSliWvu6Ufs/00453 id02086/sSliWvu6Ufs/00453
id03178/KHelFt1Jyyg/00057 id03178/KHelFt1Jyyg/00057
id05594/8dYcSoUAQO8/00014 id05594/8dYcSoUAQO8/00014
id05015/JmvJemqIeS0/00102 id05015/JmvJemqIeS0/00102
id00081/EvCyt2keqW4/00065 id00081/EvCyt2keqW4/00065
id07663/QWe7IIGrv5s/00146 id07663/QWe7IIGrv5s/00146
id01618/kzxW2WAFWLI/00126 id01618/kzxW2WAFWLI/00126
id00562/X7FJ3M3bz3c/00124 id00562/X7FJ3M3bz3c/00124
id07961/bvPOvzukTE4/00224 id07961/bvPOvzukTE4/00224
id03789/nv8sQplhvX0/00357 id03789/nv8sQplhvX0/00357
id04295/VUHarbuO_eE/00125 id04295/VUHarbuO_eE/00125
id01822/IaBziWYcwK4/00037 id01822/IaBziWYcwK4/00037
id05015/X1opVctkTE8/00170 id05015/X1opVctkTE8/00170
id01041/MMXznNig_iU/00248 id01041/MMXznNig_iU/00248
id02465/EZ_F0hUZdS4/00054 id02465/EZ_F0hUZdS4/00054
id04656/Bi7kCsbg5L0/00061 id04656/Bi7kCsbg5L0/00061
id07494/K4ndWNAHgdU/00093 id07494/K4ndWNAHgdU/00093
id07354/TKTT7fArInQ/00218 id07354/TKTT7fArInQ/00218
id05714/Lu4PPvWXGn8/00014 id05714/Lu4PPvWXGn8/00014
id05654/07pANazoyJg/00001 id05654/07pANazoyJg/00001
id01066/FDp-ZLCWrIc/00054 id01066/FDp-ZLCWrIc/00054
id05999/ZQJVmCJFjNs/00182 id05999/ZQJVmCJFjNs/00182
id04570/5Fg6CLuRntk/00041 id04570/5Fg6CLuRntk/00041
id08696/vqLNqYW4TQA/00476 id08696/vqLNqYW4TQA/00476
id04862/2uYHadPvHRU/00016 id04862/2uYHadPvHRU/00016
id03980/7MRUusImkno/00001 id03980/7MRUusImkno/00001
id02542/QJKFnt1lHeE/00035 id02542/QJKFnt1lHeE/00035
id04536/OYH-6uGB6jI/00322 id04536/OYH-6uGB6jI/00322
id06484/dOTMnYZcY9Q/00126 id06484/dOTMnYZcY9Q/00126
id04478/GZQGZOmFU5U/00063 id04478/GZQGZOmFU5U/00063
id01224/tELp6C7FELU/00421 id01224/tELp6C7FELU/00421
id03862/5m5iPZNJS6c/00022 id03862/5m5iPZNJS6c/00022
id05124/lcDhSnyeN5E/00381 id05124/lcDhSnyeN5E/00381
id08149/3V9V5sDAWTc/00001 id08149/3V9V5sDAWTc/00001
id02181/iEF0MWApQms/00108 id02181/iEF0MWApQms/00108
id04536/xrsxSF2qey8/00471 id04536/xrsxSF2qey8/00471
id03178/9AJzTUwGbRk/00005 id03178/9AJzTUwGbRk/00005
id01041/Izmh75CZNW0/00207 id01041/Izmh75CZNW0/00207
id03041/g5YLpUZBNKc/00018 id03041/g5YLpUZBNKc/00018
id03347/nSAKXYdEOOM/00297 id03347/nSAKXYdEOOM/00297
id03347/pPWGEPixOoM/00337 id03347/pPWGEPixOoM/00337
id07312/XBBpLMEjfUo/00048 id07312/XBBpLMEjfUo/00048
id08456/6QFe7cYnZk4/00023 id08456/6QFe7cYnZk4/00023
id05176/5Hk_hj0oXN8/00004 id05176/5Hk_hj0oXN8/00004
id07426/DBBfi7aKLx4/00038 id07426/DBBfi7aKLx4/00038
id07494/uhPKcTLLwcM/00347 id07494/uhPKcTLLwcM/00347
id02576/agxjz_O2Wfs/00088 id02576/agxjz_O2Wfs/00088
id01541/SvTz_Pn15Vk/00119 id01541/SvTz_Pn15Vk/00119
id07414/Uxggn91FBog/00214 id07414/Uxggn91FBog/00214
id04253/1HOlzefgLu8/00001 id04253/1HOlzefgLu8/00001
id01567/RPUd0ua7RR0/00216 id01567/RPUd0ua7RR0/00216
id04657/5DzZTPLgwTM/00044 id04657/5DzZTPLgwTM/00044
id04006/zSMWS35kYdQ/00253 id04006/zSMWS35kYdQ/00253
id03347/KT7B07WFWyM/00104 id03347/KT7B07WFWyM/00104
id02445/z5u4yO1EsZo/00109 id02445/z5u4yO1EsZo/00109
id00154/z1dLArSg5PQ/00190 id00154/z1dLArSg5PQ/00190
id07414/Cn6Ws4oK1jg/00095 id07414/Cn6Ws4oK1jg/00095
id02286/WHS1n7XUt_8/00103 id02286/WHS1n7XUt_8/00103
id01509/Zmmnr4iRsCM/00230 id01509/Zmmnr4iRsCM/00230
id04276/tGOA4fVnSgw/00448 id04276/tGOA4fVnSgw/00448
id00419/nu9cRW2J4Dk/00420 id00419/nu9cRW2J4Dk/00420
id07868/6RQX9l98N-g/00002 id07868/6RQX9l98N-g/00002
id03839/1lh57VnuaKE/00004 id03839/1lh57VnuaKE/00004
id03178/LT-BNQKA9NU/00075 id03178/LT-BNQKA9NU/00075
id01460/Es6CkRmkIBY/00080 id01460/Es6CkRmkIBY/00080
id06692/T2Xk7MO6m2g/00297 id06692/T2Xk7MO6m2g/00297
id01892/d8b9y_CRE3M/00102 id01892/d8b9y_CRE3M/00102
id07426/K_25cVSB-JU/00063 id07426/K_25cVSB-JU/00063
id01333/LI6eLfuTn6I/00127 id01333/LI6eLfuTn6I/00127
id00081/hIBFutPzn8s/00158 id00081/hIBFutPzn8s/00158
id04536/2j8I_WX5mhY/00009 id04536/2j8I_WX5mhY/00009
id04232/UElg0R7fmlk/00253 id04232/UElg0R7fmlk/00253
id01460/eZR__GGkVw4/00221 id01460/eZR__GGkVw4/00221
id01041/GymfYtTsKEU/00119 id01041/GymfYtTsKEU/00119
id07396/xK1gClL60tY/00191 id07396/xK1gClL60tY/00191
id05459/81o3ictaOnU/00075 id05459/81o3ictaOnU/00075
id02685/yN8ilDTW-o4/00114 id02685/yN8ilDTW-o4/00114
id02286/c8LjgwDQAkw/00137 id02286/c8LjgwDQAkw/00137
id01541/SWcGs-DbV9Q/00100 id01541/SWcGs-DbV9Q/00100
id01822/x4Fr2ceg_f8/00231 id01822/x4Fr2ceg_f8/00231
id03347/FKY5V8wmX5k/00043 id03347/FKY5V8wmX5k/00043
id00817/0GmSijZelGY/00001 id00817/0GmSijZelGY/00001
id06209/ahL3F1x5sE4/00091 id06209/ahL3F1x5sE4/00091
id06692/4k3Eo5s1Rwo/00057 id06692/4k3Eo5s1Rwo/00057
id09017/sduESYpj2-I/00297 id09017/sduESYpj2-I/00297
id07354/grg37qaxKjI/00329 id07354/grg37qaxKjI/00329
id07802/X8I5FN64_Oc/00199 id07802/X8I5FN64_Oc/00199
id07494/JV5S_SUcHmI/00088 id07494/JV5S_SUcHmI/00088
id03524/eHrI5bD8hSs/00282 id03524/eHrI5bD8hSs/00282
id01460/HNjuGz9ayBk/00109 id01460/HNjuGz9ayBk/00109
id04570/961AefP1-is/00056 id04570/961AefP1-is/00056
id00419/749eTxP4Us8/00061 id00419/749eTxP4Us8/00061
id00017/OLguY5ofUrY/00039 id00017/OLguY5ofUrY/00039
id08392/RogKVSjaAH0/00293 id08392/RogKVSjaAH0/00293
id01066/lI1wGa1UhEM/00205 id01066/lI1wGa1UhEM/00205
id07621/zSdriAuJUKo/00485 id07621/zSdriAuJUKo/00485
id03862/JBkaiUNeMmk/00166 id03862/JBkaiUNeMmk/00166
id00017/E6aqL_Nc410/00027 id00017/E6aqL_Nc410/00027
id03839/fi-g--cBwnU/00348 id03839/fi-g--cBwnU/00348
id05654/eLztZmvnk-k/00095 id05654/eLztZmvnk-k/00095
id02548/wF5HfFXZCBI/00349 id02548/wF5HfFXZCBI/00349
id02576/LAipS5WJ29s/00075 id02576/LAipS5WJ29s/00075
id06692/SEPs17_AkTI/00295 id06692/SEPs17_AkTI/00295
id05459/kkaYxtBZnNo/00348 id05459/kkaYxtBZnNo/00348
id04232/MEGVEqgGCME/00167 id04232/MEGVEqgGCME/00167
id01989/8CUktsB_2bA/00031 id01989/8CUktsB_2bA/00031
id01066/kqP_NZ1FRlM/00176 id01066/kqP_NZ1FRlM/00176
id03382/ockh8KdXJP8/00059 id03382/ockh8KdXJP8/00059
id01593/pO180haP_vo/00410 id01593/pO180haP_vo/00410
id07396/nTQDZrnGXXY/00179 id07396/nTQDZrnGXXY/00179
id03030/rg-VUeksKaU/00257 id03030/rg-VUeksKaU/00257
id08911/IddDkZwRflE/00053 id08911/IddDkZwRflE/00053
id02317/K2GT02zavxo/00193 id02317/K2GT02zavxo/00193
id01298/5P4ldDRuo5c/00065 id01298/5P4ldDRuo5c/00065
id01989/Evbf6fMJNmk/00060 id01989/Evbf6fMJNmk/00060
id05124/fNJI2A0v8yI/00357 id05124/fNJI2A0v8yI/00357
id02465/RLi2ItGherA/00098 id02465/RLi2ItGherA/00098
id07868/qMNfMcG6sh0/00346 id07868/qMNfMcG6sh0/00346
id04366/tmoYV4kPOGU/00246 id04366/tmoYV4kPOGU/00246
id06484/_ZkoebnFkVA/00110 id06484/_ZkoebnFkVA/00110
id04276/I9gCyrZWFn0/00097 id04276/I9gCyrZWFn0/00097
id03978/IMn6f0iDOtE/00032 id03978/IMn6f0iDOtE/00032
id00419/w_0sK8WuSsg/00472 id00419/w_0sK8WuSsg/00472
id04478/RwcHXQ3MvsQ/00109 id04478/RwcHXQ3MvsQ/00109
id08696/cUmyIjpOYlY/00360 id08696/cUmyIjpOYlY/00360
id04366/DqBQx6AZ1Nk/00083 id04366/DqBQx6AZ1Nk/00083
id05459/RhOon49C3g8/00201 id05459/RhOon49C3g8/00201
id04656/OzgjshkHUiA/00166 id04656/OzgjshkHUiA/00166
id03969/x38Sqv819yE/00110 id03969/x38Sqv819yE/00110
id00061/0G9G9oyFHI8/00001 id00061/0G9G9oyFHI8/00001
id06913/IreNhnVfTkQ/00043 id06913/IreNhnVfTkQ/00043
id01618/NqYUgbuImpk/00096 id01618/NqYUgbuImpk/00096
id08552/y05_B9NXizo/00237 id08552/y05_B9NXizo/00237
id01460/zcTt06bjKuA/00365 id01460/zcTt06bjKuA/00365
id00866/nI-zVYcQX40/00220 id00866/nI-zVYcQX40/00220
id08374/9eMfNJiKBPQ/00056 id08374/9eMfNJiKBPQ/00056
id03524/nKxz0LxKZ58/00344 id03524/nKxz0LxKZ58/00344
id09017/A3CAugN2cjk/00021 id09017/A3CAugN2cjk/00021
id02685/NtHmnSLaGCA/00036 id02685/NtHmnSLaGCA/00036
id01224/atjwjz0vAk8/00213 id01224/atjwjz0vAk8/00213
id07961/gvLf2DggTu0/00271 id07961/gvLf2DggTu0/00271
id01567/CCs8rZLCdVw/00043 id01567/CCs8rZLCdVw/00043
id03347/nbmPriSE9NY/00316 id03347/nbmPriSE9NY/00316
id06104/snzG1OymFgs/00273 id06104/snzG1OymFgs/00273
id02019/xsXm-MSuD-E/00290 id02019/xsXm-MSuD-E/00290
id00061/VugwXDj1ka4/00088 id00061/VugwXDj1ka4/00088
id01224/4z68GFZuYKU/00028 id01224/4z68GFZuYKU/00028
id03839/ajkGXKUvTWY/00296 id03839/ajkGXKUvTWY/00296
id07874/N7fMpS_yaF4/00047 id07874/N7fMpS_yaF4/00047
id05124/fRhAX7v_R6A/00365 id05124/fRhAX7v_R6A/00365
id02181/ci_22Oqhwtc/00088 id02181/ci_22Oqhwtc/00088
id07414/njxmqS9ncTA/00399 id07414/njxmqS9ncTA/00399
id05176/yEMRxKA0vSw/00101 id05176/yEMRxKA0vSw/00101
id03862/VVaxYHNmtA8/00269 id03862/VVaxYHNmtA8/00269
id07396/X6KkvYh6rPA/00148 id07396/X6KkvYh6rPA/00148
id06310/TkxTnoic67U/00130 id06310/TkxTnoic67U/00130
id08374/Yh9O9ETuF_0/00250 id08374/Yh9O9ETuF_0/00250
id02317/5moKZXlJTEs/00058 id02317/5moKZXlJTEs/00058
id04536/EDCwhtRFARA/00172 id04536/EDCwhtRFARA/00172
id03789/pz1jGMsPY9M/00381 id03789/pz1jGMsPY9M/00381
id03127/wzS06bKAZ48/00354 id03127/wzS06bKAZ48/00354
id08911/wedpC4fN4YY/00096 id08911/wedpC4fN4YY/00096
id01106/6SFpvp42pMA/00014 id01106/6SFpvp42pMA/00014
id02465/6jp5YsZYtHI/00021 id02465/6jp5YsZYtHI/00021
id01618/Ay_BKx5-JOc/00046 id01618/Ay_BKx5-JOc/00046
id04478/x07vvSVm2Yo/00363 id04478/x07vvSVm2Yo/00363
id01593/u5AgUWl3fFU/00437 id01593/u5AgUWl3fFU/00437
id03030/IpwcoJajjJI/00124 id03030/IpwcoJajjJI/00124
id01593/t9TUbyp3xfs/00423 id01593/t9TUbyp3xfs/00423
id07414/hUxcsEMKssA/00320 id07414/hUxcsEMKssA/00320
id04366/L-56A5RNeWg/00124 id04366/L-56A5RNeWg/00124
id07961/3EPjXGhfst4/00001 id07961/3EPjXGhfst4/00001
id00061/mMOd25Ag7XY/00239 id00061/mMOd25Ag7XY/00239
id01567/RQMG0K5AchU/00218 id01567/RQMG0K5AchU/00218
id08552/PL5vk3XeKRM/00114 id08552/PL5vk3XeKRM/00114
id04862/eX3wAZ0yr7w/00260 id04862/eX3wAZ0yr7w/00260
id02086/CBNOvx4Phxw/00146 id02086/CBNOvx4Phxw/00146
id01228/3wAkCYQR3fQ/00011 id01228/3wAkCYQR3fQ/00011
id06484/MXwPpo1Dg7U/00073 id06484/MXwPpo1Dg7U/00073
id01460/9fJy9zGdESI/00045 id01460/9fJy9zGdESI/00045
dataset/filelists/voxceleb2_test_n_500_seed_797_cross.txt
ADDED
|
@@ -0,0 +1,500 @@
id05459/18XmQEiGLnQ/00001 id07961/3EPjXGhfst4/00001
id03980/7MRUusImkno/00001 id08696/0H1PxInJCK0/00001
id05654/07pANazoyJg/00001 id04570/0YMGn6BI9rg/00001
id00817/0GmSijZelGY/00001 id07354/0NjekFZqaY0/00001
id05202/2gnLcAbAoSc/00001 id00817/0GmSijZelGY/00001
id03041/5CfnYwQCW48/00001 id07354/0NjekFZqaY0/00001
id03980/7MRUusImkno/00001 id07621/0CiFdFegqZM/00001
id05850/B8kp8ed48JE/00001 id04253/1HOlzefgLu8/00001
id01298/2K5F6xG-Rbs/00001 id05816/1dyCBbJ94iw/00001
id07494/0P1wPmgz0Bk/00001 id07621/0CiFdFegqZM/00001
id06913/4Ug7aJemzpg/00001 id04030/7mXUMuo5_NE/00001
id02286/4LAIxvdvguc/00001 id05850/B8kp8ed48JE/00001
id02548/0pAkJZmlFqc/00001 id05459/18XmQEiGLnQ/00001
id08456/29EhSZDqzas/00001 id04295/1fSjOItVYVg/00001
id04295/1fSjOItVYVg/00001 id02685/4JDRxqYC0a4/00001
id04276/5M8NmCwTHZ0/00001 id05654/07pANazoyJg/00001
id03030/5wOxV1wAgqA/00001 id03041/5CfnYwQCW48/00001
id04656/1tZYt8jey54/00001 id07961/3EPjXGhfst4/00001
id03980/7MRUusImkno/00001 id04536/0f_Yi_1CoeM/00001
id03980/7MRUusImkno/00001 id05202/2gnLcAbAoSc/00001
id01298/2K5F6xG-Rbs/00001 id04862/0zJh2FMTaDE/00001
id02548/0pAkJZmlFqc/00001 id04478/2grMtwdG93I/00001
id02685/4JDRxqYC0a4/00001 id01892/3vKPgjwFjbo/00001
id07494/0P1wPmgz0Bk/00001 id04656/1tZYt8jey54/00001
id00812/1Xfgvdu7oDo/00001 id00926/2Nd7f1yNQzE/00001
id07426/1KNFfOFEhyI/00001 id03030/5wOxV1wAgqA/00001
id00866/03SSllwNkGk/00001 id00812/1Xfgvdu7oDo/00001
id04570/0YMGn6BI9rg/00001 id01892/3vKPgjwFjbo/00001
id03041/5CfnYwQCW48/00001 id04030/7mXUMuo5_NE/00001
id07494/0P1wPmgz0Bk/00001 id00081/2xYrsnvtUWc/00001
id08392/0fwuibKviJU/00001 id05015/0Cu3AvWWOFI/00001
id06692/2ptBBNIZXtI/00001 id04536/0f_Yi_1CoeM/00001
id04253/1HOlzefgLu8/00001 id06104/02L1L9RFAgI/00001
id02725/37kUrf6RJdw/00001 id02685/4JDRxqYC0a4/00001
id04006/113VkmVVz1Q/00001 id04119/1uH67UruKlE/00001
id01567/1Lx_ZqrK1bM/00001 id04030/7mXUMuo5_NE/00001
id02445/3Rnk8eja3TU/00001 id05816/1dyCBbJ94iw/00001
id03041/5CfnYwQCW48/00001 id03347/4xXZ75_TeSM/00001
id04570/0YMGn6BI9rg/00001 id02317/0q4X8kPTlEY/00001
id07426/1KNFfOFEhyI/00001 id01822/0QcHowaLAF0/00001
id02577/0euHS_r5JH4/00001 id02725/37kUrf6RJdw/00001
id07354/0NjekFZqaY0/00001 id05459/18XmQEiGLnQ/00001
id06692/2ptBBNIZXtI/00001 id05850/B8kp8ed48JE/00001
id01822/0QcHowaLAF0/00001 id07961/3EPjXGhfst4/00001
id04366/0iG2Ub9zETM/00001 id03347/4xXZ75_TeSM/00001
id03030/5wOxV1wAgqA/00001 id03789/0kdVSujPa9g/00001
id04366/0iG2Ub9zETM/00001 id02286/4LAIxvdvguc/00001
id00926/2Nd7f1yNQzE/00001 id02548/0pAkJZmlFqc/00001
id03030/5wOxV1wAgqA/00001 id01298/2K5F6xG-Rbs/00001
id01892/3vKPgjwFjbo/00001 id02317/0q4X8kPTlEY/00001
id05202/2gnLcAbAoSc/00001 id04253/1HOlzefgLu8/00001
id05714/2gvpaZcvAY4/00001 id06692/2ptBBNIZXtI/00001
id07621/0CiFdFegqZM/00001 id07802/0RUpqvi3sPU/00001
id03030/5wOxV1wAgqA/00001 id02317/0q4X8kPTlEY/00001
id01822/0QcHowaLAF0/00001 id02445/3Rnk8eja3TU/00001
id07961/3EPjXGhfst4/00001 id05459/18XmQEiGLnQ/00001
id04950/2n4sGPqU9M8/00001 id07426/1KNFfOFEhyI/00001
id04862/0zJh2FMTaDE/00001 id02465/0Ocu8l1eAng/00001
id06104/02L1L9RFAgI/00001 id07312/0LWllHGohPY/00001
id07414/110UMQovTR0/00001 id06692/2ptBBNIZXtI/00001
id05015/0Cu3AvWWOFI/00001 id08696/0H1PxInJCK0/00001
id02181/02gIO4WrZLY/00001 id00812/1Xfgvdu7oDo/00001
id08392/0fwuibKviJU/00001 id01041/1UYZqPpavtk/00001
id03347/4xXZ75_TeSM/00001 id04950/2n4sGPqU9M8/00001
id07312/0LWllHGohPY/00001 id04950/2n4sGPqU9M8/00001
id05202/2gnLcAbAoSc/00001 id05654/07pANazoyJg/00001
id01041/1UYZqPpavtk/00001 id02317/0q4X8kPTlEY/00001
id02057/0xZU7Oi9nvM/00001 id03178/2CT-6fnBC_o/00001
id04006/113VkmVVz1Q/00001 id00817/0GmSijZelGY/00001
id05850/B8kp8ed48JE/00001 id01892/3vKPgjwFjbo/00001
id08696/0H1PxInJCK0/00001 id06692/2ptBBNIZXtI/00001
id02057/0xZU7Oi9nvM/00001 id01541/2P7hzPq5iDw/00001
id04006/113VkmVVz1Q/00001 id02057/0xZU7Oi9nvM/00001
id04276/5M8NmCwTHZ0/00001 id04570/0YMGn6BI9rg/00001
id07868/5YYJq3fSbH8/00001 id03030/5wOxV1wAgqA/00001
id00812/1Xfgvdu7oDo/00001 id00154/0hjW3eTGAy8/00001
id06692/2ptBBNIZXtI/00001 id05594/0ohBiepcHWI/00001
id04536/0f_Yi_1CoeM/00001 id05202/2gnLcAbAoSc/00001
id06310/1IAgr_CRnuE/00001 id05816/1dyCBbJ94iw/00001
id01541/2P7hzPq5iDw/00001 id00419/1zffAxBod_c/00001
id07354/0NjekFZqaY0/00001 id00866/03SSllwNkGk/00001
id03347/4xXZ75_TeSM/00001 id02577/0euHS_r5JH4/00001
id04119/1uH67UruKlE/00001 id04006/113VkmVVz1Q/00001
id05714/2gvpaZcvAY4/00001 id07961/3EPjXGhfst4/00001
id06104/02L1L9RFAgI/00001 id03178/2CT-6fnBC_o/00001
id07354/0NjekFZqaY0/00001 id02445/3Rnk8eja3TU/00001
id04030/7mXUMuo5_NE/00001 id03030/5wOxV1wAgqA/00001
id07312/0LWllHGohPY/00001 id04536/0f_Yi_1CoeM/00001
id03839/1jWHvl2qCq0/00001 id07802/0RUpqvi3sPU/00001
id07621/0CiFdFegqZM/00001 id05816/1dyCBbJ94iw/00001
id03839/1jWHvl2qCq0/00001 id03980/7MRUusImkno/00001
id03030/5wOxV1wAgqA/00001 id02445/3Rnk8eja3TU/00001
id03862/0w8W8jp7MJk/00001 id04253/1HOlzefgLu8/00001
id05714/2gvpaZcvAY4/00001 id04119/1uH67UruKlE/00001
id08392/0fwuibKviJU/00001 id07868/5YYJq3fSbH8/00001
id01298/2K5F6xG-Rbs/00001 id03030/5wOxV1wAgqA/00001
id05459/18XmQEiGLnQ/00001 id00817/0GmSijZelGY/00001
id05850/B8kp8ed48JE/00001 id06692/2ptBBNIZXtI/00001
id04295/1fSjOItVYVg/00001 id08456/29EhSZDqzas/00001
id04570/0YMGn6BI9rg/00001 id02057/0xZU7Oi9nvM/00001
id01541/2P7hzPq5iDw/00001 id00817/0GmSijZelGY/00001
id07426/1KNFfOFEhyI/00001 id07354/0NjekFZqaY0/00001
id04253/1HOlzefgLu8/00001 id06209/2zM9EAPsZZQ/00001
id05850/B8kp8ed48JE/00001 id08392/0fwuibKviJU/00001
id07802/0RUpqvi3sPU/00001 id02465/0Ocu8l1eAng/00001
id04119/1uH67UruKlE/00001 id04862/0zJh2FMTaDE/00001
id01541/2P7hzPq5iDw/00001 id08696/0H1PxInJCK0/00001
id08696/0H1PxInJCK0/00001 id07802/0RUpqvi3sPU/00001
id01228/2TIFacjgehY/00001 id07621/0CiFdFegqZM/00001
id03178/2CT-6fnBC_o/00001 id07868/5YYJq3fSbH8/00001
id05654/07pANazoyJg/00001 id01298/2K5F6xG-Rbs/00001
id01822/0QcHowaLAF0/00001 id02548/0pAkJZmlFqc/00001
id01618/0iFlmfmWVlY/00001 id08696/0H1PxInJCK0/00001
id00812/1Xfgvdu7oDo/00001 id08456/29EhSZDqzas/00001
id05594/0ohBiepcHWI/00001 id07312/0LWllHGohPY/00001
id05714/2gvpaZcvAY4/00001 id06104/02L1L9RFAgI/00001
id02445/3Rnk8eja3TU/00001 id07426/1KNFfOFEhyI/00001
id05714/2gvpaZcvAY4/00001 id00817/0GmSijZelGY/00001
id08696/0H1PxInJCK0/00001 id02317/0q4X8kPTlEY/00001
id04950/2n4sGPqU9M8/00001 id04478/2grMtwdG93I/00001
id01228/2TIFacjgehY/00001 id07414/110UMQovTR0/00001
id00926/2Nd7f1yNQzE/00001 id01541/2P7hzPq5iDw/00001
id05714/2gvpaZcvAY4/00001 id06913/4Ug7aJemzpg/00001
id01228/2TIFacjgehY/00001 id03862/0w8W8jp7MJk/00001
id03030/5wOxV1wAgqA/00001 id05015/0Cu3AvWWOFI/00001
id02548/0pAkJZmlFqc/00001 id06692/2ptBBNIZXtI/00001
id05202/2gnLcAbAoSc/00001 id04119/1uH67UruKlE/00001
id04656/1tZYt8jey54/00001 id07426/1KNFfOFEhyI/00001
id07312/0LWllHGohPY/00001 id03980/7MRUusImkno/00001
id04366/0iG2Ub9zETM/00001 id00817/0GmSijZelGY/00001
id07961/3EPjXGhfst4/00001 id01228/2TIFacjgehY/00001
id00154/0hjW3eTGAy8/00001 id04295/1fSjOItVYVg/00001
id04478/2grMtwdG93I/00001 id00154/0hjW3eTGAy8/00001
id04570/0YMGn6BI9rg/00001 id05202/2gnLcAbAoSc/00001
id04478/2grMtwdG93I/00001 id06913/4Ug7aJemzpg/00001
id06104/02L1L9RFAgI/00001 id04295/1fSjOItVYVg/00001
id05816/1dyCBbJ94iw/00001 id08392/0fwuibKviJU/00001
id00926/2Nd7f1yNQzE/00001 id04536/0f_Yi_1CoeM/00001
id00926/2Nd7f1yNQzE/00001 id02181/02gIO4WrZLY/00001
id05459/18XmQEiGLnQ/00001 id02317/0q4X8kPTlEY/00001
id05594/0ohBiepcHWI/00001 id01228/2TIFacjgehY/00001
id02181/02gIO4WrZLY/00001 id07312/0LWllHGohPY/00001
id00154/0hjW3eTGAy8/00001 id03839/1jWHvl2qCq0/00001
id04030/7mXUMuo5_NE/00001 id02725/37kUrf6RJdw/00001
id04295/1fSjOItVYVg/00001 id05714/2gvpaZcvAY4/00001
id02548/0pAkJZmlFqc/00001 id04570/0YMGn6BI9rg/00001
id04478/2grMtwdG93I/00001 id00866/03SSllwNkGk/00001
id03030/5wOxV1wAgqA/00001 id04366/0iG2Ub9zETM/00001
id02685/4JDRxqYC0a4/00001 id07426/1KNFfOFEhyI/00001
id07802/0RUpqvi3sPU/00001 id07312/0LWllHGohPY/00001
id02317/0q4X8kPTlEY/00001 id01892/3vKPgjwFjbo/00001
id00154/0hjW3eTGAy8/00001 id00866/03SSllwNkGk/00001
id02181/02gIO4WrZLY/00001 id02685/4JDRxqYC0a4/00001
id03178/2CT-6fnBC_o/00001 id05459/18XmQEiGLnQ/00001
id00926/2Nd7f1yNQzE/00001 id05202/2gnLcAbAoSc/00001
id03041/5CfnYwQCW48/00001 id03178/2CT-6fnBC_o/00001
id05850/B8kp8ed48JE/00001 id04006/113VkmVVz1Q/00001
id01822/0QcHowaLAF0/00001 id04570/0YMGn6BI9rg/00001
id04478/2grMtwdG93I/00001 id03839/1jWHvl2qCq0/00001
id01298/2K5F6xG-Rbs/00001 id01228/2TIFacjgehY/00001
id06310/1IAgr_CRnuE/00001 id04006/113VkmVVz1Q/00001
id00154/0hjW3eTGAy8/00001 id04006/113VkmVVz1Q/00001
id05816/1dyCBbJ94iw/00001 id01041/1UYZqPpavtk/00001
id04570/0YMGn6BI9rg/00001 id04862/0zJh2FMTaDE/00001
id06913/4Ug7aJemzpg/00001 id04862/0zJh2FMTaDE/00001
id03862/0w8W8jp7MJk/00001 id02465/0Ocu8l1eAng/00001
id04253/1HOlzefgLu8/00001 id01567/1Lx_ZqrK1bM/00001
id06209/2zM9EAPsZZQ/00001 id01298/2K5F6xG-Rbs/00001
id01822/0QcHowaLAF0/00001 id01541/2P7hzPq5iDw/00001
id07312/0LWllHGohPY/00001 id02317/0q4X8kPTlEY/00001
id06692/2ptBBNIZXtI/00001 id02445/3Rnk8eja3TU/00001
id07414/110UMQovTR0/00001 id00154/0hjW3eTGAy8/00001
id04478/2grMtwdG93I/00001 id03347/4xXZ75_TeSM/00001
id04656/1tZYt8jey54/00001 id07802/0RUpqvi3sPU/00001
id03839/1jWHvl2qCq0/00001 id06310/1IAgr_CRnuE/00001
id02057/0xZU7Oi9nvM/00001 id01228/2TIFacjgehY/00001
id00081/2xYrsnvtUWc/00001 id02057/0xZU7Oi9nvM/00001
id03862/0w8W8jp7MJk/00001 id01892/3vKPgjwFjbo/00001
id04570/0YMGn6BI9rg/00001 id06913/4Ug7aJemzpg/00001
id08392/0fwuibKviJU/00001 id01567/1Lx_ZqrK1bM/00001
id00081/2xYrsnvtUWc/00001 id07494/0P1wPmgz0Bk/00001
id04536/0f_Yi_1CoeM/00001 id00081/2xYrsnvtUWc/00001
id03839/1jWHvl2qCq0/00001 id05850/B8kp8ed48JE/00001
id07621/0CiFdFegqZM/00001 id08456/29EhSZDqzas/00001
id01822/0QcHowaLAF0/00001 id07868/5YYJq3fSbH8/00001
id05202/2gnLcAbAoSc/00001 id03178/2CT-6fnBC_o/00001
id06692/2ptBBNIZXtI/00001 id06913/4Ug7aJemzpg/00001
id01041/1UYZqPpavtk/00001 id03030/5wOxV1wAgqA/00001
id07426/1KNFfOFEhyI/00001 id08456/29EhSZDqzas/00001
id04478/2grMtwdG93I/00001 id02548/0pAkJZmlFqc/00001
id08392/0fwuibKviJU/00001 id01298/2K5F6xG-Rbs/00001
id03041/5CfnYwQCW48/00001 id08696/0H1PxInJCK0/00001
id04366/0iG2Ub9zETM/00001 id07426/1KNFfOFEhyI/00001
id04950/2n4sGPqU9M8/00001 id07494/0P1wPmgz0Bk/00001
id01822/0QcHowaLAF0/00001 id08392/0fwuibKviJU/00001
id02577/0euHS_r5JH4/00001 id06692/2ptBBNIZXtI/00001
id04570/0YMGn6BI9rg/00001 id00866/03SSllwNkGk/00001
id05850/B8kp8ed48JE/00001 id08456/29EhSZDqzas/00001
id01618/0iFlmfmWVlY/00001 id01041/1UYZqPpavtk/00001
id07414/110UMQovTR0/00001 id04536/0f_Yi_1CoeM/00001
id02057/0xZU7Oi9nvM/00001 id06913/4Ug7aJemzpg/00001
id04536/0f_Yi_1CoeM/00001 id01041/1UYZqPpavtk/00001
id04030/7mXUMuo5_NE/00001 id05850/B8kp8ed48JE/00001
id04656/1tZYt8jey54/00001 id05459/18XmQEiGLnQ/00001
id03789/0kdVSujPa9g/00001 id02057/0xZU7Oi9nvM/00001
id01041/1UYZqPpavtk/00001 id05594/0ohBiepcHWI/00001
id07494/0P1wPmgz0Bk/00001 id04006/113VkmVVz1Q/00001
id00812/1Xfgvdu7oDo/00001 id04295/1fSjOItVYVg/00001
id01541/2P7hzPq5iDw/00001 id02465/0Ocu8l1eAng/00001
id04862/0zJh2FMTaDE/00001 id05594/0ohBiepcHWI/00001
id05714/2gvpaZcvAY4/00001 id02286/4LAIxvdvguc/00001
id06209/2zM9EAPsZZQ/00001 id05816/1dyCBbJ94iw/00001
id05850/B8kp8ed48JE/00001 id00866/03SSllwNkGk/00001
id07494/0P1wPmgz0Bk/00001 id07312/0LWllHGohPY/00001
id04366/0iG2Ub9zETM/00001 id04570/0YMGn6BI9rg/00001
id00866/03SSllwNkGk/00001 id03347/4xXZ75_TeSM/00001
id02445/3Rnk8eja3TU/00001 id07802/0RUpqvi3sPU/00001
id08696/0H1PxInJCK0/00001 id06209/2zM9EAPsZZQ/00001
id02445/3Rnk8eja3TU/00001 id07621/0CiFdFegqZM/00001
id08392/0fwuibKviJU/00001 id05850/B8kp8ed48JE/00001
id00419/1zffAxBod_c/00001 id01228/2TIFacjgehY/00001
id07354/0NjekFZqaY0/00001 id01041/1UYZqPpavtk/00001
id04570/0YMGn6BI9rg/00001 id03347/4xXZ75_TeSM/00001
id01892/3vKPgjwFjbo/00001 id02445/3Rnk8eja3TU/00001
id00081/2xYrsnvtUWc/00001 id05459/18XmQEiGLnQ/00001
id06104/02L1L9RFAgI/00001 id04570/0YMGn6BI9rg/00001
id07961/3EPjXGhfst4/00001 id05654/07pANazoyJg/00001
id00926/2Nd7f1yNQzE/00001 id03839/1jWHvl2qCq0/00001
id02181/02gIO4WrZLY/00001 id08696/0H1PxInJCK0/00001
id07426/1KNFfOFEhyI/00001 id05459/18XmQEiGLnQ/00001
id03041/5CfnYwQCW48/00001 id06104/02L1L9RFAgI/00001
id01298/2K5F6xG-Rbs/00001 id01541/2P7hzPq5iDw/00001
id04570/0YMGn6BI9rg/00001 id01618/0iFlmfmWVlY/00001
id02685/4JDRxqYC0a4/00001 id02548/0pAkJZmlFqc/00001
id01822/0QcHowaLAF0/00001 id07426/1KNFfOFEhyI/00001
id07868/5YYJq3fSbH8/00001 id07494/0P1wPmgz0Bk/00001
id07802/0RUpqvi3sPU/00001 id03041/5CfnYwQCW48/00001
id04656/1tZYt8jey54/00001 id01541/2P7hzPq5iDw/00001
id03347/4xXZ75_TeSM/00001 id02445/3Rnk8eja3TU/00001
id02548/0pAkJZmlFqc/00001 id01298/2K5F6xG-Rbs/00001
id07354/0NjekFZqaY0/00001 id07426/1KNFfOFEhyI/00001
id03862/0w8W8jp7MJk/00001 id01298/2K5F6xG-Rbs/00001
id04536/0f_Yi_1CoeM/00001 id02465/0Ocu8l1eAng/00001
id00081/2xYrsnvtUWc/00001 id04366/0iG2Ub9zETM/00001
id04950/2n4sGPqU9M8/00001 id01822/0QcHowaLAF0/00001
id06692/2ptBBNIZXtI/00001 id03030/5wOxV1wAgqA/00001
id07312/0LWllHGohPY/00001 id04478/2grMtwdG93I/00001
id03862/0w8W8jp7MJk/00001 id03030/5wOxV1wAgqA/00001
id00081/2xYrsnvtUWc/00001 id08392/0fwuibKviJU/00001
id02317/0q4X8kPTlEY/00001 id00154/0hjW3eTGAy8/00001
id05594/0ohBiepcHWI/00001 id04536/0f_Yi_1CoeM/00001
id07868/5YYJq3fSbH8/00001 id03839/1jWHvl2qCq0/00001
id02577/0euHS_r5JH4/00001 id06913/4Ug7aJemzpg/00001
id08456/29EhSZDqzas/00001 id01541/2P7hzPq5iDw/00001
id01567/1Lx_ZqrK1bM/00001 id04119/1uH67UruKlE/00001
id04253/1HOlzefgLu8/00001 id01228/2TIFacjgehY/00001
id02445/3Rnk8eja3TU/00001 id02685/4JDRxqYC0a4/00001
id05015/0Cu3AvWWOFI/00001 id02465/0Ocu8l1eAng/00001
id07494/0P1wPmgz0Bk/00001 id05714/2gvpaZcvAY4/00001
id02548/0pAkJZmlFqc/00001 id04006/113VkmVVz1Q/00001
id00866/03SSllwNkGk/00001 id02317/0q4X8kPTlEY/00001
id07354/0NjekFZqaY0/00001 id04253/1HOlzefgLu8/00001
id00812/1Xfgvdu7oDo/00001 id03030/5wOxV1wAgqA/00001
id02465/0Ocu8l1eAng/00001 id07354/0NjekFZqaY0/00001
id04276/5M8NmCwTHZ0/00001 id03862/0w8W8jp7MJk/00001
id01567/1Lx_ZqrK1bM/00001 id04253/1HOlzefgLu8/00001
id01618/0iFlmfmWVlY/00001 id06913/4Ug7aJemzpg/00001
id03862/0w8W8jp7MJk/00001 id08392/0fwuibKviJU/00001
id07961/3EPjXGhfst4/00001 id00154/0hjW3eTGAy8/00001
id02577/0euHS_r5JH4/00001 id01228/2TIFacjgehY/00001
id05654/07pANazoyJg/00001 id03041/5CfnYwQCW48/00001
id03980/7MRUusImkno/00001 id08392/0fwuibKviJU/00001
id03178/2CT-6fnBC_o/00001 id04295/1fSjOItVYVg/00001
id02317/0q4X8kPTlEY/00001 id03347/4xXZ75_TeSM/00001
id02548/0pAkJZmlFqc/00001 id07426/1KNFfOFEhyI/00001
id03839/1jWHvl2qCq0/00001 id05654/07pANazoyJg/00001
id02548/0pAkJZmlFqc/00001 id07868/5YYJq3fSbH8/00001
id04570/0YMGn6BI9rg/00001 id01041/1UYZqPpavtk/00001
id07414/110UMQovTR0/00001 id00419/1zffAxBod_c/00001
id00154/0hjW3eTGAy8/00001 id01618/0iFlmfmWVlY/00001
id07494/0P1wPmgz0Bk/00001 id05654/07pANazoyJg/00001
id01822/0QcHowaLAF0/00001 id06310/1IAgr_CRnuE/00001
id05015/0Cu3AvWWOFI/00001 id05459/18XmQEiGLnQ/00001
id05816/1dyCBbJ94iw/00001 id02317/0q4X8kPTlEY/00001
id01541/2P7hzPq5iDw/00001 id05816/1dyCBbJ94iw/00001
id06104/02L1L9RFAgI/00001 id01892/3vKPgjwFjbo/00001
id04862/0zJh2FMTaDE/00001 id05850/B8kp8ed48JE/00001
id05202/2gnLcAbAoSc/00001 id04366/0iG2Ub9zETM/00001
id02286/4LAIxvdvguc/00001 id02725/37kUrf6RJdw/00001
id04276/5M8NmCwTHZ0/00001 id01541/2P7hzPq5iDw/00001
id02057/0xZU7Oi9nvM/00001 id03862/0w8W8jp7MJk/00001
id06104/02L1L9RFAgI/00001 id00419/1zffAxBod_c/00001
id04950/2n4sGPqU9M8/00001 id02181/02gIO4WrZLY/00001
id04478/2grMtwdG93I/00001 id02685/4JDRxqYC0a4/00001
id04006/113VkmVVz1Q/00001 id00081/2xYrsnvtUWc/00001
id06692/2ptBBNIZXtI/00001 id03347/4xXZ75_TeSM/00001
id03030/5wOxV1wAgqA/00001 id02465/0Ocu8l1eAng/00001
id07312/0LWllHGohPY/00001 id03839/1jWHvl2qCq0/00001
id04950/2n4sGPqU9M8/00001 id05654/07pANazoyJg/00001
id02465/0Ocu8l1eAng/00001 id01618/0iFlmfmWVlY/00001
id00419/1zffAxBod_c/00001 id02181/02gIO4WrZLY/00001
id07426/1KNFfOFEhyI/00001 id05202/2gnLcAbAoSc/00001
id07621/0CiFdFegqZM/00001 id08696/0H1PxInJCK0/00001
id04006/113VkmVVz1Q/00001 id08392/0fwuibKviJU/00001
id04478/2grMtwdG93I/00001 id02445/3Rnk8eja3TU/00001
id03347/4xXZ75_TeSM/00001 id00154/0hjW3eTGAy8/00001
id07312/0LWllHGohPY/00001 id02181/02gIO4WrZLY/00001
id06310/1IAgr_CRnuE/00001 id02057/0xZU7Oi9nvM/00001
id04366/0iG2Ub9zETM/00001 id05654/07pANazoyJg/00001
id00419/1zffAxBod_c/00001 id04570/0YMGn6BI9rg/00001
id04862/0zJh2FMTaDE/00001 id03862/0w8W8jp7MJk/00001
id04366/0iG2Ub9zETM/00001 id00154/0hjW3eTGAy8/00001
id00866/03SSllwNkGk/00001 id00081/2xYrsnvtUWc/00001
id01618/0iFlmfmWVlY/00001 id02725/37kUrf6RJdw/00001
id01892/3vKPgjwFjbo/00001 id07621/0CiFdFegqZM/00001
id05015/0Cu3AvWWOFI/00001 id00926/2Nd7f1yNQzE/00001
id06913/4Ug7aJemzpg/00001 id03839/1jWHvl2qCq0/00001
id07312/0LWllHGohPY/00001 id07802/0RUpqvi3sPU/00001
id06104/02L1L9RFAgI/00001 id02465/0Ocu8l1eAng/00001
id04295/1fSjOItVYVg/00001 id01298/2K5F6xG-Rbs/00001
id00866/03SSllwNkGk/00001 id05714/2gvpaZcvAY4/00001
id06104/02L1L9RFAgI/00001 id01541/2P7hzPq5iDw/00001
id02445/3Rnk8eja3TU/00001 id03789/0kdVSujPa9g/00001
id00081/2xYrsnvtUWc/00001 id05816/1dyCBbJ94iw/00001
id02548/0pAkJZmlFqc/00001 id03030/5wOxV1wAgqA/00001
id04276/5M8NmCwTHZ0/00001 id01041/1UYZqPpavtk/00001
id06913/4Ug7aJemzpg/00001 id07868/5YYJq3fSbH8/00001
id04656/1tZYt8jey54/00001 id06692/2ptBBNIZXtI/00001
id07494/0P1wPmgz0Bk/00001 id08696/0H1PxInJCK0/00001
id04119/1uH67UruKlE/00001 id02317/0q4X8kPTlEY/00001
id00419/1zffAxBod_c/00001 id04862/0zJh2FMTaDE/00001
id03862/0w8W8jp7MJk/00001 id02445/3Rnk8eja3TU/00001
id01892/3vKPgjwFjbo/00001 id04862/0zJh2FMTaDE/00001
id04950/2n4sGPqU9M8/00001 id01618/0iFlmfmWVlY/00001
id01228/2TIFacjgehY/00001 id01298/2K5F6xG-Rbs/00001
id01041/1UYZqPpavtk/00001 id07961/3EPjXGhfst4/00001
id07802/0RUpqvi3sPU/00001 id06913/4Ug7aJemzpg/00001
id04276/5M8NmCwTHZ0/00001 id03030/5wOxV1wAgqA/00001
id01567/1Lx_ZqrK1bM/00001 id05459/18XmQEiGLnQ/00001
id02465/0Ocu8l1eAng/00001 id02725/37kUrf6RJdw/00001
id05816/1dyCBbJ94iw/00001 id02181/02gIO4WrZLY/00001
id06913/4Ug7aJemzpg/00001 id04950/2n4sGPqU9M8/00001
id04276/5M8NmCwTHZ0/00001 id04253/1HOlzefgLu8/00001
id07414/110UMQovTR0/00001 id06209/2zM9EAPsZZQ/00001
id06310/1IAgr_CRnuE/00001 id03839/1jWHvl2qCq0/00001
id03347/4xXZ75_TeSM/00001 id04006/113VkmVVz1Q/00001
id01541/2P7hzPq5iDw/00001 id04253/1HOlzefgLu8/00001
id08456/29EhSZDqzas/00001 id07494/0P1wPmgz0Bk/00001
id07621/0CiFdFegqZM/00001 id05594/0ohBiepcHWI/00001
id02685/4JDRxqYC0a4/00001 id04536/0f_Yi_1CoeM/00001
id02317/0q4X8kPTlEY/00001 id08696/0H1PxInJCK0/00001
id04253/1HOlzefgLu8/00001 id01041/1UYZqPpavtk/00001
id01041/1UYZqPpavtk/00001 id03178/2CT-6fnBC_o/00001
id05654/07pANazoyJg/00001 id01892/3vKPgjwFjbo/00001
id04862/0zJh2FMTaDE/00001 id06310/1IAgr_CRnuE/00001
id01541/2P7hzPq5iDw/00001 id04478/2grMtwdG93I/00001
id02445/3Rnk8eja3TU/00001 id02057/0xZU7Oi9nvM/00001
id08392/0fwuibKviJU/00001 id04570/0YMGn6BI9rg/00001
id06692/2ptBBNIZXtI/00001 id02057/0xZU7Oi9nvM/00001
id04950/2n4sGPqU9M8/00001 id04862/0zJh2FMTaDE/00001
id03862/0w8W8jp7MJk/00001 id07621/0CiFdFegqZM/00001
id07312/0LWllHGohPY/00001 id04656/1tZYt8jey54/00001
id02577/0euHS_r5JH4/00001 id00866/03SSllwNkGk/00001
id01228/2TIFacjgehY/00001 id02685/4JDRxqYC0a4/00001
id00081/2xYrsnvtUWc/00001 id00419/1zffAxBod_c/00001
id00154/0hjW3eTGAy8/00001 id04656/1tZYt8jey54/00001
id03839/1jWHvl2qCq0/00001 id01618/0iFlmfmWVlY/00001
id03862/0w8W8jp7MJk/00001 id02286/4LAIxvdvguc/00001
id06310/1IAgr_CRnuE/00001 id08456/29EhSZDqzas/00001
id02317/0q4X8kPTlEY/00001 id04276/5M8NmCwTHZ0/00001
id06913/4Ug7aJemzpg/00001 id04366/0iG2Ub9zETM/00001
id06310/1IAgr_CRnuE/00001 id00926/2Nd7f1yNQzE/00001
id01228/2TIFacjgehY/00001 id02181/02gIO4WrZLY/00001
id07414/110UMQovTR0/00001 id05594/0ohBiepcHWI/00001
id03980/7MRUusImkno/00001 id03178/2CT-6fnBC_o/00001
id03347/4xXZ75_TeSM/00001 id04478/2grMtwdG93I/00001
id06692/2ptBBNIZXtI/00001 id05459/18XmQEiGLnQ/00001
id00154/0hjW3eTGAy8/00001 id02725/37kUrf6RJdw/00001
id01228/2TIFacjgehY/00001 id04006/113VkmVVz1Q/00001
id00866/03SSllwNkGk/00001 id00926/2Nd7f1yNQzE/00001
id05594/0ohBiepcHWI/00001 id04006/113VkmVVz1Q/00001
id04656/1tZYt8jey54/00001 id01822/0QcHowaLAF0/00001
id07354/0NjekFZqaY0/00001 id04536/0f_Yi_1CoeM/00001
id07354/0NjekFZqaY0/00001 id04656/1tZYt8jey54/00001
id04366/0iG2Ub9zETM/00001 id02057/0xZU7Oi9nvM/00001
id03789/0kdVSujPa9g/00001 id01822/0QcHowaLAF0/00001
id07621/0CiFdFegqZM/00001 id03347/4xXZ75_TeSM/00001
id04030/7mXUMuo5_NE/00001 id04366/0iG2Ub9zETM/00001
id00812/1Xfgvdu7oDo/00001 id07354/0NjekFZqaY0/00001
id04536/0f_Yi_1CoeM/00001 id07494/0P1wPmgz0Bk/00001
id04536/0f_Yi_1CoeM/00001 id05816/1dyCBbJ94iw/00001
id03862/0w8W8jp7MJk/00001 id07868/5YYJq3fSbH8/00001
id02685/4JDRxqYC0a4/00001 id05459/18XmQEiGLnQ/00001
id06209/2zM9EAPsZZQ/00001 id07426/1KNFfOFEhyI/00001
id07426/1KNFfOFEhyI/00001 id02317/0q4X8kPTlEY/00001
id00926/2Nd7f1yNQzE/00001 id05594/0ohBiepcHWI/00001
id00154/0hjW3eTGAy8/00001 id04950/2n4sGPqU9M8/00001
id03041/5CfnYwQCW48/00001 id01892/3vKPgjwFjbo/00001
id00419/1zffAxBod_c/00001 id00866/03SSllwNkGk/00001
id02725/37kUrf6RJdw/00001 id05202/2gnLcAbAoSc/00001
id04656/1tZYt8jey54/00001 id06913/4Ug7aJemzpg/00001
id03862/0w8W8jp7MJk/00001 id04006/113VkmVVz1Q/00001
id00419/1zffAxBod_c/00001 id04030/7mXUMuo5_NE/00001
id06692/2ptBBNIZXtI/00001 id01541/2P7hzPq5iDw/00001
id07354/0NjekFZqaY0/00001 id03041/5CfnYwQCW48/00001
id03347/4xXZ75_TeSM/00001 id07802/0RUpqvi3sPU/00001
id07354/0NjekFZqaY0/00001 id01298/2K5F6xG-Rbs/00001
id02725/37kUrf6RJdw/00001 id03980/7MRUusImkno/00001
id01618/0iFlmfmWVlY/00001 id02445/3Rnk8eja3TU/00001
id05816/1dyCBbJ94iw/00001 id00081/2xYrsnvtUWc/00001
id07354/0NjekFZqaY0/00001 id04478/2grMtwdG93I/00001
id03980/7MRUusImkno/00001 id04295/1fSjOItVYVg/00001
id02548/0pAkJZmlFqc/00001 id00081/2xYrsnvtUWc/00001
id05459/18XmQEiGLnQ/00001 id03347/4xXZ75_TeSM/00001
id04570/0YMGn6BI9rg/00001 id04006/113VkmVVz1Q/00001
id06209/2zM9EAPsZZQ/00001 id01041/1UYZqPpavtk/00001
id01228/2TIFacjgehY/00001 id02317/0q4X8kPTlEY/00001
id07802/0RUpqvi3sPU/00001 id01541/2P7hzPq5iDw/00001
id04862/0zJh2FMTaDE/00001 id01892/3vKPgjwFjbo/00001
id04253/1HOlzefgLu8/00001 id07802/0RUpqvi3sPU/00001
id06692/2ptBBNIZXtI/00001 id02286/4LAIxvdvguc/00001
id01228/2TIFacjgehY/00001 id07961/3EPjXGhfst4/00001
id05714/2gvpaZcvAY4/00001 id00812/1Xfgvdu7oDo/00001
id03789/0kdVSujPa9g/00001 id03862/0w8W8jp7MJk/00001
id04295/1fSjOItVYVg/00001 id07868/5YYJq3fSbH8/00001
id04276/5M8NmCwTHZ0/00001 id02057/0xZU7Oi9nvM/00001
id02286/4LAIxvdvguc/00001 id03862/0w8W8jp7MJk/00001
id04478/2grMtwdG93I/00001 id05816/1dyCBbJ94iw/00001
id08456/29EhSZDqzas/00001 id02725/37kUrf6RJdw/00001
id02577/0euHS_r5JH4/00001 id07961/3EPjXGhfst4/00001
id01618/0iFlmfmWVlY/00001 id00812/1Xfgvdu7oDo/00001
id07312/0LWllHGohPY/00001 id03789/0kdVSujPa9g/00001
id02685/4JDRxqYC0a4/00001 id03839/1jWHvl2qCq0/00001
id04030/7mXUMuo5_NE/00001 id07802/0RUpqvi3sPU/00001
id01567/1Lx_ZqrK1bM/00001 id04478/2grMtwdG93I/00001
id02577/0euHS_r5JH4/00001 id02548/0pAkJZmlFqc/00001
id04536/0f_Yi_1CoeM/00001 id03030/5wOxV1wAgqA/00001
id03347/4xXZ75_TeSM/00001 id00081/2xYrsnvtUWc/00001
id03980/7MRUusImkno/00001 id06209/2zM9EAPsZZQ/00001
id01567/1Lx_ZqrK1bM/00001 id00154/0hjW3eTGAy8/00001
id06104/02L1L9RFAgI/00001 id02057/0xZU7Oi9nvM/00001
id04570/0YMGn6BI9rg/00001 id03980/7MRUusImkno/00001
id08456/29EhSZDqzas/00001 id02286/4LAIxvdvguc/00001
id07312/0LWllHGohPY/00001 id04366/0iG2Ub9zETM/00001
id05654/07pANazoyJg/00001 id07426/1KNFfOFEhyI/00001
id03839/1jWHvl2qCq0/00001 id03347/4xXZ75_TeSM/00001
id04536/0f_Yi_1CoeM/00001 id04478/2grMtwdG93I/00001
id05816/1dyCBbJ94iw/00001 id04862/0zJh2FMTaDE/00001
id04950/2n4sGPqU9M8/00001 id00817/0GmSijZelGY/00001
id07426/1KNFfOFEhyI/00001 id04862/0zJh2FMTaDE/00001
id05459/18XmQEiGLnQ/00001 id00812/1Xfgvdu7oDo/00001
id00154/0hjW3eTGAy8/00001 id03178/2CT-6fnBC_o/00001
id04295/1fSjOItVYVg/00001 id07312/0LWllHGohPY/00001
id05594/0ohBiepcHWI/00001 id04862/0zJh2FMTaDE/00001
id03347/4xXZ75_TeSM/00001 id01541/2P7hzPq5iDw/00001
id04536/0f_Yi_1CoeM/00001 id02445/3Rnk8eja3TU/00001
id03862/0w8W8jp7MJk/00001 id04030/7mXUMuo5_NE/00001
id00154/0hjW3eTGAy8/00001 id01541/2P7hzPq5iDw/00001
id06913/4Ug7aJemzpg/00001 id03347/4xXZ75_TeSM/00001
id08696/0H1PxInJCK0/00001 id04478/2grMtwdG93I/00001
id04366/0iG2Ub9zETM/00001 id02445/3Rnk8eja3TU/00001
id07354/0NjekFZqaY0/00001 id01567/1Lx_ZqrK1bM/00001
id06913/4Ug7aJemzpg/00001 id05202/2gnLcAbAoSc/00001
id04862/0zJh2FMTaDE/00001 id08696/0H1PxInJCK0/00001
id03178/2CT-6fnBC_o/00001 id02685/4JDRxqYC0a4/00001
id01822/0QcHowaLAF0/00001 id04950/2n4sGPqU9M8/00001
id00081/2xYrsnvtUWc/00001 id06913/4Ug7aJemzpg/00001
id07868/5YYJq3fSbH8/00001 id02465/0Ocu8l1eAng/00001
id02181/02gIO4WrZLY/00001 id03862/0w8W8jp7MJk/00001
id07868/5YYJq3fSbH8/00001 id05202/2gnLcAbAoSc/00001
id02286/4LAIxvdvguc/00001 id03178/2CT-6fnBC_o/00001
id01298/2K5F6xG-Rbs/00001 id01618/0iFlmfmWVlY/00001
id03980/7MRUusImkno/00001 id04006/113VkmVVz1Q/00001
id03862/0w8W8jp7MJk/00001 id08456/29EhSZDqzas/00001
id01567/1Lx_ZqrK1bM/00001 id03041/5CfnYwQCW48/00001
id02465/0Ocu8l1eAng/00001 id00419/1zffAxBod_c/00001
id04570/0YMGn6BI9rg/00001 id04295/1fSjOItVYVg/00001
id03862/0w8W8jp7MJk/00001 id04295/1fSjOItVYVg/00001
id03789/0kdVSujPa9g/00001 id00866/03SSllwNkGk/00001
id05654/07pANazoyJg/00001 id00926/2Nd7f1yNQzE/00001
id05850/B8kp8ed48JE/00001 id02685/4JDRxqYC0a4/00001
id03347/4xXZ75_TeSM/00001 id08392/0fwuibKviJU/00001
id00926/2Nd7f1yNQzE/00001 id07312/0LWllHGohPY/00001
id05850/B8kp8ed48JE/00001 id01041/1UYZqPpavtk/00001
id03030/5wOxV1wAgqA/00001 id06913/4Ug7aJemzpg/00001
id02057/0xZU7Oi9nvM/00001 id01041/1UYZqPpavtk/00001
id03030/5wOxV1wAgqA/00001 id01041/1UYZqPpavtk/00001
id01618/0iFlmfmWVlY/00001 id04366/0iG2Ub9zETM/00001
id06310/1IAgr_CRnuE/00001 id04119/1uH67UruKlE/00001
id05594/0ohBiepcHWI/00001 id02317/0q4X8kPTlEY/00001
id01228/2TIFacjgehY/00001 id04119/1uH67UruKlE/00001
id02286/4LAIxvdvguc/00001 id02445/3Rnk8eja3TU/00001
id04030/7mXUMuo5_NE/00001 id00419/1zffAxBod_c/00001
id01298/2K5F6xG-Rbs/00001 id02445/3Rnk8eja3TU/00001
id07802/0RUpqvi3sPU/00001 id04862/0zJh2FMTaDE/00001
id04006/113VkmVVz1Q/00001 id03347/4xXZ75_TeSM/00001
id02317/0q4X8kPTlEY/00001 id05850/B8kp8ed48JE/00001
id08456/29EhSZDqzas/00001 id04656/1tZYt8jey54/00001
id04656/1tZYt8jey54/00001 id05816/1dyCBbJ94iw/00001
id05202/2gnLcAbAoSc/00001 id06209/2zM9EAPsZZQ/00001
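
Each line in these filelists holds two whitespace-separated VoxCeleb2-style relative paths (idXXXXX/<video-id>/<clip-index>). In the reconstruction lists both columns are identical, while in the cross lists they differ; presumably one column supplies the face video and the other the driving audio (an assumption from the naming — generate.py defines the actual roles). A minimal parsing sketch:

# Minimal sketch for reading these filelists; which column is the video
# source and which the audio source is an assumption (see generate.py).
def load_pairs(filelist_path):
    pairs = []
    with open(filelist_path) as f:
        for line in f:
            parts = line.split()
            if len(parts) == 2:
                pairs.append(tuple(parts))
    return pairs

pairs = load_pairs('dataset/filelists/voxceleb2_test_n_500_seed_797_cross.txt')
print(len(pairs), pairs[0])  # 500 ('id05459/18XmQEiGLnQ/00001', 'id07961/3EPjXGhfst4/00001')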
face_detection/README.md
ADDED
@@ -0,0 +1 @@
The code for Face Detection in this folder has been taken from the wonderful [face_alignment](https://github.com/1adrianb/face-alignment) repository. This has been modified to take batches of faces at a time.
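
A hedged usage sketch of the batched interface this README describes, using the class names from face_detection/api.py. The (N, H, W, 3) uint8 BGR input layout is an assumption inferred from the channel flip in get_detections_for_batch, and the SFD weights must be available wherever sfd_detector.py expects them:

# Sketch only: batched face detection with this modified face_alignment code.
import numpy as np
import face_detection

detector = face_detection.FaceAlignment(face_detection.LandmarksType._2D,
                                        flip_input=False, device='cuda')

# frames: assumed (N, H, W, 3) uint8 BGR batch, e.g. stacked cv2.VideoCapture reads.
frames = np.zeros((8, 256, 256, 3), dtype=np.uint8)
boxes = detector.get_detections_for_batch(frames)
for box in boxes:  # each entry is (x1, y1, x2, y2) or None when no face is found
    if box is not None:
        x1, y1, x2, y2 = box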
face_detection/__init__.py
ADDED
@@ -0,0 +1,7 @@
# -*- coding: utf-8 -*-

__author__ = """Adrian Bulat"""
__email__ = 'adrian.bulat@nottingham.ac.uk'
__version__ = '1.0.1'

from .api import FaceAlignment, LandmarksType, NetworkSize
face_detection/api.py
ADDED
@@ -0,0 +1,98 @@
from __future__ import print_function
import os
import torch
from torch.utils.model_zoo import load_url
from enum import Enum
import numpy as np
import cv2
try:
    import urllib.request as request_file
except BaseException:
    import urllib as request_file

from .models import FAN, ResNetDepth
from .utils import *


class LandmarksType(Enum):
    """Enum class defining the type of landmarks to detect.

    ``_2D`` - the detected points ``(x,y)`` are detected in a 2D space and follow the visible contour of the face
    ``_2halfD`` - these points represent the projection of the 3D points into a 2D space
    ``_3D`` - detect the points ``(x,y,z)`` in a 3D space

    """
    _2D = 1
    _2halfD = 2
    _3D = 3


class NetworkSize(Enum):
    # TINY = 1
    # SMALL = 2
    # MEDIUM = 3
    LARGE = 4

    def __new__(cls, value):
        member = object.__new__(cls)
        member._value_ = value
        return member

    def __int__(self):
        return self.value

ROOT = os.path.dirname(os.path.abspath(__file__))

class FaceAlignment:
    def __init__(self, landmarks_type, network_size=NetworkSize.LARGE,
                 device='cuda', flip_input=False, face_detector='sfd', verbose=False):
        self.device = device
        self.flip_input = flip_input
        self.landmarks_type = landmarks_type
        self.verbose = verbose

        network_size = int(network_size)

        if 'cuda' in device:
            torch.backends.cudnn.benchmark = True

        # Get the face detector
        face_detector_module = __import__('face_detection.detection.' + face_detector,
                                          globals(), locals(), [face_detector], 0)
        self.face_detector = face_detector_module.FaceDetector(device=device, verbose=verbose)

    def get_detections_for_batch(self, images):
        images = images[..., ::-1]
        detected_faces = self.face_detector.detect_from_batch(images.copy())
        results = []

        for i, d in enumerate(detected_faces):
            # print("Inside face detection:", i, len(d))
            if len(d) == 0:
                results.append(None)
                continue
            d = d[0]
            d = np.clip(d, 0, None)

            x1, y1, x2, y2 = map(int, d[:-1])
            results.append((x1, y1, x2, y2))

        return results

    def get_all_detections_for_batch(self, images):
        # for multi-face detection
        images = images[..., ::-1]
        detected_faces = self.face_detector.detect_from_batch(images.copy())
        results = []

        for i, d in enumerate(detected_faces):
            # print("Inside face detection:", i, len(d))
            if len(d) == 0:
                results.append(None)
                continue
            d = [np.clip(dd, 0, None) for dd in d]
            # d = [map(int, dd[:-1]) for dd in d]
            d = [[int(ddd) for ddd in dd[:-1]] for dd in d]
            results.append(d)

        return results
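
The two batch methods above differ only in how many boxes they keep per frame: get_detections_for_batch returns a single (x1, y1, x2, y2) tuple, get_all_detections_for_batch a list of such boxes. A small sketch of the relationship; that the single-face result equals the first multi-face box follows from the code above, while the box ordering itself comes from the detector's NMS:

# Sketch: contrast of the two batch APIs. Both return one entry per frame,
# None when no face is found. Setup mirrors the earlier README sketch.
import numpy as np
import face_detection

detector = face_detection.FaceAlignment(face_detection.LandmarksType._2D, device='cuda')
frames = np.zeros((8, 256, 256, 3), dtype=np.uint8)  # assumed BGR uint8 batch

single = detector.get_detections_for_batch(frames)     # [(x1, y1, x2, y2) | None, ...]
multi = detector.get_all_detections_for_batch(frames)  # [[[x1, y1, x2, y2], ...] | None, ...]

for one, many in zip(single, multi):
    if one is not None:
        # The single-face API keeps exactly the first box of the multi-face list.
        assert list(one) == many[0]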
face_detection/detection/__init__.py
ADDED
@@ -0,0 +1 @@
from .core import FaceDetector
face_detection/detection/core.py
ADDED
@@ -0,0 +1,130 @@
import logging
import glob
from tqdm import tqdm
import numpy as np
import torch
import cv2


class FaceDetector(object):
    """An abstract class representing a face detector.

    Any other face detection implementation must subclass it. All subclasses
    must implement ``detect_from_image``, which returns a list of detected
    bounding boxes. Optionally, for speed, implementing detection directly
    from a path is recommended.
    """

    def __init__(self, device, verbose):
        self.device = device
        self.verbose = verbose
        # Create the logger up front so the invalid-device branch below can
        # use it too (it was previously only defined for the CPU warning).
        logger = logging.getLogger(__name__)

        if verbose and 'cpu' in device:
            logger.warning("Detection running on CPU, this may be potentially slow.")

        if 'cpu' not in device and 'cuda' not in device:
            if verbose:
                logger.error("Expected values for device are: {cpu, cuda} but got: %s", device)
            raise ValueError

    def detect_from_image(self, tensor_or_path):
        """Detects faces in a given image.

        This function detects the faces present in a provided BGR (usually)
        image. The input can be either the image itself or the path to it.

        Arguments:
            tensor_or_path {numpy.ndarray, torch.tensor or string} -- the path
            to an image or the image itself.

        Example::

            >>> path_to_image = 'data/image_01.jpg'
            ... detected_faces = detect_from_image(path_to_image)
            [A list of bounding boxes (x1, y1, x2, y2)]
            >>> image = cv2.imread(path_to_image)
            ... detected_faces = detect_from_image(image)
            [A list of bounding boxes (x1, y1, x2, y2)]

        """
        raise NotImplementedError

    def detect_from_directory(self, path, extensions=['.jpg', '.png'], recursive=False, show_progress_bar=True):
        """Detects faces from all the images present in a given directory.

        Arguments:
            path {string} -- a string containing a path that points to the folder containing the images

        Keyword Arguments:
            extensions {list} -- list of strings containing the extensions to be
            considered in the following format: ``.extension_name`` (default:
            {['.jpg', '.png']}) recursive {bool} -- whether to scan the
            folder recursively (default: {False}) show_progress_bar {bool} --
            display a progress bar (default: {True})

        Example:
            >>> directory = 'data'
            ... detected_faces = detect_from_directory(directory)
            {A dictionary of [lists containing bounding boxes(x1, y1, x2, y2)]}

        """
        if self.verbose:
            logger = logging.getLogger(__name__)

        if len(extensions) == 0:
            if self.verbose:
                logger.error("Expected at least one extension, but none was received.")
            raise ValueError

        if self.verbose:
            logger.info("Constructing the list of images.")
        additional_pattern = '/**/*' if recursive else '/*'
        files = []
        for extension in extensions:
            files.extend(glob.glob(path + additional_pattern + extension, recursive=recursive))

        if self.verbose:
            logger.info("Finished searching for images. %s images found", len(files))
            logger.info("Preparing to run the detection.")

        predictions = {}
        for image_path in tqdm(files, disable=not show_progress_bar):
            if self.verbose:
                logger.info("Running the face detector on image: %s", image_path)
            predictions[image_path] = self.detect_from_image(image_path)

        if self.verbose:
            logger.info("The detector was successfully run on all %s images", len(files))

        return predictions

    @property
    def reference_scale(self):
        raise NotImplementedError

    @property
    def reference_x_shift(self):
        raise NotImplementedError

    @property
    def reference_y_shift(self):
        raise NotImplementedError

    @staticmethod
    def tensor_or_path_to_ndarray(tensor_or_path, rgb=True):
        """Convert a path (represented as a string) or torch.tensor to a numpy.ndarray

        Arguments:
            tensor_or_path {numpy.ndarray, torch.tensor or string} -- path to the image, or the image itself
        """
        if isinstance(tensor_or_path, str):
            return cv2.imread(tensor_or_path) if not rgb else cv2.imread(tensor_or_path)[..., ::-1]
        elif torch.is_tensor(tensor_or_path):
            # Call cpu() in case the tensor is coming from CUDA
            return tensor_or_path.cpu().numpy()[..., ::-1].copy() if not rgb else tensor_or_path.cpu().numpy()
        elif isinstance(tensor_or_path, np.ndarray):
            return tensor_or_path[..., ::-1].copy() if not rgb else tensor_or_path
        else:
            raise TypeError
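
Since FaceDetector is abstract, a concrete detector only needs detect_from_image (plus detect_from_batch, which api.py calls on it). A toy subclass sketch with a fabricated constant box in place of a real network, just to show the required surface and the (x1, y1, x2, y2, score) box layout:

# Toy sketch: minimal FaceDetector subclass. The fixed centered box is fake;
# a real subclass (see sfd_detector.py below) runs a detection network here.
import numpy as np
from face_detection.detection.core import FaceDetector

class DummyDetector(FaceDetector):
    def detect_from_image(self, tensor_or_path):
        image = self.tensor_or_path_to_ndarray(tensor_or_path)
        h, w = image.shape[:2]
        # One centered box in the (x1, y1, x2, y2, score) layout used downstream.
        return [np.array([0.25 * w, 0.25 * h, 0.75 * w, 0.75 * h, 1.0])]

    def detect_from_batch(self, images):
        # api.py feeds an (N, H, W, 3) array; iterate per frame.
        return [self.detect_from_image(img) for img in images]

det = DummyDetector(device='cpu', verbose=False)
print(det.detect_from_image(np.zeros((64, 64, 3), dtype=np.uint8)))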
face_detection/detection/sfd/__init__.py
ADDED
@@ -0,0 +1 @@
from .sfd_detector import SFDDetector as FaceDetector
face_detection/detection/sfd/bbox.py
ADDED
@@ -0,0 +1,129 @@
from __future__ import print_function
import os
import sys
import cv2
import random
import datetime
import time
import math
import argparse
import numpy as np
import torch

try:
    from iou import IOU
except BaseException:
    # IOU cython speedup 10x
    def IOU(ax1, ay1, ax2, ay2, bx1, by1, bx2, by2):
        sa = abs((ax2 - ax1) * (ay2 - ay1))
        sb = abs((bx2 - bx1) * (by2 - by1))
        x1, y1 = max(ax1, bx1), max(ay1, by1)
        x2, y2 = min(ax2, bx2), min(ay2, by2)
        w = x2 - x1
        h = y2 - y1
        if w < 0 or h < 0:
            return 0.0
        else:
            return 1.0 * w * h / (sa + sb - w * h)


def bboxlog(x1, y1, x2, y2, axc, ayc, aww, ahh):
    xc, yc, ww, hh = (x2 + x1) / 2, (y2 + y1) / 2, x2 - x1, y2 - y1
    dx, dy = (xc - axc) / aww, (yc - ayc) / ahh
    dw, dh = math.log(ww / aww), math.log(hh / ahh)
    return dx, dy, dw, dh


def bboxloginv(dx, dy, dw, dh, axc, ayc, aww, ahh):
    xc, yc = dx * aww + axc, dy * ahh + ayc
    ww, hh = math.exp(dw) * aww, math.exp(dh) * ahh
    x1, x2, y1, y2 = xc - ww / 2, xc + ww / 2, yc - hh / 2, yc + hh / 2
    return x1, y1, x2, y2


def nms(dets, thresh):
    if 0 == len(dets):
        return []
    x1, y1, x2, y2, scores = dets[:, 0], dets[:, 1], dets[:, 2], dets[:, 3], dets[:, 4]
    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
    order = scores.argsort()[::-1]

    keep = []
    while order.size > 0:
        i = order[0]
        keep.append(i)
        xx1, yy1 = np.maximum(x1[i], x1[order[1:]]), np.maximum(y1[i], y1[order[1:]])
        xx2, yy2 = np.minimum(x2[i], x2[order[1:]]), np.minimum(y2[i], y2[order[1:]])

        w, h = np.maximum(0.0, xx2 - xx1 + 1), np.maximum(0.0, yy2 - yy1 + 1)
        ovr = w * h / (areas[i] + areas[order[1:]] - w * h)

        inds = np.where(ovr <= thresh)[0]
        order = order[inds + 1]

    return keep


def encode(matched, priors, variances):
    """Encode the variances from the priorbox layers into the ground truth boxes
    we have matched (based on jaccard overlap) with the prior boxes.
    Args:
        matched: (tensor) Coords of ground truth for each prior in point-form
            Shape: [num_priors, 4].
        priors: (tensor) Prior boxes in center-offset form
            Shape: [num_priors, 4].
        variances: (list[float]) Variances of priorboxes
    Return:
        encoded boxes (tensor), Shape: [num_priors, 4]
    """

    # dist b/t match center and prior's center
    g_cxcy = (matched[:, :2] + matched[:, 2:]) / 2 - priors[:, :2]
    # encode variance
    g_cxcy /= (variances[0] * priors[:, 2:])
    # match wh / prior wh
    g_wh = (matched[:, 2:] - matched[:, :2]) / priors[:, 2:]
    g_wh = torch.log(g_wh) / variances[1]
    # return target for smooth_l1_loss
    return torch.cat([g_cxcy, g_wh], 1)  # [num_priors, 4]


def decode(loc, priors, variances):
    """Decode locations from predictions using priors to undo
    the encoding we did for offset regression at train time.
    Args:
        loc (tensor): location predictions for loc layers,
            Shape: [num_priors, 4]
        priors (tensor): Prior boxes in center-offset form.
            Shape: [num_priors, 4].
        variances: (list[float]) Variances of priorboxes
    Return:
        decoded bounding box predictions
    """

    boxes = torch.cat((
        priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:],
        priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1)
    boxes[:, :2] -= boxes[:, 2:] / 2
    boxes[:, 2:] += boxes[:, :2]
    return boxes


def batch_decode(loc, priors, variances):
    """Batched variant of ``decode``: undoes the offset-regression encoding
    for a whole batch at once.
    Args:
        loc (tensor): location predictions for loc layers,
            Shape: [batch, num_priors, 4]
        priors (tensor): Prior boxes in center-offset form.
            Shape: [batch, num_priors, 4].
        variances: (list[float]) Variances of priorboxes
    Return:
        decoded bounding box predictions
    """

    boxes = torch.cat((
        priors[:, :, :2] + loc[:, :, :2] * variances[0] * priors[:, :, 2:],
        priors[:, :, 2:] * torch.exp(loc[:, :, 2:] * variances[1])), 2)
    boxes[:, :, :2] -= boxes[:, :, 2:] / 2
    boxes[:, :, 2:] += boxes[:, :, :2]
    return boxes
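
A worked round trip through encode/decode makes the variance convention concrete: with the [0.1, 0.2] variances that detect.py uses, encoding a corner-form box against a center-offset prior and decoding it back recovers the original corners. All numbers below are invented for illustration:

# Sketch: encode/decode round trip with the variances used in detect.py.
import torch
from face_detection.detection.sfd.bbox import encode, decode

variances = [0.1, 0.2]
matched = torch.tensor([[30.0, 40.0, 90.0, 120.0]])  # ground truth, corner form (x1, y1, x2, y2)
priors = torch.tensor([[64.0, 64.0, 64.0, 64.0]])    # prior, center-offset form (cx, cy, w, h)

offsets = encode(matched, priors, variances)
recovered = decode(offsets, priors, variances)
print(torch.allclose(recovered, matched, atol=1e-4))  # True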
face_detection/detection/sfd/detect.py
ADDED
@@ -0,0 +1,112 @@
import torch
import torch.nn.functional as F

import os
import sys
import cv2
import random
import datetime
import math
import argparse
import numpy as np

import scipy.io as sio
import zipfile
from .net_s3fd import s3fd
from .bbox import *


def detect(net, img, device):
    img = img - np.array([104, 117, 123])
    img = img.transpose(2, 0, 1)
    img = img.reshape((1,) + img.shape)

    if 'cuda' in device:
        torch.backends.cudnn.benchmark = True

    img = torch.from_numpy(img).float().to(device)
    BB, CC, HH, WW = img.size()
    with torch.no_grad():
        olist = net(img)

    bboxlist = []
    for i in range(len(olist) // 2):
        olist[i * 2] = F.softmax(olist[i * 2], dim=1)
    olist = [oelem.data.cpu() for oelem in olist]
    for i in range(len(olist) // 2):
        ocls, oreg = olist[i * 2], olist[i * 2 + 1]
        FB, FC, FH, FW = ocls.size()  # feature map size
        stride = 2**(i + 2)  # 4,8,16,32,64,128
        anchor = stride * 4
        poss = zip(*np.where(ocls[:, 1, :, :] > 0.05))
        for Iindex, hindex, windex in poss:
            axc, ayc = stride / 2 + windex * stride, stride / 2 + hindex * stride
            score = ocls[0, 1, hindex, windex]
            loc = oreg[0, :, hindex, windex].contiguous().view(1, 4)
            priors = torch.Tensor([[axc / 1.0, ayc / 1.0, stride * 4 / 1.0, stride * 4 / 1.0]])
            variances = [0.1, 0.2]
            box = decode(loc, priors, variances)
            x1, y1, x2, y2 = box[0] * 1.0
            # cv2.rectangle(imgshow, (int(x1), int(y1)), (int(x2), int(y2)), (0, 0, 255), 1)
            bboxlist.append([x1, y1, x2, y2, score])
    bboxlist = np.array(bboxlist)
    if 0 == len(bboxlist):
        bboxlist = np.zeros((1, 5))

    return bboxlist


def batch_detect(net, imgs, device):
    imgs = imgs - np.array([104, 117, 123])
    imgs = imgs.transpose(0, 3, 1, 2)

    if 'cuda' in device:
        torch.backends.cudnn.benchmark = True

    imgs = torch.from_numpy(imgs).float().to(device)
    BB, CC, HH, WW = imgs.size()
    with torch.no_grad():
        olist = net(imgs)

    bboxlist = []
    for i in range(len(olist) // 2):
        olist[i * 2] = F.softmax(olist[i * 2], dim=1)
    olist = [oelem.data.cpu() for oelem in olist]
    for i in range(len(olist) // 2):
        ocls, oreg = olist[i * 2], olist[i * 2 + 1]
        FB, FC, FH, FW = ocls.size()  # feature map size
        stride = 2**(i + 2)  # 4,8,16,32,64,128
        anchor = stride * 4
        poss = zip(*np.where(ocls[:, 1, :, :] > 0.05))
        for Iindex, hindex, windex in poss:
            axc, ayc = stride / 2 + windex * stride, stride / 2 + hindex * stride
            score = ocls[:, 1, hindex, windex]
            loc = oreg[:, :, hindex, windex].contiguous().view(BB, 1, 4)
            priors = torch.Tensor([[axc / 1.0, ayc / 1.0, stride * 4 / 1.0, stride * 4 / 1.0]]).view(1, 1, 4)
            variances = [0.1, 0.2]
            box = batch_decode(loc, priors, variances)
            box = box[:, 0] * 1.0
            # cv2.rectangle(imgshow, (int(x1), int(y1)), (int(x2), int(y2)), (0, 0, 255), 1)
            bboxlist.append(torch.cat([box, score.unsqueeze(1)], 1).cpu().numpy())
    bboxlist = np.array(bboxlist)
    if 0 == len(bboxlist):
        bboxlist = np.zeros((1, BB, 5))

    return bboxlist


def flip_detect(net, img, device):
    img = cv2.flip(img, 1)
    b = detect(net, img, device)

    bboxlist = np.zeros(b.shape)
    bboxlist[:, 0] = img.shape[1] - b[:, 2]
    bboxlist[:, 1] = b[:, 1]
    bboxlist[:, 2] = img.shape[1] - b[:, 0]
    bboxlist[:, 3] = b[:, 3]
    bboxlist[:, 4] = b[:, 4]
    return bboxlist


def pts_to_bb(pts):
    min_x, min_y = np.min(pts, axis=0)
    max_x, max_y = np.max(pts, axis=0)
    return np.array([min_x, min_y, max_x, max_y])
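
Both detect and batch_detect start by subtracting the fixed per-channel means [104, 117, 123]; these look like the usual Caffe-era BGR means, which suggests the network expects BGR input in the 0-255 range (an inference from the constants, not something the file documents). A sketch of that preprocessing in isolation:

# Sketch: the preprocessing that batch_detect applies before inference.
import numpy as np

frames = np.random.randint(0, 256, (4, 128, 128, 3)).astype(np.float64)  # NHWC, BGR assumed
x = frames - np.array([104, 117, 123])  # per-channel mean subtraction (BGR order assumed)
x = x.transpose(0, 3, 1, 2)             # NHWC -> NCHW, as the network expects
print(x.shape)                           # (4, 3, 128, 128)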
face_detection/detection/sfd/net_s3fd.py
ADDED
@@ -0,0 +1,129 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class L2Norm(nn.Module):
+    def __init__(self, n_channels, scale=1.0):
+        super(L2Norm, self).__init__()
+        self.n_channels = n_channels
+        self.scale = scale
+        self.eps = 1e-10
+        self.weight = nn.Parameter(torch.Tensor(self.n_channels))
+        self.weight.data *= 0.0
+        self.weight.data += self.scale
+
+    def forward(self, x):
+        norm = x.pow(2).sum(dim=1, keepdim=True).sqrt() + self.eps
+        x = x / norm * self.weight.view(1, -1, 1, 1)
+        return x
+
+
+class s3fd(nn.Module):
+    def __init__(self):
+        super(s3fd, self).__init__()
+        self.conv1_1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1)
+        self.conv1_2 = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1)
+
+        self.conv2_1 = nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1)
+        self.conv2_2 = nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=1)
+
+        self.conv3_1 = nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1)
+        self.conv3_2 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1)
+        self.conv3_3 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1)
+
+        self.conv4_1 = nn.Conv2d(256, 512, kernel_size=3, stride=1, padding=1)
+        self.conv4_2 = nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1)
+        self.conv4_3 = nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1)
+
+        self.conv5_1 = nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1)
+        self.conv5_2 = nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1)
+        self.conv5_3 = nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1)
+
+        self.fc6 = nn.Conv2d(512, 1024, kernel_size=3, stride=1, padding=3)
+        self.fc7 = nn.Conv2d(1024, 1024, kernel_size=1, stride=1, padding=0)
+
+        self.conv6_1 = nn.Conv2d(1024, 256, kernel_size=1, stride=1, padding=0)
+        self.conv6_2 = nn.Conv2d(256, 512, kernel_size=3, stride=2, padding=1)
+
+        self.conv7_1 = nn.Conv2d(512, 128, kernel_size=1, stride=1, padding=0)
+        self.conv7_2 = nn.Conv2d(128, 256, kernel_size=3, stride=2, padding=1)
+
+        self.conv3_3_norm = L2Norm(256, scale=10)
+        self.conv4_3_norm = L2Norm(512, scale=8)
+        self.conv5_3_norm = L2Norm(512, scale=5)
+
+        self.conv3_3_norm_mbox_conf = nn.Conv2d(256, 4, kernel_size=3, stride=1, padding=1)
+        self.conv3_3_norm_mbox_loc = nn.Conv2d(256, 4, kernel_size=3, stride=1, padding=1)
+        self.conv4_3_norm_mbox_conf = nn.Conv2d(512, 2, kernel_size=3, stride=1, padding=1)
+        self.conv4_3_norm_mbox_loc = nn.Conv2d(512, 4, kernel_size=3, stride=1, padding=1)
+        self.conv5_3_norm_mbox_conf = nn.Conv2d(512, 2, kernel_size=3, stride=1, padding=1)
+        self.conv5_3_norm_mbox_loc = nn.Conv2d(512, 4, kernel_size=3, stride=1, padding=1)
+
+        self.fc7_mbox_conf = nn.Conv2d(1024, 2, kernel_size=3, stride=1, padding=1)
+        self.fc7_mbox_loc = nn.Conv2d(1024, 4, kernel_size=3, stride=1, padding=1)
+        self.conv6_2_mbox_conf = nn.Conv2d(512, 2, kernel_size=3, stride=1, padding=1)
+        self.conv6_2_mbox_loc = nn.Conv2d(512, 4, kernel_size=3, stride=1, padding=1)
+        self.conv7_2_mbox_conf = nn.Conv2d(256, 2, kernel_size=3, stride=1, padding=1)
+        self.conv7_2_mbox_loc = nn.Conv2d(256, 4, kernel_size=3, stride=1, padding=1)
+
+    def forward(self, x):
+        h = F.relu(self.conv1_1(x))
+        h = F.relu(self.conv1_2(h))
+        h = F.max_pool2d(h, 2, 2)
+
+        h = F.relu(self.conv2_1(h))
+        h = F.relu(self.conv2_2(h))
+        h = F.max_pool2d(h, 2, 2)
+
+        h = F.relu(self.conv3_1(h))
+        h = F.relu(self.conv3_2(h))
+        h = F.relu(self.conv3_3(h))
+        f3_3 = h
+        h = F.max_pool2d(h, 2, 2)
+
+        h = F.relu(self.conv4_1(h))
+        h = F.relu(self.conv4_2(h))
+        h = F.relu(self.conv4_3(h))
+        f4_3 = h
+        h = F.max_pool2d(h, 2, 2)
+
+        h = F.relu(self.conv5_1(h))
+        h = F.relu(self.conv5_2(h))
+        h = F.relu(self.conv5_3(h))
+        f5_3 = h
+        h = F.max_pool2d(h, 2, 2)
+
+        h = F.relu(self.fc6(h))
+        h = F.relu(self.fc7(h))
+        ffc7 = h
+        h = F.relu(self.conv6_1(h))
+        h = F.relu(self.conv6_2(h))
+        f6_2 = h
+        h = F.relu(self.conv7_1(h))
+        h = F.relu(self.conv7_2(h))
+        f7_2 = h
+
+        f3_3 = self.conv3_3_norm(f3_3)
+        f4_3 = self.conv4_3_norm(f4_3)
+        f5_3 = self.conv5_3_norm(f5_3)
+
+        cls1 = self.conv3_3_norm_mbox_conf(f3_3)
+        reg1 = self.conv3_3_norm_mbox_loc(f3_3)
+        cls2 = self.conv4_3_norm_mbox_conf(f4_3)
+        reg2 = self.conv4_3_norm_mbox_loc(f4_3)
+        cls3 = self.conv5_3_norm_mbox_conf(f5_3)
+        reg3 = self.conv5_3_norm_mbox_loc(f5_3)
+        cls4 = self.fc7_mbox_conf(ffc7)
+        reg4 = self.fc7_mbox_loc(ffc7)
+        cls5 = self.conv6_2_mbox_conf(f6_2)
+        reg5 = self.conv6_2_mbox_loc(f6_2)
+        cls6 = self.conv7_2_mbox_conf(f7_2)
+        reg6 = self.conv7_2_mbox_loc(f7_2)
+
+        # max-out background label
+        chunk = torch.chunk(cls1, 4, 1)
+        bmax = torch.max(torch.max(chunk[0], chunk[1]), chunk[2])
+        cls1 = torch.cat([bmax, chunk[3]], dim=1)
+
+        return [cls1, reg1, cls2, reg2, cls3, reg3, cls4, reg4, cls5, reg5, cls6, reg6]
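The "max-out background label" step at the end of s3fd.forward predicts four confidence channels on the conv3_3 branch (three background scores and one face score) and keeps only the strongest background score, the S3FD trick for suppressing false positives at the smallest anchor scale. A minimal sketch of just that channel arithmetic on a dummy tensor:

import torch

cls1 = torch.randn(1, 4, 8, 8)                           # 3 background channels + 1 face channel
chunk = torch.chunk(cls1, 4, 1)
bmax = torch.max(torch.max(chunk[0], chunk[1]), chunk[2])  # max over the background channels
out = torch.cat([bmax, chunk[3]], dim=1)
print(out.shape)                                          # torch.Size([1, 2, 8, 8])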
face_detection/detection/sfd/sfd_detector.py
ADDED
@@ -0,0 +1,59 @@
+import os
+import cv2
+from torch.utils.model_zoo import load_url
+
+from ..core import FaceDetector
+
+from .net_s3fd import s3fd
+from .bbox import *
+from .detect import *
+
+models_urls = {
+    's3fd': 'https://www.adrianbulat.com/downloads/python-fan/s3fd-619a316812.pth',
+}
+
+
+class SFDDetector(FaceDetector):
+    def __init__(self, device, path_to_detector=os.path.join(os.path.dirname(os.path.abspath(__file__)), 's3fd.pth'), verbose=False):
+        super(SFDDetector, self).__init__(device, verbose)
+
+        # Initialise the face detector
+        if not os.path.isfile(path_to_detector):
+            model_weights = load_url(models_urls['s3fd'])
+        else:
+            model_weights = torch.load(path_to_detector)
+
+        self.face_detector = s3fd()
+        self.face_detector.load_state_dict(model_weights)
+        self.face_detector.to(device)
+        self.face_detector.eval()
+
+    def detect_from_image(self, tensor_or_path):
+        image = self.tensor_or_path_to_ndarray(tensor_or_path)
+
+        bboxlist = detect(self.face_detector, image, device=self.device)
+        keep = nms(bboxlist, 0.3)
+        bboxlist = bboxlist[keep, :]
+        bboxlist = [x for x in bboxlist if x[-1] > 0.5]
+
+        return bboxlist
+
+    def detect_from_batch(self, images):
+        bboxlists = batch_detect(self.face_detector, images, device=self.device)
+        keeps = [nms(bboxlists[:, i, :], 0.3) for i in range(bboxlists.shape[1])]
+        bboxlists = [bboxlists[keep, i, :] for i, keep in enumerate(keeps)]
+        bboxlists = [[x for x in bboxlist if x[-1] > 0.5] for bboxlist in bboxlists]
+
+        return bboxlists
+
+    @property
+    def reference_scale(self):
+        return 195
+
+    @property
+    def reference_x_shift(self):
+        return 0
+
+    @property
+    def reference_y_shift(self):
+        return 0
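SFDDetector wraps the s3fd network behind the FaceDetector interface from ..core, with NMS at IoU 0.3 and a 0.5 score cutoff baked in. A minimal usage sketch, assuming the face_detection package is importable and the s3fd.pth weights are either present locally or fetched from models_urls on first use (the image path is hypothetical):

import cv2
import torch
from face_detection.detection.sfd.sfd_detector import SFDDetector

device = 'cuda' if torch.cuda.is_available() else 'cpu'
detector = SFDDetector(device)
image = cv2.imread('some_frame.jpg')       # hypothetical input frame
boxes = detector.detect_from_image(image)  # list of [x1, y1, x2, y2, score] with score > 0.5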
face_detection/models.py
ADDED
@@ -0,0 +1,261 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import math
+
+
+def conv3x3(in_planes, out_planes, strd=1, padding=1, bias=False):
+    "3x3 convolution with padding"
+    return nn.Conv2d(in_planes, out_planes, kernel_size=3,
+                     stride=strd, padding=padding, bias=bias)
+
+
+class ConvBlock(nn.Module):
+    def __init__(self, in_planes, out_planes):
+        super(ConvBlock, self).__init__()
+        self.bn1 = nn.BatchNorm2d(in_planes)
+        self.conv1 = conv3x3(in_planes, int(out_planes / 2))
+        self.bn2 = nn.BatchNorm2d(int(out_planes / 2))
+        self.conv2 = conv3x3(int(out_planes / 2), int(out_planes / 4))
+        self.bn3 = nn.BatchNorm2d(int(out_planes / 4))
+        self.conv3 = conv3x3(int(out_planes / 4), int(out_planes / 4))
+
+        if in_planes != out_planes:
+            self.downsample = nn.Sequential(
+                nn.BatchNorm2d(in_planes),
+                nn.ReLU(True),
+                nn.Conv2d(in_planes, out_planes,
+                          kernel_size=1, stride=1, bias=False),
+            )
+        else:
+            self.downsample = None
+
+    def forward(self, x):
+        residual = x
+
+        out1 = self.bn1(x)
+        out1 = F.relu(out1, True)
+        out1 = self.conv1(out1)
+
+        out2 = self.bn2(out1)
+        out2 = F.relu(out2, True)
+        out2 = self.conv2(out2)
+
+        out3 = self.bn3(out2)
+        out3 = F.relu(out3, True)
+        out3 = self.conv3(out3)
+
+        out3 = torch.cat((out1, out2, out3), 1)
+
+        if self.downsample is not None:
+            residual = self.downsample(residual)
+
+        out3 += residual
+
+        return out3
+
+
+class Bottleneck(nn.Module):
+
+    expansion = 4
+
+    def __init__(self, inplanes, planes, stride=1, downsample=None):
+        super(Bottleneck, self).__init__()
+        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
+        self.bn1 = nn.BatchNorm2d(planes)
+        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=stride,
+                               padding=1, bias=False)
+        self.bn2 = nn.BatchNorm2d(planes)
+        self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
+        self.bn3 = nn.BatchNorm2d(planes * 4)
+        self.relu = nn.ReLU(inplace=True)
+        self.downsample = downsample
+        self.stride = stride
+
+    def forward(self, x):
+        residual = x
+
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
+
+        out = self.conv2(out)
+        out = self.bn2(out)
+        out = self.relu(out)
+
+        out = self.conv3(out)
+        out = self.bn3(out)
+
+        if self.downsample is not None:
+            residual = self.downsample(x)
+
+        out += residual
+        out = self.relu(out)
+
+        return out
+
+
+class HourGlass(nn.Module):
+    def __init__(self, num_modules, depth, num_features):
+        super(HourGlass, self).__init__()
+        self.num_modules = num_modules
+        self.depth = depth
+        self.features = num_features
+
+        self._generate_network(self.depth)
+
+    def _generate_network(self, level):
+        self.add_module('b1_' + str(level), ConvBlock(self.features, self.features))
+
+        self.add_module('b2_' + str(level), ConvBlock(self.features, self.features))
+
+        if level > 1:
+            self._generate_network(level - 1)
+        else:
+            self.add_module('b2_plus_' + str(level), ConvBlock(self.features, self.features))
+
+        self.add_module('b3_' + str(level), ConvBlock(self.features, self.features))
+
+    def _forward(self, level, inp):
+        # Upper branch
+        up1 = inp
+        up1 = self._modules['b1_' + str(level)](up1)
+
+        # Lower branch
+        low1 = F.avg_pool2d(inp, 2, stride=2)
+        low1 = self._modules['b2_' + str(level)](low1)
+
+        if level > 1:
+            low2 = self._forward(level - 1, low1)
+        else:
+            low2 = low1
+            low2 = self._modules['b2_plus_' + str(level)](low2)
+
+        low3 = low2
+        low3 = self._modules['b3_' + str(level)](low3)
+
+        up2 = F.interpolate(low3, scale_factor=2, mode='nearest')
+
+        return up1 + up2
+
+    def forward(self, x):
+        return self._forward(self.depth, x)
+
+
+class FAN(nn.Module):
+
+    def __init__(self, num_modules=1):
+        super(FAN, self).__init__()
+        self.num_modules = num_modules
+
+        # Base part
+        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3)
+        self.bn1 = nn.BatchNorm2d(64)
+        self.conv2 = ConvBlock(64, 128)
+        self.conv3 = ConvBlock(128, 128)
+        self.conv4 = ConvBlock(128, 256)
+
+        # Stacking part
+        for hg_module in range(self.num_modules):
+            self.add_module('m' + str(hg_module), HourGlass(1, 4, 256))
+            self.add_module('top_m_' + str(hg_module), ConvBlock(256, 256))
+            self.add_module('conv_last' + str(hg_module),
+                            nn.Conv2d(256, 256, kernel_size=1, stride=1, padding=0))
+            self.add_module('bn_end' + str(hg_module), nn.BatchNorm2d(256))
+            self.add_module('l' + str(hg_module), nn.Conv2d(256,
+                                                            68, kernel_size=1, stride=1, padding=0))
+
+            if hg_module < self.num_modules - 1:
+                self.add_module(
+                    'bl' + str(hg_module), nn.Conv2d(256, 256, kernel_size=1, stride=1, padding=0))
+                self.add_module('al' + str(hg_module), nn.Conv2d(68,
+                                                                 256, kernel_size=1, stride=1, padding=0))
+
+    def forward(self, x):
+        x = F.relu(self.bn1(self.conv1(x)), True)
+        x = F.avg_pool2d(self.conv2(x), 2, stride=2)
+        x = self.conv3(x)
+        x = self.conv4(x)
+
+        previous = x
+
+        outputs = []
+        for i in range(self.num_modules):
+            hg = self._modules['m' + str(i)](previous)
+
+            ll = hg
+            ll = self._modules['top_m_' + str(i)](ll)
+
+            ll = F.relu(self._modules['bn_end' + str(i)]
+                        (self._modules['conv_last' + str(i)](ll)), True)
+
+            # Predict heatmaps
+            tmp_out = self._modules['l' + str(i)](ll)
+            outputs.append(tmp_out)
+
+            if i < self.num_modules - 1:
+                ll = self._modules['bl' + str(i)](ll)
+                tmp_out_ = self._modules['al' + str(i)](tmp_out)
+                previous = previous + ll + tmp_out_
+
+        return outputs
+
+
+class ResNetDepth(nn.Module):
+
+    def __init__(self, block=Bottleneck, layers=[3, 8, 36, 3], num_classes=68):
+        self.inplanes = 64
+        super(ResNetDepth, self).__init__()
+        self.conv1 = nn.Conv2d(3 + 68, 64, kernel_size=7, stride=2, padding=3,
+                               bias=False)
+        self.bn1 = nn.BatchNorm2d(64)
+        self.relu = nn.ReLU(inplace=True)
+        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+        self.layer1 = self._make_layer(block, 64, layers[0])
+        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
+        self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
+        self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
+        self.avgpool = nn.AvgPool2d(7)
+        self.fc = nn.Linear(512 * block.expansion, num_classes)
+
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+                m.weight.data.normal_(0, math.sqrt(2. / n))
+            elif isinstance(m, nn.BatchNorm2d):
+                m.weight.data.fill_(1)
+                m.bias.data.zero_()
+
+    def _make_layer(self, block, planes, blocks, stride=1):
+        downsample = None
+        if stride != 1 or self.inplanes != planes * block.expansion:
+            downsample = nn.Sequential(
+                nn.Conv2d(self.inplanes, planes * block.expansion,
+                          kernel_size=1, stride=stride, bias=False),
+                nn.BatchNorm2d(planes * block.expansion),
+            )
+
+        layers = []
+        layers.append(block(self.inplanes, planes, stride, downsample))
+        self.inplanes = planes * block.expansion
+        for i in range(1, blocks):
+            layers.append(block(self.inplanes, planes))
+
+        return nn.Sequential(*layers)
+
+    def forward(self, x):
+        x = self.conv1(x)
+        x = self.bn1(x)
+        x = self.relu(x)
+        x = self.maxpool(x)
+
+        x = self.layer1(x)
+        x = self.layer2(x)
+        x = self.layer3(x)
+        x = self.layer4(x)
+
+        x = self.avgpool(x)
+        x = x.view(x.size(0), -1)
+        x = self.fc(x)
+
+        return x
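With a single stacked module, FAN turns a 256x256 face crop into a list containing one 68-channel heatmap tensor at quarter resolution: the stride-2 stem halves 256 to 128 and the average pool halves it again to 64, which the hourglass preserves. A minimal shape-check sketch, assuming models.py is importable as face_detection.models:

import torch
from face_detection.models import FAN

net = FAN(num_modules=1).eval()
with torch.no_grad():
    out = net(torch.randn(1, 3, 256, 256))
print(len(out), out[0].shape)  # 1 torch.Size([1, 68, 64, 64])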
face_detection/utils.py
ADDED
@@ -0,0 +1,313 @@
+from __future__ import print_function
+import os
+import sys
+import time
+import torch
+import math
+import numpy as np
+import cv2
+
+
+def _gaussian(
+        size=3, sigma=0.25, amplitude=1, normalize=False, width=None,
+        height=None, sigma_horz=None, sigma_vert=None, mean_horz=0.5,
+        mean_vert=0.5):
+    # handle some defaults
+    if width is None:
+        width = size
+    if height is None:
+        height = size
+    if sigma_horz is None:
+        sigma_horz = sigma
+    if sigma_vert is None:
+        sigma_vert = sigma
+    center_x = mean_horz * width + 0.5
+    center_y = mean_vert * height + 0.5
+    gauss = np.empty((height, width), dtype=np.float32)
+    # generate kernel
+    for i in range(height):
+        for j in range(width):
+            gauss[i][j] = amplitude * math.exp(-(math.pow((j + 1 - center_x) / (
+                sigma_horz * width), 2) / 2.0 + math.pow((i + 1 - center_y) / (sigma_vert * height), 2) / 2.0))
+    if normalize:
+        gauss = gauss / np.sum(gauss)
+    return gauss
+
+
+def draw_gaussian(image, point, sigma):
+    # Check if the gaussian is inside
+    ul = [math.floor(point[0] - 3 * sigma), math.floor(point[1] - 3 * sigma)]
+    br = [math.floor(point[0] + 3 * sigma), math.floor(point[1] + 3 * sigma)]
+    if (ul[0] > image.shape[1] or ul[1] > image.shape[0] or br[0] < 1 or br[1] < 1):
+        return image
+    size = 6 * sigma + 1
+    g = _gaussian(size)
+    g_x = [int(max(1, -ul[0])), int(min(br[0], image.shape[1])) - int(max(1, ul[0])) + int(max(1, -ul[0]))]
+    g_y = [int(max(1, -ul[1])), int(min(br[1], image.shape[0])) - int(max(1, ul[1])) + int(max(1, -ul[1]))]
+    img_x = [int(max(1, ul[0])), int(min(br[0], image.shape[1]))]
+    img_y = [int(max(1, ul[1])), int(min(br[1], image.shape[0]))]
+    assert (g_x[0] > 0 and g_y[1] > 0)
+    image[img_y[0] - 1:img_y[1], img_x[0] - 1:img_x[1]
+          ] = image[img_y[0] - 1:img_y[1], img_x[0] - 1:img_x[1]] + g[g_y[0] - 1:g_y[1], g_x[0] - 1:g_x[1]]
+    image[image > 1] = 1
+    return image
+
+
+def transform(point, center, scale, resolution, invert=False):
+    """Generate an affine transformation matrix.
+
+    Given a set of points, a center, a scale and a target resolution, the
+    function generates an affine transformation matrix. If invert is ``True``
+    it will produce the inverse transformation.
+
+    Arguments:
+        point {torch.tensor} -- the input 2D point
+        center {torch.tensor or numpy.array} -- the center around which to perform the transformations
+        scale {float} -- the scale of the face/object
+        resolution {float} -- the output resolution
+
+    Keyword Arguments:
+        invert {bool} -- define whether the function should produce the direct or the
+        inverse transformation matrix (default: {False})
+    """
+    _pt = torch.ones(3)
+    _pt[0] = point[0]
+    _pt[1] = point[1]
+
+    h = 200.0 * scale
+    t = torch.eye(3)
+    t[0, 0] = resolution / h
+    t[1, 1] = resolution / h
+    t[0, 2] = resolution * (-center[0] / h + 0.5)
+    t[1, 2] = resolution * (-center[1] / h + 0.5)
+
+    if invert:
+        t = torch.inverse(t)
+
+    new_point = (torch.matmul(t, _pt))[0:2]
+
+    return new_point.int()
+
+
+def crop(image, center, scale, resolution=256.0):
+    """Crop an image or set of heatmaps around the center point.
+    Input is expected to be an np.ndarray.
+
+    Arguments:
+        image {numpy.array} -- an rgb image
+        center {numpy.array} -- the center of the object, usually the same as of the bounding box
+        scale {float} -- scale of the face
+
+    Keyword Arguments:
+        resolution {float} -- the size of the output cropped image (default: {256.0})
+
+    Returns:
+        numpy.array -- the cropped and resized image
+    """
+    ul = transform([1, 1], center, scale, resolution, True)
+    br = transform([resolution, resolution], center, scale, resolution, True)
+    # pad = math.ceil(torch.norm((ul - br).float()) / 2.0 - (br[0] - ul[0]) / 2.0)
+    if image.ndim > 2:
+        newDim = np.array([br[1] - ul[1], br[0] - ul[0],
+                           image.shape[2]], dtype=np.int32)
+        newImg = np.zeros(newDim, dtype=np.uint8)
+    else:
+        newDim = np.array([br[1] - ul[1], br[0] - ul[0]], dtype=np.int32)
+        newImg = np.zeros(newDim, dtype=np.uint8)
+    ht = image.shape[0]
+    wd = image.shape[1]
+    newX = np.array(
+        [max(1, -ul[0] + 1), min(br[0], wd) - ul[0]], dtype=np.int32)
+    newY = np.array(
+        [max(1, -ul[1] + 1), min(br[1], ht) - ul[1]], dtype=np.int32)
+    oldX = np.array([max(1, ul[0] + 1), min(br[0], wd)], dtype=np.int32)
+    oldY = np.array([max(1, ul[1] + 1), min(br[1], ht)], dtype=np.int32)
+    newImg[newY[0] - 1:newY[1], newX[0] - 1:newX[1]
+           ] = image[oldY[0] - 1:oldY[1], oldX[0] - 1:oldX[1], :]
+    newImg = cv2.resize(newImg, dsize=(int(resolution), int(resolution)),
+                        interpolation=cv2.INTER_LINEAR)
+    return newImg
+
+
+def get_preds_fromhm(hm, center=None, scale=None):
+    """Obtain (x,y) coordinates given a set of N heatmaps. If the center
+    and the scale are provided, the function will return the points also in
+    the original coordinate frame.
+
+    Arguments:
+        hm {torch.tensor} -- the predicted heatmaps, of shape [B, N, W, H]
+
+    Keyword Arguments:
+        center {torch.tensor} -- the center of the bounding box (default: {None})
+        scale {float} -- face scale (default: {None})
+    """
+    max, idx = torch.max(
+        hm.view(hm.size(0), hm.size(1), hm.size(2) * hm.size(3)), 2)
+    idx += 1
+    preds = idx.view(idx.size(0), idx.size(1), 1).repeat(1, 1, 2).float()
+    preds[..., 0].apply_(lambda x: (x - 1) % hm.size(3) + 1)
+    preds[..., 1].add_(-1).div_(hm.size(2)).floor_().add_(1)
+
+    for i in range(preds.size(0)):
+        for j in range(preds.size(1)):
+            hm_ = hm[i, j, :]
+            pX, pY = int(preds[i, j, 0]) - 1, int(preds[i, j, 1]) - 1
+            if pX > 0 and pX < 63 and pY > 0 and pY < 63:
+                diff = torch.FloatTensor(
+                    [hm_[pY, pX + 1] - hm_[pY, pX - 1],
+                     hm_[pY + 1, pX] - hm_[pY - 1, pX]])
+                preds[i, j].add_(diff.sign_().mul_(.25))
+
+    preds.add_(-.5)
+
+    preds_orig = torch.zeros(preds.size())
+    if center is not None and scale is not None:
+        for i in range(hm.size(0)):
+            for j in range(hm.size(1)):
+                preds_orig[i, j] = transform(
+                    preds[i, j], center, scale, hm.size(2), True)
+
+    return preds, preds_orig
+
+def get_preds_fromhm_batch(hm, centers=None, scales=None):
+    """Obtain (x,y) coordinates given a set of N heatmaps. If the centers
+    and the scales are provided, the function will return the points also in
+    the original coordinate frame.
+
+    Arguments:
+        hm {torch.tensor} -- the predicted heatmaps, of shape [B, N, W, H]
+
+    Keyword Arguments:
+        centers {torch.tensor} -- the centers of the bounding box (default: {None})
+        scales {float} -- face scales (default: {None})
+    """
+    max, idx = torch.max(
+        hm.view(hm.size(0), hm.size(1), hm.size(2) * hm.size(3)), 2)
+    idx += 1
+    preds = idx.view(idx.size(0), idx.size(1), 1).repeat(1, 1, 2).float()
+    preds[..., 0].apply_(lambda x: (x - 1) % hm.size(3) + 1)
+    preds[..., 1].add_(-1).div_(hm.size(2)).floor_().add_(1)
+
+    for i in range(preds.size(0)):
+        for j in range(preds.size(1)):
+            hm_ = hm[i, j, :]
+            pX, pY = int(preds[i, j, 0]) - 1, int(preds[i, j, 1]) - 1
+            if pX > 0 and pX < 63 and pY > 0 and pY < 63:
+                diff = torch.FloatTensor(
+                    [hm_[pY, pX + 1] - hm_[pY, pX - 1],
+                     hm_[pY + 1, pX] - hm_[pY - 1, pX]])
+                preds[i, j].add_(diff.sign_().mul_(.25))
+
+    preds.add_(-.5)
+
+    preds_orig = torch.zeros(preds.size())
+    if centers is not None and scales is not None:
+        for i in range(hm.size(0)):
+            for j in range(hm.size(1)):
+                preds_orig[i, j] = transform(
+                    preds[i, j], centers[i], scales[i], hm.size(2), True)
+
+    return preds, preds_orig
+
+def shuffle_lr(parts, pairs=None):
+    """Shuffle the points left-right according to the axis of symmetry
+    of the object.
+
+    Arguments:
+        parts {torch.tensor} -- a 3D or 4D object containing the
+        heatmaps.
+
+    Keyword Arguments:
+        pairs {list of integers} -- [order of the flipped points] (default: {None})
+    """
+    if pairs is None:
+        pairs = [16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
+                 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 27, 28, 29, 30, 35,
+                 34, 33, 32, 31, 45, 44, 43, 42, 47, 46, 39, 38, 37, 36, 41,
+                 40, 54, 53, 52, 51, 50, 49, 48, 59, 58, 57, 56, 55, 64, 63,
+                 62, 61, 60, 67, 66, 65]
+    if parts.ndimension() == 3:
+        parts = parts[pairs, ...]
+    else:
+        parts = parts[:, pairs, ...]
+
+    return parts
+
+
+def flip(tensor, is_label=False):
+    """Flip an image or a set of heatmaps left-right
+
+    Arguments:
+        tensor {numpy.array or torch.tensor} -- [the input image or heatmaps]
+
+    Keyword Arguments:
+        is_label {bool} -- [denote whether the input is an image or a set of heatmaps] (default: {False})
+    """
+    if not torch.is_tensor(tensor):
+        tensor = torch.from_numpy(tensor)
+
+    if is_label:
+        tensor = shuffle_lr(tensor).flip(tensor.ndimension() - 1)
+    else:
+        tensor = tensor.flip(tensor.ndimension() - 1)
+
+    return tensor
+
+# From pyzolib/paths.py (https://bitbucket.org/pyzo/pyzolib/src/tip/paths.py)
+
+
+def appdata_dir(appname=None, roaming=False):
+    """ appdata_dir(appname=None, roaming=False)
+
+    Get the path to the application directory, where applications are allowed
+    to write user specific files (e.g. configurations). For non-user specific
+    data, consider using common_appdata_dir().
+    If appname is given, a subdir is appended (and created if necessary).
+    If roaming is True, will prefer a roaming directory (Windows Vista/7).
+    """
+
+    # Define default user directory
+    userDir = os.getenv('FACEALIGNMENT_USERDIR', None)
+    if userDir is None:
+        userDir = os.path.expanduser('~')
+        if not os.path.isdir(userDir):  # pragma: no cover
+            userDir = '/var/tmp'  # issue #54
+
+    # Get system app data dir
+    path = None
+    if sys.platform.startswith('win'):
+        path1, path2 = os.getenv('LOCALAPPDATA'), os.getenv('APPDATA')
+        path = (path2 or path1) if roaming else (path1 or path2)
+    elif sys.platform.startswith('darwin'):
+        path = os.path.join(userDir, 'Library', 'Application Support')
+    # On Linux and as fallback
+    if not (path and os.path.isdir(path)):
+        path = userDir
+
+    # Maybe we should store things local to the executable (in case of a
+    # portable distro or a frozen application that wants to be portable)
+    prefix = sys.prefix
+    if getattr(sys, 'frozen', None):
+        prefix = os.path.abspath(os.path.dirname(sys.executable))
+    for reldir in ('settings', '../settings'):
+        localpath = os.path.abspath(os.path.join(prefix, reldir))
+        if os.path.isdir(localpath):  # pragma: no cover
+            try:
+                open(os.path.join(localpath, 'test.write'), 'wb').close()
+                os.remove(os.path.join(localpath, 'test.write'))
+            except IOError:
+                pass  # We cannot write in this directory
+            else:
+                path = localpath
+                break
+
+    # Get path specific for this app
+    if appname:
+        if path == userDir:
+            appname = '.' + appname.lstrip('.')  # Make it a hidden directory
+        path = os.path.join(path, appname)
+        if not os.path.isdir(path):  # pragma: no cover
+            os.mkdir(path)
+
+    # Done
+    return path
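get_preds_fromhm recovers 1-indexed (x, y) peaks from each heatmap, nudges them a quarter pixel along the local gradient sign, and shifts everything by 0.5. A minimal sketch with a synthetic 64x64 heatmap whose peak sits at column 20, row 30:

import torch
from face_detection.utils import get_preds_fromhm

hm = torch.zeros(1, 1, 64, 64)
hm[0, 0, 30, 20] = 1.0           # row index is y, column index is x
preds, _ = get_preds_fromhm(hm)
print(preds[0, 0])               # tensor([20.5000, 30.5000])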
generate.py
ADDED
@@ -0,0 +1,398 @@
+''' consistent initial noise for video generation'''
+import cv2
+import os
+from os.path import join, basename, dirname, splitext
+import shutil
+import argparse
+import numpy as np
+import random
+import torch, torchvision
+import subprocess
+from audio import audio
+import face_detection
+from tqdm import tqdm
+
+from guided_diffusion import dist_util, logger
+from guided_diffusion.resample import create_named_schedule_sampler
+from guided_diffusion.script_util import (
+    tfg_model_and_diffusion_defaults,
+    tfg_create_model_and_diffusion,
+    args_to_dict,
+    add_dict_to_argparser,
+)
+
+from guided_diffusion.tfg_data_util import (
+    tfg_process_batch,
+)
+
+def get_frame_id(frame):
+    return int(basename(frame).split('.')[0])
+
+def crop_audio_window(spec, start_frame, args):
+    if type(start_frame) == int:
+        start_frame_num = start_frame
+    else:
+        start_frame_num = get_frame_id(start_frame)
+    start_idx = int(args.mel_steps_per_sec * (start_frame_num / float(args.video_fps)))
+    end_idx = start_idx + args.syncnet_mel_step_size
+    return spec[start_idx : end_idx, :]
+
+def load_all_indiv_mels(path, args):
+    in_path = path
+    out_dir = join(args.sample_path, "temp", basename(in_path).replace(".mp4", ""))
+    os.makedirs(out_dir, exist_ok=True)
+    out_path = join(out_dir, "audio.wav")
+    command2 = 'ffmpeg -loglevel error -y -i {} -strict -2 {}'.format(in_path, out_path)
+    subprocess.call(command2, shell=True)
+    wav = audio.load_wav(out_path, args.sample_rate)
+    orig_mel = audio.melspectrogram(wav).T
+
+    all_indiv_mels = []
+    # i=0
+    i = 1
+    while True:
+        m = crop_audio_window(orig_mel.copy(), max(i - args.syncnet_T//2, 0), args)
+        if (m.shape[0] != args.syncnet_mel_step_size):
+            break
+        all_indiv_mels.append(m.T)
+        i += 1
+
+    # clean up
+    shutil.rmtree(join(args.sample_path, "temp"))
+
+    return all_indiv_mels, wav
+
+def load_video_frames(path, args):
+    in_path = path
+    out_dir = join(args.sample_path, "temp", basename(in_path).replace(".mp4", ""), "image")
+    os.makedirs(out_dir, exist_ok=True)
+
+    command = "ffmpeg -loglevel error -y -i {} -vf fps={} -q:v 2 -qmin 1 {}/%05d.jpg".format(in_path, args.video_fps, out_dir)
+    subprocess.call(command, shell=True)
+
+    video_frames = []
+    for i, img_name in enumerate(sorted(os.listdir(out_dir))):
+        img_path = join(out_dir, img_name)
+        img = cv2.cvtColor(cv2.imread(img_path), cv2.COLOR_BGR2RGB)
+        video_frames.append(img)
+
+    # clean up
+    shutil.rmtree(join(args.sample_path, "temp"))
+
+    return video_frames
+
+
+def get_smoothened_boxes(boxes, T):
+    for i in range(len(boxes)):
+        if i + T > len(boxes):
+            window = boxes[len(boxes) - T:]
+        else:
+            window = boxes[i : i + T]
+        boxes[i] = np.mean(window, axis=0)
+    return boxes
+
+def my_voxceleb2_crop(img):
+    return img[:-int(img.shape[0]*2.36/8), int(img.shape[1]*1.8/8): -int(img.shape[1]*1.8/8)]
+
+def my_voxceleb2_crop_bboxs(img):
+    return 0, img.shape[0]-int(img.shape[0]*2.36/8), int(img.shape[1]*1.8/8), img.shape[1]-int(img.shape[1]*1.8/8)
+
+def face_detect(images, detector, args, resize=False):
+    batch_size = args.face_det_batch_size
+
+    while 1:
+        predictions = []
+        try:
+            for i in range(0, len(images), batch_size):
+                predictions.extend(detector.get_detections_for_batch(np.array(images[i:i + batch_size])))
+        except RuntimeError:
+            if batch_size == 1:
+                raise RuntimeError('Image too big to run face detection on GPU')
+            batch_size //= 2
+            args.face_det_batch_size = batch_size
+            print('Recovering from OOM error; New batch size: {}'.format(batch_size))
+            continue
+        break
+
+    results = []
+    if type(args.pads) == str:
+        args.pads = [int(x) for x in args.pads.split(",")]
+    pady1, pady2, padx1, padx2 = args.pads
+    for rect, image in zip(predictions, images):
+        if rect is None:
+            raise ValueError('Face not detected!')
+
+        y1 = max(0, rect[1] - pady1)
+        y2 = min(image.shape[0], rect[3] + pady2)
+        x1 = max(0, rect[0] - padx1)
+        x2 = min(image.shape[1], rect[2] + padx2)
+
+        results.append([x1, y1, x2, y2])
+
+    boxes = get_smoothened_boxes(np.array(results), T=5)
+
+    if resize:
+        if args.is_voxceleb2:
+            results = [[cv2.resize(my_voxceleb2_crop(image), (args.image_size, args.image_size)), my_voxceleb2_crop_bboxs(image), True] for image, (x1, y1, x2, y2) in zip(images, boxes)]
+        else:
+            results = [[cv2.resize(image[y1: y2, x1:x2], (args.image_size, args.image_size)), (y1, y2, x1, x2), True] for image, (x1, y1, x2, y2) in zip(images, boxes)]
+    else:
+        results = [[image[y1: y2, x1:x2], (y1, y2, x1, x2), True] for image, (x1, y1, x2, y2) in zip(images, boxes)]
+    return results
+
+def normalise(tensor):
+    """ [-1,1]->[0,1]"""
+    return ((tensor+1)*0.5).clamp(0, 1)
+
+def normalise2(tensor):
+    """ [0,1]->[-1,1]"""
+    return (tensor*2-1).clamp(-1, 1)
+
+
+def sample_batch(batch, model, diffusion, args):
+    B, F, C, H, W = batch['image'].shape
+    sample_shape = (B*F, C, H, W)
+
+    # generate fixed noise
+    init_noise = None
+    if args.sampling_seed:
+        state = torch.get_rng_state()
+        torch.manual_seed(args.sampling_seed)
+        torch.cuda.manual_seed_all(args.sampling_seed)
+        init_noise = torch.randn((1, C, H, W))
+        # repeat noise for all frames
+        init_noise = init_noise.repeat(B*F, 1, 1, 1)
+        torch.set_rng_state(state)
+
+    img_batch, model_kwargs = tfg_process_batch(batch, args.face_hide_percentage,
+                                                use_ref=args.use_ref,
+                                                use_audio=args.use_audio,
+                                                # sampling_use_gt_for_ref=args.sampling_use_gt_for_ref,
+                                                noise=init_noise)
+
+    img_batch = img_batch.to(dist_util.dev())
+    model_kwargs = {k: v.to(dist_util.dev()) for k, v in model_kwargs.items()}
+    init_noise = init_noise.to(dist_util.dev()) if init_noise is not None else None
+
+    sample_fn = (
+        diffusion.p_sample_loop if not args.use_ddim else diffusion.ddim_sample_loop
+    )
+    sample = sample_fn(
+        model,
+        sample_shape,
+        clip_denoised=args.clip_denoised,
+        model_kwargs=model_kwargs,
+        noise=init_noise
+    )
+    return sample, img_batch, model_kwargs
+
+
+def generate(video_path, audio_path, model, diffusion, detector, args, out_path=None, save_orig=True):
+    video_frames = load_video_frames(video_path, args)
+    try:
+        face_det_results = face_detect(video_frames.copy(), detector, args, resize=True)
+    except Exception as e:
+        print("Error:", e, video_path, audio_path)
+        import traceback
+        print(traceback.format_exc())
+        raise  # face_det_results would be undefined below, so re-raise after logging
+    wrong_all_indiv_mels, wrong_audio_wavform = load_all_indiv_mels(audio_path, args)
+
+    min_frames = min(len(video_frames), len(wrong_all_indiv_mels))
+    video_frames = video_frames[:min_frames]
+    face_det_results = face_det_results[:min_frames]
+    face_bboxes = [face_det_results[i][1] for i in range(min_frames)]
+    face_frames = torch.FloatTensor(np.transpose(np.asarray([face_det_results[i][0] for i in range(min_frames)], dtype=np.float32)/255., (0, 3, 1, 2)))  # [N, C, H, W]
+    wrong_all_indiv_mels = torch.FloatTensor(np.asarray(wrong_all_indiv_mels[:min_frames])).unsqueeze(1)  # [N, 1, h, w]
+
+    if save_orig:
+        if out_path is None:
+            out_path_orig = os.path.join(args.sample_path, splitext(basename(video_path))[0]+"_"+splitext(basename(audio_path))[0]+"_orig.mp4")
+        else:
+            out_path_orig = out_path.replace(".mp4", "_orig.mp4")
+        torchvision.io.write_video(
+            out_path_orig,
+            video_array=torch.from_numpy(np.array(video_frames)), fps=args.video_fps, video_codec='libx264',
+            audio_array=torch.from_numpy(wrong_audio_wavform).unsqueeze(0), audio_fps=args.sample_rate, audio_codec='aac'
+        )
+
+    if args.sampling_ref_type == 'gt':
+        ref_frames = face_frames.clone()
+    elif args.sampling_ref_type == 'first_frame':
+        ref_frames = face_frames[0:1].repeat(len(face_frames), 1, 1, 1)
+    elif args.sampling_ref_type == 'random':
+        rand_idx = random.Random(args.sampling_seed).randint(0, len(face_frames)-1)
+        ref_frames = face_frames[rand_idx:rand_idx+1].repeat(len(face_frames), 1, 1, 1)
+
+    if args.sampling_input_type == 'first_frame':
+        face_frames = face_frames[0:1].repeat(len(face_frames), 1, 1, 1)
+        video_frames = np.array(video_frames[0:1]*len(video_frames))
+        face_bboxes = np.array(face_bboxes[0:1]*len(face_bboxes))
+
+    generated_video_frames = []
+    b_s = args.sampling_batch_size
+    for i in range(0, min_frames, b_s*args.nframes):
+        video_frames_batch = video_frames[i:i+b_s*args.nframes]
+        face_bboxes_batch = face_bboxes[i:i+b_s*args.nframes]
+
+        try:
+            img_batch = face_frames[i:i+b_s*args.nframes]  # [BF, C, H, W]
+            img_batch = img_batch.reshape(-1, args.nframes, img_batch.size(-3), img_batch.size(-2), img_batch.size(-1))
+            ref_batch = ref_frames[i:i+b_s*args.nframes]
+            ref_batch = ref_batch.reshape(-1, args.nframes, ref_batch.size(-3), ref_batch.size(-2), ref_batch.size(-1))
+            wrong_indiv_mel_batch = wrong_all_indiv_mels[i:i+b_s*args.nframes]  # [BF, 1, h, w]
+            wrong_indiv_mel_batch = wrong_indiv_mel_batch.reshape(-1, args.nframes, wrong_indiv_mel_batch.size(-3), wrong_indiv_mel_batch.size(-2), wrong_indiv_mel_batch.size(-1))
+        except Exception:  # for the last batch: if B*F % nframes != 0, the reshape above throws an error,
+            # but internally everything is going to get converted to BF anyway,
+            # i.e. (B, F, C, H, W) -> (B*F, C, H, W) and (B*F, 1, C, H, W) -> (B*F, C, H, W)
+            img_batch = face_frames[i:i+b_s*args.nframes]  # [BF, C, H, W]
+            img_batch = img_batch.reshape(-1, 1, img_batch.size(-3), img_batch.size(-2), img_batch.size(-1))
+            ref_batch = ref_frames[i:i+b_s*args.nframes]
+            ref_batch = ref_batch.reshape(-1, 1, ref_batch.size(-3), ref_batch.size(-2), ref_batch.size(-1))
+            wrong_indiv_mel_batch = wrong_all_indiv_mels[i:i+b_s*args.nframes]  # [BF, 1, h, w]
+            wrong_indiv_mel_batch = wrong_indiv_mel_batch.reshape(-1, 1, wrong_indiv_mel_batch.size(-3), wrong_indiv_mel_batch.size(-2), wrong_indiv_mel_batch.size(-1))
+
+        batch = {"image": img_batch,
+                 "ref_img": ref_batch,
+                 "indiv_mels": wrong_indiv_mel_batch}
+
+        sample, img_batch, model_kwargs = sample_batch(batch, model, diffusion, args)
+        mask = model_kwargs['mask']
+        recon_batch = sample * mask + (1. - mask)*img_batch  # [BF, C, H, W]
+        recon_batch = (normalise(recon_batch)*255).cpu().numpy().transpose(0, 2, 3, 1)  # [-1,1] -> [0,255]
+
+        for g, v, b in zip(recon_batch, video_frames_batch, face_bboxes_batch):
+            y1, y2, x1, x2 = b
+            g = cv2.resize(g.astype(np.uint8), (x2 - x1, y2 - y1))
+            v[y1:y2, x1:x2] = g
+            generated_video_frames.append(v)
+
+    print(wrong_audio_wavform.shape, np.array(generated_video_frames).shape)
+    min_time = len(generated_video_frames)/args.video_fps  # the video is already shorter because it got chopped according to the mel array length
+    wrong_audio_wavform = wrong_audio_wavform[:int(min_time*args.sample_rate)]
+    print(wrong_audio_wavform.shape, np.array(generated_video_frames).shape)
+    if out_path is None:
+        out_path = os.path.join(args.sample_path, splitext(basename(video_path))[0]+"_"+splitext(basename(audio_path))[0]+".mp4")
+    torchvision.io.write_video(
+        out_path,
+        video_array=torch.from_numpy(np.array(generated_video_frames)), fps=args.video_fps, video_codec='libx264',
+        audio_array=torch.from_numpy(wrong_audio_wavform).unsqueeze(0), audio_fps=args.sample_rate, audio_codec='aac'
+    )
+
+
+def generate_from_filelist(test_video_dir, filelist, model, diffusion, detector, args):
+    video_names = []
+    audio_names = []
+    with open(filelist, "r") as f:
+        lines = f.readlines()
+    for line in tqdm(lines):
+        try:
+            audio_name, video_name = line.strip().split()
+            audio_path = join(test_video_dir, audio_name+'.mp4')
+            video_path = join(test_video_dir, video_name+'.mp4')
+            out_path = join(args.sample_path, audio_name.replace('/', '.')+"_"+video_name.replace('/', '.')+".mp4")
+            generate(video_path, audio_path, model, diffusion, detector, args, out_path=out_path, save_orig=args.save_orig)
+        except Exception as e:
+            print("Error:", e, video_path, audio_path)
+            import traceback
+            print(traceback.format_exc())
+
+
+def main():
+    args = create_argparser().parse_args()
+    dist_util.setup_dist()
+    logger.configure(dir=args.sample_path, format_strs=["stdout", "log"])
+
+    logger.log("creating model...")
+    model, diffusion = tfg_create_model_and_diffusion(
+        **args_to_dict(args, tfg_model_and_diffusion_defaults().keys())
+    )
+    model.load_state_dict(
+        dist_util.load_state_dict(args.model_path, map_location='cpu')
+    )
+    model.to(dist_util.dev())
+    if args.use_fp16:
+        model.convert_to_fp16()
+    model.eval()
+
+    detector = face_detection.FaceAlignment(face_detection.LandmarksType._2D, flip_input=False, device='cuda' if torch.cuda.is_available() else 'cpu')
+
+    if args.generate_from_filelist:
+        generate_from_filelist(args.test_video_dir, args.filelist, model, diffusion, detector, args)
+    else:
+        generate(args.video_path, args.audio_path, model, diffusion, detector, args, out_path=args.out_path, save_orig=args.save_orig)
+
+
+def create_argparser():
+    defaults = dict(
+        # generate from a single audio-video pair
+        generate_from_filelist=False,
+        video_path="",
+        audio_path="",
+        out_path=None,
+        save_orig=True,
+
+        # generate from filelist: generate_from_filelist = True
+        test_video_dir="test_videos",
+        filelist="test_filelist.txt",
+
+        use_fp16=True,
+        # tfg specific
+        face_hide_percentage=0.5,
+        use_ref=False,
+        use_audio=False,
+        audio_as_style=False,
+        audio_as_style_encoder_mlp=False,
+
+        # data args
+        nframes=1,
+        nrefer=0,
+        image_size=128,
+        syncnet_T=5,
+        syncnet_mel_step_size=16,
+        audio_frames_per_video=16,  # for the tfg model, we use sound corresponding to 5 frames centred at that frame
+        audio_dim=80,
+        is_voxceleb2=True,
+
+        video_fps=25,
+        sample_rate=16000,  # audio sampling rate
+        mel_steps_per_sec=80.,
+
+        # sampling args
+        clip_denoised=True,  # not used in training
+        sampling_batch_size=2,
+        use_ddim=False,
+        model_path="",
+        sample_path="d2l_gen",
+        sample_partition="",
+        sampling_seed=None,
+        sampling_use_gt_for_ref=False,
+        sampling_ref_type='gt',  # one of ['gt', 'first_frame', 'random']
+        sampling_input_type='gt',  # one of ['gt', 'first_frame']
+
+        # face detection args
+        face_det_batch_size=64,
+        pads="0,0,0,0"
+    )
+    defaults.update(tfg_model_and_diffusion_defaults())
+    parser = argparse.ArgumentParser()
+    add_dict_to_argparser(parser, defaults)
+    return parser
+
+
+if __name__ == "__main__":
+    main()
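The point of the seeding block in sample_batch is that a single noise map is drawn once and repeated across all B*F frames, so every frame of a clip starts the reverse diffusion from identical initial noise (the module docstring's "consistent initial noise"), while the global RNG state is saved and restored around the draw. A minimal sketch of just that logic, with a hypothetical seed value:

import torch

B, F, C, H, W = 2, 5, 3, 128, 128
state = torch.get_rng_state()
torch.manual_seed(797)                                    # hypothetical sampling_seed
init_noise = torch.randn((1, C, H, W)).repeat(B * F, 1, 1, 1)
torch.set_rng_state(state)                                # leave the global RNG untouched
assert torch.equal(init_noise[0], init_noise[-1])         # every frame shares the same noise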
generate_dist.py
ADDED
@@ -0,0 +1,428 @@
"""Consistent initial noise for video generation."""
import cv2
import os
from os.path import join, basename, dirname, splitext
import shutil
import argparse
import numpy as np
import random
import torch, torchvision
import subprocess
from audio import audio
import face_detection
from tqdm import tqdm
from torch.nn.parallel.distributed import DistributedDataParallel as DDP
from guided_diffusion import dist_util, logger
from guided_diffusion.resample import create_named_schedule_sampler
from guided_diffusion.script_util import (
    tfg_model_and_diffusion_defaults,
    tfg_create_model_and_diffusion,
    args_to_dict,
    add_dict_to_argparser,
)
from time import time
import torch.distributed as dist
from guided_diffusion.tfg_data_util import (
    tfg_process_batch,
)


def get_frame_id(frame):
    return int(basename(frame).split('.')[0])


def crop_audio_window(spec, start_frame, args):
    if type(start_frame) == int:
        start_frame_num = start_frame
    else:
        start_frame_num = get_frame_id(start_frame)
    start_idx = int(args.mel_steps_per_sec * (start_frame_num / float(args.video_fps)))
    end_idx = start_idx + args.syncnet_mel_step_size
    return spec[start_idx:end_idx, :]


def load_all_indiv_mels(path, args):
    in_path = path
    out_dir = join(args.sample_path, "temp", str(dist.get_rank()), basename(in_path).replace(".mp4", ""))
    os.makedirs(out_dir, exist_ok=True)
    out_path = join(out_dir, "audio.wav")
    command2 = 'ffmpeg -loglevel error -y -i {} -strict -2 {}'.format(in_path, out_path)
    subprocess.call(command2, shell=True)
    wav = audio.load_wav(out_path, args.sample_rate)
    orig_mel = audio.melspectrogram(wav).T

    # one mel window per frame, centred on that frame, until the audio runs out
    all_indiv_mels = []
    i = 1
    while True:
        m = crop_audio_window(orig_mel.copy(), max(i - args.syncnet_T // 2, 0), args)
        if (m.shape[0] != args.syncnet_mel_step_size):
            break
        all_indiv_mels.append(m.T)
        i += 1

    # clean up
    shutil.rmtree(join(args.sample_path, "temp", str(dist.get_rank())))

    return all_indiv_mels, wav
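def _mel_window_example():
    # Illustrative only (never called): crop_audio_window maps a frame index
    # to a row range of the mel spectrogram. With the create_argparser()
    # defaults below (video_fps=25, mel_steps_per_sec=80.,
    # syncnet_mel_step_size=16), each video frame spans 80/25 = 3.2 mel steps
    # and one 16-step window covers 16/80 = 0.2 s of audio, i.e. the
    # syncnet_T = 5 frames the model is conditioned on.
    mel_steps_per_sec, video_fps, window = 80.0, 25, 16
    for frame in (0, 1, 25):
        start = int(mel_steps_per_sec * frame / video_fps)
        print(frame, (start, start + window))
    # frame 0  -> rows (0, 16); frame 1 -> rows (3, 19);
    # frame 25 -> rows (80, 96), i.e. exactly one second in.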
def load_video_frames(path, args):
    in_path = path
    out_dir = join(args.sample_path, "temp", str(dist.get_rank()), basename(in_path).replace(".mp4", ""), "image")
    os.makedirs(out_dir, exist_ok=True)

    command = "ffmpeg -loglevel error -y -i {} -vf fps={} -q:v 2 -qmin 1 {}/%05d.jpg".format(in_path, args.video_fps, out_dir)
    subprocess.call(command, shell=True)

    video_frames = []
    for img_name in sorted(os.listdir(out_dir)):
        img_path = join(out_dir, img_name)
        img = cv2.cvtColor(cv2.imread(img_path), cv2.COLOR_BGR2RGB)
        video_frames.append(img)

    # clean up
    shutil.rmtree(join(args.sample_path, "temp", str(dist.get_rank())))

    return video_frames


def get_smoothened_boxes(boxes, T):
    for i in range(len(boxes)):
        if i + T > len(boxes):
            window = boxes[len(boxes) - T:]
        else:
            window = boxes[i:i + T]
        boxes[i] = np.mean(window, axis=0)
    return boxes


def my_voxceleb2_crop(img):
    return img[:-int(img.shape[0] * 2.36 / 8), int(img.shape[1] * 1.8 / 8): -int(img.shape[1] * 1.8 / 8)]


def my_voxceleb2_crop_bboxs(img):
    return 0, img.shape[0] - int(img.shape[0] * 2.36 / 8), int(img.shape[1] * 1.8 / 8), img.shape[1] - int(img.shape[1] * 1.8 / 8)


def face_detect(images, detector, args, resize=False):
    batch_size = args.face_det_batch_size

    # halve the batch size on CUDA OOM until detection fits on the GPU
    while True:
        predictions = []
        try:
            for i in range(0, len(images), batch_size):
                predictions.extend(detector.get_detections_for_batch(np.array(images[i:i + batch_size])))
        except RuntimeError:
            if batch_size == 1:
                raise RuntimeError('Image too big to run face detection on GPU')
            batch_size //= 2
            args.face_det_batch_size = batch_size
            print('Recovering from OOM error; new batch size: {}'.format(batch_size))
            continue
        break

    results = []
    if type(args.pads) == str:
        args.pads = [int(x) for x in args.pads.split(",")]
    pady1, pady2, padx1, padx2 = args.pads
    for rect, image in zip(predictions, images):
        if rect is None:
            raise ValueError('Face not detected!')

        y1 = max(0, rect[1] - pady1)
        y2 = min(image.shape[0], rect[3] + pady2)
        x1 = max(0, rect[0] - padx1)
        x2 = min(image.shape[1], rect[2] + padx2)

        results.append([x1, y1, x2, y2])

    boxes = get_smoothened_boxes(np.array(results), T=5)

    if resize:
        if args.is_voxceleb2:
            results = [[cv2.resize(my_voxceleb2_crop(image), (args.image_size, args.image_size)), my_voxceleb2_crop_bboxs(image), True] for image, (x1, y1, x2, y2) in zip(images, boxes)]
        else:
            results = [[cv2.resize(image[y1:y2, x1:x2], (args.image_size, args.image_size)), (y1, y2, x1, x2), True] for image, (x1, y1, x2, y2) in zip(images, boxes)]
    else:
        results = [[image[y1:y2, x1:x2], (y1, y2, x1, x2), True] for image, (x1, y1, x2, y2) in zip(images, boxes)]
    return results


def normalise(tensor):
    """[-1,1] -> [0,1]"""
    return ((tensor + 1) * 0.5).clamp(0, 1)


def normalise2(tensor):
    """[0,1] -> [-1,1]"""
    return (tensor * 2 - 1).clamp(-1, 1)


def sample_batch(batch, model, diffusion, args):
    B, F, C, H, W = batch['image'].shape
    sample_shape = (B * F, C, H, W)

    # generate fixed noise
    init_noise = None
    if args.sampling_seed:
        state = torch.get_rng_state()
        torch.manual_seed(args.sampling_seed)
        torch.cuda.manual_seed_all(args.sampling_seed)
        init_noise = torch.randn((1, C, H, W))
        # repeat the same noise map for all frames
        init_noise = init_noise.repeat(B * F, 1, 1, 1)
        torch.set_rng_state(state)

    img_batch, model_kwargs = tfg_process_batch(batch, args.face_hide_percentage,
                                                use_ref=args.use_ref,
                                                use_audio=args.use_audio,
                                                noise=init_noise)

    img_batch = img_batch.to(dist_util.dev())
    model_kwargs = {k: v.to(dist_util.dev()) for k, v in model_kwargs.items()}
    init_noise = init_noise.to(dist_util.dev()) if init_noise is not None else None

    sample_fn = (
        diffusion.p_sample_loop if not args.use_ddim else diffusion.ddim_sample_loop
    )
    sample = sample_fn(
        model,
        sample_shape,
        clip_denoised=args.clip_denoised,
        model_kwargs=model_kwargs,
        noise=init_noise,
    )
    return sample, img_batch, model_kwargs
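def _consistent_noise_example():
    # Illustrative only (never called): this is the "consistent initial noise"
    # trick used in sample_batch above. One noise map is drawn under a fixed
    # seed, repeated for every frame, and the global RNG state is restored
    # afterwards, so all frames start reverse diffusion from the same x_T,
    # which reduces frame-to-frame flicker without making the rest of
    # sampling deterministic.
    state = torch.get_rng_state()
    torch.manual_seed(7)  # illustrative seed
    x_T = torch.randn((1, 3, 128, 128)).repeat(5, 1, 1, 1)
    torch.set_rng_state(state)
    assert torch.equal(x_T[0], x_T[4])  # every frame shares the starting noise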
def generate(video_path, audio_path, model, diffusion, detector, args, out_path=None, save_orig=True):
    video_frames = load_video_frames(video_path, args)
    try:
        face_det_results = face_detect(video_frames.copy(), detector, args, resize=True)
    except Exception as e:
        print("Error:", e, video_path, audio_path)
        import traceback
        print(traceback.format_exc())
        raise  # without face detections the rest of this function cannot run
    wrong_all_indiv_mels, wrong_audio_wavform = load_all_indiv_mels(audio_path, args)

    min_frames = min(len(video_frames), len(wrong_all_indiv_mels))
    video_frames = video_frames[:min_frames]
    face_det_results = face_det_results[:min_frames]
    face_bboxes = [face_det_results[i][1] for i in range(min_frames)]
    face_frames = torch.FloatTensor(np.transpose(np.asarray([face_det_results[i][0] for i in range(min_frames)], dtype=np.float32) / 255., (0, 3, 1, 2)))  # [N, C, H, W]
    wrong_all_indiv_mels = torch.FloatTensor(np.asarray(wrong_all_indiv_mels[:min_frames])).unsqueeze(1)  # [N, 1, h, w]

    if save_orig:
        if out_path is None:
            out_path_orig = os.path.join(args.sample_path, splitext(basename(video_path))[0] + "_" + splitext(basename(audio_path))[0] + "_orig.mp4")
        else:
            out_path_orig = out_path.replace(".mp4", "_orig.mp4")
        torchvision.io.write_video(
            out_path_orig,
            video_array=torch.from_numpy(np.array(video_frames)), fps=args.video_fps, video_codec='libx264',
            audio_array=torch.from_numpy(wrong_audio_wavform).unsqueeze(0), audio_fps=args.sample_rate, audio_codec='aac'
        )

    if args.sampling_ref_type == 'gt':
        ref_frames = face_frames.clone()
    elif args.sampling_ref_type == 'first_frame':
        ref_frames = face_frames[0:1].repeat(len(face_frames), 1, 1, 1)
    elif args.sampling_ref_type == 'random':
        rand_idx = random.Random(args.sampling_seed).randint(0, len(face_frames) - 1)
        ref_frames = face_frames[rand_idx:rand_idx + 1].repeat(len(face_frames), 1, 1, 1)

    if args.sampling_input_type == 'first_frame':
        face_frames = face_frames[0:1].repeat(len(face_frames), 1, 1, 1)
        video_frames = np.array(video_frames[0:1] * len(video_frames))
        face_bboxes = np.array(face_bboxes[0:1] * len(face_bboxes))

    # split the frames into contiguous chunks, one per rank
    rank = dist.get_rank()
    world_size = dist.get_world_size()
    chunk_size = int(np.ceil(min_frames / world_size))
    start_idx = rank * chunk_size
    end_idx = min(start_idx + chunk_size, min_frames)
    generated_video_frames = []
    b_s = args.sampling_batch_size

    dist.barrier()
    torch.cuda.synchronize()
    t1 = time()
    for i in range(start_idx, end_idx, b_s * args.nframes):
        slice_end = min(i + b_s * args.nframes, end_idx)
        video_frames_batch = video_frames[i:slice_end]
        face_bboxes_batch = face_bboxes[i:slice_end]

        if (slice_end - i) % args.nframes == 0:
            img_batch = face_frames[i:slice_end]  # [BF, C, H, W]
            img_batch = img_batch.reshape(-1, args.nframes, img_batch.size(-3), img_batch.size(-2), img_batch.size(-1))
            ref_batch = ref_frames[i:slice_end]
            ref_batch = ref_batch.reshape(-1, args.nframes, ref_batch.size(-3), ref_batch.size(-2), ref_batch.size(-1))
            wrong_indiv_mel_batch = wrong_all_indiv_mels[i:slice_end]  # [BF, 1, h, w]
            wrong_indiv_mel_batch = wrong_indiv_mel_batch.reshape(-1, args.nframes, wrong_indiv_mel_batch.size(-3), wrong_indiv_mel_batch.size(-2), wrong_indiv_mel_batch.size(-1))
        else:
            # For the last batch, if B*F % nframes != 0, the reshape above would
            # fail. Internally everything gets flattened back to BF anyway,
            # i.e. (B, F, C, H, W) -> (B*F, C, H, W) and
            # (B*F, 1, C, H, W) -> (B*F, C, H, W).
            img_batch = face_frames[i:slice_end]  # [BF, C, H, W]
            img_batch = img_batch.reshape(-1, 1, img_batch.size(-3), img_batch.size(-2), img_batch.size(-1))
            ref_batch = ref_frames[i:slice_end]
            ref_batch = ref_batch.reshape(-1, 1, ref_batch.size(-3), ref_batch.size(-2), ref_batch.size(-1))
            wrong_indiv_mel_batch = wrong_all_indiv_mels[i:slice_end]  # [BF, 1, h, w]
            wrong_indiv_mel_batch = wrong_indiv_mel_batch.reshape(-1, 1, wrong_indiv_mel_batch.size(-3), wrong_indiv_mel_batch.size(-2), wrong_indiv_mel_batch.size(-1))

        batch = {"image": img_batch,
                 "ref_img": ref_batch,
                 "indiv_mels": wrong_indiv_mel_batch}

        sample, img_batch, model_kwargs = sample_batch(batch, model, diffusion, args)
        mask = model_kwargs['mask']
        recon_batch = sample * mask + (1. - mask) * img_batch  # [BF, C, H, W]
        recon_batch = (normalise(recon_batch) * 255).cpu().numpy().transpose(0, 2, 3, 1)  # [-1,1] -> [0,255]

        # paste each generated face crop back into its full frame
        for g, v, b in zip(recon_batch, video_frames_batch, face_bboxes_batch):
            y1, y2, x1, x2 = b
            g = cv2.resize(g.astype(np.uint8), (x2 - x1, y2 - y1))
            v[y1:y2, x1:x2] = g
            generated_video_frames.append(v)

    torch.cuda.synchronize()
    t3 = time()
    all_generated_video_frames = [None for _ in range(dist.get_world_size())]
    dist.all_gather_object(all_generated_video_frames, generated_video_frames)  # gather not supported with NCCL
    all_generated_video_frames_combined = []
    [all_generated_video_frames_combined.extend(gvf) for gvf in all_generated_video_frames]
    generated_video_frames = all_generated_video_frames_combined

    torch.cuda.synchronize()
    t2 = time()

    if dist.get_rank() == 0:
        print("Time taken for sampling:", t2 - t1,
              "| without all_gather:", t3 - t1,
              "| gathered frames:", len(generated_video_frames),
              "| total frames:", min_frames)
        print(wrong_audio_wavform.shape, np.array(generated_video_frames).shape)
        # The video may be shorter than the audio because it was chopped
        # according to the mel array length, so trim the waveform to match.
        min_time = len(generated_video_frames) / args.video_fps
        wrong_audio_wavform = wrong_audio_wavform[:int(min_time * args.sample_rate)]
        print(wrong_audio_wavform.shape, np.array(generated_video_frames).shape)
        if out_path is None:
            out_path = os.path.join(args.sample_path, splitext(basename(video_path))[0] + "_" + splitext(basename(audio_path))[0] + ".mp4")
        torchvision.io.write_video(
            out_path,
            video_array=torch.from_numpy(np.array(generated_video_frames)), fps=args.video_fps, video_codec='libx264',
            audio_array=torch.from_numpy(wrong_audio_wavform).unsqueeze(0), audio_fps=args.sample_rate, audio_codec='aac'
        )
    dist.barrier()


def generate_from_filelist(test_video_dir, filelist, model, diffusion, detector, args):
    with open(filelist, "r") as f:
        lines = f.readlines()
    for line in tqdm(lines):
        try:
            audio_name, video_name = line.strip().split()
            audio_path = join(test_video_dir, audio_name + '.mp4')
            video_path = join(test_video_dir, video_name + '.mp4')
            out_path = join(args.sample_path, audio_name.replace('/', '.') + "_" + video_name.replace('/', '.') + ".mp4")
            generate(video_path, audio_path, model, diffusion, detector, args, out_path=out_path, save_orig=args.save_orig)
        except Exception as e:
            print("Error:", e, video_path, audio_path)
            import traceback
            print(traceback.format_exc())


def main():
    args = create_argparser().parse_args()
    dist_util.setup_dist()
    logger.configure(dir=args.sample_path, format_strs=["stdout", "log"])

    logger.log("creating model...")
    model, diffusion = tfg_create_model_and_diffusion(
        **args_to_dict(args, tfg_model_and_diffusion_defaults().keys())
    )
    model.load_state_dict(
        dist_util.load_state_dict(args.model_path, map_location='cpu')
    )
    model.to(dist_util.dev())
    if args.use_fp16:
        model.convert_to_fp16()
    model.eval()

    detector = face_detection.FaceAlignment(face_detection.LandmarksType._2D, flip_input=False, device='cuda' if torch.cuda.is_available() else 'cpu')

    if args.generate_from_filelist:
        generate_from_filelist(args.test_video_dir, args.filelist, model, diffusion, detector, args)
    else:
        generate(args.video_path, args.audio_path, model, diffusion, detector, args, out_path=args.out_path, save_orig=args.save_orig)


def create_argparser():
    defaults = dict(
        # generate from a single audio-video pair
        generate_from_filelist=False,
        video_path="",
        audio_path="",
        out_path=None,
        save_orig=True,

        # generate from a filelist: generate_from_filelist = True
        test_video_dir="test_videos",
        filelist="test_filelist.txt",

        use_fp16=True,
        # tfg specific
        face_hide_percentage=0.5,
        use_ref=False,
        use_audio=False,
        audio_as_style=False,
        audio_as_style_encoder_mlp=False,

        # data args
        nframes=1,
        nrefer=0,
        image_size=128,
        syncnet_T=5,
        syncnet_mel_step_size=16,
        audio_frames_per_video=16,  # for the tfg model, we use sound corresponding to 5 frames centred at that frame
        audio_dim=80,
        is_voxceleb2=True,

        video_fps=25,
        sample_rate=16000,  # audio sampling rate
        mel_steps_per_sec=80.,

        # sampling args
        clip_denoised=True,  # not used in training
        sampling_batch_size=2,
        use_ddim=False,
        model_path="",
        sample_path="d2l_gen",
        sample_partition="",
        sampling_seed=None,
        sampling_use_gt_for_ref=False,
        sampling_ref_type='gt',  # one of ['gt', 'first_frame', 'random']
        sampling_input_type='gt',  # one of ['gt', 'first_frame']

        # face detection args
        face_det_batch_size=64,
        pads="0,0,0,0",
    )
    defaults.update(tfg_model_and_diffusion_defaults())
    parser = argparse.ArgumentParser()
    add_dict_to_argparser(parser, defaults)
    return parser


if __name__ == "__main__":
    main()
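Because dist_util.setup_dist() (added below) takes rank and world size from MPI and pins each rank to GPU (rank % GPUS_PER_NODE), this script is meant to run with one MPI process per GPU. A launch sketch; the command shape and paths are illustrative only, not verified invocations:

    # mpiexec -n 2 python generate_dist.py \
    #     --model_path checkpoints/checkpoint.pt \
    #     --video_path identity.mp4 --audio_path speech.mp4 \
    #     --sample_path d2l_gen --sampling_seed 7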
guided-diffusion/LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2021 OpenAI

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
guided-diffusion/guided_diffusion/__init__.py
ADDED
@@ -0,0 +1,3 @@
"""
Codebase for "Improved Denoising Diffusion Probabilistic Models".
"""
guided-diffusion/guided_diffusion/dist_util.py
ADDED
@@ -0,0 +1,94 @@
"""
Helpers for distributed training.
"""

import io
import os
import socket

import blobfile as bf
from mpi4py import MPI
import torch as th
import torch.distributed as dist

# Change this to reflect your cluster layout.
# The GPU for a given rank is (rank % GPUS_PER_NODE).
GPUS_PER_NODE = 8

SETUP_RETRY_COUNT = 3


def setup_dist():
    """
    Set up a distributed process group.
    """
    if dist.is_initialized():
        return
    print("MPI.COMM_WORLD.Get_rank()", MPI.COMM_WORLD.Get_rank())
    os.environ["CUDA_VISIBLE_DEVICES"] = f"{MPI.COMM_WORLD.Get_rank() % GPUS_PER_NODE}"
    print('os.environ["CUDA_VISIBLE_DEVICES"]', os.environ["CUDA_VISIBLE_DEVICES"])
    comm = MPI.COMM_WORLD
    backend = "gloo" if not th.cuda.is_available() else "nccl"

    if backend == "gloo":
        hostname = "localhost"
    else:
        hostname = socket.gethostbyname(socket.getfqdn())
    os.environ["MASTER_ADDR"] = comm.bcast(hostname, root=0)
    os.environ["RANK"] = str(comm.rank)
    os.environ["WORLD_SIZE"] = str(comm.size)

    port = comm.bcast(_find_free_port(), root=0)
    os.environ["MASTER_PORT"] = str(port)
    dist.init_process_group(backend=backend, init_method="env://")


def dev():
    """
    Get the device to use for torch.distributed.
    """
    if th.cuda.is_available():
        return th.device("cuda")
    return th.device("cpu")


def load_state_dict(path, **kwargs):
    """
    Load a PyTorch file without redundant fetches across MPI ranks.
    """
    chunk_size = 2 ** 30  # MPI has a relatively small size limit
    if MPI.COMM_WORLD.Get_rank() == 0:
        with bf.BlobFile(path, "rb") as f:
            data = f.read()
        num_chunks = len(data) // chunk_size
        if len(data) % chunk_size:
            num_chunks += 1
        MPI.COMM_WORLD.bcast(num_chunks)
        for i in range(0, len(data), chunk_size):
            MPI.COMM_WORLD.bcast(data[i : i + chunk_size])
    else:
        num_chunks = MPI.COMM_WORLD.bcast(None)
        data = bytes()
        for _ in range(num_chunks):
            data += MPI.COMM_WORLD.bcast(None)

    return th.load(io.BytesIO(data), **kwargs)


def sync_params(params):
    """
    Synchronize a sequence of Tensors across ranks from rank 0.
    """
    for p in params:
        with th.no_grad():
            dist.broadcast(p, 0)


def _find_free_port():
    try:
        s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        s.bind(("", 0))
        s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        return s.getsockname()[1]
    finally:
        s.close()
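load_state_dict() above reads the checkpoint bytes once on rank 0 and broadcasts them in 2**30-byte chunks (mpi4py pickles each bcast payload, and MPI messages have a size limit); every other rank reassembles the same bytes before torch.load. A usage sketch, assuming a `model` object already exists on each rank:

    from guided_diffusion import dist_util

    dist_util.setup_dist()  # one process per GPU; ranks come from MPI
    state = dist_util.load_state_dict("checkpoints/checkpoint.pt", map_location="cpu")
    model.load_state_dict(state)          # `model` is assumed to exist
    model.to(dist_util.dev())
    dist_util.sync_params(model.parameters())  # broadcast rank 0's weights to all ranks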
guided-diffusion/guided_diffusion/fp16_util.py
ADDED
@@ -0,0 +1,237 @@
"""
Helpers to train with 16-bit precision.
"""

import numpy as np
import torch as th
import torch.nn as nn
from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors

from . import logger

INITIAL_LOG_LOSS_SCALE = 20.0


def convert_module_to_f16(l):
    """
    Convert primitive modules to float16.
    """
    if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Conv3d)):
        l.weight.data = l.weight.data.half()
        if l.bias is not None:
            l.bias.data = l.bias.data.half()


def convert_module_to_f32(l):
    """
    Convert primitive modules to float32, undoing convert_module_to_f16().
    """
    if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Conv3d)):
        l.weight.data = l.weight.data.float()
        if l.bias is not None:
            l.bias.data = l.bias.data.float()


def make_master_params(param_groups_and_shapes):
    """
    Copy model parameters into a (differently-shaped) list of full-precision
    parameters.
    """
    master_params = []
    for param_group, shape in param_groups_and_shapes:
        master_param = nn.Parameter(
            _flatten_dense_tensors(
                [param.detach().float() for (_, param) in param_group]
            ).view(shape)
        )
        master_param.requires_grad = True
        master_params.append(master_param)
    return master_params


def model_grads_to_master_grads(param_groups_and_shapes, master_params):
    """
    Copy the gradients from the model parameters into the master parameters
    from make_master_params().
    """
    for master_param, (param_group, shape) in zip(
        master_params, param_groups_and_shapes
    ):
        master_param.grad = _flatten_dense_tensors(
            [param_grad_or_zeros(param) for (_, param) in param_group]
        ).view(shape)


def master_params_to_model_params(param_groups_and_shapes, master_params):
    """
    Copy the master parameter data back into the model parameters.
    """
    # Without copying to a list, if a generator is passed, this will
    # silently not copy any parameters.
    for master_param, (param_group, _) in zip(master_params, param_groups_and_shapes):
        for (_, param), unflat_master_param in zip(
            param_group, unflatten_master_params(param_group, master_param.view(-1))
        ):
            param.detach().copy_(unflat_master_param)


def unflatten_master_params(param_group, master_param):
    return _unflatten_dense_tensors(master_param, [param for (_, param) in param_group])


def get_param_groups_and_shapes(named_model_params):
    named_model_params = list(named_model_params)
    scalar_vector_named_params = (
        [(n, p) for (n, p) in named_model_params if p.ndim <= 1],
        (-1),
    )
    matrix_named_params = (
        [(n, p) for (n, p) in named_model_params if p.ndim > 1],
        (1, -1),
    )
    return [scalar_vector_named_params, matrix_named_params]


def master_params_to_state_dict(
    model, param_groups_and_shapes, master_params, use_fp16
):
    if use_fp16:
        state_dict = model.state_dict()
        for master_param, (param_group, _) in zip(
            master_params, param_groups_and_shapes
        ):
            for (name, _), unflat_master_param in zip(
                param_group, unflatten_master_params(param_group, master_param.view(-1))
            ):
                assert name in state_dict
                state_dict[name] = unflat_master_param
    else:
        state_dict = model.state_dict()
        for i, (name, _value) in enumerate(model.named_parameters()):
            assert name in state_dict
            state_dict[name] = master_params[i]
    return state_dict


def state_dict_to_master_params(model, state_dict, use_fp16):
    if use_fp16:
        named_model_params = [
            (name, state_dict[name]) for name, _ in model.named_parameters()
        ]
        param_groups_and_shapes = get_param_groups_and_shapes(named_model_params)
        master_params = make_master_params(param_groups_and_shapes)
    else:
        master_params = [state_dict[name] for name, _ in model.named_parameters()]
    return master_params


def zero_master_grads(master_params):
    for param in master_params:
        param.grad = None


def zero_grad(model_params):
    for param in model_params:
        # Taken from https://pytorch.org/docs/stable/_modules/torch/optim/optimizer.html#Optimizer.add_param_group
        if param.grad is not None:
            param.grad.detach_()
            param.grad.zero_()


def param_grad_or_zeros(param):
    if param.grad is not None:
        return param.grad.data.detach()
    else:
        return th.zeros_like(param)


class MixedPrecisionTrainer:
    def __init__(
        self,
        *,
        model,
        use_fp16=False,
        fp16_scale_growth=1e-3,
        initial_lg_loss_scale=INITIAL_LOG_LOSS_SCALE,
    ):
        self.model = model
        self.use_fp16 = use_fp16
        self.fp16_scale_growth = fp16_scale_growth

        self.model_params = list(self.model.parameters())
        self.master_params = self.model_params
        self.param_groups_and_shapes = None
        self.lg_loss_scale = initial_lg_loss_scale

        if self.use_fp16:
            self.param_groups_and_shapes = get_param_groups_and_shapes(
                self.model.named_parameters()
            )
            self.master_params = make_master_params(self.param_groups_and_shapes)
            self.model.convert_to_fp16()

    def zero_grad(self):
        zero_grad(self.model_params)

    def backward(self, loss: th.Tensor):
        if self.use_fp16:
            loss_scale = 2 ** self.lg_loss_scale
            (loss * loss_scale).backward()
        else:
            loss.backward()

    def optimize(self, opt: th.optim.Optimizer):
        if self.use_fp16:
            return self._optimize_fp16(opt)
        else:
            return self._optimize_normal(opt)

    def _optimize_fp16(self, opt: th.optim.Optimizer):
        logger.logkv_mean("lg_loss_scale", self.lg_loss_scale)
        model_grads_to_master_grads(self.param_groups_and_shapes, self.master_params)
        grad_norm, param_norm = self._compute_norms(grad_scale=2 ** self.lg_loss_scale)
        if check_overflow(grad_norm):
            self.lg_loss_scale -= 1
            logger.log(f"Found NaN, decreased lg_loss_scale to {self.lg_loss_scale}")
            zero_master_grads(self.master_params)
            return False

        logger.logkv_mean("grad_norm", grad_norm)
        logger.logkv_mean("param_norm", param_norm)

        for p in self.master_params:
            p.grad.mul_(1.0 / (2 ** self.lg_loss_scale))
        opt.step()
        zero_master_grads(self.master_params)
        master_params_to_model_params(self.param_groups_and_shapes, self.master_params)
        self.lg_loss_scale += self.fp16_scale_growth
        return True

    def _optimize_normal(self, opt: th.optim.Optimizer):
        grad_norm, param_norm = self._compute_norms()
        logger.logkv_mean("grad_norm", grad_norm)
        logger.logkv_mean("param_norm", param_norm)
        opt.step()
        return True

    def _compute_norms(self, grad_scale=1.0):
        grad_norm = 0.0
        param_norm = 0.0
        for p in self.master_params:
            with th.no_grad():
                param_norm += th.norm(p, p=2, dtype=th.float32).item() ** 2
                if p.grad is not None:
                    grad_norm += th.norm(p.grad, p=2, dtype=th.float32).item() ** 2
        return np.sqrt(grad_norm) / grad_scale, np.sqrt(param_norm)

    def master_params_to_state_dict(self, master_params):
        return master_params_to_state_dict(
            self.model, self.param_groups_and_shapes, master_params, self.use_fp16
        )

    def state_dict_to_master_params(self, state_dict):
        return state_dict_to_master_params(self.model, state_dict, self.use_fp16)


def check_overflow(value):
    return (value == float("inf")) or (value == -float("inf")) or (value != value)
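MixedPrecisionTrainer implements dynamic loss scaling: the model holds fp16 weights, the loss is multiplied by 2**lg_loss_scale before backward(), gradients are unscaled into flattened fp32 master parameters, and the scale drops by 1 whenever an overflowed gradient norm is seen while growing by fp16_scale_growth after every clean step. A minimal training-step sketch; `model` must expose convert_to_fp16() (as the UNet in this repo does), and the optimizer choice is illustrative:

    import torch as th
    from guided_diffusion.fp16_util import MixedPrecisionTrainer

    trainer = MixedPrecisionTrainer(model=model, use_fp16=True)  # `model` assumed
    opt = th.optim.AdamW(trainer.master_params, lr=1e-4)

    def train_step(loss):
        trainer.zero_grad()
        trainer.backward(loss)        # scales the loss by 2**lg_loss_scale
        return trainer.optimize(opt)  # False means the step was skipped on overflow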
guided-diffusion/guided_diffusion/gaussian_diffusion.py
ADDED
@@ -0,0 +1,843 @@
"""
This code started out as a PyTorch port of Ho et al's diffusion models:
https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/diffusion_utils_2.py

Docstrings have been added, as well as DDIM sampling and a new collection of beta schedules.
"""

import enum
import math

import numpy as np
import torch as th
import os

from . import dist_util
from .nn import mean_flat
from .losses import normal_kl, discretized_gaussian_log_likelihood


def get_named_beta_schedule(schedule_name, num_diffusion_timesteps):
    """
    Get a pre-defined beta schedule for the given name.

    The beta schedule library consists of beta schedules which remain similar
    in the limit of num_diffusion_timesteps.
    Beta schedules may be added, but should not be removed or changed once
    they are committed to maintain backwards compatibility.
    """
    if schedule_name == "linear":
        # Linear schedule from Ho et al, extended to work for any number of
        # diffusion steps.
        scale = 1000 / num_diffusion_timesteps
        beta_start = scale * 0.0001
        beta_end = scale * 0.02
        return np.linspace(
            beta_start, beta_end, num_diffusion_timesteps, dtype=np.float64
        )
    elif schedule_name == "cosine":
        return betas_for_alpha_bar(
            num_diffusion_timesteps,
            lambda t: math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2,
        )
    else:
        raise NotImplementedError(f"unknown beta schedule: {schedule_name}")


def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999):
    """
    Create a beta schedule that discretizes the given alpha_t_bar function,
    which defines the cumulative product of (1-beta) over time from t = [0,1].

    :param num_diffusion_timesteps: the number of betas to produce.
    :param alpha_bar: a lambda that takes an argument t from 0 to 1 and
                      produces the cumulative product of (1-beta) up to that
                      part of the diffusion process.
    :param max_beta: the maximum beta to use; use values lower than 1 to
                     prevent singularities.
    """
    betas = []
    for i in range(num_diffusion_timesteps):
        t1 = i / num_diffusion_timesteps
        t2 = (i + 1) / num_diffusion_timesteps
        betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
    return np.array(betas)
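def _cosine_schedule_example():
    # Illustrative only (never called): betas_for_alpha_bar inverts the
    # cumulative product. Since alpha_bar(t2) / alpha_bar(t1) = 1 - beta for
    # the step from t1 to t2, each beta is 1 - alpha_bar(t2) / alpha_bar(t1),
    # capped at max_beta. A quick numerical check of the cosine schedule above:
    betas = betas_for_alpha_bar(
        1000, lambda t: math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2
    )
    alphas_cumprod = np.cumprod(1.0 - betas)
    # Away from the cap, the realized product equals alpha_bar(t) / alpha_bar(0):
    target = math.cos((0.5 + 0.008) / 1.008 * math.pi / 2) ** 2
    realized = alphas_cumprod[499] * math.cos(0.008 / 1.008 * math.pi / 2) ** 2
    assert abs(realized - target) < 1e-6
    assert betas[-1] == 0.999  # the final steps hit the max_beta cap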
class ModelMeanType(enum.Enum):
    """
    Which type of output the model predicts.
    """

    PREVIOUS_X = enum.auto()  # the model predicts x_{t-1}
    START_X = enum.auto()  # the model predicts x_0
    EPSILON = enum.auto()  # the model predicts epsilon


class ModelVarType(enum.Enum):
    """
    What is used as the model's output variance.

    The LEARNED_RANGE option has been added to allow the model to predict
    values between FIXED_SMALL and FIXED_LARGE, making its job easier.
    """

    LEARNED = enum.auto()
    FIXED_SMALL = enum.auto()
    FIXED_LARGE = enum.auto()
    LEARNED_RANGE = enum.auto()


class LossType(enum.Enum):
    MSE = enum.auto()  # use raw MSE loss (and KL when learning variances)
    RESCALED_MSE = (
        enum.auto()
    )  # use raw MSE loss (with RESCALED_KL when learning variances)
    KL = enum.auto()  # use the variational lower-bound
    RESCALED_KL = enum.auto()  # like KL, but rescale to estimate the full VLB

    def is_vb(self):
        return self == LossType.KL or self == LossType.RESCALED_KL


class GaussianDiffusion:
    """
    Utilities for training and sampling diffusion models.

    Ported directly from here, and then adapted over time to further experimentation.
    https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/diffusion_utils_2.py#L42

    :param betas: a 1-D numpy array of betas for each diffusion timestep,
                  starting at T and going to 1.
    :param model_mean_type: a ModelMeanType determining what the model outputs.
    :param model_var_type: a ModelVarType determining how variance is output.
    :param loss_type: a LossType determining the loss function to use.
    :param rescale_timesteps: if True, pass floating point timesteps into the
                              model so that they are always scaled like in the
                              original paper (0 to 1000).
    :param loss_variation: if True, then use composite loss
    """

    def __init__(
        self,
        *,
        betas,
        model_mean_type,
        model_var_type,
        loss_type,
        rescale_timesteps=False,
        loss_variation=False,
    ):
        self.model_mean_type = model_mean_type
        self.model_var_type = model_var_type
        self.loss_type = loss_type
        self.rescale_timesteps = rescale_timesteps
        self.loss_variation = loss_variation

        # Use float64 for accuracy.
        betas = np.array(betas, dtype=np.float64)
        self.betas = betas
        assert len(betas.shape) == 1, "betas must be 1-D"
        assert (betas > 0).all() and (betas <= 1).all()

        self.num_timesteps = int(betas.shape[0])

        alphas = 1.0 - betas
        self.alphas_cumprod = np.cumprod(alphas, axis=0)
        self.alphas_cumprod_prev = np.append(1.0, self.alphas_cumprod[:-1])
        self.alphas_cumprod_next = np.append(self.alphas_cumprod[1:], 0.0)
        assert self.alphas_cumprod_prev.shape == (self.num_timesteps,)

        # calculations for diffusion q(x_t | x_{t-1}) and others
        self.sqrt_alphas_cumprod = np.sqrt(self.alphas_cumprod)
        self.sqrt_one_minus_alphas_cumprod = np.sqrt(1.0 - self.alphas_cumprod)
        self.log_one_minus_alphas_cumprod = np.log(1.0 - self.alphas_cumprod)
        self.sqrt_recip_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod)
        self.sqrt_recipm1_alphas_cumprod = np.sqrt(1.0 / self.alphas_cumprod - 1)

        # calculations for posterior q(x_{t-1} | x_t, x_0)
        self.posterior_variance = (
            betas * (1.0 - self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
        )
        # log calculation clipped because the posterior variance is 0 at the
        # beginning of the diffusion chain.
        self.posterior_log_variance_clipped = np.log(
            np.append(self.posterior_variance[1], self.posterior_variance[1:])
        )
        self.posterior_mean_coef1 = (
            betas * np.sqrt(self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
        )
        self.posterior_mean_coef2 = (
            (1.0 - self.alphas_cumprod_prev)
            * np.sqrt(alphas)
            / (1.0 - self.alphas_cumprod)
        )

    def q_mean_variance(self, x_start, t):
        """
        Get the distribution q(x_t | x_0).

        :param x_start: the [N x C x ...] tensor of noiseless inputs.
        :param t: the number of diffusion steps (minus 1). Here, 0 means one step.
        :return: A tuple (mean, variance, log_variance), all of x_start's shape.
        """
        mean = (
            _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
        )
        variance = _extract_into_tensor(1.0 - self.alphas_cumprod, t, x_start.shape)
        log_variance = _extract_into_tensor(
            self.log_one_minus_alphas_cumprod, t, x_start.shape
        )
        return mean, variance, log_variance

    def q_sample(self, x_start, t, noise=None):
        """
        Diffuse the data for a given number of diffusion steps.

        In other words, sample from q(x_t | x_0).

        :param x_start: the initial data batch.
        :param t: the number of diffusion steps (minus 1). Here, 0 means one step.
        :param noise: if specified, the split-out normal noise.
        :return: A noisy version of x_start.
        """
        if noise is None:
            noise = th.randn_like(x_start)
        assert noise.shape == x_start.shape
        return (
            _extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
            + _extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape)
            * noise
        )
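    # A hedged self-check of the closed form implemented above (illustrative
    # only; `diffusion` stands for any constructed GaussianDiffusion with more
    # than 500 timesteps):
    #
    #     x_start = th.randn(4, 3, 128, 128)
    #     t = th.full((4,), 500, dtype=th.long)
    #     noise = th.randn_like(x_start)
    #     x_t = diffusion.q_sample(x_start, t, noise=noise)
    #     c1 = float(diffusion.sqrt_alphas_cumprod[500])
    #     c2 = float(diffusion.sqrt_one_minus_alphas_cumprod[500])
    #     assert th.allclose(x_t, c1 * x_start + c2 * noise, atol=1e-6)
    #
    # i.e. q_sample(x_0, t) = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * noise.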
    def q_posterior_mean_variance(self, x_start, x_t, t):
        """
        Compute the mean and variance of the diffusion posterior:

            q(x_{t-1} | x_t, x_0)

        """
        assert x_start.shape == x_t.shape
        posterior_mean = (
            _extract_into_tensor(self.posterior_mean_coef1, t, x_t.shape) * x_start
            + _extract_into_tensor(self.posterior_mean_coef2, t, x_t.shape) * x_t
        )
        posterior_variance = _extract_into_tensor(self.posterior_variance, t, x_t.shape)
        posterior_log_variance_clipped = _extract_into_tensor(
            self.posterior_log_variance_clipped, t, x_t.shape
        )
        assert (
            posterior_mean.shape[0]
            == posterior_variance.shape[0]
            == posterior_log_variance_clipped.shape[0]
            == x_start.shape[0]
        )
        return posterior_mean, posterior_variance, posterior_log_variance_clipped

    def p_mean_variance(
        self, model, x, t, clip_denoised=True, denoised_fn=None, model_kwargs=None
    ):
        """
        Apply the model to get p(x_{t-1} | x_t), as well as a prediction of
        the initial x, x_0.

        :param model: the model, which takes a signal and a batch of timesteps
                      as input.
        :param x: the [N x C x ...] tensor at time t.
        :param t: a 1-D Tensor of timesteps.
        :param clip_denoised: if True, clip the denoised signal into [-1, 1].
        :param denoised_fn: if not None, a function which applies to the
            x_start prediction before it is used to sample. Applies before
            clip_denoised.
        :param model_kwargs: if not None, a dict of extra keyword arguments to
            pass to the model. This can be used for conditioning.
        :return: a dict with the following keys:
                 - 'mean': the model mean output.
                 - 'variance': the model variance output.
                 - 'log_variance': the log of 'variance'.
                 - 'pred_xstart': the prediction for x_0.
        """
        if model_kwargs is None:
            model_kwargs = {}

        B, C = x.shape[:2]
        assert t.shape == (B,)
        model_output = model(x, self._scale_timesteps(t), **model_kwargs)

        if self.model_var_type in [ModelVarType.LEARNED, ModelVarType.LEARNED_RANGE]:
            assert model_output.shape == (B, C * 2, *x.shape[2:])
            model_output, model_var_values = th.split(model_output, C, dim=1)
            if self.model_var_type == ModelVarType.LEARNED:
                model_log_variance = model_var_values
                model_variance = th.exp(model_log_variance)
            else:
                min_log = _extract_into_tensor(
                    self.posterior_log_variance_clipped, t, x.shape
                )
                max_log = _extract_into_tensor(np.log(self.betas), t, x.shape)
                # The model_var_values is [-1, 1] for [min_var, max_var].
                frac = (model_var_values + 1) / 2
                model_log_variance = frac * max_log + (1 - frac) * min_log
                model_variance = th.exp(model_log_variance)
        else:
            model_variance, model_log_variance = {
                # for fixedlarge, we set the initial (log-)variance like so
                # to get a better decoder log likelihood.
                ModelVarType.FIXED_LARGE: (
                    np.append(self.posterior_variance[1], self.betas[1:]),
                    np.log(np.append(self.posterior_variance[1], self.betas[1:])),
                ),
                ModelVarType.FIXED_SMALL: (
                    self.posterior_variance,
                    self.posterior_log_variance_clipped,
                ),
            }[self.model_var_type]
            model_variance = _extract_into_tensor(model_variance, t, x.shape)
            model_log_variance = _extract_into_tensor(model_log_variance, t, x.shape)

        def process_xstart(x):
            if denoised_fn is not None:
                x = denoised_fn(x)
            if clip_denoised:
                return x.clamp(-1, 1)
            return x

        if self.model_mean_type == ModelMeanType.PREVIOUS_X:
            pred_xstart = process_xstart(
                self._predict_xstart_from_xprev(x_t=x, t=t, xprev=model_output)
            )
            model_mean = model_output
        elif self.model_mean_type in [ModelMeanType.START_X, ModelMeanType.EPSILON]:
            if self.model_mean_type == ModelMeanType.START_X:
                pred_xstart = process_xstart(model_output)
            else:
                pred_xstart = process_xstart(
                    self._predict_xstart_from_eps(x_t=x, t=t, eps=model_output)
                )
            model_mean, _, _ = self.q_posterior_mean_variance(
                x_start=pred_xstart, x_t=x, t=t
            )
        else:
            raise NotImplementedError(self.model_mean_type)

        assert (
            model_mean.shape == model_log_variance.shape == pred_xstart.shape == x.shape
        )
        return {
            "mean": model_mean,
            "variance": model_variance,
            "log_variance": model_log_variance,
            "pred_xstart": pred_xstart,
        }

    def _predict_xstart_from_eps(self, x_t, t, eps):
        assert x_t.shape == eps.shape
        return (
            _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t
            - _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) * eps
        )

    def _predict_xstart_from_xprev(self, x_t, t, xprev):
        assert x_t.shape == xprev.shape
        return (  # (xprev - coef2*x_t) / coef1
            _extract_into_tensor(1.0 / self.posterior_mean_coef1, t, x_t.shape) * xprev
            - _extract_into_tensor(
                self.posterior_mean_coef2 / self.posterior_mean_coef1, t, x_t.shape
            )
            * x_t
        )

    def _predict_eps_from_xstart(self, x_t, t, pred_xstart):
        return (
            _extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t
            - pred_xstart
        ) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape)

    def _scale_timesteps(self, t):
        if self.rescale_timesteps:
            return t.float() * (1000.0 / self.num_timesteps)
        return t

    def condition_mean(self, cond_fn, p_mean_var, x, t, model_kwargs=None):
        """
        Compute the mean for the previous step, given a function cond_fn that
        computes the gradient of a conditional log probability with respect to
        x. In particular, cond_fn computes grad(log(p(y|x))), and we want to
        condition on y.

        This uses the conditioning strategy from Sohl-Dickstein et al. (2015).
        """
        gradient = cond_fn(x, self._scale_timesteps(t), **model_kwargs)
        new_mean = (
            p_mean_var["mean"].float() + p_mean_var["variance"] * gradient.float()
        )
        return new_mean

    def condition_score(self, cond_fn, p_mean_var, x, t, model_kwargs=None):
        """
        Compute what the p_mean_variance output would have been, should the
        model's score function be conditioned by cond_fn.

        See condition_mean() for details on cond_fn.

        Unlike condition_mean(), this instead uses the conditioning strategy
        from Song et al (2020).
        """
        alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape)

        eps = self._predict_eps_from_xstart(x, t, p_mean_var["pred_xstart"])
        eps = eps - (1 - alpha_bar).sqrt() * cond_fn(
            x, self._scale_timesteps(t), **model_kwargs
        )

        out = p_mean_var.copy()
        out["pred_xstart"] = self._predict_xstart_from_eps(x, t, eps)
        out["mean"], _, _ = self.q_posterior_mean_variance(
            x_start=out["pred_xstart"], x_t=x, t=t
        )
        return out

    def p_sample(
        self,
        model,
        x,
        t,
        clip_denoised=True,
        denoised_fn=None,
        cond_fn=None,
        model_kwargs=None,
    ):
        """
        Sample x_{t-1} from the model at the given timestep.

        :param model: the model to sample from.
        :param x: the current tensor at x_{t-1}.
        :param t: the value of t, starting at 0 for the first diffusion step.
        :param clip_denoised: if True, clip the x_start prediction to [-1, 1].
        :param denoised_fn: if not None, a function which applies to the
            x_start prediction before it is used to sample.
        :param cond_fn: if not None, this is a gradient function that acts
                        similarly to the model.
        :param model_kwargs: if not None, a dict of extra keyword arguments to
            pass to the model. This can be used for conditioning.
        :return: a dict containing the following keys:
                 - 'sample': a random sample from the model.
                 - 'pred_xstart': a prediction of x_0.
        """
        out = self.p_mean_variance(
            model,
            x,
            t,
            clip_denoised=clip_denoised,
            denoised_fn=denoised_fn,
            model_kwargs=model_kwargs,
        )
        noise = th.randn_like(x)
        nonzero_mask = (
            (t != 0).float().view(-1, *([1] * (len(x.shape) - 1)))
        )  # no noise when t == 0
        if cond_fn is not None:
            out["mean"] = self.condition_mean(
cond_fn, out, x, t, model_kwargs=model_kwargs
|
| 442 |
+
)
|
| 443 |
+
sample = out["mean"] + nonzero_mask * th.exp(0.5 * out["log_variance"]) * noise
|
| 444 |
+
return {"sample": sample, "pred_xstart": out["pred_xstart"]}
|
| 445 |
+
|
| 446 |
+
def p_sample_loop(
|
| 447 |
+
self,
|
| 448 |
+
model,
|
| 449 |
+
shape,
|
| 450 |
+
noise=None,
|
| 451 |
+
clip_denoised=True,
|
| 452 |
+
denoised_fn=None,
|
| 453 |
+
cond_fn=None,
|
| 454 |
+
model_kwargs=None,
|
| 455 |
+
device=None,
|
| 456 |
+
progress=False,
|
| 457 |
+
):
|
| 458 |
+
"""
|
| 459 |
+
Generate samples from the model.
|
| 460 |
+
|
| 461 |
+
:param model: the model module.
|
| 462 |
+
:param shape: the shape of the samples, (N, C, H, W).
|
| 463 |
+
:param noise: if specified, the noise from the encoder to sample.
|
| 464 |
+
Should be of the same shape as `shape`.
|
| 465 |
+
:param clip_denoised: if True, clip x_start predictions to [-1, 1].
|
| 466 |
+
:param denoised_fn: if not None, a function which applies to the
|
| 467 |
+
x_start prediction before it is used to sample.
|
| 468 |
+
:param cond_fn: if not None, this is a gradient function that acts
|
| 469 |
+
similarly to the model.
|
| 470 |
+
:param model_kwargs: if not None, a dict of extra keyword arguments to
|
| 471 |
+
pass to the model. This can be used for conditioning.
|
| 472 |
+
:param device: if specified, the device to create the samples on.
|
| 473 |
+
If not specified, use a model parameter's device.
|
| 474 |
+
:param progress: if True, show a tqdm progress bar.
|
| 475 |
+
:return: a non-differentiable batch of samples.
|
| 476 |
+
"""
|
| 477 |
+
final = None
|
| 478 |
+
for sample in self.p_sample_loop_progressive(
|
| 479 |
+
model,
|
| 480 |
+
shape,
|
| 481 |
+
noise=noise,
|
| 482 |
+
clip_denoised=clip_denoised,
|
| 483 |
+
denoised_fn=denoised_fn,
|
| 484 |
+
cond_fn=cond_fn,
|
| 485 |
+
model_kwargs=model_kwargs,
|
| 486 |
+
device=device,
|
| 487 |
+
progress=progress,
|
| 488 |
+
):
|
| 489 |
+
final = sample
|
| 490 |
+
return final["sample"]
|
| 491 |
+
|
| 492 |
+
def p_sample_loop_progressive(
|
| 493 |
+
self,
|
| 494 |
+
model,
|
| 495 |
+
shape,
|
| 496 |
+
noise=None,
|
| 497 |
+
clip_denoised=True,
|
| 498 |
+
denoised_fn=None,
|
| 499 |
+
cond_fn=None,
|
| 500 |
+
model_kwargs=None,
|
| 501 |
+
device=None,
|
| 502 |
+
progress=False,
|
| 503 |
+
):
|
| 504 |
+
"""
|
| 505 |
+
Generate samples from the model and yield intermediate samples from
|
| 506 |
+
each timestep of diffusion.
|
| 507 |
+
|
| 508 |
+
Arguments are the same as p_sample_loop().
|
| 509 |
+
Returns a generator over dicts, where each dict is the return value of
|
| 510 |
+
p_sample().
|
| 511 |
+
"""
|
| 512 |
+
if device is None:
|
| 513 |
+
device = next(model.parameters()).device
|
| 514 |
+
assert isinstance(shape, (tuple, list))
|
| 515 |
+
if noise is not None:
|
| 516 |
+
img = noise
|
| 517 |
+
else:
|
| 518 |
+
img = th.randn(*shape, device=device)
|
| 519 |
+
indices = list(range(self.num_timesteps))[::-1]
|
| 520 |
+
|
| 521 |
+
if progress:
|
| 522 |
+
# Lazy import so that we don't depend on tqdm.
|
| 523 |
+
from tqdm.auto import tqdm
|
| 524 |
+
|
| 525 |
+
indices = tqdm(indices)
|
| 526 |
+
|
| 527 |
+
for i in indices:
|
| 528 |
+
t = th.tensor([i] * shape[0], device=device)
|
| 529 |
+
with th.no_grad():
|
| 530 |
+
out = self.p_sample(
|
| 531 |
+
model,
|
| 532 |
+
img,
|
| 533 |
+
t,
|
| 534 |
+
clip_denoised=clip_denoised,
|
| 535 |
+
denoised_fn=denoised_fn,
|
| 536 |
+
cond_fn=cond_fn,
|
| 537 |
+
model_kwargs=model_kwargs,
|
| 538 |
+
)
|
| 539 |
+
yield out
|
| 540 |
+
img = out["sample"]
|
| 541 |
+
|
| 542 |
+
def ddim_sample(
|
| 543 |
+
self,
|
| 544 |
+
model,
|
| 545 |
+
x,
|
| 546 |
+
t,
|
| 547 |
+
clip_denoised=True,
|
| 548 |
+
denoised_fn=None,
|
| 549 |
+
cond_fn=None,
|
| 550 |
+
model_kwargs=None,
|
| 551 |
+
eta=0.0,
|
| 552 |
+
):
|
| 553 |
+
"""
|
| 554 |
+
Sample x_{t-1} from the model using DDIM.
|
| 555 |
+
|
| 556 |
+
Same usage as p_sample().
|
| 557 |
+
"""
|
| 558 |
+
out = self.p_mean_variance(
|
| 559 |
+
model,
|
| 560 |
+
x,
|
| 561 |
+
t,
|
| 562 |
+
clip_denoised=clip_denoised,
|
| 563 |
+
denoised_fn=denoised_fn,
|
| 564 |
+
model_kwargs=model_kwargs,
|
| 565 |
+
)
|
| 566 |
+
if cond_fn is not None:
|
| 567 |
+
out = self.condition_score(cond_fn, out, x, t, model_kwargs=model_kwargs)
|
| 568 |
+
|
| 569 |
+
# Usually our model outputs epsilon, but we re-derive it
|
| 570 |
+
# in case we used x_start or x_prev prediction.
|
| 571 |
+
eps = self._predict_eps_from_xstart(x, t, out["pred_xstart"])
|
| 572 |
+
|
| 573 |
+
alpha_bar = _extract_into_tensor(self.alphas_cumprod, t, x.shape)
|
| 574 |
+
alpha_bar_prev = _extract_into_tensor(self.alphas_cumprod_prev, t, x.shape)
|
| 575 |
+
sigma = (
|
| 576 |
+
eta
|
| 577 |
+
* th.sqrt((1 - alpha_bar_prev) / (1 - alpha_bar))
|
| 578 |
+
* th.sqrt(1 - alpha_bar / alpha_bar_prev)
|
| 579 |
+
)
|
| 580 |
+
# Equation 12.
|
| 581 |
+
noise = th.randn_like(x)
|
| 582 |
+
mean_pred = (
|
| 583 |
+
out["pred_xstart"] * th.sqrt(alpha_bar_prev)
|
| 584 |
+
+ th.sqrt(1 - alpha_bar_prev - sigma ** 2) * eps
|
| 585 |
+
)
|
| 586 |
+
nonzero_mask = (
|
| 587 |
+
(t != 0).float().view(-1, *([1] * (len(x.shape) - 1)))
|
| 588 |
+
) # no noise when t == 0
|
| 589 |
+
sample = mean_pred + nonzero_mask * sigma * noise
|
| 590 |
+
return {"sample": sample, "pred_xstart": out["pred_xstart"]}
|
| 591 |
+
|
| 592 |
+
def ddim_reverse_sample(
|
| 593 |
+
self,
|
| 594 |
+
model,
|
| 595 |
+
x,
|
| 596 |
+
t,
|
| 597 |
+
clip_denoised=True,
|
| 598 |
+
denoised_fn=None,
|
| 599 |
+
model_kwargs=None,
|
| 600 |
+
eta=0.0,
|
| 601 |
+
):
|
| 602 |
+
"""
|
| 603 |
+
Sample x_{t+1} from the model using DDIM reverse ODE.
|
| 604 |
+
"""
|
| 605 |
+
assert eta == 0.0, "Reverse ODE only for deterministic path"
|
| 606 |
+
out = self.p_mean_variance(
|
| 607 |
+
model,
|
| 608 |
+
x,
|
| 609 |
+
t,
|
| 610 |
+
clip_denoised=clip_denoised,
|
| 611 |
+
denoised_fn=denoised_fn,
|
| 612 |
+
model_kwargs=model_kwargs,
|
| 613 |
+
)
|
| 614 |
+
# Usually our model outputs epsilon, but we re-derive it
|
| 615 |
+
# in case we used x_start or x_prev prediction.
|
| 616 |
+
eps = (
|
| 617 |
+
_extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x.shape) * x
|
| 618 |
+
- out["pred_xstart"]
|
| 619 |
+
) / _extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x.shape)
|
| 620 |
+
alpha_bar_next = _extract_into_tensor(self.alphas_cumprod_next, t, x.shape)
|
| 621 |
+
|
| 622 |
+
# Equation 12. reversed
|
| 623 |
+
mean_pred = (
|
| 624 |
+
out["pred_xstart"] * th.sqrt(alpha_bar_next)
|
| 625 |
+
+ th.sqrt(1 - alpha_bar_next) * eps
|
| 626 |
+
)
|
| 627 |
+
|
| 628 |
+
return {"sample": mean_pred, "pred_xstart": out["pred_xstart"]}
|
| 629 |
+
|
| 630 |
+
def ddim_sample_loop(
|
| 631 |
+
self,
|
| 632 |
+
model,
|
| 633 |
+
shape,
|
| 634 |
+
noise=None,
|
| 635 |
+
clip_denoised=True,
|
| 636 |
+
denoised_fn=None,
|
| 637 |
+
cond_fn=None,
|
| 638 |
+
model_kwargs=None,
|
| 639 |
+
device=None,
|
| 640 |
+
progress=False,
|
| 641 |
+
eta=0.0,
|
| 642 |
+
):
|
| 643 |
+
"""
|
| 644 |
+
Generate samples from the model using DDIM.
|
| 645 |
+
|
| 646 |
+
Same usage as p_sample_loop().
|
| 647 |
+
"""
|
| 648 |
+
final = None
|
| 649 |
+
for sample in self.ddim_sample_loop_progressive(
|
| 650 |
+
model,
|
| 651 |
+
shape,
|
| 652 |
+
noise=noise,
|
| 653 |
+
clip_denoised=clip_denoised,
|
| 654 |
+
denoised_fn=denoised_fn,
|
| 655 |
+
cond_fn=cond_fn,
|
| 656 |
+
model_kwargs=model_kwargs,
|
| 657 |
+
device=device,
|
| 658 |
+
progress=progress,
|
| 659 |
+
eta=eta,
|
| 660 |
+
):
|
| 661 |
+
final = sample
|
| 662 |
+
return final["sample"]
|
| 663 |
+
|
| 664 |
+
def ddim_sample_loop_progressive(
|
| 665 |
+
self,
|
| 666 |
+
model,
|
| 667 |
+
shape,
|
| 668 |
+
noise=None,
|
| 669 |
+
clip_denoised=True,
|
| 670 |
+
denoised_fn=None,
|
| 671 |
+
cond_fn=None,
|
| 672 |
+
model_kwargs=None,
|
| 673 |
+
device=None,
|
| 674 |
+
progress=False,
|
| 675 |
+
eta=0.0,
|
| 676 |
+
):
|
| 677 |
+
"""
|
| 678 |
+
Use DDIM to sample from the model and yield intermediate samples from
|
| 679 |
+
each timestep of DDIM.
|
| 680 |
+
|
| 681 |
+
Same usage as p_sample_loop_progressive().
|
| 682 |
+
"""
|
| 683 |
+
if device is None:
|
| 684 |
+
device = next(model.parameters()).device
|
| 685 |
+
assert isinstance(shape, (tuple, list))
|
| 686 |
+
if noise is not None:
|
| 687 |
+
img = noise
|
| 688 |
+
else:
|
| 689 |
+
img = th.randn(*shape, device=device)
|
| 690 |
+
indices = list(range(self.num_timesteps))[::-1]
|
| 691 |
+
|
| 692 |
+
if progress:
|
| 693 |
+
# Lazy import so that we don't depend on tqdm.
|
| 694 |
+
from tqdm.auto import tqdm
|
| 695 |
+
|
| 696 |
+
indices = tqdm(indices)
|
| 697 |
+
|
| 698 |
+
for i in indices:
|
| 699 |
+
t = th.tensor([i] * shape[0], device=device)
|
| 700 |
+
with th.no_grad():
|
| 701 |
+
out = self.ddim_sample(
|
| 702 |
+
model,
|
| 703 |
+
img,
|
| 704 |
+
t,
|
| 705 |
+
clip_denoised=clip_denoised,
|
| 706 |
+
denoised_fn=denoised_fn,
|
| 707 |
+
cond_fn=cond_fn,
|
| 708 |
+
model_kwargs=model_kwargs,
|
| 709 |
+
eta=eta,
|
| 710 |
+
)
|
| 711 |
+
yield out
|
| 712 |
+
img = out["sample"]
|
| 713 |
+
|
| 714 |
+
def _vb_terms_bpd(
|
| 715 |
+
self, model, x_start, x_t, t, clip_denoised=True, model_kwargs=None
|
| 716 |
+
):
|
| 717 |
+
"""
|
| 718 |
+
Get a term for the variational lower-bound.
|
| 719 |
+
|
| 720 |
+
The resulting units are bits (rather than nats, as one might expect).
|
| 721 |
+
This allows for comparison to other papers.
|
| 722 |
+
|
| 723 |
+
:return: a dict with the following keys:
|
| 724 |
+
- 'output': a shape [N] tensor of NLLs or KLs.
|
| 725 |
+
- 'pred_xstart': the x_0 predictions.
|
| 726 |
+
"""
|
| 727 |
+
true_mean, _, true_log_variance_clipped = self.q_posterior_mean_variance(
|
| 728 |
+
x_start=x_start, x_t=x_t, t=t
|
| 729 |
+
)
|
| 730 |
+
out = self.p_mean_variance(
|
| 731 |
+
model, x_t, t, clip_denoised=clip_denoised, model_kwargs=model_kwargs
|
| 732 |
+
)
|
| 733 |
+
kl = normal_kl(
|
| 734 |
+
true_mean, true_log_variance_clipped, out["mean"], out["log_variance"]
|
| 735 |
+
)
|
| 736 |
+
if ("cond_img" in model_kwargs) and ("mask" in model_kwargs): #added by soumik
|
| 737 |
+
kl = kl*model_kwargs["mask"]
|
| 738 |
+
kl = mean_flat(kl) / np.log(2.0)
|
| 739 |
+
|
| 740 |
+
decoder_nll = -discretized_gaussian_log_likelihood(
|
| 741 |
+
x_start, means=out["mean"], log_scales=0.5 * out["log_variance"]
|
| 742 |
+
)
|
| 743 |
+
assert decoder_nll.shape == x_start.shape
|
| 744 |
+
if ("cond_img" in model_kwargs) and ("mask" in model_kwargs): #added by soumik
|
| 745 |
+
decoder_nll=decoder_nll*model_kwargs["mask"]
|
| 746 |
+
decoder_nll = mean_flat(decoder_nll) / np.log(2.0)
|
| 747 |
+
|
| 748 |
+
# At the first timestep return the decoder NLL,
|
| 749 |
+
# otherwise return KL(q(x_{t-1}|x_t,x_0) || p(x_{t-1}|x_t))
|
| 750 |
+
output = th.where((t == 0), decoder_nll, kl)
|
| 751 |
+
return {"output": output, "pred_xstart": out["pred_xstart"]}
|
| 752 |
+
|
| 753 |
+
|
| 754 |
+
def _prior_bpd(self, x_start):
|
| 755 |
+
"""
|
| 756 |
+
Get the prior KL term for the variational lower-bound, measured in
|
| 757 |
+
bits-per-dim.
|
| 758 |
+
|
| 759 |
+
This term can't be optimized, as it only depends on the encoder.
|
| 760 |
+
|
| 761 |
+
:param x_start: the [N x C x ...] tensor of inputs.
|
| 762 |
+
:return: a batch of [N] KL values (in bits), one per batch element.
|
| 763 |
+
"""
|
| 764 |
+
batch_size = x_start.shape[0]
|
| 765 |
+
t = th.tensor([self.num_timesteps - 1] * batch_size, device=x_start.device)
|
| 766 |
+
qt_mean, _, qt_log_variance = self.q_mean_variance(x_start, t)
|
| 767 |
+
kl_prior = normal_kl(
|
| 768 |
+
mean1=qt_mean, logvar1=qt_log_variance, mean2=0.0, logvar2=0.0
|
| 769 |
+
)
|
| 770 |
+
return mean_flat(kl_prior) / np.log(2.0)
|
| 771 |
+
|
| 772 |
+
def calc_bpd_loop(self, model, x_start, clip_denoised=True, model_kwargs=None):
|
| 773 |
+
"""
|
| 774 |
+
Compute the entire variational lower-bound, measured in bits-per-dim,
|
| 775 |
+
as well as other related quantities.
|
| 776 |
+
|
| 777 |
+
:param model: the model to evaluate loss on.
|
| 778 |
+
:param x_start: the [N x C x ...] tensor of inputs.
|
| 779 |
+
:param clip_denoised: if True, clip denoised samples.
|
| 780 |
+
:param model_kwargs: if not None, a dict of extra keyword arguments to
|
| 781 |
+
pass to the model. This can be used for conditioning.
|
| 782 |
+
|
| 783 |
+
:return: a dict containing the following keys:
|
| 784 |
+
- total_bpd: the total variational lower-bound, per batch element.
|
| 785 |
+
- prior_bpd: the prior term in the lower-bound.
|
| 786 |
+
- vb: an [N x T] tensor of terms in the lower-bound.
|
| 787 |
+
- xstart_mse: an [N x T] tensor of x_0 MSEs for each timestep.
|
| 788 |
+
- mse: an [N x T] tensor of epsilon MSEs for each timestep.
|
| 789 |
+
"""
|
| 790 |
+
device = x_start.device
|
| 791 |
+
batch_size = x_start.shape[0]
|
| 792 |
+
|
| 793 |
+
vb = []
|
| 794 |
+
xstart_mse = []
|
| 795 |
+
mse = []
|
| 796 |
+
for t in list(range(self.num_timesteps))[::-1]:
|
| 797 |
+
t_batch = th.tensor([t] * batch_size, device=device)
|
| 798 |
+
noise = th.randn_like(x_start)
|
| 799 |
+
x_t = self.q_sample(x_start=x_start, t=t_batch, noise=noise)
|
| 800 |
+
# Calculate VLB term at the current timestep
|
| 801 |
+
with th.no_grad():
|
| 802 |
+
out = self._vb_terms_bpd(
|
| 803 |
+
model,
|
| 804 |
+
x_start=x_start,
|
| 805 |
+
x_t=x_t,
|
| 806 |
+
t=t_batch,
|
| 807 |
+
clip_denoised=clip_denoised,
|
| 808 |
+
model_kwargs=model_kwargs,
|
| 809 |
+
)
|
| 810 |
+
vb.append(out["output"])
|
| 811 |
+
xstart_mse.append(mean_flat((out["pred_xstart"] - x_start) ** 2))
|
| 812 |
+
eps = self._predict_eps_from_xstart(x_t, t_batch, out["pred_xstart"])
|
| 813 |
+
mse.append(mean_flat((eps - noise) ** 2))
|
| 814 |
+
|
| 815 |
+
vb = th.stack(vb, dim=1)
|
| 816 |
+
xstart_mse = th.stack(xstart_mse, dim=1)
|
| 817 |
+
mse = th.stack(mse, dim=1)
|
| 818 |
+
|
| 819 |
+
prior_bpd = self._prior_bpd(x_start)
|
| 820 |
+
total_bpd = vb.sum(dim=1) + prior_bpd
|
| 821 |
+
return {
|
| 822 |
+
"total_bpd": total_bpd,
|
| 823 |
+
"prior_bpd": prior_bpd,
|
| 824 |
+
"vb": vb,
|
| 825 |
+
"xstart_mse": xstart_mse,
|
| 826 |
+
"mse": mse,
|
| 827 |
+
}
|
| 828 |
+
|
| 829 |
+
|
| 830 |
+
def _extract_into_tensor(arr, timesteps, broadcast_shape):
|
| 831 |
+
"""
|
| 832 |
+
Extract values from a 1-D numpy array for a batch of indices.
|
| 833 |
+
|
| 834 |
+
:param arr: the 1-D numpy array.
|
| 835 |
+
:param timesteps: a tensor of indices into the array to extract.
|
| 836 |
+
:param broadcast_shape: a larger shape of K dimensions with the batch
|
| 837 |
+
dimension equal to the length of timesteps.
|
| 838 |
+
:return: a tensor of shape [batch_size, 1, ...] where the shape has K dims.
|
| 839 |
+
"""
|
| 840 |
+
res = th.from_numpy(arr).to(device=timesteps.device)[timesteps].float()
|
| 841 |
+
while len(res.shape) < len(broadcast_shape):
|
| 842 |
+
res = res[..., None]
|
| 843 |
+
return res.expand(broadcast_shape)
|
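
Editor's note: a minimal sketch (not part of the uploaded file) of how the two sampling loops above are typically driven. The helper name, batch/image sizes, and the empty model_kwargs are illustrative assumptions; this repo's generate.py and generate_dist.py presumably supply the real talking-face conditioning inputs through model_kwargs.

import torch as th

def sample_batch(model, diffusion, use_ddim=False, batch_size=4, image_size=128):
    # shape follows the (N, C, H, W) convention documented in p_sample_loop.
    shape = (batch_size, 3, image_size, image_size)
    sample_fn = diffusion.ddim_sample_loop if use_ddim else diffusion.p_sample_loop
    # p_sample_loop draws x_T ~ N(0, I) internally (noise=None) and runs
    # p_sample() from t = T-1 down to 0; ddim_sample_loop does the same with
    # the deterministic DDIM update (eta=0.0 by default).
    return sample_fn(model, shape, clip_denoised=True, model_kwargs={})
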
guided-diffusion/guided_diffusion/image_datasets.py
ADDED
@@ -0,0 +1,167 @@
import math
import random

from PIL import Image
import blobfile as bf
from mpi4py import MPI
import numpy as np
from torch.utils.data import DataLoader, Dataset


def load_data(
    *,
    data_dir,
    batch_size,
    image_size,
    class_cond=False,
    deterministic=False,
    random_crop=False,
    random_flip=True,
):
    """
    For a dataset, create a generator over (images, kwargs) pairs.

    Each images is an NCHW float tensor, and the kwargs dict contains zero or
    more keys, each of which map to a batched Tensor of their own.
    The kwargs dict can be used for class labels, in which case the key is "y"
    and the values are integer tensors of class labels.

    :param data_dir: a dataset directory.
    :param batch_size: the batch size of each returned pair.
    :param image_size: the size to which images are resized.
    :param class_cond: if True, include a "y" key in returned dicts for class
                       label. If classes are not available and this is true, an
                       exception will be raised.
    :param deterministic: if True, yield results in a deterministic order.
    :param random_crop: if True, randomly crop the images for augmentation.
    :param random_flip: if True, randomly flip the images for augmentation.
    """
    if not data_dir:
        raise ValueError("unspecified data directory")
    all_files = _list_image_files_recursively(data_dir)
    classes = None
    if class_cond:
        # Assume classes are the first part of the filename,
        # before an underscore.
        class_names = [bf.basename(path).split("_")[0] for path in all_files]
        sorted_classes = {x: i for i, x in enumerate(sorted(set(class_names)))}
        classes = [sorted_classes[x] for x in class_names]
    dataset = ImageDataset(
        image_size,
        all_files,
        classes=classes,
        shard=MPI.COMM_WORLD.Get_rank(),
        num_shards=MPI.COMM_WORLD.Get_size(),
        random_crop=random_crop,
        random_flip=random_flip,
    )
    if deterministic:
        loader = DataLoader(
            dataset, batch_size=batch_size, shuffle=False, num_workers=1, drop_last=True
        )
    else:
        loader = DataLoader(
            dataset, batch_size=batch_size, shuffle=True, num_workers=1, drop_last=True
        )
    while True:
        yield from loader


def _list_image_files_recursively(data_dir):
    results = []
    for entry in sorted(bf.listdir(data_dir)):
        full_path = bf.join(data_dir, entry)
        ext = entry.split(".")[-1]
        if "." in entry and ext.lower() in ["jpg", "jpeg", "png", "gif"]:
            results.append(full_path)
        elif bf.isdir(full_path):
            results.extend(_list_image_files_recursively(full_path))
    return results


class ImageDataset(Dataset):
    def __init__(
        self,
        resolution,
        image_paths,
        classes=None,
        shard=0,
        num_shards=1,
        random_crop=False,
        random_flip=True,
    ):
        super().__init__()
        self.resolution = resolution
        self.local_images = image_paths[shard:][::num_shards]
        self.local_classes = None if classes is None else classes[shard:][::num_shards]
        self.random_crop = random_crop
        self.random_flip = random_flip

    def __len__(self):
        return len(self.local_images)

    def __getitem__(self, idx):
        path = self.local_images[idx]
        with bf.BlobFile(path, "rb") as f:
            pil_image = Image.open(f)
            pil_image.load()
        pil_image = pil_image.convert("RGB")

        if self.random_crop:
            arr = random_crop_arr(pil_image, self.resolution)
        else:
            arr = center_crop_arr(pil_image, self.resolution)

        if self.random_flip and random.random() < 0.5:
            arr = arr[:, ::-1]

        arr = arr.astype(np.float32) / 127.5 - 1

        out_dict = {}
        if self.local_classes is not None:
            out_dict["y"] = np.array(self.local_classes[idx], dtype=np.int64)
        return np.transpose(arr, [2, 0, 1]), out_dict


def center_crop_arr(pil_image, image_size):
    # We are not on a new enough PIL to support the `reducing_gap`
    # argument, which uses BOX downsampling at powers of two first.
    # Thus, we do it by hand to improve downsample quality.
    while min(*pil_image.size) >= 2 * image_size:
        pil_image = pil_image.resize(
            tuple(x // 2 for x in pil_image.size), resample=Image.BOX
        )

    scale = image_size / min(*pil_image.size)
    pil_image = pil_image.resize(
        tuple(round(x * scale) for x in pil_image.size), resample=Image.BICUBIC
    )

    arr = np.array(pil_image)
    crop_y = (arr.shape[0] - image_size) // 2
    crop_x = (arr.shape[1] - image_size) // 2
    return arr[crop_y : crop_y + image_size, crop_x : crop_x + image_size]


def random_crop_arr(pil_image, image_size, min_crop_frac=0.8, max_crop_frac=1.0):
    min_smaller_dim_size = math.ceil(image_size / max_crop_frac)
    max_smaller_dim_size = math.ceil(image_size / min_crop_frac)
    smaller_dim_size = random.randrange(min_smaller_dim_size, max_smaller_dim_size + 1)

    # We are not on a new enough PIL to support the `reducing_gap`
    # argument, which uses BOX downsampling at powers of two first.
    # Thus, we do it by hand to improve downsample quality.
    while min(*pil_image.size) >= 2 * smaller_dim_size:
        pil_image = pil_image.resize(
            tuple(x // 2 for x in pil_image.size), resample=Image.BOX
        )

    scale = smaller_dim_size / min(*pil_image.size)
    pil_image = pil_image.resize(
        tuple(round(x * scale) for x in pil_image.size), resample=Image.BICUBIC
    )

    arr = np.array(pil_image)
    crop_y = random.randrange(arr.shape[0] - image_size + 1)
    crop_x = random.randrange(arr.shape[1] - image_size + 1)
    return arr[crop_y : crop_y + image_size, crop_x : crop_x + image_size]
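
Editor's note: a quick illustration (not part of the uploaded file) of the load_data generator above. The dataset path is a placeholder, and mpi4py must be importable since load_data shards files by MPI rank.

data = load_data(
    data_dir="/path/to/images",  # placeholder dataset root
    batch_size=8,
    image_size=128,
    class_cond=False,
)
# load_data loops forever, so training code pulls batches with next():
batch, cond = next(data)  # batch: (8, 3, 128, 128) float32 tensor in [-1, 1]
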
guided-diffusion/guided_diffusion/logger.py
ADDED
@@ -0,0 +1,491 @@
"""
Logger copied from OpenAI baselines to avoid extra RL-based dependencies:
https://github.com/openai/baselines/blob/ea25b9e8b234e6ee1bca43083f8f3cf974143998/baselines/logger.py
"""

import os
import sys
import shutil
import os.path as osp
import json
import time
import datetime
import tempfile
import warnings
from collections import defaultdict
from contextlib import contextmanager
from torch.utils.tensorboard import SummaryWriter

DEBUG = 10
INFO = 20
WARN = 30
ERROR = 40

DISABLED = 50


class KVWriter(object):
    def writekvs(self, kvs):
        raise NotImplementedError


class SeqWriter(object):
    def writeseq(self, seq):
        raise NotImplementedError


class HumanOutputFormat(KVWriter, SeqWriter):
    def __init__(self, filename_or_file):
        if isinstance(filename_or_file, str):
            self.file = open(filename_or_file, "wt")
            self.own_file = True
        else:
            assert hasattr(filename_or_file, "read"), (
                "expected file or str, got %s" % filename_or_file
            )
            self.file = filename_or_file
            self.own_file = False

    def writekvs(self, kvs):
        # Create strings for printing
        key2str = {}
        for (key, val) in sorted(kvs.items()):
            if hasattr(val, "__float__"):
                valstr = "%-8.3g" % val
            else:
                valstr = str(val)
            key2str[self._truncate(key)] = self._truncate(valstr)

        # Find max widths
        if len(key2str) == 0:
            print("WARNING: tried to write empty key-value dict")
            return
        else:
            keywidth = max(map(len, key2str.keys()))
            valwidth = max(map(len, key2str.values()))

        # Write out the data
        dashes = "-" * (keywidth + valwidth + 7)
        lines = [dashes]
        for (key, val) in sorted(key2str.items(), key=lambda kv: kv[0].lower()):
            lines.append(
                "| %s%s | %s%s |"
                % (key, " " * (keywidth - len(key)), val, " " * (valwidth - len(val)))
            )
        lines.append(dashes)
        self.file.write("\n".join(lines) + "\n")

        # Flush the output to the file
        self.file.flush()

    def _truncate(self, s):
        maxlen = 30
        return s[: maxlen - 3] + "..." if len(s) > maxlen else s

    def writeseq(self, seq):
        seq = list(seq)
        for (i, elem) in enumerate(seq):
            self.file.write(elem)
            if i < len(seq) - 1:  # add space unless this is the last one
                self.file.write(" ")
        self.file.write("\n")
        self.file.flush()

    def close(self):
        if self.own_file:
            self.file.close()


class JSONOutputFormat(KVWriter):
    def __init__(self, filename):
        self.file = open(filename, "wt")

    def writekvs(self, kvs):
        for k, v in sorted(kvs.items()):
            if hasattr(v, "dtype"):
                kvs[k] = float(v)
        self.file.write(json.dumps(kvs) + "\n")
        self.file.flush()

    def close(self):
        self.file.close()


class CSVOutputFormat(KVWriter):
    def __init__(self, filename):
        self.file = open(filename, "w+t")
        self.keys = []
        self.sep = ","

    def writekvs(self, kvs):
        # Add our current row to the history
        extra_keys = list(kvs.keys() - self.keys)
        extra_keys.sort()
        if extra_keys:
            self.keys.extend(extra_keys)
            self.file.seek(0)
            lines = self.file.readlines()
            self.file.seek(0)
            for (i, k) in enumerate(self.keys):
                if i > 0:
                    self.file.write(",")
                self.file.write(k)
            self.file.write("\n")
            for line in lines[1:]:
                self.file.write(line[:-1])
                self.file.write(self.sep * len(extra_keys))
                self.file.write("\n")
        for (i, k) in enumerate(self.keys):
            if i > 0:
                self.file.write(",")
            v = kvs.get(k)
            if v is not None:
                self.file.write(str(v))
        self.file.write("\n")
        self.file.flush()

    def close(self):
        self.file.close()


class TensorBoardOutputFormat(KVWriter):
    """
    Dumps key/value pairs into TensorBoard's numeric format.
    """

    def __init__(self, dir):
        os.makedirs(dir, exist_ok=True)
        self.dir = dir
        self.step = -1
        self.writer = SummaryWriter(self.dir)

    def writekvs(self, kvs):
        self.step = int(kvs["step"])
        for k, v in sorted(kvs.items()):
            self.writer.add_scalar(k, float(v), self.step)
        self.writer.flush()

    def writeimage(self, key, image_tensor):
        self.writer.add_image(key, image_tensor, self.step)
        self.writer.flush()

    def close(self):
        if self.writer:
            self.writer.close()
            self.writer = None


def make_output_format(format, ev_dir, log_suffix=""):
    os.makedirs(ev_dir, exist_ok=True)
    if format == "stdout":
        return HumanOutputFormat(sys.stdout)
    elif format == "log":
        return HumanOutputFormat(osp.join(ev_dir, "log%s.txt" % log_suffix))
    elif format == "json":
        return JSONOutputFormat(osp.join(ev_dir, "progress%s.json" % log_suffix))
    elif format == "csv":
        return CSVOutputFormat(osp.join(ev_dir, "progress%s.csv" % log_suffix))
    elif format == "tensorboard":
        return TensorBoardOutputFormat(osp.join(ev_dir, "tb%s" % log_suffix))
    else:
        raise ValueError("Unknown format specified: %s" % (format,))


# ================================================================
# API
# ================================================================


def logimage(key, image_tensor):
    """
    Log one image to tensorboard
    """
    for fmt in get_current().output_formats:
        if isinstance(fmt, TensorBoardOutputFormat):
            tb_logger = fmt
            tb_logger.writeimage(key, image_tensor)


def logkv(key, val):
    """
    Log a value of some diagnostic
    Call this once for each diagnostic quantity, each iteration
    If called many times, last value will be used.
    """
    get_current().logkv(key, val)


def logkv_mean(key, val):
    """
    The same as logkv(), but if called many times, values averaged.
    """
    get_current().logkv_mean(key, val)


def logkvs(d):
    """
    Log a dictionary of key-value pairs
    """
    for (k, v) in d.items():
        logkv(k, v)


def dumpkvs():
    """
    Write all of the diagnostics from the current iteration
    """
    return get_current().dumpkvs()


def getkvs():
    return get_current().name2val


def log(*args, level=INFO):
    """
    Write the sequence of args, with no separators, to the console and output
    files (if you've configured an output file).
    """
    get_current().log(*args, level=level)


def debug(*args):
    log(*args, level=DEBUG)


def info(*args):
    log(*args, level=INFO)


def warn(*args):
    log(*args, level=WARN)


def error(*args):
    log(*args, level=ERROR)


def set_level(level):
    """
    Set logging threshold on current logger.
    """
    get_current().set_level(level)


def set_comm(comm):
    get_current().set_comm(comm)


def get_dir():
    """
    Get directory that log files are being written to.
    Will be None if there is no output directory (i.e., if you didn't call start).
    """
    return get_current().get_dir()


record_tabular = logkv
dump_tabular = dumpkvs


@contextmanager
def profile_kv(scopename):
    logkey = "wait_" + scopename
    tstart = time.time()
    try:
        yield
    finally:
        get_current().name2val[logkey] += time.time() - tstart


def profile(n):
    """
    Usage:
    @profile("my_func")
    def my_func(): code
    """

    def decorator_with_name(func):
        def func_wrapper(*args, **kwargs):
            with profile_kv(n):
                return func(*args, **kwargs)

        return func_wrapper

    return decorator_with_name


# ================================================================
# Backend
# ================================================================


def get_current():
    if Logger.CURRENT is None:
        _configure_default_logger()

    return Logger.CURRENT


class Logger(object):
    DEFAULT = None  # A logger with no output files. (See right below class definition)
    # So that you can still log to the terminal without setting up any output files
    CURRENT = None  # Current logger being used by the free functions above

    def __init__(self, dir, output_formats, comm=None):
        self.name2val = defaultdict(float)  # values this iteration
        self.name2cnt = defaultdict(int)
        self.level = INFO
        self.dir = dir
        self.output_formats = output_formats
        self.comm = comm

    # Logging API, forwarded
    # ----------------------------------------
    def logkv(self, key, val):
        self.name2val[key] = val

    def logkv_mean(self, key, val):
        oldval, cnt = self.name2val[key], self.name2cnt[key]
        self.name2val[key] = oldval * cnt / (cnt + 1) + val / (cnt + 1)
        self.name2cnt[key] = cnt + 1

    def dumpkvs(self):
        if self.comm is None:
            d = self.name2val
        else:
            d = mpi_weighted_mean(
                self.comm,
                {
                    name: (val, self.name2cnt.get(name, 1))
                    for (name, val) in self.name2val.items()
                },
            )
            if self.comm.rank != 0:
                d["dummy"] = 1  # so we don't get a warning about empty dict
        out = d.copy()  # Return the dict for unit testing purposes
        for fmt in self.output_formats:
            if isinstance(fmt, KVWriter):
                fmt.writekvs(d)
        self.name2val.clear()
        self.name2cnt.clear()
        return out

    def log(self, *args, level=INFO):
        if self.level <= level:
            self._do_log(args)

    # Configuration
    # ----------------------------------------
    def set_level(self, level):
        self.level = level

    def set_comm(self, comm):
        self.comm = comm

    def get_dir(self):
        return self.dir

    def close(self):
        for fmt in self.output_formats:
            fmt.close()

    # Misc
    # ----------------------------------------
    def _do_log(self, args):
        for fmt in self.output_formats:
            if isinstance(fmt, SeqWriter):
                fmt.writeseq(map(str, args))


def get_rank_without_mpi_import():
    # check environment variables here instead of importing mpi4py
    # to avoid calling MPI_Init() when this module is imported
    for varname in ["PMI_RANK", "OMPI_COMM_WORLD_RANK"]:
        if varname in os.environ:
            return int(os.environ[varname])
    return 0


def mpi_weighted_mean(comm, local_name2valcount):
    """
    Copied from: https://github.com/openai/baselines/blob/ea25b9e8b234e6ee1bca43083f8f3cf974143998/baselines/common/mpi_util.py#L110
    Perform a weighted average over dicts that are each on a different node
    Input: local_name2valcount: dict mapping key -> (value, count)
    Returns: key -> mean
    """
    all_name2valcount = comm.gather(local_name2valcount)
    if comm.rank == 0:
        name2sum = defaultdict(float)
        name2count = defaultdict(float)
        for n2vc in all_name2valcount:
            for (name, (val, count)) in n2vc.items():
                try:
                    val = float(val)
                except ValueError:
                    if comm.rank == 0:
                        warnings.warn(
                            "WARNING: tried to compute mean on non-float {}={}".format(
                                name, val
                            )
                        )
                else:
                    name2sum[name] += val * count
                    name2count[name] += count
        return {name: name2sum[name] / name2count[name] for name in name2sum}
    else:
        return {}


def configure(dir=None, format_strs=None, comm=None, log_suffix=""):
    """
    If comm is provided, average all numerical stats across that comm
    """
    if dir is None:
        dir = os.getenv("OPENAI_LOGDIR")
    if dir is None:
        dir = osp.join(
            tempfile.gettempdir(),
            datetime.datetime.now().strftime("openai-%Y-%m-%d-%H-%M-%S-%f"),
        )
    assert isinstance(dir, str)
    dir = os.path.expanduser(dir)
    os.makedirs(os.path.expanduser(dir), exist_ok=True)

    rank = get_rank_without_mpi_import()
    if rank > 0:
        log_suffix = log_suffix + "-rank%03i" % rank

    if format_strs is None:
        if rank == 0:
            format_strs = os.getenv("OPENAI_LOG_FORMAT", "stdout,log,csv,tensorboard").split(",")
        else:
            format_strs = os.getenv("OPENAI_LOG_FORMAT_MPI", "log").split(",")
    format_strs = filter(None, format_strs)
    output_formats = [make_output_format(f, dir, log_suffix) for f in format_strs]

    Logger.CURRENT = Logger(dir=dir, output_formats=output_formats, comm=comm)
    if output_formats:
        log("Logging to %s" % dir)


def _configure_default_logger():
    configure()
    Logger.DEFAULT = Logger.CURRENT


def reset():
    if Logger.CURRENT is not Logger.DEFAULT:
        Logger.CURRENT.close()
        Logger.CURRENT = Logger.DEFAULT
        log("Reset logger")


@contextmanager
def scoped_configure(dir=None, format_strs=None, comm=None):
    prevlogger = Logger.CURRENT
    configure(dir=dir, format_strs=format_strs, comm=comm)
    try:
        yield
    finally:
        Logger.CURRENT.close()
        Logger.CURRENT = prevlogger
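
Editor's note: a minimal sketch (not part of the uploaded file) of the logger API defined above. The import assumes the guided_diffusion package from setup.py is installed, and the log directory is a placeholder. logkv()/logkv_mean() accumulate values for one iteration, and dumpkvs() flushes a row to each configured format.

from guided_diffusion import logger

logger.configure(dir="/tmp/example_run")  # placeholder log directory
for step in range(3):
    logger.logkv("step", step)            # TensorBoardOutputFormat keys off "step"
    logger.logkv_mean("loss", 0.1 * step)
    logger.dumpkvs()                      # writes one row (stdout/log/csv/tensorboard)
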
guided-diffusion/guided_diffusion/losses.py
ADDED
@@ -0,0 +1,77 @@
"""
Helpers for various likelihood-based losses. These are ported from the original
Ho et al. diffusion models codebase:
https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/utils.py
"""

import numpy as np

import torch as th


def normal_kl(mean1, logvar1, mean2, logvar2):
    """
    Compute the KL divergence between two gaussians.

    Shapes are automatically broadcasted, so batches can be compared to
    scalars, among other use cases.
    """
    tensor = None
    for obj in (mean1, logvar1, mean2, logvar2):
        if isinstance(obj, th.Tensor):
            tensor = obj
            break
    assert tensor is not None, "at least one argument must be a Tensor"

    # Force variances to be Tensors. Broadcasting helps convert scalars to
    # Tensors, but it does not work for th.exp().
    logvar1, logvar2 = [
        x if isinstance(x, th.Tensor) else th.tensor(x).to(tensor)
        for x in (logvar1, logvar2)
    ]

    return 0.5 * (
        -1.0
        + logvar2
        - logvar1
        + th.exp(logvar1 - logvar2)
        + ((mean1 - mean2) ** 2) * th.exp(-logvar2)
    )


def approx_standard_normal_cdf(x):
    """
    A fast approximation of the cumulative distribution function of the
    standard normal.
    """
    return 0.5 * (1.0 + th.tanh(np.sqrt(2.0 / np.pi) * (x + 0.044715 * th.pow(x, 3))))


def discretized_gaussian_log_likelihood(x, *, means, log_scales):
    """
    Compute the log-likelihood of a Gaussian distribution discretizing to a
    given image.

    :param x: the target images. It is assumed that this was uint8 values,
              rescaled to the range [-1, 1].
    :param means: the Gaussian mean Tensor.
    :param log_scales: the Gaussian log stddev Tensor.
    :return: a tensor like x of log probabilities (in nats).
    """
    assert x.shape == means.shape == log_scales.shape
    centered_x = x - means
    inv_stdv = th.exp(-log_scales)
    plus_in = inv_stdv * (centered_x + 1.0 / 255.0)
    cdf_plus = approx_standard_normal_cdf(plus_in)
    min_in = inv_stdv * (centered_x - 1.0 / 255.0)
    cdf_min = approx_standard_normal_cdf(min_in)
    log_cdf_plus = th.log(cdf_plus.clamp(min=1e-12))
    log_one_minus_cdf_min = th.log((1.0 - cdf_min).clamp(min=1e-12))
    cdf_delta = cdf_plus - cdf_min
    log_probs = th.where(
        x < -0.999,
        log_cdf_plus,
        th.where(x > 0.999, log_one_minus_cdf_min, th.log(cdf_delta.clamp(min=1e-12))),
    )
    assert log_probs.shape == x.shape
    return log_probs
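
Editor's note: a quick sanity check (not part of the uploaded file). For log-variances lv = log(v), the normal_kl expression above reduces to the textbook KL = 0.5 * (log(v2/v1) + v1/v2 + (m1-m2)^2/v2 - 1); e.g. KL(N(0,1) || N(1,1)) = 0.5.

import torch as th

m1, lv1, m2, lv2 = th.tensor(0.0), th.tensor(0.0), th.tensor(1.0), th.tensor(0.0)
# Same formula as normal_kl, written inline for the scalar case:
kl = 0.5 * (-1.0 + lv2 - lv1 + th.exp(lv1 - lv2) + ((m1 - m2) ** 2) * th.exp(-lv2))
assert abs(kl.item() - 0.5) < 1e-6  # KL(N(0,1) || N(1,1)) = 0.5
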
guided-diffusion/guided_diffusion/lpips.py
ADDED
@@ -0,0 +1,20 @@
from lpips_pytorch import LPIPS
import torch


class LPIPS1(LPIPS):
    r"""
    Overriding LPIPS to return the loss without reducing over the batch.
    Arguments:
        net_type (str): the network type to compare the features:
                        'alex' | 'squeeze' | 'vgg'. Default: 'alex'.
        version (str): the version of LPIPS. Default: 0.1.
    """
    def __init__(self, net_type: str = 'alex', version: str = '0.1'):
        # Forward the arguments instead of hard-coding 'alex'/'0.1', so the
        # documented net_type/version parameters actually take effect.
        super(LPIPS1, self).__init__(net_type=net_type, version=version)

    def forward(self, x: torch.Tensor, y: torch.Tensor):
        feat_x, feat_y = self.net(x), self.net(y)
        diff = [(fx - fy) ** 2 for fx, fy in zip(feat_x, feat_y)]
        res = [l(d).mean((2, 3), True) for d, l in zip(diff, self.lin)]
        # Sum over feature scales (dim 1), keeping one distance per batch item.
        # return torch.sum(torch.cat(res, 0), 0, True)
        return torch.sum(torch.cat(res, 1), 1, True)
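
Editor's note: a minimal usage sketch (not part of the uploaded file). Because LPIPS1 keeps the batch dimension, an (N, 3, H, W) pair yields an (N, 1, 1, 1) tensor of per-sample distances that can be masked or weighted before any reduction; input range conventions follow lpips_pytorch.

import torch

metric = LPIPS1(net_type='alex')
a = torch.rand(2, 3, 64, 64)
b = torch.rand(2, 3, 64, 64)
d = metric(a, b)  # shape (2, 1, 1, 1): one LPIPS distance per image
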
guided-diffusion/guided_diffusion/nn.py
ADDED
|
@@ -0,0 +1,170 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Various utilities for neural networks.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import math
|
| 6 |
+
|
| 7 |
+
import torch as th
|
| 8 |
+
import torch.nn as nn
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
# PyTorch 1.7 has SiLU, but we support PyTorch 1.5.
|
| 12 |
+
class SiLU(nn.Module):
|
| 13 |
+
def forward(self, x):
|
| 14 |
+
return x * th.sigmoid(x)
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class GroupNorm32(nn.GroupNorm):
|
| 18 |
+
def forward(self, x):
|
| 19 |
+
return super().forward(x.float()).type(x.dtype)
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def conv_nd(dims, *args, **kwargs):
|
| 23 |
+
"""
|
| 24 |
+
Create a 1D, 2D, or 3D convolution module.
|
| 25 |
+
"""
|
| 26 |
+
if dims == 1:
|
| 27 |
+
return nn.Conv1d(*args, **kwargs)
|
| 28 |
+
elif dims == 2:
|
| 29 |
+
return nn.Conv2d(*args, **kwargs)
|
| 30 |
+
elif dims == 3:
|
| 31 |
+
return nn.Conv3d(*args, **kwargs)
|
| 32 |
+
raise ValueError(f"unsupported dimensions: {dims}")
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
def linear(*args, **kwargs):
|
| 36 |
+
"""
|
| 37 |
+
Create a linear module.
|
| 38 |
+
"""
|
| 39 |
+
return nn.Linear(*args, **kwargs)
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def avg_pool_nd(dims, *args, **kwargs):
|
| 43 |
+
"""
|
| 44 |
+
Create a 1D, 2D, or 3D average pooling module.
|
| 45 |
+
"""
|
| 46 |
+
if dims == 1:
|
| 47 |
+
return nn.AvgPool1d(*args, **kwargs)
|
| 48 |
+
elif dims == 2:
|
| 49 |
+
return nn.AvgPool2d(*args, **kwargs)
|
| 50 |
+
elif dims == 3:
|
| 51 |
+
return nn.AvgPool3d(*args, **kwargs)
|
| 52 |
+
raise ValueError(f"unsupported dimensions: {dims}")
|
| 53 |
+
|
| 54 |
+
|
| 55 |
+
def update_ema(target_params, source_params, rate=0.99):
|
| 56 |
+
"""
|
| 57 |
+
Update target parameters to be closer to those of source parameters using
|
| 58 |
+
an exponential moving average.
|
| 59 |
+
|
| 60 |
+
:param target_params: the target parameter sequence.
|
| 61 |
+
:param source_params: the source parameter sequence.
|
| 62 |
+
:param rate: the EMA rate (closer to 1 means slower).
|
| 63 |
+
"""
|
| 64 |
+
for targ, src in zip(target_params, source_params):
|
| 65 |
+
targ.detach().mul_(rate).add_(src, alpha=1 - rate)
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def zero_module(module):
|
| 69 |
+
"""
|
| 70 |
+
Zero out the parameters of a module and return it.
|
| 71 |
+
"""
|
| 72 |
+
for p in module.parameters():
|
| 73 |
+
p.detach().zero_()
|
| 74 |
+
return module
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
def scale_module(module, scale):
|
| 78 |
+
"""
|
| 79 |
+
Scale the parameters of a module and return it.
|
| 80 |
+
"""
|
| 81 |
+
for p in module.parameters():
|
| 82 |
+
p.detach().mul_(scale)
|
| 83 |
+
return module
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
def mean_flat(tensor):
|
| 87 |
+
"""
|
| 88 |
+
Take the mean over all non-batch dimensions.
|
| 89 |
+
"""
|
| 90 |
+
return tensor.mean(dim=list(range(1, len(tensor.shape))))
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def normalization(channels):
|
| 94 |
+
"""
|
| 95 |
+
Make a standard normalization layer.
|
| 96 |
+
|
| 97 |
+
:param channels: number of input channels.
|
| 98 |
+
:return: an nn.Module for normalization.
|
| 99 |
+
"""
|
| 100 |
+
return GroupNorm32(32, channels)
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
def timestep_embedding(timesteps, dim, max_period=10000):
|
| 104 |
+
"""
|
| 105 |
+
Create sinusoidal timestep embeddings.
|
| 106 |
+
|
| 107 |
+
:param timesteps: a 1-D Tensor of N indices, one per batch element.
|
| 108 |
+
These may be fractional.
|
| 109 |
+
:param dim: the dimension of the output.
|
| 110 |
+
:param max_period: controls the minimum frequency of the embeddings.
|
| 111 |
+
:return: an [N x dim] Tensor of positional embeddings.
|
| 112 |
+
"""
|
| 113 |
+
half = dim // 2
|
| 114 |
+
freqs = th.exp(
|
| 115 |
+
-math.log(max_period) * th.arange(start=0, end=half, dtype=th.float32) / half
|
| 116 |
+
).to(device=timesteps.device)
|
| 117 |
+
args = timesteps[:, None].float() * freqs[None]
|
| 118 |
+
embedding = th.cat([th.cos(args), th.sin(args)], dim=-1)
|
| 119 |
+
if dim % 2:
|
| 120 |
+
embedding = th.cat([embedding, th.zeros_like(embedding[:, :1])], dim=-1)
|
| 121 |
+
return embedding
|
| 122 |
+
|
| 123 |
+
|
def checkpoint(func, inputs, params, flag):
    """
    Evaluate a function without caching intermediate activations, allowing for
    reduced memory at the expense of extra compute in the backward pass.

    :param func: the function to evaluate.
    :param inputs: the argument sequence to pass to `func`.
    :param params: a sequence of parameters `func` depends on but does not
                   explicitly take as arguments.
    :param flag: if False, disable gradient checkpointing.
    """
    if flag:
        args = tuple(inputs) + tuple(params)
        return CheckpointFunction.apply(func, len(inputs), *args)
    else:
        return func(*inputs)


class CheckpointFunction(th.autograd.Function):
    @staticmethod
    def forward(ctx, run_function, length, *args):
        ctx.run_function = run_function
        ctx.input_tensors = list(args[:length])
        ctx.input_params = list(args[length:])
        with th.no_grad():
            output_tensors = ctx.run_function(*ctx.input_tensors)
        return output_tensors

    @staticmethod
    def backward(ctx, *output_grads):
        ctx.input_tensors = [x.detach().requires_grad_(True) for x in ctx.input_tensors]
        with th.enable_grad():
            # Fixes a bug where the first op in run_function modifies the
            # Tensor storage in place, which is not allowed for detach()'d
            # Tensors.
            shallow_copies = [x.view_as(x) for x in ctx.input_tensors]
            output_tensors = ctx.run_function(*shallow_copies)
        input_grads = th.autograd.grad(
            output_tensors,
            ctx.input_tensors + ctx.input_params,
            output_grads,
            allow_unused=True,
        )
        del ctx.input_tensors
        del ctx.input_params
        del output_tensors
        return (None, None) + input_grads
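A hedged usage sketch for `checkpoint` (the toy linear layer is hypothetical, not from this repo): with `flag=True` the forward pass runs under `no_grad` and activations are recomputed during backward, trading compute for memory.

import torch as th
import torch.nn as nn
from guided_diffusion.nn import checkpoint  # assumed install path

layer = nn.Linear(16, 16)
x = th.randn(2, 16, requires_grad=True)
y = checkpoint(layer, (x,), layer.parameters(), True)  # flag=False would just call layer(x)
y.sum().backward()
assert x.grad is not None  # gradients still flow despite the no_grad forward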
guided-diffusion/guided_diffusion/resample.py
ADDED
@@ -0,0 +1,154 @@
from abc import ABC, abstractmethod

import numpy as np
import torch as th
import torch.distributed as dist


def create_named_schedule_sampler(name, diffusion):
    """
    Create a ScheduleSampler from a library of pre-defined samplers.

    :param name: the name of the sampler.
    :param diffusion: the diffusion object to sample for.
    """
    if name == "uniform":
        return UniformSampler(diffusion)
    elif name == "loss-second-moment":
        return LossSecondMomentResampler(diffusion)
    else:
        raise NotImplementedError(f"unknown schedule sampler: {name}")


class ScheduleSampler(ABC):
    """
    A distribution over timesteps in the diffusion process, intended to reduce
    variance of the objective.

    By default, samplers perform unbiased importance sampling, in which the
    objective's mean is unchanged.
    However, subclasses may override sample() to change how the resampled
    terms are reweighted, allowing for actual changes in the objective.
    """

    @abstractmethod
    def weights(self):
        """
        Get a numpy array of weights, one per diffusion step.

        The weights needn't be normalized, but must be positive.
        """

    def sample(self, batch_size, device):
        """
        Importance-sample timesteps for a batch.

        :param batch_size: the number of timesteps.
        :param device: the torch device to save to.
        :return: a tuple (timesteps, weights):
                 - timesteps: a tensor of timestep indices.
                 - weights: a tensor of weights to scale the resulting losses.
        """
        w = self.weights()
        p = w / np.sum(w)
        indices_np = np.random.choice(len(p), size=(batch_size,), p=p)
        indices = th.from_numpy(indices_np).long().to(device)
        weights_np = 1 / (len(p) * p[indices_np])
        weights = th.from_numpy(weights_np).float().to(device)
        return indices, weights


class UniformSampler(ScheduleSampler):
    def __init__(self, diffusion):
        self.diffusion = diffusion
        self._weights = np.ones([diffusion.num_timesteps])

    def weights(self):
        return self._weights


class LossAwareSampler(ScheduleSampler):
    def update_with_local_losses(self, local_ts, local_losses):
        """
        Update the reweighting using losses from a model.

        Call this method from each rank with a batch of timesteps and the
        corresponding losses for each of those timesteps.
        This method will perform synchronization to make sure all of the ranks
        maintain the exact same reweighting.

        :param local_ts: an integer Tensor of timesteps.
        :param local_losses: a 1D Tensor of losses.
        """
        batch_sizes = [
            th.tensor([0], dtype=th.int32, device=local_ts.device)
            for _ in range(dist.get_world_size())
        ]
        dist.all_gather(
            batch_sizes,
            th.tensor([len(local_ts)], dtype=th.int32, device=local_ts.device),
        )

        # Pad all_gather batches to be the maximum batch size.
        batch_sizes = [x.item() for x in batch_sizes]
        max_bs = max(batch_sizes)

        timestep_batches = [th.zeros(max_bs).to(local_ts) for bs in batch_sizes]
        loss_batches = [th.zeros(max_bs).to(local_losses) for bs in batch_sizes]
        dist.all_gather(timestep_batches, local_ts)
        dist.all_gather(loss_batches, local_losses)
        timesteps = [
            x.item() for y, bs in zip(timestep_batches, batch_sizes) for x in y[:bs]
        ]
        losses = [x.item() for y, bs in zip(loss_batches, batch_sizes) for x in y[:bs]]
        self.update_with_all_losses(timesteps, losses)

    @abstractmethod
    def update_with_all_losses(self, ts, losses):
        """
        Update the reweighting using losses from a model.

        Sub-classes should override this method to update the reweighting
        using losses from the model.

        This method directly updates the reweighting without synchronizing
        between workers. It is called by update_with_local_losses from all
        ranks with identical arguments. Thus, it should have deterministic
        behavior to maintain state across workers.

        :param ts: a list of int timesteps.
        :param losses: a list of float losses, one per timestep.
        """


class LossSecondMomentResampler(LossAwareSampler):
    def __init__(self, diffusion, history_per_term=10, uniform_prob=0.001):
        self.diffusion = diffusion
        self.history_per_term = history_per_term
        self.uniform_prob = uniform_prob
        self._loss_history = np.zeros(
            [diffusion.num_timesteps, history_per_term], dtype=np.float64
        )
        # `np.int` was removed in NumPy 1.24; the builtin `int` is equivalent here.
        self._loss_counts = np.zeros([diffusion.num_timesteps], dtype=int)

    def weights(self):
        if not self._warmed_up():
            return np.ones([self.diffusion.num_timesteps], dtype=np.float64)
        weights = np.sqrt(np.mean(self._loss_history ** 2, axis=-1))
        weights /= np.sum(weights)
        weights *= 1 - self.uniform_prob
        weights += self.uniform_prob / len(weights)
        return weights

    def update_with_all_losses(self, ts, losses):
        for t, loss in zip(ts, losses):
            if self._loss_counts[t] == self.history_per_term:
                # Shift out the oldest loss term.
                self._loss_history[t, :-1] = self._loss_history[t, 1:]
                self._loss_history[t, -1] = loss
            else:
                self._loss_history[t, self._loss_counts[t]] = loss
                self._loss_counts[t] += 1

    def _warmed_up(self):
        return (self._loss_counts == self.history_per_term).all()
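A minimal sketch of the sampler API (the `_Dummy` stand-in is hypothetical; any object exposing `num_timesteps` satisfies `UniformSampler`):

import torch as th
from guided_diffusion.resample import UniformSampler  # assumed install path

class _Dummy:
    num_timesteps = 1000  # the only attribute the sampler reads

t, w = UniformSampler(_Dummy()).sample(batch_size=8, device=th.device("cpu"))
assert t.shape == (8,) and th.allclose(w, th.ones(8))  # uniform sampling => unit loss weights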
guided-diffusion/guided_diffusion/respace.py
ADDED
@@ -0,0 +1,128 @@
import numpy as np
import torch as th

from .gaussian_diffusion import GaussianDiffusion


def space_timesteps(num_timesteps, section_counts):
    """
    Create a list of timesteps to use from an original diffusion process,
    given the number of timesteps we want to take from equally-sized portions
    of the original process.

    For example, if there are 300 timesteps and the section counts are [10,15,20]
    then the first 100 timesteps are strided to be 10 timesteps, the second 100
    are strided to be 15 timesteps, and the final 100 are strided to be 20.

    If the stride is a string starting with "ddim", then the fixed striding
    from the DDIM paper is used, and only one section is allowed.

    :param num_timesteps: the number of diffusion steps in the original
                          process to divide up.
    :param section_counts: either a list of numbers, or a string containing
                           comma-separated numbers, indicating the step count
                           per section. As a special case, use "ddimN" where N
                           is a number of steps to use the striding from the
                           DDIM paper.
    :return: a set of diffusion steps from the original process to use.
    """
    if isinstance(section_counts, str):
        if section_counts.startswith("ddim"):
            desired_count = int(section_counts[len("ddim") :])
            for i in range(1, num_timesteps):
                if len(range(0, num_timesteps, i)) == desired_count:
                    return set(range(0, num_timesteps, i))
            raise ValueError(
                f"cannot create exactly {desired_count} steps with an integer stride"
            )
        section_counts = [int(x) for x in section_counts.split(",")]
    size_per = num_timesteps // len(section_counts)
    extra = num_timesteps % len(section_counts)
    start_idx = 0
    all_steps = []
    for i, section_count in enumerate(section_counts):
        size = size_per + (1 if i < extra else 0)
        if size < section_count:
            raise ValueError(
                f"cannot divide section of {size} steps into {section_count}"
            )
        if section_count <= 1:
            frac_stride = 1
        else:
            frac_stride = (size - 1) / (section_count - 1)
        cur_idx = 0.0
        taken_steps = []
        for _ in range(section_count):
            taken_steps.append(start_idx + round(cur_idx))
            cur_idx += frac_stride
        all_steps += taken_steps
        start_idx += size
    return set(all_steps)

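The docstring's own example, checked concretely (a sketch assuming the package import path):

from guided_diffusion.respace import space_timesteps  # assumed install path

steps = space_timesteps(300, [10, 15, 20])  # three 100-step sections
assert len(steps) == 45                     # 10 + 15 + 20 retained steps
assert space_timesteps(1000, "ddim25") == set(range(0, 1000, 40))  # fixed DDIM stride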
class SpacedDiffusion(GaussianDiffusion):
    """
    A diffusion process which can skip steps in a base diffusion process.

    :param use_timesteps: a collection (sequence or set) of timesteps from the
                          original diffusion process to retain.
    :param kwargs: the kwargs to create the base diffusion process.
    """

    def __init__(self, use_timesteps, **kwargs):
        self.use_timesteps = set(use_timesteps)
        self.timestep_map = []
        self.original_num_steps = len(kwargs["betas"])

        base_diffusion = GaussianDiffusion(**kwargs)  # pylint: disable=missing-kwoa
        last_alpha_cumprod = 1.0
        new_betas = []
        for i, alpha_cumprod in enumerate(base_diffusion.alphas_cumprod):
            if i in self.use_timesteps:
                new_betas.append(1 - alpha_cumprod / last_alpha_cumprod)
                last_alpha_cumprod = alpha_cumprod
                self.timestep_map.append(i)
        kwargs["betas"] = np.array(new_betas)
        super().__init__(**kwargs)

    def p_mean_variance(
        self, model, *args, **kwargs
    ):  # pylint: disable=signature-differs
        return super().p_mean_variance(self._wrap_model(model), *args, **kwargs)

    def training_losses(
        self, model, *args, **kwargs
    ):  # pylint: disable=signature-differs
        return super().training_losses(self._wrap_model(model), *args, **kwargs)

    def condition_mean(self, cond_fn, *args, **kwargs):
        return super().condition_mean(self._wrap_model(cond_fn), *args, **kwargs)

    def condition_score(self, cond_fn, *args, **kwargs):
        return super().condition_score(self._wrap_model(cond_fn), *args, **kwargs)

    def _wrap_model(self, model):
        if isinstance(model, _WrappedModel):
            return model
        return _WrappedModel(
            model, self.timestep_map, self.rescale_timesteps, self.original_num_steps
        )

    def _scale_timesteps(self, t):
        # Scaling is done by the wrapped model.
        return t


class _WrappedModel:
    def __init__(self, model, timestep_map, rescale_timesteps, original_num_steps):
        self.model = model
        self.timestep_map = timestep_map
        self.rescale_timesteps = rescale_timesteps
        self.original_num_steps = original_num_steps

    def __call__(self, x, ts, **kwargs):
        map_tensor = th.tensor(self.timestep_map, device=ts.device, dtype=ts.dtype)
        new_ts = map_tensor[ts]
        if self.rescale_timesteps:
            new_ts = new_ts.float() * (1000.0 / self.original_num_steps)
        return self.model(x, new_ts, **kwargs)
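A small illustration (hypothetical values) of the remapping `_WrappedModel` performs: the sampler indexes the shortened schedule, while the network still sees original-process timesteps.

import torch as th

timestep_map = list(range(0, 1000, 40))  # what SpacedDiffusion builds for "ddim25"
ts = th.tensor([0, 1, 24])               # indices into the respaced schedule
new_ts = th.tensor(timestep_map)[ts]     # what the wrapped model receives
assert new_ts.tolist() == [0, 40, 960]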
guided-diffusion/guided_diffusion/script_util.py
ADDED
@@ -0,0 +1,614 @@
import argparse
import inspect

from . import gaussian_diffusion as gd
from .respace import SpacedDiffusion, space_timesteps
from .unet import SuperResModel, UNetModel, EncoderUNetModel, TFGModel

NUM_CLASSES = 1000


def diffusion_defaults():
    """
    Defaults for image and classifier training.
    """
    return dict(
        learn_sigma=False,
        diffusion_steps=1000,
        noise_schedule="linear",
        timestep_respacing="",
        use_kl=False,
        predict_xstart=False,
        rescale_timesteps=False,
        rescale_learned_sigmas=False,
        loss_variation=0,  # added by soumik
    )


def classifier_defaults():
    """
    Defaults for classifier models.
    """
    return dict(
        image_size=64,
        classifier_use_fp16=False,
        classifier_width=128,
        classifier_depth=2,
        classifier_attention_resolutions="32,16,8",  # 16
        classifier_use_scale_shift_norm=True,  # False
        classifier_resblock_updown=True,  # False
        classifier_pool="attention",
    )


def model_and_diffusion_defaults():
    """
    Defaults for image training.
    """
    res = dict(
        image_size=64,
        num_channels=128,
        num_res_blocks=2,
        num_heads=4,
        num_heads_upsample=-1,
        num_head_channels=-1,
        attention_resolutions="16,8",
        channel_mult="",
        dropout=0.0,
        class_cond=False,
        use_checkpoint=False,
        use_scale_shift_norm=True,
        resblock_updown=False,
        use_fp16=False,
        use_new_attention_order=False,
    )
    res.update(diffusion_defaults())
    return res


def classifier_and_diffusion_defaults():
    res = classifier_defaults()
    res.update(diffusion_defaults())
    return res


def create_model_and_diffusion(
    image_size,
    class_cond,
    learn_sigma,
    num_channels,
    num_res_blocks,
    channel_mult,
    num_heads,
    num_head_channels,
    num_heads_upsample,
    attention_resolutions,
    dropout,
    diffusion_steps,
    noise_schedule,
    timestep_respacing,
    use_kl,
    predict_xstart,
    rescale_timesteps,
    rescale_learned_sigmas,
    use_checkpoint,
    use_scale_shift_norm,
    resblock_updown,
    use_fp16,
    use_new_attention_order,
):
    model = create_model(
        image_size,
        num_channels,
        num_res_blocks,
        channel_mult=channel_mult,
        learn_sigma=learn_sigma,
        class_cond=class_cond,
        use_checkpoint=use_checkpoint,
        attention_resolutions=attention_resolutions,
        num_heads=num_heads,
        num_head_channels=num_head_channels,
        num_heads_upsample=num_heads_upsample,
        use_scale_shift_norm=use_scale_shift_norm,
        dropout=dropout,
        resblock_updown=resblock_updown,
        use_fp16=use_fp16,
        use_new_attention_order=use_new_attention_order,
    )
    diffusion = create_gaussian_diffusion(
        steps=diffusion_steps,
        learn_sigma=learn_sigma,
        noise_schedule=noise_schedule,
        use_kl=use_kl,
        predict_xstart=predict_xstart,
        rescale_timesteps=rescale_timesteps,
        rescale_learned_sigmas=rescale_learned_sigmas,
        timestep_respacing=timestep_respacing,
    )
    return model, diffusion


def create_model(
    image_size,
    num_channels,
    num_res_blocks,
    channel_mult="",
    learn_sigma=False,
    class_cond=False,
    use_checkpoint=False,
    attention_resolutions="16",
    num_heads=1,
    num_head_channels=-1,
    num_heads_upsample=-1,
    use_scale_shift_norm=False,
    dropout=0,
    resblock_updown=False,
    use_fp16=False,
    use_new_attention_order=False,
):
    if channel_mult == "":
        if image_size == 512:
            channel_mult = (0.5, 1, 1, 2, 2, 4, 4)
        elif image_size == 256:
            channel_mult = (1, 1, 2, 2, 4, 4)
        elif image_size == 128:
            channel_mult = (1, 1, 2, 3, 4)
        elif image_size == 64:
            channel_mult = (1, 2, 3, 4)
        else:
            raise ValueError(f"unsupported image size: {image_size}")
    else:
        channel_mult = tuple(int(ch_mult) for ch_mult in channel_mult.split(","))

    attention_ds = []
    for res in attention_resolutions.split(","):
        attention_ds.append(image_size // int(res))

    return UNetModel(
        image_size=image_size,
        in_channels=3,
        model_channels=num_channels,
        out_channels=(3 if not learn_sigma else 6),
        num_res_blocks=num_res_blocks,
        attention_resolutions=tuple(attention_ds),
        dropout=dropout,
        channel_mult=channel_mult,
        num_classes=(NUM_CLASSES if class_cond else None),
        use_checkpoint=use_checkpoint,
        use_fp16=use_fp16,
        num_heads=num_heads,
        num_head_channels=num_head_channels,
        num_heads_upsample=num_heads_upsample,
        use_scale_shift_norm=use_scale_shift_norm,
        resblock_updown=resblock_updown,
        use_new_attention_order=use_new_attention_order,
    )

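Note the convention `attention_resolutions` uses: the string names feature-map resolutions, which `create_model` converts into downsample factors relative to the input size. A quick hypothetical check:

image_size = 64
attention_ds = [image_size // int(res) for res in "16,8".split(",")]
assert attention_ds == [4, 8]  # attention at the 16x16 and 8x8 feature maps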
def create_classifier_and_diffusion(
    image_size,
    classifier_use_fp16,
    classifier_width,
    classifier_depth,
    classifier_attention_resolutions,
    classifier_use_scale_shift_norm,
    classifier_resblock_updown,
    classifier_pool,
    learn_sigma,
    diffusion_steps,
    noise_schedule,
    timestep_respacing,
    use_kl,
    predict_xstart,
    rescale_timesteps,
    rescale_learned_sigmas,
):
    classifier = create_classifier(
        image_size,
        classifier_use_fp16,
        classifier_width,
        classifier_depth,
        classifier_attention_resolutions,
        classifier_use_scale_shift_norm,
        classifier_resblock_updown,
        classifier_pool,
    )
    diffusion = create_gaussian_diffusion(
        steps=diffusion_steps,
        learn_sigma=learn_sigma,
        noise_schedule=noise_schedule,
        use_kl=use_kl,
        predict_xstart=predict_xstart,
        rescale_timesteps=rescale_timesteps,
        rescale_learned_sigmas=rescale_learned_sigmas,
        timestep_respacing=timestep_respacing,
    )
    return classifier, diffusion


def create_classifier(
    image_size,
    classifier_use_fp16,
    classifier_width,
    classifier_depth,
    classifier_attention_resolutions,
    classifier_use_scale_shift_norm,
    classifier_resblock_updown,
    classifier_pool,
):
    if image_size == 512:
        channel_mult = (0.5, 1, 1, 2, 2, 4, 4)
    elif image_size == 256:
        channel_mult = (1, 1, 2, 2, 4, 4)
    elif image_size == 128:
        channel_mult = (1, 1, 2, 3, 4)
    elif image_size == 64:
        channel_mult = (1, 2, 3, 4)
    else:
        raise ValueError(f"unsupported image size: {image_size}")

    attention_ds = []
    for res in classifier_attention_resolutions.split(","):
        attention_ds.append(image_size // int(res))

    return EncoderUNetModel(
        image_size=image_size,
        in_channels=3,
        model_channels=classifier_width,
        out_channels=1000,
        num_res_blocks=classifier_depth,
        attention_resolutions=tuple(attention_ds),
        channel_mult=channel_mult,
        use_fp16=classifier_use_fp16,
        num_head_channels=64,
        use_scale_shift_norm=classifier_use_scale_shift_norm,
        resblock_updown=classifier_resblock_updown,
        pool=classifier_pool,
    )


def sr_model_and_diffusion_defaults():
    res = model_and_diffusion_defaults()
    res["large_size"] = 256
    res["small_size"] = 64
    arg_names = inspect.getfullargspec(sr_create_model_and_diffusion)[0]
    for k in res.copy().keys():
        if k not in arg_names:
            del res[k]
    return res


def sr_create_model_and_diffusion(
    large_size,
    small_size,
    class_cond,
    learn_sigma,
    num_channels,
    num_res_blocks,
    num_heads,
    num_head_channels,
    num_heads_upsample,
    attention_resolutions,
    dropout,
    diffusion_steps,
    noise_schedule,
    timestep_respacing,
    use_kl,
    predict_xstart,
    rescale_timesteps,
    rescale_learned_sigmas,
    use_checkpoint,
    use_scale_shift_norm,
    resblock_updown,
    use_fp16,
):
    model = sr_create_model(
        large_size,
        small_size,
        num_channels,
        num_res_blocks,
        learn_sigma=learn_sigma,
        class_cond=class_cond,
        use_checkpoint=use_checkpoint,
        attention_resolutions=attention_resolutions,
        num_heads=num_heads,
        num_head_channels=num_head_channels,
        num_heads_upsample=num_heads_upsample,
        use_scale_shift_norm=use_scale_shift_norm,
        dropout=dropout,
        resblock_updown=resblock_updown,
        use_fp16=use_fp16,
    )
    diffusion = create_gaussian_diffusion(
        steps=diffusion_steps,
        learn_sigma=learn_sigma,
        noise_schedule=noise_schedule,
        use_kl=use_kl,
        predict_xstart=predict_xstart,
        rescale_timesteps=rescale_timesteps,
        rescale_learned_sigmas=rescale_learned_sigmas,
        timestep_respacing=timestep_respacing,
    )
    return model, diffusion


def sr_create_model(
    large_size,
    small_size,
    num_channels,
    num_res_blocks,
    learn_sigma,
    class_cond,
    use_checkpoint,
    attention_resolutions,
    num_heads,
    num_head_channels,
    num_heads_upsample,
    use_scale_shift_norm,
    dropout,
    resblock_updown,
    use_fp16,
):
    _ = small_size  # hack to prevent unused variable

    if large_size == 512:
        channel_mult = (1, 1, 2, 2, 4, 4)
    elif large_size == 256:
        channel_mult = (1, 1, 2, 2, 4, 4)
    elif large_size == 64:
        channel_mult = (1, 2, 3, 4)
    else:
        raise ValueError(f"unsupported large size: {large_size}")

    attention_ds = []
    for res in attention_resolutions.split(","):
        attention_ds.append(large_size // int(res))

    return SuperResModel(
        image_size=large_size,
        in_channels=3,
        model_channels=num_channels,
        out_channels=(3 if not learn_sigma else 6),
        num_res_blocks=num_res_blocks,
        attention_resolutions=tuple(attention_ds),
        dropout=dropout,
        channel_mult=channel_mult,
        num_classes=(NUM_CLASSES if class_cond else None),
        use_checkpoint=use_checkpoint,
        num_heads=num_heads,
        num_head_channels=num_head_channels,
        num_heads_upsample=num_heads_upsample,
        use_scale_shift_norm=use_scale_shift_norm,
        resblock_updown=resblock_updown,
        use_fp16=use_fp16,
    )


def create_gaussian_diffusion(
    *,
    steps=1000,
    learn_sigma=False,
    sigma_small=False,
    noise_schedule="linear",
    use_kl=False,
    predict_xstart=False,
    rescale_timesteps=False,
    rescale_learned_sigmas=False,
    timestep_respacing="",
    loss_variation=0,
):
    betas = gd.get_named_beta_schedule(noise_schedule, steps)
    if use_kl:
        loss_type = gd.LossType.RESCALED_KL
    elif rescale_learned_sigmas:
        loss_type = gd.LossType.RESCALED_MSE
    else:
        loss_type = gd.LossType.MSE
    if not timestep_respacing:
        timestep_respacing = [steps]
    return SpacedDiffusion(
        use_timesteps=space_timesteps(steps, timestep_respacing),
        betas=betas,
        model_mean_type=(
            gd.ModelMeanType.EPSILON if not predict_xstart else gd.ModelMeanType.START_X
        ),
        model_var_type=(
            (
                gd.ModelVarType.FIXED_LARGE
                if not sigma_small
                else gd.ModelVarType.FIXED_SMALL
            )
            if not learn_sigma
            else gd.ModelVarType.LEARNED_RANGE
        ),
        loss_type=loss_type,
        rescale_timesteps=rescale_timesteps,
        loss_variation=loss_variation,  # added by soumik
    )


def add_dict_to_argparser(parser, default_dict):
    for k, v in default_dict.items():
        v_type = type(v)
        if v is None:
            v_type = str
        elif isinstance(v, bool):
            v_type = str2bool
        parser.add_argument(f"--{k}", default=v, type=v_type)


def args_to_dict(args, keys):
    return {k: getattr(args, k) for k in keys}


def str2bool(v):
    """
    https://stackoverflow.com/questions/15008758/parsing-boolean-values-with-argparse
    """
    if isinstance(v, bool):
        return v
    if v.lower() in ("yes", "true", "t", "y", "1"):
        return True
    elif v.lower() in ("no", "false", "f", "n", "0"):
        return False
    else:
        raise argparse.ArgumentTypeError("boolean value expected")

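A sketch of how these argparse helpers compose (flag names come from the defaults dict; booleans round-trip through `str2bool`, so `--use_fp16 true` works on a command line):

import argparse
from guided_diffusion.script_util import (  # assumed install path
    add_dict_to_argparser, args_to_dict, model_and_diffusion_defaults,
)

parser = argparse.ArgumentParser()
defaults = model_and_diffusion_defaults()
add_dict_to_argparser(parser, defaults)
args = parser.parse_args(["--image_size", "128", "--use_fp16", "true"])
kwargs = args_to_dict(args, defaults.keys())
assert kwargs["image_size"] == 128 and kwargs["use_fp16"] is True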
+
#________________________________ tfg model ________________________________#
|
| 458 |
+
def tfg_model_and_diffusion_defaults():
|
| 459 |
+
res = model_and_diffusion_defaults()
|
| 460 |
+
arg_names = inspect.getfullargspec(tfg_create_model_and_diffusion)[0]
|
| 461 |
+
for k in res.copy().keys():
|
| 462 |
+
if k not in arg_names:
|
| 463 |
+
del res[k]
|
| 464 |
+
|
| 465 |
+
#tfg args
|
| 466 |
+
res["use_ref"]=False
|
| 467 |
+
res["nframes"]=1
|
| 468 |
+
res["nrefer"]=0
|
| 469 |
+
res["use_audio"]=False
|
| 470 |
+
res["audio_encoder_kwargs"]={}
|
| 471 |
+
res["audio_as_style"]=False
|
| 472 |
+
res["audio_as_style_encoder_mlp"]=False
|
| 473 |
+
return res
|
| 474 |
+
|
| 475 |
+
def tfg_create_model_and_diffusion(
|
| 476 |
+
image_size,
|
| 477 |
+
class_cond,
|
| 478 |
+
learn_sigma,
|
| 479 |
+
num_channels,
|
| 480 |
+
num_res_blocks,
|
| 481 |
+
num_heads,
|
| 482 |
+
num_head_channels,
|
| 483 |
+
num_heads_upsample,
|
| 484 |
+
attention_resolutions,
|
| 485 |
+
dropout,
|
| 486 |
+
diffusion_steps,
|
| 487 |
+
noise_schedule,
|
| 488 |
+
timestep_respacing,
|
| 489 |
+
use_kl,
|
| 490 |
+
predict_xstart,
|
| 491 |
+
rescale_timesteps,
|
| 492 |
+
rescale_learned_sigmas,
|
| 493 |
+
use_checkpoint,
|
| 494 |
+
use_scale_shift_norm,
|
| 495 |
+
resblock_updown,
|
| 496 |
+
use_fp16,
|
| 497 |
+
use_ref,
|
| 498 |
+
nframes,
|
| 499 |
+
nrefer,
|
| 500 |
+
use_audio,
|
| 501 |
+
audio_encoder_kwargs,
|
| 502 |
+
audio_as_style,
|
| 503 |
+
audio_as_style_encoder_mlp,
|
| 504 |
+
loss_variation,
|
| 505 |
+
):
|
| 506 |
+
model = tfg_create_model(
|
| 507 |
+
image_size,
|
| 508 |
+
num_channels,
|
| 509 |
+
num_res_blocks,
|
| 510 |
+
learn_sigma=learn_sigma,
|
| 511 |
+
class_cond=class_cond,
|
| 512 |
+
use_checkpoint=use_checkpoint,
|
| 513 |
+
attention_resolutions=attention_resolutions,
|
| 514 |
+
num_heads=num_heads,
|
| 515 |
+
num_head_channels=num_head_channels,
|
| 516 |
+
num_heads_upsample=num_heads_upsample,
|
| 517 |
+
use_scale_shift_norm=use_scale_shift_norm,
|
| 518 |
+
dropout=dropout,
|
| 519 |
+
resblock_updown=resblock_updown,
|
| 520 |
+
use_fp16=use_fp16,
|
| 521 |
+
use_ref=use_ref,
|
| 522 |
+
nframes=nframes,
|
| 523 |
+
nrefer=nrefer,
|
| 524 |
+
use_audio=use_audio,
|
| 525 |
+
audio_encoder_kwargs=audio_encoder_kwargs,
|
| 526 |
+
audio_as_style=audio_as_style,
|
| 527 |
+
audio_as_style_encoder_mlp=audio_as_style_encoder_mlp,
|
| 528 |
+
)
|
| 529 |
+
|
| 530 |
+
diffusion = create_gaussian_diffusion(
|
| 531 |
+
steps=diffusion_steps,
|
| 532 |
+
learn_sigma=learn_sigma,
|
| 533 |
+
noise_schedule=noise_schedule,
|
| 534 |
+
use_kl=use_kl,
|
| 535 |
+
predict_xstart=predict_xstart,
|
| 536 |
+
rescale_timesteps=rescale_timesteps,
|
| 537 |
+
rescale_learned_sigmas=rescale_learned_sigmas,
|
| 538 |
+
timestep_respacing=timestep_respacing,
|
| 539 |
+
loss_variation=loss_variation,
|
| 540 |
+
)
|
| 541 |
+
return model, diffusion
|
| 542 |
+
|
| 543 |
+
def tfg_create_model(
|
| 544 |
+
image_size,
|
| 545 |
+
num_channels,
|
| 546 |
+
num_res_blocks,
|
| 547 |
+
learn_sigma,
|
| 548 |
+
class_cond,
|
| 549 |
+
use_checkpoint,
|
| 550 |
+
attention_resolutions,
|
| 551 |
+
num_heads,
|
| 552 |
+
num_head_channels,
|
| 553 |
+
num_heads_upsample,
|
| 554 |
+
use_scale_shift_norm,
|
| 555 |
+
dropout,
|
| 556 |
+
resblock_updown,
|
| 557 |
+
use_fp16,
|
| 558 |
+
use_ref,
|
| 559 |
+
nframes,
|
| 560 |
+
nrefer,
|
| 561 |
+
use_audio,
|
| 562 |
+
audio_encoder_kwargs,
|
| 563 |
+
audio_as_style,
|
| 564 |
+
audio_as_style_encoder_mlp,
|
| 565 |
+
):
|
| 566 |
+
|
| 567 |
+
if image_size == 512:
|
| 568 |
+
channel_mult = (1, 1, 2, 2, 4, 4)
|
| 569 |
+
elif image_size == 256:
|
| 570 |
+
channel_mult = (1, 1, 2, 3, 4, 4)
|
| 571 |
+
elif image_size == 128:
|
| 572 |
+
channel_mult = (1, 1, 2, 3, 4)
|
| 573 |
+
elif image_size == 64:
|
| 574 |
+
channel_mult = (1, 2, 3, 4)
|
| 575 |
+
else:
|
| 576 |
+
raise ValueError(f"unsupported large size: {image_size}")
|
| 577 |
+
|
| 578 |
+
attention_ds = []
|
| 579 |
+
if "-1" not in attention_resolutions: # -1 = no attention
|
| 580 |
+
for res in attention_resolutions.split(","):
|
| 581 |
+
attention_ds.append(image_size // int(res))
|
| 582 |
+
|
| 583 |
+
return TFGModel(
|
| 584 |
+
image_size=image_size,
|
| 585 |
+
in_channels=3,
|
| 586 |
+
model_channels=num_channels,
|
| 587 |
+
out_channels=(3 if not learn_sigma else 6),
|
| 588 |
+
num_res_blocks=num_res_blocks,
|
| 589 |
+
attention_resolutions=tuple(attention_ds),
|
| 590 |
+
dropout=dropout,
|
| 591 |
+
channel_mult=channel_mult,
|
| 592 |
+
num_classes=(NUM_CLASSES if class_cond else None),
|
| 593 |
+
use_checkpoint=use_checkpoint,
|
| 594 |
+
use_fp16=use_fp16,
|
| 595 |
+
num_heads=num_heads,
|
| 596 |
+
num_head_channels=num_head_channels,
|
| 597 |
+
num_heads_upsample=num_heads_upsample,
|
| 598 |
+
use_scale_shift_norm=use_scale_shift_norm,
|
| 599 |
+
resblock_updown=resblock_updown,
|
| 600 |
+
use_ref=use_ref,
|
| 601 |
+
nframes=nframes,
|
| 602 |
+
nrefer=nrefer,
|
| 603 |
+
use_audio=use_audio,
|
| 604 |
+
audio_encoder_kwargs=audio_encoder_kwargs,
|
| 605 |
+
audio_as_style=audio_as_style,
|
| 606 |
+
audio_as_style_encoder_mlp=audio_as_style_encoder_mlp
|
| 607 |
+
)
|
| 608 |
+
|
| 609 |
+
|
| 610 |
+
|
| 611 |
+
|
| 612 |
+
|
| 613 |
+
|
| 614 |
+
|
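A minimal construction sketch. The settings here are only the defaults above; real runs come from scripts/inference.sh, which supplies the configuration matching the shipped checkpoint, so treat this as illustrative rather than a working recipe:

from guided_diffusion.script_util import (  # assumed install path
    tfg_model_and_diffusion_defaults, tfg_create_model_and_diffusion,
)

opts = tfg_model_and_diffusion_defaults()
model, diffusion = tfg_create_model_and_diffusion(**opts)  # plain TFG UNet, no audio/ref conditioning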
guided-diffusion/guided_diffusion/tfg_data_util.py
ADDED
@@ -0,0 +1,75 @@
import torch

def normalise2(tensor):
    '''[0,1] -> [-1,1]'''
    return (tensor * 2 - 1.).clamp(-1, 1)

def tfg_data(dataloader, face_hide_percentage, use_ref, use_audio):  # , sampling_use_gt_for_ref=False, noise=None
    def inf_gen(generator):
        while True:
            yield from generator
    data = inf_gen(dataloader)
    for batch in data:
        img_batch, model_kwargs = tfg_process_batch(batch, face_hide_percentage, use_ref, use_audio)
        yield img_batch, model_kwargs


def tfg_process_batch(batch, face_hide_percentage, use_ref=False, use_audio=False, sampling_use_gt_for_ref=False, noise=None):
    model_kwargs = {}
    B, F, C, H, W = batch["image"].shape
    img_batch = normalise2(batch["image"].reshape(B * F, C, H, W).contiguous())
    model_kwargs = tfg_add_cond_inputs(img_batch, model_kwargs, face_hide_percentage, noise)
    if use_ref:
        model_kwargs = tfg_add_reference(batch, model_kwargs, sampling_use_gt_for_ref)
    if use_audio:
        model_kwargs = tfg_add_audio(batch, model_kwargs)
    return img_batch, model_kwargs

def tfg_add_reference(batch, model_kwargs, sampling_use_gt_for_ref=False):
    # assuming nrefer = 1
    # [B, nframes, C, H, W] -> [B*nframes, C, H, W]
    if sampling_use_gt_for_ref:
        B, F, C, H, W = batch["image"].shape
        img_batch = normalise2(batch["image"].reshape(B * F, C, H, W).contiguous())
        model_kwargs["ref_img"] = img_batch
    else:
        _, _, C, H, W = batch["ref_img"].shape
        ref_img = normalise2(batch["ref_img"].reshape(-1, C, H, W).contiguous())
        model_kwargs["ref_img"] = ref_img
    return model_kwargs

def tfg_add_audio(batch, model_kwargs):
    # unet needs [BF, h, w] as input
    B, F, _, h, w = batch["indiv_mels"].shape
    indiv_mels = batch["indiv_mels"]  # [B, F, 1, h, w]
    indiv_mels = indiv_mels.squeeze(dim=2).reshape(B * F, h, w)
    model_kwargs["indiv_mels"] = indiv_mels
    # syncloss needs [B, 1, 80, 16] as input
    if "mel" in batch:
        mel = batch["mel"]  # [B, 1, h, w]
        model_kwargs["mel"] = mel
    return model_kwargs

def tfg_add_cond_inputs(img_batch, model_kwargs, face_hide_percentage, noise=None):
    B, C, H, W = img_batch.shape
    mask = torch.zeros(B, 1, H, W)
    mask_start_idx = int(H * (1 - face_hide_percentage))
    mask[:, :, mask_start_idx:, :] = 1.
    if noise is None:
        noise = torch.randn_like(img_batch)
    assert noise.shape == img_batch.shape, "Noise shape != Image shape"
    cond_img = img_batch * (1. - mask) + mask * noise

    model_kwargs["cond_img"] = cond_img
    model_kwargs["mask"] = mask
    return model_kwargs


def get_n_params(model):
    pp = 0
    for p in list(model.parameters()):
        nn = 1
        for s in list(p.size()):
            nn = nn * s
        pp += nn
    return pp
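A concrete sketch of the mask built by `tfg_add_cond_inputs`: with `face_hide_percentage=0.5`, the bottom half of each frame (the mouth region) is replaced by noise for the model to inpaint. The random input here is a hypothetical stand-in for real normalised frames:

import torch
from guided_diffusion.tfg_data_util import tfg_add_cond_inputs  # assumed install path

img_batch = torch.rand(2, 3, 128, 128) * 2 - 1  # pretend normalise2 output
kwargs = tfg_add_cond_inputs(img_batch, {}, face_hide_percentage=0.5)
assert kwargs["mask"][:, :, :64, :].sum() == 0    # top half kept from the input
assert (kwargs["mask"][:, :, 64:, :] == 1).all()  # bottom half noised out in cond_img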
guided-diffusion/guided_diffusion/unet.py
ADDED
@@ -0,0 +1,1275 @@
from abc import abstractmethod

import math

import numpy as np
import torch as th
import torch.nn as nn
import torch.nn.functional as F

from .fp16_util import convert_module_to_f16, convert_module_to_f32
from .nn import (
    checkpoint,
    conv_nd,
    linear,
    avg_pool_nd,
    zero_module,
    normalization,
    timestep_embedding,
)


class AttentionPool2d(nn.Module):
    """
    Adapted from CLIP: https://github.com/openai/CLIP/blob/main/clip/model.py
    """

    def __init__(
        self,
        spacial_dim: int,
        embed_dim: int,
        num_heads_channels: int,
        output_dim: int = None,
    ):
        super().__init__()
        self.positional_embedding = nn.Parameter(
            th.randn(embed_dim, spacial_dim ** 2 + 1) / embed_dim ** 0.5
        )
        self.qkv_proj = conv_nd(1, embed_dim, 3 * embed_dim, 1)
        self.c_proj = conv_nd(1, embed_dim, output_dim or embed_dim, 1)
        self.num_heads = embed_dim // num_heads_channels
        self.attention = QKVAttention(self.num_heads)

    def forward(self, x):
        b, c, *_spatial = x.shape
        x = x.reshape(b, c, -1)  # NC(HW)
        x = th.cat([x.mean(dim=-1, keepdim=True), x], dim=-1)  # NC(HW+1)
        x = x + self.positional_embedding[None, :, :].to(x.dtype)  # NC(HW+1)
        x = self.qkv_proj(x)
        x = self.attention(x)
        x = self.c_proj(x)
        return x[:, :, 0]


class TimestepBlock(nn.Module):
    """
    Any module where forward() takes timestep embeddings as a second argument.
    """

    @abstractmethod
    def forward(self, x, emb):
        """
        Apply the module to `x` given `emb` timestep embeddings.
        """


class TimestepEmbedSequential(nn.Sequential, TimestepBlock):
    """
    A sequential module that passes timestep embeddings to the children that
    support it as an extra input.
    """

    def forward(self, x, emb):
        for layer in self:
            if isinstance(layer, TimestepBlock):
                x = layer(x, emb)
            else:
                x = layer(x)
        return x

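A toy illustration (the `AddEmb` module is hypothetical) of the dispatch rule: `TimestepEmbedSequential` hands `emb` only to `TimestepBlock` children, so plain layers can be mixed in freely.

import torch as th
import torch.nn as nn
# uses TimestepBlock / TimestepEmbedSequential as defined above

class AddEmb(TimestepBlock):
    def forward(self, x, emb):
        return x + emb  # emb must broadcast against x

seq = TimestepEmbedSequential(nn.Identity(), AddEmb(), nn.ReLU())
x, emb = th.randn(2, 4), th.randn(2, 4)
assert th.equal(seq(x, emb), th.relu(x + emb))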
| 81 |
+
class Upsample(nn.Module):
|
| 82 |
+
"""
|
| 83 |
+
An upsampling layer with an optional convolution.
|
| 84 |
+
|
| 85 |
+
:param channels: channels in the inputs and outputs.
|
| 86 |
+
:param use_conv: a bool determining if a convolution is applied.
|
| 87 |
+
:param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
|
| 88 |
+
upsampling occurs in the inner-two dimensions.
|
| 89 |
+
"""
|
| 90 |
+
|
| 91 |
+
def __init__(self, channels, use_conv, dims=2, out_channels=None):
|
| 92 |
+
super().__init__()
|
| 93 |
+
self.channels = channels
|
| 94 |
+
self.out_channels = out_channels or channels
|
| 95 |
+
self.use_conv = use_conv
|
| 96 |
+
self.dims = dims
|
| 97 |
+
if use_conv:
|
| 98 |
+
self.conv = conv_nd(dims, self.channels, self.out_channels, 3, padding=1)
|
| 99 |
+
|
| 100 |
+
def forward(self, x):
|
| 101 |
+
assert x.shape[1] == self.channels
|
| 102 |
+
if self.dims == 3:
|
| 103 |
+
x = F.interpolate(
|
| 104 |
+
x, (x.shape[2], x.shape[3] * 2, x.shape[4] * 2), mode="nearest"
|
| 105 |
+
)
|
| 106 |
+
else:
|
| 107 |
+
x = F.interpolate(x, scale_factor=2, mode="nearest")
|
| 108 |
+
if self.use_conv:
|
| 109 |
+
x = self.conv(x)
|
| 110 |
+
return x
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
class Downsample(nn.Module):
|
| 114 |
+
"""
|
| 115 |
+
A downsampling layer with an optional convolution.
|
| 116 |
+
|
| 117 |
+
:param channels: channels in the inputs and outputs.
|
| 118 |
+
:param use_conv: a bool determining if a convolution is applied.
|
| 119 |
+
:param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
|
| 120 |
+
downsampling occurs in the inner-two dimensions.
|
| 121 |
+
"""
|
| 122 |
+
|
| 123 |
+
def __init__(self, channels, use_conv, dims=2, out_channels=None, stride=None):
|
| 124 |
+
super().__init__()
|
| 125 |
+
self.channels = channels
|
| 126 |
+
self.out_channels = out_channels or channels
|
| 127 |
+
self.use_conv = use_conv
|
| 128 |
+
self.dims = dims
|
| 129 |
+
if stride is None:
|
| 130 |
+
stride = 2 if dims != 3 else (1, 2, 2)
|
| 131 |
+
|
| 132 |
+
if use_conv:
|
| 133 |
+
self.op = conv_nd(
|
| 134 |
+
dims, self.channels, self.out_channels, 3, stride=stride, padding=1
|
| 135 |
+
)
|
| 136 |
+
else:
|
| 137 |
+
assert self.channels == self.out_channels
|
| 138 |
+
self.op = avg_pool_nd(dims, kernel_size=stride, stride=stride)
|
| 139 |
+
|
| 140 |
+
def forward(self, x):
|
| 141 |
+
assert x.shape[1] == self.channels
|
| 142 |
+
return self.op(x)
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
class ResBlock(TimestepBlock):
|
| 146 |
+
"""
|
| 147 |
+
A residual block that can optionally change the number of channels.
|
| 148 |
+
|
| 149 |
+
:param channels: the number of input channels.
|
| 150 |
+
:param emb_channels: the number of timestep embedding channels.
|
| 151 |
+
:param dropout: the rate of dropout.
|
| 152 |
+
:param out_channels: if specified, the number of out channels.
|
| 153 |
+
:param use_conv: if True and out_channels is specified, use a spatial
|
| 154 |
+
convolution instead of a smaller 1x1 convolution to change the
|
| 155 |
+
channels in the skip connection.
|
| 156 |
+
:param dims: determines if the signal is 1D, 2D, or 3D.
|
| 157 |
+
:param use_checkpoint: if True, use gradient checkpointing on this module.
|
| 158 |
+
:param up: if True, use this block for upsampling.
|
| 159 |
+
:param down: if True, use this block for downsampling.
|
| 160 |
+
"""
|
| 161 |
+
|
| 162 |
+
def __init__(
|
| 163 |
+
self,
|
| 164 |
+
channels,
|
| 165 |
+
emb_channels,
|
| 166 |
+
dropout,
|
| 167 |
+
out_channels=None,
|
| 168 |
+
use_conv=False,
|
| 169 |
+
use_scale_shift_norm=False,
|
| 170 |
+
dims=2,
|
| 171 |
+
use_checkpoint=False,
|
| 172 |
+
up=False,
|
| 173 |
+
down=False,
|
| 174 |
+
down_stride=None,
|
| 175 |
+
):
|
| 176 |
+
super().__init__()
|
| 177 |
+
self.channels = channels
|
| 178 |
+
self.emb_channels = emb_channels
|
| 179 |
+
self.dropout = dropout
|
| 180 |
+
self.out_channels = out_channels or channels
|
| 181 |
+
self.use_conv = use_conv
|
| 182 |
+
self.use_checkpoint = use_checkpoint
|
| 183 |
+
self.use_scale_shift_norm = use_scale_shift_norm
|
| 184 |
+
self.dims = dims
|
| 185 |
+
|
| 186 |
+
self.in_layers = nn.Sequential(
|
| 187 |
+
normalization(channels),
|
| 188 |
+
nn.SiLU(),
|
| 189 |
+
conv_nd(dims, channels, self.out_channels, 3, padding=1),
|
| 190 |
+
)
|
| 191 |
+
|
| 192 |
+
self.updown = up or down
|
| 193 |
+
|
| 194 |
+
if up:
|
| 195 |
+
self.h_upd = Upsample(channels, False, dims)
|
| 196 |
+
self.x_upd = Upsample(channels, False, dims)
|
| 197 |
+
elif down:
|
| 198 |
+
self.h_upd = Downsample(channels, False, dims, stride=down_stride)
|
| 199 |
+
self.x_upd = Downsample(channels, False, dims, stride=down_stride)
|
| 200 |
+
else:
|
| 201 |
+
self.h_upd = self.x_upd = nn.Identity()
|
| 202 |
+
|
| 203 |
+
self.emb_layers = nn.Sequential(
|
| 204 |
+
nn.SiLU(),
|
| 205 |
+
linear(
|
| 206 |
+
emb_channels,
|
| 207 |
+
2 * self.out_channels if use_scale_shift_norm else self.out_channels,
|
| 208 |
+
),
|
| 209 |
+
)
|
| 210 |
+
self.out_layers = nn.Sequential(
|
| 211 |
+
normalization(self.out_channels),
|
| 212 |
+
nn.SiLU(),
|
| 213 |
+
nn.Dropout(p=dropout),
|
| 214 |
+
zero_module(
|
| 215 |
+
conv_nd(dims, self.out_channels, self.out_channels, 3, padding=1)
|
| 216 |
+
),
|
| 217 |
+
)
|
| 218 |
+
|
| 219 |
+
if self.out_channels == channels:
|
| 220 |
+
self.skip_connection = nn.Identity()
|
| 221 |
+
elif use_conv:
|
| 222 |
+
self.skip_connection = conv_nd(
|
| 223 |
+
dims, channels, self.out_channels, 3, padding=1
|
| 224 |
+
)
|
| 225 |
+
else:
|
| 226 |
+
self.skip_connection = conv_nd(dims, channels, self.out_channels, 1)
|
| 227 |
+
|
| 228 |
+
def forward(self, x, emb):
|
| 229 |
+
"""
|
| 230 |
+
Apply the block to a Tensor, conditioned on a timestep embedding.
|
| 231 |
+
|
| 232 |
+
:param x: an [N x C x ...] Tensor of features.
|
| 233 |
+
:param emb: an [N x emb_channels] Tensor of timestep embeddings.
|
| 234 |
+
:return: an [N x C x ...] Tensor of outputs.
|
| 235 |
+
"""
|
| 236 |
+
return checkpoint(
|
| 237 |
+
self._forward, (x, emb), self.parameters(), self.use_checkpoint
|
| 238 |
+
)
|
| 239 |
+
|
| 240 |
+
def _forward(self, x, emb):
|
| 241 |
+
if self.updown:
|
| 242 |
+
in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1]
|
| 243 |
+
h = in_rest(x)
|
| 244 |
+
h = self.h_upd(h)
|
| 245 |
+
x = self.x_upd(x)
|
| 246 |
+
h = in_conv(h)
|
| 247 |
+
else:
|
| 248 |
+
h = self.in_layers(x)
|
| 249 |
+
emb_out = self.emb_layers(emb).type(h.dtype)
|
| 250 |
+
while len(emb_out.shape) < len(h.shape):
|
| 251 |
+
emb_out = emb_out[..., None]
|
| 252 |
+
if self.use_scale_shift_norm:
|
| 253 |
+
out_norm, out_rest = self.out_layers[0], self.out_layers[1:]
|
| 254 |
+
scale, shift = th.chunk(emb_out, 2, dim=1)
|
| 255 |
+
h = out_norm(h) * (1 + scale) + shift
|
| 256 |
+
h = out_rest(h)
|
| 257 |
+
else:
|
| 258 |
+
h = h + emb_out
|
| 259 |
+
h = self.out_layers(h)
|
| 260 |
+
return self.skip_connection(x) + h
|
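The use_scale_shift_norm branch in _forward above is FiLM-style conditioning: the embedding is projected to 2 * out_channels, then split into a scale and a shift that modulate the normalized features. Isolated as a minimal sketch, with illustrative shapes and normalization() as defined in this package's nn helpers:

import torch as th

h = th.randn(2, 64, 32, 32)                 # features after in_layers
emb_out = th.randn(2, 128, 1, 1)            # emb_layers output, broadcastable over H, W
scale, shift = th.chunk(emb_out, 2, dim=1)  # two [2, 64, 1, 1] halves
h = normalization(64)(h) * (1 + scale) + shift  # normalize, then FiLM-modulate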
| 261 |
+
|
| 262 |
+
|
| 263 |
+
class AttentionBlock(nn.Module):
|
| 264 |
+
"""
|
| 265 |
+
An attention block that allows spatial positions to attend to each other.
|
| 266 |
+
|
| 267 |
+
Originally ported from here, but adapted to the N-d case.
|
| 268 |
+
https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66.
|
| 269 |
+
"""
|
| 270 |
+
|
| 271 |
+
def __init__(
|
| 272 |
+
self,
|
| 273 |
+
channels,
|
| 274 |
+
num_heads=1,
|
| 275 |
+
num_head_channels=-1,
|
| 276 |
+
use_checkpoint=False,
|
| 277 |
+
use_new_attention_order=False,
|
| 278 |
+
):
|
| 279 |
+
super().__init__()
|
| 280 |
+
self.channels = channels
|
| 281 |
+
if num_head_channels == -1:
|
| 282 |
+
self.num_heads = num_heads
|
| 283 |
+
else:
|
| 284 |
+
assert (
|
| 285 |
+
channels % num_head_channels == 0
|
| 286 |
+
), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}"
|
| 287 |
+
self.num_heads = channels // num_head_channels
|
| 288 |
+
self.use_checkpoint = use_checkpoint
|
| 289 |
+
self.norm = normalization(channels)
|
| 290 |
+
self.qkv = conv_nd(1, channels, channels * 3, 1)
|
| 291 |
+
if use_new_attention_order:
|
| 292 |
+
# split qkv before split heads
|
| 293 |
+
self.attention = QKVAttention(self.num_heads)
|
| 294 |
+
else:
|
| 295 |
+
# split heads before split qkv
|
| 296 |
+
self.attention = QKVAttentionLegacy(self.num_heads)
|
| 297 |
+
|
| 298 |
+
self.proj_out = zero_module(conv_nd(1, channels, channels, 1))
|
| 299 |
+
|
| 300 |
+
def forward(self, x):
|
| 301 |
+
return checkpoint(self._forward, (x,), self.parameters(), True)
|
| 302 |
+
|
| 303 |
+
def _forward(self, x):
|
| 304 |
+
b, c, *spatial = x.shape
|
| 305 |
+
x = x.reshape(b, c, -1)
|
| 306 |
+
qkv = self.qkv(self.norm(x))
|
| 307 |
+
h = self.attention(qkv)
|
| 308 |
+
h = self.proj_out(h)
|
| 309 |
+
return (x + h).reshape(b, c, *spatial)
|
| 310 |
+
|
| 311 |
+
|
| 312 |
+
def count_flops_attn(model, _x, y):
|
| 313 |
+
"""
|
| 314 |
+
A counter for the `thop` package to count the operations in an
|
| 315 |
+
attention operation.
|
| 316 |
+
Meant to be used like:
|
| 317 |
+
macs, params = thop.profile(
|
| 318 |
+
model,
|
| 319 |
+
inputs=(inputs, timestamps),
|
| 320 |
+
custom_ops={QKVAttention: QKVAttention.count_flops},
|
| 321 |
+
)
|
| 322 |
+
"""
|
| 323 |
+
b, c, *spatial = y[0].shape
|
| 324 |
+
num_spatial = int(np.prod(spatial))
|
| 325 |
+
# We perform two matmuls with the same number of ops.
|
| 326 |
+
# The first computes the weight matrix, the second computes
|
| 327 |
+
# the combination of the value vectors.
|
| 328 |
+
matmul_ops = 2 * b * (num_spatial ** 2) * c
|
| 329 |
+
model.total_ops += th.DoubleTensor([matmul_ops])
|
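A hedged usage sketch of the docstring's recipe, assuming the third-party thop package is installed and `model` is one of the UNets defined below; both attention classes expose count_flops as the custom op.

import thop
import torch as th

inputs = th.randn(1, 3, 64, 64)
timestamps = th.tensor([10])
macs, params = thop.profile(
    model,
    inputs=(inputs, timestamps),
    custom_ops={
        QKVAttention: QKVAttention.count_flops,
        QKVAttentionLegacy: QKVAttentionLegacy.count_flops,
    },
)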
| 330 |
+
|
| 331 |
+
|
| 332 |
+
class QKVAttentionLegacy(nn.Module):
|
| 333 |
+
"""
|
| 334 |
+
A module which performs QKV attention. Matches legacy QKVAttention + input/output heads shaping
|
| 335 |
+
"""
|
| 336 |
+
|
| 337 |
+
def __init__(self, n_heads):
|
| 338 |
+
super().__init__()
|
| 339 |
+
self.n_heads = n_heads
|
| 340 |
+
|
| 341 |
+
def forward(self, qkv):
|
| 342 |
+
"""
|
| 343 |
+
Apply QKV attention.
|
| 344 |
+
|
| 345 |
+
:param qkv: an [N x (H * 3 * C) x T] tensor of Qs, Ks, and Vs.
|
| 346 |
+
:return: an [N x (H * C) x T] tensor after attention.
|
| 347 |
+
"""
|
| 348 |
+
bs, width, length = qkv.shape
|
| 349 |
+
assert width % (3 * self.n_heads) == 0
|
| 350 |
+
ch = width // (3 * self.n_heads)
|
| 351 |
+
q, k, v = qkv.reshape(bs * self.n_heads, ch * 3, length).split(ch, dim=1)
|
| 352 |
+
scale = 1 / math.sqrt(math.sqrt(ch))
|
| 353 |
+
weight = th.einsum(
|
| 354 |
+
"bct,bcs->bts", q * scale, k * scale
|
| 355 |
+
) # More stable with f16 than dividing afterwards
|
| 356 |
+
weight = th.softmax(weight.float(), dim=-1).type(weight.dtype)
|
| 357 |
+
a = th.einsum("bts,bcs->bct", weight, v)
|
| 358 |
+
return a.reshape(bs, -1, length)
|
| 359 |
+
|
| 360 |
+
@staticmethod
|
| 361 |
+
def count_flops(model, _x, y):
|
| 362 |
+
return count_flops_attn(model, _x, y)
|
| 363 |
+
|
| 364 |
+
|
| 365 |
+
class QKVAttention(nn.Module):
|
| 366 |
+
"""
|
| 367 |
+
A module which performs QKV attention and splits in a different order.
|
| 368 |
+
"""
|
| 369 |
+
|
| 370 |
+
def __init__(self, n_heads):
|
| 371 |
+
super().__init__()
|
| 372 |
+
self.n_heads = n_heads
|
| 373 |
+
|
| 374 |
+
def forward(self, qkv):
|
| 375 |
+
"""
|
| 376 |
+
Apply QKV attention.
|
| 377 |
+
|
| 378 |
+
:param qkv: an [N x (3 * H * C) x T] tensor of Qs, Ks, and Vs.
|
| 379 |
+
:return: an [N x (H * C) x T] tensor after attention.
|
| 380 |
+
"""
|
| 381 |
+
bs, width, length = qkv.shape
|
| 382 |
+
assert width % (3 * self.n_heads) == 0
|
| 383 |
+
ch = width // (3 * self.n_heads)
|
| 384 |
+
q, k, v = qkv.chunk(3, dim=1)
|
| 385 |
+
scale = 1 / math.sqrt(math.sqrt(ch))
|
| 386 |
+
weight = th.einsum(
|
| 387 |
+
"bct,bcs->bts",
|
| 388 |
+
(q * scale).view(bs * self.n_heads, ch, length),
|
| 389 |
+
(k * scale).view(bs * self.n_heads, ch, length),
|
| 390 |
+
) # More stable with f16 than dividing afterwards
|
| 391 |
+
weight = th.softmax(weight.float(), dim=-1).type(weight.dtype)
|
| 392 |
+
a = th.einsum("bts,bcs->bct", weight, v.reshape(bs * self.n_heads, ch, length))
|
| 393 |
+
return a.reshape(bs, -1, length)
|
| 394 |
+
|
| 395 |
+
@staticmethod
|
| 396 |
+
def count_flops(model, _x, y):
|
| 397 |
+
return count_flops_attn(model, _x, y)
|
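The two attention modules assume different channel layouts for the fused qkv tensor: the legacy class expects a heads-major [H * (3C)] layout, the new one a q/k/v-major [3 * (H * C)] layout. A small sketch of the two slicings; for the same raw tensor they select different channels, which is why checkpoints must match use_new_attention_order.

import torch as th

bs, n_heads, ch, length = 2, 4, 8, 10
qkv = th.randn(bs, 3 * n_heads * ch, length)

# legacy order: fold heads into the batch, then split q/k/v within each head
q1, k1, v1 = qkv.reshape(bs * n_heads, ch * 3, length).split(ch, dim=1)

# new order: split q/k/v on the full width first, then view per head
q2, k2, v2 = [t.reshape(bs * n_heads, ch, length) for t in qkv.chunk(3, dim=1)]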
| 398 |
+
|
| 399 |
+
|
| 400 |
+
class UNetModel(nn.Module):
|
| 401 |
+
"""
|
| 402 |
+
The full UNet model with attention and timestep embedding.
|
| 403 |
+
|
| 404 |
+
:param in_channels: channels in the input Tensor.
|
| 405 |
+
:param model_channels: base channel count for the model.
|
| 406 |
+
:param out_channels: channels in the output Tensor.
|
| 407 |
+
:param num_res_blocks: number of residual blocks per downsample.
|
| 408 |
+
:param attention_resolutions: a collection of downsample rates at which
|
| 409 |
+
attention will take place. May be a set, list, or tuple.
|
| 410 |
+
For example, if this contains 4, then at 4x downsampling, attention
|
| 411 |
+
will be used.
|
| 412 |
+
:param dropout: the dropout probability.
|
| 413 |
+
:param channel_mult: channel multiplier for each level of the UNet.
|
| 414 |
+
:param conv_resample: if True, use learned convolutions for upsampling and
|
| 415 |
+
downsampling.
|
| 416 |
+
:param dims: determines if the signal is 1D, 2D, or 3D.
|
| 417 |
+
:param num_classes: if specified (as an int), then this model will be
|
| 418 |
+
class-conditional with `num_classes` classes.
|
| 419 |
+
:param use_checkpoint: use gradient checkpointing to reduce memory usage.
|
| 420 |
+
:param num_heads: the number of attention heads in each attention layer.
|
| 421 |
+
:param num_head_channels: if specified, ignore num_heads and instead use
|
| 422 |
+
a fixed channel width per attention head.
|
| 423 |
+
:param num_heads_upsample: works with num_heads to set a different number
|
| 424 |
+
of heads for upsampling. Deprecated.
|
| 425 |
+
:param use_scale_shift_norm: use a FiLM-like conditioning mechanism.
|
| 426 |
+
:param resblock_updown: use residual blocks for up/downsampling.
|
| 427 |
+
:param use_new_attention_order: use a different attention pattern for potentially
|
| 428 |
+
increased efficiency.
|
| 429 |
+
"""
|
| 430 |
+
|
| 431 |
+
def __init__(
|
| 432 |
+
self,
|
| 433 |
+
image_size,
|
| 434 |
+
in_channels,
|
| 435 |
+
model_channels,
|
| 436 |
+
out_channels,
|
| 437 |
+
num_res_blocks,
|
| 438 |
+
attention_resolutions,
|
| 439 |
+
dropout=0,
|
| 440 |
+
channel_mult=(1, 2, 4, 8),
|
| 441 |
+
conv_resample=True,
|
| 442 |
+
dims=2,
|
| 443 |
+
num_classes=None,
|
| 444 |
+
use_checkpoint=False,
|
| 445 |
+
use_fp16=False,
|
| 446 |
+
num_heads=1,
|
| 447 |
+
num_head_channels=-1,
|
| 448 |
+
num_heads_upsample=-1,
|
| 449 |
+
use_scale_shift_norm=False,
|
| 450 |
+
resblock_updown=False,
|
| 451 |
+
use_new_attention_order=False,
|
| 452 |
+
):
|
| 453 |
+
super().__init__()
|
| 454 |
+
|
| 455 |
+
if num_heads_upsample == -1:
|
| 456 |
+
num_heads_upsample = num_heads
|
| 457 |
+
|
| 458 |
+
self.image_size = image_size
|
| 459 |
+
self.in_channels = in_channels
|
| 460 |
+
self.model_channels = model_channels
|
| 461 |
+
self.out_channels = out_channels
|
| 462 |
+
self.num_res_blocks = num_res_blocks
|
| 463 |
+
self.attention_resolutions = attention_resolutions
|
| 464 |
+
self.dropout = dropout
|
| 465 |
+
self.channel_mult = channel_mult
|
| 466 |
+
self.conv_resample = conv_resample
|
| 467 |
+
self.dims = dims
|
| 468 |
+
self.num_classes = num_classes
|
| 469 |
+
self.use_checkpoint = use_checkpoint
|
| 470 |
+
self.use_fp16 = use_fp16
|
| 471 |
+
self.dtype = th.float16 if use_fp16 else th.float32
|
| 472 |
+
self.num_heads = num_heads
|
| 473 |
+
self.num_head_channels = num_head_channels
|
| 474 |
+
self.num_heads_upsample = num_heads_upsample
|
| 475 |
+
self.use_scale_shift_norm = use_scale_shift_norm
|
| 476 |
+
self.resblock_updown = resblock_updown
|
| 477 |
+
|
| 478 |
+
time_embed_dim = model_channels * 4
|
| 479 |
+
self.time_embed_dim = time_embed_dim
|
| 480 |
+
self.time_embed = nn.Sequential(
|
| 481 |
+
linear(model_channels, time_embed_dim),
|
| 482 |
+
nn.SiLU(),
|
| 483 |
+
linear(time_embed_dim, time_embed_dim),
|
| 484 |
+
)
|
| 485 |
+
|
| 486 |
+
if self.num_classes is not None:
|
| 487 |
+
self.label_emb = nn.Embedding(num_classes, time_embed_dim)
|
| 488 |
+
|
| 489 |
+
ch = input_ch = int(channel_mult[0] * model_channels)
|
| 490 |
+
self.input_blocks = nn.ModuleList(
|
| 491 |
+
[TimestepEmbedSequential(conv_nd(dims, in_channels, ch, 3, padding=1))]
|
| 492 |
+
)
|
| 493 |
+
self._feature_size = ch
|
| 494 |
+
input_block_chans = [ch]
|
| 495 |
+
ds = 1
|
| 496 |
+
for level, mult in enumerate(channel_mult):
|
| 497 |
+
for _ in range(num_res_blocks):
|
| 498 |
+
layers = [
|
| 499 |
+
ResBlock(
|
| 500 |
+
ch,
|
| 501 |
+
time_embed_dim,
|
| 502 |
+
dropout,
|
| 503 |
+
out_channels=int(mult * model_channels),
|
| 504 |
+
dims=dims,
|
| 505 |
+
use_checkpoint=use_checkpoint,
|
| 506 |
+
use_scale_shift_norm=use_scale_shift_norm,
|
| 507 |
+
)
|
| 508 |
+
]
|
| 509 |
+
ch = int(mult * model_channels)
|
| 510 |
+
if ds in attention_resolutions:
|
| 511 |
+
layers.append(
|
| 512 |
+
AttentionBlock(
|
| 513 |
+
ch,
|
| 514 |
+
use_checkpoint=use_checkpoint,
|
| 515 |
+
num_heads=num_heads,
|
| 516 |
+
num_head_channels=num_head_channels,
|
| 517 |
+
use_new_attention_order=use_new_attention_order,
|
| 518 |
+
)
|
| 519 |
+
)
|
| 520 |
+
self.input_blocks.append(TimestepEmbedSequential(*layers))
|
| 521 |
+
self._feature_size += ch
|
| 522 |
+
input_block_chans.append(ch)
|
| 523 |
+
if level != len(channel_mult) - 1:
|
| 524 |
+
out_ch = ch
|
| 525 |
+
self.input_blocks.append(
|
| 526 |
+
TimestepEmbedSequential(
|
| 527 |
+
ResBlock(
|
| 528 |
+
ch,
|
| 529 |
+
time_embed_dim,
|
| 530 |
+
dropout,
|
| 531 |
+
out_channels=out_ch,
|
| 532 |
+
dims=dims,
|
| 533 |
+
use_checkpoint=use_checkpoint,
|
| 534 |
+
use_scale_shift_norm=use_scale_shift_norm,
|
| 535 |
+
down=True,
|
| 536 |
+
)
|
| 537 |
+
if resblock_updown
|
| 538 |
+
else Downsample(
|
| 539 |
+
ch, conv_resample, dims=dims, out_channels=out_ch
|
| 540 |
+
)
|
| 541 |
+
)
|
| 542 |
+
)
|
| 543 |
+
ch = out_ch
|
| 544 |
+
input_block_chans.append(ch)
|
| 545 |
+
ds *= 2
|
| 546 |
+
self._feature_size += ch
|
| 547 |
+
|
| 548 |
+
self.middle_block = TimestepEmbedSequential(
|
| 549 |
+
ResBlock(
|
| 550 |
+
ch,
|
| 551 |
+
time_embed_dim,
|
| 552 |
+
dropout,
|
| 553 |
+
dims=dims,
|
| 554 |
+
use_checkpoint=use_checkpoint,
|
| 555 |
+
use_scale_shift_norm=use_scale_shift_norm,
|
| 556 |
+
),
|
| 557 |
+
AttentionBlock(
|
| 558 |
+
ch,
|
| 559 |
+
use_checkpoint=use_checkpoint,
|
| 560 |
+
num_heads=num_heads,
|
| 561 |
+
num_head_channels=num_head_channels,
|
| 562 |
+
use_new_attention_order=use_new_attention_order,
|
| 563 |
+
),
|
| 564 |
+
ResBlock(
|
| 565 |
+
ch,
|
| 566 |
+
time_embed_dim,
|
| 567 |
+
dropout,
|
| 568 |
+
dims=dims,
|
| 569 |
+
use_checkpoint=use_checkpoint,
|
| 570 |
+
use_scale_shift_norm=use_scale_shift_norm,
|
| 571 |
+
),
|
| 572 |
+
)
|
| 573 |
+
self._feature_size += ch
|
| 574 |
+
|
| 575 |
+
self.output_blocks = nn.ModuleList([])
|
| 576 |
+
for level, mult in list(enumerate(channel_mult))[::-1]:
|
| 577 |
+
for i in range(num_res_blocks + 1):
|
| 578 |
+
ich = input_block_chans.pop()
|
| 579 |
+
layers = [
|
| 580 |
+
ResBlock(
|
| 581 |
+
ch + ich,
|
| 582 |
+
time_embed_dim,
|
| 583 |
+
dropout,
|
| 584 |
+
out_channels=int(model_channels * mult),
|
| 585 |
+
dims=dims,
|
| 586 |
+
use_checkpoint=use_checkpoint,
|
| 587 |
+
use_scale_shift_norm=use_scale_shift_norm,
|
| 588 |
+
)
|
| 589 |
+
]
|
| 590 |
+
ch = int(model_channels * mult)
|
| 591 |
+
if ds in attention_resolutions:
|
| 592 |
+
layers.append(
|
| 593 |
+
AttentionBlock(
|
| 594 |
+
ch,
|
| 595 |
+
use_checkpoint=use_checkpoint,
|
| 596 |
+
num_heads=num_heads_upsample,
|
| 597 |
+
num_head_channels=num_head_channels,
|
| 598 |
+
use_new_attention_order=use_new_attention_order,
|
| 599 |
+
)
|
| 600 |
+
)
|
| 601 |
+
if level and i == num_res_blocks:
|
| 602 |
+
out_ch = ch
|
| 603 |
+
layers.append(
|
| 604 |
+
ResBlock(
|
| 605 |
+
ch,
|
| 606 |
+
time_embed_dim,
|
| 607 |
+
dropout,
|
| 608 |
+
out_channels=out_ch,
|
| 609 |
+
dims=dims,
|
| 610 |
+
use_checkpoint=use_checkpoint,
|
| 611 |
+
use_scale_shift_norm=use_scale_shift_norm,
|
| 612 |
+
up=True,
|
| 613 |
+
)
|
| 614 |
+
if resblock_updown
|
| 615 |
+
else Upsample(ch, conv_resample, dims=dims, out_channels=out_ch)
|
| 616 |
+
)
|
| 617 |
+
ds //= 2
|
| 618 |
+
self.output_blocks.append(TimestepEmbedSequential(*layers))
|
| 619 |
+
self._feature_size += ch
|
| 620 |
+
|
| 621 |
+
self.out = nn.Sequential(
|
| 622 |
+
normalization(ch),
|
| 623 |
+
nn.SiLU(),
|
| 624 |
+
zero_module(conv_nd(dims, input_ch, out_channels, 3, padding=1)),
|
| 625 |
+
)
|
| 626 |
+
|
| 627 |
+
def convert_to_fp16(self):
|
| 628 |
+
"""
|
| 629 |
+
Convert the torso of the model to float16.
|
| 630 |
+
"""
|
| 631 |
+
self.input_blocks.apply(convert_module_to_f16)
|
| 632 |
+
self.middle_block.apply(convert_module_to_f16)
|
| 633 |
+
self.output_blocks.apply(convert_module_to_f16)
|
| 634 |
+
|
| 635 |
+
def convert_to_fp32(self):
|
| 636 |
+
"""
|
| 637 |
+
Convert the torso of the model to float32.
|
| 638 |
+
"""
|
| 639 |
+
self.input_blocks.apply(convert_module_to_f32)
|
| 640 |
+
self.middle_block.apply(convert_module_to_f32)
|
| 641 |
+
self.output_blocks.apply(convert_module_to_f32)
|
| 642 |
+
|
| 643 |
+
def forward(self, x, timesteps, y=None):
|
| 644 |
+
"""
|
| 645 |
+
Apply the model to an input batch.
|
| 646 |
+
|
| 647 |
+
:param x: an [N x C x ...] Tensor of inputs.
|
| 648 |
+
:param timesteps: a 1-D batch of timesteps.
|
| 649 |
+
:param y: an [N] Tensor of labels, if class-conditional.
|
| 650 |
+
:return: an [N x C x ...] Tensor of outputs.
|
| 651 |
+
"""
|
| 652 |
+
assert (y is not None) == (
|
| 653 |
+
self.num_classes is not None
|
| 654 |
+
), "must specify y if and only if the model is class-conditional"
|
| 655 |
+
|
| 656 |
+
hs = []
|
| 657 |
+
emb = self.time_embed(timestep_embedding(timesteps, self.model_channels))
|
| 658 |
+
|
| 659 |
+
if self.num_classes is not None:
|
| 660 |
+
assert y.shape == (x.shape[0],)
|
| 661 |
+
emb = emb + self.label_emb(y)
|
| 662 |
+
|
| 663 |
+
h = x.type(self.dtype)
|
| 664 |
+
for module in self.input_blocks:
|
| 665 |
+
h = module(h, emb)
|
| 666 |
+
hs.append(h)
|
| 667 |
+
h = self.middle_block(h, emb)
|
| 668 |
+
for module in self.output_blocks:
|
| 669 |
+
h = th.cat([h, hs.pop()], dim=1)
|
| 670 |
+
h = module(h, emb)
|
| 671 |
+
h = h.type(x.dtype)
|
| 672 |
+
return self.out(h)
|
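A minimal construction sketch; these hyperparameters are illustrative, not the repo's trained configuration (the inference scripts below use num_channels 128, num_head_channels 64, attention at 32,16,8, and learn_sigma, which doubles out_channels).

import torch as th

net = UNetModel(
    image_size=64, in_channels=3, model_channels=128, out_channels=3,
    num_res_blocks=2, attention_resolutions=(2, 4),  # attend at 2x and 4x downsampling
    channel_mult=(1, 2, 4), num_head_channels=64,
)
x = th.randn(2, 3, 64, 64)
t = th.randint(0, 1000, (2,))
eps = net(x, t)  # [2, 3, 64, 64]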
| 673 |
+
|
| 674 |
+
|
| 675 |
+
class SuperResModel(UNetModel):
|
| 676 |
+
"""
|
| 677 |
+
A UNetModel that performs super-resolution.
|
| 678 |
+
|
| 679 |
+
Expects an extra kwarg `low_res` to condition on a low-resolution image.
|
| 680 |
+
"""
|
| 681 |
+
|
| 682 |
+
def __init__(self, image_size, in_channels, *args, **kwargs):
|
| 683 |
+
super().__init__(image_size, in_channels * 2, *args, **kwargs)
|
| 684 |
+
|
| 685 |
+
def forward(self, x, timesteps, low_res=None, **kwargs):
|
| 686 |
+
_, _, new_height, new_width = x.shape
|
| 687 |
+
upsampled = F.interpolate(low_res, (new_height, new_width), mode="bilinear")
|
| 688 |
+
x = th.cat([x, upsampled], dim=1)
|
| 689 |
+
return super().forward(x, timesteps, **kwargs)
|
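Usage sketch: the low-res conditioning image is bilinearly upsampled to the target resolution and concatenated on the channel axis, which is why the constructor doubles in_channels (shapes illustrative).

import torch as th

sr = SuperResModel(
    image_size=64, in_channels=3, model_channels=64, out_channels=3,
    num_res_blocks=1, attention_resolutions=(4,), channel_mult=(1, 2, 4),
)
x_t = th.randn(2, 3, 64, 64)        # noisy high-res sample
low_res = th.randn(2, 3, 16, 16)    # low-res conditioning image
out = sr(x_t, th.tensor([5, 7]), low_res=low_res)  # [2, 3, 64, 64]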
| 690 |
+
|
| 691 |
+
|
| 692 |
+
class EncoderUNetModel(nn.Module):
|
| 693 |
+
"""
|
| 694 |
+
The half UNet model with attention and timestep embedding.
|
| 695 |
+
|
| 696 |
+
For usage, see UNet.
|
| 697 |
+
"""
|
| 698 |
+
|
| 699 |
+
def __init__(
|
| 700 |
+
self,
|
| 701 |
+
image_size,
|
| 702 |
+
in_channels,
|
| 703 |
+
model_channels,
|
| 704 |
+
out_channels,
|
| 705 |
+
num_res_blocks,
|
| 706 |
+
attention_resolutions,
|
| 707 |
+
dropout=0,
|
| 708 |
+
channel_mult=(1, 2, 4, 8),
|
| 709 |
+
conv_resample=True,
|
| 710 |
+
dims=2,
|
| 711 |
+
use_checkpoint=False,
|
| 712 |
+
use_fp16=False,
|
| 713 |
+
num_heads=1,
|
| 714 |
+
num_head_channels=-1,
|
| 715 |
+
num_heads_upsample=-1,
|
| 716 |
+
use_scale_shift_norm=False,
|
| 717 |
+
resblock_updown=False,
|
| 718 |
+
use_new_attention_order=False,
|
| 719 |
+
pool="adaptive",
|
| 720 |
+
):
|
| 721 |
+
super().__init__()
|
| 722 |
+
|
| 723 |
+
if num_heads_upsample == -1:
|
| 724 |
+
num_heads_upsample = num_heads
|
| 725 |
+
|
| 726 |
+
self.in_channels = in_channels
|
| 727 |
+
self.model_channels = model_channels
|
| 728 |
+
self.out_channels = out_channels
|
| 729 |
+
self.num_res_blocks = num_res_blocks
|
| 730 |
+
self.attention_resolutions = attention_resolutions
|
| 731 |
+
self.dropout = dropout
|
| 732 |
+
self.channel_mult = channel_mult
|
| 733 |
+
self.conv_resample = conv_resample
|
| 734 |
+
self.use_checkpoint = use_checkpoint
|
| 735 |
+
self.dtype = th.float16 if use_fp16 else th.float32
|
| 736 |
+
self.num_heads = num_heads
|
| 737 |
+
self.num_head_channels = num_head_channels
|
| 738 |
+
self.num_heads_upsample = num_heads_upsample
|
| 739 |
+
|
| 740 |
+
time_embed_dim = model_channels * 4
|
| 741 |
+
self.time_embed = nn.Sequential(
|
| 742 |
+
linear(model_channels, time_embed_dim),
|
| 743 |
+
nn.SiLU(),
|
| 744 |
+
linear(time_embed_dim, time_embed_dim),
|
| 745 |
+
)
|
| 746 |
+
|
| 747 |
+
ch = int(channel_mult[0] * model_channels)
|
| 748 |
+
self.input_blocks = nn.ModuleList(
|
| 749 |
+
[TimestepEmbedSequential(conv_nd(dims, in_channels, ch, 3, padding=1))]
|
| 750 |
+
)
|
| 751 |
+
self._feature_size = ch
|
| 752 |
+
input_block_chans = [ch]
|
| 753 |
+
ds = 1
|
| 754 |
+
for level, mult in enumerate(channel_mult):
|
| 755 |
+
for _ in range(num_res_blocks):
|
| 756 |
+
layers = [
|
| 757 |
+
ResBlock(
|
| 758 |
+
ch,
|
| 759 |
+
time_embed_dim,
|
| 760 |
+
dropout,
|
| 761 |
+
out_channels=int(mult * model_channels),
|
| 762 |
+
dims=dims,
|
| 763 |
+
use_checkpoint=use_checkpoint,
|
| 764 |
+
use_scale_shift_norm=use_scale_shift_norm,
|
| 765 |
+
)
|
| 766 |
+
]
|
| 767 |
+
ch = int(mult * model_channels)
|
| 768 |
+
if ds in attention_resolutions:
|
| 769 |
+
layers.append(
|
| 770 |
+
AttentionBlock(
|
| 771 |
+
ch,
|
| 772 |
+
use_checkpoint=use_checkpoint,
|
| 773 |
+
num_heads=num_heads,
|
| 774 |
+
num_head_channels=num_head_channels,
|
| 775 |
+
use_new_attention_order=use_new_attention_order,
|
| 776 |
+
)
|
| 777 |
+
)
|
| 778 |
+
self.input_blocks.append(TimestepEmbedSequential(*layers))
|
| 779 |
+
self._feature_size += ch
|
| 780 |
+
input_block_chans.append(ch)
|
| 781 |
+
if level != len(channel_mult) - 1:
|
| 782 |
+
out_ch = ch
|
| 783 |
+
self.input_blocks.append(
|
| 784 |
+
TimestepEmbedSequential(
|
| 785 |
+
ResBlock(
|
| 786 |
+
ch,
|
| 787 |
+
time_embed_dim,
|
| 788 |
+
dropout,
|
| 789 |
+
out_channels=out_ch,
|
| 790 |
+
dims=dims,
|
| 791 |
+
use_checkpoint=use_checkpoint,
|
| 792 |
+
use_scale_shift_norm=use_scale_shift_norm,
|
| 793 |
+
down=True,
|
| 794 |
+
)
|
| 795 |
+
if resblock_updown
|
| 796 |
+
else Downsample(
|
| 797 |
+
ch, conv_resample, dims=dims, out_channels=out_ch
|
| 798 |
+
)
|
| 799 |
+
)
|
| 800 |
+
)
|
| 801 |
+
ch = out_ch
|
| 802 |
+
input_block_chans.append(ch)
|
| 803 |
+
ds *= 2
|
| 804 |
+
self._feature_size += ch
|
| 805 |
+
|
| 806 |
+
self.middle_block = TimestepEmbedSequential(
|
| 807 |
+
ResBlock(
|
| 808 |
+
ch,
|
| 809 |
+
time_embed_dim,
|
| 810 |
+
dropout,
|
| 811 |
+
dims=dims,
|
| 812 |
+
use_checkpoint=use_checkpoint,
|
| 813 |
+
use_scale_shift_norm=use_scale_shift_norm,
|
| 814 |
+
),
|
| 815 |
+
AttentionBlock(
|
| 816 |
+
ch,
|
| 817 |
+
use_checkpoint=use_checkpoint,
|
| 818 |
+
num_heads=num_heads,
|
| 819 |
+
num_head_channels=num_head_channels,
|
| 820 |
+
use_new_attention_order=use_new_attention_order,
|
| 821 |
+
),
|
| 822 |
+
ResBlock(
|
| 823 |
+
ch,
|
| 824 |
+
time_embed_dim,
|
| 825 |
+
dropout,
|
| 826 |
+
dims=dims,
|
| 827 |
+
use_checkpoint=use_checkpoint,
|
| 828 |
+
use_scale_shift_norm=use_scale_shift_norm,
|
| 829 |
+
),
|
| 830 |
+
)
|
| 831 |
+
self._feature_size += ch
|
| 832 |
+
self.pool = pool
|
| 833 |
+
if pool == "adaptive":
|
| 834 |
+
self.out = nn.Sequential(
|
| 835 |
+
normalization(ch),
|
| 836 |
+
nn.SiLU(),
|
| 837 |
+
nn.AdaptiveAvgPool2d((1, 1)),
|
| 838 |
+
zero_module(conv_nd(dims, ch, out_channels, 1)),
|
| 839 |
+
nn.Flatten(),
|
| 840 |
+
)
|
| 841 |
+
elif pool == "attention":
|
| 842 |
+
assert num_head_channels != -1
|
| 843 |
+
self.out = nn.Sequential(
|
| 844 |
+
normalization(ch),
|
| 845 |
+
nn.SiLU(),
|
| 846 |
+
AttentionPool2d(
|
| 847 |
+
(image_size // ds), ch, num_head_channels, out_channels
|
| 848 |
+
),
|
| 849 |
+
)
|
| 850 |
+
elif pool == "spatial":
|
| 851 |
+
self.out = nn.Sequential(
|
| 852 |
+
nn.Linear(self._feature_size, 2048),
|
| 853 |
+
nn.ReLU(),
|
| 854 |
+
nn.Linear(2048, self.out_channels),
|
| 855 |
+
)
|
| 856 |
+
elif pool == "spatial_v2":
|
| 857 |
+
self.out = nn.Sequential(
|
| 858 |
+
nn.Linear(self._feature_size, 2048),
|
| 859 |
+
normalization(2048),
|
| 860 |
+
nn.SiLU(),
|
| 861 |
+
nn.Linear(2048, self.out_channels),
|
| 862 |
+
)
|
| 863 |
+
else:
|
| 864 |
+
raise NotImplementedError(f"Unexpected {pool} pooling")
|
| 865 |
+
|
| 866 |
+
def convert_to_fp16(self):
|
| 867 |
+
"""
|
| 868 |
+
Convert the torso of the model to float16.
|
| 869 |
+
"""
|
| 870 |
+
self.input_blocks.apply(convert_module_to_f16)
|
| 871 |
+
self.middle_block.apply(convert_module_to_f16)
|
| 872 |
+
|
| 873 |
+
def convert_to_fp32(self):
|
| 874 |
+
"""
|
| 875 |
+
Convert the torso of the model to float32.
|
| 876 |
+
"""
|
| 877 |
+
self.input_blocks.apply(convert_module_to_f32)
|
| 878 |
+
self.middle_block.apply(convert_module_to_f32)
|
| 879 |
+
|
| 880 |
+
def forward(self, x, timesteps):
|
| 881 |
+
"""
|
| 882 |
+
Apply the model to an input batch.
|
| 883 |
+
|
| 884 |
+
:param x: an [N x C x ...] Tensor of inputs.
|
| 885 |
+
:param timesteps: a 1-D batch of timesteps.
|
| 886 |
+
:return: an [N x K] Tensor of outputs.
|
| 887 |
+
"""
|
| 888 |
+
emb = self.time_embed(timestep_embedding(timesteps, self.model_channels))
|
| 889 |
+
|
| 890 |
+
results = []
|
| 891 |
+
h = x.type(self.dtype)
|
| 892 |
+
for module in self.input_blocks:
|
| 893 |
+
h = module(h, emb)
|
| 894 |
+
if self.pool.startswith("spatial"):
|
| 895 |
+
results.append(h.type(x.dtype).mean(dim=(2, 3)))
|
| 896 |
+
h = self.middle_block(h, emb)
|
| 897 |
+
if self.pool.startswith("spatial"):
|
| 898 |
+
results.append(h.type(x.dtype).mean(dim=(2, 3)))
|
| 899 |
+
h = th.cat(results, axis=-1)
|
| 900 |
+
return self.out(h)
|
| 901 |
+
else:
|
| 902 |
+
h = h.type(x.dtype)
|
| 903 |
+
return self.out(h)
|
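The pool argument decides how the feature maps are reduced to a single [N x K] vector: "adaptive" global-average-pools and projects, "attention" uses AttentionPool2d, and the "spatial" variants mean-pool every saved resolution over H and W and concatenate (which is what _feature_size is sized for). A sketch with the adaptive head:

import torch as th

enc = EncoderUNetModel(
    image_size=64, in_channels=3, model_channels=64, out_channels=10,
    num_res_blocks=1, attention_resolutions=(), channel_mult=(1, 2),
    pool="adaptive",
)
logits = enc(th.randn(2, 3, 64, 64), th.tensor([0, 0]))  # [2, 10]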
| 904 |
+
|
| 905 |
+
|
| 906 |
+
#________________________________ tfg model ________________________________#
|
| 907 |
+
class TFGModel(UNetModel):
|
| 908 |
+
'''
|
| 909 |
+
Talking Face Generation using UNet model
|
| 910 |
+
'''
|
| 911 |
+
def __init__(self,
|
| 912 |
+
image_size,
|
| 913 |
+
in_channels,
|
| 914 |
+
model_channels,
|
| 915 |
+
out_channels,
|
| 916 |
+
*args,
|
| 917 |
+
use_ref=False,
|
| 918 |
+
nframes=1,
|
| 919 |
+
nrefer=0,
|
| 920 |
+
use_audio=False,
|
| 921 |
+
audio_encoder_kwargs=None,
|
| 922 |
+
audio_as_style=False,  # condition on audio as a style vector added to the timestep embedding, instead of concatenating features in the middle block
|
| 923 |
+
audio_as_style_encoder_mlp=False,  # use a plain MLP in place of the convolutional audio encoder
|
| 924 |
+
**kwargs
|
| 925 |
+
):
|
| 926 |
+
if use_ref:
|
| 927 |
+
super().__init__(image_size, in_channels * (1+1+nrefer), model_channels, out_channels * 1, *args, **kwargs)
|
| 928 |
+
else:
|
| 929 |
+
super().__init__(image_size, in_channels * (1+1), model_channels, out_channels * 1, *args, **kwargs)
|
| 930 |
+
|
| 931 |
+
|
| 932 |
+
self.use_ref = use_ref
|
| 933 |
+
self.nframes = nframes
|
| 934 |
+
self.nrefer = nrefer
|
| 935 |
+
self.use_audio = use_audio
|
| 936 |
+
|
| 937 |
+
if self.use_audio:
|
| 938 |
+
if audio_encoder_kwargs is not None:
|
| 939 |
+
self.audio_encoder_kwargs = audio_encoder_kwargs
|
| 940 |
+
else:
|
| 941 |
+
self.audio_encoder_kwargs = {}
|
| 942 |
+
|
| 943 |
+
self.audio_as_style = audio_as_style
|
| 944 |
+
self.audio_as_style_encoder_mlp = audio_as_style_encoder_mlp
|
| 945 |
+
|
| 946 |
+
self.audio_encoder = TFGAudioEncoder(
|
| 947 |
+
nframes = self.nframes,
|
| 948 |
+
dropout = self.dropout,
|
| 949 |
+
conv_resample = self.conv_resample,
|
| 950 |
+
dims = self.dims,
|
| 951 |
+
use_checkpoint=self.use_checkpoint,  # the encoder's kwarg is use_checkpoint; a misspelling would be silently swallowed by **kwargs
|
| 952 |
+
use_fp16 = self.use_fp16,
|
| 953 |
+
use_scale_shift_norm = self.use_scale_shift_norm,
|
| 954 |
+
resblock_updown = self.resblock_updown,
|
| 955 |
+
**self.audio_encoder_kwargs
|
| 956 |
+
)
|
| 957 |
+
|
| 958 |
+
if not self.audio_as_style:
|
| 959 |
+
# concatenate the audio encoding onto the video encoding in the middle block
|
| 960 |
+
old_middle_block_head = self.middle_block[0]
|
| 961 |
+
mid_img_ch = old_middle_block_head.channels
|
| 962 |
+
mid_aud_ch = self.audio_encoder.out_channels
|
| 963 |
+
self.middle_block[0] = ResBlock(
|
| 964 |
+
mid_img_ch + mid_aud_ch, #combined image and audio channels
|
| 965 |
+
old_middle_block_head.emb_channels,
|
| 966 |
+
old_middle_block_head.dropout,
|
| 967 |
+
out_channels = old_middle_block_head.out_channels,
|
| 968 |
+
dims = old_middle_block_head.dims,
|
| 969 |
+
use_checkpoint=old_middle_block_head.use_checkpoint,
|
| 970 |
+
use_scale_shift_norm=old_middle_block_head.use_scale_shift_norm,
|
| 971 |
+
)
|
| 972 |
+
else: # audio as style
|
| 973 |
+
if self.audio_as_style_encoder_mlp:
|
| 974 |
+
old_conv_encoder = self.audio_encoder
|
| 975 |
+
audio_dim = old_conv_encoder.audio_dim
|
| 976 |
+
audio_frames_per_video = old_conv_encoder.audio_frames_per_video
|
| 977 |
+
self.audio_encoder = nn.Sequential(
|
| 978 |
+
nn.Flatten(),
|
| 979 |
+
linear(audio_dim+audio_frames_per_video, self.time_embed_dim),
|
| 980 |
+
normalization(self.time_embed_dim),
|
| 981 |
+
nn.SiLU(),
|
| 982 |
+
linear(self.time_embed_dim, self.time_embed_dim),
|
| 983 |
+
)
|
| 984 |
+
else: # use conv_encoder+mlp to get style
|
| 985 |
+
# similar to the classifier defined
|
| 986 |
+
self.audio_encoder_to_style = nn.Sequential(
|
| 987 |
+
normalization(self.audio_encoder.out_channels),
|
| 988 |
+
nn.SiLU(),
|
| 989 |
+
nn.AdaptiveAvgPool2d((1,1)),
|
| 990 |
+
zero_module(  # zero-initializes the conv weights
|
| 991 |
+
conv_nd(self.dims, self.audio_encoder.out_channels, self.time_embed_dim, 1)
|
| 992 |
+
),
|
| 993 |
+
nn.Flatten(),
|
| 994 |
+
)
|
| 995 |
+
|
| 996 |
+
def convert_to_fp16(self):
|
| 997 |
+
"""
|
| 998 |
+
Convert the torso of the model to float16.
|
| 999 |
+
"""
|
| 1000 |
+
self.input_blocks.apply(convert_module_to_f16)
|
| 1001 |
+
self.middle_block.apply(convert_module_to_f16)
|
| 1002 |
+
self.output_blocks.apply(convert_module_to_f16)
|
| 1003 |
+
if self.use_audio:
|
| 1004 |
+
self.audio_encoder.apply(convert_module_to_f16)
|
| 1005 |
+
if self.audio_as_style:
|
| 1006 |
+
self.audio_encoder_to_style.apply(convert_module_to_f16)
|
| 1007 |
+
|
| 1008 |
+
def convert_to_fp32(self):
|
| 1009 |
+
"""
|
| 1010 |
+
Convert the torso of the model to float32.
|
| 1011 |
+
"""
|
| 1012 |
+
self.input_blocks.apply(convert_module_to_f32)
|
| 1013 |
+
self.middle_block.apply(convert_module_to_f32)
|
| 1014 |
+
self.output_blocks.apply(convert_module_to_f32)
|
| 1015 |
+
if self.use_audio:
|
| 1016 |
+
self.audio_encoder.apply(convert_module_to_f32)
|
| 1017 |
+
if self.audio_as_style:
|
| 1018 |
+
self.audio_encoder_to_style.apply(convert_module_to_f32)
|
| 1019 |
+
|
| 1020 |
+
def forward(self, x, timesteps, cond_img=None, mask=None, ref_img=None, indiv_mels=None, **kwargs):
|
| 1021 |
+
|
| 1022 |
+
# preprocessing
|
| 1023 |
+
x = x * mask + (1. - mask) * cond_img  # paste cond_img into the unmasked (visible) region
|
| 1024 |
+
x = th.cat([x, cond_img], dim=1)
|
| 1025 |
+
if self.use_ref:
|
| 1026 |
+
x = th.cat([x, ref_img], dim=1)
|
| 1027 |
+
|
| 1028 |
+
emb = self.time_embed(timestep_embedding(timesteps, self.model_channels))
|
| 1029 |
+
|
| 1030 |
+
|
| 1031 |
+
if self.use_audio:
|
| 1032 |
+
if self.audio_as_style:
|
| 1033 |
+
#audio encoder
|
| 1034 |
+
if self.audio_as_style_encoder_mlp:  # the MLP path runs in fp32
|
| 1035 |
+
a = self.audio_encoder(indiv_mels)
|
| 1036 |
+
a = self.audio_encoder_to_style(a)
|
| 1037 |
+
a = a.type(self.dtype)
|
| 1038 |
+
else:  # the conv encoder path runs in fp16
|
| 1039 |
+
a = indiv_mels.type(self.dtype)
|
| 1040 |
+
a = self.audio_encoder(a)
|
| 1041 |
+
a = self.audio_encoder_to_style(a)
|
| 1042 |
+
#combine
|
| 1043 |
+
emb = emb + a
|
| 1044 |
+
#video encoder
|
| 1045 |
+
hs = []
|
| 1046 |
+
h = x.type(self.dtype)
|
| 1047 |
+
for module in self.input_blocks:
|
| 1048 |
+
h = module(h, emb)
|
| 1049 |
+
hs.append(h)
|
| 1050 |
+
|
| 1051 |
+
else: # concat audio in the middle
|
| 1052 |
+
#audio encoder
|
| 1053 |
+
a = indiv_mels.type(self.dtype)
|
| 1054 |
+
a = self.audio_encoder(a)
|
| 1055 |
+
#video encoder
|
| 1056 |
+
hs = []
|
| 1057 |
+
h = x.type(self.dtype)
|
| 1058 |
+
for module in self.input_blocks:
|
| 1059 |
+
h = module(h, emb)
|
| 1060 |
+
hs.append(h)
|
| 1061 |
+
#combine
|
| 1062 |
+
h = th.cat([h, a], dim=1)
|
| 1063 |
+
|
| 1064 |
+
#middle block
|
| 1065 |
+
h = self.middle_block(h, emb)
|
| 1066 |
+
|
| 1067 |
+
# decoder
|
| 1068 |
+
for module in self.output_blocks:
|
| 1069 |
+
h = th.cat([h, hs.pop()], dim=1)
|
| 1070 |
+
h = module(h, emb)
|
| 1071 |
+
h = h.type(x.dtype)
|
| 1072 |
+
return self.out(h)
|
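The conditioning composition at the top of TFGModel.forward, spelled out: the visible region of x is replaced by cond_img, and the channel axis then stacks [masked x | cond_img | ref_img], matching in_channels * (1 + 1 + nrefer) from __init__. A sketch with toy shapes; which half the mask hides is an assumption based on face_hide_percentage 0.5 in the inference scripts.

import torch as th

B, C, H, W = 4, 3, 128, 128
x = th.randn(B, C, H, W)          # noisy target frames (batch folded with frames)
cond_img = th.randn(B, C, H, W)   # conditioning frames with the region to inpaint
ref_img = th.randn(B, C, H, W)    # identity reference frame(s), nrefer=1
mask = th.ones(B, 1, H, W)
mask[:, :, :H // 2, :] = 0.0      # 0 = keep cond_img (top half), 1 = generate (mouth half)

x = x * mask + (1.0 - mask) * cond_img
x = th.cat([x, cond_img, ref_img], dim=1)  # -> [B, 3 * C, H, W]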
| 1073 |
+
|
| 1074 |
+
|
| 1075 |
+
class TFGAudioEncoder(nn.Module):
|
| 1076 |
+
"""
|
| 1077 |
+
Audio Encoder
|
| 1078 |
+
|
| 1079 |
+
with audio_dim = 80,
|
| 1080 |
+
audio_frames_per_video = 16
|
| 1081 |
+
init_spatial_dim = 64
|
| 1082 |
+
model_channels=32
|
| 1083 |
+
channel_mult=(1,2,3,4)
|
| 1084 |
+
|
| 1085 |
+
following are the output shapes ->
|
| 1086 |
+
init: [BF, 80, 16]
|
| 1087 |
+
after in_block: [BF, 64, 16]
|
| 1088 |
+
adding new dim [BF, 1, 64, 16]
|
| 1089 |
+
encoder block before entering the loop: [BF, 32, 64, 16]
|
| 1090 |
+
level: 0
|
| 1091 |
+
0 _ 0 : [BF, 32, 64, 16]
|
| 1092 |
+
0 _ 1 : [BF, 32, 64, 16]
|
| 1093 |
+
0 _ 2 : [BF, 32, 32, 16]
|
| 1094 |
+
level: 1
|
| 1095 |
+
1 _ 0 : [BF, 64, 32, 16]
|
| 1096 |
+
1 _ 1 : [BF, 64, 32, 16]
|
| 1097 |
+
1 _ 2 : [BF, 64, 16, 16]
|
| 1098 |
+
level: 2
|
| 1099 |
+
2 _ 0 : [BF, 96, 16, 16]
|
| 1100 |
+
2 _ 1 : [BF, 96, 16, 16]
|
| 1101 |
+
2 _ 2 : [BF, 96, 8, 8]
|
| 1102 |
+
level: 3
|
| 1103 |
+
3 _ 0 : [BF, 128, 8, 8]
|
| 1104 |
+
3 _ 1 : [BF, 128, 8, 8]
|
| 1105 |
+
middle block: [BF, 128, 8, 8]
|
| 1106 |
+
out: [BF, 128, 8, 8]
|
| 1107 |
+
"""
|
| 1108 |
+
def __init__(
|
| 1109 |
+
self,
|
| 1110 |
+
audio_dim = 80,
|
| 1111 |
+
audio_frames_per_video = 16,
|
| 1112 |
+
nframes=1,
|
| 1113 |
+
|
| 1114 |
+
init_spatial_dim = 64,
|
| 1115 |
+
model_channels=32,
|
| 1116 |
+
out_channels=-1,
|
| 1117 |
+
num_res_blocks=2,
|
| 1118 |
+
dropout=0,
|
| 1119 |
+
channel_mult=(1,2,3,4), #(1,1,2,4,8),
|
| 1120 |
+
conv_resample=True,
|
| 1121 |
+
dims=2,
|
| 1122 |
+
use_checkpoint = False,
|
| 1123 |
+
use_fp16=False,
|
| 1124 |
+
use_scale_shift_norm=False,
|
| 1125 |
+
resblock_updown=False,
|
| 1126 |
+
**kwargs
|
| 1127 |
+
):
|
| 1128 |
+
super().__init__()
|
| 1129 |
+
self.audio_dim = audio_dim
|
| 1130 |
+
self.audio_frames_per_video = audio_frames_per_video
|
| 1131 |
+
self.nframes = nframes
|
| 1132 |
+
self.model_channels = model_channels
|
| 1133 |
+
self.out_channels = out_channels if out_channels > 0 else model_channels * channel_mult[-1]
|
| 1134 |
+
self.num_res_blocks = num_res_blocks
|
| 1135 |
+
self.dropout = dropout
|
| 1136 |
+
self.channel_mult = channel_mult
|
| 1137 |
+
self.conv_resample = conv_resample
|
| 1138 |
+
self.use_checkpoint = use_checkpoint
|
| 1139 |
+
self.dtype = th.float16 if use_fp16 else th.float32
|
| 1140 |
+
|
| 1141 |
+
time_embed_dim = model_channels
|
| 1142 |
+
self.time_embed = nn.Sequential(
|
| 1143 |
+
linear(model_channels, time_embed_dim),
|
| 1144 |
+
nn.SiLU(),
|
| 1145 |
+
linear(time_embed_dim, time_embed_dim),
|
| 1146 |
+
)
|
| 1147 |
+
|
| 1148 |
+
ch = int(channel_mult[0] * model_channels)
|
| 1149 |
+
# init_spatial_dim = 4 * ( 2** (len(channel_mult)-1))
|
| 1150 |
+
|
| 1151 |
+
# convert spatial dim 80->64 using Conv1D: [N*F,80,16] -> [N*F, 64, 16]
|
| 1152 |
+
_conv_dim, _in_channels, _out_channels = 1, self.audio_dim, init_spatial_dim
|
| 1153 |
+
self.input_block = TimestepEmbedSequential(
|
| 1154 |
+
conv_nd(_conv_dim, _in_channels, _out_channels, 3, padding=1),
|
| 1155 |
+
normalization(_out_channels),
|
| 1156 |
+
nn.SiLU()
|
| 1157 |
+
)
|
| 1158 |
+
|
| 1159 |
+
# manually reshape [N*F, 64, 16] -> [N*F, 1, 64, 16] in forward()
|
| 1160 |
+
|
| 1161 |
+
# [N*F, 1, 64, 16] -> [N*F, model_channels*channel_mult[0], 64, 16]
|
| 1162 |
+
# can't use a ResBlock here: normalization() uses 32 groups, which a 1-channel input cannot satisfy
|
| 1163 |
+
self.encoder_blocks = nn.ModuleList(
|
| 1164 |
+
[
|
| 1165 |
+
TimestepEmbedSequential(
|
| 1166 |
+
conv_nd(dims, 1, ch, 3, padding=1 )
|
| 1167 |
+
)
|
| 1168 |
+
|
| 1169 |
+
]
|
| 1170 |
+
)
|
| 1171 |
+
|
| 1172 |
+
self._feature_size = ch
|
| 1173 |
+
input_block_chans = [ch]
|
| 1174 |
+
|
| 1175 |
+
ds = 1
|
| 1176 |
+
for level, mult in enumerate(channel_mult):
|
| 1177 |
+
for _ in range(num_res_blocks):
|
| 1178 |
+
layers = [
|
| 1179 |
+
ResBlock(
|
| 1180 |
+
ch,
|
| 1181 |
+
time_embed_dim,
|
| 1182 |
+
dropout,
|
| 1183 |
+
out_channels=int(mult*model_channels),
|
| 1184 |
+
dims = dims,
|
| 1185 |
+
use_checkpoint=use_checkpoint,
|
| 1186 |
+
use_scale_shift_norm=use_scale_shift_norm,
|
| 1187 |
+
)
|
| 1188 |
+
]
|
| 1189 |
+
ch = int(mult*model_channels)
|
| 1190 |
+
self.encoder_blocks.append(TimestepEmbedSequential(*layers))
|
| 1191 |
+
self._feature_size += ch
|
| 1192 |
+
input_block_chans.append(ch)
|
| 1193 |
+
if level != len(channel_mult)-1:
|
| 1194 |
+
out_ch = ch
|
| 1195 |
+
self.encoder_blocks.append(
|
| 1196 |
+
TimestepEmbedSequential(
|
| 1197 |
+
ResBlock(
|
| 1198 |
+
ch,
|
| 1199 |
+
time_embed_dim,
|
| 1200 |
+
dropout,
|
| 1201 |
+
out_channels=out_ch,
|
| 1202 |
+
dims = dims,
|
| 1203 |
+
use_checkpoint=use_checkpoint,
|
| 1204 |
+
use_scale_shift_norm=use_scale_shift_norm,
|
| 1205 |
+
down = True,
|
| 1206 |
+
down_stride = (2,1) if (init_spatial_dim//ds) > self.audio_frames_per_video else (2,2),
|
| 1207 |
+
)
|
| 1208 |
+
if resblock_updown
|
| 1209 |
+
else Downsample(
|
| 1210 |
+
ch, conv_resample, dims=dims, out_channels=out_ch,
|
| 1211 |
+
stride=(2, 1) if (init_spatial_dim // ds) > self.audio_frames_per_video else (2, 2),  # Downsample's kwarg is stride; down_stride would raise a TypeError
|
| 1212 |
+
)
|
| 1213 |
+
)
|
| 1214 |
+
)
|
| 1215 |
+
ch = out_ch
|
| 1216 |
+
input_block_chans.append(ch)
|
| 1217 |
+
ds *= 2
|
| 1218 |
+
self._feature_size += ch
|
| 1219 |
+
|
| 1220 |
+
self.middle_block = TimestepEmbedSequential(
|
| 1221 |
+
ResBlock(
|
| 1222 |
+
ch,
|
| 1223 |
+
time_embed_dim,
|
| 1224 |
+
dropout,
|
| 1225 |
+
out_channels=self.out_channels,
|
| 1226 |
+
dims=dims,
|
| 1227 |
+
use_checkpoint=use_checkpoint,
|
| 1228 |
+
use_scale_shift_norm=use_scale_shift_norm,
|
| 1229 |
+
),
|
| 1230 |
+
)
|
| 1231 |
+
self._feature_size += ch
|
| 1232 |
+
|
| 1233 |
+
|
| 1234 |
+
# self.out = Upsample(self.out_channels, False, dims)
|
| 1235 |
+
self.out = nn.Identity()
|
| 1236 |
+
|
| 1237 |
+
def convert_to_fp16(self):
|
| 1238 |
+
"""
|
| 1239 |
+
Convert the torso of the model to float16.
|
| 1240 |
+
"""
|
| 1241 |
+
self.input_block.apply(convert_module_to_f16)  # this encoder defines input_block (singular)
|
| 1242 |
+
self.middle_block.apply(convert_module_to_f16)
|
| 1243 |
+
self.encoder_blocks.apply(convert_module_to_f16)  # this encoder has encoder_blocks, not output_blocks
|
| 1244 |
+
|
| 1245 |
+
def convert_to_fp32(self):
|
| 1246 |
+
"""
|
| 1247 |
+
Convert the torso of the model to float32.
|
| 1248 |
+
"""
|
| 1249 |
+
self.input_block.apply(convert_module_to_f32)
|
| 1250 |
+
self.middle_block.apply(convert_module_to_f32)
|
| 1251 |
+
self.encoder_blocks.apply(convert_module_to_f32)
|
| 1252 |
+
|
| 1253 |
+
|
| 1254 |
+
def forward(self, x):
|
| 1255 |
+
h = x.type(self.dtype)
|
| 1256 |
+
BF_in, H_in, W_in = h.shape
|
| 1257 |
+
|
| 1258 |
+
# fixed (all-zero) timestep embedding, so the timestep-conditioned blocks can be reused unchanged
|
| 1259 |
+
t = th.zeros(BF_in, dtype=th.long, device = x.device)
|
| 1260 |
+
emb = self.time_embed(timestep_embedding(t, self.model_channels))
|
| 1261 |
+
|
| 1262 |
+
# 80 -> 64 using Conv1D
|
| 1263 |
+
h = self.input_block(h, emb)
|
| 1264 |
+
_, H, W = h.shape
|
| 1265 |
+
h = h.reshape(BF_in, 1, H, W)  # [B*F, 64, 16] -> [B*F, 1, 64, 16]
|
| 1266 |
+
# call encoder blocks
|
| 1267 |
+
for module in self.encoder_blocks:
|
| 1268 |
+
h = module(h, emb)
|
| 1269 |
+
h = self.middle_block(h, emb)
|
| 1270 |
+
h = h.type(x.dtype)
|
| 1271 |
+
return self.out(h)  # -> [B*F, out_channels, 8, 8]
|
| 1272 |
+
#______________________________________________________________________#
|
| 1273 |
+
|
| 1274 |
+
|
| 1275 |
+
|
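A shape check matching the walkthrough in the TFGAudioEncoder docstring, using the class defaults (audio_dim 80, 16 audio frames per video frame, model_channels 32, channel_mult (1,2,3,4)):

import torch as th

aud_enc = TFGAudioEncoder()   # defaults as documented above
mels = th.randn(6, 80, 16)    # [B*F, audio_dim, audio_frames_per_video]
feat = aud_enc(mels)
print(feat.shape)             # torch.Size([6, 128, 8, 8])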
guided-diffusion/setup.py
ADDED
|
@@ -0,0 +1,7 @@
|
| 1 |
+
from setuptools import setup
|
| 2 |
+
|
| 3 |
+
setup(
|
| 4 |
+
name="guided-diffusion",
|
| 5 |
+
py_modules=["guided_diffusion"],
|
| 6 |
+
install_requires=["blobfile>=1.0.5", "torch", "tqdm"],
|
| 7 |
+
)
|
requirements.txt
ADDED
|
@@ -0,0 +1,11 @@
|
| 1 |
+
librosa==0.9.2
|
| 2 |
+
opencv-python==4.5.5.64
|
| 3 |
+
opencv-contrib-python==4.6.0.66
|
| 4 |
+
tensorboard==2.11.0
|
| 5 |
+
tqdm==4.64.1
|
| 6 |
+
mpi4py-mpich==3.1.2
|
| 7 |
+
av==9.2.0
|
| 8 |
+
torch --extra-index-url https://download.pytorch.org/whl/cu113
|
| 9 |
+
torchvision --extra-index-url https://download.pytorch.org/whl/cu113
|
| 10 |
+
torchaudio --extra-index-url https://download.pytorch.org/whl/cu113
|
| 11 |
+
-e ./guided-diffusion
|
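Typical setup is a single install from the repo root; the last line above installs the bundled guided-diffusion package in editable mode via its setup.py, and the --extra-index-url lines pull CUDA 11.3 PyTorch wheels (adjust to your CUDA version if needed):

pip install -r requirements.txt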
scripts/inference.sh
ADDED
|
@@ -0,0 +1,40 @@
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
|
| 3 |
+
#set paths and arguments
|
| 4 |
+
real_video_root='dataset/VoxCeleb2/vox2_test_mp4/mp4/'
|
| 5 |
+
model_path="checkpoints/checkpoint.pt"
|
| 6 |
+
sample_path="output_dir"
|
| 7 |
+
sample_mode="cross" # or "reconstruction"
|
| 8 |
+
NUM_GPUS=2
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
#cross vs reconstruction
|
| 15 |
+
filelist_recon='dataset/filelists/voxceleb2_test_n_5000_reconstruction_5k.txt'
|
| 16 |
+
filelist_cross='dataset/filelists/voxceleb2_test_n_5000_seed_797_cross_5K.txt'
|
| 17 |
+
if [ "$sample_mode" = "reconstruction" ]; then
|
| 18 |
+
sample_input_flags="--sampling_input_type=first_frame --sampling_ref_type=first_frame"
|
| 19 |
+
filelist=$filelist_recon
|
| 20 |
+
elif [ "$sample_mode" = "cross" ]; then
|
| 21 |
+
sample_input_flags="--sampling_input_type=gt --sampling_ref_type=gt"
|
| 22 |
+
filelist=$filelist_cross
|
| 23 |
+
else
|
| 24 |
+
echo "Error: sample_mode can only be \"cross\" or \"reconstruction\""
|
| 25 |
+
exit 1
|
| 26 |
+
fi
|
| 27 |
+
test_video_dir=$real_video_root
|
| 28 |
+
mkdir -p $sample_path
|
| 29 |
+
MODEL_FLAGS="--attention_resolutions 32,16,8 --class_cond False --learn_sigma True --num_channels 128 --num_head_channels 64 --num_res_blocks 2 --resblock_updown True --use_fp16 True --use_scale_shift_norm False"
|
| 30 |
+
DIFFUSION_FLAGS="--predict_xstart False --diffusion_steps 1000 --noise_schedule linear --rescale_timesteps False"
|
| 31 |
+
SAMPLE_FLAGS="--sampling_seed=7 $sample_input_flags --timestep_respacing ddim25 --use_ddim True --model_path=$model_path --sample_path=$sample_path"
|
| 32 |
+
DATA_FLAGS="--nframes 5 --nrefer 1 --image_size 128 --sampling_batch_size=32 "
|
| 33 |
+
TFG_FLAGS="--face_hide_percentage 0.5 --use_ref=True --use_audio=True --audio_as_style=True"
|
| 34 |
+
GEN_FLAGS="--generate_from_filelist 1 --test_video_dir=$test_video_dir --filelist=$filelist --save_orig=False --face_det_batch_size 64 --pads 0,0,0,0"
|
| 35 |
+
|
| 36 |
+
if [ "$NUM_GPUS" -gt 1 ]; then
|
| 37 |
+
mpiexec -n $NUM_GPUS python generate_dist.py $MODEL_FLAGS $DIFFUSION_FLAGS $SAMPLE_FLAGS $DATA_FLAGS $TFG_FLAGS $GEN_FLAGS
|
| 38 |
+
else
|
| 39 |
+
python generate.py $MODEL_FLAGS $DIFFUSION_FLAGS $SAMPLE_FLAGS $DATA_FLAGS $TFG_FLAGS $GEN_FLAGS
|
| 40 |
+
fi
|
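Usage note: with NUM_GPUS greater than 1 (2 above), sampling is sharded across ranks via mpiexec and generate_dist.py; setting NUM_GPUS=1 falls back to single-process generate.py with the same flags. Run from the repo root, e.g. bash scripts/inference.sh.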
scripts/inference_single_video.sh
ADDED
|
@@ -0,0 +1,35 @@
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
|
| 3 |
+
#set paths and arguments
|
| 4 |
+
sample_mode="cross" # or "reconstruction"
|
| 5 |
+
NUM_GPUS=1
|
| 6 |
+
generate_from_filelist=0
|
| 7 |
+
video_path="path/to/video.mp4"
|
| 8 |
+
audio_path="path/to/audio.mp4"
|
| 9 |
+
out_path="path/to/output.mp4"
|
| 10 |
+
model_path="path/to/model.pt"
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
#cross vs reconstruction
|
| 15 |
+
if [ "$sample_mode" = "reconstruction" ]; then
|
| 16 |
+
sample_input_flags="--sampling_input_type=first_frame --sampling_ref_type=first_frame"
|
| 17 |
+
elif [ "$sample_mode" = "cross" ]; then
|
| 18 |
+
sample_input_flags="--sampling_input_type=gt --sampling_ref_type=gt"
|
| 19 |
+
else
|
| 20 |
+
echo "Error: sample_mode can only be \"cross\" or \"reconstruction\""
|
| 21 |
+
exit 1
|
| 22 |
+
fi
|
| 23 |
+
|
| 24 |
+
MODEL_FLAGS="--attention_resolutions 32,16,8 --class_cond False --learn_sigma True --num_channels 128 --num_head_channels 64 --num_res_blocks 2 --resblock_updown True --use_fp16 True --use_scale_shift_norm False"
|
| 25 |
+
DIFFUSION_FLAGS="--predict_xstart False --diffusion_steps 1000 --noise_schedule linear --rescale_timesteps False"
|
| 26 |
+
SAMPLE_FLAGS="--sampling_seed=7 $sample_input_flags --timestep_respacing ddim25 --use_ddim True --model_path=$model_path"
|
| 27 |
+
DATA_FLAGS="--nframes 5 --nrefer 1 --image_size 128 --sampling_batch_size=32 "
|
| 28 |
+
TFG_FLAGS="--face_hide_percentage 0.5 --use_ref=True --use_audio=True --audio_as_style=True"
|
| 29 |
+
GEN_FLAGS="--generate_from_filelist $generate_from_filelist --video_path=$video_path --audio_path=$audio_path --out_path=$out_path --save_orig=False --face_det_batch_size 64 --pads 0,0,0,0 --is_voxceleb2=False"
|
| 30 |
+
|
| 31 |
+
if [ "$NUM_GPUS" -gt 1 ]; then
|
| 32 |
+
mpiexec -n $NUM_GPUS python generate_dist.py $MODEL_FLAGS $DIFFUSION_FLAGS $SAMPLE_FLAGS $DATA_FLAGS $TFG_FLAGS $GEN_FLAGS
|
| 33 |
+
else
|
| 34 |
+
python generate.py $MODEL_FLAGS $DIFFUSION_FLAGS $SAMPLE_FLAGS $DATA_FLAGS $TFG_FLAGS $GEN_FLAGS
|
| 35 |
+
fi
|
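Usage note: set video_path, audio_path, out_path, and model_path at the top before running. With is_voxceleb2=False the script treats the inputs as arbitrary face videos rather than VoxCeleb2 file IDs (an inference from the flag name). Run: bash scripts/inference_single_video.sh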