File size: 3,262 Bytes
bd546bf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import torch
from einops import rearrange
from moviepy.editor import VideoFileClip
import numpy as np
from moviepy.editor import (VideoFileClip,
                            AudioFileClip,
                            concatenate_videoclips,
                            CompositeAudioClip,
                            ImageSequenceClip,
                            )
from moviepy.audio.AudioClip import AudioArrayClip
def read_frames(video_path:str):
    """Load every frame of a video together with its audio clip handle.

    Args:
        video_path: Path of the video file to open.

    Returns:
        A dict with:
            "video": numpy array built from all frames yielded by the clip.
            "fps": frame rate reported by the clip.
            "audio_clip": the clip's ``audio`` attribute, returned as-is
                (not decoded here).
    """
    clip = VideoFileClip(video_path)
    # NOTE(review): the clip is deliberately left open; the returned
    # audio_clip presumably still reads through it -- confirm before closing.
    decoded_frames = np.array(list(clip.iter_frames()))
    return {
        "video": decoded_frames,
        "fps": clip.fps,
        "audio_clip": clip.audio,
    }
def audio2array(audio_clip,start_frame,end_frame,video_fps,sampling_rate,audio_len):
    """Extract the audio under a range of video frames as a fixed-length array.

    Args:
        audio_clip: Clip object exposing ``fps``, ``subclip``, ``set_fps`` and
            ``iter_chunks`` (e.g. the "audio_clip" entry of ``read_frames``).
        start_frame: First video frame index of the window.
        end_frame: Video frame index at which the window ends.
        video_fps: Video frame rate used to convert frame indices to seconds.
        sampling_rate: Rate at which the audio window is sampled.
        audio_len: Number of samples in the returned array.

    Returns:
        A dict with:
            "audio": 1-D array of exactly ``audio_len`` samples.
            "fps": the ``fps`` of the *input* ``audio_clip`` (read before
                resampling), not ``sampling_rate``.
    """
    source_fps = audio_clip.fps
    # Frame indices -> seconds on the clip's timeline.
    window_start = start_frame / video_fps
    window_end = end_frame / video_fps
    segment = audio_clip.subclip(window_start, window_end).set_fps(sampling_rate)
    chunk_list = list(segment.iter_chunks(fps=sampling_rate, chunksize=50000))
    samples = np.concatenate(chunk_list, axis=0)
    # Multi-channel audio (samples x channels) is averaged down to mono.
    if samples.ndim == 2:
        samples = samples.mean(axis=1)
    # Both grids are integer sample positions, so this keeps the first
    # ``audio_len`` samples; positions past the end of the decoded audio take
    # the last sample's value (np.interp clamps outside its input range).
    target_positions = np.arange(0, audio_len)
    source_positions = np.arange(0, len(samples))
    samples = np.interp(target_positions, source_positions, samples)
    return {"audio": samples, "fps": source_fps}
def read_video(video_path,sampling_rate:int = 16000,frame_idx=None):
    """Decode a video file into its frames and a mono audio array.

    The clip is closed before returning: everything handed back is already
    materialised into numpy arrays, so the underlying readers are no longer
    needed and would otherwise be left open.

    Args:
        video_path: Path of the video file to open.
        sampling_rate: Rate at which the audio is sampled.
        frame_idx: Optional sequence of frame indices. Only its first and last
            entries are used, to bound the audio window to
            ``[frame_idx[0] / fps, frame_idx[-1] / fps]`` seconds. The returned
            frames are NOT restricted to this range. When ``None``, the audio
            window covers all decoded frames.

    Returns:
        A dict with:
            "video": numpy array built from all frames of the clip.
            "audio": audio samples at ``sampling_rate``; averaged over
                channels when the decoded array is 2-D.
            "video_fps": frame rate reported by the clip.
            "audio_fps": ``fps`` of the source audio clip (not
                ``sampling_rate``).

    Raises:
        AttributeError: if the clip's ``audio`` attribute is ``None``
            (presumably a file without an audio track -- TODO confirm).
    """
    video_clip = VideoFileClip(video_path)
    try:
        audio_clip = video_clip.audio

        frames = np.array(list(video_clip.iter_frames()))
        video_fps = video_clip.fps
        audio_fps = audio_clip.fps
        if frame_idx is not None:
            start_time = frame_idx[0] / video_fps
            end_time = frame_idx[-1] / video_fps
        else:
            start_time = 0
            end_time = len(frames) / video_fps
        sampled_audio = audio_clip.subclip(start_time, end_time)
        sampled_audio = sampled_audio.set_fps(sampling_rate)
        audio_chunks = list(sampled_audio.iter_chunks(fps=sampling_rate,chunksize=50000))
        audio = np.concatenate(audio_chunks, axis=0)
        # Multi-channel audio (samples x channels) is averaged down to mono.
        if audio.ndim == 2:
            audio = np.mean(audio,axis=1)
    finally:
        # Release the clip's readers even when decoding fails part-way.
        video_clip.close()
    return {
        "video": frames,
        "audio": audio,
        "video_fps" : video_fps,
        "audio_fps": audio_fps
    }
def write_video(output_path, frames, audio, video_fps, audio_fps):
    """Encode frames and an audio array into a video file.

    Args:
        output_path: Destination path of the written video.
        frames: Iterable of frames; converted to a list for the image
            sequence clip.
        audio: Audio sample array wrapped in an ``AudioArrayClip``.
            NOTE(review): moviepy presumably expects a 2-D
            (samples x channels) array here -- confirm against callers.
        video_fps: Frame rate of the image sequence.
        audio_fps: Sample rate given to the audio clip.

    The file is written with the 'libx264' video codec and 'aac' audio codec.
    """
    movie = ImageSequenceClip(list(frames), fps=video_fps).set_audio(
        AudioArrayClip(audio, audio_fps)
    )
    movie.write_videofile(output_path, codec='libx264', audio_codec='aac')
def tensor2frames(x:torch.Tensor):
    """Convert a batch of image tensors into a list of uint8 numpy frames.

    Args:
        x: Tensor laid out as (n, c, h, w), or (b, t, c, h, w), in which case
            the first two axes are merged. Values are mapped with
            ``v / 2 + 0.5`` and clamped to [0, 1], i.e. -1 maps to 0 and
            +1 maps to 255 after scaling.

    Returns:
        A list with one (h, w, c) uint8 numpy array per image.
    """
    if x.ndim == 5:
        x = rearrange(x,'b t c h w -> (b t) c h w')

    def _to_uint8_hwc(chw):
        # Channels-first -> channels-last, then rescale to the 0..255 range.
        hwc = chw.permute(1, 2, 0)
        unit_range = (hwc / 2.0 + 0.5).clamp(0, 1)
        as_bytes = (unit_range * 255).to(dtype=torch.uint8)
        return as_bytes.cpu().contiguous().numpy()

    return [_to_uint8_hwc(image) for image in x]
if __name__ == "__main__":
    import sys

    # Manual smoke test: load a video and print the audio array extracted for
    # frames 0..16 at 24 fps, sampled at 16 kHz and fixed to 10000 samples.
    # The video path may be given as the first command-line argument; without
    # one, the original hard-coded sample file is used.
    default_video_path = "/mnt/spaceai-data/tts/team/digital_avatar_group/fenghe/datasets/celebv-hq/-3Bl8i34Z7Q_0.mp4"
    video_path = sys.argv[1] if len(sys.argv) > 1 else default_video_path
    d = read_frames(video_path)
    print(audio2array(d["audio_clip"],0,16,24,16000,10000))