Spaces:

JackIsNotInTheBox
/

Generate_Audio_for_Video

Sleeping

Generate_Audio_for_Video / MMAudio / mmaudio / utils / video_joiner.py

Jack Wu

c60109f 28 days ago

2.28 kB

	from pathlib import Path
	from typing import Union

	import torch
	from torio.io import StreamingMediaDecoder, StreamingMediaEncoder


	class VideoJoiner:
	    """Attach audio tracks to videos stored under a common source directory.

	    The sample rate and clip duration are fixed when the joiner is created;
	    every call to ``join`` forwards them to ``merge_audio_into_video``.
	    """

	    def __init__(self, src_root: Union[str, Path], output_root: Union[str, Path], sample_rate: int,
	                 duration_seconds: float):
	        """Store the settings and make sure the output directory exists.

	        Args:
	            src_root: Directory holding the source ``<video_id>.mp4`` files.
	            output_root: Directory that receives the joined files. It is
	                created, including missing parents, if it does not exist.
	            sample_rate: Sample rate passed along with the audio on each join.
	            duration_seconds: Clip duration passed along on each join.
	        """
	        source_dir = Path(src_root)
	        target_dir = Path(output_root)

	        # Create the destination up front so later joins can write straight away.
	        target_dir.mkdir(parents=True, exist_ok=True)

	        self.src_root = source_dir
	        self.output_root = target_dir
	        self.sample_rate = sample_rate
	        self.duration_seconds = duration_seconds

	    def join(self, video_id: str, output_name: str, audio: torch.Tensor):
	        """Merge ``audio`` into ``<src_root>/<video_id>.mp4``.

	        The result is written to ``<output_root>/<output_name>.mp4``.

	        Args:
	            video_id: File stem of the source video inside ``src_root``.
	            output_name: File stem of the file to create inside ``output_root``.
	            audio: Audio tensor handed unchanged to ``merge_audio_into_video``.
	        """
	        source_file = self.src_root.joinpath(f'{video_id}.mp4')
	        target_file = self.output_root.joinpath(f'{output_name}.mp4')
	        merge_audio_into_video(
	            source_file,
	            target_file,
	            audio,
	            self.sample_rate,
	            self.duration_seconds,
	        )


	def merge_audio_into_video(video_path: Union[str, Path],
	                           output_path: Union[str, Path],
	                           audio: torch.Tensor,
	                           sample_rate: int,
	                           duration_seconds: float,
	                           frame_rate: int = 24):
	    """Write a copy of a video clip with ``audio`` as its sound track.

	    The video at ``video_path`` is decoded as RGB frames at ``frame_rate`` and
	    only the first decoded chunk -- up to ``int(frame_rate * duration_seconds)``
	    frames -- is kept. That chunk is re-encoded with libx264 (yuv420p) and
	    written to ``output_path`` together with ``audio`` encoded by libmp3lame.
	    No audio stream is read from the source video.

	    Args:
	        video_path: Source video file.
	        output_path: File to create.
	        audio: Waveform of shape (num_samples, num_channels) with one or two
	            channels. It is converted to float32 before being encoded.
	        sample_rate: Sample rate of ``audio`` in Hz.
	        duration_seconds: Length of video to keep, in seconds.
	        frame_rate: Frame rate used both for decoding and for the output
	            video stream. Defaults to 24, the value that used to be
	            hard-coded here.

	    Raises:
	        ValueError: If ``frame_rate * duration_seconds`` is less than one frame.
	        RuntimeError: If no video frames could be decoded from ``video_path``.
	    """
	    frames_per_chunk = int(frame_rate * duration_seconds)
	    if frames_per_chunk <= 0:
	        raise ValueError(
	            f'frame_rate * duration_seconds must cover at least one frame, got '
	            f'frame_rate={frame_rate}, duration_seconds={duration_seconds}')

	    # Read the video.
	    reader = StreamingMediaDecoder(video_path)
	    reader.add_basic_video_stream(
	        frames_per_chunk=frames_per_chunk,
	        # buffer_chunk_size=1, # does not work with this -- extracted audio would be too short
	        format="rgb24",
	        frame_rate=frame_rate,
	    )

	    reader.fill_buffer()
	    video_chunk = reader.pop_chunks()[0]
	    if video_chunk is None:
	        # pop_chunks() yields None for a stream whose buffer holds no frames;
	        # fail here with a clear message instead of on `.shape` below.
	        raise RuntimeError(f'No video frames could be decoded from {video_path}')

	    # Decoded chunks are laid out as (time, channel, height, width).
	    height, width = video_chunk.shape[-2:]

	    # Streams are indexed in the order they are added: 0 = audio, 1 = video.
	    writer = StreamingMediaEncoder(output_path)
	    writer.add_audio_stream(
	        sample_rate=sample_rate,
	        num_channels=audio.shape[-1],
	        encoder="libmp3lame",
	    )
	    writer.add_video_stream(
	        frame_rate=frame_rate,
	        width=width,
	        height=height,
	        format="rgb24",
	        encoder="libx264",
	        encoder_format="yuv420p",
	    )

	    with writer.open():
	        writer.write_audio_chunk(0, audio.float())
	        writer.write_video_chunk(1, video_chunk)


	if __name__ == '__main__':
	    # Usage example: python video_joiner.py <input video> <output video>
	    # Merges four seconds of random single-channel noise at 16 kHz into the
	    # input video and writes the result to the output path.
	    import sys

	    demo_sample_rate = 16000
	    demo_seconds = 4
	    audio = torch.randn(demo_sample_rate * demo_seconds, 1)
	    merge_audio_into_video(
	        sys.argv[1],
	        sys.argv[2],
	        audio,
	        demo_sample_rate,
	        demo_seconds,
	    )