| | import gradio as gr |
| | import torch |
| | import numpy as np |
| | from diffusers import I2VGenXLPipeline |
| | from transformers import MusicgenForConditionalGeneration, AutoProcessor |
| | from PIL import Image |
| | from moviepy.editor import ImageSequenceClip |
| | import io |
| | import scipy.io.wavfile |
| | import ffmpeg |
| |
|
def generate_video(image, prompt, negative_prompt, video_length):
    """Render a silent MP4 from a still image with I2VGen-XL, yielding progress.

    This is a generator function. Each iteration yields the completed
    fraction (a float in ``(0, 1]``). The path of the written file is
    delivered as the generator's *return value* (``StopIteration.value``),
    so callers must exhaust the generator to obtain it.

    Args:
        image: Conditioning image passed to the pipeline as ``image=``.
        prompt: Text prompt for the video.
        negative_prompt: Text describing what to avoid.
        video_length: Requested duration in seconds; truncated to an int.

    Yields:
        float: ``frames_done / total_frames`` after each pipeline call.

    Returns:
        str: ``"output_video.mp4"``, the file written in the working directory.

    Raises:
        ValueError: If ``video_length`` is not a positive whole number. Raised
            on the first ``next()`` call, before any model is loaded.
    """
    # Validate first so a bad request fails before the checkpoint is loaded.
    try:
        seconds = int(video_length)
    except (TypeError, ValueError) as err:
        raise ValueError(
            f"video_length must be a whole number of seconds, got {video_length!r}"
        ) from err
    if seconds <= 0:
        raise ValueError(f"video_length must be positive, got {video_length!r}")

    # One frame rate for both the frame budget and the encoder, so the clip
    # really lasts `seconds` (previously 20 frames/s were played back at 30).
    fps = 20
    total_frames = seconds * fps

    # Fixed seed; torch.manual_seed returns the default generator it seeded.
    generator = torch.manual_seed(8888)

    device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
    print(f"Using device: {device}")

    pipeline = I2VGenXLPipeline.from_pretrained("ali-vilab/i2vgen-xl", torch_dtype=torch.float32)
    pipeline.to(device)

    frames = []
    for i in range(total_frames):
        result = pipeline(
            prompt=prompt,
            image=image,
            num_inference_steps=1,
            negative_prompt=negative_prompt,
            guidance_scale=9.0,
            generator=generator,
            num_frames=1,
        )
        # `.frames[0]` is the sequence of frames of the first video, not a
        # single image; unpack it so every stored entry is one H x W x C array.
        frames.extend(np.array(img) for img in result.frames[0])

        yield (i + 1) / total_frames

    output_file = "output_video.mp4"
    clip = ImageSequenceClip(frames, fps=fps)
    clip.write_videofile(output_file, codec='libx264', audio=False)

    return output_file
| |
|
def generate_music(prompt, unconditional=False):
    """Generate a short audio clip with MusicGen and save it as a 16-bit WAV.

    Args:
        prompt: Text description fed to the processor; unused when
            ``unconditional`` is truthy.
        unconditional: When truthy, sample from the model's unconditional
            inputs instead of conditioning on ``prompt``.

    Returns:
        str: ``"musicgen_out.wav"``, the file written in the working directory.
    """
    musicgen = MusicgenForConditionalGeneration.from_pretrained("facebook/musicgen-small")
    target = "cuda:0" if torch.cuda.is_available() else "cpu"
    musicgen.to(target)

    if not unconditional:
        # Text-conditioned path: tokenise the prompt and move it to the model's device.
        text_processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
        encoded = text_processor(text=prompt, padding=True, return_tensors="pt")
        generated = musicgen.generate(
            **encoded.to(target),
            do_sample=True,
            guidance_scale=3,
            max_new_tokens=256,
        )
    else:
        seed_inputs = musicgen.get_unconditional_inputs(num_samples=1)
        generated = musicgen.generate(**seed_inputs, do_sample=True, max_new_tokens=256)

    # First item of the batch; if that is still multi-dimensional keep only
    # its first row (presumably the first audio channel -- confirm).
    waveform = generated[0].cpu().numpy()
    if waveform.ndim > 1:
        waveform = waveform[0]

    # Clamp to [-1, 1] and scale to signed 16-bit PCM for the WAV writer.
    pcm = (np.clip(waveform, -1.0, 1.0) * 32767).astype(np.int16)

    wav_path = "musicgen_out.wav"
    rate = musicgen.config.audio_encoder.sampling_rate
    scipy.io.wavfile.write(wav_path, rate, pcm)

    return wav_path
| |
|
def combine_audio_video(audio_file, video_file):
    """Mux an audio file and a video file into a single MP4 using ffmpeg.

    The video stream is copied without re-encoding and the audio is encoded
    as AAC.

    Args:
        audio_file: Path of the audio input.
        video_file: Path of the video input.

    Returns:
        str: ``"combined_output.mp4"``, the file written in the working directory.
    """
    output_file = "combined_output.mp4"
    audio = ffmpeg.input(audio_file)
    video = ffmpeg.input(video_file)
    output = ffmpeg.output(video, audio, output_file, vcodec='copy', acodec='aac')
    # The output name is fixed, so every run after the first finds an existing
    # file; without -y ffmpeg would ask for confirmation instead of writing.
    ffmpeg.run(output, overwrite_output=True)
    return output_file
| |
|
def _resolve_video_path(result):
    """Return the final value of *result*, exhausting it if it is a generator."""
    if not hasattr(result, "__next__"):
        return result
    while True:
        try:
            next(result)  # intermediate progress values are discarded here
        except StopIteration as finished:
            # A generator's `return x` surfaces as StopIteration.value.
            return finished.value


def interface(image_path, prompt, negative_prompt, video_length, music_prompt, unconditional):
    """Gradio click handler: build the video, build the music, and mux them.

    Args:
        image_path: Filesystem path of the uploaded image.
        prompt: Text prompt for the video.
        negative_prompt: Negative prompt for the video.
        video_length: Requested video duration in seconds.
        music_prompt: Text prompt for the music.
        unconditional: Whether to generate music without a text prompt.

    Returns:
        str: Path of the combined MP4 returned by ``combine_audio_video``.

    Raises:
        ValueError: If no image was supplied.
    """
    if not image_path:
        raise ValueError("Please upload an image before generating.")

    # Load the pixels and close the file right away. RGB conversion drops any
    # alpha channel or palette (the diffusers I2VGen-XL example does the same).
    with Image.open(image_path) as uploaded:
        image = uploaded.convert("RGB")

    # generate_video is a generator function: calling it only creates the
    # generator. It must be run to completion to obtain the output path.
    video_file = _resolve_video_path(
        generate_video(image, prompt, negative_prompt, video_length)
    )
    audio_file = generate_music(music_prompt, unconditional)
    combined_file = combine_audio_video(audio_file, video_file)
    return combined_file
| |
|
# Gradio front end: one row of inputs, a trigger button and a video player
# that shows the file path returned by `interface`.
with gr.Blocks() as demo:
    gr.Markdown("# AI-Powered Video and Music Generation")

    with gr.Row():
        # type="filepath" hands the click handler a path, which `interface` opens.
        image_input = gr.Image(label="Upload Image", type="filepath")
        prompt_input = gr.Textbox(label="Enter the Video Prompt")
        negative_prompt_input = gr.Textbox(label="Enter the Negative Prompt")
        video_length_input = gr.Number(value=10, precision=0, label="Video Length (seconds)")
        music_prompt_input = gr.Textbox(label="Enter the Music Prompt")
        unconditional_checkbox = gr.Checkbox(label="Generate Unconditional Music")

    generate_button = gr.Button("Generate Video and Music")
    output_video = gr.Video(label="Output Video with Sound")

    # Input order must match the positional parameters of `interface`.
    generate_button.click(
        fn=interface,
        inputs=[
            image_input,
            prompt_input,
            negative_prompt_input,
            video_length_input,
            music_prompt_input,
            unconditional_checkbox,
        ],
        outputs=output_video,
        show_progress=True,
    )

demo.launch()
| |
|