Update hf_gradio_app.py
hf_gradio_app.py  +32 -3  CHANGED
@@ -1,5 +1,8 @@
 import os, random, time
 import uuid
+import tempfile
+from pydub import AudioSegment
+import gradio as gr
 from huggingface_hub import snapshot_download

 # Download models
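The new pydub import is what the audio-trimming helper added further down relies on. As a minimal sketch (assuming pydub is installed; this snippet is not part of the commit), AudioSegment durations and slice indices are expressed in milliseconds, which is why the 4-second cap below is written as 4 * 1000:

from pydub import AudioSegment

# pydub measures AudioSegment durations and slice indices in milliseconds.
clip = AudioSegment.silent(duration=10_000)   # 10 s of silence as a stand-in input
print(len(clip))             # 10000 (milliseconds)
trimmed = clip[:4 * 1000]    # keep only the first 4 seconds
print(len(trimmed))          # 4000 (milliseconds)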
@@ -70,8 +73,36 @@ with torch.inference_mode():
     pipeline = VideoPipeline(vae=vae, reference_net=reference_net, diffusion_net=diffusion_net, scheduler=noise_scheduler, image_proj=image_proj)
     pipeline.to(device=device, dtype=weight_dtype)

+def process_audio(file_path):
+    # Create a temporary directory
+    with tempfile.TemporaryDirectory() as temp_dir:
+        # Load the audio file
+        audio = AudioSegment.from_file(file_path)
+
+        # Check and cut the audio if longer than 4 seconds
+        max_duration = 4 * 1000  # 4 seconds in milliseconds
+        if len(audio) > max_duration:
+            audio = audio[:max_duration]
+
+        # Save the processed audio in the temporary directory
+        output_path = os.path.join(temp_dir, "trimmed_audio.wav")
+        audio.export(output_path, format="wav")
+
+        # Temporary file is available here for use
+        print(f"Processed audio saved at: {output_path}")
+
+        # Return the path for reference (optional)
+        return output_path
+
 @torch.inference_mode()
 def generate(input_video, input_audio, seed, progress=gr.Progress(track_tqdm=True)):
+
+    is_shared_ui = True if "fffiloni/MEMO" in os.environ['SPACE_ID'] else False
+
+    if is_shared_ui:
+        input_audio = process_audio(input_audio)
+        print(f"Processed file was stored temporarily at: {input_audio}")
+
     resolution = 512
     num_generated_frames_per_clip = 16
     fps = 30
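One caveat with process_audio as committed: the trimmed WAV is written inside a `with tempfile.TemporaryDirectory()` block, and that directory (including the returned path) is deleted as soon as the block exits, so the file may no longer exist when the pipeline later reads input_audio. Likewise, os.environ['SPACE_ID'] raises KeyError when the script runs outside a Space. Below is a minimal sketch of a variant that keeps the trimmed file on disk and reads the environment variable defensively; the name process_audio_persistent is hypothetical and not part of this commit:

import os
import tempfile
from pydub import AudioSegment

def process_audio_persistent(file_path, max_duration_ms=4 * 1000):
    # Load and trim exactly as process_audio does above.
    audio = AudioSegment.from_file(file_path)
    if len(audio) > max_duration_ms:
        audio = audio[:max_duration_ms]

    # mkdtemp() is not auto-deleted, so the trimmed file outlives this function;
    # the caller (or the OS temp cleaner) is responsible for removing it later.
    output_path = os.path.join(tempfile.mkdtemp(), "trimmed_audio.wav")
    audio.export(output_path, format="wav")
    return output_path

# .get() avoids a KeyError when SPACE_ID is unset (e.g. when running locally).
is_shared_ui = "fffiloni/MEMO" in os.environ.get("SPACE_ID", "")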
@@ -157,8 +188,6 @@ def generate(input_video, input_audio, seed, progress=gr.Progress(track_tqdm=True)):

     return video_path

-import gradio as gr
-
 with gr.Blocks(analytics_enabled=False) as demo:
     with gr.Column():
         gr.Markdown("# MEMO: Memory-Guided Diffusion for Expressive Talking Video Generation")
@@ -185,7 +214,7 @@ with gr.Blocks(analytics_enabled=False) as demo:
         with gr.Row():
             with gr.Column():
                 input_video = gr.Image(label="Upload Input Image", type="filepath")
-                input_audio = gr.Audio(label="Upload Input Audio", type="filepath")
+                input_audio = gr.Audio(label="Upload Input Audio", type="filepath", info="On shared UI, audio length is trimmed to max 4 seconds")
                 seed = gr.Number(label="Seed (0 for Random)", value=0, precision=0)
             with gr.Column():
                 video_output = gr.Video(label="Generated Video")
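The hunks above only touch the input components; the wiring between them and generate() sits outside this diff. A minimal, hypothetical sketch of how such components are typically connected in a Gradio Blocks app follows (the button name, layout, and the stub generate() are assumptions standing in for the real function defined earlier in hf_gradio_app.py):

import gradio as gr

def generate(input_video, input_audio, seed):
    # Stand-in for the real generate() defined earlier in hf_gradio_app.py.
    return None

with gr.Blocks(analytics_enabled=False) as demo:
    input_video = gr.Image(label="Upload Input Image", type="filepath")
    input_audio = gr.Audio(label="Upload Input Audio", type="filepath")
    seed = gr.Number(label="Seed (0 for Random)", value=0, precision=0)
    video_output = gr.Video(label="Generated Video")

    # Hypothetical trigger: pass the three inputs to generate() and show the result.
    generate_btn = gr.Button("Generate")
    generate_btn.click(fn=generate, inputs=[input_video, input_audio, seed], outputs=video_output)

demo.launch()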