Moon11111 committed
Commit e3b1be4 · verified · 1 Parent(s): 4fa3412

Create app.py

Files changed (1): app.py (+134 -0)
app.py ADDED
import tempfile
import os

from flask import Flask, request, jsonify
from omegaconf import OmegaConf
import torch
from diffusers import AutoencoderKL, DDIMScheduler
from diffusers.utils.import_utils import is_xformers_available
from accelerate.utils import set_seed

from latentsync.models.unet import UNet3DConditionModel
from latentsync.pipelines.lipsync_pipeline import LipsyncPipeline
from latentsync.whisper.audio2feature import Audio2Feature

# Initialize the Flask app
app = Flask(__name__)

def run_inference(video_path, audio_path, video_out_path,
                  inference_ckpt_path, unet_config_path="configs/unet/second_stage.yaml",
                  inference_steps=20, guidance_scale=1.0, seed=1247):
    # Load the UNet configuration
    config = OmegaConf.load(unet_config_path)

    # Pick the Whisper checkpoint that matches the UNet's cross-attention width
    if config.model.cross_attention_dim == 768:
        whisper_model_path = "checkpoints/whisper/small.pt"
    elif config.model.cross_attention_dim == 384:
        whisper_model_path = "checkpoints/whisper/tiny.pt"
    else:
        raise NotImplementedError("cross_attention_dim must be 768 or 384")

    # Use fp16 only on GPUs with compute capability above 7.x (Ampere or newer)
    is_fp16_supported = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] > 7
    dtype = torch.float16 if is_fp16_supported else torch.float32

    # Set up the DDIM scheduler from the scheduler config in configs/
    scheduler = DDIMScheduler.from_pretrained("configs")

    # Initialize the audio encoder
    audio_encoder = Audio2Feature(model_path=whisper_model_path,
                                  device="cuda", num_frames=config.data.num_frames)

    # Load the VAE and pin the Stable Diffusion latent scaling factors
    vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", torch_dtype=dtype)
    vae.config.scaling_factor = 0.18215
    vae.config.shift_factor = 0

    # Load the UNet weights from the checkpoint
    unet, _ = UNet3DConditionModel.from_pretrained(
        OmegaConf.to_container(config.model),
        inference_ckpt_path,  # load checkpoint
        device="cpu",
    )
    unet = unet.to(dtype=dtype)

    # Enable memory-efficient attention when xformers is installed
    if is_xformers_available():
        unet.enable_xformers_memory_efficient_attention()

    # Assemble the pipeline and move it to the GPU
    pipeline = LipsyncPipeline(
        vae=vae,
        audio_encoder=audio_encoder,
        unet=unet,
        scheduler=scheduler,
    ).to("cuda")

    # Seed for reproducibility; pass -1 to draw a fresh random seed
    if seed != -1:
        set_seed(seed)
    else:
        torch.seed()

    # Run the pipeline
    pipeline(
        video_path=video_path,
        audio_path=audio_path,
        video_out_path=video_out_path,
        video_mask_path=video_out_path.replace(".mp4", "_mask.mp4"),
        num_frames=config.data.num_frames,
        num_inference_steps=inference_steps,
        guidance_scale=guidance_scale,
        weight_dtype=dtype,
        width=config.data.resolution,
        height=config.data.resolution,
    )

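# Note: run_inference assumes the YAML config provides a `model` section
# (passed wholesale to UNet3DConditionModel.from_pretrained) and a `data`
# section. A sketch of the fields it reads directly, with hypothetical
# values rather than the ones shipped with LatentSync:
#
#   model:
#     cross_attention_dim: 384   # 768 selects whisper small.pt, 384 tiny.pt
#     # ... remaining UNet constructor arguments ...
#   data:
#     num_frames: 16
#     resolution: 256
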
@app.route('/lipsync', methods=['POST'])
def lipsync_endpoint():
    # Both a video and an audio file must be present in the request
    if 'video' not in request.files or 'audio' not in request.files:
        return jsonify({'error': 'Both video and audio files are required.'}), 400

    video_file = request.files['video']
    audio_file = request.files['audio']

    # Save the uploads to temporary files
    temp_video = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4")
    temp_audio = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    video_file.save(temp_video.name)
    audio_file.save(temp_audio.name)

    # Create a temporary file for the output video
    output_video = tempfile.NamedTemporaryFile(delete=False, suffix=".mp4").name

    # Additional parameters can be passed via form data (e.g., checkpoint path)
    inference_ckpt_path = request.form.get('inference_ckpt_path', 'checkpoints/latentsync_unet.pt')
    unet_config_path = request.form.get('unet_config_path', 'configs/unet/second_stage.yaml')

    try:
        run_inference(
            video_path=temp_video.name,
            audio_path=temp_audio.name,
            video_out_path=output_video,
            inference_ckpt_path=inference_ckpt_path,
            unet_config_path=unet_config_path,
            inference_steps=int(request.form.get('inference_steps', 20)),
            guidance_scale=float(request.form.get('guidance_scale', 1.0)),
            seed=int(request.form.get('seed', 1247)),
        )
        # Return the server-side output path; a streaming variant is sketched below
        return jsonify({'output_video': output_video}), 200
    except Exception as e:
        return jsonify({'error': str(e)}), 500
    finally:
        # The uploaded inputs are no longer needed; the output file is kept
        # so the caller can retrieve it
        os.remove(temp_video.name)
        os.remove(temp_audio.name)

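# A variant that streams the finished video back to the caller instead of
# returning a server-side temp path (a sketch using Flask's send_file, which
# would replace the jsonify success return above; download_name needs Flask 2.0+):
#
#   from flask import send_file
#   return send_file(output_video, mimetype="video/mp4",
#                    as_attachment=True, download_name="lipsync_output.mp4")
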
if __name__ == "__main__":
    # Use pyngrok to expose the local server to the internet
    from pyngrok import ngrok
    public_url = ngrok.connect(5000)
    print(" * ngrok tunnel available at:", public_url)

    # Run the Flask app on port 5000
    app.run(port=5000)
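
For reference, a client call against the running service might look like the following. This is a minimal sketch using the requests library; the input file names and the localhost URL are placeholders (when the app is exposed through ngrok, the printed public URL would be used instead):

import requests

# Hypothetical input files; any .mp4 / .wav pair accepted by LatentSync works
with open("face.mp4", "rb") as video, open("speech.wav", "rb") as audio:
    response = requests.post(
        "http://127.0.0.1:5000/lipsync",
        files={"video": video, "audio": audio},
        data={"inference_steps": "20", "guidance_scale": "1.0", "seed": "1247"},
    )

print(response.status_code, response.json())
# e.g. 200 {'output_video': '/tmp/tmpXXXXXXXX.mp4'} on success

Note that run_inference rebuilds the audio encoder, VAE, UNet, and pipeline on every request, so each call pays the full model-loading cost on top of inference; a long-running deployment would typically construct the pipeline once at import time and reuse it across requests.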