Moon11111 committed on
Commit
0b0af61
·
verified ·
1 Parent(s): c644e10

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +329 -13
app.py CHANGED
@@ -1,16 +1,332 @@
1
- try:
2
- from flask import Flask, request, jsonify
3
- except ModuleNotFoundError:
4
- import subprocess, sys
5
- subprocess.check_call([sys.executable, "-m", "pip", "install", "flask"])
6
- from flask import Flask, request, jsonify
7
-
8
- # The rest of your app code follows...
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  app = Flask(__name__)
 
 
 
10
 
11
- @app.route('/')
12
- def hello():
13
- return jsonify({"message": "Hello, Flask!"})
14
 
15
- if __name__ == "__main__":
16
- app.run(port=5000)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import tempfile
2
+ import os
3
+ import shutil
4
+ import librosa
5
+ import json
6
+ import subprocess
7
+ import gc
8
+ from googletrans import Translator
9
+ import asyncio
10
+ from flask import Flask, request, jsonify, send_from_directory
11
+ from omegaconf import OmegaConf
12
+ import torch
13
+ from diffusers import AutoencoderKL, DDIMScheduler
14
+ from latentsync.models.unet import UNet3DConditionModel
15
+ from latentsync.pipelines.lipsync_pipeline import LipsyncPipeline
16
+ from diffusers.utils.import_utils import is_xformers_available
17
+ from accelerate.utils import set_seed
18
+ from latentsync.whisper.audio2feature import Audio2Feature
19
+ from openai import OpenAI
20
+ from elevenlabs import set_api_key, generate, play, clone, Voice, VoiceSettings
21
+ from torch.cuda.amp import autocast
22
+
23
# Initialize the Flask app
app = Flask(__name__)
# Per-request scratch space; generate_video() assigns a
# tempfile.TemporaryDirectory here, and helpers read TEMP_DIR.name.
TEMP_DIR = None
# Directory where finished videos are published; the /videos/<filename>
# route serves files from here.
VIDEO_DIRECTORY = os.path.abspath("videos")
os.makedirs(VIDEO_DIRECTORY, exist_ok=True)
28
 
29
def clear_cuda_memory():
    """Release GPU memory held by PyTorch's caching allocator.

    Runs Python garbage collection first so tensors that are only
    reachable through collectable objects are actually freed, then asks
    PyTorch to return its now-unreferenced cached blocks to the driver.
    """
    # Ordering fix: collecting BEFORE empty_cache() lets the allocator
    # release blocks that were still pinned by garbage Python objects.
    gc.collect()
    # Guard keeps CPU-only hosts from touching the CUDA runtime.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
32
 
33
def run_inference(video_path, audio_path, video_out_path,
                  inference_ckpt_path, unet_config_path="configs/unet/second_stage.yaml",
                  inference_steps=20, guidance_scale=1.0, seed=1247):
    """Run the LatentSync lip-sync diffusion pipeline on one video/audio pair.

    Args:
        video_path: source video whose face is re-lip-synced.
        audio_path: driving audio track.
        video_out_path: output video path; a mask video is written next to
            it with the ".mp4" suffix replaced by "_mask.mp4".
        inference_ckpt_path: UNet checkpoint file to load.
        unet_config_path: OmegaConf YAML with model/data settings.
        inference_steps: number of DDIM denoising steps.
        guidance_scale: classifier-free guidance weight.
        seed: RNG seed; -1 requests a nondeterministic torch seed.

    Raises:
        NotImplementedError: if config.model.cross_attention_dim is
            neither 768 nor 384.
    """
    clear_cuda_memory()

    # Load configuration
    config = OmegaConf.load(unet_config_path)

    # Determine proper dtype based on GPU capabilities
    # (compute capability > 7, i.e. Ampere or newer, runs fp16; else fp32).
    is_fp16_supported = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] > 7
    dtype = torch.float16 if is_fp16_supported else torch.float32

    # Setup scheduler (DDIM config is read from the local "configs" dir)
    scheduler = DDIMScheduler.from_pretrained("configs")

    # Choose whisper model based on config settings — the whisper feature
    # width must match the UNet's cross-attention width.
    if config.model.cross_attention_dim == 768:
        whisper_model_path = "checkpoints/whisper/small.pt"
    elif config.model.cross_attention_dim == 384:
        whisper_model_path = "checkpoints/whisper/tiny.pt"
    else:
        raise NotImplementedError("cross_attention_dim must be 768 or 384")

    # Initialize the audio encoder
    audio_encoder = Audio2Feature(model_path=whisper_model_path,
                                  device="cuda", num_frames=config.data.num_frames)

    # Load VAE
    vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", torch_dtype=dtype)
    vae.config.scaling_factor = 0.18215  # standard SD latent scaling constant
    vae.config.shift_factor = 0

    # Load UNet model from the checkpoint (built on CPU first, moved later)
    unet, _ = UNet3DConditionModel.from_pretrained(
        OmegaConf.to_container(config.model),
        inference_ckpt_path,  # load checkpoint
        device="cpu",
    )
    unet = unet.to(dtype=dtype)

    # Optionally enable memory-efficient attention if available
    if is_xformers_available():
        unet.enable_xformers_memory_efficient_attention()

    # Initialize the pipeline and move to GPU
    pipeline = LipsyncPipeline(
        vae=vae,
        audio_encoder=audio_encoder,
        unet=unet,
        scheduler=scheduler,
    ).to("cuda")

    # Set seed (-1 means let torch pick a nondeterministic seed)
    if seed != -1:
        set_seed(seed)
    else:
        torch.seed()

    # NOTE(review): autocast() is entered even when dtype is float32
    # (pre-Ampere GPUs) — confirm mixed precision is intended there.
    with autocast():
        try:
            pipeline(
                video_path=video_path,
                audio_path=audio_path,
                video_out_path=video_out_path,
                video_mask_path=video_out_path.replace(".mp4", "_mask.mp4"),
                num_frames=config.data.num_frames,
                num_inference_steps=inference_steps,
                guidance_scale=guidance_scale,
                weight_dtype=dtype,
                width=config.data.resolution,
                height=config.data.resolution,
            )
        finally:
            # Always free GPU memory, even when the pipeline raises.
            clear_cuda_memory()
107
+
108
def create_temp_dir():
    """Create and return a fresh TemporaryDirectory handle.

    The caller owns the handle: the directory exists until cleanup() is
    called or the handle is garbage-collected.
    """
    scratch = tempfile.TemporaryDirectory()
    return scratch
110
+
111
def generate_audio(voice_cloning, text_prompt):
    """Synthesize speech for *text_prompt* with ElevenLabs and return the mp3 path.

    Args:
        voice_cloning: 'yes' for the cloned "Marc" voice, 'no' for the
            stock "Daniel" voice.
        text_prompt: text to speak.

    Returns:
        Path of the generated mp3 inside TEMP_DIR.

    Raises:
        ValueError: for any other voice_cloning value (previously this
            silently returned None and crashed downstream).
    """
    # SECURITY: the API key was hardcoded in source. Prefer the
    # environment; the literal remains only as a compatible fallback and
    # should be rotated/removed.
    set_api_key(os.environ.get('ELEVENLABS_API_KEY', '92e149985ea2732b4359c74346c3daee'))

    if voice_cloning == 'yes':
        print('Entering Custom Audio creation using elevenlabs')
        voice = Voice(voice_id="VJpttplXHolgV2leGe5V", name="Marc",
                      settings=VoiceSettings(stability=0.71, similarity_boost=0.9,
                                             style=0.0, use_speaker_boost=True))
        prefix = "cloned_audio_"
    elif voice_cloning == 'no':
        print('Entering Default Audio creation using elevenlabs')
        voice = "Daniel"
        prefix = "default_audio_"
    else:
        raise ValueError(f"Unsupported voice_cloning option: {voice_cloning!r}")

    # Stream the synthesized audio chunks straight into a temp file.
    audio = generate(text=text_prompt, voice=voice, model="eleven_multilingual_v2",
                     stream=True, latency=4)
    with tempfile.NamedTemporaryFile(suffix=".mp3", prefix=prefix,
                                     dir=TEMP_DIR.name, delete=False) as temp_file:
        for chunk in audio:
            temp_file.write(chunk)
        driven_audio_path = temp_file.name
    print('driven_audio_path', driven_audio_path)
    return driven_audio_path
138
+
139
+
140
+
141
def get_video_duration(video_path):
    """Return the duration of *video_path* in seconds, read via ffprobe.

    Raises:
        RuntimeError: if ffprobe exits non-zero (previously the failure
            surfaced as an opaque JSONDecodeError on empty stdout).
    """
    cmd = [
        "ffprobe", "-v", "error", "-show_entries", "format=duration",
        "-of", "json", video_path
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)
    # Robustness fix: report ffprobe failures explicitly.
    if result.returncode != 0:
        raise RuntimeError(f"ffprobe failed for {video_path}: {result.stderr.strip()}")
    duration = json.loads(result.stdout)["format"]["duration"]
    return float(duration)
150
+
151
+
152
def extend_video_simple(video_path, audio_path, output_path):
    """Extend the video by appending one reversed copy when the audio is longer.

    Writes the (possibly extended) silent video to *output_path*; when the
    audio already fits, the source video is copied through unchanged.
    """
    audio_duration = librosa.get_duration(path=audio_path)
    video_duration = get_video_duration(video_path)

    print(f"Video Duration: {video_duration:.2f} sec")
    print(f"Audio Duration: {audio_duration:.2f} sec")

    if audio_duration > video_duration:
        print("Extending video by adding reversed version.")

        # Create a reversed version of the full video
        reversed_clip = tempfile.NamedTemporaryFile(dir=TEMP_DIR.name, delete=False, suffix=".mp4").name

        # Security/robustness fix: argument lists instead of shell=True
        # f-strings (paths with spaces or shell metacharacters previously
        # broke the command or allowed injection); check=True surfaces
        # ffmpeg failures instead of continuing silently.
        subprocess.run(
            ["ffmpeg", "-y", "-i", video_path, "-vf", "reverse", "-an", reversed_clip],
            check=True,
        )

        # Merge original + reversed
        subprocess.run(
            ["ffmpeg", "-y", "-i", video_path, "-i", reversed_clip,
             "-filter_complex", "[0:v:0][1:v:0]concat=n=2:v=1[outv]",
             "-map", "[outv]", "-an", output_path],
            check=True,
        )
    else:
        print("Audio is not longer than video. No extension needed.")
        # Portability fix: shutil.copy instead of shelling out to `cp`.
        shutil.copy(video_path, output_path)
178
+
179
+
180
def extend_video_loop(video_path, audio_path, output_path):
    """Repeat original+reversed copies of the video until it covers the audio.

    Writes the (possibly extended) silent video to *output_path*; when the
    audio already fits, the source video is copied through unchanged.
    """
    audio_duration = librosa.get_duration(path=audio_path)
    video_duration = get_video_duration(video_path)

    print(f"Video Duration: {video_duration:.2f} sec")
    print(f"Audio Duration: {audio_duration:.2f} sec")

    if audio_duration > video_duration:
        print("Extending video by repeating original and reversed versions.")

        # Create reversed video
        reversed_clip = tempfile.NamedTemporaryFile(dir=TEMP_DIR.name, delete=False, suffix=".mp4").name
        # Security/robustness fix: argument lists instead of shell=True
        # f-strings (paths with spaces or shell metacharacters previously
        # broke the command or allowed injection); check=True surfaces
        # ffmpeg failures instead of continuing silently.
        subprocess.run(
            ["ffmpeg", "-y", "-i", video_path, "-vf", "reverse", "-an", reversed_clip],
            check=True,
        )

        # Alternate original/reversed clips until the total length reaches
        # or exceeds the audio duration.
        video_clips = [video_path, reversed_clip]
        total_duration = video_duration * 2  # Original + reversed

        while total_duration < audio_duration:
            video_clips.append(video_path)
            video_clips.append(reversed_clip)
            total_duration += video_duration * 2

        print(f"Total Clips: {len(video_clips)}")

        # Use FFmpeg filter_complex concat for seamless merging
        concat_filter = "".join(f"[{i}:v:0]" for i in range(len(video_clips))) + f"concat=n={len(video_clips)}:v=1[outv]"
        cmd = ["ffmpeg", "-y"]
        for clip in video_clips:
            cmd.extend(["-i", clip])
        cmd.extend(["-filter_complex", concat_filter, "-map", "[outv]", "-an", output_path])
        subprocess.run(cmd, check=True)

        print(f"Extended video saved to {output_path}")

    else:
        print("Audio is not longer than video. No extension needed.")
        # Portability fix: shutil.copy instead of shelling out to `cp`.
        shutil.copy(video_path, output_path)
222
+
223
+
224
def translate_text(text, target_language):
    """Translate *text* into *target_language* ('english' or 'hindi').

    Returns "" for blank input, and the original text when the language is
    unsupported or translation fails.
    """
    if not text or text.strip() == "":
        return ""
    LANGUAGE_CODES = {"english": "en", "hindi": "hi"}
    # Convert language name to code; robustness fix: previously an unknown
    # language passed dest=None straight into the translator.
    target_language_code = LANGUAGE_CODES.get(target_language.lower())
    if target_language_code is None:
        print(f"Unsupported target language: {target_language}")
        return text
    try:
        # googletrans 4.x exposes an async API, so wrap the call.
        async def perform_translation():
            translator = Translator()
            result = await translator.translate(text, dest=target_language_code)
            return result.text if hasattr(result, 'text') else text

        # asyncio.run replaces the manual new_event_loop/set/close dance.
        return asyncio.run(perform_translation())
    except Exception as e:
        # Bug fix: this handler referenced an undefined `logger`, so any
        # translation failure raised NameError instead of falling back.
        print(f"Error translating text: {e}")
        # Return original text if translation fails
        return text
249
+
250
+
251
@app.route('/run', methods=['POST'])
def generate_video():
    """POST /run — synthesize speech for a text prompt and lip-sync it onto the uploaded video.

    Form fields: video (file, required), text_prompt (required),
    voice_cloning ('yes'/'no', default 'no'), target_language
    ('original_text' or a supported language), plus optional overrides
    inference_ckpt_path, unet_config_path, inference_steps,
    guidance_scale, seed.

    Returns JSON with a /videos/<filename> URL on success; an error
    payload with status 400/500 otherwise.
    """
    global TEMP_DIR
    TEMP_DIR = create_temp_dir()

    try:
        if 'video' not in request.files:
            return jsonify({'error': 'Video file is required.'}), 400

        video_file = request.files['video']
        # Bug fix: request.form['text_prompt'] raised KeyError (HTTP 500)
        # when the field was missing; .get() lets the blank-prompt check
        # answer with a proper 400 instead.
        text_prompt = request.form.get('text_prompt', '')
        print('Input text prompt: ', text_prompt)
        text_prompt = text_prompt.strip()
        if not text_prompt:
            return jsonify({'error': 'Input text prompt cannot be blank'}), 400

        voice_cloning = request.form.get('voice_cloning', 'no')
        target_language = request.form.get('target_language', 'original_text')

        if target_language != 'original_text':
            response = translate_text(text_prompt, target_language)
            text_prompt = response.strip()
            print('Translated input text prompt: ', text_prompt)

        temp_audio_path = generate_audio(voice_cloning, text_prompt)
        with tempfile.NamedTemporaryFile(suffix=".mp4", prefix="input_", dir=TEMP_DIR.name, delete=False) as temp_file:
            temp_video_path = temp_file.name
            video_file.save(temp_video_path)
        print('temp_video_path', temp_video_path)

        # Optional overrides supplied via form data.
        inference_ckpt_path = request.form.get('inference_ckpt_path', 'checkpoints/latentsync_unet.pt')
        unet_config_path = request.form.get('unet_config_path', 'configs/unet/second_stage.yaml')

        output_video = tempfile.NamedTemporaryFile(dir=TEMP_DIR.name, delete=False, suffix=".mp4").name
        # Loop/reverse the source video until it covers the generated audio.
        extend_video_loop(temp_video_path, temp_audio_path, output_video)
        final_output_video = tempfile.NamedTemporaryFile(dir=TEMP_DIR.name, delete=False, suffix="_final_extended.mp4").name

        run_inference(
            video_path=output_video,
            audio_path=temp_audio_path,
            video_out_path=final_output_video,
            inference_ckpt_path=inference_ckpt_path,
            unet_config_path=unet_config_path,
            inference_steps=int(request.form.get('inference_steps', 20)),
            guidance_scale=float(request.form.get('guidance_scale', 1.0)),
            seed=int(request.form.get('seed', 1247))
        )

        # Publish the result where the /videos/<filename> route can serve it.
        filename = os.path.basename(final_output_video)
        print("VIDEO_DIRECTORY: ", VIDEO_DIRECTORY)
        destination_path = os.path.join(VIDEO_DIRECTORY, filename)
        shutil.copy(final_output_video, destination_path)
        # Bug fix: the returned URL previously contained a literal
        # placeholder instead of the generated filename.
        video_url = f"/videos/{filename}"

        return jsonify({"message": "Video processed and saved successfully.",
                        "output_video": video_url,
                        "status": "success"}), 200
    except Exception as e:
        return jsonify({'error': str(e)}), 500
    finally:
        # Leak fix: the per-request scratch directory (upload, generated
        # audio, intermediate clips) was never removed. The final result
        # has already been copied into VIDEO_DIRECTORY by this point.
        TEMP_DIR.cleanup()
320
+
321
# Serve published results out of the shared output directory.
@app.route("/videos/<string:filename>", methods=['GET'])
def serve_video(filename):
    """Stream a previously generated video inline (not as a download)."""
    return send_from_directory(VIDEO_DIRECTORY, filename, as_attachment=False)
325
+
326
@app.route("/health", methods=["GET"])
def health_status():
    """Liveness probe: always reports the service as online."""
    return jsonify({"online": "true"})
330
+
331
# Entry point: launch the Flask development server on the default
# host/port (127.0.0.1:5000).
# NOTE(review): debug=True enables the auto-reloader and the Werkzeug
# debugger — it must not be left on in production.
if __name__ == '__main__':
    app.run(debug=True)