Moon11111 committed on
Commit
0b0af61
·
verified ·
1 Parent(s): c644e10

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +329 -13
app.py CHANGED
@@ -1,16 +1,332 @@
1
- try:
2
- from flask import Flask, request, jsonify
3
- except ModuleNotFoundError:
4
- import subprocess, sys
5
- subprocess.check_call([sys.executable, "-m", "pip", "install", "flask"])
6
- from flask import Flask, request, jsonify
7
-
8
- # The rest of your app code follows...
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  app = Flask(__name__)
 
 
 
10
 
11
- @app.route('/')
12
- def hello():
13
- return jsonify({"message": "Hello, Flask!"})
14
 
15
- if __name__ == "__main__":
16
- app.run(port=5000)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import tempfile
2
+ import os
3
+ import shutil
4
+ import librosa
5
+ import json
6
+ import subprocess
7
+ import gc
8
+ from googletrans import Translator
9
+ import asyncio
10
+ from flask import Flask, request, jsonify, send_from_directory
11
+ from omegaconf import OmegaConf
12
+ import torch
13
+ from diffusers import AutoencoderKL, DDIMScheduler
14
+ from latentsync.models.unet import UNet3DConditionModel
15
+ from latentsync.pipelines.lipsync_pipeline import LipsyncPipeline
16
+ from diffusers.utils.import_utils import is_xformers_available
17
+ from accelerate.utils import set_seed
18
+ from latentsync.whisper.audio2feature import Audio2Feature
19
+ from openai import OpenAI
20
+ from elevenlabs import set_api_key, generate, play, clone, Voice, VoiceSettings
21
+ from torch.cuda.amp import autocast
22
+
23
# Initialize the Flask app
app = Flask(__name__)
# Per-request scratch space; generate_video() assigns a
# tempfile.TemporaryDirectory here, and helpers read TEMP_DIR.name.
TEMP_DIR = None
# Directory where finished videos are published; the /videos/<filename>
# route serves files from here.
VIDEO_DIRECTORY = os.path.abspath("videos")
os.makedirs(VIDEO_DIRECTORY, exist_ok=True)
28
 
29
def clear_cuda_memory():
    """Release GPU memory held by PyTorch's caching allocator.

    Runs Python garbage collection first so tensors that are only
    reachable through collectable objects are actually freed, then asks
    PyTorch to return its now-unreferenced cached blocks to the driver.
    """
    # Ordering fix: collecting BEFORE empty_cache() lets the allocator
    # release blocks that were still pinned by garbage Python objects.
    gc.collect()
    # Guard keeps CPU-only hosts from touching the CUDA runtime.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
32
 
33
def run_inference(video_path, audio_path, video_out_path,
                  inference_ckpt_path, unet_config_path="configs/unet/second_stage.yaml",
                  inference_steps=20, guidance_scale=1.0, seed=1247):
    """Run the LatentSync lip-sync diffusion pipeline on one video/audio pair.

    Args:
        video_path: source video whose face is re-lip-synced.
        audio_path: driving audio track.
        video_out_path: output video path; a mask video is written next to
            it with the ".mp4" suffix replaced by "_mask.mp4".
        inference_ckpt_path: UNet checkpoint file to load.
        unet_config_path: OmegaConf YAML with model/data settings.
        inference_steps: number of DDIM denoising steps.
        guidance_scale: classifier-free guidance weight.
        seed: RNG seed; -1 requests a nondeterministic torch seed.

    Raises:
        NotImplementedError: if config.model.cross_attention_dim is
            neither 768 nor 384.
    """
    clear_cuda_memory()

    # Load configuration
    config = OmegaConf.load(unet_config_path)

    # Determine proper dtype based on GPU capabilities
    # (compute capability > 7, i.e. Ampere or newer, runs fp16; else fp32).
    is_fp16_supported = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] > 7
    dtype = torch.float16 if is_fp16_supported else torch.float32

    # Setup scheduler (DDIM config is read from the local "configs" dir)
    scheduler = DDIMScheduler.from_pretrained("configs")

    # Choose whisper model based on config settings — the whisper feature
    # width must match the UNet's cross-attention width.
    if config.model.cross_attention_dim == 768:
        whisper_model_path = "checkpoints/whisper/small.pt"
    elif config.model.cross_attention_dim == 384:
        whisper_model_path = "checkpoints/whisper/tiny.pt"
    else:
        raise NotImplementedError("cross_attention_dim must be 768 or 384")

    # Initialize the audio encoder
    audio_encoder = Audio2Feature(model_path=whisper_model_path,
                                  device="cuda", num_frames=config.data.num_frames)

    # Load VAE
    vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", torch_dtype=dtype)
    vae.config.scaling_factor = 0.18215  # standard SD latent scaling constant
    vae.config.shift_factor = 0

    # Load UNet model from the checkpoint (built on CPU first, moved later)
    unet, _ = UNet3DConditionModel.from_pretrained(
        OmegaConf.to_container(config.model),
        inference_ckpt_path,  # load checkpoint
        device="cpu",
    )
    unet = unet.to(dtype=dtype)

    # Optionally enable memory-efficient attention if available
    if is_xformers_available():
        unet.enable_xformers_memory_efficient_attention()

    # Initialize the pipeline and move to GPU
    pipeline = LipsyncPipeline(
        vae=vae,
        audio_encoder=audio_encoder,
        unet=unet,
        scheduler=scheduler,
    ).to("cuda")

    # Set seed (-1 means let torch pick a nondeterministic seed)
    if seed != -1:
        set_seed(seed)
    else:
        torch.seed()

    # NOTE(review): autocast() is entered even when dtype is float32
    # (pre-Ampere GPUs) — confirm mixed precision is intended there.
    with autocast():
        try:
            pipeline(
                video_path=video_path,
                audio_path=audio_path,
                video_out_path=video_out_path,
                video_mask_path=video_out_path.replace(".mp4", "_mask.mp4"),
                num_frames=config.data.num_frames,
                num_inference_steps=inference_steps,
                guidance_scale=guidance_scale,
                weight_dtype=dtype,
                width=config.data.resolution,
                height=config.data.resolution,
            )
        finally:
            # Always free GPU memory, even when the pipeline raises.
            clear_cuda_memory()
107
+
108
def create_temp_dir():
    """Create and return a fresh TemporaryDirectory handle.

    The caller owns the handle: the directory exists until cleanup() is
    called or the handle is garbage-collected.
    """
    scratch = tempfile.TemporaryDirectory()
    return scratch
110
+
111
def generate_audio(voice_cloning, text_prompt):
    """Synthesize speech for *text_prompt* with ElevenLabs and return the mp3 path.

    Args:
        voice_cloning: 'yes' for the cloned "Marc" voice, 'no' for the
            stock "Daniel" voice.
        text_prompt: text to speak.

    Returns:
        Path of the generated mp3 inside TEMP_DIR.

    Raises:
        ValueError: for any other voice_cloning value (previously this
            silently returned None and crashed downstream).
    """
    # SECURITY: the API key was hardcoded in source. Prefer the
    # environment; the literal remains only as a compatible fallback and
    # should be rotated/removed.
    set_api_key(os.environ.get('ELEVENLABS_API_KEY', '92e149985ea2732b4359c74346c3daee'))

    if voice_cloning == 'yes':
        print('Entering Custom Audio creation using elevenlabs')
        voice = Voice(voice_id="VJpttplXHolgV2leGe5V", name="Marc",
                      settings=VoiceSettings(stability=0.71, similarity_boost=0.9,
                                             style=0.0, use_speaker_boost=True))
        prefix = "cloned_audio_"
    elif voice_cloning == 'no':
        print('Entering Default Audio creation using elevenlabs')
        voice = "Daniel"
        prefix = "default_audio_"
    else:
        raise ValueError(f"Unsupported voice_cloning option: {voice_cloning!r}")

    # Stream the synthesized audio chunks straight into a temp file.
    audio = generate(text=text_prompt, voice=voice, model="eleven_multilingual_v2",
                     stream=True, latency=4)
    with tempfile.NamedTemporaryFile(suffix=".mp3", prefix=prefix,
                                     dir=TEMP_DIR.name, delete=False) as temp_file:
        for chunk in audio:
            temp_file.write(chunk)
        driven_audio_path = temp_file.name
    print('driven_audio_path', driven_audio_path)
    return driven_audio_path
138
+
139
+
140
+
141
def get_video_duration(video_path):
    """Return the duration of *video_path* in seconds, read via ffprobe.

    Raises:
        RuntimeError: if ffprobe exits non-zero (previously the failure
            surfaced as an opaque JSONDecodeError on empty stdout).
    """
    cmd = [
        "ffprobe", "-v", "error", "-show_entries", "format=duration",
        "-of", "json", video_path
    ]
    result = subprocess.run(cmd, capture_output=True, text=True)
    # Robustness fix: report ffprobe failures explicitly.
    if result.returncode != 0:
        raise RuntimeError(f"ffprobe failed for {video_path}: {result.stderr.strip()}")
    duration = json.loads(result.stdout)["format"]["duration"]
    return float(duration)
150
+
151
+
152
def extend_video_simple(video_path, audio_path, output_path):
    """Extend the video by appending one reversed copy when the audio is longer.

    Writes the (possibly extended) silent video to *output_path*; when the
    audio already fits, the source video is copied through unchanged.
    """
    audio_duration = librosa.get_duration(path=audio_path)
    video_duration = get_video_duration(video_path)

    print(f"Video Duration: {video_duration:.2f} sec")
    print(f"Audio Duration: {audio_duration:.2f} sec")

    if audio_duration > video_duration:
        print("Extending video by adding reversed version.")

        # Create a reversed version of the full video
        reversed_clip = tempfile.NamedTemporaryFile(dir=TEMP_DIR.name, delete=False, suffix=".mp4").name

        # Security/robustness fix: argument lists instead of shell=True
        # f-strings (paths with spaces or shell metacharacters previously
        # broke the command or allowed injection); check=True surfaces
        # ffmpeg failures instead of continuing silently.
        subprocess.run(
            ["ffmpeg", "-y", "-i", video_path, "-vf", "reverse", "-an", reversed_clip],
            check=True,
        )

        # Merge original + reversed
        subprocess.run(
            ["ffmpeg", "-y", "-i", video_path, "-i", reversed_clip,
             "-filter_complex", "[0:v:0][1:v:0]concat=n=2:v=1[outv]",
             "-map", "[outv]", "-an", output_path],
            check=True,
        )
    else:
        print("Audio is not longer than video. No extension needed.")
        # Portability fix: shutil.copy instead of shelling out to `cp`.
        shutil.copy(video_path, output_path)
178
+
179
+
180
def extend_video_loop(video_path, audio_path, output_path):
    """Repeat original+reversed copies of the video until it covers the audio.

    Writes the (possibly extended) silent video to *output_path*; when the
    audio already fits, the source video is copied through unchanged.
    """
    audio_duration = librosa.get_duration(path=audio_path)
    video_duration = get_video_duration(video_path)

    print(f"Video Duration: {video_duration:.2f} sec")
    print(f"Audio Duration: {audio_duration:.2f} sec")

    if audio_duration > video_duration:
        print("Extending video by repeating original and reversed versions.")

        # Create reversed video
        reversed_clip = tempfile.NamedTemporaryFile(dir=TEMP_DIR.name, delete=False, suffix=".mp4").name
        # Security/robustness fix: argument lists instead of shell=True
        # f-strings (paths with spaces or shell metacharacters previously
        # broke the command or allowed injection); check=True surfaces
        # ffmpeg failures instead of continuing silently.
        subprocess.run(
            ["ffmpeg", "-y", "-i", video_path, "-vf", "reverse", "-an", reversed_clip],
            check=True,
        )

        # Alternate original/reversed clips until the total length reaches
        # or exceeds the audio duration.
        video_clips = [video_path, reversed_clip]
        total_duration = video_duration * 2  # Original + reversed

        while total_duration < audio_duration:
            video_clips.append(video_path)
            video_clips.append(reversed_clip)
            total_duration += video_duration * 2

        print(f"Total Clips: {len(video_clips)}")

        # Use FFmpeg filter_complex concat for seamless merging
        concat_filter = "".join(f"[{i}:v:0]" for i in range(len(video_clips))) + f"concat=n={len(video_clips)}:v=1[outv]"
        cmd = ["ffmpeg", "-y"]
        for clip in video_clips:
            cmd.extend(["-i", clip])
        cmd.extend(["-filter_complex", concat_filter, "-map", "[outv]", "-an", output_path])
        subprocess.run(cmd, check=True)

        print(f"Extended video saved to {output_path}")

    else:
        print("Audio is not longer than video. No extension needed.")
        # Portability fix: shutil.copy instead of shelling out to `cp`.
        shutil.copy(video_path, output_path)
222
+
223
+
224
def translate_text(text, target_language):
    """Translate *text* into *target_language* ('english' or 'hindi').

    Returns "" for blank input, and the original text when the language is
    unsupported or translation fails.
    """
    if not text or text.strip() == "":
        return ""
    LANGUAGE_CODES = {"english": "en", "hindi": "hi"}
    # Convert language name to code; robustness fix: previously an unknown
    # language passed dest=None straight into the translator.
    target_language_code = LANGUAGE_CODES.get(target_language.lower())
    if target_language_code is None:
        print(f"Unsupported target language: {target_language}")
        return text
    try:
        # googletrans 4.x exposes an async API, so wrap the call.
        async def perform_translation():
            translator = Translator()
            result = await translator.translate(text, dest=target_language_code)
            return result.text if hasattr(result, 'text') else text

        # asyncio.run replaces the manual new_event_loop/set/close dance.
        return asyncio.run(perform_translation())
    except Exception as e:
        # Bug fix: this handler referenced an undefined `logger`, so any
        # translation failure raised NameError instead of falling back.
        print(f"Error translating text: {e}")
        # Return original text if translation fails
        return text
249
+
250
+
251
@app.route('/run', methods=['POST'])
def generate_video():
    """POST /run — synthesize speech for a text prompt and lip-sync it onto the uploaded video.

    Form fields: video (file, required), text_prompt (required),
    voice_cloning ('yes'/'no', default 'no'), target_language
    ('original_text' or a supported language), plus optional overrides
    inference_ckpt_path, unet_config_path, inference_steps,
    guidance_scale, seed.

    Returns JSON with a /videos/<filename> URL on success; an error
    payload with status 400/500 otherwise.
    """
    global TEMP_DIR
    TEMP_DIR = create_temp_dir()

    try:
        if 'video' not in request.files:
            return jsonify({'error': 'Video file is required.'}), 400

        video_file = request.files['video']
        # Bug fix: request.form['text_prompt'] raised KeyError (HTTP 500)
        # when the field was missing; .get() lets the blank-prompt check
        # answer with a proper 400 instead.
        text_prompt = request.form.get('text_prompt', '')
        print('Input text prompt: ', text_prompt)
        text_prompt = text_prompt.strip()
        if not text_prompt:
            return jsonify({'error': 'Input text prompt cannot be blank'}), 400

        voice_cloning = request.form.get('voice_cloning', 'no')
        target_language = request.form.get('target_language', 'original_text')

        if target_language != 'original_text':
            response = translate_text(text_prompt, target_language)
            text_prompt = response.strip()
            print('Translated input text prompt: ', text_prompt)

        temp_audio_path = generate_audio(voice_cloning, text_prompt)
        with tempfile.NamedTemporaryFile(suffix=".mp4", prefix="input_", dir=TEMP_DIR.name, delete=False) as temp_file:
            temp_video_path = temp_file.name
            video_file.save(temp_video_path)
        print('temp_video_path', temp_video_path)

        # Optional overrides supplied via form data.
        inference_ckpt_path = request.form.get('inference_ckpt_path', 'checkpoints/latentsync_unet.pt')
        unet_config_path = request.form.get('unet_config_path', 'configs/unet/second_stage.yaml')

        output_video = tempfile.NamedTemporaryFile(dir=TEMP_DIR.name, delete=False, suffix=".mp4").name
        # Loop/reverse the source video until it covers the generated audio.
        extend_video_loop(temp_video_path, temp_audio_path, output_video)
        final_output_video = tempfile.NamedTemporaryFile(dir=TEMP_DIR.name, delete=False, suffix="_final_extended.mp4").name

        run_inference(
            video_path=output_video,
            audio_path=temp_audio_path,
            video_out_path=final_output_video,
            inference_ckpt_path=inference_ckpt_path,
            unet_config_path=unet_config_path,
            inference_steps=int(request.form.get('inference_steps', 20)),
            guidance_scale=float(request.form.get('guidance_scale', 1.0)),
            seed=int(request.form.get('seed', 1247))
        )

        # Publish the result where the /videos/<filename> route can serve it.
        filename = os.path.basename(final_output_video)
        print("VIDEO_DIRECTORY: ", VIDEO_DIRECTORY)
        destination_path = os.path.join(VIDEO_DIRECTORY, filename)
        shutil.copy(final_output_video, destination_path)
        # Bug fix: the returned URL previously contained a literal
        # placeholder instead of the generated filename.
        video_url = f"/videos/{filename}"

        return jsonify({"message": "Video processed and saved successfully.",
                        "output_video": video_url,
                        "status": "success"}), 200
    except Exception as e:
        return jsonify({'error': str(e)}), 500
    finally:
        # Leak fix: the per-request scratch directory (upload, generated
        # audio, intermediate clips) was never removed. The final result
        # has already been copied into VIDEO_DIRECTORY by this point.
        TEMP_DIR.cleanup()
320
+
321
# Serve published results out of the shared output directory.
@app.route("/videos/<string:filename>", methods=['GET'])
def serve_video(filename):
    """Stream a previously generated video inline (not as a download)."""
    return send_from_directory(VIDEO_DIRECTORY, filename, as_attachment=False)
325
+
326
@app.route("/health", methods=["GET"])
def health_status():
    """Liveness probe: always reports the service as online."""
    return jsonify({"online": "true"})
330
+
331
# Entry point: launch the Flask development server on the default
# host/port (127.0.0.1:5000).
# NOTE(review): debug=True enables the auto-reloader and the Werkzeug
# debugger — it must not be left on in production.
if __name__ == '__main__':
    app.run(debug=True)