Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
|
@@ -121,6 +121,11 @@ def interpolate_frames(video_path, target_fps=30):
|
|
| 121 |
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
|
| 122 |
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
|
| 123 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 124 |
print(f"Original FPS: {original_fps}, Target FPS: {target_fps}")
|
| 125 |
|
| 126 |
# If target FPS is not higher, return original
|
|
@@ -204,6 +209,25 @@ except Exception as e:
|
|
| 204 |
# Invert the emo_map for easy lookup from the dropdown value
|
| 205 |
emo_name_to_id = {v: k for k, v in emo_map.items()}
|
| 206 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 207 |
# --- Core Generation Function ---
|
| 208 |
@spaces.GPU(duration=180) # Increased duration for smoothing and interpolation
|
| 209 |
def generate_motion(source_image_path, driving_audio_path, emotion_name,
|
|
@@ -227,6 +251,15 @@ def generate_motion(source_image_path, driving_audio_path, emotion_name,
|
|
| 227 |
raise gr.Error("Please upload a source image.")
|
| 228 |
if driving_audio_path is None:
|
| 229 |
raise gr.Error("Please upload a driving audio file.")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 230 |
|
| 231 |
start_time = time.time()
|
| 232 |
|
|
@@ -341,7 +374,7 @@ with gr.Blocks(theme=gr.themes.Soft(), css=".gradio-container {max-width: 960px
|
|
| 341 |
|
| 342 |
with gr.Row():
|
| 343 |
driving_audio = gr.Audio(
|
| 344 |
-
label="Driving Audio",
|
| 345 |
type="filepath",
|
| 346 |
value="src/examples/driving_audios/5.wav"
|
| 347 |
)
|
|
@@ -352,7 +385,7 @@ with gr.Blocks(theme=gr.themes.Soft(), css=".gradio-container {max-width: 960px
|
|
| 352 |
emotion_dropdown = gr.Dropdown(
|
| 353 |
label="Emotion",
|
| 354 |
choices=list(emo_map.values()),
|
| 355 |
-
value="
|
| 356 |
info="Select an emotion for more natural facial expressions"
|
| 357 |
)
|
| 358 |
|
|
@@ -371,7 +404,7 @@ with gr.Blocks(theme=gr.themes.Soft(), css=".gradio-container {max-width: 960px
|
|
| 371 |
with gr.Row():
|
| 372 |
smooth_checkbox = gr.Checkbox(
|
| 373 |
label="Enable Smoothing (Experimental)",
|
| 374 |
-
value=True,
|
| 375 |
info="May cause errors on some systems. If errors occur, disable this option."
|
| 376 |
)
|
| 377 |
|
|
@@ -400,8 +433,9 @@ with gr.Blocks(theme=gr.themes.Soft(), css=".gradio-container {max-width: 960px
|
|
| 400 |
<b>Tips for best results:</b><br>
|
| 401 |
• Use high-quality front-facing images<br>
|
| 402 |
• Clear audio without background noise<br>
|
| 403 |
-
•
|
| 404 |
-
• Adjust CFG scale if motion seems stiff
|
|
|
|
| 405 |
</p>
|
| 406 |
</div>
|
| 407 |
"""
|
|
@@ -415,10 +449,16 @@ with gr.Blocks(theme=gr.themes.Soft(), css=".gradio-container {max-width: 960px
|
|
| 415 |
Users are solely liable for their actions while using this generative model.
|
| 416 |
|
| 417 |
### 🚀 **Enhancement Features**
|
| 418 |
-
- **Frame Smoothing**: Reduces jitter and improves transition between frames
|
| 419 |
- **Frame Interpolation**: Increases FPS for smoother motion
|
| 420 |
- **Optimized Audio Processing**: Better lip-sync with 24kHz sampling
|
| 421 |
- **Fine-tuned CFG Scale**: Better control over motion naturalness
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 422 |
"""
|
| 423 |
)
|
| 424 |
|
|
|
|
| 121 |
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
|
| 122 |
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
|
| 123 |
|
| 124 |
+
# Fall back to 25 FPS when the original FPS could not be detected (reported as 0 or None)
|
| 125 |
+
if original_fps == 0 or original_fps is None:
|
| 126 |
+
print("Warning: Could not detect original FPS. Assuming 25 FPS.")
|
| 127 |
+
original_fps = 25.0
|
| 128 |
+
|
| 129 |
print(f"Original FPS: {original_fps}, Target FPS: {target_fps}")
|
| 130 |
|
| 131 |
# If target FPS is not higher, return original
|
|
|
|
| 209 |
# Invert the emo_map for easy lookup from the dropdown value
|
| 210 |
emo_name_to_id = {v: k for k, v in emo_map.items()}
|
| 211 |
|
| 212 |
+
# --- Audio Length Check Function ---
|
| 213 |
+
def check_audio_length(audio_path):
|
| 214 |
+
"""
|
| 215 |
+
Return the length of an audio file in seconds (warnings about long audio are issued by the caller).
|
| 216 |
+
|
| 217 |
+
Args:
|
| 218 |
+
audio_path: Path to the audio file
|
| 219 |
+
|
| 220 |
+
Returns:
|
| 221 |
+
Duration in seconds, or None if the audio file could not be loaded
|
| 222 |
+
"""
|
| 223 |
+
try:
|
| 224 |
+
audio = AudioSegment.from_file(audio_path)
|
| 225 |
+
duration_seconds = len(audio) / 1000.0
|
| 226 |
+
return duration_seconds
|
| 227 |
+
except Exception as e:
|
| 228 |
+
print(f"Error checking audio length: {e}")
|
| 229 |
+
return None
|
| 230 |
+
|
| 231 |
# --- Core Generation Function ---
|
| 232 |
@spaces.GPU(duration=180) # Increased duration for smoothing and interpolation
|
| 233 |
def generate_motion(source_image_path, driving_audio_path, emotion_name,
|
|
|
|
| 251 |
raise gr.Error("Please upload a source image.")
|
| 252 |
if driving_audio_path is None:
|
| 253 |
raise gr.Error("Please upload a driving audio file.")
|
| 254 |
+
|
| 255 |
+
# Check audio length
|
| 256 |
+
audio_duration = check_audio_length(driving_audio_path)
|
| 257 |
+
if audio_duration:
|
| 258 |
+
print(f"Audio duration: {audio_duration:.1f} seconds")
|
| 259 |
+
if audio_duration > 60:
|
| 260 |
+
gr.Warning(f"⚠️ Audio is {audio_duration:.1f} seconds long. MoDA works best with audio under 60 seconds. Processing may be slow and quality may degrade.")
|
| 261 |
+
if audio_duration > 180:
|
| 262 |
+
raise gr.Error("Audio is too long. Please use audio files under 3 minutes (180 seconds) for best results.")
|
| 263 |
|
| 264 |
start_time = time.time()
|
| 265 |
|
|
|
|
| 374 |
|
| 375 |
with gr.Row():
|
| 376 |
driving_audio = gr.Audio(
|
| 377 |
+
label="Driving Audio (Recommended: < 60 seconds)",
|
| 378 |
type="filepath",
|
| 379 |
value="src/examples/driving_audios/5.wav"
|
| 380 |
)
|
|
|
|
| 385 |
emotion_dropdown = gr.Dropdown(
|
| 386 |
label="Emotion",
|
| 387 |
choices=list(emo_map.values()),
|
| 388 |
+
value="None",
|
| 389 |
info="Select an emotion for more natural facial expressions"
|
| 390 |
)
|
| 391 |
|
|
|
|
| 404 |
with gr.Row():
|
| 405 |
smooth_checkbox = gr.Checkbox(
|
| 406 |
label="Enable Smoothing (Experimental)",
|
| 407 |
+
value=False, # Changed to False due to CUDA issues
|
| 408 |
info="May cause errors on some systems. If errors occur, disable this option."
|
| 409 |
)
|
| 410 |
|
|
|
|
| 433 |
<b>Tips for best results:</b><br>
|
| 434 |
• Use high-quality front-facing images<br>
|
| 435 |
• Clear audio without background noise<br>
|
| 436 |
+
• <b>Keep audio under 60 seconds</b><br>
|
| 437 |
+
• Adjust CFG scale if motion seems stiff<br>
|
| 438 |
+
• For longer audio, split into segments
|
| 439 |
</p>
|
| 440 |
</div>
|
| 441 |
"""
|
|
|
|
| 449 |
Users are solely liable for their actions while using this generative model.
|
| 450 |
|
| 451 |
### 🚀 **Enhancement Features**
|
| 452 |
+
- **Frame Smoothing**: Reduces jitter and improves transition between frames (currently experimental)
|
| 453 |
- **Frame Interpolation**: Increases FPS for smoother motion
|
| 454 |
- **Optimized Audio Processing**: Better lip-sync with 24kHz sampling
|
| 455 |
- **Fine-tuned CFG Scale**: Better control over motion naturalness
|
| 456 |
+
|
| 457 |
+
### ⏱️ **Audio Length Limitations**
|
| 458 |
+
- **Optimal**: Under 30 seconds for best quality and speed
|
| 459 |
+
- **Recommended**: Under 60 seconds
|
| 460 |
+
- **Maximum**: 180 seconds (3 minutes) - very slow processing
|
| 461 |
+
- For longer content, consider splitting audio into segments
|
| 462 |
"""
|
| 463 |
)
|
| 464 |
|