Spaces:
Running
on
Zero
Running
on
Zero
alex
committed on
Commit
·
33b0370
1
Parent(s):
23fe401
aspect ratio options added
Browse files
app.py
CHANGED
|
@@ -58,6 +58,8 @@ from functools import partial
|
|
| 58 |
from omegaconf import OmegaConf
|
| 59 |
from argparse import Namespace
|
| 60 |
from gradio_extendedaudio import ExtendedAudio
|
|
|
|
|
|
|
| 61 |
import torchaudio
|
| 62 |
|
| 63 |
# load the one true config you dumped
|
|
@@ -561,12 +563,15 @@ def slider_value_change(image_path, audio_path, text, num_steps, session_state,
|
|
| 561 |
|
| 562 |
if adaptive_text:
|
| 563 |
|
| 564 |
-
if
|
| 565 |
-
|
| 566 |
-
|
| 567 |
-
|
|
|
|
|
|
|
|
|
|
| 568 |
else:
|
| 569 |
-
text = ADAPTIVE_PROMPT_TEMPLATES[
|
| 570 |
|
| 571 |
return update_generate_button(image_path, audio_path, text, num_steps, session_state), text
|
| 572 |
|
|
@@ -614,12 +619,18 @@ def get_duration(image_path, audio_path, text, num_steps, session_id, progress):
|
|
| 614 |
|
| 615 |
return int(duration_s)
|
| 616 |
|
| 617 |
-
def preprocess_img(
|
| 618 |
|
| 619 |
if session_id is None:
|
| 620 |
session_id = uuid.uuid4().hex
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 621 |
|
| 622 |
-
image = Image.open(
|
| 623 |
|
| 624 |
image = inferpipe.transform(image).unsqueeze(0).to(dtype=inferpipe.dtype)
|
| 625 |
|
|
@@ -638,8 +649,18 @@ def preprocess_img(image_path, session_id = None):
|
|
| 638 |
image = tensor_to_pil(image)
|
| 639 |
image.save(input_img_path)
|
| 640 |
|
| 641 |
-
return input_img_path
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 642 |
|
|
|
|
|
|
|
|
|
|
| 643 |
|
| 644 |
@spaces.GPU(duration=get_duration)
|
| 645 |
def infer(image_path, audio_path, text, num_steps, session_id = None, progress=gr.Progress(track_tqdm=True),):
|
|
@@ -654,6 +675,7 @@ def infer(image_path, audio_path, text, num_steps, session_id = None, progress=g
|
|
| 654 |
|
| 655 |
if session_id is None:
|
| 656 |
session_id = uuid.uuid4().hex
|
|
|
|
| 657 |
|
| 658 |
output_dir = os.path.join(os.environ["PROCESSED_RESULTS"], session_id)
|
| 659 |
|
|
@@ -693,7 +715,10 @@ def infer(image_path, audio_path, text, num_steps, session_id = None, progress=g
|
|
| 693 |
|
| 694 |
return video_paths[0]
|
| 695 |
|
| 696 |
-
def
|
|
|
|
|
|
|
|
|
|
| 697 |
|
| 698 |
return request
|
| 699 |
|
|
@@ -712,6 +737,22 @@ def check_box_clicked(adapative_tick):
|
|
| 712 |
print("checkbox clicked")
|
| 713 |
return gr.update(interactive=not adapative_tick)
|
| 714 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 715 |
def preprocess_audio_first_5s_librosa(audio_path, limit_on, session_id=None):
|
| 716 |
"""
|
| 717 |
If the uploaded audio is < 5s, return it unchanged.
|
|
@@ -808,7 +849,7 @@ with gr.Blocks(css=css) as demo:
|
|
| 808 |
|
| 809 |
with gr.Column():
|
| 810 |
|
| 811 |
-
image_input =
|
| 812 |
audio_input = ExtendedAudio(label="Input Audio", type="filepath", options=["EMPTY"], show_download_button=True)
|
| 813 |
gr.Markdown("*A 5-second limit is applied to audio files to shorten generation time. You can turn this off in Advanced Settings*")
|
| 814 |
|
|
@@ -821,6 +862,7 @@ with gr.Blocks(css=css) as demo:
|
|
| 821 |
time_required = gr.Text(value="⌚ Zero GPU Required: --", show_label=False)
|
| 822 |
infer_btn = gr.Button("🦜 Avatar Me", variant="primary")
|
| 823 |
with gr.Accordion("Advanced Settings", open=False):
|
|
|
|
| 824 |
limit_on = gr.Checkbox(label="Limit Audio files to 5 seconds", value=True)
|
| 825 |
adaptive_text = gr.Checkbox(label="Adaptive Video Prompt", value=True)
|
| 826 |
text_input = gr.Textbox(show_label=False, lines=6, elem_classes=["stateful"], interactive=False, value= ADAPTIVE_PROMPT_TEMPLATES[1])
|
|
@@ -829,31 +871,36 @@ with gr.Blocks(css=css) as demo:
|
|
| 829 |
|
| 830 |
cached_examples = gr.Examples(
|
| 831 |
examples=[
|
|
|
|
| 832 |
[
|
| 833 |
-
"examples/images/
|
| 834 |
-
"examples/audios/
|
| 835 |
ADAPTIVE_PROMPT_TEMPLATES[2],
|
| 836 |
-
|
|
|
|
| 837 |
],
|
| 838 |
|
| 839 |
[
|
| 840 |
"examples/images/female-001.png",
|
| 841 |
"examples/audios/script.wav",
|
| 842 |
ADAPTIVE_PROMPT_TEMPLATES[2],
|
| 843 |
-
14
|
|
|
|
| 844 |
],
|
| 845 |
-
|
| 846 |
[
|
| 847 |
-
"examples/images/
|
| 848 |
-
"examples/audios/
|
| 849 |
ADAPTIVE_PROMPT_TEMPLATES[2],
|
| 850 |
-
|
|
|
|
| 851 |
],
|
|
|
|
| 852 |
],
|
| 853 |
label="Cached Examples",
|
| 854 |
-
inputs=[image_input, audio_input, text_input, num_steps],
|
| 855 |
outputs=[output_video],
|
| 856 |
-
fn=
|
| 857 |
cache_examples=True
|
| 858 |
)
|
| 859 |
|
|
@@ -864,10 +911,11 @@ with gr.Blocks(css=css) as demo:
|
|
| 864 |
"examples/audios/listen.wav",
|
| 865 |
ADAPTIVE_PROMPT_TEMPLATES[1],
|
| 866 |
8,
|
|
|
|
| 867 |
],
|
| 868 |
],
|
| 869 |
label="Uncached Examples",
|
| 870 |
-
inputs=[image_input, audio_input, text_input, num_steps],
|
| 871 |
cache_examples=False
|
| 872 |
)
|
| 873 |
|
|
@@ -882,11 +930,14 @@ with gr.Blocks(css=css) as demo:
|
|
| 882 |
[
|
| 883 |
"examples/images/female-003.png",
|
| 884 |
],
|
|
|
|
|
|
|
|
|
|
| 885 |
],
|
| 886 |
label="Image Samples",
|
| 887 |
inputs=[image_input],
|
| 888 |
-
outputs=[image_input],
|
| 889 |
-
fn=
|
| 890 |
)
|
| 891 |
|
| 892 |
audio_examples = gr.Examples(
|
|
@@ -902,11 +953,15 @@ with gr.Blocks(css=css) as demo:
|
|
| 902 |
[
|
| 903 |
"examples/audios/matcha.wav",
|
| 904 |
],
|
|
|
|
|
|
|
|
|
|
|
|
|
| 905 |
],
|
| 906 |
label="Audio Samples",
|
| 907 |
inputs=[audio_input],
|
| 908 |
outputs=[audio_input],
|
| 909 |
-
fn=
|
| 910 |
)
|
| 911 |
|
| 912 |
infer_btn.click(
|
|
@@ -920,7 +975,7 @@ with gr.Blocks(css=css) as demo:
|
|
| 920 |
inputs=[session_state],
|
| 921 |
outputs=[audio_input]
|
| 922 |
).then(
|
| 923 |
-
fn=
|
| 924 |
inputs=[audio_input],
|
| 925 |
outputs=[audio_input]
|
| 926 |
).then(
|
|
@@ -928,13 +983,14 @@ with gr.Blocks(css=css) as demo:
|
|
| 928 |
inputs=[audio_input, limit_on, session_state],
|
| 929 |
outputs=[audio_input],
|
| 930 |
)
|
| 931 |
-
|
| 932 |
-
image_input.
|
|
|
|
| 933 |
image_input.change(fn=update_generate_button, inputs=[image_input, audio_input, text_input, num_steps, session_state], outputs=[time_required])
|
| 934 |
audio_input.change(fn=update_generate_button, inputs=[image_input, audio_input, text_input, num_steps, session_state], outputs=[time_required])
|
| 935 |
num_steps.change(fn=slider_value_change, inputs=[image_input, audio_input, text_input, num_steps, session_state, adaptive_text], outputs=[time_required, text_input])
|
| 936 |
adaptive_text.change(fn=check_box_clicked, inputs=[adaptive_text], outputs=[text_input])
|
| 937 |
-
audio_input.upload(fn=
|
| 938 |
).then(
|
| 939 |
fn=preprocess_audio_first_5s_librosa,
|
| 940 |
inputs=[audio_input, limit_on, session_state],
|
|
|
|
| 58 |
from omegaconf import OmegaConf
|
| 59 |
from argparse import Namespace
|
| 60 |
from gradio_extendedaudio import ExtendedAudio
|
| 61 |
+
from gradio_extendedimage import extendedimage
|
| 62 |
+
|
| 63 |
import torchaudio
|
| 64 |
|
| 65 |
# load the one true config you dumped
|
|
|
|
| 563 |
|
| 564 |
if adaptive_text:
|
| 565 |
|
| 566 |
+
if not args.image_sizes_720 == [[720, 720]]:
|
| 567 |
+
if num_steps < 8:
|
| 568 |
+
text = ADAPTIVE_PROMPT_TEMPLATES[1]
|
| 569 |
+
elif num_steps < 10:
|
| 570 |
+
text = ADAPTIVE_PROMPT_TEMPLATES[1]
|
| 571 |
+
else:
|
| 572 |
+
text = ADAPTIVE_PROMPT_TEMPLATES[2]
|
| 573 |
else:
|
| 574 |
+
text = ADAPTIVE_PROMPT_TEMPLATES[1]
|
| 575 |
|
| 576 |
return update_generate_button(image_path, audio_path, text, num_steps, session_state), text
|
| 577 |
|
|
|
|
| 619 |
|
| 620 |
return int(duration_s)
|
| 621 |
|
| 622 |
+
def preprocess_img(input_image_path, raw_image_path, session_id = None):
|
| 623 |
|
| 624 |
if session_id is None:
|
| 625 |
session_id = uuid.uuid4().hex
|
| 626 |
+
|
| 627 |
+
if input_image_path is None:
|
| 628 |
+
return None, None
|
| 629 |
+
|
| 630 |
+
if raw_image_path is '':
|
| 631 |
+
raw_image_path = input_image_path
|
| 632 |
|
| 633 |
+
image = Image.open(raw_image_path).convert("RGB")
|
| 634 |
|
| 635 |
image = inferpipe.transform(image).unsqueeze(0).to(dtype=inferpipe.dtype)
|
| 636 |
|
|
|
|
| 649 |
image = tensor_to_pil(image)
|
| 650 |
image.save(input_img_path)
|
| 651 |
|
| 652 |
+
return input_img_path, raw_image_path
|
| 653 |
+
|
| 654 |
+
def infer_example(image_path, audio_path, text, num_steps, raw_image_path, session_id = None, progress=gr.Progress(track_tqdm=True),):
    """Run ``infer`` for a cached example with the output size forced to [[720, 400]].

    The module-level ``args.image_sizes_720`` is temporarily replaced for the
    duration of the call and then put back to whatever it was before.

    Args:
        image_path, audio_path, text, num_steps: forwarded unchanged to ``infer``.
        raw_image_path: accepted so the signature lines up with the example
            inputs list (which includes the hidden raw-image text field);
            it is not used here.
        session_id: forwarded to ``infer``.
        progress: forwarded to ``infer``.

    Returns:
        Whatever ``infer`` returns.
    """
    # NOTE(review): ``args`` is shared module state, so this override is
    # visible to any other request running at the same time — confirm that
    # example caching never overlaps with live inference.
    current_image_size = args.image_sizes_720
    args.image_sizes_720 = [[720, 400]]

    try:
        return infer(image_path, audio_path, text, num_steps, session_id, progress)
    finally:
        # Restore even when ``infer`` raises; otherwise a failed example run
        # would leave the forced size in place for every later call.
        args.image_sizes_720 = current_image_size
|
| 664 |
|
| 665 |
@spaces.GPU(duration=get_duration)
|
| 666 |
def infer(image_path, audio_path, text, num_steps, session_id = None, progress=gr.Progress(track_tqdm=True),):
|
|
|
|
| 675 |
|
| 676 |
if session_id is None:
|
| 677 |
session_id = uuid.uuid4().hex
|
| 678 |
+
|
| 679 |
|
| 680 |
output_dir = os.path.join(os.environ["PROCESSED_RESULTS"], session_id)
|
| 681 |
|
|
|
|
| 715 |
|
| 716 |
return video_paths[0]
|
| 717 |
|
| 718 |
+
def apply_image(request):
    """Pass the selected image through and reset the raw-image slot.

    Returns a 2-tuple ``(request, None)``: the first item is the incoming
    value unchanged, the second is ``None``. Where this is wired as an
    example callback with two outputs, the second output therefore receives
    ``None``.
    """
    return request, None
|
| 720 |
+
|
| 721 |
+
def apply_audio(request):
    """Identity callback: return the incoming audio value unchanged."""
    return request
|
| 724 |
|
|
|
|
| 737 |
print("checkbox clicked")
|
| 738 |
return gr.update(interactive=not adapative_tick)
|
| 739 |
|
| 740 |
+
def orientation_changed(session_id, evt: gr.EventData):
    """Update ``args.image_sizes_720`` from an aspect-ratio selection event.

    The event payload is read from ``evt.data`` (falling back to
    ``evt._data``, then to an empty dict). Its ``'value'`` entry selects the
    size:

        "9:16" -> [[720, 400]]
        "1:1"  -> [[720, 720]]
        "16:9" -> [[400, 720]]

    Any other value, or a payload without a ``'value'`` entry, leaves the
    current setting untouched.

    Args:
        session_id: only used in the log line printed at the end.
        evt: the event object carrying the payload.

    Returns:
        None.
    """
    detail = getattr(evt, "data", None) or getattr(evt, "_data", {}) or {}

    # The fallback above can yield an empty dict, so a plain detail['value']
    # would raise KeyError on exactly the path the fallback exists for.
    try:
        ratio = detail["value"]
    except (KeyError, TypeError):
        ratio = None

    # NOTE(review): ``args`` is module-level state, so this choice applies to
    # every session, not only ``session_id`` — confirm that is intended.
    if ratio == "9:16":
        args.image_sizes_720 = [[720, 400]]
    elif ratio == "1:1":
        args.image_sizes_720 = [[720, 720]]
    elif ratio == "16:9":
        args.image_sizes_720 = [[400, 720]]

    print(f'{session_id} has {args.image_sizes_720} orientation')
|
| 752 |
+
|
| 753 |
+
def clear_raw_image():
    """Return an empty string, used to reset the stored raw-image path."""
    return ''
|
| 755 |
+
|
| 756 |
def preprocess_audio_first_5s_librosa(audio_path, limit_on, session_id=None):
|
| 757 |
"""
|
| 758 |
If the uploaded audio is < 5s, return it unchanged.
|
|
|
|
| 849 |
|
| 850 |
with gr.Column():
|
| 851 |
|
| 852 |
+
image_input = extendedimage(label="Reference Image", type="filepath", height=512)
|
| 853 |
audio_input = ExtendedAudio(label="Input Audio", type="filepath", options=["EMPTY"], show_download_button=True)
|
| 854 |
gr.Markdown("*A 5-second limit is applied to audio files to shorten generation time. You can turn this off in Advanced Settings*")
|
| 855 |
|
|
|
|
| 862 |
time_required = gr.Text(value="⌚ Zero GPU Required: --", show_label=False)
|
| 863 |
infer_btn = gr.Button("🦜 Avatar Me", variant="primary")
|
| 864 |
with gr.Accordion("Advanced Settings", open=False):
|
| 865 |
+
raw_img_text = gr.Text(show_label=False, label="", value='', visible=False)
|
| 866 |
limit_on = gr.Checkbox(label="Limit Audio files to 5 seconds", value=True)
|
| 867 |
adaptive_text = gr.Checkbox(label="Adaptive Video Prompt", value=True)
|
| 868 |
text_input = gr.Textbox(show_label=False, lines=6, elem_classes=["stateful"], interactive=False, value= ADAPTIVE_PROMPT_TEMPLATES[1])
|
|
|
|
| 871 |
|
| 872 |
cached_examples = gr.Examples(
|
| 873 |
examples=[
|
| 874 |
+
|
| 875 |
[
|
| 876 |
+
"examples/images/creature-001.png",
|
| 877 |
+
"examples/audios/keen.wav",
|
| 878 |
ADAPTIVE_PROMPT_TEMPLATES[2],
|
| 879 |
+
20,
|
| 880 |
+
''
|
| 881 |
],
|
| 882 |
|
| 883 |
[
|
| 884 |
"examples/images/female-001.png",
|
| 885 |
"examples/audios/script.wav",
|
| 886 |
ADAPTIVE_PROMPT_TEMPLATES[2],
|
| 887 |
+
14,
|
| 888 |
+
''
|
| 889 |
],
|
| 890 |
+
|
| 891 |
[
|
| 892 |
+
"examples/images/male-001.png",
|
| 893 |
+
"examples/audios/denial.wav",
|
| 894 |
ADAPTIVE_PROMPT_TEMPLATES[2],
|
| 895 |
+
12,
|
| 896 |
+
''
|
| 897 |
],
|
| 898 |
+
|
| 899 |
],
|
| 900 |
label="Cached Examples",
|
| 901 |
+
inputs=[image_input, audio_input, text_input, num_steps, raw_img_text],
|
| 902 |
outputs=[output_video],
|
| 903 |
+
fn=infer_example,
|
| 904 |
cache_examples=True
|
| 905 |
)
|
| 906 |
|
|
|
|
| 911 |
"examples/audios/listen.wav",
|
| 912 |
ADAPTIVE_PROMPT_TEMPLATES[1],
|
| 913 |
8,
|
| 914 |
+
''
|
| 915 |
],
|
| 916 |
],
|
| 917 |
label="Uncached Examples",
|
| 918 |
+
inputs=[image_input , audio_input, text_input, num_steps, raw_img_text],
|
| 919 |
cache_examples=False
|
| 920 |
)
|
| 921 |
|
|
|
|
| 930 |
[
|
| 931 |
"examples/images/female-003.png",
|
| 932 |
],
|
| 933 |
+
[
|
| 934 |
+
"examples/images/female-002.png",
|
| 935 |
+
],
|
| 936 |
],
|
| 937 |
label="Image Samples",
|
| 938 |
inputs=[image_input],
|
| 939 |
+
outputs=[image_input, raw_img_text],
|
| 940 |
+
fn=apply_image
|
| 941 |
)
|
| 942 |
|
| 943 |
audio_examples = gr.Examples(
|
|
|
|
| 953 |
[
|
| 954 |
"examples/audios/matcha.wav",
|
| 955 |
],
|
| 956 |
+
|
| 957 |
+
[
|
| 958 |
+
"examples/audios/nature.wav",
|
| 959 |
+
],
|
| 960 |
],
|
| 961 |
label="Audio Samples",
|
| 962 |
inputs=[audio_input],
|
| 963 |
outputs=[audio_input],
|
| 964 |
+
fn=apply_audio
|
| 965 |
)
|
| 966 |
|
| 967 |
infer_btn.click(
|
|
|
|
| 975 |
inputs=[session_state],
|
| 976 |
outputs=[audio_input]
|
| 977 |
).then(
|
| 978 |
+
fn=apply_audio,
|
| 979 |
inputs=[audio_input],
|
| 980 |
outputs=[audio_input]
|
| 981 |
).then(
|
|
|
|
| 983 |
inputs=[audio_input, limit_on, session_state],
|
| 984 |
outputs=[audio_input],
|
| 985 |
)
|
| 986 |
+
image_input.orientation(fn=orientation_changed, inputs=[session_state]).then(fn=preprocess_img, inputs=[image_input, raw_img_text, session_state], outputs=[image_input, raw_img_text])
|
| 987 |
+
image_input.clear(fn=clear_raw_image, outputs=[raw_img_text])
|
| 988 |
+
image_input.upload(fn=preprocess_img, inputs=[image_input, raw_img_text, session_state], outputs=[image_input, raw_img_text])
|
| 989 |
image_input.change(fn=update_generate_button, inputs=[image_input, audio_input, text_input, num_steps, session_state], outputs=[time_required])
|
| 990 |
audio_input.change(fn=update_generate_button, inputs=[image_input, audio_input, text_input, num_steps, session_state], outputs=[time_required])
|
| 991 |
num_steps.change(fn=slider_value_change, inputs=[image_input, audio_input, text_input, num_steps, session_state, adaptive_text], outputs=[time_required, text_input])
|
| 992 |
adaptive_text.change(fn=check_box_clicked, inputs=[adaptive_text], outputs=[text_input])
|
| 993 |
+
audio_input.upload(fn=apply_audio, inputs=[audio_input], outputs=[audio_input]
|
| 994 |
).then(
|
| 995 |
fn=preprocess_audio_first_5s_librosa,
|
| 996 |
inputs=[audio_input, limit_on, session_state],
|