alex committed on
Commit
33b0370
·
1 Parent(s): 23fe401

aspect ratio options added

Browse files
Files changed (1) hide show
  1. app.py +84 -28
app.py CHANGED
@@ -58,6 +58,8 @@ from functools import partial
58
  from omegaconf import OmegaConf
59
  from argparse import Namespace
60
  from gradio_extendedaudio import ExtendedAudio
 
 
61
  import torchaudio
62
 
63
  # load the one true config you dumped
@@ -561,12 +563,15 @@ def slider_value_change(image_path, audio_path, text, num_steps, session_state,
561
 
562
  if adaptive_text:
563
 
564
- if num_steps < 8:
565
- text = ADAPTIVE_PROMPT_TEMPLATES[1]
566
- elif num_steps < 10:
567
- text = ADAPTIVE_PROMPT_TEMPLATES[1]
 
 
 
568
  else:
569
- text = ADAPTIVE_PROMPT_TEMPLATES[2]
570
 
571
  return update_generate_button(image_path, audio_path, text, num_steps, session_state), text
572
 
@@ -614,12 +619,18 @@ def get_duration(image_path, audio_path, text, num_steps, session_id, progress):
614
 
615
  return int(duration_s)
616
 
617
- def preprocess_img(image_path, session_id = None):
618
 
619
  if session_id is None:
620
  session_id = uuid.uuid4().hex
 
 
 
 
 
 
621
 
622
- image = Image.open(image_path).convert("RGB")
623
 
624
  image = inferpipe.transform(image).unsqueeze(0).to(dtype=inferpipe.dtype)
625
 
@@ -638,8 +649,18 @@ def preprocess_img(image_path, session_id = None):
638
  image = tensor_to_pil(image)
639
  image.save(input_img_path)
640
 
641
- return input_img_path
 
 
 
 
 
 
 
642
 
 
 
 
643
 
644
  @spaces.GPU(duration=get_duration)
645
  def infer(image_path, audio_path, text, num_steps, session_id = None, progress=gr.Progress(track_tqdm=True),):
@@ -654,6 +675,7 @@ def infer(image_path, audio_path, text, num_steps, session_id = None, progress=g
654
 
655
  if session_id is None:
656
  session_id = uuid.uuid4().hex
 
657
 
658
  output_dir = os.path.join(os.environ["PROCESSED_RESULTS"], session_id)
659
 
@@ -693,7 +715,10 @@ def infer(image_path, audio_path, text, num_steps, session_id = None, progress=g
693
 
694
  return video_paths[0]
695
 
696
- def apply(request):
 
 
 
697
 
698
  return request
699
 
@@ -712,6 +737,22 @@ def check_box_clicked(adapative_tick):
712
  print("checkbox clicked")
713
  return gr.update(interactive=not adapative_tick)
714
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
715
  def preprocess_audio_first_5s_librosa(audio_path, limit_on, session_id=None):
716
  """
717
  If the uploaded audio is < 5s, return it unchanged.
@@ -808,7 +849,7 @@ with gr.Blocks(css=css) as demo:
808
 
809
  with gr.Column():
810
 
811
- image_input = gr.Image(label="Reference Image", type="filepath", height=512)
812
  audio_input = ExtendedAudio(label="Input Audio", type="filepath", options=["EMPTY"], show_download_button=True)
813
  gr.Markdown("*A 5-second limit is applied to audio files to shorten generation time. You can turn this off in Advanced Settings*")
814
 
@@ -821,6 +862,7 @@ with gr.Blocks(css=css) as demo:
821
  time_required = gr.Text(value="⌚ Zero GPU Required: --", show_label=False)
822
  infer_btn = gr.Button("🦜 Avatar Me", variant="primary")
823
  with gr.Accordion("Advanced Settings", open=False):
 
824
  limit_on = gr.Checkbox(label="Limit Audio files to 5 seconds", value=True)
825
  adaptive_text = gr.Checkbox(label="Adaptive Video Prompt", value=True)
826
  text_input = gr.Textbox(show_label=False, lines=6, elem_classes=["stateful"], interactive=False, value= ADAPTIVE_PROMPT_TEMPLATES[1])
@@ -829,31 +871,36 @@ with gr.Blocks(css=css) as demo:
829
 
830
  cached_examples = gr.Examples(
831
  examples=[
 
832
  [
833
- "examples/images/male-001.png",
834
- "examples/audios/denial.wav",
835
  ADAPTIVE_PROMPT_TEMPLATES[2],
836
- 12
 
837
  ],
838
 
839
  [
840
  "examples/images/female-001.png",
841
  "examples/audios/script.wav",
842
  ADAPTIVE_PROMPT_TEMPLATES[2],
843
- 14
 
844
  ],
845
-
846
  [
847
- "examples/images/female-002.png",
848
- "examples/audios/nature.wav",
849
  ADAPTIVE_PROMPT_TEMPLATES[2],
850
- 10
 
851
  ],
 
852
  ],
853
  label="Cached Examples",
854
- inputs=[image_input, audio_input, text_input, num_steps],
855
  outputs=[output_video],
856
- fn=infer,
857
  cache_examples=True
858
  )
859
 
@@ -864,10 +911,11 @@ with gr.Blocks(css=css) as demo:
864
  "examples/audios/listen.wav",
865
  ADAPTIVE_PROMPT_TEMPLATES[1],
866
  8,
 
867
  ],
868
  ],
869
  label="Uncached Examples",
870
- inputs=[image_input, audio_input, text_input, num_steps],
871
  cache_examples=False
872
  )
873
 
@@ -882,11 +930,14 @@ with gr.Blocks(css=css) as demo:
882
  [
883
  "examples/images/female-003.png",
884
  ],
 
 
 
885
  ],
886
  label="Image Samples",
887
  inputs=[image_input],
888
- outputs=[image_input],
889
- fn=apply
890
  )
891
 
892
  audio_examples = gr.Examples(
@@ -902,11 +953,15 @@ with gr.Blocks(css=css) as demo:
902
  [
903
  "examples/audios/matcha.wav",
904
  ],
 
 
 
 
905
  ],
906
  label="Audio Samples",
907
  inputs=[audio_input],
908
  outputs=[audio_input],
909
- fn=apply
910
  )
911
 
912
  infer_btn.click(
@@ -920,7 +975,7 @@ with gr.Blocks(css=css) as demo:
920
  inputs=[session_state],
921
  outputs=[audio_input]
922
  ).then(
923
- fn=apply,
924
  inputs=[audio_input],
925
  outputs=[audio_input]
926
  ).then(
@@ -928,13 +983,14 @@ with gr.Blocks(css=css) as demo:
928
  inputs=[audio_input, limit_on, session_state],
929
  outputs=[audio_input],
930
  )
931
-
932
- image_input.upload(fn=preprocess_img, inputs=[image_input, session_state], outputs=[image_input])
 
933
  image_input.change(fn=update_generate_button, inputs=[image_input, audio_input, text_input, num_steps, session_state], outputs=[time_required])
934
  audio_input.change(fn=update_generate_button, inputs=[image_input, audio_input, text_input, num_steps, session_state], outputs=[time_required])
935
  num_steps.change(fn=slider_value_change, inputs=[image_input, audio_input, text_input, num_steps, session_state, adaptive_text], outputs=[time_required, text_input])
936
  adaptive_text.change(fn=check_box_clicked, inputs=[adaptive_text], outputs=[text_input])
937
- audio_input.upload(fn=apply, inputs=[audio_input], outputs=[audio_input]
938
  ).then(
939
  fn=preprocess_audio_first_5s_librosa,
940
  inputs=[audio_input, limit_on, session_state],
 
58
  from omegaconf import OmegaConf
59
  from argparse import Namespace
60
  from gradio_extendedaudio import ExtendedAudio
61
+ from gradio_extendedimage import extendedimage
62
+
63
  import torchaudio
64
 
65
  # load the one true config you dumped
 
563
 
564
  if adaptive_text:
565
 
566
+ if not args.image_sizes_720 == [[720, 720]]:
567
+ if num_steps < 8:
568
+ text = ADAPTIVE_PROMPT_TEMPLATES[1]
569
+ elif num_steps < 10:
570
+ text = ADAPTIVE_PROMPT_TEMPLATES[1]
571
+ else:
572
+ text = ADAPTIVE_PROMPT_TEMPLATES[2]
573
  else:
574
+ text = ADAPTIVE_PROMPT_TEMPLATES[1]
575
 
576
  return update_generate_button(image_path, audio_path, text, num_steps, session_state), text
577
 
 
619
 
620
  return int(duration_s)
621
 
622
+ def preprocess_img(input_image_path, raw_image_path, session_id = None):
623
 
624
  if session_id is None:
625
  session_id = uuid.uuid4().hex
626
+
627
+ if input_image_path is None:
628
+ return None, None
629
+
630
+ if raw_image_path is '':
631
+ raw_image_path = input_image_path
632
 
633
+ image = Image.open(raw_image_path).convert("RGB")
634
 
635
  image = inferpipe.transform(image).unsqueeze(0).to(dtype=inferpipe.dtype)
636
 
 
649
  image = tensor_to_pil(image)
650
  image.save(input_img_path)
651
 
652
+ return input_img_path, raw_image_path
653
+
654
+ def infer_example(image_path, audio_path, text, num_steps, raw_image_path, session_id = None, progress=gr.Progress(track_tqdm=True),):
655
+
656
+ current_image_size = args.image_sizes_720
657
+ args.image_sizes_720 = [[720, 400]]
658
+
659
+ result = infer(image_path, audio_path, text, num_steps, session_id, progress)
660
 
661
+ args.image_sizes_720 = current_image_size
662
+
663
+ return result
664
 
665
  @spaces.GPU(duration=get_duration)
666
  def infer(image_path, audio_path, text, num_steps, session_id = None, progress=gr.Progress(track_tqdm=True),):
 
675
 
676
  if session_id is None:
677
  session_id = uuid.uuid4().hex
678
+
679
 
680
  output_dir = os.path.join(os.environ["PROCESSED_RESULTS"], session_id)
681
 
 
715
 
716
  return video_paths[0]
717
 
718
+ def apply_image(request):
719
+ return request, None
720
+
721
+ def apply_audio(request):
722
 
723
  return request
724
 
 
737
  print("checkbox clicked")
738
  return gr.update(interactive=not adapative_tick)
739
 
740
+ def orientation_changed(session_id, evt: gr.EventData):
741
+
742
+ detail = getattr(evt, "data", None) or getattr(evt, "_data", {}) or {}
743
+
744
+ if detail['value'] == "9:16":
745
+ args.image_sizes_720 = [[720, 400]]
746
+ elif detail['value'] == "1:1":
747
+ args.image_sizes_720 = [[720, 720]]
748
+ elif detail['value'] == "16:9":
749
+ args.image_sizes_720 = [[400, 720]]
750
+
751
+ print(f'{session_id} has {args.image_sizes_720} orientation')
752
+
753
+ def clear_raw_image():
754
+ return ''
755
+
756
  def preprocess_audio_first_5s_librosa(audio_path, limit_on, session_id=None):
757
  """
758
  If the uploaded audio is < 5s, return it unchanged.
 
849
 
850
  with gr.Column():
851
 
852
+ image_input = extendedimage(label="Reference Image", type="filepath", height=512)
853
  audio_input = ExtendedAudio(label="Input Audio", type="filepath", options=["EMPTY"], show_download_button=True)
854
  gr.Markdown("*A 5-second limit is applied to audio files to shorten generation time. You can turn this off in Advanced Settings*")
855
 
 
862
  time_required = gr.Text(value="⌚ Zero GPU Required: --", show_label=False)
863
  infer_btn = gr.Button("🦜 Avatar Me", variant="primary")
864
  with gr.Accordion("Advanced Settings", open=False):
865
+ raw_img_text = gr.Text(show_label=False, label="", value='', visible=False)
866
  limit_on = gr.Checkbox(label="Limit Audio files to 5 seconds", value=True)
867
  adaptive_text = gr.Checkbox(label="Adaptive Video Prompt", value=True)
868
  text_input = gr.Textbox(show_label=False, lines=6, elem_classes=["stateful"], interactive=False, value= ADAPTIVE_PROMPT_TEMPLATES[1])
 
871
 
872
  cached_examples = gr.Examples(
873
  examples=[
874
+
875
  [
876
+ "examples/images/creature-001.png",
877
+ "examples/audios/keen.wav",
878
  ADAPTIVE_PROMPT_TEMPLATES[2],
879
+ 20,
880
+ ''
881
  ],
882
 
883
  [
884
  "examples/images/female-001.png",
885
  "examples/audios/script.wav",
886
  ADAPTIVE_PROMPT_TEMPLATES[2],
887
+ 14,
888
+ ''
889
  ],
890
+
891
  [
892
+ "examples/images/male-001.png",
893
+ "examples/audios/denial.wav",
894
  ADAPTIVE_PROMPT_TEMPLATES[2],
895
+ 12,
896
+ ''
897
  ],
898
+
899
  ],
900
  label="Cached Examples",
901
+ inputs=[image_input, audio_input, text_input, num_steps, raw_img_text],
902
  outputs=[output_video],
903
+ fn=infer_example,
904
  cache_examples=True
905
  )
906
 
 
911
  "examples/audios/listen.wav",
912
  ADAPTIVE_PROMPT_TEMPLATES[1],
913
  8,
914
+ ''
915
  ],
916
  ],
917
  label="Uncached Examples",
918
+ inputs=[image_input , audio_input, text_input, num_steps, raw_img_text],
919
  cache_examples=False
920
  )
921
 
 
930
  [
931
  "examples/images/female-003.png",
932
  ],
933
+ [
934
+ "examples/images/female-002.png",
935
+ ],
936
  ],
937
  label="Image Samples",
938
  inputs=[image_input],
939
+ outputs=[image_input, raw_img_text],
940
+ fn=apply_image
941
  )
942
 
943
  audio_examples = gr.Examples(
 
953
  [
954
  "examples/audios/matcha.wav",
955
  ],
956
+
957
+ [
958
+ "examples/audios/nature.wav",
959
+ ],
960
  ],
961
  label="Audio Samples",
962
  inputs=[audio_input],
963
  outputs=[audio_input],
964
+ fn=apply_audio
965
  )
966
 
967
  infer_btn.click(
 
975
  inputs=[session_state],
976
  outputs=[audio_input]
977
  ).then(
978
+ fn=apply_audio,
979
  inputs=[audio_input],
980
  outputs=[audio_input]
981
  ).then(
 
983
  inputs=[audio_input, limit_on, session_state],
984
  outputs=[audio_input],
985
  )
986
+ image_input.orientation(fn=orientation_changed, inputs=[session_state]).then(fn=preprocess_img, inputs=[image_input, raw_img_text, session_state], outputs=[image_input, raw_img_text])
987
+ image_input.clear(fn=clear_raw_image, outputs=[raw_img_text])
988
+ image_input.upload(fn=preprocess_img, inputs=[image_input, raw_img_text, session_state], outputs=[image_input, raw_img_text])
989
  image_input.change(fn=update_generate_button, inputs=[image_input, audio_input, text_input, num_steps, session_state], outputs=[time_required])
990
  audio_input.change(fn=update_generate_button, inputs=[image_input, audio_input, text_input, num_steps, session_state], outputs=[time_required])
991
  num_steps.change(fn=slider_value_change, inputs=[image_input, audio_input, text_input, num_steps, session_state, adaptive_text], outputs=[time_required, text_input])
992
  adaptive_text.change(fn=check_box_clicked, inputs=[adaptive_text], outputs=[text_input])
993
+ audio_input.upload(fn=apply_audio, inputs=[audio_input], outputs=[audio_input]
994
  ).then(
995
  fn=preprocess_audio_first_5s_librosa,
996
  inputs=[audio_input, limit_on, session_state],