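# MuseV Gradio demo app: Text to Video and Video to Video generation tabs,
# intended for deployment as a Hugging Face Space.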
import os
import time

import gradio as gr
from huggingface_hub import snapshot_download

ProjectDir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
CheckpointsDir = os.path.join(ProjectDir, "checkpoints")
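
# Download the MuseV checkpoints from the Hugging Face Hub if they are not
# already present under checkpoints/.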
def download_model():
    if not os.path.exists(CheckpointsDir):
        print("Checkpoints not found, downloading...")
        tic = time.time()
        snapshot_download(
            repo_id="TMElyralab/MuseV",
            local_dir=CheckpointsDir,
            max_workers=8,
        )
        toc = time.time()
        print(f"Download took {toc - tic:.2f} seconds")
    else:
        print("Model already downloaded.")


download_model()  # for Hugging Face deployment.
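
# Imported after download_model() so that the checkpoint files are already in
# place when these inference modules initialize.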
from gradio_video2video import online_v2v_inference
from gradio_text2video import online_t2v_inference
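
# Sync the W/H fields to an uploaded image's size; currently unused because the
# image.change(...) hooks below are commented out.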
def update_shape(image):
    if image is not None:
        h, w, _ = image.shape
    else:
        h, w = 768, 512
    return w, h
class ConcatenateBlock:
    """Accumulate selected control options into a comma-separated string.

    e.g. update_string("pose") -> "pose"; update_string("canny") -> "pose, canny";
    a choice that is already present is ignored.
    """

    def __init__(self, options):
        self.options = options
        self.current_string = ""

    def update_string(self, new_choice):
        # Append the new choice unless it is already in the string.
        if new_choice and new_choice not in self.current_string.split(", "):
            if self.current_string == "":
                self.current_string = new_choice
            else:
                self.current_string += ", " + new_choice
        return self.current_string
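
# Gradio callback: returns the accumulated control string, plus an empty string
# to clear the selector input (the radio wiring below is commented out).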
def process_input(new_choice):
    return concatenate_block.update_string(new_choice), ""

control_options = [
    "pose",
    "pose_body",
    "pose_hand",
    "pose_face",
    "pose_hand_body",
    "pose_hand_face",
    "dwpose",
    "dwpose_face",
    "dwpose_hand",
    "dwpose_body",
    "dwpose_body_hand",
    "canny",
    "tile",
    "hed",
    "hed_scribble",
    "depth",
    "pidi",
    "normal_bae",
    "lineart",
    "lineart_anime",
    "zoe",
    "sam",
    "mobile_sam",
    "leres",
    "content",
    "face_detector",
]
concatenate_block = ConcatenateBlock(control_options)

css = """#input_img {max-width: 1024px !important} #output_vid {max-width: 1024px; max-height: 576px}"""

with gr.Blocks(css=css) as demo:
    gr.Markdown(
        "<div align='center'> <h1> MuseV: Infinite-length and High Fidelity Virtual Human Video Generation with Visual Conditioned Parallel Denoising</h1> \
        <h2 style='font-weight: 450; font-size: 1rem; margin: 0rem'>\
        <br/>\
        Zhiqiang Xia <sup>*</sup>,\
        Zhaokang Chen<sup>*</sup>,\
        Bin Wu<sup>†</sup>,\
        Chao Li,\
        Kwok-Wai Hung,\
        Chao Zhan,\
        Yingjie He,\
        Wenjiang Zhou\
        (<sup>*</sup>Equal Contribution, <sup>†</sup>Corresponding Author, benbinwu@tencent.com)\
        <br/>\
        Lyra Lab, Tencent Music Entertainment\
        </h2> \
        <a style='font-size:18px;color: #000000' href='https://github.com/TMElyralab/MuseV'>[GitHub Repo]</a>\
        <a style='font-size:18px;color: #000000' href=''> [ArXiv (Coming Soon)] </a>\
        <a style='font-size:18px;color: #000000' href=''> [Project Page (Coming Soon)] </a>\
        <a style='font-size:18px;color: #000000'>If MuseV is useful, please help star the repo; stars matter a lot to open-source projects. Thanks!</a> </div>"
    )
    with gr.Tab("Text to Video"):
        with gr.Row():
            with gr.Column():
                prompt = gr.Textbox(label="Prompt")
                image = gr.Image(label="VisionCondImage")
                gr.Markdown("seed=-1 means a different random seed is used on every run")
                seed = gr.Number(label="Seed", value=-1)
                video_length = gr.Number(label="Video Length", value=12)
                fps = gr.Number(label="Generate Video FPS", value=6)
                gr.Markdown(
                    (
                        "If W&H is -1, the Reference Image's size is used. The target video size is $(W, H) * img\\_edge\\_ratio$. \n"
                        "The smaller the image size, the larger the motion amplitude and the lower the video quality.\n"
                        "The larger the W&H, the smaller the motion amplitude and the higher the video quality."
                    )
                )
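                # Per the note above, the target video size is (w, h) * img_edge_ratio;
                # -1 means "use the reference image's size".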
                with gr.Row():
                    w = gr.Number(label="Width", value=-1)
                    h = gr.Number(label="Height", value=-1)
                    img_edge_ratio = gr.Number(label="img_edge_ratio", value=1.0)
            btn1 = gr.Button("Generate")
            out = gr.Video()
        with gr.Row():
            board = gr.Dataframe(
                value=[["", "", ""]] * 3,
                interactive=False,
                type="array",
                label="Demo Video",
            )
        # image.change(fn=update_shape, inputs=[image], outputs=[w, h])
        btn1.click(
            fn=online_t2v_inference,
            inputs=[prompt, image, seed, fps, w, h, video_length, img_edge_ratio],
            outputs=out,
        )
| with gr.Tab("Video to Video"): | |
| with gr.Row(): | |
| with gr.Column(): | |
| prompt = gr.Textbox(label="Prompt") | |
| gr.Markdown( | |
| ( | |
| "pose of VisionCondImage should be same as of the first frame of the video. " | |
| "its better generate target first frame whose pose is same as of first frame of the video with text2image tool, sch as MJ, SDXL." | |
| ) | |
| ) | |
| image = gr.Image(label="VisionCondImage") | |
| video = gr.Video(label="ReferVideo") | |
| # radio = gr.inputs.Radio(, label="Select an option") | |
| # ctr_button = gr.inputs.Button(label="Add ControlNet List") | |
| # output_text = gr.outputs.Textbox() | |
| processor = gr.Textbox( | |
| label=f"Control Condition. gradio code now only support dwpose_body_hand, use command can support multi of {control_options}", | |
| value="dwpose_body_hand", | |
| ) | |
| gr.Markdown("seed=-1 means that seeds are different in every run") | |
| seed = gr.Number(label="Seed", value=-1) | |
| video_length = gr.Number(label="Video Length", value=12) | |
| fps = gr.Number(label="Generate Video FPS", value=6) | |
                gr.Markdown(
                    (
                        "If W&H is -1, the Reference Image's size is used. The target video size is $(W, H) * img\\_edge\\_ratio$. \n"
                        "The smaller the image size, the larger the motion amplitude and the lower the video quality.\n"
                        "The larger the W&H, the smaller the motion amplitude and the higher the video quality."
                    )
                )
                with gr.Row():
                    w = gr.Number(label="Width", value=-1)
                    h = gr.Number(label="Height", value=-1)
                    img_edge_ratio = gr.Number(label="img_edge_ratio", value=1.0)
            btn2 = gr.Button("Generate")
            out1 = gr.Video()
        # image.change(fn=update_shape, inputs=[image], outputs=[w, h])
        btn2.click(
            fn=online_v2v_inference,
            inputs=[
                prompt,
                image,
                video,
                processor,
                seed,
                fps,
                w,
                h,
                video_length,
                img_edge_ratio,
            ],
            outputs=out1,
        )

# Set the IP and port
ip_address = "0.0.0.0"  # Replace with your desired IP address
port_number = 7860  # Replace with your desired port number

demo.queue().launch(
    share=False, debug=True, server_name=ip_address, server_port=port_number
)