Spaces:

PeiqingYang
/

MatAnyone

Running on Zero

App Files Files Community

PeiqingYang

hysts HF Staff committed on Mar 15

Commit

ea74e2d

1 Parent(s): 40efe7a

Migrate to ZeroGPU + Gradio 6 with bug fixes (#6)

Browse files

- Migrate to ZeroGPU + Gradio 6 with bug fixes (a7e19d44de87a1baae3c06e890d6ac6255b8ebbd)

Co-authored-by: hysts <hysts@users.noreply.huggingface.co>

Files changed (6) hide show

README.md +2 -2
hugging_face/app.py +208 -180
hugging_face/tools/base_segmenter.py +0 -1
hugging_face/tools/download_util.py +0 -1
hugging_face/tools/interact_tools.py +1 -3
requirements.txt +312 -37

README.md CHANGED Viewed

@@ -4,8 +4,8 @@ emoji: 🤡
 colorFrom: red
 colorTo: green
 sdk: gradio
-sdk_version: 4.31.0
-python_version: 3.10.13
 app_file: hugging_face/app.py
 pinned: false
 license: other

 colorFrom: red
 colorTo: green
 sdk: gradio
+sdk_version: 6.9.0
+python_version: 3.12.12
 app_file: hugging_face/app.py
 pinned: false
 license: other

hugging_face/app.py CHANGED Viewed

@@ -1,10 +1,13 @@
 import sys
-sys.path.append("../")
-sys.path.append("../../")
 import os
 import json
 import time
 import psutil
 import ffmpeg
 import imageio
@@ -12,9 +15,9 @@ import argparse
 from PIL import Image
 import cv2
-import torch
 import numpy as np
 import gradio as gr
 from tools.painter import mask_painter
 from tools.interact_tools import SamControler
@@ -75,14 +78,23 @@ def get_frames_from_image(image_input, image_state):
     Args:
         video_path:str
         timestamp:float64
-    Return
         [[0:nearest_frame], [nearest_frame:], nearest_frame]
     """
     user_name = time.time()
     frames = [image_input] * 2  # hardcode: mimic a video with 2 frames
-    image_size = (frames[0].shape[0],frames[0].shape[1])
-    # initialize video_state
     image_state = {
         "user_name": user_name,
         "image_name": "output.png",
@@ -94,16 +106,15 @@ def get_frames_from_image(image_input, image_state):
         "fps": None
         }
     image_info = "Image Name: N/A,\nFPS: N/A,\nTotal Frames: {},\nImage Size:{}".format(len(frames), image_size)
-    model.samcontroler.sam_controler.reset_image()
-    model.samcontroler.sam_controler.set_image(image_state["origin_images"][0])
-    return image_state, image_info, image_state["origin_images"][0], \
-                        gr.update(visible=True, maximum=10, value=10), gr.update(visible=False, maximum=len(frames), value=len(frames)), \
-                        gr.update(visible=True), gr.update(visible=True), \
-                        gr.update(visible=True), gr.update(visible=True),\
-                        gr.update(visible=True), gr.update(visible=True), \
-                        gr.update(visible=True), gr.update(visible=False), \
-                        gr.update(visible=False), gr.update(visible=True), \
-                        gr.update(visible=True)
 # extract frames from upload video
 def get_frames_from_video(video_input, video_state):
@@ -123,10 +134,12 @@ def get_frames_from_video(video_input, video_state):
         audio_path = video_input.replace(".mp4", "_audio.wav")
         ffmpeg.input(video_path).output(audio_path, format='wav', acodec='pcm_s16le', ac=2, ar='44100').run(overwrite_output=True, quiet=True)
     except Exception as e:
-        print(f"Audio extraction error: {str(e)}")
         audio_path = ""  # Set to "" if extraction fails
     # extract frames
     try:
         cap = cv2.VideoCapture(video_path)
         fps = cap.get(cv2.CAP_PROP_FPS)
@@ -135,16 +148,24 @@ def get_frames_from_video(video_input, video_state):
             if ret == True:
                 current_memory_usage = psutil.virtual_memory().percent
                 frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                 if current_memory_usage > 90:
                     break
             else:
                 break
     except (OSError, TypeError, ValueError, KeyError, SyntaxError) as e:
         print("read_frame_source:{} error. {}\n".format(video_path, str(e)))
-    image_size = (frames[0].shape[0],frames[0].shape[1])
     # [remove for local demo] resize if resolution too big
-    if image_size[0]>=1080 and image_size[0]>=1080:
         scale = 1080 / min(image_size)
         new_w = int(image_size[1] * scale)
         new_h = int(image_size[0] * scale)
@@ -165,16 +186,38 @@ def get_frames_from_video(video_input, video_state):
         "fps": fps,
         "audio": audio_path
         }
     video_info = "Video Name: {},\nFPS: {},\nTotal Frames: {},\nImage Size:{}".format(video_state["video_name"], round(video_state["fps"], 0), len(frames), image_size)
-    model.samcontroler.sam_controler.reset_image()
-    model.samcontroler.sam_controler.set_image(video_state["origin_images"][0])
-    return video_state, video_info, video_state["origin_images"][0], gr.update(visible=True, maximum=len(frames), value=1), gr.update(visible=False, maximum=len(frames), value=len(frames)), \
-                        gr.update(visible=True), gr.update(visible=True), \
-                        gr.update(visible=True), gr.update(visible=True),\
-                        gr.update(visible=True), gr.update(visible=True), \
-                        gr.update(visible=True), gr.update(visible=False), \
-                        gr.update(visible=False), gr.update(visible=True), \
-                        gr.update(visible=True)
 # get the select frame from gradio slider
 def select_video_template(image_selection_slider, video_state, interactive_state):
@@ -182,10 +225,6 @@ def select_video_template(image_selection_slider, video_state, interactive_state
     image_selection_slider -= 1
     video_state["select_frame_number"] = image_selection_slider
-    # once select a new template frame, set the image in sam
-    model.samcontroler.sam_controler.reset_image()
-    model.samcontroler.sam_controler.set_image(video_state["origin_images"][image_selection_slider])
     return video_state["painted_images"][image_selection_slider], video_state, interactive_state
 def select_image_template(image_selection_slider, video_state, interactive_state):
@@ -193,10 +232,6 @@ def select_image_template(image_selection_slider, video_state, interactive_state
     image_selection_slider = 0 # fixed for image
     video_state["select_frame_number"] = image_selection_slider
-    # once select a new template frame, set the image in sam
-    model.samcontroler.sam_controler.reset_image()
-    model.samcontroler.sam_controler.set_image(video_state["origin_images"][image_selection_slider])
     return video_state["painted_images"][image_selection_slider], video_state, interactive_state
 # set the tracking end frame
@@ -206,36 +241,40 @@ def get_end_number(track_pause_number_slider, video_state, interactive_state):
     return video_state["painted_images"][track_pause_number_slider],interactive_state
 # use sam to get the mask
-def sam_refine(video_state, point_prompt, click_state, interactive_state, evt:gr.SelectData):
-    """
-    Args:
-        template_frame: PIL.Image
-        point_prompt: flag for positive or negative button click
-        click_state: [[points], [labels]]
-    """
-    if point_prompt == "Positive":
-        coordinate = "[[{},{},1]]".format(evt.index[0], evt.index[1])
-        interactive_state["positive_click_times"] += 1
-    else:
-        coordinate = "[[{},{},0]]".format(evt.index[0], evt.index[1])
-        interactive_state["negative_click_times"] += 1
     # prompt for sam model
     model.samcontroler.sam_controler.reset_image()
-    model.samcontroler.sam_controler.set_image(video_state["origin_images"][video_state["select_frame_number"]])
     prompt = get_prompt(click_state=click_state, click_input=coordinate)
-    mask, logit, painted_image = model.first_frame_click(
-                                                      image=video_state["origin_images"][video_state["select_frame_number"]],
                                                       points=np.array(prompt["input_point"]),
                                                       labels=np.array(prompt["input_label"]),
                                                       multimask=prompt["multimask_output"],
                                                       )
     video_state["masks"][video_state["select_frame_number"]] = mask
     video_state["logits"][video_state["select_frame_number"]] = logit
     video_state["painted_images"][video_state["select_frame_number"]] = painted_image
-    return painted_image, video_state, interactive_state
 def add_multi_mask(video_state, interactive_state, mask_dropdown):
     mask = video_state["masks"][video_state["select_frame_number"]]
@@ -269,6 +308,7 @@ def show_mask(video_state, interactive_state, mask_dropdown):
         return select_frame
 # image matting
 def image_matting(video_state, interactive_state, mask_dropdown, erode_kernel_size, dilate_kernel_size, refine_iter, model_selection):
     # Load model if not already loaded
     try:
@@ -298,9 +338,8 @@ def image_matting(video_state, interactive_state, mask_dropdown, erode_kernel_si
     else:
         template_mask = video_state["masks"][video_state["select_frame_number"]]
-    # operation error
-    if len(np.unique(template_mask))==1:
-        template_mask[0][0]=1
     foreground, alpha = matanyone2(matanyone_processor, following_frames, template_mask*255, r_erode=erode_kernel_size, r_dilate=dilate_kernel_size, n_warmup=refine_iter)
     foreground_output = Image.fromarray(foreground[-1])
     alpha_output = Image.fromarray(alpha[-1][:,:,0])
@@ -308,6 +347,7 @@ def image_matting(video_state, interactive_state, mask_dropdown, erode_kernel_si
     return foreground_output, alpha_output
 # video matting
 def video_matting(video_state, interactive_state, mask_dropdown, erode_kernel_size, dilate_kernel_size, model_selection):
     # Load model if not already loaded
     try:
@@ -340,13 +380,13 @@ def video_matting(video_state, interactive_state, mask_dropdown, erode_kernel_si
     audio_path = video_state["audio"]
-    # operation error
-    if len(np.unique(template_mask))==1:
-        template_mask[0][0]=1
     foreground, alpha = matanyone2(matanyone_processor, following_frames, template_mask*255, r_erode=erode_kernel_size, r_dilate=dilate_kernel_size)
-    foreground_output = generate_video_from_frames(foreground, output_path="./results/{}_fg.mp4".format(video_state["video_name"]), fps=fps, audio_path=audio_path) # import video_input to name the output video
-    alpha_output = generate_video_from_frames(alpha, output_path="./results/{}_alpha.mp4".format(video_state["video_name"]), fps=fps, gray2rgb=True, audio_path=audio_path) # import video_input to name the output video
     return foreground_output, alpha_output
@@ -403,9 +443,24 @@ def generate_video_from_frames(frames, output_path, fps=30, gray2rgb=False, audi
         return output_path
     return video_temp_path
 # reset all states for a new input
 def restart():
-    return {
             "user_name": "",
             "video_name": "",
             "origin_images": None,
@@ -415,9 +470,10 @@ def restart():
             "logits": None,
             "select_frame_number": 0,
             "fps": 30
-        }, {
             "inference_times": 0,
-            "negative_click_times" : 0,
             "positive_click_times": 0,
             "mask_save": args.mask_save,
             "multi_mask": {
@@ -425,11 +481,24 @@ def restart():
                 "masks": []
             },
             "track_end_number": None,
-        }, [[],[]], None, None, \
-        gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False),\
-        gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), \
-        gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), \
-        gr.update(visible=False), gr.update(visible=False, choices=[], value=[]), "", gr.update(visible=False)
 # args, defined in track_anything.py
 args = parse_augment()
@@ -438,7 +507,7 @@ sam_checkpoint_url_dict = {
     'vit_l': "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_l_0b3195.pth",
     'vit_b': "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pth"
 }
-checkpoint_folder = os.path.join('/home/user/app/', 'pretrained_models')
 sam_checkpoint = load_file_from_url(sam_checkpoint_url_dict[args.sam_model_type], checkpoint_folder)
 # initialize sams
@@ -518,8 +587,12 @@ if not available_models:
     raise RuntimeError("No models are available! Please ensure at least one model file exists in ../pretrained_models/")
 default_model = "MatAnyone 2" if "MatAnyone 2" in available_models else available_models[0]
 # download test samples
-test_sample_path = os.path.join('/home/user/app/hugging_face/', "test_sample/")
 load_file_from_url('https://github.com/pq-yang/MatAnyone2/releases/download/media/test-sample-0-720p.mp4', test_sample_path)
 load_file_from_url('https://github.com/pq-yang/MatAnyone2/releases/download/media/test-sample-1-720p.mp4', test_sample_path)
 load_file_from_url('https://github.com/pq-yang/MatAnyone2/releases/download/media/test-sample-2-720p.mp4', test_sample_path)
@@ -532,7 +605,7 @@ load_file_from_url('https://github.com/pq-yang/MatAnyone2/releases/download/medi
 load_file_from_url('https://github.com/pq-yang/MatAnyone2/releases/download/media/test-sample-3.jpg', test_sample_path)
 # download assets
-assets_path = os.path.join('/home/user/app/hugging_face/', "assets/")
 load_file_from_url('https://github.com/pq-yang/MatAnyone/releases/download/media/tutorial_single_target.mp4', assets_path)
 load_file_from_url('https://github.com/pq-yang/MatAnyone/releases/download/media/tutorial_multi_targets.mp4', assets_path)
@@ -588,52 +661,24 @@ This project is built upon [Cutie](https://github.com/hkchengrex/Cutie), with th
 my_custom_css = """
 .gradio-container {width: 85% !important; margin: 0 auto;}
-.gr-monochrome-group {border-radius: 5px !important; border: revert-layer !important; border-width: 2px !important; color: black !important}
 button {border-radius: 8px !important;}
-.new_button {background-color: #171717 !important; color: #ffffff !important; border: none !important;}
 .green_button {background-color: #4CAF50 !important; color: #ffffff !important; border: none !important;}
-.new_button:hover {background-color: #4b4b4b !important;}
 .green_button:hover {background-color: #77bd79 !important;}
 .mask_button_group {gap: 10px !important;}
-.video .wrap.svelte-lcpz3o {
-    display: flex !important;
-    align-items: center !important;
-    justify-content: center !important;
-    height: auto !important;
-    max-height: 300px !important;
-}
-.video .wrap.svelte-lcpz3o > :first-child {
-    height: auto !important;
-    width: 100% !important;
-    object-fit: contain !important;
-}
-.video .container.svelte-sxyn79 {
-    display: none !important;
-}
 .margin_center {width: 50% !important; margin: auto !important;}
 .jc_center {justify-content: center !important;}
 .video-title {
     margin-bottom: 5px !important;
 }
 .custom-bg {
-        background-color: #f0f0f0;
-        padding: 10px;
-        border-radius: 10px;
-    }
-<style>
-@import url('https://fonts.googleapis.com/css2?family=Sarpanch:wght@400;500;600;700;800;900&family=Sen:wght@400..800&family=Sixtyfour+Convergence&family=Stardos+Stencil:wght@400;700&display=swap');
-body {
-    display: flex;
-    justify-content: center;
-    align-items: center;
-    height: 100vh;
-    margin: 0;
-    background-color: #0d1117;
-    font-family: Arial, sans-serif;
-    font-size: 18px;
-    }
 .title-container {
     text-align: center;
     padding: 0;
@@ -652,16 +697,16 @@ body {
 small {
     font-size: 60%;
 }
-</style>
 """
-with gr.Blocks(theme=gr.themes.Monochrome(), css=my_custom_css) as demo:
     gr.HTML('''
         <div class="title-container">
             <h1 class="title is-2 publication-title"
-                style="font-size:50px; font-family: 'Sarpanch', serif;
-                    background: linear-gradient(to right, #000000, #2dc464);
-                    display: inline-block; -webkit-background-clip: text;
                     -webkit-text-fill-color: transparent;">
                 MatAnyone Series
             </h1>
@@ -676,11 +721,11 @@ with gr.Blocks(theme=gr.themes.Monochrome(), css=my_custom_css) as demo:
                 with gr.Row():
                     with gr.Column():
                         gr.Markdown("### Case 1: Single Target")
-                        gr.Video(value="/home/user/app/hugging_face/assets/tutorial_single_target.mp4", elem_classes="video")
                     with gr.Column():
                         gr.Markdown("### Case 2: Multiple Targets")
-                        gr.Video(value="/home/user/app/hugging_face/assets/tutorial_multi_targets.mp4", elem_classes="video")
     with gr.Tabs():
         with gr.TabItem("Video"):
@@ -789,16 +834,19 @@ with gr.Blocks(theme=gr.themes.Monochrome(), css=my_custom_css) as demo:
                         alpha_output_button = gr.Button(value="Alpha Mask Output", visible=False, elem_classes="new_button")
-            # first step: get the video information
             extract_frames_button.click(
                 fn=get_frames_from_video,
-                inputs=[
-                    video_input, video_state
-                ],
-                outputs=[video_state, video_info, template_frame,
-                        image_selection_slider, track_pause_number_slider, point_prompt, clear_button_click, add_mask_button, matting_button, template_frame,
-                        foreground_video_output, alpha_video_output, foreground_output_button, alpha_output_button, mask_dropdown, step2_title]
-            )
             # second step: select images from slider
             image_selection_slider.release(fn=select_video_template,
@@ -812,7 +860,7 @@ with gr.Blocks(theme=gr.themes.Monochrome(), css=my_custom_css) as demo:
             template_frame.select(
                 fn=sam_refine,
                 inputs=[video_state, point_prompt, click_state, interactive_state],
-                outputs=[template_frame, video_state, interactive_state]
             )
             # add different mask
@@ -842,35 +890,20 @@ with gr.Blocks(theme=gr.themes.Monochrome(), css=my_custom_css) as demo:
                 outputs=[template_frame]
             )
-            # clear input
-            video_input.change(
-                fn=restart,
-                inputs=[],
-                outputs=[
-                    video_state,
-                    interactive_state,
-                    click_state,
-                    foreground_video_output, alpha_video_output,
-                    template_frame,
-                    image_selection_slider , track_pause_number_slider,point_prompt, clear_button_click,
-                    add_mask_button, matting_button, template_frame, foreground_video_output, alpha_video_output, remove_mask_button, foreground_output_button, alpha_output_button, mask_dropdown, video_info, step2_title
-                ],
-                queue=False,
-                show_progress=False)
             video_input.clear(
                 fn=restart,
                 inputs=[],
-                outputs=[
-                    video_state,
-                    interactive_state,
-                    click_state,
-                    foreground_video_output, alpha_video_output,
-                    template_frame,
-                    image_selection_slider , track_pause_number_slider,point_prompt, clear_button_click,
-                    add_mask_button, matting_button, template_frame, foreground_video_output, alpha_video_output, remove_mask_button, foreground_output_button, alpha_output_button, mask_dropdown, video_info, step2_title
-                ],
-                queue=False,
                 show_progress=False)
             # points clear
@@ -992,16 +1025,19 @@ with gr.Blocks(theme=gr.themes.Monochrome(), css=my_custom_css) as demo:
                         alpha_image_output = gr.Image(type="pil", label="Alpha Output", visible=False, elem_classes="image")
                         alpha_output_button = gr.Button(value="Alpha Mask Output", visible=False, elem_classes="new_button")
-            # first step: get the image information
             extract_frames_button.click(
                 fn=get_frames_from_image,
-                inputs=[
-                    image_input, image_state
-                ],
                 outputs=[image_state, image_info, template_frame,
-                        image_selection_slider, track_pause_number_slider,point_prompt, clear_button_click, add_mask_button, matting_button, template_frame,
-                        foreground_image_output, alpha_image_output, foreground_output_button, alpha_output_button, mask_dropdown, step2_title]
-            )
             # second step: select images from slider
             image_selection_slider.release(fn=select_image_template,
@@ -1015,7 +1051,7 @@ with gr.Blocks(theme=gr.themes.Monochrome(), css=my_custom_css) as demo:
             template_frame.select(
                 fn=sam_refine,
                 inputs=[image_state, point_prompt, click_state, interactive_state],
-                outputs=[template_frame, image_state, interactive_state]
             )
             # add different mask
@@ -1046,34 +1082,26 @@ with gr.Blocks(theme=gr.themes.Monochrome(), css=my_custom_css) as demo:
             )
             # clear input
             image_input.change(
                 fn=restart,
                 inputs=[],
-                outputs=[
-                    image_state,
-                    interactive_state,
-                    click_state,
-                    foreground_image_output, alpha_image_output,
-                    template_frame,
-                    image_selection_slider , track_pause_number_slider,point_prompt, clear_button_click,
-                    add_mask_button, matting_button, template_frame, foreground_image_output, alpha_image_output, remove_mask_button, foreground_output_button, alpha_output_button, mask_dropdown, image_info, step2_title
-                ],
-                queue=False,
                 show_progress=False)
             image_input.clear(
                 fn=restart,
                 inputs=[],
-                outputs=[
-                    image_state,
-                    interactive_state,
-                    click_state,
-                    foreground_image_output, alpha_image_output,
-                    template_frame,
-                    image_selection_slider , track_pause_number_slider,point_prompt, clear_button_click,
-                    add_mask_button, matting_button, template_frame, foreground_image_output, alpha_image_output, remove_mask_button, foreground_output_button, alpha_output_button, mask_dropdown, image_info, step2_title
-                ],
-                queue=False,
                 show_progress=False)
             # points clear
@@ -1094,4 +1122,4 @@ with gr.Blocks(theme=gr.themes.Monochrome(), css=my_custom_css) as demo:
     gr.Markdown(article)
 demo.queue()
-demo.launch(debug=True)

 import sys
+from pathlib import Path
+_HERE = Path(__file__).resolve().parent
+sys.path.insert(0, str(_HERE))          # hugging_face/ (for tools, matanyone2_wrapper)
+sys.path.insert(0, str(_HERE.parent))   # repo root (for matanyone2)
 import os
 import json
 import time
+import tempfile
 import psutil
 import ffmpeg
 import imageio
 from PIL import Image
 import cv2
 import numpy as np
 import gradio as gr
+import spaces
 from tools.painter import mask_painter
 from tools.interact_tools import SamControler
     Args:
         video_path:str
         timestamp:float64
+    Return
         [[0:nearest_frame], [nearest_frame:], nearest_frame]
     """
     user_name = time.time()
+    image_size = (image_input.shape[0], image_input.shape[1])
+    # resize if resolution too big
+    if image_size[0] >= 1080 and image_size[1] >= 1080:
+        scale = 1080 / min(image_size)
+        new_w = int(image_size[1] * scale)
+        new_h = int(image_size[0] * scale)
+        image_input = cv2.resize(image_input, (new_w, new_h), interpolation=cv2.INTER_AREA)
+        image_size = (image_input.shape[0], image_input.shape[1])
     frames = [image_input] * 2  # hardcode: mimic a video with 2 frames
+    # initialize image_state
     image_state = {
         "user_name": user_name,
         "image_name": "output.png",
         "fps": None
         }
     image_info = "Image Name: N/A,\nFPS: N/A,\nTotal Frames: {},\nImage Size:{}".format(len(frames), image_size)
+    return (
+        image_state,
+        image_info,
+        image_state["origin_images"][0],
+        gr.Slider(visible=True, maximum=10, value=10),
+        gr.Slider(visible=False, maximum=len(frames), value=len(frames)),
+        gr.Image(visible=True),
+        gr.Image(visible=True),
+    )
 # extract frames from upload video
 def get_frames_from_video(video_input, video_state):
         audio_path = video_input.replace(".mp4", "_audio.wav")
         ffmpeg.input(video_path).output(audio_path, format='wav', acodec='pcm_s16le', ac=2, ar='44100').run(overwrite_output=True, quiet=True)
     except Exception as e:
+        print(f"No audio stream found, skipping. ({e})")
         audio_path = ""  # Set to "" if extraction fails
     # extract frames
+    max_frames = int(os.environ.get("MAX_FRAMES", "200"))
+    truncated = False
     try:
         cap = cv2.VideoCapture(video_path)
         fps = cap.get(cv2.CAP_PROP_FPS)
             if ret == True:
                 current_memory_usage = psutil.virtual_memory().percent
                 frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+                if len(frames) >= max_frames:
+                    truncated = True
+                    break
                 if current_memory_usage > 90:
+                    truncated = True
                     break
             else:
                 break
     except (OSError, TypeError, ValueError, KeyError, SyntaxError) as e:
         print("read_frame_source:{} error. {}\n".format(video_path, str(e)))
+    if not frames:
+        raise gr.Error("Failed to read any frames from the video. Please try uploading again.")
+    if truncated:
+        gr.Warning(f"Video truncated to {len(frames)} frames (limit: {max_frames}). Set MAX_FRAMES env var to adjust.")
+    image_size = (frames[0].shape[0],frames[0].shape[1])
     # [remove for local demo] resize if resolution too big
+    if image_size[0]>=1080 and image_size[1]>=1080:
         scale = 1080 / min(image_size)
         new_w = int(image_size[1] * scale)
         new_h = int(image_size[0] * scale)
         "fps": fps,
         "audio": audio_path
         }
+    interactive_state = {
+        "inference_times": 0,
+        "negative_click_times": 0,
+        "positive_click_times": 0,
+        "mask_save": args.mask_save,
+        "multi_mask": {
+            "mask_names": [],
+            "masks": []
+        },
+        "track_end_number": None,
+    }
     video_info = "Video Name: {},\nFPS: {},\nTotal Frames: {},\nImage Size:{}".format(video_state["video_name"], round(video_state["fps"], 0), len(frames), image_size)
+    return (
+        video_state,
+        interactive_state,
+        [[], []],                     # click_state
+        gr.Video(value=None, visible=True),   # foreground_video_output
+        gr.Video(value=None, visible=True),   # alpha_video_output
+        gr.Image(value=video_state["origin_images"][0], visible=True),  # template_frame
+        gr.Slider(visible=True, maximum=len(frames), value=1),
+        gr.Slider(visible=False, maximum=len(frames), value=len(frames)),
+        gr.Radio(visible=True),       # point_prompt
+        gr.Button(visible=True),      # clear_button_click
+        gr.Button(visible=True),      # add_mask_button
+        gr.Button(visible=True),      # matting_button
+        gr.Button(visible=False),     # remove_mask_button
+        gr.Button(visible=False),     # foreground_output_button
+        gr.Button(visible=False),     # alpha_output_button
+        gr.Dropdown(visible=True, choices=[], value=[]),  # mask_dropdown
+        video_info,
+        gr.Markdown(visible=True),    # step2_title
+    )
 # get the select frame from gradio slider
 def select_video_template(image_selection_slider, video_state, interactive_state):
     image_selection_slider -= 1
     video_state["select_frame_number"] = image_selection_slider
     return video_state["painted_images"][image_selection_slider], video_state, interactive_state
 def select_image_template(image_selection_slider, video_state, interactive_state):
     image_selection_slider = 0 # fixed for image
     video_state["select_frame_number"] = image_selection_slider
     return video_state["painted_images"][image_selection_slider], video_state, interactive_state
 # set the tracking end frame
     return video_state["painted_images"][track_pause_number_slider],interactive_state
 # use sam to get the mask
+@spaces.GPU(duration=30)
+def _sam_refine_gpu(frame, click_state, click_index, is_positive):
+    """GPU worker for SAM refinement. Takes only the single frame needed."""
+    coordinate = "[[{},{},{}]]".format(click_index[0], click_index[1], 1 if is_positive else 0)
     # prompt for sam model
     model.samcontroler.sam_controler.reset_image()
+    model.samcontroler.sam_controler.set_image(frame)
     prompt = get_prompt(click_state=click_state, click_input=coordinate)
+    mask, logit, painted_image = model.first_frame_click(
+                                                      image=frame,
                                                       points=np.array(prompt["input_point"]),
                                                       labels=np.array(prompt["input_label"]),
                                                       multimask=prompt["multimask_output"],
                                                       )
+    return mask, logit, painted_image, click_state
+def sam_refine(video_state, point_prompt, click_state, interactive_state, evt:gr.SelectData):
+    """Gradio handler that extracts evt.index, delegates to GPU, then updates state."""
+    is_positive = point_prompt == "Positive"
+    if is_positive:
+        interactive_state["positive_click_times"] += 1
+    else:
+        interactive_state["negative_click_times"] += 1
+    frame = video_state["origin_images"][video_state["select_frame_number"]]
+    mask, logit, painted_image, click_state = _sam_refine_gpu(frame, click_state, evt.index, is_positive)
     video_state["masks"][video_state["select_frame_number"]] = mask
     video_state["logits"][video_state["select_frame_number"]] = logit
     video_state["painted_images"][video_state["select_frame_number"]] = painted_image
+    return painted_image, video_state, interactive_state, click_state
 def add_multi_mask(video_state, interactive_state, mask_dropdown):
     mask = video_state["masks"][video_state["select_frame_number"]]
         return select_frame
 # image matting
+@spaces.GPU(duration=60)
 def image_matting(video_state, interactive_state, mask_dropdown, erode_kernel_size, dilate_kernel_size, refine_iter, model_selection):
     # Load model if not already loaded
     try:
     else:
         template_mask = video_state["masks"][video_state["select_frame_number"]]
+    if len(np.unique(template_mask)) == 1 and template_mask.max() == 0:
+        raise gr.Error("Please set a mask on the template frame before running matting.")
     foreground, alpha = matanyone2(matanyone_processor, following_frames, template_mask*255, r_erode=erode_kernel_size, r_dilate=dilate_kernel_size, n_warmup=refine_iter)
     foreground_output = Image.fromarray(foreground[-1])
     alpha_output = Image.fromarray(alpha[-1][:,:,0])
     return foreground_output, alpha_output
 # video matting
+@spaces.GPU(duration=120)
 def video_matting(video_state, interactive_state, mask_dropdown, erode_kernel_size, dilate_kernel_size, model_selection):
     # Load model if not already loaded
     try:
     audio_path = video_state["audio"]
+    if len(np.unique(template_mask)) == 1 and template_mask.max() == 0:
+        raise gr.Error("Please set a mask on the template frame before running matting.")
     foreground, alpha = matanyone2(matanyone_processor, following_frames, template_mask*255, r_erode=erode_kernel_size, r_dilate=dilate_kernel_size)
+    tmpdir = tempfile.mkdtemp()
+    foreground_output = generate_video_from_frames(foreground, output_path=os.path.join(tmpdir, "fg.mp4"), fps=fps, audio_path=audio_path)
+    alpha_output = generate_video_from_frames(alpha, output_path=os.path.join(tmpdir, "alpha.mp4"), fps=fps, gray2rgb=True, audio_path=audio_path)
     return foreground_output, alpha_output
         return output_path
     return video_temp_path
+def show_load_components():
+    """Show right-side components immediately when Load button is clicked."""
+    return (  # NOTE(review): 9 updates are returned, but the extract_frames_button.click wiring visible in the image tab lists only 7 outputs (no video outputs) — confirm the two extra values are intended
+        gr.Markdown(visible=True),    # step2_title
+        gr.Image(visible=True),       # template_frame
+        gr.Radio(visible=True),       # point_prompt
+        gr.Button(visible=True),      # clear_button_click
+        gr.Button(visible=True),      # add_mask_button
+        gr.Button(visible=True),      # matting_button
+        gr.Dropdown(visible=True),    # mask_dropdown
+        gr.Video(visible=True),       # foreground_video_output
+        gr.Video(visible=True),       # alpha_video_output
+    )
 # reset all states for a new input
 def restart():  # takes no inputs; wired to the clear/change events of the input components
+    return (  # one value per component in the _video_restart_outputs / _image_restart_outputs lists, in the same order
+        {  # fresh video_state / image_state (first output)
             "user_name": "",
             "video_name": "",
             "origin_images": None,
             "logits": None,
             "select_frame_number": 0,
             "fps": 30
+        },
+        {  # fresh interactive_state (second output)
             "inference_times": 0,
+            "negative_click_times": 0,
             "positive_click_times": 0,
             "mask_save": args.mask_save,  # read from the module-level parsed args
             "multi_mask": {
                 "masks": []
             },
             "track_end_number": None,
+        },
+        [[], []],  # fresh click_state (third output)
+        gr.update(value=None, visible=False),               # foreground output
+        gr.update(value=None, visible=False),               # alpha output
+        gr.update(visible=False),                           # template_frame
+        gr.update(visible=False),                           # image_selection_slider
+        gr.update(visible=False),                           # track_pause_number_slider
+        gr.update(visible=False),                           # point_prompt
+        gr.update(visible=False),                           # clear_button_click
+        gr.update(visible=False),                           # add_mask_button
+        gr.update(visible=False),                           # matting_button
+        gr.update(visible=False),                           # remove_mask_button
+        gr.update(visible=False),                           # foreground_output_button
+        gr.update(visible=False),                           # alpha_output_button
+        gr.update(visible=False, choices=[], value=[]),     # mask_dropdown
+        "",                                                  # video_info / image_info
+        gr.update(visible=False),                           # step2_title
+    )
 # args, defined in track_anything.py
 args = parse_augment()
     'vit_l': "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_l_0b3195.pth",
     'vit_b': "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pth"
 }
+checkpoint_folder = str(_HERE.parent / 'pretrained_models')
 sam_checkpoint = load_file_from_url(sam_checkpoint_url_dict[args.sam_model_type], checkpoint_folder)
 # initialize sams
     raise RuntimeError("No models are available! Please ensure at least one model file exists in ../pretrained_models/")
 default_model = "MatAnyone 2" if "MatAnyone 2" in available_models else available_models[0]
+# Eagerly load all available models (required for ZeroGPU)
+for _display_name in available_models:
+    load_model(_display_name)
 # download test samples
+test_sample_path = str(_HERE / "test_sample")
 load_file_from_url('https://github.com/pq-yang/MatAnyone2/releases/download/media/test-sample-0-720p.mp4', test_sample_path)
 load_file_from_url('https://github.com/pq-yang/MatAnyone2/releases/download/media/test-sample-1-720p.mp4', test_sample_path)
 load_file_from_url('https://github.com/pq-yang/MatAnyone2/releases/download/media/test-sample-2-720p.mp4', test_sample_path)
 load_file_from_url('https://github.com/pq-yang/MatAnyone2/releases/download/media/test-sample-3.jpg', test_sample_path)
 # download assets
+assets_path = str(_HERE / "assets")
 load_file_from_url('https://github.com/pq-yang/MatAnyone/releases/download/media/tutorial_single_target.mp4', assets_path)
 load_file_from_url('https://github.com/pq-yang/MatAnyone/releases/download/media/tutorial_multi_targets.mp4', assets_path)
 my_custom_css = """
 .gradio-container {width: 85% !important; margin: 0 auto;}
+.gr-monochrome-group {border-radius: 5px !important; border: revert-layer !important; border-width: 2px !important; color: var(--body-text-color) !important}
 button {border-radius: 8px !important;}
+.new_button {background-color: var(--button-secondary-background-fill) !important; color: var(--button-secondary-text-color) !important; border: 1px solid var(--border-color-primary) !important;}
 .green_button {background-color: #4CAF50 !important; color: #ffffff !important; border: none !important;}
+.new_button:hover {background-color: var(--button-secondary-background-fill-hover) !important;}
 .green_button:hover {background-color: #77bd79 !important;}
 .mask_button_group {gap: 10px !important;}
 .margin_center {width: 50% !important; margin: auto !important;}
 .jc_center {justify-content: center !important;}
 .video-title {
     margin-bottom: 5px !important;
 }
 .custom-bg {
+    background-color: var(--background-fill-secondary);
+    padding: 10px;
+    border-radius: 10px;
+}
 .title-container {
     text-align: center;
     padding: 0;
 small {
     font-size: 60%;
 }
 """
+with gr.Blocks() as demo:
     gr.HTML('''
+        <link href="https://fonts.googleapis.com/css2?family=Sarpanch:wght@400;500;600;700;800;900&display=swap" rel="stylesheet">
         <div class="title-container">
             <h1 class="title is-2 publication-title"
+                style="font-size:50px; font-family: 'Sarpanch', serif;
+                    background: linear-gradient(to right, #000000, #2dc464);
+                    display: inline-block; -webkit-background-clip: text;
                     -webkit-text-fill-color: transparent;">
                 MatAnyone Series
             </h1>
                 with gr.Row():
                     with gr.Column():
                         gr.Markdown("### Case 1: Single Target")
+                        gr.Video(value=str(_HERE / "assets" / "tutorial_single_target.mp4"), elem_classes="video")
                     with gr.Column():
                         gr.Markdown("### Case 2: Multiple Targets")
+                        gr.Video(value=str(_HERE / "assets" / "tutorial_multi_targets.mp4"), elem_classes="video")
     with gr.Tabs():
         with gr.TabItem("Video"):
                         alpha_output_button = gr.Button(value="Alpha Mask Output", visible=False, elem_classes="new_button")
+            # first step: get the video information
             extract_frames_button.click(
                 fn=get_frames_from_video,
+                inputs=[video_input, video_state],
+                outputs=[video_state, interactive_state, click_state,
+                         foreground_video_output, alpha_video_output,
+                         template_frame,
+                         image_selection_slider, track_pause_number_slider,
+                         point_prompt, clear_button_click,
+                         add_mask_button, matting_button,
+                         remove_mask_button, foreground_output_button, alpha_output_button,
+                         mask_dropdown, video_info, step2_title],
+            )
             # second step: select images from slider
             image_selection_slider.release(fn=select_video_template,
             template_frame.select(
                 fn=sam_refine,
                 inputs=[video_state, point_prompt, click_state, interactive_state],
+                outputs=[template_frame, video_state, interactive_state, click_state]
             )
             # add different mask
                 outputs=[template_frame]
             )
+            _video_restart_outputs = [
+                video_state, interactive_state, click_state,
+                foreground_video_output, alpha_video_output,
+                template_frame,
+                image_selection_slider, track_pause_number_slider,
+                point_prompt, clear_button_click,
+                add_mask_button, matting_button,
+                remove_mask_button, foreground_output_button, alpha_output_button,
+                mask_dropdown, video_info, step2_title,
+            ]
             video_input.clear(
                 fn=restart,
                 inputs=[],
+                outputs=_video_restart_outputs,
                 show_progress=False)
             # points clear
                         alpha_image_output = gr.Image(type="pil", label="Alpha Output", visible=False, elem_classes="image")
                         alpha_output_button = gr.Button(value="Alpha Mask Output", visible=False, elem_classes="new_button")
+            # first step: get the image information
             extract_frames_button.click(
+                fn=show_load_components,
+                inputs=[],
+                outputs=[step2_title, template_frame, point_prompt, clear_button_click,
+                         add_mask_button, matting_button, mask_dropdown],
+            ).then(
                 fn=get_frames_from_image,
+                inputs=[image_input, image_state],
                 outputs=[image_state, image_info, template_frame,
+                         image_selection_slider, track_pause_number_slider,
+                         foreground_image_output, alpha_image_output],
+            )
             # second step: select images from slider
             image_selection_slider.release(fn=select_image_template,
             template_frame.select(
                 fn=sam_refine,
                 inputs=[image_state, point_prompt, click_state, interactive_state],
+                outputs=[template_frame, image_state, interactive_state, click_state]
             )
             # add different mask
             )
             # clear input
+            _image_restart_outputs = [
+                image_state, interactive_state, click_state,
+                foreground_image_output, alpha_image_output,
+                template_frame,
+                image_selection_slider, track_pause_number_slider,
+                point_prompt, clear_button_click,
+                add_mask_button, matting_button,
+                remove_mask_button, foreground_output_button, alpha_output_button,
+                mask_dropdown, image_info, step2_title,
+            ]
             image_input.change(
                 fn=restart,
                 inputs=[],
+                outputs=_image_restart_outputs,
                 show_progress=False)
             image_input.clear(
                 fn=restart,
                 inputs=[],
+                outputs=_image_restart_outputs,
                 show_progress=False)
             # points clear
     gr.Markdown(article)
 demo.queue()
+demo.launch(theme=gr.themes.Monochrome(), css=my_custom_css)

hugging_face/tools/base_segmenter.py CHANGED Viewed

@@ -5,7 +5,6 @@ from PIL import Image, ImageDraw, ImageOps
 import numpy as np
 from typing import Union
 from segment_anything import sam_model_registry, SamPredictor, SamAutomaticMaskGenerator
-import matplotlib.pyplot as plt
 import PIL
 from .mask_painter import mask_painter

 import numpy as np
 from typing import Union
 from segment_anything import sam_model_registry, SamPredictor, SamAutomaticMaskGenerator
 import PIL
 from .mask_painter import mask_painter

hugging_face/tools/download_util.py CHANGED Viewed

@@ -1,6 +1,5 @@
 import math
 import os
-import requests
 from torch.hub import download_url_to_file, get_dir
 from tqdm import tqdm
 from urllib.parse import urlparse

 import math
 import os
 from torch.hub import download_url_to_file, get_dir
 from tqdm import tqdm
 from urllib.parse import urlparse

hugging_face/tools/interact_tools.py CHANGED Viewed

@@ -5,14 +5,12 @@ from PIL import Image, ImageDraw, ImageOps
 import numpy as np
 from typing import Union
 from segment_anything import sam_model_registry, SamPredictor, SamAutomaticMaskGenerator
-import matplotlib.pyplot as plt
 import PIL
 from .mask_painter import mask_painter as mask_painter2
 from .base_segmenter import BaseSegmenter
 from .painter import mask_painter, point_painter
 import os
-import requests
-import sys
 mask_color = 3

 import numpy as np
 from typing import Union
 from segment_anything import sam_model_registry, SamPredictor, SamAutomaticMaskGenerator
 import PIL
 from .mask_painter import mask_painter as mask_painter2
 from .base_segmenter import BaseSegmenter
 from .painter import mask_painter, point_painter
 import os
+import sys
 mask_color = 3

requirements.txt CHANGED Viewed

@@ -1,37 +1,312 @@
-progressbar2
-gdown >= 4.7.1
-gitpython >= 3.1
-git+https://github.com/cheind/py-thin-plate-spline
-hickle >= 5.0
-tensorboard >= 2.11
-numpy >= 1.21
-git+https://github.com/facebookresearch/segment-anything.git
-# gradio==4.31.0
-fastapi==0.111.0
-pydantic==2.7.1
-opencv-python >= 4.8
-matplotlib
-pyyaml
-av >= 0.5.2
-openmim
-tqdm >= 4.66.1
-psutil
-ffmpeg-python
-cython
-Pillow >= 9.5
-scipy >= 1.7
-pycocotools >= 2.0.7
-einops >= 0.6
-hydra-core >= 1.3.2
-PySide6 >= 6.2.0
-charset-normalizer >= 3.1.0
-netifaces >= 0.11.0
-cchardet >= 2.1.7
-easydict
-requests
-pyqtdarktheme
-imageio == 2.25.0
-imageio[ffmpeg]
-ffmpeg-python
-safetensors
-huggingface_hub < 1.0

+# This file was autogenerated by uv via the following command:
+#    uv export --no-hashes --no-dev --group hf-spaces --no-emit-package typer-slim -o requirements.txt
+aiofiles==24.1.0
+    # via gradio
+aiohappyeyeballs==2.6.1
+    # via aiohttp
+aiohttp==3.13.3
+    # via fsspec
+aiosignal==1.4.0
+    # via aiohttp
+annotated-doc==0.0.4
+    # via
+    #   fastapi
+    #   typer
+annotated-types==0.7.0
+    # via pydantic
+antlr4-python3-runtime==4.9.3
+    # via
+    #   hydra-core
+    #   omegaconf
+anyio==4.12.1
+    # via
+    #   gradio
+    #   httpx
+    #   starlette
+attrs==25.4.0
+    # via aiohttp
+audioop-lts==0.2.2 ; python_full_version >= '3.13'
+    # via gradio
+brotli==1.2.0
+    # via gradio
+certifi==2026.2.25
+    # via
+    #   httpcore
+    #   httpx
+    #   requests
+charset-normalizer==3.4.5
+    # via requests
+click==8.3.1
+    # via
+    #   typer
+    #   uvicorn
+colorama==0.4.6 ; sys_platform == 'win32'
+    # via
+    #   click
+    #   tqdm
+datasets==4.7.0
+dill==0.4.0
+    # via
+    #   datasets
+    #   multiprocess
+fastapi==0.135.1
+    # via gradio
+ffmpeg-python==0.2.0
+    # via matanyone
+ffmpy==1.0.0
+    # via gradio
+filelock==3.25.2
+    # via
+    #   datasets
+    #   huggingface-hub
+    #   torch
+frozenlist==1.8.0
+    # via
+    #   aiohttp
+    #   aiosignal
+fsspec==2026.2.0
+    # via
+    #   datasets
+    #   gradio-client
+    #   huggingface-hub
+    #   torch
+future==1.0.0
+    # via ffmpeg-python
+gradio==6.9.0
+    # via
+    #   matanyone
+    #   spaces
+gradio-client==2.3.0
+    # via gradio
+groovy==0.1.2
+    # via gradio
+h11==0.16.0
+    # via
+    #   httpcore
+    #   uvicorn
+hf-xet==1.4.2 ; platform_machine == 'AMD64' or platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64'
+    # via huggingface-hub
+httpcore==1.0.9
+    # via httpx
+httpx==0.28.1
+    # via
+    #   datasets
+    #   gradio
+    #   gradio-client
+    #   huggingface-hub
+    #   safehttpx
+    #   spaces
+huggingface-hub==1.7.0
+    # via
+    #   datasets
+    #   gradio
+    #   gradio-client
+hydra-core==1.3.2
+    # via matanyone
+idna==3.11
+    # via
+    #   anyio
+    #   httpx
+    #   requests
+    #   yarl
+imageio==2.37.3
+    # via matanyone
+imageio-ffmpeg==0.6.0
+    # via imageio
+jinja2==3.1.6
+    # via
+    #   gradio
+    #   torch
+markdown-it-py==4.0.0
+    # via rich
+markupsafe==3.0.3
+    # via
+    #   gradio
+    #   jinja2
+mdurl==0.1.2
+    # via markdown-it-py
+mpmath==1.3.0
+    # via sympy
+multidict==6.7.1
+    # via
+    #   aiohttp
+    #   yarl
+multiprocess==0.70.18
+    # via datasets
+networkx==3.6.1
+    # via torch
+numpy==2.4.3
+    # via
+    #   datasets
+    #   gradio
+    #   imageio
+    #   opencv-python
+    #   pandas
+    #   torchvision
+nvidia-cublas-cu12==12.8.4.1 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+    # via
+    #   nvidia-cudnn-cu12
+    #   nvidia-cusolver-cu12
+    #   torch
+nvidia-cuda-cupti-cu12==12.8.90 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+    # via torch
+nvidia-cuda-nvrtc-cu12==12.8.93 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+    # via torch
+nvidia-cuda-runtime-cu12==12.8.90 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+    # via torch
+nvidia-cudnn-cu12==9.10.2.21 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+    # via torch
+nvidia-cufft-cu12==11.3.3.83 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+    # via torch
+nvidia-cufile-cu12==1.13.1.3 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+    # via torch
+nvidia-curand-cu12==10.3.9.90 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+    # via torch
+nvidia-cusolver-cu12==11.7.3.90 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+    # via torch
+nvidia-cusparse-cu12==12.5.8.93 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+    # via
+    #   nvidia-cusolver-cu12
+    #   torch
+nvidia-cusparselt-cu12==0.7.1 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+    # via torch
+nvidia-nccl-cu12==2.27.5 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+    # via torch
+nvidia-nvjitlink-cu12==12.8.93 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+    # via
+    #   nvidia-cufft-cu12
+    #   nvidia-cusolver-cu12
+    #   nvidia-cusparse-cu12
+    #   torch
+nvidia-nvshmem-cu12==3.3.20 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+    # via torch
+nvidia-nvtx-cu12==12.8.90 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+    # via torch
+omegaconf==2.3.0
+    # via hydra-core
+opencv-python==4.13.0.92
+    # via matanyone
+orjson==3.11.7
+    # via gradio
+packaging==26.0
+    # via
+    #   datasets
+    #   gradio
+    #   gradio-client
+    #   huggingface-hub
+    #   hydra-core
+    #   spaces
+pandas==3.0.1
+    # via
+    #   datasets
+    #   gradio
+pillow==12.1.1
+    # via
+    #   gradio
+    #   imageio
+    #   torchvision
+propcache==0.4.1
+    # via
+    #   aiohttp
+    #   yarl
+psutil==5.9.8
+    # via
+    #   imageio
+    #   spaces
+pyarrow==23.0.1
+    # via datasets
+pydantic==2.12.5
+    # via
+    #   fastapi
+    #   gradio
+    #   spaces
+pydantic-core==2.41.5
+    # via pydantic
+pydub==0.25.1
+    # via gradio
+pygments==2.19.2
+    # via rich
+python-dateutil==2.9.0.post0
+    # via pandas
+python-multipart==0.0.22
+    # via gradio
+pytz==2026.1.post1
+    # via gradio
+pyyaml==6.0.3
+    # via
+    #   datasets
+    #   gradio
+    #   huggingface-hub
+    #   omegaconf
+requests==2.32.5
+    # via
+    #   datasets
+    #   spaces
+rich==14.3.3
+    # via typer
+safehttpx==0.1.7
+    # via gradio
+segment-anything @ git+https://github.com/facebookresearch/segment-anything.git@dca509fe793f601edb92606367a655c15ac00fdf
+    # via matanyone
+semantic-version==2.10.0
+    # via gradio
+setuptools==82.0.1
+    # via torch
+shellingham==1.5.4
+    # via typer
+six==1.17.0
+    # via python-dateutil
+spaces==0.47.0
+    # via matanyone
+starlette==0.52.1
+    # via
+    #   fastapi
+    #   gradio
+sympy==1.14.0
+    # via torch
+tomlkit==0.13.3
+    # via gradio
+torch==2.9.1
+    # via
+    #   matanyone
+    #   torchvision
+torchvision==0.24.1
+    # via matanyone
+tqdm==4.67.3
+    # via
+    #   datasets
+    #   huggingface-hub
+triton==3.5.1 ; platform_machine == 'x86_64' and sys_platform == 'linux'
+    # via torch
+typer==0.24.1
+    # via
+    #   gradio
+    #   huggingface-hub
+typing-extensions==4.15.0
+    # via
+    #   aiosignal
+    #   anyio
+    #   fastapi
+    #   gradio
+    #   gradio-client
+    #   huggingface-hub
+    #   pydantic
+    #   pydantic-core
+    #   spaces
+    #   starlette
+    #   torch
+    #   typing-inspection
+typing-inspection==0.4.2
+    # via
+    #   fastapi
+    #   pydantic
+tzdata==2025.3 ; sys_platform == 'emscripten' or sys_platform == 'win32'
+    # via pandas
+urllib3==2.6.3
+    # via requests
+uvicorn==0.41.0
+    # via gradio
+xxhash==3.6.0
+    # via datasets
+yarl==1.23.0
+    # via aiohttp