PeiqingYang hysts HF Staff committed on
Commit
ea74e2d
·
1 Parent(s): 40efe7a

Migrate to ZeroGPU + Gradio 6 with bug fixes (#6)

Browse files

- Migrate to ZeroGPU + Gradio 6 with bug fixes (a7e19d44de87a1baae3c06e890d6ac6255b8ebbd)


Co-authored-by: hysts <hysts@users.noreply.huggingface.co>

README.md CHANGED
@@ -4,8 +4,8 @@ emoji: 🤡
4
  colorFrom: red
5
  colorTo: green
6
  sdk: gradio
7
- sdk_version: 4.31.0
8
- python_version: 3.10.13
9
  app_file: hugging_face/app.py
10
  pinned: false
11
  license: other
 
4
  colorFrom: red
5
  colorTo: green
6
  sdk: gradio
7
+ sdk_version: 6.9.0
8
+ python_version: 3.12.12
9
  app_file: hugging_face/app.py
10
  pinned: false
11
  license: other
hugging_face/app.py CHANGED
@@ -1,10 +1,13 @@
1
  import sys
2
- sys.path.append("../")
3
- sys.path.append("../../")
 
 
4
 
5
  import os
6
  import json
7
  import time
 
8
  import psutil
9
  import ffmpeg
10
  import imageio
@@ -12,9 +15,9 @@ import argparse
12
  from PIL import Image
13
 
14
  import cv2
15
- import torch
16
  import numpy as np
17
  import gradio as gr
 
18
 
19
  from tools.painter import mask_painter
20
  from tools.interact_tools import SamControler
@@ -75,14 +78,23 @@ def get_frames_from_image(image_input, image_state):
75
  Args:
76
  video_path:str
77
  timestamp:float64
78
- Return
79
  [[0:nearest_frame], [nearest_frame:], nearest_frame]
80
  """
81
 
82
  user_name = time.time()
 
 
 
 
 
 
 
 
 
 
83
  frames = [image_input] * 2 # hardcode: mimic a video with 2 frames
84
- image_size = (frames[0].shape[0],frames[0].shape[1])
85
- # initialize video_state
86
  image_state = {
87
  "user_name": user_name,
88
  "image_name": "output.png",
@@ -94,16 +106,15 @@ def get_frames_from_image(image_input, image_state):
94
  "fps": None
95
  }
96
  image_info = "Image Name: N/A,\nFPS: N/A,\nTotal Frames: {},\nImage Size:{}".format(len(frames), image_size)
97
- model.samcontroler.sam_controler.reset_image()
98
- model.samcontroler.sam_controler.set_image(image_state["origin_images"][0])
99
- return image_state, image_info, image_state["origin_images"][0], \
100
- gr.update(visible=True, maximum=10, value=10), gr.update(visible=False, maximum=len(frames), value=len(frames)), \
101
- gr.update(visible=True), gr.update(visible=True), \
102
- gr.update(visible=True), gr.update(visible=True),\
103
- gr.update(visible=True), gr.update(visible=True), \
104
- gr.update(visible=True), gr.update(visible=False), \
105
- gr.update(visible=False), gr.update(visible=True), \
106
- gr.update(visible=True)
107
 
108
  # extract frames from upload video
109
  def get_frames_from_video(video_input, video_state):
@@ -123,10 +134,12 @@ def get_frames_from_video(video_input, video_state):
123
  audio_path = video_input.replace(".mp4", "_audio.wav")
124
  ffmpeg.input(video_path).output(audio_path, format='wav', acodec='pcm_s16le', ac=2, ar='44100').run(overwrite_output=True, quiet=True)
125
  except Exception as e:
126
- print(f"Audio extraction error: {str(e)}")
127
  audio_path = "" # Set to "" if extraction fails
128
 
129
  # extract frames
 
 
130
  try:
131
  cap = cv2.VideoCapture(video_path)
132
  fps = cap.get(cv2.CAP_PROP_FPS)
@@ -135,16 +148,24 @@ def get_frames_from_video(video_input, video_state):
135
  if ret == True:
136
  current_memory_usage = psutil.virtual_memory().percent
137
  frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
 
 
 
138
  if current_memory_usage > 90:
 
139
  break
140
  else:
141
  break
142
  except (OSError, TypeError, ValueError, KeyError, SyntaxError) as e:
143
  print("read_frame_source:{} error. {}\n".format(video_path, str(e)))
144
- image_size = (frames[0].shape[0],frames[0].shape[1])
 
 
 
 
145
 
146
  # [remove for local demo] resize if resolution too big
147
- if image_size[0]>=1080 and image_size[0]>=1080:
148
  scale = 1080 / min(image_size)
149
  new_w = int(image_size[1] * scale)
150
  new_h = int(image_size[0] * scale)
@@ -165,16 +186,38 @@ def get_frames_from_video(video_input, video_state):
165
  "fps": fps,
166
  "audio": audio_path
167
  }
 
 
 
 
 
 
 
 
 
 
 
168
  video_info = "Video Name: {},\nFPS: {},\nTotal Frames: {},\nImage Size:{}".format(video_state["video_name"], round(video_state["fps"], 0), len(frames), image_size)
169
- model.samcontroler.sam_controler.reset_image()
170
- model.samcontroler.sam_controler.set_image(video_state["origin_images"][0])
171
- return video_state, video_info, video_state["origin_images"][0], gr.update(visible=True, maximum=len(frames), value=1), gr.update(visible=False, maximum=len(frames), value=len(frames)), \
172
- gr.update(visible=True), gr.update(visible=True), \
173
- gr.update(visible=True), gr.update(visible=True),\
174
- gr.update(visible=True), gr.update(visible=True), \
175
- gr.update(visible=True), gr.update(visible=False), \
176
- gr.update(visible=False), gr.update(visible=True), \
177
- gr.update(visible=True)
 
 
 
 
 
 
 
 
 
 
 
178
 
179
  # get the select frame from gradio slider
180
  def select_video_template(image_selection_slider, video_state, interactive_state):
@@ -182,10 +225,6 @@ def select_video_template(image_selection_slider, video_state, interactive_state
182
  image_selection_slider -= 1
183
  video_state["select_frame_number"] = image_selection_slider
184
 
185
- # once select a new template frame, set the image in sam
186
- model.samcontroler.sam_controler.reset_image()
187
- model.samcontroler.sam_controler.set_image(video_state["origin_images"][image_selection_slider])
188
-
189
  return video_state["painted_images"][image_selection_slider], video_state, interactive_state
190
 
191
  def select_image_template(image_selection_slider, video_state, interactive_state):
@@ -193,10 +232,6 @@ def select_image_template(image_selection_slider, video_state, interactive_state
193
  image_selection_slider = 0 # fixed for image
194
  video_state["select_frame_number"] = image_selection_slider
195
 
196
- # once select a new template frame, set the image in sam
197
- model.samcontroler.sam_controler.reset_image()
198
- model.samcontroler.sam_controler.set_image(video_state["origin_images"][image_selection_slider])
199
-
200
  return video_state["painted_images"][image_selection_slider], video_state, interactive_state
201
 
202
  # set the tracking end frame
@@ -206,36 +241,40 @@ def get_end_number(track_pause_number_slider, video_state, interactive_state):
206
  return video_state["painted_images"][track_pause_number_slider],interactive_state
207
 
208
  # use sam to get the mask
209
- def sam_refine(video_state, point_prompt, click_state, interactive_state, evt:gr.SelectData):
210
- """
211
- Args:
212
- template_frame: PIL.Image
213
- point_prompt: flag for positive or negative button click
214
- click_state: [[points], [labels]]
215
- """
216
- if point_prompt == "Positive":
217
- coordinate = "[[{},{},1]]".format(evt.index[0], evt.index[1])
218
- interactive_state["positive_click_times"] += 1
219
- else:
220
- coordinate = "[[{},{},0]]".format(evt.index[0], evt.index[1])
221
- interactive_state["negative_click_times"] += 1
222
-
223
  # prompt for sam model
224
  model.samcontroler.sam_controler.reset_image()
225
- model.samcontroler.sam_controler.set_image(video_state["origin_images"][video_state["select_frame_number"]])
226
  prompt = get_prompt(click_state=click_state, click_input=coordinate)
227
 
228
- mask, logit, painted_image = model.first_frame_click(
229
- image=video_state["origin_images"][video_state["select_frame_number"]],
230
  points=np.array(prompt["input_point"]),
231
  labels=np.array(prompt["input_label"]),
232
  multimask=prompt["multimask_output"],
233
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
234
  video_state["masks"][video_state["select_frame_number"]] = mask
235
  video_state["logits"][video_state["select_frame_number"]] = logit
236
  video_state["painted_images"][video_state["select_frame_number"]] = painted_image
237
 
238
- return painted_image, video_state, interactive_state
239
 
240
  def add_multi_mask(video_state, interactive_state, mask_dropdown):
241
  mask = video_state["masks"][video_state["select_frame_number"]]
@@ -269,6 +308,7 @@ def show_mask(video_state, interactive_state, mask_dropdown):
269
  return select_frame
270
 
271
  # image matting
 
272
  def image_matting(video_state, interactive_state, mask_dropdown, erode_kernel_size, dilate_kernel_size, refine_iter, model_selection):
273
  # Load model if not already loaded
274
  try:
@@ -298,9 +338,8 @@ def image_matting(video_state, interactive_state, mask_dropdown, erode_kernel_si
298
  else:
299
  template_mask = video_state["masks"][video_state["select_frame_number"]]
300
 
301
- # operation error
302
- if len(np.unique(template_mask))==1:
303
- template_mask[0][0]=1
304
  foreground, alpha = matanyone2(matanyone_processor, following_frames, template_mask*255, r_erode=erode_kernel_size, r_dilate=dilate_kernel_size, n_warmup=refine_iter)
305
  foreground_output = Image.fromarray(foreground[-1])
306
  alpha_output = Image.fromarray(alpha[-1][:,:,0])
@@ -308,6 +347,7 @@ def image_matting(video_state, interactive_state, mask_dropdown, erode_kernel_si
308
  return foreground_output, alpha_output
309
 
310
  # video matting
 
311
  def video_matting(video_state, interactive_state, mask_dropdown, erode_kernel_size, dilate_kernel_size, model_selection):
312
  # Load model if not already loaded
313
  try:
@@ -340,13 +380,13 @@ def video_matting(video_state, interactive_state, mask_dropdown, erode_kernel_si
340
 
341
  audio_path = video_state["audio"]
342
 
343
- # operation error
344
- if len(np.unique(template_mask))==1:
345
- template_mask[0][0]=1
346
  foreground, alpha = matanyone2(matanyone_processor, following_frames, template_mask*255, r_erode=erode_kernel_size, r_dilate=dilate_kernel_size)
347
 
348
- foreground_output = generate_video_from_frames(foreground, output_path="./results/{}_fg.mp4".format(video_state["video_name"]), fps=fps, audio_path=audio_path) # import video_input to name the output video
349
- alpha_output = generate_video_from_frames(alpha, output_path="./results/{}_alpha.mp4".format(video_state["video_name"]), fps=fps, gray2rgb=True, audio_path=audio_path) # import video_input to name the output video
 
350
 
351
  return foreground_output, alpha_output
352
 
@@ -403,9 +443,24 @@ def generate_video_from_frames(frames, output_path, fps=30, gray2rgb=False, audi
403
  return output_path
404
  return video_temp_path
405
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
406
  # reset all states for a new input
407
  def restart():
408
- return {
 
409
  "user_name": "",
410
  "video_name": "",
411
  "origin_images": None,
@@ -415,9 +470,10 @@ def restart():
415
  "logits": None,
416
  "select_frame_number": 0,
417
  "fps": 30
418
- }, {
 
419
  "inference_times": 0,
420
- "negative_click_times" : 0,
421
  "positive_click_times": 0,
422
  "mask_save": args.mask_save,
423
  "multi_mask": {
@@ -425,11 +481,24 @@ def restart():
425
  "masks": []
426
  },
427
  "track_end_number": None,
428
- }, [[],[]], None, None, \
429
- gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False),\
430
- gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), \
431
- gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), \
432
- gr.update(visible=False), gr.update(visible=False, choices=[], value=[]), "", gr.update(visible=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
433
 
434
  # args, defined in track_anything.py
435
  args = parse_augment()
@@ -438,7 +507,7 @@ sam_checkpoint_url_dict = {
438
  'vit_l': "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_l_0b3195.pth",
439
  'vit_b': "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pth"
440
  }
441
- checkpoint_folder = os.path.join('/home/user/app/', 'pretrained_models')
442
 
443
  sam_checkpoint = load_file_from_url(sam_checkpoint_url_dict[args.sam_model_type], checkpoint_folder)
444
  # initialize sams
@@ -518,8 +587,12 @@ if not available_models:
518
  raise RuntimeError("No models are available! Please ensure at least one model file exists in ../pretrained_models/")
519
  default_model = "MatAnyone 2" if "MatAnyone 2" in available_models else available_models[0]
520
 
 
 
 
 
521
  # download test samples
522
- test_sample_path = os.path.join('/home/user/app/hugging_face/', "test_sample/")
523
  load_file_from_url('https://github.com/pq-yang/MatAnyone2/releases/download/media/test-sample-0-720p.mp4', test_sample_path)
524
  load_file_from_url('https://github.com/pq-yang/MatAnyone2/releases/download/media/test-sample-1-720p.mp4', test_sample_path)
525
  load_file_from_url('https://github.com/pq-yang/MatAnyone2/releases/download/media/test-sample-2-720p.mp4', test_sample_path)
@@ -532,7 +605,7 @@ load_file_from_url('https://github.com/pq-yang/MatAnyone2/releases/download/medi
532
  load_file_from_url('https://github.com/pq-yang/MatAnyone2/releases/download/media/test-sample-3.jpg', test_sample_path)
533
 
534
  # download assets
535
- assets_path = os.path.join('/home/user/app/hugging_face/', "assets/")
536
  load_file_from_url('https://github.com/pq-yang/MatAnyone/releases/download/media/tutorial_single_target.mp4', assets_path)
537
  load_file_from_url('https://github.com/pq-yang/MatAnyone/releases/download/media/tutorial_multi_targets.mp4', assets_path)
538
 
@@ -588,52 +661,24 @@ This project is built upon [Cutie](https://github.com/hkchengrex/Cutie), with th
588
 
589
  my_custom_css = """
590
  .gradio-container {width: 85% !important; margin: 0 auto;}
591
- .gr-monochrome-group {border-radius: 5px !important; border: revert-layer !important; border-width: 2px !important; color: black !important}
592
  button {border-radius: 8px !important;}
593
- .new_button {background-color: #171717 !important; color: #ffffff !important; border: none !important;}
594
  .green_button {background-color: #4CAF50 !important; color: #ffffff !important; border: none !important;}
595
- .new_button:hover {background-color: #4b4b4b !important;}
596
  .green_button:hover {background-color: #77bd79 !important;}
597
 
598
  .mask_button_group {gap: 10px !important;}
599
- .video .wrap.svelte-lcpz3o {
600
- display: flex !important;
601
- align-items: center !important;
602
- justify-content: center !important;
603
- height: auto !important;
604
- max-height: 300px !important;
605
- }
606
- .video .wrap.svelte-lcpz3o > :first-child {
607
- height: auto !important;
608
- width: 100% !important;
609
- object-fit: contain !important;
610
- }
611
- .video .container.svelte-sxyn79 {
612
- display: none !important;
613
- }
614
  .margin_center {width: 50% !important; margin: auto !important;}
615
  .jc_center {justify-content: center !important;}
616
  .video-title {
617
  margin-bottom: 5px !important;
618
  }
619
  .custom-bg {
620
- background-color: #f0f0f0;
621
- padding: 10px;
622
- border-radius: 10px;
623
- }
624
-
625
- <style>
626
- @import url('https://fonts.googleapis.com/css2?family=Sarpanch:wght@400;500;600;700;800;900&family=Sen:wght@400..800&family=Sixtyfour+Convergence&family=Stardos+Stencil:wght@400;700&display=swap');
627
- body {
628
- display: flex;
629
- justify-content: center;
630
- align-items: center;
631
- height: 100vh;
632
- margin: 0;
633
- background-color: #0d1117;
634
- font-family: Arial, sans-serif;
635
- font-size: 18px;
636
- }
637
  .title-container {
638
  text-align: center;
639
  padding: 0;
@@ -652,16 +697,16 @@ body {
652
  small {
653
  font-size: 60%;
654
  }
655
- </style>
656
  """
657
 
658
- with gr.Blocks(theme=gr.themes.Monochrome(), css=my_custom_css) as demo:
659
  gr.HTML('''
 
660
  <div class="title-container">
661
  <h1 class="title is-2 publication-title"
662
- style="font-size:50px; font-family: 'Sarpanch', serif;
663
- background: linear-gradient(to right, #000000, #2dc464);
664
- display: inline-block; -webkit-background-clip: text;
665
  -webkit-text-fill-color: transparent;">
666
  MatAnyone Series
667
  </h1>
@@ -676,11 +721,11 @@ with gr.Blocks(theme=gr.themes.Monochrome(), css=my_custom_css) as demo:
676
  with gr.Row():
677
  with gr.Column():
678
  gr.Markdown("### Case 1: Single Target")
679
- gr.Video(value="/home/user/app/hugging_face/assets/tutorial_single_target.mp4", elem_classes="video")
680
 
681
  with gr.Column():
682
  gr.Markdown("### Case 2: Multiple Targets")
683
- gr.Video(value="/home/user/app/hugging_face/assets/tutorial_multi_targets.mp4", elem_classes="video")
684
 
685
  with gr.Tabs():
686
  with gr.TabItem("Video"):
@@ -789,16 +834,19 @@ with gr.Blocks(theme=gr.themes.Monochrome(), css=my_custom_css) as demo:
789
  alpha_output_button = gr.Button(value="Alpha Mask Output", visible=False, elem_classes="new_button")
790
 
791
 
792
- # first step: get the video information
793
  extract_frames_button.click(
794
  fn=get_frames_from_video,
795
- inputs=[
796
- video_input, video_state
797
- ],
798
- outputs=[video_state, video_info, template_frame,
799
- image_selection_slider, track_pause_number_slider, point_prompt, clear_button_click, add_mask_button, matting_button, template_frame,
800
- foreground_video_output, alpha_video_output, foreground_output_button, alpha_output_button, mask_dropdown, step2_title]
801
- )
 
 
 
802
 
803
  # second step: select images from slider
804
  image_selection_slider.release(fn=select_video_template,
@@ -812,7 +860,7 @@ with gr.Blocks(theme=gr.themes.Monochrome(), css=my_custom_css) as demo:
812
  template_frame.select(
813
  fn=sam_refine,
814
  inputs=[video_state, point_prompt, click_state, interactive_state],
815
- outputs=[template_frame, video_state, interactive_state]
816
  )
817
 
818
  # add different mask
@@ -842,35 +890,20 @@ with gr.Blocks(theme=gr.themes.Monochrome(), css=my_custom_css) as demo:
842
  outputs=[template_frame]
843
  )
844
 
845
- # clear input
846
- video_input.change(
847
- fn=restart,
848
- inputs=[],
849
- outputs=[
850
- video_state,
851
- interactive_state,
852
- click_state,
853
- foreground_video_output, alpha_video_output,
854
- template_frame,
855
- image_selection_slider , track_pause_number_slider,point_prompt, clear_button_click,
856
- add_mask_button, matting_button, template_frame, foreground_video_output, alpha_video_output, remove_mask_button, foreground_output_button, alpha_output_button, mask_dropdown, video_info, step2_title
857
- ],
858
- queue=False,
859
- show_progress=False)
860
-
861
  video_input.clear(
862
  fn=restart,
863
  inputs=[],
864
- outputs=[
865
- video_state,
866
- interactive_state,
867
- click_state,
868
- foreground_video_output, alpha_video_output,
869
- template_frame,
870
- image_selection_slider , track_pause_number_slider,point_prompt, clear_button_click,
871
- add_mask_button, matting_button, template_frame, foreground_video_output, alpha_video_output, remove_mask_button, foreground_output_button, alpha_output_button, mask_dropdown, video_info, step2_title
872
- ],
873
- queue=False,
874
  show_progress=False)
875
 
876
  # points clear
@@ -992,16 +1025,19 @@ with gr.Blocks(theme=gr.themes.Monochrome(), css=my_custom_css) as demo:
992
  alpha_image_output = gr.Image(type="pil", label="Alpha Output", visible=False, elem_classes="image")
993
  alpha_output_button = gr.Button(value="Alpha Mask Output", visible=False, elem_classes="new_button")
994
 
995
- # first step: get the image information
996
  extract_frames_button.click(
 
 
 
 
 
997
  fn=get_frames_from_image,
998
- inputs=[
999
- image_input, image_state
1000
- ],
1001
  outputs=[image_state, image_info, template_frame,
1002
- image_selection_slider, track_pause_number_slider,point_prompt, clear_button_click, add_mask_button, matting_button, template_frame,
1003
- foreground_image_output, alpha_image_output, foreground_output_button, alpha_output_button, mask_dropdown, step2_title]
1004
- )
1005
 
1006
  # second step: select images from slider
1007
  image_selection_slider.release(fn=select_image_template,
@@ -1015,7 +1051,7 @@ with gr.Blocks(theme=gr.themes.Monochrome(), css=my_custom_css) as demo:
1015
  template_frame.select(
1016
  fn=sam_refine,
1017
  inputs=[image_state, point_prompt, click_state, interactive_state],
1018
- outputs=[template_frame, image_state, interactive_state]
1019
  )
1020
 
1021
  # add different mask
@@ -1046,34 +1082,26 @@ with gr.Blocks(theme=gr.themes.Monochrome(), css=my_custom_css) as demo:
1046
  )
1047
 
1048
  # clear input
 
 
 
 
 
 
 
 
 
 
1049
  image_input.change(
1050
  fn=restart,
1051
  inputs=[],
1052
- outputs=[
1053
- image_state,
1054
- interactive_state,
1055
- click_state,
1056
- foreground_image_output, alpha_image_output,
1057
- template_frame,
1058
- image_selection_slider , track_pause_number_slider,point_prompt, clear_button_click,
1059
- add_mask_button, matting_button, template_frame, foreground_image_output, alpha_image_output, remove_mask_button, foreground_output_button, alpha_output_button, mask_dropdown, image_info, step2_title
1060
- ],
1061
- queue=False,
1062
  show_progress=False)
1063
-
1064
  image_input.clear(
1065
  fn=restart,
1066
  inputs=[],
1067
- outputs=[
1068
- image_state,
1069
- interactive_state,
1070
- click_state,
1071
- foreground_image_output, alpha_image_output,
1072
- template_frame,
1073
- image_selection_slider , track_pause_number_slider,point_prompt, clear_button_click,
1074
- add_mask_button, matting_button, template_frame, foreground_image_output, alpha_image_output, remove_mask_button, foreground_output_button, alpha_output_button, mask_dropdown, image_info, step2_title
1075
- ],
1076
- queue=False,
1077
  show_progress=False)
1078
 
1079
  # points clear
@@ -1094,4 +1122,4 @@ with gr.Blocks(theme=gr.themes.Monochrome(), css=my_custom_css) as demo:
1094
  gr.Markdown(article)
1095
 
1096
  demo.queue()
1097
- demo.launch(debug=True)
 
1
  import sys
2
+ from pathlib import Path
3
+ _HERE = Path(__file__).resolve().parent
4
+ sys.path.insert(0, str(_HERE)) # hugging_face/ (for tools, matanyone2_wrapper)
5
+ sys.path.insert(0, str(_HERE.parent)) # repo root (for matanyone2)
6
 
7
  import os
8
  import json
9
  import time
10
+ import tempfile
11
  import psutil
12
  import ffmpeg
13
  import imageio
 
15
  from PIL import Image
16
 
17
  import cv2
 
18
  import numpy as np
19
  import gradio as gr
20
+ import spaces
21
 
22
  from tools.painter import mask_painter
23
  from tools.interact_tools import SamControler
 
78
  Args:
79
  video_path:str
80
  timestamp:float64
81
+ Return
82
  [[0:nearest_frame], [nearest_frame:], nearest_frame]
83
  """
84
 
85
  user_name = time.time()
86
+ image_size = (image_input.shape[0], image_input.shape[1])
87
+
88
+ # resize if resolution too big
89
+ if image_size[0] >= 1080 and image_size[1] >= 1080:
90
+ scale = 1080 / min(image_size)
91
+ new_w = int(image_size[1] * scale)
92
+ new_h = int(image_size[0] * scale)
93
+ image_input = cv2.resize(image_input, (new_w, new_h), interpolation=cv2.INTER_AREA)
94
+ image_size = (image_input.shape[0], image_input.shape[1])
95
+
96
  frames = [image_input] * 2 # hardcode: mimic a video with 2 frames
97
+ # initialize image_state
 
98
  image_state = {
99
  "user_name": user_name,
100
  "image_name": "output.png",
 
106
  "fps": None
107
  }
108
  image_info = "Image Name: N/A,\nFPS: N/A,\nTotal Frames: {},\nImage Size:{}".format(len(frames), image_size)
109
+ return (
110
+ image_state,
111
+ image_info,
112
+ image_state["origin_images"][0],
113
+ gr.Slider(visible=True, maximum=10, value=10),
114
+ gr.Slider(visible=False, maximum=len(frames), value=len(frames)),
115
+ gr.Image(visible=True),
116
+ gr.Image(visible=True),
117
+ )
 
118
 
119
  # extract frames from upload video
120
  def get_frames_from_video(video_input, video_state):
 
134
  audio_path = video_input.replace(".mp4", "_audio.wav")
135
  ffmpeg.input(video_path).output(audio_path, format='wav', acodec='pcm_s16le', ac=2, ar='44100').run(overwrite_output=True, quiet=True)
136
  except Exception as e:
137
+ print(f"No audio stream found, skipping. ({e})")
138
  audio_path = "" # Set to "" if extraction fails
139
 
140
  # extract frames
141
+ max_frames = int(os.environ.get("MAX_FRAMES", "200"))
142
+ truncated = False
143
  try:
144
  cap = cv2.VideoCapture(video_path)
145
  fps = cap.get(cv2.CAP_PROP_FPS)
 
148
  if ret == True:
149
  current_memory_usage = psutil.virtual_memory().percent
150
  frames.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
151
+ if len(frames) >= max_frames:
152
+ truncated = True
153
+ break
154
  if current_memory_usage > 90:
155
+ truncated = True
156
  break
157
  else:
158
  break
159
  except (OSError, TypeError, ValueError, KeyError, SyntaxError) as e:
160
  print("read_frame_source:{} error. {}\n".format(video_path, str(e)))
161
+ if not frames:
162
+ raise gr.Error("Failed to read any frames from the video. Please try uploading again.")
163
+ if truncated:
164
+ gr.Warning(f"Video truncated to {len(frames)} frames (limit: {max_frames}). Set MAX_FRAMES env var to adjust.")
165
+ image_size = (frames[0].shape[0],frames[0].shape[1])
166
 
167
  # [remove for local demo] resize if resolution too big
168
+ if image_size[0]>=1080 and image_size[1]>=1080:
169
  scale = 1080 / min(image_size)
170
  new_w = int(image_size[1] * scale)
171
  new_h = int(image_size[0] * scale)
 
186
  "fps": fps,
187
  "audio": audio_path
188
  }
189
+ interactive_state = {
190
+ "inference_times": 0,
191
+ "negative_click_times": 0,
192
+ "positive_click_times": 0,
193
+ "mask_save": args.mask_save,
194
+ "multi_mask": {
195
+ "mask_names": [],
196
+ "masks": []
197
+ },
198
+ "track_end_number": None,
199
+ }
200
  video_info = "Video Name: {},\nFPS: {},\nTotal Frames: {},\nImage Size:{}".format(video_state["video_name"], round(video_state["fps"], 0), len(frames), image_size)
201
+ return (
202
+ video_state,
203
+ interactive_state,
204
+ [[], []], # click_state
205
+ gr.Video(value=None, visible=True), # foreground_video_output
206
+ gr.Video(value=None, visible=True), # alpha_video_output
207
+ gr.Image(value=video_state["origin_images"][0], visible=True), # template_frame
208
+ gr.Slider(visible=True, maximum=len(frames), value=1),
209
+ gr.Slider(visible=False, maximum=len(frames), value=len(frames)),
210
+ gr.Radio(visible=True), # point_prompt
211
+ gr.Button(visible=True), # clear_button_click
212
+ gr.Button(visible=True), # add_mask_button
213
+ gr.Button(visible=True), # matting_button
214
+ gr.Button(visible=False), # remove_mask_button
215
+ gr.Button(visible=False), # foreground_output_button
216
+ gr.Button(visible=False), # alpha_output_button
217
+ gr.Dropdown(visible=True, choices=[], value=[]), # mask_dropdown
218
+ video_info,
219
+ gr.Markdown(visible=True), # step2_title
220
+ )
221
 
222
  # get the select frame from gradio slider
223
  def select_video_template(image_selection_slider, video_state, interactive_state):
 
225
  image_selection_slider -= 1
226
  video_state["select_frame_number"] = image_selection_slider
227
 
 
 
 
 
228
  return video_state["painted_images"][image_selection_slider], video_state, interactive_state
229
 
230
  def select_image_template(image_selection_slider, video_state, interactive_state):
 
232
  image_selection_slider = 0 # fixed for image
233
  video_state["select_frame_number"] = image_selection_slider
234
 
 
 
 
 
235
  return video_state["painted_images"][image_selection_slider], video_state, interactive_state
236
 
237
  # set the tracking end frame
 
241
  return video_state["painted_images"][track_pause_number_slider],interactive_state
242
 
243
  # use sam to get the mask
244
@spaces.GPU(duration=30)
def _sam_refine_gpu(frame, click_state, click_index, is_positive):
    """Run one SAM click refinement on a single frame.

    Wrapped with ``spaces.GPU(duration=30)``. Only the frame being edited is
    passed in, rather than the whole video state.

    Args:
        frame: image handed to the SAM controller and to
            ``model.first_frame_click``.
        click_state: accumulated click history, forwarded to ``get_prompt``.
        click_index: indexable pair; elements 0 and 1 are used as the click
            coordinates.
        is_positive: truthy for a positive click (label 1), falsy for a
            negative one (label 0).

    Returns:
        ``(mask, logit, painted_image, click_state)``.
    """
    click_x, click_y = click_index[0], click_index[1]
    label = 1 if is_positive else 0
    coordinate = "[[{},{},{}]]".format(click_x, click_y, label)

    # Re-prime SAM with this frame before building the prompt.
    sam = model.samcontroler.sam_controler
    sam.reset_image()
    sam.set_image(frame)

    prompt = get_prompt(click_state=click_state, click_input=coordinate)
    points = np.array(prompt["input_point"])
    labels = np.array(prompt["input_label"])

    mask, logit, painted_image = model.first_frame_click(
        image=frame,
        points=points,
        labels=labels,
        multimask=prompt["multimask_output"],
    )
    # NOTE(review): click_state is returned as-is from this scope; whether
    # get_prompt updates it in place is not visible here - confirm.
    return mask, logit, painted_image, click_state
261
+
262
def sam_refine(video_state, point_prompt, click_state, interactive_state, evt: gr.SelectData):
    """Handle a click on the template frame and store the resulting mask.

    Bumps the positive/negative click counter in ``interactive_state``, hands
    the currently selected frame plus ``evt.index`` to ``_sam_refine_gpu``,
    then writes the returned mask, logit and painted image back into
    ``video_state`` at the selected frame index.

    Returns:
        ``(painted_image, video_state, interactive_state, click_state)``.
    """
    is_positive = point_prompt == "Positive"
    counter_key = "positive_click_times" if is_positive else "negative_click_times"
    interactive_state[counter_key] += 1

    origin_images = video_state["origin_images"]
    frame_idx = video_state["select_frame_number"]
    mask, logit, painted_image, click_state = _sam_refine_gpu(
        origin_images[frame_idx], click_state, evt.index, is_positive
    )

    # Persist the per-frame results under the same frame index.
    for key, value in (
        ("masks", mask),
        ("logits", logit),
        ("painted_images", painted_image),
    ):
        video_state[key][frame_idx] = value

    return painted_image, video_state, interactive_state, click_state
278
 
279
  def add_multi_mask(video_state, interactive_state, mask_dropdown):
280
  mask = video_state["masks"][video_state["select_frame_number"]]
 
308
  return select_frame
309
 
310
  # image matting
311
+ @spaces.GPU(duration=60)
312
  def image_matting(video_state, interactive_state, mask_dropdown, erode_kernel_size, dilate_kernel_size, refine_iter, model_selection):
313
  # Load model if not already loaded
314
  try:
 
338
  else:
339
  template_mask = video_state["masks"][video_state["select_frame_number"]]
340
 
341
+ if len(np.unique(template_mask)) == 1 and template_mask.max() == 0:
342
+ raise gr.Error("Please set a mask on the template frame before running matting.")
 
343
  foreground, alpha = matanyone2(matanyone_processor, following_frames, template_mask*255, r_erode=erode_kernel_size, r_dilate=dilate_kernel_size, n_warmup=refine_iter)
344
  foreground_output = Image.fromarray(foreground[-1])
345
  alpha_output = Image.fromarray(alpha[-1][:,:,0])
 
347
  return foreground_output, alpha_output
348
 
349
  # video matting
350
+ @spaces.GPU(duration=120)
351
  def video_matting(video_state, interactive_state, mask_dropdown, erode_kernel_size, dilate_kernel_size, model_selection):
352
  # Load model if not already loaded
353
  try:
 
380
 
381
  audio_path = video_state["audio"]
382
 
383
+ if len(np.unique(template_mask)) == 1 and template_mask.max() == 0:
384
+ raise gr.Error("Please set a mask on the template frame before running matting.")
 
385
  foreground, alpha = matanyone2(matanyone_processor, following_frames, template_mask*255, r_erode=erode_kernel_size, r_dilate=dilate_kernel_size)
386
 
387
+ tmpdir = tempfile.mkdtemp()
388
+ foreground_output = generate_video_from_frames(foreground, output_path=os.path.join(tmpdir, "fg.mp4"), fps=fps, audio_path=audio_path)
389
+ alpha_output = generate_video_from_frames(alpha, output_path=os.path.join(tmpdir, "alpha.mp4"), fps=fps, gray2rgb=True, audio_path=audio_path)
390
 
391
  return foreground_output, alpha_output
392
 
 
443
  return output_path
444
  return video_temp_path
445
 
446
def show_load_components():
    """Return visibility updates that reveal the right-side components.

    Intended to run as soon as the Load button is clicked. The returned tuple
    is ordered as: step2_title, template_frame, point_prompt,
    clear_button_click, add_mask_button, matting_button, mask_dropdown,
    foreground_video_output, alpha_video_output.
    """
    step2_title = gr.Markdown(visible=True)
    template_frame = gr.Image(visible=True)
    point_prompt = gr.Radio(visible=True)
    clear_button_click = gr.Button(visible=True)
    add_mask_button = gr.Button(visible=True)
    matting_button = gr.Button(visible=True)
    mask_dropdown = gr.Dropdown(visible=True)
    foreground_video_output = gr.Video(visible=True)
    alpha_video_output = gr.Video(visible=True)
    return (
        step2_title,
        template_frame,
        point_prompt,
        clear_button_click,
        add_mask_button,
        matting_button,
        mask_dropdown,
        foreground_video_output,
        alpha_video_output,
    )
459
+
460
  # reset all states for a new input
461
  def restart():
462
+ return (
463
+ {
464
  "user_name": "",
465
  "video_name": "",
466
  "origin_images": None,
 
470
  "logits": None,
471
  "select_frame_number": 0,
472
  "fps": 30
473
+ },
474
+ {
475
  "inference_times": 0,
476
+ "negative_click_times": 0,
477
  "positive_click_times": 0,
478
  "mask_save": args.mask_save,
479
  "multi_mask": {
 
481
  "masks": []
482
  },
483
  "track_end_number": None,
484
+ },
485
+ [[], []],
486
+ gr.update(value=None, visible=False), # foreground output
487
+ gr.update(value=None, visible=False), # alpha output
488
+ gr.update(visible=False), # template_frame
489
+ gr.update(visible=False), # image_selection_slider
490
+ gr.update(visible=False), # track_pause_number_slider
491
+ gr.update(visible=False), # point_prompt
492
+ gr.update(visible=False), # clear_button_click
493
+ gr.update(visible=False), # add_mask_button
494
+ gr.update(visible=False), # matting_button
495
+ gr.update(visible=False), # remove_mask_button
496
+ gr.update(visible=False), # foreground_output_button
497
+ gr.update(visible=False), # alpha_output_button
498
+ gr.update(visible=False, choices=[], value=[]), # mask_dropdown
499
+ "", # video_info / image_info
500
+ gr.update(visible=False), # step2_title
501
+ )
502
 
503
  # args, defined in track_anything.py
504
  args = parse_augment()
 
507
  'vit_l': "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_l_0b3195.pth",
508
  'vit_b': "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pth"
509
  }
510
+ checkpoint_folder = str(_HERE.parent / 'pretrained_models')
511
 
512
  sam_checkpoint = load_file_from_url(sam_checkpoint_url_dict[args.sam_model_type], checkpoint_folder)
513
  # initialize sams
 
587
  raise RuntimeError("No models are available! Please ensure at least one model file exists in ../pretrained_models/")
588
  default_model = "MatAnyone 2" if "MatAnyone 2" in available_models else available_models[0]
589
 
590
+ # Eagerly load all available models (required for ZeroGPU)
591
+ for _display_name in available_models:
592
+ load_model(_display_name)
593
+
594
  # download test samples
595
+ test_sample_path = str(_HERE / "test_sample")
596
  load_file_from_url('https://github.com/pq-yang/MatAnyone2/releases/download/media/test-sample-0-720p.mp4', test_sample_path)
597
  load_file_from_url('https://github.com/pq-yang/MatAnyone2/releases/download/media/test-sample-1-720p.mp4', test_sample_path)
598
  load_file_from_url('https://github.com/pq-yang/MatAnyone2/releases/download/media/test-sample-2-720p.mp4', test_sample_path)
 
605
  load_file_from_url('https://github.com/pq-yang/MatAnyone2/releases/download/media/test-sample-3.jpg', test_sample_path)
606
 
607
  # download assets
608
+ assets_path = str(_HERE / "assets")
609
  load_file_from_url('https://github.com/pq-yang/MatAnyone/releases/download/media/tutorial_single_target.mp4', assets_path)
610
  load_file_from_url('https://github.com/pq-yang/MatAnyone/releases/download/media/tutorial_multi_targets.mp4', assets_path)
611
 
 
661
 
662
  my_custom_css = """
663
  .gradio-container {width: 85% !important; margin: 0 auto;}
664
+ .gr-monochrome-group {border-radius: 5px !important; border: revert-layer !important; border-width: 2px !important; color: var(--body-text-color) !important}
665
  button {border-radius: 8px !important;}
666
+ .new_button {background-color: var(--button-secondary-background-fill) !important; color: var(--button-secondary-text-color) !important; border: 1px solid var(--border-color-primary) !important;}
667
  .green_button {background-color: #4CAF50 !important; color: #ffffff !important; border: none !important;}
668
+ .new_button:hover {background-color: var(--button-secondary-background-fill-hover) !important;}
669
  .green_button:hover {background-color: #77bd79 !important;}
670
 
671
  .mask_button_group {gap: 10px !important;}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
672
  .margin_center {width: 50% !important; margin: auto !important;}
673
  .jc_center {justify-content: center !important;}
674
  .video-title {
675
  margin-bottom: 5px !important;
676
  }
677
  .custom-bg {
678
+ background-color: var(--background-fill-secondary);
679
+ padding: 10px;
680
+ border-radius: 10px;
681
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
682
  .title-container {
683
  text-align: center;
684
  padding: 0;
 
697
  small {
698
  font-size: 60%;
699
  }
 
700
  """
701
 
702
+ with gr.Blocks() as demo:
703
  gr.HTML('''
704
+ <link href="https://fonts.googleapis.com/css2?family=Sarpanch:wght@400;500;600;700;800;900&display=swap" rel="stylesheet">
705
  <div class="title-container">
706
  <h1 class="title is-2 publication-title"
707
+ style="font-size:50px; font-family: 'Sarpanch', serif;
708
+ background: linear-gradient(to right, #000000, #2dc464);
709
+ display: inline-block; -webkit-background-clip: text;
710
  -webkit-text-fill-color: transparent;">
711
  MatAnyone Series
712
  </h1>
 
721
  with gr.Row():
722
  with gr.Column():
723
  gr.Markdown("### Case 1: Single Target")
724
+ gr.Video(value=str(_HERE / "assets" / "tutorial_single_target.mp4"), elem_classes="video")
725
 
726
  with gr.Column():
727
  gr.Markdown("### Case 2: Multiple Targets")
728
+ gr.Video(value=str(_HERE / "assets" / "tutorial_multi_targets.mp4"), elem_classes="video")
729
 
730
  with gr.Tabs():
731
  with gr.TabItem("Video"):
 
834
  alpha_output_button = gr.Button(value="Alpha Mask Output", visible=False, elem_classes="new_button")
835
 
836
 
837
+ # first step: get the video information
838
  extract_frames_button.click(
839
  fn=get_frames_from_video,
840
+ inputs=[video_input, video_state],
841
+ outputs=[video_state, interactive_state, click_state,
842
+ foreground_video_output, alpha_video_output,
843
+ template_frame,
844
+ image_selection_slider, track_pause_number_slider,
845
+ point_prompt, clear_button_click,
846
+ add_mask_button, matting_button,
847
+ remove_mask_button, foreground_output_button, alpha_output_button,
848
+ mask_dropdown, video_info, step2_title],
849
+ )
850
 
851
  # second step: select images from slider
852
  image_selection_slider.release(fn=select_video_template,
 
860
  template_frame.select(
861
  fn=sam_refine,
862
  inputs=[video_state, point_prompt, click_state, interactive_state],
863
+ outputs=[template_frame, video_state, interactive_state, click_state]
864
  )
865
 
866
  # add different mask
 
890
  outputs=[template_frame]
891
  )
892
 
893
+ _video_restart_outputs = [
894
+ video_state, interactive_state, click_state,
895
+ foreground_video_output, alpha_video_output,
896
+ template_frame,
897
+ image_selection_slider, track_pause_number_slider,
898
+ point_prompt, clear_button_click,
899
+ add_mask_button, matting_button,
900
+ remove_mask_button, foreground_output_button, alpha_output_button,
901
+ mask_dropdown, video_info, step2_title,
902
+ ]
 
 
 
 
 
 
903
  video_input.clear(
904
  fn=restart,
905
  inputs=[],
906
+ outputs=_video_restart_outputs,
 
 
 
 
 
 
 
 
 
907
  show_progress=False)
908
 
909
  # points clear
 
1025
  alpha_image_output = gr.Image(type="pil", label="Alpha Output", visible=False, elem_classes="image")
1026
  alpha_output_button = gr.Button(value="Alpha Mask Output", visible=False, elem_classes="new_button")
1027
 
1028
+ # first step: get the image information
1029
  extract_frames_button.click(
1030
+ fn=show_load_components,
1031
+ inputs=[],
1032
+ outputs=[step2_title, template_frame, point_prompt, clear_button_click,
1033
+ add_mask_button, matting_button, mask_dropdown],
1034
+ ).then(
1035
  fn=get_frames_from_image,
1036
+ inputs=[image_input, image_state],
 
 
1037
  outputs=[image_state, image_info, template_frame,
1038
+ image_selection_slider, track_pause_number_slider,
1039
+ foreground_image_output, alpha_image_output],
1040
+ )
1041
 
1042
  # second step: select images from slider
1043
  image_selection_slider.release(fn=select_image_template,
 
1051
  template_frame.select(
1052
  fn=sam_refine,
1053
  inputs=[image_state, point_prompt, click_state, interactive_state],
1054
+ outputs=[template_frame, image_state, interactive_state, click_state]
1055
  )
1056
 
1057
  # add different mask
 
1082
  )
1083
 
1084
  # clear input
1085
+ _image_restart_outputs = [
1086
+ image_state, interactive_state, click_state,
1087
+ foreground_image_output, alpha_image_output,
1088
+ template_frame,
1089
+ image_selection_slider, track_pause_number_slider,
1090
+ point_prompt, clear_button_click,
1091
+ add_mask_button, matting_button,
1092
+ remove_mask_button, foreground_output_button, alpha_output_button,
1093
+ mask_dropdown, image_info, step2_title,
1094
+ ]
1095
  image_input.change(
1096
  fn=restart,
1097
  inputs=[],
1098
+ outputs=_image_restart_outputs,
 
 
 
 
 
 
 
 
 
1099
  show_progress=False)
1100
+
1101
  image_input.clear(
1102
  fn=restart,
1103
  inputs=[],
1104
+ outputs=_image_restart_outputs,
 
 
 
 
 
 
 
 
 
1105
  show_progress=False)
1106
 
1107
  # points clear
 
1122
  gr.Markdown(article)
1123
 
1124
  demo.queue()
1125
+ demo.launch(theme=gr.themes.Monochrome(), css=my_custom_css)
hugging_face/tools/base_segmenter.py CHANGED
@@ -5,7 +5,6 @@ from PIL import Image, ImageDraw, ImageOps
5
  import numpy as np
6
  from typing import Union
7
  from segment_anything import sam_model_registry, SamPredictor, SamAutomaticMaskGenerator
8
- import matplotlib.pyplot as plt
9
  import PIL
10
  from .mask_painter import mask_painter
11
 
 
5
  import numpy as np
6
  from typing import Union
7
  from segment_anything import sam_model_registry, SamPredictor, SamAutomaticMaskGenerator
 
8
  import PIL
9
  from .mask_painter import mask_painter
10
 
hugging_face/tools/download_util.py CHANGED
@@ -1,6 +1,5 @@
1
  import math
2
  import os
3
- import requests
4
  from torch.hub import download_url_to_file, get_dir
5
  from tqdm import tqdm
6
  from urllib.parse import urlparse
 
1
  import math
2
  import os
 
3
  from torch.hub import download_url_to_file, get_dir
4
  from tqdm import tqdm
5
  from urllib.parse import urlparse
hugging_face/tools/interact_tools.py CHANGED
@@ -5,14 +5,12 @@ from PIL import Image, ImageDraw, ImageOps
5
  import numpy as np
6
  from typing import Union
7
  from segment_anything import sam_model_registry, SamPredictor, SamAutomaticMaskGenerator
8
- import matplotlib.pyplot as plt
9
  import PIL
10
  from .mask_painter import mask_painter as mask_painter2
11
  from .base_segmenter import BaseSegmenter
12
  from .painter import mask_painter, point_painter
13
  import os
14
- import requests
15
- import sys
16
 
17
 
18
  mask_color = 3
 
5
  import numpy as np
6
  from typing import Union
7
  from segment_anything import sam_model_registry, SamPredictor, SamAutomaticMaskGenerator
 
8
  import PIL
9
  from .mask_painter import mask_painter as mask_painter2
10
  from .base_segmenter import BaseSegmenter
11
  from .painter import mask_painter, point_painter
12
  import os
13
+ import sys
 
14
 
15
 
16
  mask_color = 3
requirements.txt CHANGED
@@ -1,37 +1,312 @@
1
- progressbar2
2
- gdown >= 4.7.1
3
- gitpython >= 3.1
4
- git+https://github.com/cheind/py-thin-plate-spline
5
- hickle >= 5.0
6
- tensorboard >= 2.11
7
- numpy >= 1.21
8
- git+https://github.com/facebookresearch/segment-anything.git
9
- # gradio==4.31.0
10
- fastapi==0.111.0
11
- pydantic==2.7.1
12
- opencv-python >= 4.8
13
- matplotlib
14
- pyyaml
15
- av >= 0.5.2
16
- openmim
17
- tqdm >= 4.66.1
18
- psutil
19
- ffmpeg-python
20
- cython
21
- Pillow >= 9.5
22
- scipy >= 1.7
23
- pycocotools >= 2.0.7
24
- einops >= 0.6
25
- hydra-core >= 1.3.2
26
- PySide6 >= 6.2.0
27
- charset-normalizer >= 3.1.0
28
- netifaces >= 0.11.0
29
- cchardet >= 2.1.7
30
- easydict
31
- requests
32
- pyqtdarktheme
33
- imageio == 2.25.0
34
- imageio[ffmpeg]
35
- ffmpeg-python
36
- safetensors
37
- huggingface_hub < 1.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # This file was autogenerated by uv via the following command:
2
+ # uv export --no-hashes --no-dev --group hf-spaces --no-emit-package typer-slim -o requirements.txt
3
+ aiofiles==24.1.0
4
+ # via gradio
5
+ aiohappyeyeballs==2.6.1
6
+ # via aiohttp
7
+ aiohttp==3.13.3
8
+ # via fsspec
9
+ aiosignal==1.4.0
10
+ # via aiohttp
11
+ annotated-doc==0.0.4
12
+ # via
13
+ # fastapi
14
+ # typer
15
+ annotated-types==0.7.0
16
+ # via pydantic
17
+ antlr4-python3-runtime==4.9.3
18
+ # via
19
+ # hydra-core
20
+ # omegaconf
21
+ anyio==4.12.1
22
+ # via
23
+ # gradio
24
+ # httpx
25
+ # starlette
26
+ attrs==25.4.0
27
+ # via aiohttp
28
+ audioop-lts==0.2.2 ; python_full_version >= '3.13'
29
+ # via gradio
30
+ brotli==1.2.0
31
+ # via gradio
32
+ certifi==2026.2.25
33
+ # via
34
+ # httpcore
35
+ # httpx
36
+ # requests
37
+ charset-normalizer==3.4.5
38
+ # via requests
39
+ click==8.3.1
40
+ # via
41
+ # typer
42
+ # uvicorn
43
+ colorama==0.4.6 ; sys_platform == 'win32'
44
+ # via
45
+ # click
46
+ # tqdm
47
+ datasets==4.7.0
48
+ dill==0.4.0
49
+ # via
50
+ # datasets
51
+ # multiprocess
52
+ fastapi==0.135.1
53
+ # via gradio
54
+ ffmpeg-python==0.2.0
55
+ # via matanyone
56
+ ffmpy==1.0.0
57
+ # via gradio
58
+ filelock==3.25.2
59
+ # via
60
+ # datasets
61
+ # huggingface-hub
62
+ # torch
63
+ frozenlist==1.8.0
64
+ # via
65
+ # aiohttp
66
+ # aiosignal
67
+ fsspec==2026.2.0
68
+ # via
69
+ # datasets
70
+ # gradio-client
71
+ # huggingface-hub
72
+ # torch
73
+ future==1.0.0
74
+ # via ffmpeg-python
75
+ gradio==6.9.0
76
+ # via
77
+ # matanyone
78
+ # spaces
79
+ gradio-client==2.3.0
80
+ # via gradio
81
+ groovy==0.1.2
82
+ # via gradio
83
+ h11==0.16.0
84
+ # via
85
+ # httpcore
86
+ # uvicorn
87
+ hf-xet==1.4.2 ; platform_machine == 'AMD64' or platform_machine == 'aarch64' or platform_machine == 'amd64' or platform_machine == 'arm64' or platform_machine == 'x86_64'
88
+ # via huggingface-hub
89
+ httpcore==1.0.9
90
+ # via httpx
91
+ httpx==0.28.1
92
+ # via
93
+ # datasets
94
+ # gradio
95
+ # gradio-client
96
+ # huggingface-hub
97
+ # safehttpx
98
+ # spaces
99
+ huggingface-hub==1.7.0
100
+ # via
101
+ # datasets
102
+ # gradio
103
+ # gradio-client
104
+ hydra-core==1.3.2
105
+ # via matanyone
106
+ idna==3.11
107
+ # via
108
+ # anyio
109
+ # httpx
110
+ # requests
111
+ # yarl
112
+ imageio==2.37.3
113
+ # via matanyone
114
+ imageio-ffmpeg==0.6.0
115
+ # via imageio
116
+ jinja2==3.1.6
117
+ # via
118
+ # gradio
119
+ # torch
120
+ markdown-it-py==4.0.0
121
+ # via rich
122
+ markupsafe==3.0.3
123
+ # via
124
+ # gradio
125
+ # jinja2
126
+ mdurl==0.1.2
127
+ # via markdown-it-py
128
+ mpmath==1.3.0
129
+ # via sympy
130
+ multidict==6.7.1
131
+ # via
132
+ # aiohttp
133
+ # yarl
134
+ multiprocess==0.70.18
135
+ # via datasets
136
+ networkx==3.6.1
137
+ # via torch
138
+ numpy==2.4.3
139
+ # via
140
+ # datasets
141
+ # gradio
142
+ # imageio
143
+ # opencv-python
144
+ # pandas
145
+ # torchvision
146
+ nvidia-cublas-cu12==12.8.4.1 ; platform_machine == 'x86_64' and sys_platform == 'linux'
147
+ # via
148
+ # nvidia-cudnn-cu12
149
+ # nvidia-cusolver-cu12
150
+ # torch
151
+ nvidia-cuda-cupti-cu12==12.8.90 ; platform_machine == 'x86_64' and sys_platform == 'linux'
152
+ # via torch
153
+ nvidia-cuda-nvrtc-cu12==12.8.93 ; platform_machine == 'x86_64' and sys_platform == 'linux'
154
+ # via torch
155
+ nvidia-cuda-runtime-cu12==12.8.90 ; platform_machine == 'x86_64' and sys_platform == 'linux'
156
+ # via torch
157
+ nvidia-cudnn-cu12==9.10.2.21 ; platform_machine == 'x86_64' and sys_platform == 'linux'
158
+ # via torch
159
+ nvidia-cufft-cu12==11.3.3.83 ; platform_machine == 'x86_64' and sys_platform == 'linux'
160
+ # via torch
161
+ nvidia-cufile-cu12==1.13.1.3 ; platform_machine == 'x86_64' and sys_platform == 'linux'
162
+ # via torch
163
+ nvidia-curand-cu12==10.3.9.90 ; platform_machine == 'x86_64' and sys_platform == 'linux'
164
+ # via torch
165
+ nvidia-cusolver-cu12==11.7.3.90 ; platform_machine == 'x86_64' and sys_platform == 'linux'
166
+ # via torch
167
+ nvidia-cusparse-cu12==12.5.8.93 ; platform_machine == 'x86_64' and sys_platform == 'linux'
168
+ # via
169
+ # nvidia-cusolver-cu12
170
+ # torch
171
+ nvidia-cusparselt-cu12==0.7.1 ; platform_machine == 'x86_64' and sys_platform == 'linux'
172
+ # via torch
173
+ nvidia-nccl-cu12==2.27.5 ; platform_machine == 'x86_64' and sys_platform == 'linux'
174
+ # via torch
175
+ nvidia-nvjitlink-cu12==12.8.93 ; platform_machine == 'x86_64' and sys_platform == 'linux'
176
+ # via
177
+ # nvidia-cufft-cu12
178
+ # nvidia-cusolver-cu12
179
+ # nvidia-cusparse-cu12
180
+ # torch
181
+ nvidia-nvshmem-cu12==3.3.20 ; platform_machine == 'x86_64' and sys_platform == 'linux'
182
+ # via torch
183
+ nvidia-nvtx-cu12==12.8.90 ; platform_machine == 'x86_64' and sys_platform == 'linux'
184
+ # via torch
185
+ omegaconf==2.3.0
186
+ # via hydra-core
187
+ opencv-python==4.13.0.92
188
+ # via matanyone
189
+ orjson==3.11.7
190
+ # via gradio
191
+ packaging==26.0
192
+ # via
193
+ # datasets
194
+ # gradio
195
+ # gradio-client
196
+ # huggingface-hub
197
+ # hydra-core
198
+ # spaces
199
+ pandas==3.0.1
200
+ # via
201
+ # datasets
202
+ # gradio
203
+ pillow==12.1.1
204
+ # via
205
+ # gradio
206
+ # imageio
207
+ # torchvision
208
+ propcache==0.4.1
209
+ # via
210
+ # aiohttp
211
+ # yarl
212
+ psutil==5.9.8
213
+ # via
214
+ # imageio
215
+ # spaces
216
+ pyarrow==23.0.1
217
+ # via datasets
218
+ pydantic==2.12.5
219
+ # via
220
+ # fastapi
221
+ # gradio
222
+ # spaces
223
+ pydantic-core==2.41.5
224
+ # via pydantic
225
+ pydub==0.25.1
226
+ # via gradio
227
+ pygments==2.19.2
228
+ # via rich
229
+ python-dateutil==2.9.0.post0
230
+ # via pandas
231
+ python-multipart==0.0.22
232
+ # via gradio
233
+ pytz==2026.1.post1
234
+ # via gradio
235
+ pyyaml==6.0.3
236
+ # via
237
+ # datasets
238
+ # gradio
239
+ # huggingface-hub
240
+ # omegaconf
241
+ requests==2.32.5
242
+ # via
243
+ # datasets
244
+ # spaces
245
+ rich==14.3.3
246
+ # via typer
247
+ safehttpx==0.1.7
248
+ # via gradio
249
+ segment-anything @ git+https://github.com/facebookresearch/segment-anything.git@dca509fe793f601edb92606367a655c15ac00fdf
250
+ # via matanyone
251
+ semantic-version==2.10.0
252
+ # via gradio
253
+ setuptools==82.0.1
254
+ # via torch
255
+ shellingham==1.5.4
256
+ # via typer
257
+ six==1.17.0
258
+ # via python-dateutil
259
+ spaces==0.47.0
260
+ # via matanyone
261
+ starlette==0.52.1
262
+ # via
263
+ # fastapi
264
+ # gradio
265
+ sympy==1.14.0
266
+ # via torch
267
+ tomlkit==0.13.3
268
+ # via gradio
269
+ torch==2.9.1
270
+ # via
271
+ # matanyone
272
+ # torchvision
273
+ torchvision==0.24.1
274
+ # via matanyone
275
+ tqdm==4.67.3
276
+ # via
277
+ # datasets
278
+ # huggingface-hub
279
+ triton==3.5.1 ; platform_machine == 'x86_64' and sys_platform == 'linux'
280
+ # via torch
281
+ typer==0.24.1
282
+ # via
283
+ # gradio
284
+ # huggingface-hub
285
+ typing-extensions==4.15.0
286
+ # via
287
+ # aiosignal
288
+ # anyio
289
+ # fastapi
290
+ # gradio
291
+ # gradio-client
292
+ # huggingface-hub
293
+ # pydantic
294
+ # pydantic-core
295
+ # spaces
296
+ # starlette
297
+ # torch
298
+ # typing-inspection
299
+ typing-inspection==0.4.2
300
+ # via
301
+ # fastapi
302
+ # pydantic
303
+ tzdata==2025.3 ; sys_platform == 'emscripten' or sys_platform == 'win32'
304
+ # via pandas
305
+ urllib3==2.6.3
306
+ # via requests
307
+ uvicorn==0.41.0
308
+ # via gradio
309
+ xxhash==3.6.0
310
+ # via datasets
311
+ yarl==1.23.0
312
+ # via aiohttp